File Coverage

blib/lib/Lingua/Norms/SUBTLEX.pm
Criterion Covered Total %
statement 335 377 88.8
branch 103 174 59.2
condition 14 29 48.2
subroutine 60 68 88.2
pod 29 29 100.0
total 541 677 79.9


line stmt bran cond sub pod time code
1             package Lingua::Norms::SUBTLEX;
2 17     17   1233861 use 5.12.0;
  17         185  
3 17     17   106 use strict;
  17         28  
  17         372  
4 17     17   83 use warnings FATAL => 'all';
  17         27  
  17         675  
5 17     17   98 use base qw(Lingua::Orthon);
  17         20  
  17         9007  
6 17     17   558972 use Config;
  17         40  
  17         688  
7 17     17   83 use Carp qw(carp croak);
  17         32  
  17         768  
8            
9             #use Encode qw(encode decode);
10             #use Encode::Guess;
11 17     17   8038 use English '-no_match_vars';
  17         52953  
  17         97  
12 17     17   4935 use File::Spec;
  17         33  
  17         424  
13 17     17   70 use List::AllUtils qw(all any first firstidx none uniq);
  17         27  
  17         889  
14 17     17   93 use Number::Misc qw(is_numeric);
  17         31  
  17         836  
15 17     17   12706 use Path::Tiny;
  17         172141  
  17         793  
16 17     17   8519 use Readonly;
  17         57378  
  17         853  
17 17     17   116 use Statistics::Lite qw(count max mean median stddev sum);
  17         40  
  17         847  
18 17     17   6852 use String::Trim qw(trim);
  17         8660  
  17         1001  
19 17     17   111 use String::Util qw(hascontent crunch fullchomp nocontent unquote);
  17         44  
  17         746  
20 17     17   7980 use Text::CSV::Hashify;
  17         1533986  
  17         1168  
21 17     17   7934 use Text::CSV::Separator qw(get_separator);
  17         28486  
  17         981  
22 17     17   7981 use Text::Unidecode;
  17         28128  
  17         51317  
23            
24             #use open ':encoding(utf8)';
25            
26             $Lingua::Norms::SUBTLEX::VERSION = '0.07';
27            
28             =pod
29            
30             =encoding utf8
31            
32             =head1 NAME
33            
34             Lingua::Norms::SUBTLEX - Retrieve word frequencies and related values and lists from subtitles corpora
35            
36             =head1 VERSION
37            
38             This is documentation for B of Lingua::Norms::SUBTLEX.
39            
40             =head1 SYNOPSIS
41            
42             use Lingua::Norms::SUBTLEX 0.07;
43             my $subtlex = Lingua::Norms::SUBTLEX->new(lang => 'UK');
44            
45             # Is the string 'frog' in the subtitles corpus?
46             my $bool = $subtlex->is_normed(string => 'frog');
47            
48             # Occurrences-per-million:
49             # - for a single string:
50             my $frq = $subtlex->frq_opm(string => 'frog'); # freq. per million; also count, log-f, Zipf
51            
52             # - for a list of strings:
53             my $href = $subtlex->frq_hash(strings => [qw/frog fish ape/]); # freqs. for a list of words
54             print "'$_' opm\t$href->{$_}\n" for keys %{$href};
55            
56             # stats:
57             printf "mean opm\t%f\n", $subtlex->frq_mean(strings => [qw/frog fish ape/]); # or median, std-dev.
58            
59             # parts-of-speech:
60             printf "'frog' part-of-speech = %s\n", $subtlex->pos_dom(string => 'frog');
61            
62             # retrieve (list of) words to certain specs, e.g., min/max range:
63             my $aref = $subtlex->select_words(freq => [2, 400], length => [4, 4], cv_pattern => 'CCVC', regex => '^f');
64             printf "Number of 4-letter CCVC strings with 2-400 opm starting with 'f' = %d\n", scalar @{$aref};
65            
66             printf "A randomly selected subtitles string is '%s'\n", $subtlex->random_string();
67            
68             =head1 DESCRIPTION
69            
70             This module facilitates access to corpus frequency and other lexical attributes of character strings (generally, words), as provided in the various SUBTLEX and related projects (see L) on the basis of the representation of these strings in film and television subtitles (see L). Word frequencies obtained in this way have been shown to be generally more predictive of performance in word recognition tasks than frequencies derived from books, newsgroup posts, and similar sources (but see Herdagdelen & Marelli, 2017).
71            
72             There are three main groups of measures that are potentially retrievable from the SUBTLEX datatables: (1) frequency; (2) contextual diversity (number of films/TV episodes appeared in); and (3) parts-of-speech. The module tries to uniformly offer, across the available files, frequency as a count (L), occurrences per million (L), logarithm of the opm or frequency count (L), and/or the 7-point scaled Zipf frequency (L). "Contextual diversity" is given as a count (L), a percentage (L), and/or a logarithm (L). For parts-of-speech, the module returns, via L, the dominant linguistic syntactical role of the word, as well as all defined parts-of-speech for a word (via L).
73            
74             However, not all these methods are available across all projects; e.g., SUBTLEX-NL does not define Zipf frequency, and SUBTLEX-DE does not define CD, POS or Zipf frequency. In these cases, the method in question will return an empty string.
75            
76             =head1 CORPORA SPECS and SOURCES
77            
78             The SUBTLEX files need to be downloaded via the URLs shown in the table below (only a small sample from each of each of the SUBTLEX corpora is included in the installation distribution for testing purposes). So, for example, for the I norms, install the file named "SUBTLEX-US frequency list with PoS and Zipf information.csv" via L.
79            
80             The local directory location or actual pathname of these files can be given in class construction (by the arguments B and B, respectively); or it will be sought from the default location--within the directory "SUBTLEX" alongside the module itself in the locally configured Perl sitelib--given the B argument to L, or to L. The filenames of the original files downloaded from the following sites should be found in this way, but it should uniquely include the "key" shown in the table. The module will attempt to identify the correct field separator for the file (which can be comma-separated or tab-delimited). Only the files specified in the table are likely to be reliably accessed at this time.
81            
82             =for html

  

83            
LanguageKeyURLFile
84            
DutchNL_allcrr.ugent.beSUBTLEX-NL.with-pos.txt
85            
 NL_mincrr.ugent.beSUBTLEX-NL.cd-above2.with-pos.txt
86            
English (American)USexpsy.ugent.be/subtlexusSUBTLEX-US frequency list with PoS and Zipf information.csv
87            
88            
English (British)UKpsychology.nottingham.ac.ukSUBTLEX-UK.txt
89            
FrenchFRlexique.orgLexique381.txt
90            
GermanDEcrr.ugent.beSUBTLEX-DE_cleaned_with_Google00.txt
91            
PortuguesePTp-pal.di.uminho.ptSUBTLEX-PT_Soares_et_al._QJEP.csv
92            

93            
94             Notes regarding these different corpora.
95            
96             =over 4
97            
98             =item * SUBTLEX-DE
99            
100             The file has separate entries for words starting with an uppercase and a lowercase letter (e.g., for when a letter-string is both a noun and an adjective).
101            
102             =item * Lexique (SUBTLEX-FR)
103            
104             If not giving the full path to this file, it should be renamed to include "FR" (e.g., "FR_Lexique.csv") and stored in the default directory. The file also includes frequencies from books.
105            
106             =item * SUBTLEX-PT
107            
108             The I subtitles data are available as an Excel file (directly from L). This file needs to be saved as a (csv) text file to be usable here.
109            
110             =item * SUBTLEX-UK
111            
112             Includes words that might be spelled with a dash both with a dash and without; so there are separate entries for I and I, and for I and I. It includes some strings with apostrophes (e.g., I, I); but common contractions like I, I and I do not appear; they are stripped of their apostrophes, listed, e.g., as I, I and I. All strings are in lower-case; so I is represented as I.
113            
114             =item * SUBTLEX-US
115            
116             There are no strings with capitalized onsets in this file, or with punctuation marks, including apostrophes and dashes (e.g., I and I are represented as I and I; I as I, and I as I).
117            
118             The earlier, original file "SUBTLEXusExcel2007.csv" presents strings as they were originally capitalised: there is, e.g., I and I--but neither I nor I. This file does not provide part-of-speech or Zipf frequencies.
119            
120             =back
121            
122             There are several other languages from this project which might be supported by this module in a later version (originally, only SUBTLEX-US was supported).
123            
124             See the new() method as to how this module handles case-sensitivity and diacritical marks. For files where strings are UTF-8 encoded, the strings being looked up should also be UTF-8 encoded (if they are diacritically marked, e.g. "embâcle")(see L).
125            
126             If using Miscrosoft Excel to save any of these files, even if in CSV format, Excel will turn the words "true" and "false" into the Boolean strings "TRUE" and "FALSE", as well as throw them aside from alphabetic sorting (right down to the bottom of an alphabetic sort). That will surely stuff up any neatly intended pattern-matching for these words.
127            
128             =head1 SUBROUTINES/METHODS
129            
130             All methods are called via the class object, and with named (hash of) arguments, usually B, where relevant.
131            
132             =head2 new
133            
134             $subtlex = Lingua::Norms::SUBTLEX->new(lang => 'DE'); # - looking in Perl sitelib
135             $subtlex = Lingua::Norms::SUBTLEX->new(lang => 'DE', dir => 'file_directory'); # folder in which file is located
136             $subtlex = Lingua::Norms::SUBTLEX->new(lang => 'DE', path => 'file/is/here.csv'); # complete path to file for given language
137            
138             Returns a class object for accessing other methods. The argument B is required, specifying the particular language datafile by a "key" as given in the above table. Optional arguments B or B can be given to specify the location or filepath of the database. The default location is the "Lingua/Norms/SUBTLEX" directory within the 'sitelib' configured for the local Perl installation (as per L). The method will C if the file cannot be found.
139            
140             The optional argument B specifies how string comparison, as when looking up a given word in the SUBTLEX corpus, should be conducted, with the function used to test string equality being derived from the C function in L (part of the standard Perl distribution). This matching level applies to the look-up of strings within all methods, including those specifically assessing orthographic equality. This argument can take one of three values: see L:
141            
142             =cut
143            
144             sub new {
145 23     23 1 13631 my ( $class, %args ) = @_;
146 23 50       114 my $self = bless {}, ref($class) ? ref($class) : $class;
147             $self->{'_MODULE_DIR'} =
148 23         1586 File::Spec->catdir( $Config{'sitelib'}, qw/Lingua Norms SUBTLEX/ );
149 23         175 $self->set_lang(%args);
150 23         270 $self->set_eq( match_level => $args{'match_level'} );
151            
152             #_set_encoding($args{'decode'});
153 23         951 return $self;
154             }
155            
156             =head2 Frequencies and POS for individual words or word-lists
157            
158             =head3 is_normed
159            
160             $bool = $subtlex->is_normed(string => $word);
161            
162             I: isa_word
163            
164             Returns 1 or 0 as to whether or not the letter-string passed as B is represented in the subtitles file. For some files, this might be thought of as a lexical decision ("does this string spell a word?"); but others include misspelled words (e.g., "pyscho"), digit strings, abbreviations ...
165            
166             =cut
167            
168             sub is_normed {
169 10     10 1 116431 my ( $self, %args ) = @_;
170 10         32 my $str = _get_usr_str( $args{'string'} );
171 10         16 my $res = 0; # boolean to return from this sub
172 10 50       479 open my $fh, q{<}, $self->{'_PATH'} or croak $OS_ERROR;
173 10         312 while (<$fh>) {
174 227 100       60266 next if $INPUT_LINE_NUMBER == 1; # skip headings
175 217         375 my $file_str = _get_file_str( $_, $self->{'_DELIM'} )
176             ; # have to declare as can be empty (!)
177 217 50       5089 next if nocontent($file_str);
178 217 100       1897 if ( $self->{'_EQ'}->( $str, $file_str ) )
179             { # first token equals given string?
180 6         1737 $res = 1; # set result to return as true
181 6         13 last; # got it, so abort look-up
182             }
183             }
184 10 50       978 close $fh or croak $OS_ERROR;
185 10         73 return $res; # zero if string not found in file
186             }
187             *isa_word = \&is_normed;
188            
189             =head3 frq_count
190            
191             $int = $subtlex->frq_count(string => 'aword');
192            
193             Returns the raw number of occurrences in all the films/TV episodes for the word passed as B, or 0 if the string is not found in language file.
194            
195             =cut
196            
197             sub frq_count {
198 6     6 1 3104 my ( $self, %args ) = @_;
199             return _val_or_0(
200             _get_val_for_str(
201             _get_usr_str( $args{'string'} ),
202             $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'frq_count_idx' ),
203 6         22 map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
  18         134  
204             )
205             );
206             }
207            
208             =head3 frq_opm
209            
210             $val = $subtlex->frq_opm(string => 'aword');
211            
212             I: opm
213            
214             Returns frequency per million for the word passed as B, or 0 if the string is not found in language file.
215            
216             =cut
217            
218             sub frq_opm {
219 30     30 1 7528 my ( $self, %args ) = @_;
220             return _val_or_0(
221             _get_val_for_str(
222             _get_usr_str( $args{'string'} ),
223             $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'frq_opm_idx' ),
224 30         77 map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
  90         591  
225             )
226             );
227             }
228             *freq = \&frq_opm; # legacy only
229            
230             =head3 frq_log
231            
232             $val = $subtlex->frq_log(string => 'aword');
233            
234             Returns log frequency per million for the word passed as B, or the empty-string if the string is not represented in the norms.
235            
236             =cut
237            
238             sub frq_log {
239 5     5 1 1946 my ( $self, %args ) = @_;
240             return _get_val_for_str(
241             _get_usr_str( $args{'string'} ),
242             $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'frq_log_idx' ),
243 5         15 map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
  15         104  
244             );
245             }
246             *lfreq = \&frq_log; # legacy only
247            
248             =head3 frq_zipf
249            
250             $val = $subtlex->frq_zipf(string => 'aword');
251            
252             Returns Zipf frequency for the word passed as B, or the empty-string if the string is not represented in the language file. The Zipf scale ranges from about 1 to 7, with values of 1-3 generally representing low frequency words, and values of generally 4-7+ representing high frequency words, with respect to various recognition measures used in the study of word frequency effects. See Van Heuven et al. (2014) and L for more information.
253            
254             =cut
255            
256             sub frq_zipf {
257 6     6 1 1429 my ( $self, %args ) = @_;
258             return _get_val_for_str(
259             _get_usr_str( $args{'string'} ),
260             $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'frq_zipf_idx' ),
261 6         19 map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
  18         124  
262             );
263             }
264             *zipf = \&frq_zipf; # legacy only
265            
266             =head3 frq_zipf_calc
267            
268             $calc = $subtlex->frq_zipf_calc( string => 'favourite' );
269             $calc = $subtlex->frq_zipf_calc( string => 'favourite', corpus_size => POS_FLOAT_in_millions, n_wordtypes => POS_INT );
270            
271             Returns an estimate of Zipf frequency by calculating its value from the given or retrievable L or L, and the given or retrievable values of the corpus_size and n_wordtypes for the particular SUBTLEX project; i.e., the values of corpus_size and n_wordtypes can be provided as named arguments. As introduced by Van Heuven et al. (2014) (see also L):
272            
273             =for html

  Zipf = log10[ ( frq_count + 1 ) / ( corpus_size + n_wordtypes )/1000000 ] + 3

274            
275             How well the returned value satisfies the "border relations" desired of the index (e.g., that up to 1 opm corresponds to Zipf of E 3) depends on the reliability of the corpus size and wordtype counts, and any rounding of these values (where relevant) and (if required) of the opm. Examinations of the returned values show that, when using the canned and reported values (which is the default here), they align with these definitions, and with any canned Zipf values, within the margins of about the third or fourth decimal place.
276            
277             =cut
278            
279             sub frq_zipf_calc {
280 0     0 1 0 my ( $self, %args ) = @_;
281             my $corpus_size =
282             defined $args{'size_corpus'}
283             ? $args{'size_corpus'}
284 0 0       0 : $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'size_corpus_mill' );
285             my $n_wordtypes =
286             defined $args{'n_wordtypes'}
287             ? $args{'n_wordtypes'}
288 0 0       0 : $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'n_wordtypes' );
289            
290 0         0 $n_wordtypes /= 1_000_000;
291            
292             my $frq_count =
293             is_numeric( $args{'frq_count'} )
294             ? $args{'frq_count'}
295             : is_numeric( $args{'frq_opm'} ) ? sprintf "%.0f",
296 0 0       0 $args{'frq_opm'} * $corpus_size : eval { $self->frq_count(%args) };
  0 0       0  
297            
298 0 0 0     0 if ($EVAL_ERROR or not is_numeric($frq_count) ) {
299 0         0 my $frq_opm = eval { $self->frq_opm(%args) };
  0         0  
300 0 0 0     0 if (not $EVAL_ERROR and is_numeric($frq_opm) ) {
301 0         0 $frq_count = sprintf "%.0f", $frq_opm * $corpus_size;
302             }
303             }
304 0   0     0 $frq_count ||= 0;
305            
306 0         0 return _log10( ( 1 + $frq_count ) / ( $corpus_size + $n_wordtypes ) ) + 3;
307            
308             }
309            
310             =head3 frq_opm2count
311            
312             $int = $subtlex->frq_opm2count(string => STRING);
313            
314             Returns the raw number of occurrences of a string (the frq_count) based on the number of occurrences per million (frq_opm), and the corpus size in millions. Returns 0 if the string is not found in language file.
315            
316             The B can be given as a named argument, or it will be retrieved by the L respective method, where this is defined for a particular language file. The B (in millions) can also be given as a named argument, or it will be retrieved from the specifications file (specs.csv in the module's directory), where this value has been obtainable from published reports.
317            
318             =cut
319            
320             sub frq_opm2count {
321 5     5 1 2433 my ( $self, %args ) = @_;
322             my $frq_opm =
323 5 50       27 defined $args{'frq_opm'} ? $args{'frq_opm'} : $self->frq_opm(%args);
324             my $corpus_size =
325             defined $args{'size_corpus'}
326             ? $args{'size_corpus'}
327 5 50       115 : $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'size_corpus_mill' );
328 5         95 return sprintf "%.0f", $frq_opm * $corpus_size;
329             }
330            
331             =head3 cd_count
332            
333             $cd = $subtlex->cd_count(string => STRING);
334            
335             Returns the number of samples (films/TV episodes) comprising the corpus in which the string occurred in its subtitles; so-called "contextual diversity". Returns 0 if the string is not found in language file.
336            
337             =cut
338            
339             sub cd_count {
340 5     5 1 1027 my ( $self, %args ) = @_;
341             return _val_or_0(
342             _get_val_for_str(
343             _get_usr_str( $args{'string'} ),
344             $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'cd_count_idx' ),
345 5         13 map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
  15         96  
346             )
347             );
348             }
349            
350             =head3 cd_pct
351            
352             $cd = $subtlex->cd_pct(string => 'aword');
353            
354             Returns a percentage measure for the number of samples (films/TV episodes) comprising the corpus in which the B occurred in its subtitles; so-called "contextual diversity". Returns 0 if the string is not found in language file.
355            
356             =cut
357            
358             sub cd_pct {
359 5     5 1 563 my ( $self, %args ) = @_;
360             return _val_or_0(
361             _get_val_for_str(
362             _get_usr_str( $args{'string'} ),
363             $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'cd_pct_idx' ),
364 5         16 map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
  15         120  
365             )
366             );
367             }
368            
369             =head3 cd_log
370            
371             $cd = $subtlex->cd_log(string => 'aword');
372            
373             Returns log10(L + 1) for the given string, with 4-digit precision. Note: Brysbaert and New (2009) state that "this is the best value to use if one wants to match words on word frequency" (p. 988).
374            
375             =cut
376            
377             sub cd_log {
378 5     5 1 613 my ( $self, %args ) = @_;
379             return _get_val_for_str(
380             _get_usr_str( $args{'string'} ),
381             $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'cd_log_idx' ),
382 5         13 map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
  15         93  
383             );
384             }
385            
386             =head3 pos_dom
387            
388             $pos_str = $subtlex->pos_dom(string => STRING, conform => BOOL);
389            
390             Returns the dominant part-of-speech for the given string. The return value is undefined if the string is not found. If the field in the original file (as in SUBTLEX-PT) is actually for all possible parts-of-speech, the first element in the returned string (once split by non-word characters), is returned (assuming, as in SUBTLEX-PT) that this is indeed the most frequent part-of-speech for the particular string.
391            
392             For interpretation of the POS codes: for NL, see L ("SPEC" is there defined as "often personal or geographical names" and so similar to "Name" in SUBTLEX-UK).
393            
394             To transliterate the various codes into a common two-letter code, then set B => 1 (default is not defined, returning the POS string as given in the original files). The two-letter codes are:
395            
396             NN noun (common)
397             NM name (proper)
398             PN pronoun
399             VB verb
400             AJ adjective
401             AV adverb
402             PP proposition
403             CJ conjunction
404             IJ interjection
405             DA determiner or article
406             NB number
407             OT other
408             UK unknown
409            
410             The "OT" code includes some rare POS values (e.g., "marker", "ONO"), anomalous values (e.g., "2"), and values not defined in the associated reports. The "UK" code ("unknown") is comprised of values specifically recorded as "unclassified" or similar, or where the POS field is empty.
411            
412             =cut
413            
414             sub pos_dom {
415 13     13 1 2469 my ( $self, %args ) = @_;
416             my $str = _get_val_for_str(
417             _get_usr_str( $args{'string'} ),
418             $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'pos_dom_idx' ),
419 13         44 map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
  39         283  
420             );
421 13         83 my @ari = map { trim($_) } grep { hascontent($_) } split /[\W]/xsm, $str;
  15         161  
  15         58  
422             return $args{'conform'}
423 13 100       311 ? _pos_is( $ari[0], $self->{'_FIELDS'}, $self->{'_LANG'} )->[0]
424             : $ari[0];
425             }
426             *pos = \&pos_dom;
427            
428             =head3 pos_all
429            
430             $pos_aref = $subtlex->pos_all(string => STRING, conform => BOOL);
431            
432             Returns all parts-of-speech for the given string as a referenced array. The return value is an empty list if the string is not found. If the language file does not define this field, the returned value is simply the same as what would, if possible, be returned from L (i.e., if that value is defined), but now as a referenced array.
433            
434             =cut
435            
436             sub pos_all {
437 9     9 1 2150 my ( $self, %args ) = @_;
438             my $str = _get_val_for_str(
439             _get_usr_str( $args{'string'} ),
440             $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'pos_all_idx' ),
441 9         26 map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
  27         167  
442             );
443            
444             # grep to ensure no empty values as might come from a head/trailing delimiter:
445 9         59 my @ari = map { trim($_) } grep { hascontent($_) } split /[\W]/xsm, $str;
  26         316  
  28         152  
446             return $args{'conform'}
447 9 100       166 ? [ map { @{ _pos_is( $_, $self->{'_FIELDS'}, $self->{'_LANG'} ) } }
  8         11  
  8         16  
448             @ari ]
449             : \@ari;
450             }
451            
452             =head2 Multiple strings/values lists
453            
454             Array given as measures to the following methods might include one or more of the following:
455            
456             frq_count
457             frq_opm
458             frq_log
459             frq_zipf
460             cd_count
461             cd_pct
462             cd_log
463             pos_dom
464             pos_all
465            
466             =head3 values_list
467            
468             $aref = $subtlex->values_list(string => STRING, values => AREF);
469            
470             Returns values for a single letter-string as a referenced array.
471            
472             =cut
473            
474             sub values_list {
475 3     3 1 1893 my ( $self, %args ) = @_;
476 3         7 my @idx_ari;
477 3 50       9 if ( ref $args{'values'} ) {
478 3         5 for my $field ( @{ $args{'values'} } ) {
  3         8  
479             push @idx_ari,
480 6         64 $self->{'_FIELDS'}->datum( $self->{'_LANG'}, $field . '_idx' );
481             }
482             }
483             return _get_val_for_strs( _get_usr_str( $args{'string'} ),
484 3         37 \@idx_ari, map { $self->{$_} } (qw/_PATH _DELIM _EQ/) );
  9         21  
485             }
486            
487             =head3 multi_list
488            
489             $hashref = $subtlex->multi_list(strings => AREF_of_char_strings, measures => AREF_of_FIELD_NAMES);
490            
491             $frq_hashref = $subtlex->multi_list(strings => [qw/ICH PEA CHOWDER ZEER AIME/], measures => [qw/frq_opm frq_zipf/]);
492             # $frq_hashref = {
493             # ICH => {
494             # frq_opm => 20000,
495             # frq_zipf => 7.01,
496             # },
497             # PEA => {
498             # frq_opm ...
499             # },
500             # ...
501             # }
502            
503             Returns multiple values for a list of strings as a hashref of hashrefs. This is perhaps the most efficient method here for retrieving several values for several words, but only for a small number of words; it could take a long time to return given large lists.
504            
505             So, given one or more words in the array ref B, and several measures/values to find for each of them (such as 'frq_opm', 'pos_dom' or any other values defined for the particular language file) in the the array B, the method looks line-by-line through the file to check if the line's string is equal to any of those in B. If so, it collates the relevant measures in a hash keyed by the string, whose values are themselves a hash of the measure-names keying each respective measure-value. The found string is then removed from the look-up list, and the next line is looked-up in the same way. The search stops as soon as there are no more strings in the look-up list (all have been found).
506            
507             In this way, there is only one pass through the file for the entire search; no line is looked-up more than once for all strings or their respective measure values. The method could be used for looking up a single string and/or a single value, but the other methods for doing this avoid the overhead of checking an array of strings, and splitting the line against the delimiter; this is only done here to facilitate caching multiple values whereas other methods avoid doing this as they only need to find one value after a known number of delimiters.
508            
509             =cut
510            
511             sub multi_list {
512 1     1 1 245 my ( $self, %args ) = @_;
513             croak 'Need a referenced list of strings to look up'
514 1 50       5 if !ref $args{'strings'};
515 1         2 my @strings = map { _get_usr_str($_) } @{ $args{'strings'} };
  2         5  
  1         4  
516 1         2 my %idx_hash = ();
517 1 50       4 if ( ref $args{'measures'} ) {
518 1         1 for my $field ( @{ $args{'measures'} } ) {
  1         3  
519             my $idx =
520 3         12 $self->{'_FIELDS'}->datum( $self->{'_LANG'}, $field . '_idx' );
521 3 50       42 if ( nocontent($idx) ) {
522 0         0 next;
523            
524             # croak "The requested value '$field' is not defined for the current SUBTLEX file";
525             }
526 3         31 $idx_hash{$idx} = $field;
527             }
528             }
529             return _get_any_vals_for_string_list( [@strings], \%idx_hash,
530 1         5 map { $self->{$_} } (qw/_PATH _DELIM _EQ/) );
  3         9  
531             }
532            
533             =head2 Descriptive frequency statistics for lists
534            
535             These methods return a descriptive statistic (sum, mean, median or standard deviation) for a list of B. Like L, they take an optional argument B to specify if the returned values should be occurrences per million, log frequencies, or Zipf values. Providing this as an argument obviates the need to provide multiple methods for each different type of frequency measure, e.g., "mean_opm()", mean_log_opm()", ...
536            
537             Because not all types of frequency scales (count, opm, log, Zipf) are provided in all SUBTLEX corpora, these methods will C if there are no canned stats for the particular scale called for.
538            
539             It might be thought useful to allow any valid scale to be returned by, say, providing each method without a value for B; a hash-ref of frequency values, keyed by scale-type, might be returned. However, this seems to be unrecommended; it assumes that users are blind as to what measures they want (as well as to what they can get).
540            
541             =head3 frq_sum
542            
543             $sum = $subtlex->frq_sum(strings => [qw/word1 word2/], scale => 'count|opm|log|zipf');
544            
545             Returns the sum of the count, opm, log (usually opm) or Zipf frequency, depending on the value of B.
546            
547             =cut
548            
549             sub frq_sum {
550 0     0 1 0 my ( $self, %args ) = @_;
551 0         0 return sum( $self->_frq_vals(%args) );
552             }
553            
554             =head3 frq_mean
555            
556             $mean = $subtlex->frq_mean(strings => [qw/word1 word2/], scale => 'count|opm|log|zipf');
557            
558             Returns the arithmetic average of the count, opm, log (usually opm) or Zipf frequency, depending on the value of B.
559            
560             =cut
561            
562             sub frq_mean {
563 5     5 1 42256 my ( $self, %args ) = @_;
564 5         27 return mean( $self->_frq_vals(%args) );
565             }
566             *mean_freq = \&frq_mean;
567            
568             =head3 frq_median
569            
570             $median = $subtlex->frq_median(strings => [qw/word1 word2/], scale => 'count|opm|log|zipf');
571            
572             Returns the median count, opm, log (usually opm) or Zipf frequency for the given B, depending on the value of B.
573            
574             =cut
575            
576             sub frq_median {
577 1     1 1 441 my ( $self, %args ) = @_;
578 1         15 return median( $self->_frq_vals(%args) );
579             }
580             *median_freq = \*frq_median;
581            
582             =head3 frq_sd
583            
584             $sd = $subtlex->frq_sd(strings => [qw/word1 word2/], scale => 'count|opm|log|zipf');
585            
586             Returns the standard deviation of the count, opm, log (usually opm) or Zipf frequency, depending on the value of B.
587            
588             =cut
589            
590             sub frq_sd {
591 1     1 1 764 my ( $self, %args ) = @_;
592 1         6 return stddev( $self->_frq_vals(%args) );
593             }
594             *sd_freq = \*frq_sd;
595            
596             sub _frq_vals {
597 7     7   23 my ( $self, %args ) = @_;
598             croak
599             'No string(s) to test; pass one or more letter-strings named \'strings\' as a referenced array'
600 7 50       24 if !$args{'strings'};
601             my $strs =
602             ref $args{'strings'}
603 7 50       32 ? $args{'strings'}
604             : croak 'No reference to an array of letter-strings found';
605             my $col_i =
606             hascontent( $args{'scale'} )
607             ? $self->{'_FIELDS'}
608             ->datum( $self->{'_LANG'}, 'frq_' . $args{'scale'} . '_idx' )
609 7 100       33 : $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'frq_opm_idx' );
610 7         222 my @vals = ();
611 7         12 for my $str ( @{$strs} ) {
  7         18  
612             push @vals,
613             _get_val_for_str( _get_usr_str($str), $col_i,
614 19         54 map { $self->{$_} } (qw/_PATH _DELIM _EQ/) );
  57         129  
615             }
616 7         87 return @vals;
617             }
618            
619             =head2 Retrieving letter-strings/words
620            
621             =head3 select_strings
622            
623             $aref = $subtlex->select_strings(frq_opm => [1, 20], length => [4, 4], cv_pattern => 'CVCV', regex => '^f');
624             $aref = $subtlex->select_strings(frq_zipf => [0, 2], length => [4, 4], cv_pattern => 'CVCV', regex => '^f');
625            
626             I: select_words
627            
628             Returns a list of strings (presumably words) from the SUBTLEX corpus that satisfies certain criteria, as per the following arguments:
629            
630             =over 2
631            
632             =item length
633            
634             minimum and/or maximum length of the string (or "letter-length")
635            
636             =item frq_opm, frq_log, cd_count, etc.
637            
638             minimum and/or maximum frequency (as given in whatever unit offered by the datafile for the set language)
639            
640             =item cv_pattern
641            
642             a consonant-vowel pattern, given as a string by the usual convention, e.g., 'CCVCC' defines a 5-letter word starting and ending with pairs of consonants, the pairs separated by a vowel. 'Y' is defined here as a consonant. The tested strings are stripped of marks and otherwise ASCII transliterated (using L) ahead of the check.
643            
644             =item regex
645            
646             a regular expression (L). In the examples above, only letter-strings starting with the letter 'f', followed by one of more other letters, are specified for retrieval. Alternatively, for example, the regex value '[^aeiouy]$' specifies that the letter-strings to be returned must not end with a vowel (or 'y'). The tested strings are stripped of marks and otherwise ASCII transliterated (using L) ahead of matching, so if the string in the file has, say, a 'u' with an Umlaut, it will match a 'u' in the regex.
647            
648             =back
649            
650             For the minimum/maximum constrained criteria, the two limits are given as a referenced array where the first element is the minimum and the second element is the maximum. For example, [3, 7] would specify letter-strings of 3 to 7 letters in length; [4, 4] specifies letter-strings of only 4 letters in length. If only one of these is to be constrained, then the array would be given as, e.g., [3] to specify a minimum of 3 letters without constraining the maximum, or ['',7] for a maximum of 7 letters without constraining the minimum (checking if the element C as per String::Util).
651            
652             The value returned is always a reference to the list of words retrieved (or to an empty list if none was retrieved).
653            
654             Calling this method as "list_strings" or "list_words" is deprecated; to avoid confusion with L, which also returns a list of strings. A deprecation warning and wrap to the method is in place as of version 0.06 if using this name; they will be removed in a subsequent version.
655            
656             =cut
657            
658             sub select_strings {
659 7     7 1 6515 my ( $self, %args ) = @_;
660 7         13 my %patterns = ();
661 7 100       28 if ( hascontent( $args{'regex'} ) ) {
662 1         22 $patterns{'regex'} = qr/$args{'regex'}/xms;
663             }
664 7 100       60 if ( hascontent( $args{'cv_pattern'} ) ) {
665 5         48 my $tmp = q{};
666 5         29 my @c = split m//ms, uc( $args{'cv_pattern'} );
667 5         15 foreach (@c) {
668 36 100       71 $tmp .= $_ eq 'C' ? '[BCDFGHJKLMNPQRSTVWXYZ]' : '[AEIOU]';
669             }
670 5         269 $patterns{'cv_pattern'} = qr/^$tmp$/ixms;
671             }
672            
673 7         30 my @list = ();
674 7 50       281 open my $fh, q{<}, $self->{'_PATH'} or croak $OS_ERROR;
675             LINES:
676 7         178 while (<$fh>) {
677 218 100       5406 next if $INPUT_LINE_NUMBER == 1; # skip column heading line
678 211         927 my @line = split m/\Q$self->{'_DELIM'}\E/xms;
679 211 100       273 next if !_in_range( length( $line[0] ), @{ $args{'length'} } );
  211         371  
680 157         280 for ( keys %patterns ) {
681 131 100       303 next LINES if unidecode( $line[0] ) !~ $patterns{$_};
682             }
683 36         186 for (qw/frq_count frq_opm frq_log frq_zipf cd_count cd_pct cd_log/) {
684 152 100 66     343 if (
685             ref $args{$_}
686             and hascontent(
687             $self->{'_FIELDS'}->datum( $self->{'_LANG'}, $_ . '_idx' )
688             )
689             )
690             {
691             next LINES
692             if !_in_range(
693             _clean_value(
694             $line[
695             $self->{'_FIELDS'}
696             ->datum( $self->{'_LANG'}, $_ . '_idx' )
697             ]
698             ),
699 32 100       601 @{ $args{$_} }
  32         743  
700             );
701             }
702             }
703 16 50       33 if ( ref $args{'pos'} ) {
704             next LINES
705             if none {
706             $_ eq $line[ $self->{'_FIELDS'}
707 0     0   0 ->datum( $self->{'_LANG'}, 'pos_dom_idx' ) ]
708             }
709 0 0       0 @{ $args{'pos'} };
  0         0  
710             }
711 16         63 push @list, $line[0];
712             }
713 7 50       170 close $fh or croak $OS_ERROR;
714            
715 7         73 return \@list;
716             }
717             *select_words = \&select_strings;
718            
719             =head3 all_strings
720            
721             $aref = $subtlex->all_strings();
722            
723             I: all_words
724            
725             Returns a reference to an array of all letter-strings in the corpus. These are culled of empty and duplicate strings, and then alphabetically sorted.
726            
727             =cut
728            
729             sub all_strings {
730 1     1 1 2 my ( $self, %args ) = @_;
731 1         3 my @list = ();
732 1 50       37 open my $fh, q{<}, $self->{'_PATH'} or croak $OS_ERROR;
733 1         29 while (<$fh>) {
734 7 100       145 next if $INPUT_LINE_NUMBER == 1; # skip column heading line
735 6         9 push @list, _get_file_str( $_, $self->{'_DELIM'} );
736             }
737 1 50       46 close $fh or croak $OS_ERROR;
738 1         6 return [ sort { lc($a) cmp lc($b) } uniq( grep { hascontent($_) } @list ) ];
  8         29  
  6         34  
739             }
740             *all_words = \&all_strings;
741            
742             =head3 random_string
743            
744             $string = $subtlex->random_string();
745             @data = $subtlex->random_string();
746            
747             I: random_word
748            
749             Picks a random line from the corpus, using L (except the top header line). Returns the word in that line if called in scalar context; otherwise, the array of data for that line. (A future version might let specifying a match to specific criteria, self-aborting after trying X lines.)
750            
751             =cut
752            
753             sub random_string {
754 2     2 1 489 my ( $self, %args ) = @_;
755 2         5 eval { require File::RandomLine; };
  2         451  
756 2 50       2794 croak 'Need to install and access module File::RandomLine' if $EVAL_ERROR;
757             my $rl =
758 2         55 File::RandomLine->new( $self->{'_PATH'}, { algorithm => 'uniform' } );
759 2         342 my @ari = ();
760 2   66     9 while ( not scalar @ari or $ari[0] eq 'Word' ) {
761 2         17 @ari = split m/\Q$self->{'_DELIM'}\E/xms, $rl->next;
762             }
763 2 100       182 return wantarray ? @ari : $ari[0];
764             }
765             *random_word = \&random_string;
766            
767             =head2 Miscellaneous
768            
769             =head3 n_lines
770            
771             $num = $subtlex->n_lines();
772            
773             Returns the number of lines, less the column headings and any lines with no content, in the installed language file. Expects/accepts no arguments.
774            
775             =cut
776            
777             sub n_lines {
778 3     3 1 1486 my $self = shift;
779 3         8 my $z = 0;
780 3 50       119 open( my $fh, q{<}, $self->{'_PATH'} ) or croak $OS_ERROR;
781 3         46 while (<$fh>) {
782 94 100       159 next if $INPUT_LINE_NUMBER == 1; # skip column heading line
783 91 50       129 next if nocontent($_);
784 91         755 $z++;
785             }
786 3 50       34 close $fh or croak $OS_ERROR;
787 3         17 return $z;
788             }
789             *nlines = \&n_lines; # legacy alias
790            
791             =head3 pct_alpha
792            
793             Returns the percentage of strings in the subtitles file that satisfy "look like words" relative to the number of lines (as per L). Specifically, after ASCII transliteration of the string (per L), does it match to /[\p{XPosixAlpha}\-']/ (per L, but including apostrophes and dashes)?
794            
795             =cut
796            
797             sub pct_alpha {
798 1     1 1 266 my ( $self, %args ) = @_;
799 1         3 my $all_strs_aref = $self->all_strings();
800 1         1 my $count_all = count( @{$all_strs_aref} );
  1         4  
801 1         17 my $pct = q{};
802 1 50       3 if ( $count_all > 0 ) {
803 17     17   145 my $count_alpha_strings = count( grep { m/[\p{XPosixAlpha}\-']/xsm }
  17         62  
  17         286  
  5         31  
804 1         2 map { unidecode($_) } @{$all_strs_aref} );
  5         53  
  1         2  
805 1         12 $pct = 100 * $count_alpha_strings / $count_all;
806             }
807 1         3 return $pct;
808             }
809            
810             =head3 set_lang
811            
812             $lang = $subtlex->set_lang(lang => STR); # DE, FR, NL_all, NL_min, PT, UK or US
813             $lang = $subtlex->set_lang(lang => STR, path => 'this/is/the/file.csv');
814             $lang = $subtlex->set_lang(lang => STR, dir => 'file/is/in/here');
815            
816             Set or guess location of datafile; see L. Naturally, the given value of B (required)--which is used as a database ID--should correspond with any given B to the SUBTLEX datafile (optional but recommended). If only a B value is given, the SUBTLEX datafile should be named so that it uniquely includes the specific value of B.
817            
818             =cut
819            
820             sub set_lang {
821 25     25 1 424 my ( $self, %args ) = @_;
822             ## firstly, establish the language to use, and the directory in which this module lives:
823 25 100       114 return if nocontent($args{'lang'});
824 24         456 $self->_set_spec_hash( $args{'fieldpath'} );
825             #@ is the complete pathname actually given in args?
826 24 50       127 croak 'Need a valid attribute' if not ref $self->{'_FIELDS'}->record( $args{'lang'} );
827            
828 24         435 $self->{'_LANG'} = delete $args{'lang'};
829            
830 24 50       102 if ( hascontent( $args{'path'} ) ) {
831 24 50       744 if ( !-e $args{'path'} ) {
832 0         0 croak
833             "Path given for SUBTLEX corpus does not exist: '$args{'path'}'";
834             }
835             else {
836 24         117 $self->{'_PATH'} = delete $args{'path'};
837             }
838             }
839             else {
840 0         0 my ( $lang, $dir, $path ) = ( $self->{'_LANG'} );
841 0 0       0 if ( $args{'dir'} ) { # check it's a dir:
842             croak "Value for argument 'dir' ($args{'dir'}) is not a directory"
843 0 0       0 if !-d $args{'dir'};
844 0         0 $dir = delete $args{'dir'};
845             }
846             else { # use module's dir :
847 0         0 $dir = $self->{'_MODULE_DIR'};
848             }
849 0         0 for ( path($dir)->children ) {
850 0 0       0 if (/(?:SUBTLEX[\-_])?\Q$lang/imsx) {
851 0         0 $path = $_;
852 0         0 last;
853             }
854             }
855 0 0 0     0 if ( nocontent($path) or not -T $path )
856             { # only already defined if it exists
857 0         0 croak
858             "Cannot find required SUBTLEX datafile for language '$self->{'_LANG'}' within '$dir'.\nInstall the database (from the URL given in the POD) into either:\n\t(1) the Lingua/Norms/SUBTLEX directory within your Perl distribution (with the filename specified in the POD);\n\t(2) a directory you specify to new(dir => 'my/dir/to/lang/file') (again with the filename specified in the POD); or\n\t(3) a directory, specifying the complete path to that file in new(path => 'this/is/the/file.csv'), including its filename";
859             }
860             else {
861 0         0 $self->{'_PATH'} = $path;
862             }
863             }
864            
865             $self->{'_DELIM'} =
866 24         122 get_separator( path => $self->{'_PATH'}, lucky => 1 );
867            
868 24         6454 return $self->{'_LANG'};
869             }
870            
871             =head3 get_lang
872            
873             $str = $subtlex->get_lang();
874            
875             Returns the language code (e.g., 'UK', 'FR') currently set for the module (which determines the file being looked up, if not explicitly given). The empty string is returned if the language has not been set.
876            
877             =cut
878            
879             sub get_lang {
880 8     8 1 3543 my ( $self, %args ) = @_;
881 8 50       35 return hascontent( $self->{'_LANG'} ) ? $self->{'_LANG'} : q{};
882             }
883            
884             =head3 get_path2db
885            
886             $path = $subtlex->get_path2db();
887            
888             Returns the path (directory and filename) from which the module's methods are currently set to look-up strings, frequencies, etc.
889            
890             =cut
891            
892             sub get_path2db {
893 0     0 1 0 my ( $self, %args ) = @_;
894 0         0 return path( $self->{'_PATH'} )->stringify;
895             }
896            
897             =head3 get_index
898            
899             $int = $subtlex->get_index(measure => 'frq_opm');
900            
901             Returns the index within the currently looked-up file that contains the given B.
902            
903             =cut
904            
905             sub get_index {
906 5     5 1 2657 my ( $self, %args ) = @_;
907 5 50       18 my $var = delete $args{'measure'} or croak 'Need a named measure';
908 5         27 return $self->{'_FIELDS'}->datum( $self->{'_LANG'}, $var . '_idx' )
909             ; #{$var};
910             }
911            
912             =head3 set_eq
913            
914             $subtlex->set_eq(match_level => INT); # undef, 0, 1, 2 or 3
915            
916             See L.
917            
918             =head3 url2datafile
919            
920             $url = $subtlex->url2datafile(lang => STRING);
921             %loc = $subtlex->url2datafile(lang => STRING);
922            
923             Returns the URL (complete path) where the SUBTLEX file for a given language is stored, and from which it should be downloadable. These are locations as specified (at the time of releasing this version of the module) at L and/or L, and so as listed in the L section. This could include an archive within which the file needs to be retrieved. Called in list context, this method returns a hash with keys for 'www_dir', 'archive' (if the file is within an archive) and 'filename'. (This module does not fetch the file off the WWW itself; it should be installed and available locally--see L).
924            
925             =cut
926            
927       0 1   sub url2datafile {
928            
929             #my ($self, %args) = @_;
930             #croak 'A value for the argument needs to be given for SUBTLEX url2datafile' if nocontent($args{'lang'});
931             #my $lang = delete $args{'lang'};
932             #croak "The value for the argument => $lang is not recognised" if none { $_ => $lang } (qw/UK US NL DE/);
933             # Hard-copy of WWW dirs, archives (where rel) and filenames for the SUBTLEX files:
934             ## some datafiles are within compressed archives, some not, so ...
935             #my %req_filespecs = %{$path_hash{$lang}};
936            
937             #return wantarray ? %req_filespecs : File::Spec->catfile($req_filespecs{$lang}->{'www_dir'}, $req_filespecs{$lang}->{'archive'}, $req_filespecs{$lang}->{'file'});
938             }
939            
940             ### PRIVATMETHODEN:
941            
942             sub _get_usr_str {
943 118     118   193 my $str = shift;
944 118 50       313 croak 'No string to test; pass a value for to the requested method'
945             if nocontent($str);
946 118         1497 return $str;
947            
948             #return decode( 'UTF-8', $str );#
949             }
950            
951             # Given a line from a SUBTLEX file, return all the characters from the start of the line up to the delimiter for that file, after stripping it of any quote characters - e.g., if the line starts: "abacus",20,30 ... and the delimiter is a comma, return: abacus
952             sub _get_file_str {
953 1680     1680   2999 my ( $line, $delim ) = @_;
954 1680         5977 $line =~ /^([^\Q$delim\E]+)/xms;
955 1680         3186 return trim( unquote($1) );
956            
957             #my $str = decode('UTF-8', trim(unquote($1)) );
958             #print STDERR "<$str>\n";
959             #return $str;
960            
961             #my $code = guess_encoding($str, qw/ascii utf8 utf16 iso-8859-1 cp1250 latin1 greek/);
962             #print STDERR "$str\t", $code->decode($str), "\n";
963             #return $code->decode($str);
964             }
965            
966             sub _get_val_for_str {
967 103     103   237 my ( $str, $col_i, $path, $delim, $eq_fn ) = @_;
968 103 50       199 croak
969             'No word to test; pass a letter-string named \'string\' to the function'
970             if nocontent($str);
971 103 50       906 croak "The requested value is not defined for the current SUBTLEX corpus"
972             if nocontent($col_i);
973            
974 103         858 my $val = q{}; # default value returned is empty string
975 103 50       3889 open( my $fh, q{<}, $path ) or croak $OS_ERROR;
976 103         1448 while (<$fh>) {
977 1522 100       81826 next if $INPUT_LINE_NUMBER == 1; # skip column heading line
978 1419         2025 my $file_str =
979             _get_file_str( $_, $delim ); # have to declare as can be empty (!)
980 1419 50       32686 next if nocontent($file_str);
981 1419 100       12315 if ( $eq_fn->( $str, $file_str ) ) {
982 102         6409 $val = _get_val( $_, $delim, $col_i );
983 102         2576 last;
984             }
985             }
986 103 50       1664 close $fh or croak $OS_ERROR;
987 103         683 return $val;
988             }
989            
990             sub _get_val {
991 102     102   218 my ( $line, $delim, $col_i ) = @_;
992            
993             # if the line has quoted fields, and uses the delimiter within the quotes,
994             # as in SUBTLEX-PT, need to firstly clean the line up:
995             # this "fix" assumes the quotes are either double- or single quotes and nothing else,
996             # and there is no trailing delimiter.
997             # It strips the quotes, and replaces the comma with a vertical bar:
998 102         822 $line =~ s/["']([^"'\Q$delim\E]+)\Q$delim\E([^"'\Q$delim\E]+)["']/$1|$2/gxsm;
999            
1000 102         2205 $line =~ m/^(
1001             [^\Q$delim\E]* # any character from the start not including the delimiter (which might be \t)
1002             \Q$delim\E # now ending with the delimiter, perhaps as a quoted string
1003             )
1004             {$col_i,}? # as many times as necessary to get to the required field value
1005             ([^\Q$delim\E]*) # which should be here
1006             /msx;
1007 102         325 return _clean_value($2); # now format the number, strip space ...
1008             }
1009            
1010             sub _get_val_for_strs {
1011 3     3   7 my ( $str, $col_i_aref, $path, $delim, $eq_fn ) = @_;
1012            
1013             # Check we have a string, and valid filed indices:
1014 3 50       7 croak
1015             'No word to test; pass a letter-string named \'string\' to the function'
1016             if nocontent($str);
1017             croak "The requested value is not defined for the SUBTLEX corpus"
1018 3 50   6   31 if any { nocontent($_) } @{$col_i_aref};
  6         30  
  3         11  
1019            
1020 3         31 my $val = [];
1021            
1022             # Search for the string, and isolate the requested values:
1023 3 50       123 open( my $fh, q{<}, $path ) or croak $OS_ERROR;
1024 3         62 while (<$fh>) {
1025 37 100       162 next if $INPUT_LINE_NUMBER == 1; # skip column heading line
1026 34         47 my $file_str =
1027             _get_file_str( $_, $delim ); # have to declare as can be empty (!)
1028 34 50       746 next if nocontent($file_str);
1029 34 100       284 if ( $eq_fn->( $str, $file_str ) ) {
1030 3         32 my @line = split m/\Q$delim\E/xms;
1031 3         15 for my $col_i ( @{$col_i_aref} ) {
  3         12  
1032 6         70 push @{$val}, _clean_value( $line[$col_i] );
  6         16  
1033             }
1034 3         69 last;
1035             }
1036             }
1037 3 50       49 close $fh or croak;
1038            
1039             # return the reference to array if there is more than 1 value, otherwise just the single value itself
1040             ## but if the string itself was not found, return the empty string for the number of requested fields:
1041 3         13 my $n_vals = scalar grep { hascontent($_) } @{$val};
  6         32  
  3         7  
1042             return
1043             $n_vals
1044             ? $n_vals > 1
1045             ? $val
1046             : $val->[0]
1047 3 50       47 : scalar @{$col_i_aref} > 1 ? [ q{} x scalar @{$col_i_aref} ]
  0 0       0  
  0 50       0  
1048             : q{};
1049             }
1050            
1051             sub _get_any_vals_for_string_list {
1052 1     1   4 my ( $str_aref, $col_i_href, $path, $delim, $eq_fn ) = @_;
1053 1         2 my %string_vals = ();
1054 1         2 my @usr_strings = sort { $a cmp $b } @{$str_aref};
  1         4  
  1         4  
1055            
1056             # Search for the string, and isolate the requested values:
1057 1 50       40 open( my $fh, q{<}, $path ) or croak $OS_ERROR;
1058 1         17 while (<$fh>) {
1059 5 100       28 next if $INPUT_LINE_NUMBER == 1; # skip column heading line
1060 4         9 my $file_str =
1061             _get_file_str( $_, $delim ); # have to declare as can be empty (!)
1062 4 50       135 next if nocontent($file_str);
1063 4 100   7   50 if ( my $found = first { $eq_fn->( $_, $file_str ) } @usr_strings ) {
  7         21  
1064 2         33 my @line = split m/\Q$delim\E/xms; # split the line
1065 2         5 for my $col_i ( keys %{$col_i_href} ) {
  2         7  
1066 6         146 $string_vals{$file_str}->{ $col_i_href->{$col_i} } =
1067             _clean_value( $line[$col_i] );
1068             }
1069 2 100       43 last if scalar keys %string_vals == scalar @{$str_aref};
  2         9  
1070 1     2   11 splice @usr_strings, ( firstidx { $_ eq $found } @usr_strings ), 1;
  2         10  
1071            
1072             #print STDERR "checking ",join(q{,}, @usr_strings),"\n";
1073             }
1074             }
1075 1 50       18 close $fh or croak;
1076 1         12 return \%string_vals;
1077            
1078             # return the reference to array if there is more than 1 value, otherwise just the single value itself
1079             ## but if the string itself was not found, return the empty string for the number of requested fields:
1080             #my $n_vals = scalar grep { hascontent($_) } @{$val};
1081             #return $n_vals ? $n_vals > 1 ? $val : $val->[0] : scalar @{$col_i_aref} > 1 ? [q{} x scalar @{$col_i_aref}] : q{};
1082             }
1083            
1084             # Loads a hash-ref of the "specs" for each language file, including the field indices in each file for the measures they contain:
1085             ## Called only by new() after setting the MODULE_DIR
1086             sub _set_spec_hash {
1087 24     24   76 my ( $self, $fieldpath ) = @_;
1088 24   33     62 $fieldpath ||= File::Spec->catfile( $self->{'_MODULE_DIR'}, 'specs.csv' );
1089 24         204 $self->{'_FIELDS'} = Text::CSV::Hashify->new(
1090             { file => $fieldpath, format => 'hoh', key => 'Lang_stub' } );
1091 24         40402 return;
1092             }
1093            
1094             sub _in_range {
1095 243     243   356 my ( $n, $min, $max ) = @_;
1096 243         265 my $res = 1;
1097 243 50       398 if ( !is_numeric($n) ) {
1098 0         0 $res = 0;
1099             }
1100             else {
1101 243 100 100     3241 if ( hascontent($min) and $n < $min ) { # fails min
1102 48         377 $res = 0;
1103             }
1104 243 100 100     1259 if ( $res && ( hascontent($max) and $n > $max ) ) { # fails max and min
      100        
1105 26         198 $res = 0;
1106             }
1107             }
1108 243         1214 return $res;
1109             }
1110            
1111             sub _clean_value {
1112 146     146   620 my $val = shift;
1113 146 50       356 return q{} if nocontent($val);
1114 146         1427 $val =~ s/,([^,]+)$/.$1/xsm; # replace ultimate , with .
1115 146         338 return trim( unquote($val) );
1116             }
1117            
1118             sub _pos_is {
1119 12     12   499 my ( $pos_aref, $fields, $lang ) = @_;
1120 12 50       35 $pos_aref = [$pos_aref] if !ref $pos_aref;
1121 12         17 my @test_str = map { split /[\W\.]+/xsm } @{$pos_aref};
  12         31  
  12         22  
1122 12 50       28 return [qw/UK/] if !scalar @test_str;
1123 12         16 my @pos_ari = ();
1124 12         22 for my $pos_str (@test_str) {
1125             push @pos_ari, first {
1126             hascontent( $fields->datum( $lang, 'pos_' . $_ ) )
1127 47 100   47   189 and first { $_ =~ m/^$pos_str$/xsm }(split /\|/, $fields->datum( $lang, 'pos_' . $_ ))
  46         1443  
1128             }
1129 12         62 qw/NN VB AJ AV CJ PN PP DA NM IJ NB OT UK/;
1130            
1131             }
1132 12         54 return \@pos_ari;
1133             }
1134            
1135             sub _log10 {
1136 0     0   0 return log(shift) / log(10);
1137             }
1138            
1139             sub _val_or_0 {
1140 46     46   78 my $val = shift;
1141 46 100       118 return ( is_numeric($val) ) ? $val : 0;
1142             }
1143            
1144             sub _croak_defunct {
1145 0     0     croak
1146             'That method is defunct. See the POD for an alternative, and the CHANGES file';
1147             }
1148             *freqhash = \&_croak_defunct;
1149             *ldist = \&_croak_defunct;
1150             *on_count = \&_croak_defunct;
1151             *on_ldist = \&_croak_defunct;
1152             *on_freq_max = \&_croak_defunct;
1153             *on_zipf_mean = \&_croak_defunct;
1154             *on_freq_mean = \&_croak_defunct;
1155             *on_lfreq_mean = \&_croak_defunct;
1156             *on_frq_opm_max = \&_croak_defunct;
1157             *on_frq_opm_max = \&_croak_defunct;
1158             *on_frq_zipf_mean = \&_croak_defunct;
1159            
1160             sub _carp_deprecated {
1161 0     0     my ( $self, %args ) = @_;
1162 0           carp
1163             'That method is deprecated. See the POD for an alternative, and the CHANGES file';
1164 0           return;
1165             }
1166             *list_words = \&_carp_deprecated;
1167             *list_strings = \&_carp_deprecated;
1168            
1169             =head1 DIAGNOSTICS
1170            
1171             =over 4
1172            
1173             =item * Need a valid attribute
1174            
1175             When constructing the class object with L, the B argument must have a valid value, as indicated in the table above. Also, the module needs to read in the contents of a file named "specs.csv" which should be located within the SUBTLEX directory where the module itself is located (alongside the downloaded SUBTLEX files). This file specifies the field indices for the various stats within each SUBTLEX datafile. Check that this file is indeed within the Perl/site/lib/Lingua/Norms/SUBTLEX directory. If it is not, download and install the file to that location via the L package of this module.
1176            
1177             =item * Value given to argument 'dir' (VALUE) in new() is not a directory
1178            
1179             Croaked from L if called with a value for the argument B, and this value is not actually a directory/folder. This is the directory/folder in which the actual SUBTLEX datafiles should be located.
1180            
1181             =item * Cannot find required database for language ...
1182            
1183             Croaked from L if none of the given values to arguments B, B or B are valid, and even the default site/lib directory and US database are not accessible. Check that your have indeed a file with the given value of B (DE, NL, UK or US) within the Perl/site/lib/Lingua/Norms/SUBTLEX directory, or at least that the SUBTLEX-US file is located within it, and can be read via your script.
1184            
1185             =item * Cannot determine fields for given language
1186            
1187             Croaked upon construction if no fields are recognized for the given language. The value given to B must be one of DE, NL, UK or US.
1188            
1189             =item * The requested value is not defined for the ... SUBTLEX corpus
1190            
1191             Croaked when calling for a value for a statistic that is not defined for a given language, e.g., when requesting a value for the Zipf frequency in the NL corpus.
1192            
1193             =item * No string to test; pass a value for to FUNCTION()
1194            
1195             Croaked by several methods that expect a value for the named argument B, and when no such value is given. These methods require the letter-string to be passed to it as a I => I pair, with the key B followed by the value of the string to test.
1196            
1197             =item * No string(s) to test; pass one or more letter-strings named \'strings\' as a referenced array
1198            
1199             Same as above but specifically croaked by L which accepts more than one string in a single call.
1200            
1201             =item * Need to install and have access to module File::RandomLine
1202            
1203             Croaked by method L if the module it depends on (File::RandomLine) is not installed or accessible. This should have been installed (if not already) upon installation of the present module. See L to download and install this module manually.
1204            
1205             =back
1206            
1207             =head1 DEPENDENCIES
1208            
1209             L : for L
1210            
1211             L : for C method
1212            
1213             L : C, C, C, C and other functions
1214            
1215             L : C
1216            
1217             L : for directory reading when calling L
1218            
1219             L : for various statistical methods
1220            
1221             L : C
1222            
1223             L : for determining valid string values
1224            
1225             L : reads in the specs file
1226            
1227             L : for determining the field delimiter within the datafiles
1228            
1229             L : for plain ASCII transliterations of Unicode text
1230            
1231             =head1 REFERENCES
1232            
1233             Brysbaert, M., Buchmeier, M., Conrad, M., Jacobs, A.M., Boelte, J., & Boehl, A. (2011). The word frequency effect: A review of recent developments and implications for the choice of frequency estimates in German. I, I<58>, 412-424. doi: L<10.1027/1618-3169/a000123|http://dx.doi.org/10.1027/1618-3169/a000123>
1234            
1235             Brysbaert, M., & New, B. (2009). Moving beyond Kucera and Francis: A critical evaluation of current word frequency norms and the introduction of a new and improved word frequency measure for American English. I, I<41>, 977-990. doi: L<10.3758/BRM.41.4.977|http://dx.doi.org/10.3758/BRM.41.4.977>
1236            
1237             Brysbaert, M., New, B., & Keuleers,E. (2012). Adding part-of-speech information to the SUBTLEX-US word frequencies. I, I<44>, 991-997. doi: L<10.3758/s13428-012-0190-4|http://dx.doi.org/10.3758/s13428-012-0190-4>
1238            
1239             Herdagdelen, A., & Marelli, M. (2017). Social media and language processing: How Facebook and Twitter provide the best frequency estimates for studying word recognition. I, I<41>, 976-995. doi:L<10.1111/cogs.12392|http://dx.doi.org/10.1111/cogs.12392>
1240            
1241             Keuleers, E., Brysbaert, M., & New, B. (2010). SUBTLEX-NL: A new frequency measure for Dutch words based on film subtitles. I, I<42>, 643-650. doi: L<10.3758/BRM.42.3.643|http://dx.doi.org/10.3758/BRM.42.3.643>
1242            
1243             New, B., Brysbaert, M., Veronis, J., & Pallier, C. (2007). The use of film subtitles to estimate word frequencies. I, I<28>, 661-677.
1244            
1245             Soares, A. P., Machado, J., Costa, A., Comesaña, M., & Perea, M. (in press). On the advantages of frequency measures extracted from subtitles: The case of Portuguese. I.
1246            
1247             Van Heuven, W. J. B., Mandera, P., Keuleers, E., & Brysbaert, M. (2014). SUBTLEX-UK: A new and improved word frequency database for British English. I, I<67>, 1176-1190. doi: L<10.1080/17470218.2013.850521|http://dx.doi.org/10.1080/17470218.2013.850521>
1248            
1249             =head1 AUTHOR
1250            
1251             Roderick Garton, C<< >>
1252            
1253             =head1 BUGS AND LIMITATIONS
1254            
1255             Please report any bugs or feature requests to C, or through
1256             the web interface at L. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.
1257            
1258             =head1 SUPPORT
1259            
1260             You can find documentation for this module with the perldoc command.
1261            
1262             perldoc Lingua::Norms::SUBTLEX
1263            
1264             You can also look for information at:
1265            
1266             =over 4
1267            
1268             =item * RT: CPAN's request tracker (report bugs here)
1269            
1270             L
1271            
1272             =item * AnnoCPAN: Annotated CPAN documentation
1273            
1274             L
1275            
1276             =item * CPAN Ratings
1277            
1278             L
1279            
1280             =item * Search CPAN
1281            
1282             L
1283            
1284             =back
1285            
1286             =head1 LICENSE AND COPYRIGHT
1287            
1288             Copyright 2014-2018 Roderick Garton.
1289            
1290             This program is free software; you can redistribute it and/or modify it
1291             under the terms of either: the GNU General Public License as published
1292             by the Free Software Foundation; or the Artistic License.
1293            
1294             See L for more information.
1295            
1296             To the maximum extent permitted by applicable law, the author of this module disclaims all warranties, either express or implied, including but not limited to implied warranties of merchantability and fitness for a particular purpose, with regard to the software and the accompanying documentation.
1297            
1298             =cut
1299            
1300             1; # End of Lingua::Norms::SUBTLEX