File Coverage

blib/lib/Lingua/Identify/Blacklists.pm
Criterion Covered Total %
statement 197 303 65.0
branch 70 144 48.6
condition 21 42 50.0
subroutine 24 30 80.0
pod 8 19 42.1
total 320 538 59.4


line stmt bran cond sub pod time code
1             #-*-perl-*-
2              
3             package Lingua::Identify::Blacklists;
4              
5 4     4   128194 use 5.008;
  4         15  
  4         380  
6 4     4   21 use strict;
  4         8  
  4         139  
7              
8 4     4   3848 use File::ShareDir qw/dist_dir/;
  4         31569  
  4         619  
9 4     4   40 use File::Basename qw/dirname/;
  4         8  
  4         290  
10 4     4   4362 use File::GetLineMaxLength;
  4         1626  
  4         328  
11              
12 4     4   3798 use Lingua::Identify qw(:language_identification);;
  4         346792  
  4         925  
13 4     4   4219 use Lingua::Identify::CLD;
  4         28607  
  4         193  
14              
15 4     4   47 use Exporter 'import';
  4         7  
  4         8559  
16             our @EXPORT_OK = qw( identify identify_file identify_stdin
17             train train_blacklist run_experiment
18             available_languages available_blacklists );
19             our %EXPORT_TAGS = ( all => \@EXPORT_OK );
20              
21             our $VERSION = '0.04';
22              
23             =encoding UTF-8
24              
25             =head1 NAME
26              
27             Lingua::Identify::Blacklists - Language identification for related languages based on blacklists
28              
29             =head1 VERSION
30              
31             Version 0.03
32              
33             =head1 SYNOPSIS
34              
35             use Lingua::Identfy::Blacklists qw/:all/;
36              
37             # detect language for a given text
38             # (discriminate between Bosanian, Croatian and Serbian)
39             my $lang = identify( ".... text to be classified ...",
40             langs => ['bs','hr','sr']);
41              
42             # check if the assumed language ('hr') is confused with another one
43             my $lang = identify( ".... text to be classified ...", assumed => 'hr' );
44              
45             # use a general-purpose identfier and check confusable langauges if necessary
46             my $lang = identify( ".... text to be classified ...");
47              
48             # delect language in the given file (Unicode UTF-8 is assumed)
49             my $lang = identify_file( $filename, langs => [...] );
50             my $lang = identify_file( $filename, assumed => '..' );
51             my $lang = identify_file( $filename );
52              
53             # delect language for every line separately from the given file
54             # (return a list of lang-IDs)
55             my @langs = identify_file( $filename, every_line => 1, langs = [...] );
56             my @langs = identify_file( $filename, every_line => 1, assumed = '..' );
57             my @langs = identify_file( $filename, every_line => 1 );
58              
59              
60             # learn classifiers (blacklists) for all pairs of languages
61             # given some training data
62             train( { cs => $file_with_cs_text,
63             sk => $file_with_sk_text,
64             pl => $file_with_pl_text } );
65              
66             # learn a blacklist from a given pair of texts (prints to STDOUT)
67             train_blacklist( $filename1, $filename2 );
68              
69             # ... the same but write to outfile
70             train_blacklist( $filename1, $filename2, outfile => $outfilename );
71              
72             # train and evaluate the classification using given training/test data
73             my @traindata = ($trainfile1, $trainfile2, $trainfile3);
74             my @evaldata = ($testfile1, $testfile2, $testfile3);
75             run_experiment(\@traindata, \@evaldata, $lang1 $lang2, $lang3);
76              
77             # train with different parameters (optional)
78             my %para = (
79             min_high => 5, # minimal token frequency in one langusgae
80             max_low => 2, # maximal token frequency in the other language
81             min_diff => 0.7 ); # score difference threshold
82              
83             train( { cs => $file_with_cs_text, sk => $file_with_sk_text }, %para );
84              
85             =head1 Description
86              
87             This module adds a blacklist classifier to a general purpose language identification tool. Related languages can easily be confused with each other and standard language detection tools do not work very well for distinguishing them. With this module one can train so-called blacklists of words for language pairs containing words that should not (or very rarely) occur in one language while being quite common in the other. These blacklists are then used to discriminate between those "confusable" related languages.
88              
89             Since version 0.03 it also integrates a standard language identifier (Lingua::Identify::CLD) and can now be used for general language identification. It calls the blacklist classifier only for those languages that can be confused and for which appropriate blacklists are trained.
90              
91              
92             =head1 Settings
93              
94             Module-internal variables that can be modified:
95              
96             $BLACKLISTDIR # directory with all blacklists (default: module-share-dir)
97             $LOWERCASE # lowercase all data, yes/no (1/0), default: 1
98             $TOKENIZE # tokenize all data, yes/no (1/0), default: 1
99             $ALPHA_ONLY # don't use tokens with non-alphabetic characters, default: 1
100             $MAX_LINE_LENGTH # max line length when reading from files (default=2**16)
101             $CLD_TEXT_SIZE # text size in characters used for language ident. with CLD
102             $VERBOSE # verbose output (default=0)
103              
104             Tokenization is very simple and replaces all non-alphabetic characters with a white-space character.
105              
106             =cut
107              
108              
109             our $BLACKLISTDIR;
110             eval{ $BLACKLISTDIR = &dist_dir('Lingua-Identify-Blacklists') . '/blacklists' };
111              
112             our $LOWERCASE = 1;
113             our $TOKENIZE = 1;
114             our $ALPHA_ONLY = 1;
115             our $MAX_LINE_LENGTH = 2**16; # limit the length of one line to be read
116             our $CLD_TEXT_SIZE = 2**16; # text size used for detecting lang with CLD
117             our $VERBOSE = 0;
118              
119             my %blacklists = (); # hash of blacklists (langpair => {blacklist}, ...)
120             my %confusable = (); # hash of confusable languages (lang => [other_langs])
121              
122             ## the compact language identifier from Google Chrome
123             my $CLD = new Lingua::Identify::CLD;
124              
125              
126             # load all blacklists in the gneral BLACKLISTDIR
127             &load_blacklists( $BLACKLISTDIR );
128              
129              
130              
131              
132             =head1 Exported Functions
133              
134             =head2 C<$langID = identify( $text [,%options] )>
135              
136             Analyses a given text and returns a language ID as the result of the classification. C<%options> can be used to change the behaviour of the classifier. Possible options are
137              
138             assumed => $assumed_lang
139             langs => \@list_of_possible_langs
140             use_margin => $score
141              
142             If C are specified, it runs the classifier with blacklists for those languages (in a cascaded way, i.e. best1 = lang1 vs lang2, best2 = best1 vs lang3, ...). If C is specified, it runs all versus all and returns the language that wins the most (with margin=$score).
143              
144             If the C language is given, it runs the blacklist classifier for all languages that can be confused with $assumed_lang (if blacklist models exist for them).
145              
146             If neither C not C are specified, it first runs a general-purpose language identification (using Lingua::Identify::CLD and Lingua::Identify) and then checks with the blacklist classifier whether the detected language can be confused with another one. For example, CLD frequently classifies Serbian and Bosnian texts as Croatian but the blacklist classifier will detect that (and hopefully correct the decision).
147              
148             =cut
149              
150             sub identify{
151 605     605 1 3349 my $text = shift;
152 605         1703 my %options = @_;
153              
154 605         1207 my %dic = ();
155 605         1052 my $total = 0;
156              
157             # run the blacklist classifier if 'langs' are specified
158 605 50       2004 if (exists $options{langs}){
159 0         0 &process_string( $text, \%dic, $total, $options{text_size} );
160 0         0 return &classify( \%dic, %options );
161             }
162              
163             # otherwise: check if there is an 'assumed' language
164             # if not: classify with CLD
165 605 50       2568 $options{assumed} = &identify_language( $text )
166             unless (exists $options{assumed});
167              
168             # if there is an 'assumed' language:
169             # check if it can be confused with others (i.e. blacklists exist)
170 605 100       2494 if (exists $confusable{$options{assumed}}){
171 603         1773 $options{langs} = $confusable{$options{assumed}};
172             # finally: process the text and classify
173 603         1997 &process_string( $text, \%dic, $total );
174 603         3437 return &classify( \%dic, %options );
175             }
176 2         10 return $options{assumed};
177             }
178              
179              
180              
181             =head2 C<$langID = identify_file( $filename [,%options] )>
182              
183             Does the same as C but reads text from a file. It also takes the same options as the 'identify' function but allows two extra options:
184              
185             text_size => $size, # number of tokens to be used for classification
186             every_line => 1
187              
188             Using the C option, the classifier checks every input line seperately and returns a list of language ID's.
189              
190             @langIDs = identify_file( $filename, every_line => 1, %options )
191              
192             =cut
193              
194              
195              
196             sub identify_file{
197 6     6 1 3480 my $file = shift;
198 6         36 my %options = @_;
199            
200 6         12 my %dic = ();
201 6         14 my $total = 0;
202 6         18 my @predictions = ();
203            
204 6 50       39 my $fh = defined $file ? open_file($file) : *STDIN;
205 6         70 my $reader = File::GetLineMaxLength->new($fh);
206              
207             # mode 1: classify every line separately
208 6 100       206 if ($options{every_line}){
209 3         10 my @predictions = ();
210 3         17 while (my $line = $reader->getline($MAX_LINE_LENGTH)) {
211 600         121627 chomp $line;
212 600         2223 push( @predictions, &identify( $line, %options ) );
213             }
214 3         636 return @predictions;
215             }
216              
217             # mode 2: classify all text together (optional: size limit)
218 3         9 my $text = '';
219 3         14 while (my $line = $reader->getline($MAX_LINE_LENGTH)) {
220              
221             # save text if no languages are given (for blacklists)
222 600 50 33     121868 unless (exists $options{langs} || exists $options{assumed}){
223 600 100       13479 if ( length($text) < $CLD_TEXT_SIZE ){
224 72         865 $text .= $line;
225             }
226             }
227              
228             # prepare the data for blacklist classification
229             # (TODO: is this cheaper than keeping the text in memory and
230             # processing it later when needed?)
231 600         1275 chomp $line;
232 600         1491 &process_string($line,\%dic,$total);
233 600 50       4887 if ($options{text_size}){ # use only a certain number of words
234 0 0       0 if ($total > $options{text_size}){
235 0 0       0 print STDERR "use $total tokens for classification\n"
236             if ($VERBOSE);
237 0         0 last;
238             }
239             }
240             }
241              
242             # no languages selected?
243 3 50       88 unless (exists $options{langs}){
244             # no assumed language set
245 3 50       9 unless (exists $options{assumed}){
246             # try to identify with the text we have saved above
247 3 50       23 $options{assumed} = &identify_language( $text )
248             unless (exists $options{assumed});
249             }
250 3 50       18 if (exists $confusable{$options{assumed}}){
251 3         11 $options{langs} = $confusable{$options{assumed}};
252             }
253             }
254              
255             # finally: classify with blacklists
256 3 50       16 if (exists $options{langs}){
257 3         48 return &classify( \%dic, %options );
258             }
259              
260             # no blacklists in this case ...
261 0         0 return $options{assumed};
262             }
263              
264              
265              
266             =head2 C<$langID = identify_stdin( [,%options] )>
267              
268             The same as C but reads from STDIN
269              
270             =cut
271              
272              
273             sub identify_stdin{
274 0     0 1 0 return identify_file( undef, @_ );
275             }
276              
277              
278              
279              
280             =head2 C
281              
282             Trains classifiers by learning blacklisted words for pairwise language discrimination. Returns nothing. Blacklists are stored in C. You may have to run the process as administrator if you don't have write permissions.
283              
284             C<%traindata> is a hash of training data files associated with their corresponding language IDs:
285              
286             'hr' => $croatian_text_file,
287             'sr' => $serbian_text_file,
288             ...
289              
290             C<%options> is a hash of optional parameters that change the behaviour of the learning algorithm. Possible parameters are:
291              
292             min_high => $freq1, # minimal token frequency in one langusgae
293             max_low => $freq2, # maximal token frequency in the other language
294             min_diff => $score, # score difference threshold
295             text_size => $size, # maximum number of tokens to be used per text
296              
297              
298             =cut
299              
300              
301             sub train{
302 1     1 1 25 my $traindata = shift;
303 1         3 my %options = @_;
304              
305 1         2 my @langs = keys %{$traindata};
  1         7  
306              
307 1         6 for my $s (0..$#langs){
308 3         19 for my $t ($s+1..$#langs){
309 3         267 print "traing blacklist for $langs[$s]-$langs[$t] ... ";
310 3         39 &train_blacklist( $$traindata{$langs[$s]},$$traindata{$langs[$t]},
311             outfile => "$BLACKLISTDIR/$langs[$s]-$langs[$t].txt",
312             %options );
313 3         986 print "saved in '$BLACKLISTDIR/$langs[$s]-$langs[$t].txt'\n";
314             }
315             }
316             }
317              
318              
319             =head2 C
320              
321             This function learns a blacklist of words to discriminate between the language given in $file1 and the language given in $file2. It takes the same arguments (%options) as the C function above with one additional parameter:
322              
323             outfile => $output_file
324              
325             Using this parameter, the blacklist will be written to the specified file. Otherwise it will be printed to STDOUT.
326              
327             The function returns nothing otherwise.
328              
329             =cut
330              
331              
332              
333             sub train_blacklist{
334 3     3 1 15 my ($file1,$file2,%options) = @_;
335              
336 3 50       16 my $min_high = exists $options{min_high} ? $options{min_high} : 10;
337 3 50       85 my $max_low = exists $options{min_low} ? $options{max_low} : 3;
338 3 50       10 my $min_diff = exists $options{min_diff} ? $options{min_diff} : 0.8;
339              
340 3         9 my %dic1=();
341 3         6 my %dic2=();
342              
343 3         24 my $total1 = &read_file($file1,\%dic1,$options{text_size});
344 3         29 my $total2 = &read_file($file2,\%dic2,$options{text_size});
345              
346 3 50       24 if ($options{outfile}){
347 3 50       386 mkdir dirname($options{outfile}) unless (-d dirname($options{outfile}));
348 3   50     483 open O,">$options{outfile}" || die "cannot write to $options{outfile}\n";
349 3         63 binmode(O,":encoding(UTF-8)");
350             }
351              
352 3         24432 foreach my $w (keys %dic1){
353 57240 100 66     418085 next if ((!exists $dic1{$w} || $dic1{$w}<$min_high) &&
      100        
      66        
354             (!exists $dic2{$w} || $dic2{$w}<$min_high));
355 3691 100 66     26389 next if ((exists $dic1{$w} && $dic1{$w}>$max_low) &&
      100        
      66        
356             (exists $dic2{$w} && $dic2{$w}>$max_low));
357              
358 926 50       2080 my $c1 = exists $dic1{$w} ? $dic1{$w} : 0;
359 926 100       2115 my $c2 = exists $dic2{$w} ? $dic2{$w} : 0;
360              
361 926         1366 my $s1 = $c1 * $total2;
362 926         945 my $s2 = $c2 * $total1;
363 926         1446 my $diff = ($s1 - $s2) / ($s1 + $s2);
364              
365 926 100       2434 if (abs($diff) > $min_diff){
366 468 50       810 if ($options{outfile}){
367 468         4864 print O "$diff\t$w\t$c1\t$c2\n";
368             }
369             else{
370 0         0 print "$diff\t$w\t$c1\t$c2\n";
371             }
372             }
373             }
374             # don't forget words that do NOT appear in dic1!!!
375 3         52112 foreach my $w (keys %dic2){
376 69888 100       169996 next if (exists $dic1{$w});
377 48705 100       155449 next if ($dic2{$w}<10);
378 429 50       920 my $c1 = exists $dic1{$w} ? $dic1{$w} : 0;
379 429 50       939 my $c2 = exists $dic2{$w} ? $dic2{$w} : 0;
380 429 50       858 if ($options{outfile}){
381 429         3045 print O "-1\t$w\t$c1\t$c2\n";
382             }
383             else{
384 0         0 print "-1\t$w\t$c1\t$c2\n";
385             }
386             }
387 3 50       58813 close O if ($options{outfile});
388             }
389              
390             =head2 C<@langs = available_languages()>
391              
392             Returns a list of languages covered by the blacklists in the BLACKLISTDIR.
393              
394             =cut
395              
396             sub available_languages{
397 0 0   0 1 0 unless (keys %blacklists){
398 0         0 &load_blacklists( $BLACKLISTDIR );
399             }
400 0         0 my %langs = ();
401 0         0 foreach (keys %blacklists){
402 0         0 my ($lang1,$lang2) = split(/\-/);
403 0         0 $langs{$lang1}=1;
404 0         0 $langs{$lang2}=1;
405             }
406 0         0 return keys %langs;
407             }
408              
409              
410             =head2 C<%lists = available_blacklists()>
411              
412             Resturns a hash of available language pairs (for which blacklists exist in the system).
413              
414             %lists = ( srclang1 => { trglang1a => blacklist1a, trglang1b => blacklist1b },
415             srclang2 => { trglang2a => blacklist2a, ... }
416             .... )
417              
418             =cut
419              
420              
421             sub available_blacklists{
422 4 50   4 1 33 unless (keys %blacklists){
423 0         0 &load_blacklists( $BLACKLISTDIR );
424             }
425 4         12 my %pairs = ();
426 4         24 foreach (keys %blacklists){
427 24         62 my ($lang1,$lang2) = split(/\-/);
428 24         87 $pairs{$lang1}{$lang2} = $_;
429 24 50 66     134 $pairs{$lang2}{$lang1} = $_
430             unless (defined $pairs{$lang2} && defined $pairs{$lang2}{$lang1});
431             }
432 4         46 return %pairs;
433             }
434              
435              
436              
437              
438             =head2 C
439              
440             This function allows to run experiments, i.e. training and evaluating classifiers for the given languages (C<@langs>). The arrays of training data and test data need to be of the same size as C<@langs>. The function prints the overall accurcy and a confusion table given the data sets and the classification. C<%options> can be used to set classifier-specific parameters.
441              
442             =cut
443              
444              
445             sub run_experiment{
446              
447 4     4   5556 use Benchmark;
  4         40594  
  4         40  
448              
449 0     0 1 0 my $trainfiles = shift;
450 0         0 my $evalfiles = shift;
451 0 0       0 my $options = ref($_[0]) eq 'HASH' ? shift : {};
452              
453 0         0 my @traindata =
454 0 0       0 ref($trainfiles) eq 'ARRAY' ? @{$trainfiles} : split(/\s+/,$trainfiles);
455 0         0 my @evaldata =
456 0 0       0 ref($evalfiles) eq 'ARRAY' ? @{$evalfiles} : split(/\s+/,$evalfiles);
457 0         0 my @langs = @_;
458              
459 0 0       0 die "no languages given!\n" unless (@langs);
460 0 0 0     0 die "no training nor evaluation data given!\n"
461             unless ($#traindata == $#evaldata || $#traindata == $#langs);
462              
463 0         0 my %trainset = ();
464 0         0 for (0..$#langs){ $trainset{$langs[$_]} = $traindata[$_]; }
  0         0  
465              
466             # train blacklists
467              
468 0 0       0 if ($#traindata == $#langs){
469 0   0     0 $BLACKLISTDIR = $$options{blacklist_dir} || "blacklist-experiment";
470 0         0 my $t1 = new Benchmark;
471 0         0 &train( \%trainset, %{$options} );
  0         0  
472 0         0 print STDERR "training took: ".
473             timestr(timediff(new Benchmark, $t1)).".\n";
474             }
475              
476 0         0 &initialize();
477              
478             # classify test data
479              
480 0 0       0 if ($#evaldata == $#langs){
481 0         0 print STDERR "classify ....\n";
482              
483 0         0 my $correct=0;
484 0         0 my $count=0;
485 0         0 my %guesses=();
486              
487 0         0 my %correct_lang=();
488 0         0 my %count_lang=();
489              
490 0         0 my $t1 = new Benchmark;
491 0         0 foreach my $i (0..$#langs){
492 0   0     0 open IN,"<:encoding(UTF-8)",$evaldata[$i] || die "...";
493 0         0 while (){
494 0         0 chomp;
495 0         0 my %dic = ();
496 0         0 &process_string($_,\%dic);
497 0         0 my $guess = &classify(\%dic,@langs);
498 0         0 $count++;
499 0         0 $count_lang{$langs[$i]}++;
500 0 0       0 if ($guess eq $langs[$i]){
501 0         0 $correct++;
502 0         0 $correct_lang{$langs[$i]}++;
503             }
504 0         0 $guesses{$langs[$i]}{$guess}++;
505             }
506 0         0 close IN;
507             }
508 0         0 print STDERR "classification took: ".
509             timestr(timediff(new Benchmark, $t1)).".\n";
510              
511 0         0 printf "accuracy: %6.4f\n ",$correct/$count;
512 0         0 foreach my $c (@langs){
513 0         0 print " $c";
514             }
515 0         0 print "\n";
516 0         0 foreach my $c (@langs){
517 0         0 print "$c ";
518 0         0 foreach my $g (@langs){
519 0         0 printf "%4d",$guesses{$c}{$g};
520             }
521 0         0 printf " %6.4f",$correct_lang{$c}/$count_lang{$c};
522 0         0 print "\n";
523             }
524             }
525 0         0 system("wc -l $Lingua::Identify::Blacklists::BLACKLISTDIR/*.txt");
526             }
527              
528              
529             =head2 Module-internal functions
530              
531             The following functions are not exported and are mainly used for internal purposes (but may be used from the outside if needed).
532              
533             initialize() # reset the repository of blacklists
534             identify_language($text) # return lang-ID for $text (using CLD)
535             classify(\%dic,%options) # run the classifier
536             classify_cascaded(\%dic,@langs) # run a cascade of binary classifications
537              
538             # run all versus all and return the one that wins most binary decisions
539             # (a score margin is used to adjust the reliability of the decisions)
540              
541             classify_with_margin(\%dic,$margin,@langs)
542              
543             load_blacklists($dir) # load all blacklists available in $dir
544             load_blacklist(\%list,$dir, # load a lang-pair specific blacklist
545             $lang1,$lang2)
546             read_file($file,\%dic,$max) # read a file and count token frequencies
547             process_string($string) # process a given string (lowercasing ...)
548              
549             =cut
550              
551              
552 0     0 0 0 sub initialize{ %blacklists = (); %confusable = (); }
  0         0  
553              
554             sub identify_language{
555 608     608 0 3136 my ($lang, $id, $conf) = $CLD->identify( $_[0] );
556              
557             # strangely enough CLD is not really reliable for English
558             # (all kinds of garbish input is recognized as English)
559             # --> check with Lingua::Identify
560 608 100       290648 if ($id eq 'en'){
561 2 100       12 $id = $id = langof( $_[0] ) ? $id : 'unknown';
562             }
563 608         9051 return $id;
564             }
565              
566             sub classify{
567 606     606 0 1078 my $dic = shift;
568 606         2488 my %options = @_;
569 606 50       1848 $options{langs} = '' unless ($options{langs});
570              
571 606         1975 my @langs = ref($options{langs}) eq 'ARRAY' ?
572 606 50       2204 @{$options{langs}} : split( /\s+/, $options{langs} ) ;
573              
574 606 50       1501 @langs = available_languages() unless (@langs);
575              
576 606 50       1540 return &classify_with_margin( $dic, $options{use_margin}, @langs )
577             if ($options{use_margin});
578 606         1531 return &classify_cascaded( $dic, @langs );
579             }
580              
581             sub classify_cascaded{
582 606     606 0 852 my $dic = shift;
583 606         1444 my @langs = @_;
584              
585 606         920 my $lang1 = shift(@langs);
586 606         1159 foreach my $lang2 (@langs){
587              
588             # load blacklists on demand
589 1212 100       4382 unless (exists $blacklists{"$lang1-$lang2"}){
590 4         22 $blacklists{"$lang1-$lang2"}={};
591 4         24 &load_blacklist($blacklists{"$lang1-$lang2"},
592             $BLACKLISTDIR,$lang1,$lang2);
593             }
594 1212         2418 my $list = $blacklists{"$lang1-$lang2"};
595              
596 1212         1630 my $score = 0;
597 1212         1318 foreach my $w (keys %{$dic}){
  1212         124792  
598 448258 100       1066843 if (exists $$list{$w}){
599 32881         88165 $score += $$dic{$w} * $$list{$w};
600 32881 50       73836 print STDERR "$$dic{$w} x $w found ($$list{$w})\n" if ($VERBOSE);
601             }
602             }
603 1212 100       47538 if ($score < 0){
604 389         710 $lang1 = $lang2;
605             }
606 1212 50       3976 print STDERR "select $lang1 ($score)\n" if ($VERBOSE);
607             }
608 606         42116 return $lang1;
609             }
610              
611              
612             # OTHER WAY OF CLASSIFYING
613             # test all against all ...
614              
615             sub classify_with_margin{
616 0     0 0 0 my $dic = shift;
617 0         0 my $margin = shift;
618 0         0 my @langs = @_;
619              
620 0         0 my %selected = ();
621 0         0 while (@langs){
622 0         0 my $lang1 = shift(@langs);
623 0         0 foreach my $lang2 (@langs){
624              
625             # load blacklists on demand
626 0 0       0 unless (exists $blacklists{"$lang1-$lang2"}){
627 0         0 $blacklists{"$lang1-$lang2"}={};
628 0         0 &load_blacklist($blacklists{"$lang1-$lang2"},
629             $BLACKLISTDIR,$lang1,$lang2);
630             }
631 0         0 my $list = $blacklists{"$lang1-$lang2"};
632              
633 0         0 my $score = 0;
634 0         0 foreach my $w (keys %{$dic}){
  0         0  
635 0 0       0 if (exists $$list{$w}){
636 0         0 $score += $$dic{$w} * $$list{$w};
637 0 0       0 print STDERR "$$dic{$w} x $w found ($$list{$w})\n"
638             if ($VERBOSE);
639             }
640             }
641 0 0       0 next if (abs($score) < $margin);
642 0 0       0 if ($score < 0){
643             # $selected{$lang2}-=$score;
644 0         0 $selected{$lang2}++;
645 0 0       0 print STDERR "select $lang2 ($score)\n" if ($VERBOSE);
646             }
647             else{
648             # $selected{$lang1}+=$score;
649 0         0 $selected{$lang1}++;
650 0 0       0 print STDERR "select $lang1 ($score)\n" if ($VERBOSE);
651             }
652             }
653             }
654 0         0 my ($best) = sort { $selected{$b} <=> $selected{$a} } keys %selected;
  0         0  
655 0         0 return $best;
656             }
657              
658              
659             # load_all_blacklists = alias for load_blacklists
660              
661 0     0 0 0 sub load_all_blacklists{ return load_blacklists(@_); }
662              
663             sub load_blacklists{
664 4   33 4 0 27 my $dir = shift || $BLACKLISTDIR;
665              
666 4 50       219 opendir(my $dh, $dir) || die "cannot read directory '$dir'\n";
667 4         108 while(readdir $dh) {
668 32 100       458 if (/^(.*)-(.*).txt$/){
669 24         262 $blacklists{"$1-$2"}={};
670 24         177 &load_blacklist($blacklists{"$1-$2"}, $dir, $1, $2);
671             }
672             }
673 4         196 closedir $dh;
674              
675             # update list of confusable languages
676 4         26 my %lists = &available_blacklists();
677 4         22 foreach my $lang (keys %lists){
678 36         39 @{$confusable{$lang}} = keys %{$lists{$lang}};
  36         86  
  36         81  
679 36         47 unshift( @{$confusable{$lang}}, $lang );
  36         204  
680             }
681             }
682              
683              
684             sub load_blacklist{
685 28     28 0 214 my ($list,$dir,$lang1,$lang2) = @_;
686              
687 28         57 my $inverse = 0;
688 28 100       1974 if (! -e "$dir/$lang1-$lang2.txt"){
689 4         15 ($lang1,$lang2) = ($lang2,$lang1);
690 4         7 $inverse = 1;
691             }
692              
693 4   50 4   46 open F,"<:encoding(UTF-8)","$dir/$lang1-$lang2.txt" || die "...";
  4         9  
  4         40  
  28         1944  
694 28         105837 while (){
695 348340         688923 chomp;
696 348340         1019433 my ($score,$word) = split(/\t/);
697 348340 100       1886411 $$list{$word} = $inverse ? 0-$score : $score;
698             }
699 28         1732 close F;
700             }
701              
702             sub open_file{
703 12     12 0 23 my $file = shift;
704             # allow gzipped input
705 12         22 my $fh;
706 12 50       61 if ($file=~/\.gz$/){
707 0   0     0 open $fh,"gzip -cd < $file |" || die "cannot open file '$file'";
708 0         0 binmode($fh,":encoding(UTF-8)");
709             }
710             else{
711 12   50     1227 open $fh,"<:encoding(UTF-8)",$file || die "cannot open file '$file'";
712             }
713 12         1262 return $fh;
714             }
715              
716             sub read_file{
717 6     6 0 22 my ($file,$dic,$max)=@_;
718              
719             # use File::GetLineMaxLength to avoid filling the memory
720             # when reading from files without new lines
721 6         22 my $fh = open_file( $file );
722 6         68 my $reader = File::GetLineMaxLength->new($fh);
723              
724 6         166 my $total = 0;
725 6         76 while (my $line = $reader->getline($MAX_LINE_LENGTH)) {
726 1200         242853 chomp $line;
727 1200         2835 &process_string($line,$dic,$total);
728 1200 50       9707 if ($max){
729 0 0       0 if ($total > $max){
730 0         0 print STDERR "read $total tokens from $file\n";
731 0         0 last;
732             }
733             }
734             }
735 6         517 close $fh;
736 6         80 return $total;
737             }
738              
739              
740             # process_string($string,\%dic,\$wordcount[,$maxwords])
741              
742             sub process_string{
743 2403 50   2403 0 183010 $_[0]=lc($_[0]) if ($LOWERCASE);
744 4 50   4   10645 $_[0]=~s/((\A|\s)\P{IsAlpha}+|\P{IsAlpha}+(\s|\Z))/ /gs if ($TOKENIZE);
  4         11  
  4         92  
  2403         2831403  
745              
746 2403 50       817104 my @words = $ALPHA_ONLY ?
747             grep(/^\p{IsAlpha}/,split(/\s+/,$_[0])) :
748             split(/\s+/,$_[0]);
749              
750             # use only $maxwords words
751 2403 50       62516 splice(@words,$_[3]) if ($_[3]);
752              
753 2403         5741 foreach my $w (@words){${$_[1]}{$w}++;$_[2]++;}
  1016058         945553  
  1016058         2204555  
  1016058         1345165  
754             }
755              
756             1;
757              
758             __END__