File Coverage

blib/lib/Encode/Detect/Upload.pm
Criterion Covered Total %
statement 244 354 68.9
branch 96 206 46.6
condition 30 69 43.4
subroutine 19 20 95.0
pod 11 11 100.0
total 400 660 60.6


line stmt bran cond sub pod time code
1             package Encode::Detect::Upload;
2              
3             our $VERSION=0.04;
4              
5             =encoding utf8
6              
7             =head1 NAME
8              
9             Encode::Detect::Upload - Attempt to guess user's locale encoding from IP,
10             HTTP_ACCEPT_LANGUAGE and HTTP_USER_AGENT
11              
12             =head1 SYNOPSIS
13              
14             use Encode::Detect::Upload;
15             my $detector = new Encode::Detect::Upload;
16             # Feelin lucky!
17             my $charset = $detector->detect();
18             # More sensible
19             my ( $charset_list, $meta ) = $detector->detect();
20              
21             =head1 DESCRIPTION
22              
23             Dealing with input from globally disperse users can be a real pain. Although when
24             setting web forms to utf-8 browsers will often do the right thing, in some
25             instances, such as text file uploads, you are stuck will trying to figure out
26             the files charset encoding. L uses Mozilla's universal
27             charset detector, which works great most of the time. But when it doesn't your
28             stuck with asking the user, a user that all to often these days has a very low
29             technical ability, and likely doesn't know what a charset it.
30              
31             In my experience with dealing with such user uploads, the charset of the file
32             usually relates to the users OS, location and language settings. Although it's
33             true that the file could have any encoding, the file could have been created on
34             a different machine, with a different locale to the one that is doing the upload.
35             But the use of this modules techniques along with that of
36             L more cases can be handled correctly. Methods for
37             helping the user chose encoding are also provided.
38              
39             =cut
40              
41 3     3   112241 use utf8;
  3         32  
  3         18  
42 3     3   104 use strict;
  3         5  
  3         163  
43 3     3   16 use warnings;
  3         11  
  3         140  
44 3     3   14 use Carp;
  3         6  
  3         265  
45              
46 3     3   2956 use Encode::Detect::Upload::Data;
  3         19  
  3         367  
47 3     3   3772 use Encode;
  3         46079  
  3         25517  
48              
49             my $country_lang = \%Encode::Detect::Upload::Data::country_lang;
50             my $lang_charset = \%Encode::Detect::Upload::Data::lang_charset;
51              
52             # Try to load some other modules
53             my $has_ipcountry = eval { require IP::Country; };
54              
55             my $has_geoip = !$has_ipcountry && eval { require Geo::IP; };
56              
57             my $has_detect = eval { require Encode::Detect::Detector; };
58              
59              
60             =head2 Methods
61              
62             =over 4
63              
64             =item new()
65              
66             =item new(\%params)
67              
68             =item new(%params)
69              
70             Returns a new detection object. Parameters may be passed either as
71             key/value pairs or as a hash references. The following parameters are
72             recognised:
73              
74             die_on_missing Whether missing method parameters cause fatal errors (default: true)
75              
76             =cut
77              
78             sub new {
79 2     2 1 1803 my $class = shift;
80 2         10 my %config = (
81             die_on_missing => 1,
82             );
83 2         6 my $param;
84 2 50 33     25 if ( @_ == 1 && ref $_[0] eq 'HASH' ) {
    50          
85 0         0 $param = $_[0];
86             }
87             elsif ( @_ % 2 == 0 ) {
88 2         7 $param = { @_ };
89             }
90             else {
91 0         0 croak( "Invalid parameters, must be either single hashref or key=>value pairs" );
92             }
93 2 50       16 if ( $param ) {
94 2         12 %config = (
95             %config,
96             %$param,
97             );
98             }
99 2         10 my $self = bless \%config, $class;
100 2         9 return $self;
101             }
102              
103              
104             =item get_os()
105              
106             =item get_os($user_agent_string)
107              
108             Extracts the operating system name from the supplied User-Agent header value,
109             or C<$ENV{HTTP_USER_AGENT}> if not supplied. Dies if no user agent string
110             is available.
111             Returns either C, C, C or undefined if no match was
112             made.
113              
114             =cut
115              
116             sub get_os {
117 10     10 1 1243 my $self = shift;
118 10         19 my $agent = shift;
119 10   100     51 $agent ||= $ENV{HTTP_USER_AGENT};
120 10 100 66     68 croak( 'No USER_AGENT string passed, and $ENV{HTTP_USER_AGENT} is empty' )
121             if !$agent && $self->{die_on_missing};
122              
123             # Basic regexps for matching
124 9 100       68 return 'Windows' if $agent =~ /Windows/;
125 3 100       18 return 'Macintosh' if $agent =~ /\b(?:Macintosh|Mac)\b/;
126 2 100       14 return 'Linux' if $agent =~ /Linux/;
127 1         5 return undef;
128             }
129              
130              
131             =item get_country()
132              
133             =item get_country($ip_address)
134              
135             =item get_country($ip_address,$geo_ip_data_filename)
136              
137             Looks up the user's country from the supplied IP address, or C<$ENV{REMOTE_ADDR}>
138             by default. Dies if neither of L or L is installed.
139             Returns the ISO 2 character country code.
140              
141             =cut
142              
143             sub get_country {
144 0     0 1 0 my $self = shift;
145 0 0 0     0 croak( 'Could not load IP::Country or Geo::IP' ) unless $has_ipcountry || $has_geoip;
146 0         0 my $ip = shift;
147 0   0     0 $ip ||= $ENV{REMOTE_ADDR};
148 0 0       0 croak( 'No IP passed, and $ENV{REMOTE_ADDR} is empty' ) unless $ip;
149 0 0 0     0 $ip=~/\A(?:0|[1-9]\d*)(?:\.(?:0|[1-9]\d*)){3}\z/ && !grep $_>255,split /\./,$ip
150             or croak( "$ip is not a valid IP" );
151              
152             # Use the available IP -> Country DB
153 0 0       0 if ( $has_ipcountry ) {
154 0         0 my $reg = IP::Country->new(); # TODO Cache?
155 0         0 my $country = $reg->inet_atocc($ip);
156 0 0       0 $country = undef if $country eq '**';
157 0 0       0 return defined $country ? lc $country : $country;
158             }
159 0 0       0 if ( $has_geoip ) {
160 0         0 my $gi;
161 0         0 my $data_file = shift;
162 0 0       0 if ( $data_file ) {
163 0 0       0 die( "Geo::IP data file $data_file does not exist" ) unless -e $data_file;
164 0         0 $gi = Geo::IP->new( $data_file, 0 ); # 0 = GEOIP_STANDARD
165             }
166             else {
167 0         0 $gi = Geo::IP->new( 0 );
168             }
169 0         0 return lc $gi->country_code_by_addr( $ip );
170             }
171             }
172              
173              
174             =item get_country_lang($iso_2code)
175              
176             Returns the language tag(s) associated with the supplied country code.
177             In scalar context returns the primary language tag; in list context
178             returns all associated language tags. Dies if the supplied country
179             code is undefined. Returns undef if no matching country is found.
180              
181             Language tags are defined in section 3.10 or RFC 2616, and can be 2
182             or 3 letters, optionally followed by a series of subtags, separated
183             by dashes.
184              
185             =cut
186              
187             sub get_country_lang {
188 3     3 1 3610 my $self = shift;
189 3         5 my $country_code = shift;
190 3 100       27 croak( 'No country passed' ) unless defined $country_code;
191 2 50       12 my $country = $country_lang->{lc $country_code}
192             or return;
193 2 100       7 if ( wantarray ) {
194 1         3 return @{ $country->{languages} };
  1         7  
195             }
196             else {
197 1         10 return $country->{languages}->[0];
198             }
199             }
200              
201              
202             =item get_country_name($iso_2code)
203              
204             Returns the name of the country specified by the suppied 2 letter code.
205             Dies if no country is specified.
206              
207             =cut
208              
209             sub get_country_name {
210 2     2 1 2877 my $self = shift;
211 2         5 my $country_code = shift;
212 2 100       20 croak( 'No country passed' ) unless defined $country_code;
213 1 50       7 my $country = $country_lang->{lc $country_code}
214             or return undef;
215 1         7 return $country->{name};
216             }
217              
218              
219             =item get_accept_lang()
220              
221             =item get_accept_lang($accept_lang_string)
222              
223             Returns the accepted language tag(s) described by the supplied Accept-Language
224             header value, or from C<$ENV{HTTP_ACCEPT_LANGUAGE}> if not supplied. Dies if no
225             header value is available.
226             In scalar context, returns the first language tag listed. In list context,
227             returns all tags, in the order they are listed in the header value.
228              
229             =cut
230              
231             sub get_accept_lang {
232 6     6 1 3580 my $self = shift;
233 6         13 my $accept = shift;
234 6   100     30 $accept ||= $ENV{HTTP_ACCEPT_LANGUAGE};
235 6 100 66     40 croak( 'No ACCEPT_LANGUAGE string passed, and $ENV{HTTP_ACCEPT_LANGUAGE} is empty' )
236             if !$accept && $self->{die_on_missing};
237             # We are going to ignore q and assume the order is accurate... might not be the best policy
238 5         10 my @langs;
239             my %seen;
240 5         36 foreach my $language ( split(/\s*,\s*/, $accept) ) {
241 7         30 my ( $lang, $q ) = split(/\s*;\s*/, $language);
242 7         17 $lang = lc $lang;
243 7 100       20 if ( wantarray ) {
244 6 50       21 next if $seen{$lang}; # filter out any duplicates
245 6         13 push( @langs, $lang );
246 6         24 $seen{$lang}++;
247             }
248             else {
249 1         8 return $lang;
250             }
251             }
252 4         22 return @langs;
253             }
254              
255              
256             =item get_lang_name($language_code)
257              
258             Returns the name of the language specified by the supplied 2 or 3 letter
259             ISO-639 language code. Dies if no language code is supplied.
260              
261             =cut
262              
263             sub get_lang_name {
264 6     6 1 2369 my $self = shift;
265 6         12 my $lang_code = shift;
266 6 100       29 croak( 'No language passed' ) unless defined $lang_code;
267 5         11 $lang_code = lc $lang_code;
268 5 100       28 my $language = $lang_charset->{$lang_code}
269             or return undef;
270 4         42 return $language->{name};
271             }
272              
273              
274             =item get_lang_list($language_tag)
275              
276             Returns the list of language tags which could be used for matching the supplied
277             language tag. This will always include the supplied language tag. If the supplied
278             tag includes a C or C subtag, or is a primary tag for which C
279             or C subtags are available, all such subtags will be returned. If the
280             supplied tag contains any subtags, the primary tag will also be returned.
281             Dies is no language tag is supplied.
282              
283             =cut
284              
285             sub get_lang_list {
286 3     3 1 10172 my $self = shift;
287 3         6 my $lang = shift;
288 3 100       40 croak( 'No language passed' ) unless defined $lang;
289 2         5 $lang = lc $lang;
290 2         6 my @lang_list = ($lang);
291 2         9 my %lang_seen = ( $lang => 1 );
292              
293             # Check for Cyrillic/Latin versions
294 2         8 $lang =~ s/-(?:cyrl|latn)\z//;
295 2         11 for my $sublang ("$lang-latn","$lang-cyrl") {
296 4 100       22 $lang_charset->{$sublang} or next;
297 2 50       13 $lang_seen{$sublang}++
298             or push @lang_list,$sublang;
299             }
300              
301             # Check for general language
302 2 100       17 if ( $lang =~ s/-.+\z// ) {
303 1 50       7 if ( $lang_charset->{$lang} ) {
304 1 50       6 $lang_seen{$lang}++
305             or push @lang_list, $lang;
306             }
307             # Check for cyrl/latn versions
308 1         6 for my $sublang ( "$lang-latn", "$lang-cyrl" ) {
309 2 50       10 $lang_charset->{$sublang} or next;
310 0 0       0 $lang_seen{$sublang}++
311             or push @lang_list, $sublang;
312             }
313             }
314              
315 2         14 return @lang_list;
316             }
317              
318              
319             =item get_lang_charset($language_tag)
320              
321             =item get_lang_charset($language_tag, $os_name)
322              
323             Returns the charset(s) used by the supplied language. If an operating system
324             name is supplied, treats its character sets preferentially. Dies if no
325             language tag is supplied. In scalar context, returns the best matching
326             charset. In list context, returns a list of all suitable charsets.
327              
328             =cut
329              
330             sub get_lang_charset {
331 8     8 1 4841 my $self = shift;
332 8         16 my $lang = shift;
333 8         12 my $os = shift;
334 8 100       42 croak( 'No language tag passed to get_lang_charset()' ) unless $lang;
335 7         14 $lang = lc $lang;
336 7 50       24 my $group = $lang_charset->{$lang} or return;
337 7         20 my @oses = qw(windows macintosh linux);
338 7 100       22 if ( $os ) {
339 1         4 $os = lc $os;
340 1 50       10 croak( "OS $os not recognised" ) unless $os =~ /\A(?:windows|linux|macintosh)\z/;
341 1         6 @oses = ($os, grep $_ ne $os, @oses);
342             }
343 7         29 my @charsets = @$group{@oses};
344 7 100       20 if ( wantarray ) {
345 6         28 return @charsets;
346             } else {
347 1         9 return $charsets[0];
348             }
349             }
350              
351              
352             =item get_words($sample_string)
353              
354             =item get_words($sample_string, $max_words)
355              
356             Returns a list of unique words from the supplied sample string which contain
357             non-ASCII characters. Returns no more than the specified maximum number
358             or words, which defaults to 10. Dies if no sample text is supplied.
359              
360             =cut
361              
362             sub get_words {
363 7     7 1 3791 my $self = shift;
364 7         13 my $text = shift;
365 7         15 my $max = shift;
366 7   100     26 $max ||= 10;
367 7 100       35 croak( 'No sample text passed' ) unless $text;
368 6         89 my ( @words, %words );
369 6         55 while ( $text =~ /([\w\x80-\xff]*[\x80-\xff][\w\x80-\xff]*)/g ) {
370 7 50       38 next if $words{$1}++;
371 7         21 push @words, $1;
372 7 50       38 last if @words >= $max;
373             }#while
374 6         31 return @words;
375             }
376              
377              
378             =item detect(%params)
379              
380             =item detect(\%params)
381              
382             Determines the encoding of the supplied text. In scalar context, returns the most
383             likely charset code. In list context returns an arrayref of charset codes, ordered
384             from most to least likely, and a hashref of metadata. Dies if any required
385             parameters are not supplied. The following parameters are accepted:
386              
387             text Text to determine the encoding of (required)
388             words Maximum number of words to examine (default=10)
389             ip User's IP address (default=$ENV{REMOTE_ADDR})
390             accept_lang Accept-Language header value (required, default=$ENV{HTTP_ACCEPT_LANGUAGE})
391             inc_linux Include Linux charsets? (default=0)
392             ranking TODO document
393             os OS name (Windows, Macintosh or Linux)
394             user_agent User-Agent header value (required if os not supplied,
395             default=$ENV{HTTP_USER_AGENT})
396             lang Language tag or arrayref thereof
397             country Country code or arrayref thereof (required if lang not supplied)
398             country_extra TODO document
399             lang_extra TODO document
400              
401              
402             Requires a sample text string. Can optionally be passed the number of words to
403             try to match (default 10), the users IP, the users OS, the user_agent string,
404             the language code(S), the accept_language string, whether linux charsets should
405             be included, and for advanced use you can adjust the way languages and charsets
406             are ranked. Returns either a single charset (in scalar context) or a list of
407             charsets ordered by most likely with associated meta data. If
408             L is available it's guess is used to improve accuracy.
409              
410             For discussion of ranking heuristics and how to adjust them, see the section below.
411              
412             # I'm feeling lucky
413             my $charset = $detector->detect();
414              
415             # I'm feeling realistic
416             my ( $charset_list, $charset_meta ) = $detector->detect( text => '...' );
417              
418             # Data structure example
419             $charset_list = [ 'x-mac-cyrillic', 'x-mac-ce', 'windows-1251', 'x-mac-ukrainian'... ];
420             $charset_meta = {
421             charsets => {
422             'x-mac-cyrillic' => {
423             pos => 1, # Ranking position
424             words => [ 'Здравствуй', ... ], # Sample word list
425             lang => [ 'ru', ... ], # Language tags that led to this charset
426             },
427             'x-mac-ce' => {
428             pos => 2,
429             words => [ 'ášūŗ‚ŮÚ‚ůť', ... ],
430             lang => [ 'sr', ... ],
431             },
432             'windows-1251' => {
433             pos => 3,
434             words => [ '‡дравствуй', ... ],
435             lang => [ 'ru', ... ],
436             mozilla => 1, # In this example mozilla guessed wrong
437             },
438             ...
439             },
440             lang => {
441             ru => {
442             name => 'Russian', # Language name
443             both => 1, # Matched from both country and accept_lang
444             country => 1, # Matched from country (IP)
445             accept => 1, # Matched from accept_lang
446             pos => 1, # Ranking position
447             },
448             ...
449             },
450             country => {
451             name => 'Russia',
452             tag => 'ru',
453             },
454             error => [ 'utf-8', ... ], # Text wouldn't parse as utf-8
455             }
456              
457             =cut
458              
459             sub detect {
460 5     5 1 4401 my $self = shift;
461 5         11 my $param;
462 5 50 33     41 if ( @_ == 1 && ref $_[0] eq 'HASH' ) {
    50          
463 0         0 $param = $_[0];
464             }
465             elsif ( @_ % 2 == 0 ) {
466 5         30 $param = { @_ };
467             }
468             else {
469 0         0 croak( "Invalid parameters, must be either single hashref or key=>value pairs" );
470             }
471             # TODO(LH) Maybe some param name validation
472 5         61 my %conf = (
473             words => 10,
474             ip => $ENV{REMOTE_ADDR},
475             user_agent => $ENV{HTTP_USER_AGENT},
476             accept_lang => $ENV{HTTP_ACCEPT_LANGUAGE},
477             inc_linux => 0,
478             %$param,
479             );
480              
481 5         69 my %rank = (
482             lang => {
483             start => 'AC',
484             repeat => 'AC',
485             },
486             lang_both => 1,
487             char => {
488             windows => {
489             start => 'WW',
490             repeat => 'WML',
491             },
492             macintosh => {
493             start => 'M',
494             repeat => 'MWL',
495             },
496             linux => {
497             start => 'LWM',
498             repeat => 'LWM',
499             },
500             },
501             mozilla_move => 1,
502             mozilla_insert => 3,
503             );
504 5 100       23 if ( $conf{ranking} ) {
505 2         14 %rank = (
506             %rank,
507 2         8 %{ $conf{ranking} }
508             );
509             # Validate ranking sequences
510 2 50       12 croak( 'Missing language ranking sequence' ) unless $rank{lang}->{repeat};
511 2         5 foreach my $os ( qw( windows macintosh linux ) ) {
512 6 50       25 croak( 'Missing ' . ucfirst $os . ' charset ranking sequence' ) unless $rank{char}->{$os}->{repeat};
513             }
514             }
515              
516             # Get the OS
517 5 50       21 unless ( $conf{os} ) {
518 5         44 $conf{os} = $self->get_os( $conf{user_agent} );
519             }
520             # Default to windows if we still don't have an OS
521 5 50       14 $conf{os} = 'windows' unless $conf{os};
522 5         15 $conf{os} = lc $conf{os};
523             # OS of linux implies inc_linux
524 5 50       16 $conf{inc_linux} = 1 if $conf{os} eq 'linux';
525              
526             # Get the list of language tags
527 5         21 my($country_meta, $lang_meta) = $self->_detect_get_langs(\%conf,\%rank);
528              
529             # Get the related charsets and meta data
530 5         23 my @words = $self->get_words( $conf{text}, $conf{words} );
531 5         22 my %char_hash = (
532             W => [],
533             M => [],
534             L => [],
535             );
536 5         8 my %char_meta;
537             my %char_error;
538 5         29 $self->_detect_check_langs(\%char_meta,\%char_hash,\%char_error,\%conf,\@words);
539              
540             # Does this parse as UTF-8?
541 5         352 my $is_utf8 = 1;
542 5         9 eval { decode( 'UTF-8', $conf{text}, Encode::FB_CROAK ) };
  5         27  
543 5 100       637 $is_utf8 = 0 if $@;
544             # Make sure we have UTF-8 charset info
545 5 100       13 if ( $is_utf8 ) {
546             # UTF-8 could be any language, so doesn't tend to be picked up above
547 0         0 $char_meta{'utf-8'} = {
548             pos => 1,
549             lang => [],
550 1         7 words => [ map { decode( 'UTF-8', $_ ) } @words ],
551             };
552             }
553             else {
554 4         11 $char_error{'utf-8'}++;
555             }
556              
557             # Rank position
558 5   33     29 $rank{char}->{ $conf{os} }->{start} ||= $rank{char}->{ $conf{os} }->{repeat};
559 5         30 my @sequence = split( //, $rank{char}->{ $conf{os} }->{start} );
560 5 100       18 my $pos = $is_utf8 ? 2 : 1;
561 5         18 while ( my $type = shift @sequence ) {
562 25 100 100     28 last unless @{ $char_hash{W} } || @{ $char_hash{M} } || @{ $char_hash{L} };
  25   66     95  
  20         85  
  5         24  
563 20 100       68 push( @sequence, split( //, $rank{char}->{ $conf{os} }->{repeat} ) ) unless @sequence;
564 20         25 while ( my $charset = shift @{ $char_hash{$type} } ) {
  20         86  
565 10 50       27 if ( $char_meta{$charset}->{pos} ) {
566 0         0 next;
567             }
568             else {
569 10         26 $char_meta{$charset}->{pos} = $pos;
570 10         13 $pos++;
571 10         39 last;
572             }
573             }#while
574             }#while
575              
576             # Can we see what Mozilla detection thinks?
577 5         9 my $mozilla;
578 5 50       14 if ( $has_detect ) {
579 0         0 $mozilla = Encode::Detect::Detector::detect( $conf{text} );
580 0 0       0 $mozilla = lc $mozilla if defined $mozilla;
581 0 0       0 if ( $mozilla ) {
582             # Check charset can decode
583 0         0 my $charset_encode = _try_charset( $mozilla, $conf{text} );
584 0 0       0 if ( $charset_encode ) {
585             # Check we have the Mozilla charset in our list
586 0 0       0 if ( $char_meta{$mozilla} ) {
587 0         0 $char_meta{$mozilla}->{mozilla} = 1;
588             # Should Mozilla affect position?
589 0 0 0     0 if ( $rank{mozilla_move} && $char_meta{$mozilla}->{pos} != 1 ) {
590 0         0 my $pos_new = $char_meta{$mozilla}->{pos} - $rank{mozilla_move};
591 0 0       0 $pos_new = 1 if $pos_new < 1;
592             # Move other charsets
593 0 0       0 map { $_->{pos}++ } grep {
  0         0  
594 0         0 $_->{pos} >= $pos_new &&
595             $_->{pos} < $char_meta{$mozilla}->{pos}
596             } values %char_meta;
597 0         0 $char_meta{$mozilla}->{pos} = $pos_new;
598             }
599             }#if
600             else {
601             # Insert Mozilla if it's not in list?
602 0 0       0 if ( $rank{mozilla_insert} ) {
603             # Push everything else down
604 0         0 map { $_->{pos}++ } grep {
  0         0  
605 0         0 $_->{pos} >= $rank{mozilla_insert}
606             } values %char_meta;
607 0         0 $char_meta{$mozilla} = {
608             lang => [],
609 0         0 words => [ map { decode( $charset_encode, $_ ) } @words ],
610             pos => $rank{mozilla_insert},
611             mozilla => 1,
612             };
613             }
614             }#else
615             }#if
616             else {
617 0         0 $char_error{ $mozilla }++;
618             }
619             }#if
620             }#if
621              
622             # Prep return
623 5         39 my @charsets = sort { $char_meta{$a}->{pos} <=> $char_meta{$b}->{pos} } keys %char_meta;
  7         33  
624 5 100       16 if ( wantarray ) {
625 1         10 my %meta = (
626             charsets => \%char_meta,
627             lang => $lang_meta,
628             country => $country_meta,
629             error => [ keys %char_error ],
630             );
631 1         16 return ( \@charsets, \%meta);
632             }
633             else {
634 4         82 return $charsets[0];
635             }
636             }#sub
637              
638             sub _detect_get_langs{
639 5     5   10 my $self = shift;
640 5         7 my $conf = shift;
641 5         10 my $rank = shift;
642              
643 5         7 my %country_meta;
644             my %lang_meta;
645 5 100       55 if ( $conf->{lang} ) {
646 2 50       10 $conf->{lang} = [ $conf->{lang} ] unless ref $conf->{lang};
647 2         5 $conf->{lang} = [ map { lc $_ } @{ $conf->{lang} } ];
  2         9  
  2         5  
648             }
649             else {
650             ## Get language list from conf with meta data
651             # Start with country list
652 3         6 my @country_list;
653 3 50       23 if ( $conf->{country} ) {
654 0 0       0 @country_list = ref $conf->{country} ? @{ $conf->{country} } : ( $conf->{country} );
  0         0  
655 0         0 @country_list = map { lc $_ } @country_list;
  0         0  
656             }
657             else {
658             # See if we have IP's to lookup countries for, we may have several
659 3 50 33     36 if ( $conf->{ip} && ($has_ipcountry || $has_geoip) ) {
      33        
660 0 0       0 $conf->{ip} = [ $conf->{ip} ] unless ref $conf->{ip};
661 0         0 foreach my $ip ( @{ $conf->{ip} } ) {
  0         0  
662 0         0 my $country = $self->get_country( $ip );
663 0 0       0 next unless $country;
664 0 0       0 if ( $country_meta{$country} ) {
665 0         0 push( @{ $country_meta{$country}->{ip} }, $ip );
  0         0  
666             }
667             else {
668 0         0 push( @country_list, $country );
669 0         0 $country_meta{$country} = {
670             ip => [$ip],
671             };
672             }
673             }#foreach
674             }#if
675             # Are there extra countries to add to the start or end of the list
676 3 50       13 if ( ref $conf->{country_extra} ) {
677 0         0 foreach my $position ( qw( end start ) ) {
678 0 0       0 if ( $conf->{country_extra}->{$position} ) {
679 0 0       0 $conf->{country_extra}->{$position} = [ $conf->{country_extra}->{$position} ] unless ref $conf->{country_extra}->{$position};
680 0         0 $conf->{country_extra}->{$position} = [ map { lc $_ } @{ $conf->{country_extra}->{$position} } ];
  0         0  
  0         0  
681 0         0 foreach my $country ( @{ $conf->{country_extra}->{$position} } ) {
  0         0  
682             # Check if it's already in the list, in which case remove
683 0 0       0 if ( $country_meta{$country} ) {
684 0   0     0 $country_meta{$country}->{extra} ||= [];
685 0         0 push( @{ $country_meta{$country}->{extra} }, $position );
  0         0  
686             # If adding to the end, leave in current position, only move to start
687 0 0       0 @country_list = grep { $_ ne $country } @country_list if $position eq 'start';
  0         0  
688             }
689             else {
690 0         0 $country_meta{$country} = {
691             extra => [ $position ],
692             };
693             }
694             }
695             }#if
696             }#foreach
697             # Add to front/back of list
698 0 0       0 unshift( @country_list, @{ $conf->{country_extra}->{start} } ) if ref $conf->{country_extra}->{start};
  0         0  
699 0 0       0 push( @country_list, @{ $conf->{country_extra}->{end} } ) if ref $conf->{country_extra}->{end};
  0         0  
700             }#if
701             }#else
702             # Get lang tags from countries
703 3         6 my @lang_country;
704             my %country_seen;
705 3         9 foreach my $country ( @country_list ) {
706 0         0 $country_meta{$country}->{name} = $self->get_country_name( $country );
707 0         0 my @lang_list = $self->get_country_lang( $country );
708 0         0 foreach my $lang ( @lang_list ) {
709 0 0       0 next if $country_seen{$lang};
710 0         0 push( @lang_country, $lang );
711 0         0 $country_seen{$lang}++;
712             }
713             }#foreach
714              
715             # Now lang list from accept_langs
716 3         7 my @lang_accept;
717             my %accept_seen;
718 3 50       10 if ( $conf->{accept_lang} ) {
719 3 50       18 $conf->{accept_lang} = [ $conf->{accept_lang} ] unless ref $conf->{accept_lang};
720 3         6 foreach my $accept_lang ( @{ $conf->{accept_lang} } ) {
  3         11  
721 3         14 my @lang_list = $self->get_accept_lang( $accept_lang );
722 3         11 foreach my $lang ( @lang_list ) {
723 3 50       12 next if $accept_seen{$lang};
724 3         8 push( @lang_accept, $lang );
725 3         14 $accept_seen{$lang}++;
726             }
727             }
728             }
729             # Are there extra lang tags to add to the start or end of the list
730 3         6 my %extra_seen;
731             my %extra_list;
732 3 50       11 if ( ref $conf->{lang_extra} ) {
733 0         0 foreach my $position ( qw( end start ) ) {
734 0         0 $extra_list{$position} = [];
735 0 0       0 if ( $conf->{lang_extra}->{$position} ) {
736 0 0       0 $conf->{lang_extra}->{$position} = [ $conf->{lang_extra}->{$position} ] unless ref $conf->{lang_extra}->{$position};
737 0         0 $conf->{lang_extra}->{$position} = [ map { lc $_ } @{ $conf->{lang_extra}->{$position} } ];
  0         0  
  0         0  
738 0         0 $extra_list{$position} = $conf->{lang_extra}->{$position};
739 0         0 $extra_seen{$position} = { map { $_ => 1 } @{ $conf->{lang_extra}->{$position} } };
  0         0  
  0         0  
740             }
741             }#foreach
742             }#if
743             ## Rank languages based on order and appearance in both lists
744             # Which lists they appear in
745 3         143 foreach my $lang ( @lang_accept, @lang_country, @{ $extra_list{start} }, @{ $extra_list{end} } ) {
  3         8  
  3         9  
746 3 50       10 next if $lang_meta{$lang};
747 3 50 33     120 $lang_meta{$lang} = {
      50        
      50        
      50        
      50        
748             both => $accept_seen{$lang} && $country_seen{$lang} ? 1 : 0,
749             accept => $accept_seen{$lang} || 0,
750             country => $country_seen{$lang} || 0,
751             start => $extra_seen{start}->{$lang} || 0,
752             end => $extra_seen{end}->{$lang} || 0,
753             name => $self->get_lang_name($lang),
754             };
755             }#foreach
756             ## Rank position
757             # Extra start will go first
758 3         8 my $pos = 1;
759 3         5 foreach my $lang ( @{ $extra_list{start} } ) {
  3         12  
760 0 0       0 next if $lang_meta{$lang}->{pos};
761 0         0 $lang_meta{$lang}->{pos} = $pos;
762 0         0 $pos++;
763             }#foreach
764             # Then sequence
765 3         14 my %lang_hash = (
766             A => \@lang_accept,
767             C => \@lang_country,
768             );
769 3   33     14 $rank->{lang}->{start} ||= $rank->{lang}->{repeat};
770 3         16 my @sequence = split( //, $rank->{lang}->{start} );
771 3         13 while ( my $type = shift @sequence ) {
772 6 100 66     7 last unless @{ $lang_hash{A} } || @{ $lang_hash{C} };
  6         34  
  3         15  
773 3 50       10 push( @sequence, split( //, $rank->{lang}->{repeat} ) ) unless @sequence;
774 3         6 while ( my $lang = shift @{ $lang_hash{$type} } ) {
  3         14  
775 3 50       11 if ( $lang_meta{$lang}->{pos} ) {
776 0         0 next;
777             }
778             else {
779 3         15 $lang_meta{$lang}->{pos} = $pos;
780 3         4 $pos++;
781 3         15 last;
782             }
783             }#while
784             }#while
785             # Extra end added to the end
786 3         5 foreach my $lang ( @{ $extra_list{end} } ) {
  3         20  
787 0 0       0 next if $lang_meta{$lang}->{pos};
788 0         0 $lang_meta{$lang}->{pos} = $pos;
789 0         0 $pos++;
790             }#foreach
791             # Prefer languages that appear in both?
792 3 50       16 if ( $rank->{lang_both} ) {
793 0 0 0     0 $conf->{lang} = [ sort {
794 3         27 $lang_meta{$b}->{start} <=> $lang_meta{$a}->{start} ||
795             $lang_meta{$b}->{both} <=> $lang_meta{$a}->{both} ||
796             $lang_meta{$a}->{pos} <=> $lang_meta{$b}->{pos}
797             } keys %lang_meta ];
798             }
799             else {
800 0         0 $conf->{lang} = [ sort {
801 0         0 $lang_meta{$a}->{pos} <=> $lang_meta{$b}->{pos}
802             } keys %lang_meta ];
803             }
804             }#else
805              
806 5         23 return (\%country_meta,\%lang_meta);
807             }
808              
809             sub _detect_check_langs{
810 5     5   8 my $self = shift;
811 5         7 my $char_meta = shift;
812 5         8 my $char_hash = shift;
813 5         8 my $char_error = shift;
814 5         10 my $conf = shift;
815 5         7 my $words = shift;
816 5 50       19 my @os_list = ('W','M',$conf->{inc_linux} ? 'L' : ());
817 5         9 foreach my $lang ( @{ $conf->{lang} } ) {
  5         12  
818 5         16 my @charsets = $self->get_lang_charset( $lang );
819 5 50       12 next unless @charsets;
820 5         19 for ( my $i=0; $i <= $#os_list; $i++ ) {
821 10         185 my $charset = $charsets[$i];
822 10         15 my $os = $os_list[$i];
823 10 50       31 next if $char_error->{ $charset };
824 10 50       23 if ( $char_meta->{ $charset } ) {
825 0         0 push( @{ $char_meta->{$charset}->{lang} }, $lang );
  0         0  
826             }
827             else {
828             # Test charset parses
829 10         31 my $charset_encode = _try_charset( $charset, $conf->{text} );
830 10 50       67 if ( $charset_encode ) {
831 10         15 push( @{ $char_hash->{ $os } }, $charset );
  10         33  
832 10         199 $char_meta->{ $charset } = {
833             lang => [ $lang ],
834 10         110 words => [ map { decode( $charset_encode, $_ ) } @$words ],
835             };
836             }
837             else {
838 0         0 $char_error->{ $charset }++;
839             }
840             }#else
841             }#for
842             }#foreach
843             }
844              
845             sub _try_charset {
846 10     10   23 my ( $charset, $text ) = @_;
847             # Older versions of Encode::Alias don't map x-mac-* encodings properly
848 10         82 $charset =~ s/^(?:x[_-])?mac[_-](.*)$/mac$1/;
849 10         19 $charset =~ s/^macce$/maccentraleurroman/;
850 10         16 eval { decode( $charset, $text, Encode::FB_CROAK ) };
  10         60  
851 10 50       7201 return $@ ? 0 : $charset;
852             }#sub
853              
854              
855             =back
856              
857             =head1 RANKING SYSTEM
858              
859             Unfortunately the heuristics employed by this method aren't straight forward.
860             Several key scenarios are taken into consideration, namely:
861              
862             The upload charset is:
863             for the language that matches the browsers language settings and OS.
864             for the language that matches the uploaders countries official language and OS.
865             for the language that matches the browsers language settings, but a different OS.
866             for the language that matches the uploaders countries official language, but a different OS.
867             unrelated, hopefully detected by Mozilla's universal charset detector.
868              
869             Although the browsers language setting is preferred, it's not unusually for it
870             to be incorrect. For example a surprising number of UK users have en-US rather
871             than en-GB. In such instances the language from the IP would be more accurate.
872             For this reason if the Mozilla detected charset matches an IP dervied charset it
873             is brought to the front.
874             However, an Englishman uploading a file whilst abroad would not give an accurate
875             language from IP. Likewise, some countries like South Africa have several
876             recognised languages.
877             Some countries have inhabitants that use either Latin or Cyrillic alphabets for
878             the same language. In these instances, the Mozilla detector is used to determine
879             which is more likely, but both options will be returned.
880             The use of Macintosh computers has been on the rise, as has the appearance of
881             their charsets. In fact that's what led me to write this module, as the Mozilla
882             detector doesn't cover every encoding and was missing Mac-Roman. Generally
883             Windows users are less likely to upload files with Macintosh encoding, Although
884             the same cannot be said the other way around. For this reason, when the OS is
885             Macintosh it's matching charsets will come first, followed by the likely
886             Windows, alternating between the two.
887              
888             We assume linux systems are mostly UTF-8 these days, that their pre-UTF-8 ISO
889             charsets were roughly the same as the Windows equivalents, and that Linux users
890             are generally more computer savvy. For these reasons Linux charsets are not
891             included in results by default.
892              
893             Rather than ranking charsets through some kind of weighting based on appearance,
894             we apply configurable patterns. Weight would always favour common charsets,
895             hopefully the ranking patterns work better.
896              
897             This is the first version of this module. I'm open to suggestions with regards
898             improved heuristics, and possibly configurable heuristics.
899              
900             You can override the default ranking by passing the appropriate data structure
901             to detect(). You need to at least provide the repeat string for lang and all the
902             OSs.
903              
904             IP country lookup and accept_language parsing is used initially to generate a
905             list of matching languages. The order in which these are then ranked is based
906             on their appearance (accept_lang), or popularity (country), and the sequence
907             given. A represents accept_lang and C represents country, so a sequence starting
908             with AC and repeating with AC would generate ACACACACAC... until there are no
909             matching languages left. The lang_both option pushes charsets that come from
910             both accept_lang and country.
911              
912             Next charsets are matched from the languages by OS. Depending on what OS has
913             been passed, or detected from user_agent. The char sequences contain W for
914             Windows, M for Macintosh or L for Linux. The Linux charsets are filtered out
915             unless the OS is Linux or the inc_linux config option is enabled. So a Windows
916             OS with sequence starting WW and repeating WML would generate WWWMWMWMWM...
917             matching the first 3 likely windows charsets, then the most likely Macintosh,
918             etc. Charsets are tested to see if they can decode the text, invalid ones are
919             filtered out.
920              
921             The string is tested to see whether it looks like UTF-8. If it does that's
922             pushed to the front on the list. If the Mozilla charset detector is available
923             it's used to see what charset it returns. The option mozilla_move sets how the
924             many places to move the matching charset forward in the list. The
925             mozilla_insert options defines in what position to insert the Mozilla match if
926             it's not already in the list.
927              
928             my %ranking = (
929             lang => {
930             start => 'AC',
931             repeat => 'AC',
932             },
933             # Rank languages that appear in both country and accept_lang first
934             lang_both => 1,
935             char => {
936             windows => {
937             start => 'WW',
938             repeat => 'WML',
939             },
940             macintosh => {
941             start => 'M',
942             repeat => 'MWL',
943             },
944             linux => {
945             start => 'LWM',
946             repeat => 'LWM',
947             },
948             },
949             # Mozilla detected charset options
950             mozilla_move => 1, # Number of positions to move the forward
951             mozilla_insert => 3, # At what position to insert if it's not in list
952             );
953             my $charset = $detector->detect( ranking => \%ranking );
954              
955             =head1 LICENSE
956              
957             This is released under the Artistic
958             License. See L.
959              
960             =head1 AUTHORS
961              
962             Lyle Hopkins - L
963              
964             Peter Haworth - L
965              
966             Development kindly sponsored by - L
967              
968             =head1 REFERENCES
969              
970             I had a hard time finding good data sources, all the information I needed was
971             pretty spread out. These are the main sites I used, but there was lots of
972             googling to fill in the gaps.
973              
974             L
975             L
976             L
977             L
978             L
979             L
980             L
981             L
982             L
983              
984             =head1 SEE ALSO
985              
986             L, L, L, L
987              
988             =head1 TODO
989              
990             Make default between Latin and Cyrillic based on popularity in language
991             Write some tests
992             Rank regions differently?
993             Generalize environment examination, defaults only at detect() or confgurable thru the detector object itself
994              
995             =cut
996              
997              
998              
999             1;