File Coverage

blib/lib/Encode/Detect/Upload.pm

Criterion	Covered	Total	%
statement	244	354	68.9
branch	96	206	46.6
condition	30	69	43.4
subroutine	19	20	95.0
pod	11	11	100.0
total	400	660	60.6

line	stmt	bran	cond	sub	pod	time	code
1							package Encode::Detect::Upload;
2
3							our $VERSION=0.04;
4
5							=encoding utf8
6
7							=head1 NAME
8
9							Encode::Detect::Upload - Attempt to guess user's locale encoding from IP,
10							HTTP_ACCEPT_LANGUAGE and HTTP_USER_AGENT
11
12							=head1 SYNOPSIS
13
14							use Encode::Detect::Upload;
15							my $detector = new Encode::Detect::Upload;
16							# Feelin lucky!
17							my $charset = $detector->detect();
18							# More sensible
19							my ( $charset_list, $meta ) = $detector->detect();
20
21							=head1 DESCRIPTION
22
23							Dealing with input from globally disperse users can be a real pain. Although when
24							setting web forms to utf-8 browsers will often do the right thing, in some
25							instances, such as text file uploads, you are stuck will trying to figure out
26							the files charset encoding. L uses Mozilla's universal
27							charset detector, which works great most of the time. But when it doesn't your
28							stuck with asking the user, a user that all to often these days has a very low
29							technical ability, and likely doesn't know what a charset it.
30
31							In my experience with dealing with such user uploads, the charset of the file
32							usually relates to the users OS, location and language settings. Although it's
33							true that the file could have any encoding, the file could have been created on
34							a different machine, with a different locale to the one that is doing the upload.
35							But the use of this modules techniques along with that of
36							L more cases can be handled correctly. Methods for
37							helping the user chose encoding are also provided.
38
39							=cut
40
41	3			3		112241	use utf8;
	3					32
	3					18
42	3			3		104	use strict;
	3					5
	3					163
43	3			3		16	use warnings;
	3					11
	3					140
44	3			3		14	use Carp;
	3					6
	3					265
45
46	3			3		2956	use Encode::Detect::Upload::Data;
	3					19
	3					367
47	3			3		3772	use Encode;
	3					46079
	3					25517
48
49							my $country_lang = \%Encode::Detect::Upload::Data::country_lang;
50							my $lang_charset = \%Encode::Detect::Upload::Data::lang_charset;
51
52							# Try to load some other modules
53							my $has_ipcountry = eval { require IP::Country; };
54
55							my $has_geoip = !$has_ipcountry && eval { require Geo::IP; };
56
57							my $has_detect = eval { require Encode::Detect::Detector; };
58
59
60							=head2 Methods
61
62							=over 4
63
64							=item new()
65
66							=item new(\%params)
67
68							=item new(%params)
69
70							Returns a new detection object. Parameters may be passed either as
71							key/value pairs or as a hash references. The following parameters are
72							recognised:
73
74							die_on_missing Whether missing method parameters cause fatal errors (default: true)
75
76							=cut
77
78							sub new {
79	2			2	1	1803	my $class = shift;
80	2					10	my %config = (
81							die_on_missing => 1,
82							);
83	2					6	my $param;
84	2	50	33			25	if ( @_ == 1 && ref $_[0] eq 'HASH' ) {
		50
85	0					0	$param = $_[0];
86							}
87							elsif ( @_ % 2 == 0 ) {
88	2					7	$param = { @_ };
89							}
90							else {
91	0					0	croak( "Invalid parameters, must be either single hashref or key=>value pairs" );
92							}
93	2	50				16	if ( $param ) {
94	2					12	%config = (
95							%config,
96							%$param,
97							);
98							}
99	2					10	my $self = bless \%config, $class;
100	2					9	return $self;
101							}
102
103
104							=item get_os()
105
106							=item get_os($user_agent_string)
107
108							Extracts the operating system name from the supplied User-Agent header value,
109							or C<$ENV{HTTP_USER_AGENT}> if not supplied. Dies if no user agent string
110							is available.
111							Returns either C, C, C or undefined if no match was
112							made.
113
114							=cut
115
116							sub get_os {
117	10			10	1	1243	my $self = shift;
118	10					19	my $agent = shift;
119	10		100			51	$agent \|\|= $ENV{HTTP_USER_AGENT};
120	10	100	66			68	croak( 'No USER_AGENT string passed, and $ENV{HTTP_USER_AGENT} is empty' )
121							if !$agent && $self->{die_on_missing};
122
123							# Basic regexps for matching
124	9	100				68	return 'Windows' if $agent =~ /Windows/;
125	3	100				18	return 'Macintosh' if $agent =~ /\b(?:Macintosh\|Mac)\b/;
126	2	100				14	return 'Linux' if $agent =~ /Linux/;
127	1					5	return undef;
128							}
129
130
131							=item get_country()
132
133							=item get_country($ip_address)
134
135							=item get_country($ip_address,$geo_ip_data_filename)
136
137							Looks up the user's country from the supplied IP address, or C<$ENV{REMOTE_ADDR}>
138							by default. Dies if neither of L or L is installed.
139							Returns the ISO 2 character country code.
140
141							=cut
142
143							sub get_country {
144	0			0	1	0	my $self = shift;
145	0	0	0			0	croak( 'Could not load IP::Country or Geo::IP' ) unless $has_ipcountry \|\| $has_geoip;
146	0					0	my $ip = shift;
147	0		0			0	$ip \|\|= $ENV{REMOTE_ADDR};
148	0	0				0	croak( 'No IP passed, and $ENV{REMOTE_ADDR} is empty' ) unless $ip;
149	0	0	0			0	$ip=~/\A(?:0\|[1-9]\d)(?:\.(?:0\|[1-9]\d)){3}\z/ && !grep $_>255,split /\./,$ip
150							or croak( "$ip is not a valid IP" );
151
152							# Use the available IP -> Country DB
153	0	0				0	if ( $has_ipcountry ) {
154	0					0	my $reg = IP::Country->new(); # TODO Cache?
155	0					0	my $country = $reg->inet_atocc($ip);
156	0	0				0	$country = undef if $country eq '**';
157	0	0				0	return defined $country ? lc $country : $country;
158							}
159	0	0				0	if ( $has_geoip ) {
160	0					0	my $gi;
161	0					0	my $data_file = shift;
162	0	0				0	if ( $data_file ) {
163	0	0				0	die( "Geo::IP data file $data_file does not exist" ) unless -e $data_file;
164	0					0	$gi = Geo::IP->new( $data_file, 0 ); # 0 = GEOIP_STANDARD
165							}
166							else {
167	0					0	$gi = Geo::IP->new( 0 );
168							}
169	0					0	return lc $gi->country_code_by_addr( $ip );
170							}
171							}
172
173
174							=item get_country_lang($iso_2code)
175
176							Returns the language tag(s) associated with the supplied country code.
177							In scalar context returns the primary language tag; in list context
178							returns all associated language tags. Dies if the supplied country
179							code is undefined. Returns undef if no matching country is found.
180
181							Language tags are defined in section 3.10 or RFC 2616, and can be 2
182							or 3 letters, optionally followed by a series of subtags, separated
183							by dashes.
184
185							=cut
186
187							sub get_country_lang {
188	3			3	1	3610	my $self = shift;
189	3					5	my $country_code = shift;
190	3	100				27	croak( 'No country passed' ) unless defined $country_code;
191	2	50				12	my $country = $country_lang->{lc $country_code}
192							or return;
193	2	100				7	if ( wantarray ) {
194	1					3	return @{ $country->{languages} };
	1					7
195							}
196							else {
197	1					10	return $country->{languages}->[0];
198							}
199							}
200
201
202							=item get_country_name($iso_2code)
203
204							Returns the name of the country specified by the suppied 2 letter code.
205							Dies if no country is specified.
206
207							=cut
208
209							sub get_country_name {
210	2			2	1	2877	my $self = shift;
211	2					5	my $country_code = shift;
212	2	100				20	croak( 'No country passed' ) unless defined $country_code;
213	1	50				7	my $country = $country_lang->{lc $country_code}
214							or return undef;
215	1					7	return $country->{name};
216							}
217
218
219							=item get_accept_lang()
220
221							=item get_accept_lang($accept_lang_string)
222
223							Returns the accepted language tag(s) described by the supplied Accept-Language
224							header value, or from C<$ENV{HTTP_ACCEPT_LANGUAGE}> if not supplied. Dies if no
225							header value is available.
226							In scalar context, returns the first language tag listed. In list context,
227							returns all tags, in the order they are listed in the header value.
228
229							=cut
230
231							sub get_accept_lang {
232	6			6	1	3580	my $self = shift;
233	6					13	my $accept = shift;
234	6		100			30	$accept \|\|= $ENV{HTTP_ACCEPT_LANGUAGE};
235	6	100	66			40	croak( 'No ACCEPT_LANGUAGE string passed, and $ENV{HTTP_ACCEPT_LANGUAGE} is empty' )
236							if !$accept && $self->{die_on_missing};
237							# We are going to ignore q and assume the order is accurate... might not be the best policy
238	5					10	my @langs;
239							my %seen;
240	5					36	foreach my $language ( split(/\s,\s/, $accept) ) {
241	7					30	my ( $lang, $q ) = split(/\s;\s/, $language);
242	7					17	$lang = lc $lang;
243	7	100				20	if ( wantarray ) {
244	6	50				21	next if $seen{$lang}; # filter out any duplicates
245	6					13	push( @langs, $lang );
246	6					24	$seen{$lang}++;
247							}
248							else {
249	1					8	return $lang;
250							}
251							}
252	4					22	return @langs;
253							}
254
255
256							=item get_lang_name($language_code)
257
258							Returns the name of the language specified by the supplied 2 or 3 letter
259							ISO-639 language code. Dies if no language code is supplied.
260
261							=cut
262
263							sub get_lang_name {
264	6			6	1	2369	my $self = shift;
265	6					12	my $lang_code = shift;
266	6	100				29	croak( 'No language passed' ) unless defined $lang_code;
267	5					11	$lang_code = lc $lang_code;
268	5	100				28	my $language = $lang_charset->{$lang_code}
269							or return undef;
270	4					42	return $language->{name};
271							}
272
273
274							=item get_lang_list($language_tag)
275
276							Returns the list of language tags which could be used for matching the supplied
277							language tag. This will always include the supplied language tag. If the supplied
278							tag includes a C or C subtag, or is a primary tag for which C
279							or C subtags are available, all such subtags will be returned. If the
280							supplied tag contains any subtags, the primary tag will also be returned.
281							Dies is no language tag is supplied.
282
283							=cut
284
285							sub get_lang_list {
286	3			3	1	10172	my $self = shift;
287	3					6	my $lang = shift;
288	3	100				40	croak( 'No language passed' ) unless defined $lang;
289	2					5	$lang = lc $lang;
290	2					6	my @lang_list = ($lang);
291	2					9	my %lang_seen = ( $lang => 1 );
292
293							# Check for Cyrillic/Latin versions
294	2					8	$lang =~ s/-(?:cyrl\|latn)\z//;
295	2					11	for my $sublang ("$lang-latn","$lang-cyrl") {
296	4	100				22	$lang_charset->{$sublang} or next;
297	2	50				13	$lang_seen{$sublang}++
298							or push @lang_list,$sublang;
299							}
300
301							# Check for general language
302	2	100				17	if ( $lang =~ s/-.+\z// ) {
303	1	50				7	if ( $lang_charset->{$lang} ) {
304	1	50				6	$lang_seen{$lang}++
305							or push @lang_list, $lang;
306							}
307							# Check for cyrl/latn versions
308	1					6	for my $sublang ( "$lang-latn", "$lang-cyrl" ) {
309	2	50				10	$lang_charset->{$sublang} or next;
310	0	0				0	$lang_seen{$sublang}++
311							or push @lang_list, $sublang;
312							}
313							}
314
315	2					14	return @lang_list;
316							}
317
318
319							=item get_lang_charset($language_tag)
320
321							=item get_lang_charset($language_tag, $os_name)
322
323							Returns the charset(s) used by the supplied language. If an operating system
324							name is supplied, treats its character sets preferentially. Dies if no
325							language tag is supplied. In scalar context, returns the best matching
326							charset. In list context, returns a list of all suitable charsets.
327
328							=cut
329
330							sub get_lang_charset {
331	8			8	1	4841	my $self = shift;
332	8					16	my $lang = shift;
333	8					12	my $os = shift;
334	8	100				42	croak( 'No language tag passed to get_lang_charset()' ) unless $lang;
335	7					14	$lang = lc $lang;
336	7	50				24	my $group = $lang_charset->{$lang} or return;
337	7					20	my @oses = qw(windows macintosh linux);
338	7	100				22	if ( $os ) {
339	1					4	$os = lc $os;
340	1	50				10	croak( "OS $os not recognised" ) unless $os =~ /\A(?:windows\|linux\|macintosh)\z/;
341	1					6	@oses = ($os, grep $_ ne $os, @oses);
342							}
343	7					29	my @charsets = @$group{@oses};
344	7	100				20	if ( wantarray ) {
345	6					28	return @charsets;
346							} else {
347	1					9	return $charsets[0];
348							}
349							}
350
351
352							=item get_words($sample_string)
353
354							=item get_words($sample_string, $max_words)
355
356							Returns a list of unique words from the supplied sample string which contain
357							non-ASCII characters. Returns no more than the specified maximum number
358							or words, which defaults to 10. Dies if no sample text is supplied.
359
360							=cut
361
362							sub get_words {
363	7			7	1	3791	my $self = shift;
364	7					13	my $text = shift;
365	7					15	my $max = shift;
366	7		100			26	$max \|\|= 10;
367	7	100				35	croak( 'No sample text passed' ) unless $text;
368	6					89	my ( @words, %words );
369	6					55	while ( $text =~ /([\w\x80-\xff][\x80-\xff][\w\x80-\xff])/g ) {
370	7	50				38	next if $words{$1}++;
371	7					21	push @words, $1;
372	7	50				38	last if @words >= $max;
373							}#while
374	6					31	return @words;
375							}
376
377
378							=item detect(%params)
379
380							=item detect(\%params)
381
382							Determines the encoding of the supplied text. In scalar context, returns the most
383							likely charset code. In list context returns an arrayref of charset codes, ordered
384							from most to least likely, and a hashref of metadata. Dies if any required
385							parameters are not supplied. The following parameters are accepted:
386
387							text Text to determine the encoding of (required)
388							words Maximum number of words to examine (default=10)
389							ip User's IP address (default=$ENV{REMOTE_ADDR})
390							accept_lang Accept-Language header value (required, default=$ENV{HTTP_ACCEPT_LANGUAGE})
391							inc_linux Include Linux charsets? (default=0)
392							ranking TODO document
393							os OS name (Windows, Macintosh or Linux)
394							user_agent User-Agent header value (required if os not supplied,
395							default=$ENV{HTTP_USER_AGENT})
396							lang Language tag or arrayref thereof
397							country Country code or arrayref thereof (required if lang not supplied)
398							country_extra TODO document
399							lang_extra TODO document
400
401
402							Requires a sample text string. Can optionally be passed the number of words to
403							try to match (default 10), the users IP, the users OS, the user_agent string,
404							the language code(S), the accept_language string, whether linux charsets should
405							be included, and for advanced use you can adjust the way languages and charsets
406							are ranked. Returns either a single charset (in scalar context) or a list of
407							charsets ordered by most likely with associated meta data. If
408							L is available it's guess is used to improve accuracy.
409
410							For discussion of ranking heuristics and how to adjust them, see the section below.
411
412							# I'm feeling lucky
413							my $charset = $detector->detect();
414
415							# I'm feeling realistic
416							my ( $charset_list, $charset_meta ) = $detector->detect( text => '...' );
417
418							# Data structure example
419							$charset_list = [ 'x-mac-cyrillic', 'x-mac-ce', 'windows-1251', 'x-mac-ukrainian'... ];
420							$charset_meta = {
421							charsets => {
422							'x-mac-cyrillic' => {
423							pos => 1, # Ranking position
424							words => [ 'Здравствуй', ... ], # Sample word list
425							lang => [ 'ru', ... ], # Language tags that led to this charset
426							},
427							'x-mac-ce' => {
428							pos => 2,
429							words => [ 'ášūŗ‚ŮÚ‚ůť', ... ],
430							lang => [ 'sr', ... ],
431							},
432							'windows-1251' => {
433							pos => 3,
434							words => [ '‡дравствуй', ... ],
435							lang => [ 'ru', ... ],
436							mozilla => 1, # In this example mozilla guessed wrong
437							},
438							...
439							},
440							lang => {
441							ru => {
442							name => 'Russian', # Language name
443							both => 1, # Matched from both country and accept_lang
444							country => 1, # Matched from country (IP)
445							accept => 1, # Matched from accept_lang
446							pos => 1, # Ranking position
447							},
448							...
449							},
450							country => {
451							name => 'Russia',
452							tag => 'ru',
453							},
454							error => [ 'utf-8', ... ], # Text wouldn't parse as utf-8
455							}
456
457							=cut
458
459							sub detect {
460	5			5	1	4401	my $self = shift;
461	5					11	my $param;
462	5	50	33			41	if ( @_ == 1 && ref $_[0] eq 'HASH' ) {
		50
463	0					0	$param = $_[0];
464							}
465							elsif ( @_ % 2 == 0 ) {
466	5					30	$param = { @_ };
467							}
468							else {
469	0					0	croak( "Invalid parameters, must be either single hashref or key=>value pairs" );
470							}
471							# TODO(LH) Maybe some param name validation
472	5					61	my %conf = (
473							words => 10,
474							ip => $ENV{REMOTE_ADDR},
475							user_agent => $ENV{HTTP_USER_AGENT},
476							accept_lang => $ENV{HTTP_ACCEPT_LANGUAGE},
477							inc_linux => 0,
478							%$param,
479							);
480
481	5					69	my %rank = (
482							lang => {
483							start => 'AC',
484							repeat => 'AC',
485							},
486							lang_both => 1,
487							char => {
488							windows => {
489							start => 'WW',
490							repeat => 'WML',
491							},
492							macintosh => {
493							start => 'M',
494							repeat => 'MWL',
495							},
496							linux => {
497							start => 'LWM',
498							repeat => 'LWM',
499							},
500							},
501							mozilla_move => 1,
502							mozilla_insert => 3,
503							);
504	5	100				23	if ( $conf{ranking} ) {
505	2					14	%rank = (
506							%rank,
507	2					8	%{ $conf{ranking} }
508							);
509							# Validate ranking sequences
510	2	50				12	croak( 'Missing language ranking sequence' ) unless $rank{lang}->{repeat};
511	2					5	foreach my $os ( qw( windows macintosh linux ) ) {
512	6	50				25	croak( 'Missing ' . ucfirst $os . ' charset ranking sequence' ) unless $rank{char}->{$os}->{repeat};
513							}
514							}
515
516							# Get the OS
517	5	50				21	unless ( $conf{os} ) {
518	5					44	$conf{os} = $self->get_os( $conf{user_agent} );
519							}
520							# Default to windows if we still don't have an OS
521	5	50				14	$conf{os} = 'windows' unless $conf{os};
522	5					15	$conf{os} = lc $conf{os};
523							# OS of linux implies inc_linux
524	5	50				16	$conf{inc_linux} = 1 if $conf{os} eq 'linux';
525
526							# Get the list of language tags
527	5					21	my($country_meta, $lang_meta) = $self->_detect_get_langs(\%conf,\%rank);
528
529							# Get the related charsets and meta data
530	5					23	my @words = $self->get_words( $conf{text}, $conf{words} );
531	5					22	my %char_hash = (
532							W => [],
533							M => [],
534							L => [],
535							);
536	5					8	my %char_meta;
537							my %char_error;
538	5					29	$self->_detect_check_langs(\%char_meta,\%char_hash,\%char_error,\%conf,\@words);
539
540							# Does this parse as UTF-8?
541	5					352	my $is_utf8 = 1;
542	5					9	eval { decode( 'UTF-8', $conf{text}, Encode::FB_CROAK ) };
	5					27
543	5	100				637	$is_utf8 = 0 if $@;
544							# Make sure we have UTF-8 charset info
545	5	100				13	if ( $is_utf8 ) {
546							# UTF-8 could be any language, so doesn't tend to be picked up above
547	0					0	$char_meta{'utf-8'} = {
548							pos => 1,
549							lang => [],
550	1					7	words => [ map { decode( 'UTF-8', $_ ) } @words ],
551							};
552							}
553							else {
554	4					11	$char_error{'utf-8'}++;
555							}
556
557							# Rank position
558	5		33			29	$rank{char}->{ $conf{os} }->{start} \|\|= $rank{char}->{ $conf{os} }->{repeat};
559	5					30	my @sequence = split( //, $rank{char}->{ $conf{os} }->{start} );
560	5	100				18	my $pos = $is_utf8 ? 2 : 1;
561	5					18	while ( my $type = shift @sequence ) {
562	25	100	100			28	last unless @{ $char_hash{W} } \|\| @{ $char_hash{M} } \|\| @{ $char_hash{L} };
	25		66			95
	20					85
	5					24
563	20	100				68	push( @sequence, split( //, $rank{char}->{ $conf{os} }->{repeat} ) ) unless @sequence;
564	20					25	while ( my $charset = shift @{ $char_hash{$type} } ) {
	20					86
565	10	50				27	if ( $char_meta{$charset}->{pos} ) {
566	0					0	next;
567							}
568							else {
569	10					26	$char_meta{$charset}->{pos} = $pos;
570	10					13	$pos++;
571	10					39	last;
572							}
573							}#while
574							}#while
575
576							# Can we see what Mozilla detection thinks?
577	5					9	my $mozilla;
578	5	50				14	if ( $has_detect ) {
579	0					0	$mozilla = Encode::Detect::Detector::detect( $conf{text} );
580	0	0				0	$mozilla = lc $mozilla if defined $mozilla;
581	0	0				0	if ( $mozilla ) {
582							# Check charset can decode
583	0					0	my $charset_encode = _try_charset( $mozilla, $conf{text} );
584	0	0				0	if ( $charset_encode ) {
585							# Check we have the Mozilla charset in our list
586	0	0				0	if ( $char_meta{$mozilla} ) {
587	0					0	$char_meta{$mozilla}->{mozilla} = 1;
588							# Should Mozilla affect position?
589	0	0	0			0	if ( $rank{mozilla_move} && $char_meta{$mozilla}->{pos} != 1 ) {
590	0					0	my $pos_new = $char_meta{$mozilla}->{pos} - $rank{mozilla_move};
591	0	0				0	$pos_new = 1 if $pos_new < 1;
592							# Move other charsets
593	0	0				0	map { $_->{pos}++ } grep {
	0					0
594	0					0	$_->{pos} >= $pos_new &&
595							$_->{pos} < $char_meta{$mozilla}->{pos}
596							} values %char_meta;
597	0					0	$char_meta{$mozilla}->{pos} = $pos_new;
598							}
599							}#if
600							else {
601							# Insert Mozilla if it's not in list?
602	0	0				0	if ( $rank{mozilla_insert} ) {
603							# Push everything else down
604	0					0	map { $_->{pos}++ } grep {
	0					0
605	0					0	$_->{pos} >= $rank{mozilla_insert}
606							} values %char_meta;
607	0					0	$char_meta{$mozilla} = {
608							lang => [],
609	0					0	words => [ map { decode( $charset_encode, $_ ) } @words ],
610							pos => $rank{mozilla_insert},
611							mozilla => 1,
612							};
613							}
614							}#else
615							}#if
616							else {
617	0					0	$char_error{ $mozilla }++;
618							}
619							}#if
620							}#if
621
622							# Prep return
623	5					39	my @charsets = sort { $char_meta{$a}->{pos} <=> $char_meta{$b}->{pos} } keys %char_meta;
	7					33
624	5	100				16	if ( wantarray ) {
625	1					10	my %meta = (
626							charsets => \%char_meta,
627							lang => $lang_meta,
628							country => $country_meta,
629							error => [ keys %char_error ],
630							);
631	1					16	return ( \@charsets, \%meta);
632							}
633							else {
634	4					82	return $charsets[0];
635							}
636							}#sub
637
638							sub _detect_get_langs{
639	5			5		10	my $self = shift;
640	5					7	my $conf = shift;
641	5					10	my $rank = shift;
642
643	5					7	my %country_meta;
644							my %lang_meta;
645	5	100				55	if ( $conf->{lang} ) {
646	2	50				10	$conf->{lang} = [ $conf->{lang} ] unless ref $conf->{lang};
647	2					5	$conf->{lang} = [ map { lc $_ } @{ $conf->{lang} } ];
	2					9
	2					5
648							}
649							else {
650							## Get language list from conf with meta data
651							# Start with country list
652	3					6	my @country_list;
653	3	50				23	if ( $conf->{country} ) {
654	0	0				0	@country_list = ref $conf->{country} ? @{ $conf->{country} } : ( $conf->{country} );
	0					0
655	0					0	@country_list = map { lc $_ } @country_list;
	0					0
656							}
657							else {
658							# See if we have IP's to lookup countries for, we may have several
659	3	50	33			36	if ( $conf->{ip} && ($has_ipcountry \|\| $has_geoip) ) {
			33
660	0	0				0	$conf->{ip} = [ $conf->{ip} ] unless ref $conf->{ip};
661	0					0	foreach my $ip ( @{ $conf->{ip} } ) {
	0					0
662	0					0	my $country = $self->get_country( $ip );
663	0	0				0	next unless $country;
664	0	0				0	if ( $country_meta{$country} ) {
665	0					0	push( @{ $country_meta{$country}->{ip} }, $ip );
	0					0
666							}
667							else {
668	0					0	push( @country_list, $country );
669	0					0	$country_meta{$country} = {
670							ip => [$ip],
671							};
672							}
673							}#foreach
674							}#if
675							# Are there extra countries to add to the start or end of the list
676	3	50				13	if ( ref $conf->{country_extra} ) {
677	0					0	foreach my $position ( qw( end start ) ) {
678	0	0				0	if ( $conf->{country_extra}->{$position} ) {
679	0	0				0	$conf->{country_extra}->{$position} = [ $conf->{country_extra}->{$position} ] unless ref $conf->{country_extra}->{$position};
680	0					0	$conf->{country_extra}->{$position} = [ map { lc $_ } @{ $conf->{country_extra}->{$position} } ];
	0					0
	0					0
681	0					0	foreach my $country ( @{ $conf->{country_extra}->{$position} } ) {
	0					0
682							# Check if it's already in the list, in which case remove
683	0	0				0	if ( $country_meta{$country} ) {
684	0		0			0	$country_meta{$country}->{extra} \|\|= [];
685	0					0	push( @{ $country_meta{$country}->{extra} }, $position );
	0					0
686							# If adding to the end, leave in current position, only move to start
687	0	0				0	@country_list = grep { $_ ne $country } @country_list if $position eq 'start';
	0					0
688							}
689							else {
690	0					0	$country_meta{$country} = {
691							extra => [ $position ],
692							};
693							}
694							}
695							}#if
696							}#foreach
697							# Add to front/back of list
698	0	0				0	unshift( @country_list, @{ $conf->{country_extra}->{start} } ) if ref $conf->{country_extra}->{start};
	0					0
699	0	0				0	push( @country_list, @{ $conf->{country_extra}->{end} } ) if ref $conf->{country_extra}->{end};
	0					0
700							}#if
701							}#else
702							# Get lang tags from countries
703	3					6	my @lang_country;
704							my %country_seen;
705	3					9	foreach my $country ( @country_list ) {
706	0					0	$country_meta{$country}->{name} = $self->get_country_name( $country );
707	0					0	my @lang_list = $self->get_country_lang( $country );
708	0					0	foreach my $lang ( @lang_list ) {
709	0	0				0	next if $country_seen{$lang};
710	0					0	push( @lang_country, $lang );
711	0					0	$country_seen{$lang}++;
712							}
713							}#foreach
714
715							# Now lang list from accept_langs
716	3					7	my @lang_accept;
717							my %accept_seen;
718	3	50				10	if ( $conf->{accept_lang} ) {
719	3	50				18	$conf->{accept_lang} = [ $conf->{accept_lang} ] unless ref $conf->{accept_lang};
720	3					6	foreach my $accept_lang ( @{ $conf->{accept_lang} } ) {
	3					11
721	3					14	my @lang_list = $self->get_accept_lang( $accept_lang );
722	3					11	foreach my $lang ( @lang_list ) {
723	3	50				12	next if $accept_seen{$lang};
724	3					8	push( @lang_accept, $lang );
725	3					14	$accept_seen{$lang}++;
726							}
727							}
728							}
729							# Are there extra lang tags to add to the start or end of the list
730	3					6	my %extra_seen;
731							my %extra_list;
732	3	50				11	if ( ref $conf->{lang_extra} ) {
733	0					0	foreach my $position ( qw( end start ) ) {
734	0					0	$extra_list{$position} = [];
735	0	0				0	if ( $conf->{lang_extra}->{$position} ) {
736	0	0				0	$conf->{lang_extra}->{$position} = [ $conf->{lang_extra}->{$position} ] unless ref $conf->{lang_extra}->{$position};
737	0					0	$conf->{lang_extra}->{$position} = [ map { lc $_ } @{ $conf->{lang_extra}->{$position} } ];
	0					0
	0					0
738	0					0	$extra_list{$position} = $conf->{lang_extra}->{$position};
739	0					0	$extra_seen{$position} = { map { $_ => 1 } @{ $conf->{lang_extra}->{$position} } };
	0					0
	0					0
740							}
741							}#foreach
742							}#if
743							## Rank languages based on order and appearance in both lists
744							# Which lists they appear in
745	3					143	foreach my $lang ( @lang_accept, @lang_country, @{ $extra_list{start} }, @{ $extra_list{end} } ) {
	3					8
	3					9
746	3	50				10	next if $lang_meta{$lang};
747	3	50	33			120	$lang_meta{$lang} = {
			50
			50
			50
			50
748							both => $accept_seen{$lang} && $country_seen{$lang} ? 1 : 0,
749							accept => $accept_seen{$lang} \|\| 0,
750							country => $country_seen{$lang} \|\| 0,
751							start => $extra_seen{start}->{$lang} \|\| 0,
752							end => $extra_seen{end}->{$lang} \|\| 0,
753							name => $self->get_lang_name($lang),
754							};
755							}#foreach
756							## Rank position
757							# Extra start will go first
758	3					8	my $pos = 1;
759	3					5	foreach my $lang ( @{ $extra_list{start} } ) {
	3					12
760	0	0				0	next if $lang_meta{$lang}->{pos};
761	0					0	$lang_meta{$lang}->{pos} = $pos;
762	0					0	$pos++;
763							}#foreach
764							# Then sequence
765	3					14	my %lang_hash = (
766							A => \@lang_accept,
767							C => \@lang_country,
768							);
769	3		33			14	$rank->{lang}->{start} \|\|= $rank->{lang}->{repeat};
770	3					16	my @sequence = split( //, $rank->{lang}->{start} );
771	3					13	while ( my $type = shift @sequence ) {
772	6	100	66			7	last unless @{ $lang_hash{A} } \|\| @{ $lang_hash{C} };
	6					34
	3					15
773	3	50				10	push( @sequence, split( //, $rank->{lang}->{repeat} ) ) unless @sequence;
774	3					6	while ( my $lang = shift @{ $lang_hash{$type} } ) {
	3					14
775	3	50				11	if ( $lang_meta{$lang}->{pos} ) {
776	0					0	next;
777							}
778							else {
779	3					15	$lang_meta{$lang}->{pos} = $pos;
780	3					4	$pos++;
781	3					15	last;
782							}
783							}#while
784							}#while
785							# Extra end added to the end
786	3					5	foreach my $lang ( @{ $extra_list{end} } ) {
	3					20
787	0	0				0	next if $lang_meta{$lang}->{pos};
788	0					0	$lang_meta{$lang}->{pos} = $pos;
789	0					0	$pos++;
790							}#foreach
791							# Prefer languages that appear in both?
792	3	50				16	if ( $rank->{lang_both} ) {
793	0	0	0			0	$conf->{lang} = [ sort {
794	3					27	$lang_meta{$b}->{start} <=> $lang_meta{$a}->{start} \|\|
795							$lang_meta{$b}->{both} <=> $lang_meta{$a}->{both} \|\|
796							$lang_meta{$a}->{pos} <=> $lang_meta{$b}->{pos}
797							} keys %lang_meta ];
798							}
799							else {
800	0					0	$conf->{lang} = [ sort {
801	0					0	$lang_meta{$a}->{pos} <=> $lang_meta{$b}->{pos}
802							} keys %lang_meta ];
803							}
804							}#else
805
806	5					23	return (\%country_meta,\%lang_meta);
807							}
808
809							sub _detect_check_langs{
810	5			5		8	my $self = shift;
811	5					7	my $char_meta = shift;
812	5					8	my $char_hash = shift;
813	5					8	my $char_error = shift;
814	5					10	my $conf = shift;
815	5					7	my $words = shift;
816	5	50				19	my @os_list = ('W','M',$conf->{inc_linux} ? 'L' : ());
817	5					9	foreach my $lang ( @{ $conf->{lang} } ) {
	5					12
818	5					16	my @charsets = $self->get_lang_charset( $lang );
819	5	50				12	next unless @charsets;
820	5					19	for ( my $i=0; $i <= $#os_list; $i++ ) {
821	10					185	my $charset = $charsets[$i];
822	10					15	my $os = $os_list[$i];
823	10	50				31	next if $char_error->{ $charset };
824	10	50				23	if ( $char_meta->{ $charset } ) {
825	0					0	push( @{ $char_meta->{$charset}->{lang} }, $lang );
	0					0
826							}
827							else {
828							# Test charset parses
829	10					31	my $charset_encode = _try_charset( $charset, $conf->{text} );
830	10	50				67	if ( $charset_encode ) {
831	10					15	push( @{ $char_hash->{ $os } }, $charset );
	10					33
832	10					199	$char_meta->{ $charset } = {
833							lang => [ $lang ],
834	10					110	words => [ map { decode( $charset_encode, $_ ) } @$words ],
835							};
836							}
837							else {
838	0					0	$char_error->{ $charset }++;
839							}
840							}#else
841							}#for
842							}#foreach
843							}
844
845							sub _try_charset {
846	10			10		23	my ( $charset, $text ) = @_;
847							# Older versions of Encode::Alias don't map x-mac-* encodings properly
848	10					82	$charset =~ s/^(?:x[_-])?mac[_-](.*)$/mac$1/;
849	10					19	$charset =~ s/^macce$/maccentraleurroman/;
850	10					16	eval { decode( $charset, $text, Encode::FB_CROAK ) };
	10					60
851	10	50				7201	return $@ ? 0 : $charset;
852							}#sub
853
854
855							=back
856
857							=head1 RANKING SYSTEM
858
859							Unfortunately the heuristics employed by this method aren't straight forward.
860							Several key scenarios are taken into consideration, namely:
861
862							The upload charset is:
863							for the language that matches the browsers language settings and OS.
864							for the language that matches the uploaders countries official language and OS.
865							for the language that matches the browsers language settings, but a different OS.
866							for the language that matches the uploaders countries official language, but a different OS.
867							unrelated, hopefully detected by Mozilla's universal charset detector.
868
869							Although the browsers language setting is preferred, it's not unusually for it
870							to be incorrect. For example a surprising number of UK users have en-US rather
871							than en-GB. In such instances the language from the IP would be more accurate.
872							For this reason if the Mozilla detected charset matches an IP dervied charset it
873							is brought to the front.
874							However, an Englishman uploading a file whilst abroad would not give an accurate
875							language from IP. Likewise, some countries like South Africa have several
876							recognised languages.
877							Some countries have inhabitants that use either Latin or Cyrillic alphabets for
878							the same language. In these instances, the Mozilla detector is used to determine
879							which is more likely, but both options will be returned.
880							The use of Macintosh computers has been on the rise, as has the appearance of
881							their charsets. In fact that's what led me to write this module, as the Mozilla
882							detector doesn't cover every encoding and was missing Mac-Roman. Generally
883							Windows users are less likely to upload files with Macintosh encoding, Although
884							the same cannot be said the other way around. For this reason, when the OS is
885							Macintosh it's matching charsets will come first, followed by the likely
886							Windows, alternating between the two.
887
888							We assume linux systems are mostly UTF-8 these days, that their pre-UTF-8 ISO
889							charsets were roughly the same as the Windows equivalents, and that Linux users
890							are generally more computer savvy. For these reasons Linux charsets are not
891							included in results by default.
892
893							Rather than ranking charsets through some kind of weighting based on appearance,
894							we apply configurable patterns. Weight would always favour common charsets,
895							hopefully the ranking patterns work better.
896
897							This is the first version of this module. I'm open to suggestions with regards
898							improved heuristics, and possibly configurable heuristics.
899
900							You can override the default ranking by passing the appropriate data structure
901							to detect(). You need to at least provide the repeat string for lang and all the
902							OSs.
903
904							IP country lookup and accept_language parsing is used initially to generate a
905							list of matching languages. The order in which these are then ranked is based
906							on their appearance (accept_lang), or popularity (country), and the sequence
907							given. A represents accept_lang and C represents country, so a sequence starting
908							with AC and repeating with AC would generate ACACACACAC... until there are no
909							matching languages left. The lang_both option pushes charsets that come from
910							both accept_lang and country.
911
912							Next charsets are matched from the languages by OS. Depending on what OS has
913							been passed, or detected from user_agent. The char sequences contain W for
914							Windows, M for Macintosh or L for Linux. The Linux charsets are filtered out
915							unless the OS is Linux or the inc_linux config option is enabled. So a Windows
916							OS with sequence starting WW and repeating WML would generate WWWMWMWMWM...
917							matching the first 3 likely windows charsets, then the most likely Macintosh,
918							etc. Charsets are tested to see if they can decode the text, invalid ones are
919							filtered out.
920
921							The string is tested to see whether it looks like UTF-8. If it does that's
922							pushed to the front on the list. If the Mozilla charset detector is available
923							it's used to see what charset it returns. The option mozilla_move sets how the
924							many places to move the matching charset forward in the list. The
925							mozilla_insert options defines in what position to insert the Mozilla match if
926							it's not already in the list.
927
928							my %ranking = (
929							lang => {
930							start => 'AC',
931							repeat => 'AC',
932							},
933							# Rank languages that appear in both country and accept_lang first
934							lang_both => 1,
935							char => {
936							windows => {
937							start => 'WW',
938							repeat => 'WML',
939							},
940							macintosh => {
941							start => 'M',
942							repeat => 'MWL',
943							},
944							linux => {
945							start => 'LWM',
946							repeat => 'LWM',
947							},
948							},
949							# Mozilla detected charset options
950							mozilla_move => 1, # Number of positions to move the forward
951							mozilla_insert => 3, # At what position to insert if it's not in list
952							);
953							my $charset = $detector->detect( ranking => \%ranking );
954
955							=head1 LICENSE
956
957							This is released under the Artistic
958							License. See L.
959
960							=head1 AUTHORS
961
962							Lyle Hopkins - L
963
964							Peter Haworth - L
965
966							Development kindly sponsored by - L
967
968							=head1 REFERENCES
969
970							I had a hard time finding good data sources, all the information I needed was
971							pretty spread out. These are the main sites I used, but there was lots of
972							googling to fill in the gaps.
973
974							L
975							L
976							L
977							L
978							L
979							L
980							L
981							L
982							L
983
984							=head1 SEE ALSO
985
986							L, L, L, L
987
988							=head1 TODO
989
990							Make default between Latin and Cyrillic based on popularity in language
991							Write some tests
992							Rank regions differently?
993							Generalize environment examination, defaults only at detect() or confgurable thru the detector object itself
994
995							=cut
996
997
998
999							1;