File Coverage

lib/Encode/Repair.pm

Criterion	Covered	Total	%
statement	13	15	86.6
branch			n/a
condition			n/a
subroutine	5	5	100.0
pod			n/a
total	18	20	90.0

line	stmt	sub	time	code
1				package Encode::Repair;
2				our $VERSION = '0.0.2';
3	2	2	170414	use strict;
	2		7
	2		91
4	2	2	12	use warnings;
	2		4
	2		141
5
6				our @EXPORT_OK = qw(repair_double learn_recoding repair_encoding);
7	2	2	12	use Exporter qw(import);
	2		9
	2		80
8	2	2	949	use Encode qw(encode decode);
	2		10665
	2		260
9	2	2	2155	use Algorithm::Loops qw(NestedLoops MapCar);
	0
	0
10
11				# since Algorithm::Loops already provides MapCar, it is very easy to implement
12				# zip() with it, instead of introducing another dependency (on
13				# List::MoreUtils, specifically)
14				sub zip {
15				MapCar { @_ == 2 ? @_ : () } @_;
16				}
17
18				my %subs = (
19				encode => \&encode,
20				decode => \&decode,
21				);
22
23				sub repair_encoding {
24				my ($str, $actions) = @_;
25				for (my $i = 0; $i < @$actions; $i += 2) {
26				my $type = $actions->[$i];
27				my $encoding = $actions->[$i+1];
28				no warnings 'utf8';
29				$str = $subs{$type}->($encoding, $str);
30				}
31				$str;
32				}
33
34				sub repair_double {
35				my ($buf, $options) = @_;
36				my $via = 'ISO-8859-1';
37				$via = $options->{via} if $options && exists $options->{via};
38				repair_encoding($buf, [
39				'decode', 'UTF-8',
40				'encode', $via,
41				'decode', 'UTF-8',
42				]);
43				}
44
45				sub learn_recoding {
46				my %args = @_;
47				my $source = $args{from};
48				my $target = $args{to};
49				my $encodings = $args{encodings};
50				my $maxdepth = $args{depth} \|\| 5;
51				my $search_mode = $args{search} \|\| 'first';
52				return [] if $source eq $target;
53
54				my @result;
55				for my $depth (1..$maxdepth) {
56				my $iter = NestedLoops( [($encodings) x $depth] );
57				my @ed = (qw(encode decode)) x (int($depth / 2) + 1);
58				my @de = (qw(decode encode)) x (int($depth / 2) + 1);
59				while (my @steps = $iter->()) {
60				no warnings 'uninitialized';
61				for my $steps ([zip \@ed, \@steps], [zip \@de, \@steps]) {
62				# use Data::Dumper;
63				# warn Dumper($steps);
64				if (eval {repair_encoding($source, $steps)} eq $target) {
65				if (lc($search_mode) eq 'first') {
66				return $steps;
67				} else {
68				push @result, $steps;
69				}
70				}
71				}
72				}
73				return \@result if @result && lc($search_mode) eq 'shallow';
74				}
75				return \@result if @result;
76				return;
77				}
78
79				1;
80
81				=encoding utf-8
82
83				=head1 NAME
84
85				Encode::Repair - Repair wrongly encoded text strings
86
87				=head1 SYNOPSIS
88
89				# Simple usage
90				use Encode::Repair qw(repair_double);
91				binmode STDOUT, ':encoding(UTF-8)';
92
93				# prints: small ae: ä
94				print repair_double("small ae: \xc3\x83\xc2\xa4\n");
95
96				# prints: beta: β
97				print repair_double("beta: \xc4\xaa\xc2\xb2\n", {via => 'Latin-7'});
98
99
100				# Advanced usage
101				# assumes you have a sample text both correctly decoded in a
102				# character string, and as a wrongly encoded buffer
103
104				use Encode::Repair qw(repair_encoding learn_recoding);
105				use charnames qw(:full);
106				binmode STDOUT, ':encoding(UTF-8)';
107
108				my $recoding_pattern = learn_recoding(
109				from => "beta: \xc4\xaa\xc2\xb2",
110				to => "beta: \N{GREEK SMALL LETTER BETA}",
111				encodings => ['UTF-8', 'Latin-1', 'Latin-7'],
112				);
113				if ($recoding_pattern) {
114				my $mojibake = "\304\252\302\273\304\252\302\261\304\252\302"
115				."\274\304\252\342\200\234\304\252\302\261";
116				print repair_encoding($mojibake, $recoding_pattern), "\n";
117				} else {
118				print "Sorry, could not help you :-(\n";
119				}
120
121
122				=head1 DESCRIPTION
123
124				Sometimes software or humans mess up the character encoding of text. In some
125				cases it is possible to reconstruct the original text. This module helps you
126				to do it.
127
128				It covers the rather common case that a program assumes a wrong character
129				encoding on reading some input, and converts it to Mojibake (see
130				L).
131
132				If you use this module on a regular basis, it most likely indicates that
133				something is wrong in your processs. It should only be used for one-time tasks
134				such as migrating a database to a new system.
135
136				=head1 FUNCTIONS
137
138				=over
139
140				=item repair_double
141
142				Repairs the common case when a UTF-8 string was read as another encoding,
143				and was encoded as UTF-8 again. The other encoding defaults to ISO-8859-1 aka
144				Latin-1, and can be overridden with the C option:
145
146				my $repaired = repair_double($buffer, {via => 'ISO-8859-2' });
147
148				It expects an octet string as input, and returns a decoded character string.
149
150				=item learn_recoding
151
152				Given a sample of text twice, once correctly decoded and once mistreated,
153				attemps to find a sequence of encoding and decoding that turns the mistreated
154				text into the correct form.
155
156				my $coding_pattern = learn_recoding(
157				from => $mistreated_buffer,
158				to => $correct_string,
159				encodings => \@involved_encodings,
160				depth => 5,
161				search => 'first',
162				);
163
164				C should be an array reference containing all the character
165				encodings involved in the process that messes up the encoding. If you don't
166				know these, try it with C, C and the encoding that your
167				system uses by default.
168
169				C is the maximal number of encoding and decoding steps to be tried. For
170				example C needs three steps. Defaults to 5; higher values might
171				slow down the program significantly, although smaller depths are tried first.
172
173				The return value is C on failure, and an array reference otherwise. It
174				returns the encoding/decoding steps suitable for feeding into C.
175				It contains a list of even size, where elements with even indexes are either
176				C<'encode'> or C<'decode'>, and those with odd indexes contain the name of the
177				encoding.
178
179				With C you can adjust how long the function searches for a recoding
180				sequence.
181				WIth the default of C<'first'> it returns the first possible sequence. With
182				C<'shallow'> it searches for the first working sequence and all other
183				sequences of the same length, and then returns an array reference containing
184				array references to all sequences. With the value C<'all'>, all possible
185				sequences are searched and returned, but often that's a very bad idea, because
186				it also finds sequences where parts of the sequence undo the work of other
187				sequences (something like C<[qw(encode latin-1 decode latin-1)]>).
188
189				Since Version 0.0.2 C forces strict pattern of alternatining
190				encoding and decoding. So even if C<['decode', 'UTF-8', 'decode', 'UTF-8']> is
191				a working input, C will return C<['decode', 'UTF-8', 'encode',
192				'Latin-1', 'decode', 'UTF-8']> instead. So you might have to include C
193				in your encoding list even if it is not strictly involved.
194
195				=item repair_encoding
196
197				Takes an input string and an encoding/decoding pattern (as returned from
198				C) as input and returns the repaired string.
199
200				=back
201
202				=head1 Troubleshooting
203
204				If C returns C, you can increase the C option
205				value (for example to 7). If that doesn't help, check that the two input
206				strings actually corespond. C does an exact equality check, so
207				trailing newline characters or spaces will cause it to fail.
208
209				If C produces errors or warnings, it is likely that the sample
210				you used for learning was not long enough, or not representative. For example
211				if your system uses both ISO-8859-1 and ISO-8859-15 (which are quite similar),
212				C uses the first match, so the sample data has to contain at
213				least one character that's in ISO-8859-15 but not in ISO-8859-1, like the
214				Euro sign (€).
215
216				=head1 Further Reading
217
218				This document tries to stick to the terminology introduced in the L
219				module.
220
221				If you want to learn more about the way text is encoded and how perl handles
222				that, take a look at L.
223
224				=head1 LICENSE AND COPYRIGHT
225
226				Copyright (C) 2008, 2009 by Moritz Lenz, L,
227				moritz@faui2k3.org.
228
229				This is free software; you my use it under the terms of the Artistic License 2
230				as published by The Perl Foundation.
231
232				The code examples distributed with this package are an exception, and may be
233				used, modified and redistributed without any limitations.
234
235				Encode::Repair is distributed in the hope that it will be useful, but WITHOUT
236				ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
237				FOR A PARTICULAR PURPOSE.
238
239				=head1 Development
240
241				The source code is stored in a public git repository at
242				L. If you find any bugs, please used the
243				issue tracker linked from this site.
244
245				If you find a case of messed-up encodings that can be repaired deterministically
246				and that's not covered by this module, please contact the author, providing a
247				hex dump of both input and output, and as much information of the encoding and
248				decoding process as you have.
249
250				Patches are also very welcome.
251
252				=cut