File Coverage

lib/Encode/Repair.pm
Criterion Covered Total %
statement 13 15 86.6
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 18 20 90.0


line stmt bran cond sub pod time code
1             package Encode::Repair;
2             our $VERSION = '0.0.2';
3 2     2   170414 use strict;
  2         7  
  2         91  
4 2     2   12 use warnings;
  2         4  
  2         141  
5              
6             our @EXPORT_OK = qw(repair_double learn_recoding repair_encoding);
7 2     2   12 use Exporter qw(import);
  2         9  
  2         80  
8 2     2   949 use Encode qw(encode decode);
  2         10665  
  2         260  
9 2     2   2155 use Algorithm::Loops qw(NestedLoops MapCar);
  0            
  0            
10              
11             # since Algorithm::Loops already provides MapCar, it is very easy to implement
12             # zip() with it, instead of introducing another dependency (on
13             # List::MoreUtils, specifically)
14             sub zip {
15             MapCar { @_ == 2 ? @_ : () } @_;
16             }
17              
18             my %subs = (
19             encode => \&encode,
20             decode => \&decode,
21             );
22              
23             sub repair_encoding {
24             my ($str, $actions) = @_;
25             for (my $i = 0; $i < @$actions; $i += 2) {
26             my $type = $actions->[$i];
27             my $encoding = $actions->[$i+1];
28             no warnings 'utf8';
29             $str = $subs{$type}->($encoding, $str);
30             }
31             $str;
32             }
33              
34             sub repair_double {
35             my ($buf, $options) = @_;
36             my $via = 'ISO-8859-1';
37             $via = $options->{via} if $options && exists $options->{via};
38             repair_encoding($buf, [
39             'decode', 'UTF-8',
40             'encode', $via,
41             'decode', 'UTF-8',
42             ]);
43             }
44              
45             sub learn_recoding {
46             my %args = @_;
47             my $source = $args{from};
48             my $target = $args{to};
49             my $encodings = $args{encodings};
50             my $maxdepth = $args{depth} || 5;
51             my $search_mode = $args{search} || 'first';
52             return [] if $source eq $target;
53              
54             my @result;
55             for my $depth (1..$maxdepth) {
56             my $iter = NestedLoops( [($encodings) x $depth] );
57             my @ed = (qw(encode decode)) x (int($depth / 2) + 1);
58             my @de = (qw(decode encode)) x (int($depth / 2) + 1);
59             while (my @steps = $iter->()) {
60             no warnings 'uninitialized';
61             for my $steps ([zip \@ed, \@steps], [zip \@de, \@steps]) {
62             # use Data::Dumper;
63             # warn Dumper($steps);
64             if (eval {repair_encoding($source, $steps)} eq $target) {
65             if (lc($search_mode) eq 'first') {
66             return $steps;
67             } else {
68             push @result, $steps;
69             }
70             }
71             }
72             }
73             return \@result if @result && lc($search_mode) eq 'shallow';
74             }
75             return \@result if @result;
76             return;
77             }
78              
79             1;
80              
81             =encoding utf-8
82              
83             =head1 NAME
84              
85             Encode::Repair - Repair wrongly encoded text strings
86              
87             =head1 SYNOPSIS
88              
89             # Simple usage
90             use Encode::Repair qw(repair_double);
91             binmode STDOUT, ':encoding(UTF-8)';
92              
93             # prints: small ae: ä
94             print repair_double("small ae: \xc3\x83\xc2\xa4\n");
95              
96             # prints: beta: β
97             print repair_double("beta: \xc4\xaa\xc2\xb2\n", {via => 'Latin-7'});
98              
99              
100             # Advanced usage
101             # assumes you have a sample text both correctly decoded in a
102             # character string, and as a wrongly encoded buffer
103              
104             use Encode::Repair qw(repair_encoding learn_recoding);
105             use charnames qw(:full);
106             binmode STDOUT, ':encoding(UTF-8)';
107              
108             my $recoding_pattern = learn_recoding(
109             from => "beta: \xc4\xaa\xc2\xb2",
110             to => "beta: \N{GREEK SMALL LETTER BETA}",
111             encodings => ['UTF-8', 'Latin-1', 'Latin-7'],
112             );
113             if ($recoding_pattern) {
114             my $mojibake = "\304\252\302\273\304\252\302\261\304\252\302"
115             ."\274\304\252\342\200\234\304\252\302\261";
116             print repair_encoding($mojibake, $recoding_pattern), "\n";
117             } else {
118             print "Sorry, could not help you :-(\n";
119             }
120              
121              
122             =head1 DESCRIPTION
123              
124             Sometimes software or humans mess up the character encoding of text. In some
125             cases it is possible to reconstruct the original text. This module helps you
126             to do it.
127              
128             It covers the rather common case that a program assumes a wrong character
129             encoding on reading some input, and converts it to Mojibake (see
130             L).
131              
132             If you use this module on a regular basis, it most likely indicates that
133             something is wrong in your processs. It should only be used for one-time tasks
134             such as migrating a database to a new system.
135              
136             =head1 FUNCTIONS
137              
138             =over
139              
140             =item repair_double
141              
142             Repairs the common case when a UTF-8 string was read as another encoding,
143             and was encoded as UTF-8 again. The other encoding defaults to ISO-8859-1 aka
144             Latin-1, and can be overridden with the C option:
145              
146             my $repaired = repair_double($buffer, {via => 'ISO-8859-2' });
147              
148             It expects an octet string as input, and returns a decoded character string.
149              
150             =item learn_recoding
151              
152             Given a sample of text twice, once correctly decoded and once mistreated,
153             attemps to find a sequence of encoding and decoding that turns the mistreated
154             text into the correct form.
155              
156             my $coding_pattern = learn_recoding(
157             from => $mistreated_buffer,
158             to => $correct_string,
159             encodings => \@involved_encodings,
160             depth => 5,
161             search => 'first',
162             );
163              
164             C should be an array reference containing all the character
165             encodings involved in the process that messes up the encoding. If you don't
166             know these, try it with C, C and the encoding that your
167             system uses by default.
168              
169             C is the maximal number of encoding and decoding steps to be tried. For
170             example C needs three steps. Defaults to 5; higher values might
171             slow down the program significantly, although smaller depths are tried first.
172              
173             The return value is C on failure, and an array reference otherwise. It
174             returns the encoding/decoding steps suitable for feeding into C.
175             It contains a list of even size, where elements with even indexes are either
176             C<'encode'> or C<'decode'>, and those with odd indexes contain the name of the
177             encoding.
178              
179             With C you can adjust how long the function searches for a recoding
180             sequence.
181             WIth the default of C<'first'> it returns the first possible sequence. With
182             C<'shallow'> it searches for the first working sequence and all other
183             sequences of the same length, and then returns an array reference containing
184             array references to all sequences. With the value C<'all'>, all possible
185             sequences are searched and returned, but often that's a very bad idea, because
186             it also finds sequences where parts of the sequence undo the work of other
187             sequences (something like C<[qw(encode latin-1 decode latin-1)]>).
188              
189             Since Version 0.0.2 C forces strict pattern of alternatining
190             encoding and decoding. So even if C<['decode', 'UTF-8', 'decode', 'UTF-8']> is
191             a working input, C will return C<['decode', 'UTF-8', 'encode',
192             'Latin-1', 'decode', 'UTF-8']> instead. So you might have to include C
193             in your encoding list even if it is not strictly involved.
194              
195             =item repair_encoding
196              
197             Takes an input string and an encoding/decoding pattern (as returned from
198             C) as input and returns the repaired string.
199              
200             =back
201              
202             =head1 Troubleshooting
203              
204             If C returns C, you can increase the C option
205             value (for example to 7). If that doesn't help, check that the two input
206             strings actually corespond. C does an exact equality check, so
207             trailing newline characters or spaces will cause it to fail.
208              
209             If C produces errors or warnings, it is likely that the sample
210             you used for learning was not long enough, or not representative. For example
211             if your system uses both ISO-8859-1 and ISO-8859-15 (which are quite similar),
212             C uses the first match, so the sample data has to contain at
213             least one character that's in ISO-8859-15 but not in ISO-8859-1, like the
214             Euro sign (€).
215              
216             =head1 Further Reading
217              
218             This document tries to stick to the terminology introduced in the L
219             module.
220              
221             If you want to learn more about the way text is encoded and how perl handles
222             that, take a look at L.
223              
224             =head1 LICENSE AND COPYRIGHT
225              
226             Copyright (C) 2008, 2009 by Moritz Lenz, L,
227             moritz@faui2k3.org.
228              
229             This is free software; you my use it under the terms of the Artistic License 2
230             as published by The Perl Foundation.
231              
232             The code examples distributed with this package are an exception, and may be
233             used, modified and redistributed without any limitations.
234              
235             Encode::Repair is distributed in the hope that it will be useful, but WITHOUT
236             ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
237             FOR A PARTICULAR PURPOSE.
238              
239             =head1 Development
240              
241             The source code is stored in a public git repository at
242             L. If you find any bugs, please used the
243             issue tracker linked from this site.
244              
245             If you find a case of messed-up encodings that can be repaired deterministically
246             and that's not covered by this module, please contact the author, providing a
247             hex dump of both input and output, and as much information of the encoding and
248             decoding process as you have.
249              
250             Patches are also very welcome.
251              
252             =cut