File Coverage

blib/lib/JSON/Repair.pm
Criterion Covered Total %
statement 109 188 57.9
branch 46 116 39.6
condition 22 36 61.1
subroutine 10 12 83.3
pod 1 4 25.0
total 188 356 52.8


line stmt bran cond sub pod time code
1             package JSON::Repair;
2 1     1   73530 use parent Exporter;
  1         329  
  1         5  
3             our @EXPORT_OK = qw/repair_json/;
4             our %EXPORT_TAGS = (all => \@EXPORT_OK);
5 1     1   85 use warnings;
  1         2  
  1         23  
6 1     1   6 use strict;
  1         1  
  1         15  
7 1     1   4 use utf8;
  1         2  
  1         10  
8 1     1   19 use Carp;
  1         1  
  1         47  
9              
10             # This Perl version is required because of hashes as errors from
11             # JSON::Parse.
12              
13 1     1   28 use 5.014;
  1         3  
14 1     1   599 use JSON::Parse '0.58';
  1         1098  
  1         125  
15 1     1   516 use C::Tokenize '$comment_re';
  1         4505  
  1         1933  
16              
17             our $VERSION = '0.08';
18              
19             sub repair_json
20             {
21 9     9 1 21531 my ($broken, %options) = @_;
22 9         60 my $jp = JSON::Parse->new ();
23             # Request a hash response from $jp when there is an error.
24 9         31 $jp->diagnostics_hash (1);
25 9         16 my $verbose = $options{verbose};
26 9         13 my $output = $broken;
27 9         14 while (1) {
28             # Try various repairs. This continues until the JSON is
29             # valid, or none of the repairs have worked. After a
30             # successful repair, "next;" should be used. Falling through
31             # to the end of the while loop which started above causes an
32             # exit with an error message.
33 25         38 eval {
34 25         541 $jp->check ($output);
35             };
36 25 100       76 if (! $@) {
37 9         16 last;
38             }
39 16         32 my $error = $@->{error};
40             # print STDERR "$error\n";
41             # The type of thing where the error occurred
42 16         23 my $type = $@->{'bad type'};
43 16 100       45 if ($error eq 'Unexpected character') {
    100          
    50          
44 13         21 my $bad_byte = $@->{'bad byte contents'};
45             # $bad_byte is a number, so for convenient string
46             # comparison, turn it into a string.
47 13         26 my $bad_char = chr ($bad_byte);
48 13         19 my $valid_bytes = $@->{'valid bytes'};
49             # The position of the bad byte.
50 13         20 my $bad_pos = $@->{'bad byte position'};
51 13 50       24 if ($verbose) {
52 0         0 print "Unexpected character '$bad_char' at byte $bad_pos.\n";
53             }
54             # Everything leading up to the bad byte.
55 13         26 my $previous = substr ($output, 0, $bad_pos - 1);
56             # Everything after the bad byte.
57 13         35 my $remaining = substr ($output, $bad_pos);
58 13 100 66     71 if ($bad_char eq "'" && $valid_bytes->[ord ('"')]) {
    100 100        
59             # Substitute a ' in the remaining stuff, if there is
60             # one, up to a comma or colon or an end-of marker.
61 3 50       36 if ($remaining =~ s/^([^,:\]\}]*)'(\s*[,:\]\}])/$1"$2/) {
62 3         8 my $string = $1;
63 3 100       10 if ($string =~ /"/) {
64 1         2 my $quotedstring = $string;
65 1         14 $quotedstring =~ s/"/\\"/g;
66 1         19 $remaining =~ s/^\Q$string/$quotedstring/;
67             }
68             }
69 3         9 $output = $previous . '"' . $remaining;
70 3 50       7 if ($verbose) {
71 0         0 print "Changing single to double quote.\n";
72             }
73 3         7 next;
74             }
75             # An unexpected } or ] usually means there was a comma
76             # after an array or object entry, followed by the end
77             # of the object.
78             elsif ($bad_char eq '}' || $bad_char eq ']') {
79             # Look for a comma at the end of it.
80 2 50 0     13 if ($previous =~ /,\s*$/) {
    0          
81 2         12 $previous =~ s/,(\s*)$/$1/;
82 2         8 $output = $previous . $bad_char . $remaining;
83 2 50       5 if ($verbose) {
84 0         0 print "Removing a trailing comma.\n";
85             }
86 2         5 next;
87             }
88             elsif ($bad_char eq '}' && $previous =~ /:\s*$/) {
89             # In the unlikely event that there was a colon
90             # before the end of the object, add a "null"
91             # to it.
92 0         0 $output = $previous . "null" . $remaining;
93 0         0 next;
94             }
95             else {
96 0         0 warn "Unexpected } or ] in $type\n";
97             }
98             }
99 8 100 66     42 if (($type eq 'object' || $type eq 'array' ||
      100        
100             $type eq 'initial state')) {
101             # Handle comments in these states.
102 2 50       5 if ($bad_char eq '/') {
103 0 0       0 if ($verbose) {
104 0         0 print "C-style comments in object or array?\n";
105             }
106 0         0 $remaining = $bad_char . $remaining;
107 0 0       0 if ($remaining =~ s/^($comment_re)//) {
108 0 0       0 if ($verbose) {
109 0         0 print "Deleting comment '$1'.\n";
110             }
111 0         0 rm_trailing_h (\$previous, $verbose);
112 0         0 $output = $previous . $remaining;
113 0         0 next;
114             }
115             }
116 2 50       6 if ($bad_char eq '#') {
117 0 0       0 if ($verbose) {
118 0         0 print "Hash comments in object or array?\n";
119             }
120 0 0       0 if ($remaining =~ s/^(.*)\n//) {
121 0 0       0 if ($verbose) {
122 0         0 print "Deleting comment '$1'.\n";
123             }
124 0         0 rm_trailing_h (\$previous, $verbose);
125 0         0 $output = $previous . $remaining;
126 0         0 next;
127             }
128             }
129 2 100 66     11 if ($type eq 'initial state' && $previous !~ /^\s+$/) {
130 1 50       4 if ($verbose) {
131 0         0 print "Trailing garbage '$bad_char$remaining'?\n";
132             }
133 1         3 $output = $previous;
134 1         2 next;
135             }
136             }
137 7 50 66     24 if (($type eq 'object' || $type eq 'array') &&
      66        
138             $valid_bytes->[ord (',')]) {
139 0 0       0 if ($verbose) {
140 0         0 print "Missing comma in object or array?\n";
141             }
142             # Put any space at the end of $previous before the
143             # comma, for aesthetic reasons only.
144 0         0 my $join = ',';
145 0 0       0 if ($previous =~ s/(\s+)$//) {
146 0         0 $join .= $1;
147             }
148 0         0 $join .= $bad_char;
149 0         0 $output = $previous . $join . $remaining;
150 0         0 next;
151             }
152 7 0 33     30 if ($type eq 'object' && $valid_bytes->[ord ('"')]) {
153 0 0       0 if ($verbose) {
154 0         0 print "Unquoted key or value in object?\n";
155             }
156 0 0       0 if ($remaining =~ s/(^[^\}\]:,\n\r"]*)(\s*):/$1"$2:/) {
157 0 0       0 if ($verbose) {
158 0         0 print "Adding quotes to key '$bad_char$1'\n";
159             }
160 0         0 $output = $previous . '"' . $bad_char . $remaining;
161             # For strings missing quotes like , use <"a" :>
162             # rather than <"a ":>
163 0 0       0 if ($output =~ s/(\s+)"[:,]/"$1:/g) {
164 0 0       0 if ($verbose) {
165 0         0 print "Moved whitespace '$1' before \" to after \"";
166             }
167             }
168 0         0 next;
169             }
170 0 0       0 if ($previous =~ /:\s*$/) {
171 0         0 $remaining = $bad_char . $remaining;
172 0 0       0 if ($remaining =~ s/^(.*)\n/"$1"\n/) {
173 0 0       0 if ($verbose) {
174 0         0 print "Adding quotes to unquoted value '$1'.\n";
175 0         0 $output = $previous . $remaining;
176 0         0 next;
177             }
178             }
179             }
180             }
181 7 100       16 if ($type eq 'string') {
182 4 50       10 if ($bad_byte < 0x20) {
183 4         15 $bad_char = json_escape ($bad_char);
184 4 50       19 if ($verbose) {
185 0         0 printf "Changing bad byte %X into $bad_char.\n",
186             ord ($bad_byte);
187             }
188 4         10 $output = $previous . $bad_char . $remaining;
189 4         8 next;
190             }
191             }
192             # Add a zero to a fraction
193 3 100 66     15 if ($bad_char eq '.' && $remaining =~ /^[0-9]+/) {
194 1 50       3 if ($verbose) {
195 0         0 print "Missing zero before a fraction?\n";
196             }
197 1         4 $output = $previous . "0." . $remaining;
198 1         2 next;
199             }
200             # Delete a leading zero on a number.
201 2 50       5 if ($type eq 'number') {
202 2 100 66     23 if ($previous =~ /0$/ && $remaining =~ /^[0-9]+/) {
203 1 50       14 if ($verbose) {
204 0         0 print "Leading zero in number?\n";
205             }
206 1         6 $previous =~ s/0$//;
207 1         3 $remaining =~ s/^0+//;
208 1         3 $output = $previous . $bad_char . $remaining;
209             # print "$output\n";
210 1         3 next;
211             }
212 1 50 33     11 if ($bad_char =~ /[eE]/ && $previous =~ /\.$/) {
213 1 50       3 if ($verbose) {
214 0         0 print "Missing zero between . and e?\n";
215             }
216 1         5 $output = $previous . "0" . $bad_char . $remaining;
217 1         3 next;
218             }
219             }
220             # print "$output\n";
221 0         0 warn "Could not handle unexpected character '$bad_char' in $type\n";
222 0 0       0 if ($verbose) {
223 0         0 print_valid_bytes ($valid_bytes);
224             }
225             }
226             elsif ($error eq 'Unexpected end of input') {
227             # for my $k (keys %{$@}) {
228             # print "$k -> $@->{$k}\n";
229             # }
230             # print "Unexpected end of input.\n";
231 2 50       8 if ($type eq 'string') {
    100          
    50          
232 0         0 $output .= '"';
233 0 0       0 if ($verbose) {
234 0         0 print "String ended unexpectedly: adding a quote.\n";
235             }
236 0         0 next;
237             }
238             elsif ($type eq 'object') {
239 1         4 $output .= '}';
240 1 50       3 if ($verbose) {
241 0         0 print "Object ended unexpectedly: adding a }.\n";
242             }
243 1         2 next;
244             }
245             elsif ($type eq 'array') {
246 1         3 $output .= ']';
247 1 50       4 if ($verbose) {
248 0         0 print "Array ended unexpectedly: adding a ].\n";
249             }
250 1         2 next;
251             }
252             else {
253             # Cannot really get an unexpected end of a number
254             # since it has no end marker, nor of the initial
255             # state. That leaves the case of literals, which might
256             # come to an unexpected end like 'tru' or something.
257 0         0 warn "Unhandled unexpected end of input in $type";
258             }
259             }
260             elsif ($error eq 'Empty input') {
261 1         2 $output = '""';
262 1 50       4 if ($verbose) {
263 0         0 print "Changing empty input to empty string \"\".\n";
264             }
265 1         2 next;
266             }
267 0 0       0 if ($verbose) {
268 0         0 print "$output\n";
269             }
270 0         0 carp "Repair failed: unhandled error $error";
271 0         0 last;
272             }
273 9         61 return $output;
274             }
275              
276             sub print_valid_bytes
277             {
278 0     0 0 0 my ($valid_bytes) = @_;
279 0         0 for my $i (0..127) {
280 0         0 my $ok = $valid_bytes->[$i];
281 0 0       0 if ($ok) {
282 0         0 print "OK: '",chr ($i),"'\n";
283             }
284             }
285             }
286              
287             # Filched from JSON::Create::PP
288              
289             sub json_escape
290             {
291 4     4 0 9 my ($input) = @_;
292 4         15 $input =~ s/("|\\)/\\$1/g;
293 4         9 $input =~ s/\x08/\\b/g;
294 4         7 $input =~ s/\f/\\f/g;
295 4         9 $input =~ s/\n/\\n/g;
296 4         6 $input =~ s/\r/\\r/g;
297 4         11 $input =~ s/\t/\\t/g;
298 4         10 $input =~ s/([\x00-\x1f])/sprintf ("\\u%04x", ord ($1))/ge;
  1         7  
299 4         11 return $input;
300             }
301              
302             sub rm_trailing_h
303             {
304 0     0 0   my ($previous_ref, $verbose) = @_;
305 0           my $previous = $$previous_ref;
306 0 0         if ($previous =~ s/\h+$//) {
307 0 0         if ($verbose) {
308 0           print "Also removed whitespace before comment.\n";
309             }
310             }
311 0           $$previous_ref = $previous;
312             }
313              
314             1;