File Coverage

lib/OCR/PerfectCR.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             package OCR::PerfectCR;
2              
3             # ABOVE the 'use strict' line!
4             $VERSION = 0.03;
5              
6 1     1   808 use warnings;
  1         2  
  1         43  
7 1     1   6 use strict;
  1         2  
  1         38  
8 1     1   969 use IO::File;
  1         14078  
  1         162  
9 1     1   452 use GD;
  0            
  0            
10             use Digest::MD5 'md5_hex';
11             use Graphics::ColorObject;
12             use Carp 'croak';
13              
14             =head1 NAME
15              
16             OCR::PerfectCR - Perfect OCR (if you have perfect input).
17              
18             =head1 SYNOPSIS
19              
20             use OCR::PerfectCR;
21             use GD;
22            
23             my $recognizer = OCR::PerfectCR->new;
24             $recognizer->load_charmap_file("charmap");
25             my $image = GD::Image->new("example.png") or die "Can't open example.png: $!";
26             my $string = $recognizer->recognize($image);
27             $recognizer->save_charmap_file("charmap");
28              
29              
30             =head1 DESCRIPTION
31              
32             OCR::PerfectCR is a fast, highly accurate "optical" character recognition
33             engine requiring minimal training. How does it manage this, despite
34             being written in pure perl? By ignoring most of the problems.
35             OCR::PerfectCR requires that your input is in perfect shape -- that it
36             hasn't gone into the real world and been scanned, that each image
37             represent one line of text, and nothing else, and most difficultly,
38             that the font have a fairly wide spacing. This makes it very useful
39             for converting image-based subtitle formats to text, and probably not
40             much else. However, it is very good at doing that.
41              
42             OCR::PerfectCR's knowledge about a particular font is encapsulated in a
43             "charmap" file, which maps md5 sums of the canonical representation
44             of a character (the first 32 characters of the line) to a string (the
45             34th and onwards chars, to newline).
46              
47             Most methods will die on error, rather then trying to recover and return undef.
48              
49             =cut
50              
51             =head2 $recognizer->load_charmap_file("charmap")
52              
53             Loads a charmap file into memory.
54              
55             =cut
56              
57             sub load_charmap_file {
58             my $self = shift;
59             my $filename = shift;
60            
61             # print "load_charmap_file($self, $filename);\n";
62            
63             my $charmapfile = IO::File->new("<".$filename) or
64             croak "Couldn't open $filename: $!";
65             binmode($charmapfile, ':utf8');
66             local $_;
67             while (<$charmapfile>) {
68             chomp;
69             next if !$_ or $_ =~ m/^#/;
70             my ($md5, $value);
71             $md5 = substr($_, 0, 32, '');
72             substr($_, 0, 1, '');
73             $value = $_;
74             $self->{charmap}{$md5}=$value;
75             }
76            
77             return;
78             }
79              
80             =head2 $recognizer->save_charmap_file("charmap")
81              
82             Saves the charmap to a file. Charmap files are always saved and
83             loaded in utf8.
84              
85             =cut
86              
87             sub save_charmap_file {
88             my ($recognizer, $filename) = @_;
89             # print "save_charmap_file($recognizer, $filename);\n";
90              
91             my $charmapfile = IO::File->new(">$filename") or
92             croak "Couldn't open $filename: $!";
93             my %images = %{$recognizer->{charmap}};
94             binmode($charmapfile, ':utf8');
95             {
96             no warnings 'uninitialized';
97             for (sort {$images{$a} cmp $images{$b} or
98             $a cmp $b}
99             keys %images) {
100             my $v = $images{$_};
101             $charmapfile->print("$_ $v\n");
102             }
103             }
104             }
105              
106             =head2 $recognizer->recognize($image) (recognise is an alias for this)
107              
108             Takes the image (a GD::Image object), and tries to convert it into
109             text. In list context, returns a list of hashrefs, each having a
110             C key, whose value is the string in the charmap for that image.
111             There may also be a C (note the spelling) key, with a value
112             between 0 and 360, representing the color of the text in degrees on
113             the color wheel, or C meaning grey. The C being missing
114             implies that there is nothing there but background -- that is, that
115             it's whitespace. For non-whitespace characters, there is a key
116             C, which gives the md5 sum of the character in canonical form --
117             that is, it's charmap entry. Other keys are purposefully not
118             documented -- if you find them useful, I let me know by filing
119             an RT request.
120              
121             Characters not in the charmap will have their str set to C<"\x{FFFD}"
122             eq "\N{REPLACEMENT CHARACTER}">, and will be added to the charmap.
123             They will also be saved as png files named I.png in the current
124             directory, so that they a human can look at them and ID them.
125              
126              
127             =cut
128              
129             sub recognize {
130             chopup(@_, \&charimage);
131             }
132             # To avoid an "only used once" warning.
133             *recognise = *recognize;
134             *recognise = *recognize;
135              
136             =head2 OCR::PerfectCR->new();
137              
138             Just a boring constructor. No parameters.
139              
140             =cut
141              
142             sub new {
143             return bless {}, shift;
144             }
145              
146             =head1 BUGS
147              
148             Please report bugs on L. If the bug /might possibly/ be because of your input file, please include it with the bug report.
149              
150             =head1 AUTHOR & LICENSE
151              
152             Copyright 2005 James Mastros, james@mastros.biz, JMASTROS, theorbtwo. (Those are all the same person.)
153              
154             May be used and copied under the same terms as C itself.
155              
156             Thanks, castaway, for being you, and diotalevi for a detailed review.
157              
158             =cut
159              
160             ### Internal functions below here.
161             sub charimage {
162             my ($recognizer, $image, @bgrgb) = @_;
163            
164             # print "charimage($recognizer, $image)\n";
165             ($image, my $this) = image_to_grey($image, @bgrgb);
166            
167             # printf "Got char image, size %d by %d\n", $image->getBounds;
168             my $md5 = imagesum($image);
169             $this->{md5} = $md5;
170             if (!exists $recognizer->{charmap}{$md5}) {
171             $recognizer->{charmap}{$md5} = "\x{FFFD}";
172             # print "md5: $md5\n";
173             # print "First time!\n";
174            
175             my $file = IO::File->new(">$md5.png") or die "Couldn't create $md5.png: $!";
176             binmode($file);
177             $file->print($image->png);
178             }
179            
180             #print "Known character: $images{$md5}\n";
181             #print $images{$md5};
182             $this->{str} = $recognizer->{charmap}{$md5};
183            
184             return $this;
185             }
186              
187             my %rgb255_to_hsv;
188             sub RGB255_to_HSV {
189             my ($r, $g, $b) = @_;
190             my $rgb = $r * 0x10000 + $g*0x100 + $b;
191             if (!exists $rgb255_to_hsv{$rgb}) {
192             $rgb255_to_hsv{$rgb} = Graphics::ColorObject->new_RGB255(\@_, space=>'PAL')->as_HSV;
193             }
194             return @{$rgb255_to_hsv{$rgb}};
195             }
196              
197             my %hsv_to_rgb255;
198             sub HSV_to_RGB255 {
199             my ($h, $s, $v) = @_;
200             my $hsv = "$h,$s,$v";
201             if (!exists $hsv_to_rgb255{$hsv}) {
202             $hsv_to_rgb255{$hsv} = Graphics::ColorObject->new_HSV(\@_, space=>'PAL')->as_RGB255;
203             }
204             return @{$hsv_to_rgb255{$hsv}};
205             }
206              
207             sub image_to_grey {
208             my ($colorimage, @bgrgb) = @_;
209             my $totalweight = 0;
210             my $totalcolor = 0;
211             my $maxval = 0;
212              
213             my ($width, $height) = $colorimage->getBounds;
214             my $bwimage = GD::Image->new($width, $height);
215             my $black = $bwimage->colorResolve(0, 0, 0);
216             my $white = $bwimage->colorResolve(255, 255, 255);
217              
218             # Squash to greyscale; figure out what the whitest pixel value is.
219             foreach my $x (0..$width) {
220             foreach my $y (0..$height) {
221             my ($r, $g, $b) = $colorimage->rgb($colorimage->getPixel($x, $y));
222             $r = abs($r - $bgrgb[0]);
223             $g = abs($g - $bgrgb[1]);
224             $b = abs($b - $bgrgb[1]);
225             my ($h, $s, $v) = RGB255_to_HSV($r, $g, $b);
226             $totalweight += $s;
227             $totalcolor += $h * $s;
228             $maxval = $v if $maxval < $v;
229             }
230             }
231              
232             # Adjust to put whitest value at 100%; squash to plain black and white.
233             foreach my $x (0..$width) {
234             foreach my $y (0..$height) {
235             my ($r, $g, $b) = $colorimage->rgb($colorimage->getPixel($x, $y));
236             $r = abs($r - $bgrgb[0]);
237             $g = abs($g - $bgrgb[1]);
238             $b = abs($b - $bgrgb[1]);
239             my ($h, $s, $v) = RGB255_to_HSV($r, $g, $b);
240             if ($v/$maxval > .5) {
241             $bwimage->setPixel($x, $y, $white);
242             } else {
243             $bwimage->setPixel($x, $y, $black);
244             }
245             }
246             }
247              
248             # print "Total color weight: ", $totalweight, "\n";
249             # print "Average color: ", $totalcolor/$totalweight, "\n";
250             my $avgcolor = sprintf("%.0f", $totalcolor/$totalweight);
251             $avgcolor = undef if $totalweight < 1;
252              
253             return $bwimage, {color => $avgcolor, bgrgb=>\@bgrgb};
254             }
255              
256             sub chopup {
257             my ($recognizer, $inimage, $imagefunc) = @_;
258             # print "chopup($recognizer, $inimage, $imagefunc);\n";
259             my @string;
260            
261             my $bgcolor = $inimage->getPixel(0,0);
262             my (@bgrgb) = $inimage->rgb($bgcolor);
263             print "Background color at index $bgcolor [@bgrgb]\n";
264             my ($width, $height) = $inimage->getBounds;
265            
266             my $mincol=0;
267             while ($mincol <= $width) {
268             my ($startcol, $endcol);
269             print "Finding bounds starting at $mincol\n";
270              
271             # Find left and right char boundry.
272             for my $col ($mincol .. $width-1) {
273             # print "Column $col: ";
274             my $hasnonbg=0;
275             for my $row (0 .. $height-1) {
276             if ($inimage->getPixel($col, $row) != $bgcolor) {
277             $hasnonbg=1;
278             last;
279             }
280             }
281             # print "$hasnonbg\n";
282            
283             if (not defined $startcol) {
284             if ($hasnonbg) {
285             $startcol = $col;
286             }
287             } else {
288             if (!$hasnonbg) {
289             $endcol = $col;
290             last;
291             }
292             }
293             }
294            
295             if (not defined $endcol) {
296             $endcol = $width-1;
297             }
298              
299             if (not defined $startcol or
300             $startcol >= $endcol) {
301             # print "Couldn't find anything\n";
302             last;
303             }
304            
305            
306             my ($startrow, $endrow);
307              
308             # Find top boundry
309             for my $row (0..$height-1) {
310             my $hasnonbg=0;
311             for my $col ($startcol..$endcol) {
312             if ($inimage->getPixel($col, $row) != $bgcolor) {
313             $hasnonbg=1;
314             last;
315             }
316             }
317             if ($hasnonbg) {
318             $startrow = $row;
319             last;
320             }
321             }
322            
323             # Find bottom boundry.
324             for my $row (reverse(0..$height-1)) {
325             my $hasnonbg=0;
326             for my $col ($startcol..$endcol) {
327             if ($inimage->getPixel($col, $row) != $bgcolor) {
328             $hasnonbg=1;
329             last;
330             }
331             }
332             if ($hasnonbg) {
333             $endrow = $row;
334             last;
335             }
336             }
337            
338             print "Character at ($startcol, $startrow)-($endcol, $endrow)\n";
339             my $charimage = gdextract($inimage, $startcol, $startrow, $endcol, $endrow);
340             my $this = $imagefunc->($recognizer, $charimage, @bgrgb);
341             $this->{prespace} = $startcol - $mincol;
342             $this->{startcol} = $startcol;
343             # $this->{mincol} = $mincol;
344             $this->{endcol} = $endcol;
345             $this->{width} = $endcol - $startcol;
346             $this->{chrwidth} = ($endcol - $startcol)/length($this->{str});
347             push @string, $this;
348            
349             $mincol = $endcol;
350             }
351            
352             # print "\n";
353            
354             # for (1..$#string-1) {
355             # my $prev = $string[$_-1];
356             # my $this = $string[$_];
357            
358             # print "Chars: $prev->{str} -- $this->{str}\n";
359             # print "Charwidths: $prev->{chrwidth} -- $this->{chrwidth}\n";
360             # print "Prespace: $this->{prespace}\n";
361             # print ("Metric: ", (($prev->{chrwidth}+$this->{chrwidth})/2)/$this->{prespace}, "\n");
362            
363             # }
364              
365             # Insert spaces.
366             @string = map {
367             # The "6" here is mostly just a guess.
368             # The ne '.' is just to fix up a common situation in the purticular
369             # source I checked against the most.
370             if ($_->{prespace} > $height/6
371             and $_->{str} ne '.') {
372             ({str=>" ", fake=>1}, $_);
373             } else {
374             $_;
375             }
376             } @string;
377            
378             # print "Finished: ", join('', map { $_->{str} } @string), "\n";
379              
380             if (wantarray) {
381             return @string;
382             } else {
383             return join "", map { $_->{str} } @string;
384             }
385             }
386              
387             # Just a silly helper
388             sub gdextract {
389             my ($inimage, $x1, $y1, $x2, $y2) = @_;
390             my $width = $x2-$x1 + 1;
391             my $height = $y2-$y1 + 1;
392              
393             my $outimage = GD::Image->new($width, $height);
394             $outimage->copy($inimage, 0, 0, $x1, $y1, $width, $height);
395              
396             return $outimage;
397             }
398              
399             # It appears that GD's ->png method doesn't always return exactly the
400             # same string for the same image -- it depends on the version of GD,
401             # or of libpng, or of libz, or... something. I want charmap files to
402             # be portable, so I need a portable method, so we define our own. It
403             # doesn't have to be small, just portable.
404             #
405             # Note to self: Everything should be packed N -- big-endian (network) u32.
406             sub imagesum {
407             my ($img) = @_;
408             my $str;
409             my ($w, $h) = $img->getBounds;
410              
411             $str = pack('NN', $w, $h);
412             for my $x (0..$w) {
413             for my $y (0..$h) {
414             $str .= pack('NNN', $img->rgb($img->getPixel($x, $y)));
415             }
416             }
417              
418             return md5_hex($str);
419             }
420              
421             1;