File Coverage

blib/lib/OCR/OcrSpace.pm

Criterion	Covered	Total	%
statement	56	68	82.3
branch	20	32	62.5
condition	4	11	36.3
subroutine	11	11	100.0
pod	1	1	100.0
total	92	123	74.8

line	stmt	bran	cond	sub	pod	time	code
1							package OCR::OcrSpace;
2
3	1			1		69861	use 5.006;
	1					4
4	1			1		5	use strict;
	1					3
	1					21
5	1			1		4	use warnings;
	1					2
	1					41
6
7	1			1		752	use LWP::UserAgent;
	1					50834
	1					36
8	1			1		56	use Carp qw( carp confess croak );
	1					4
	1					64
9
10	1			1		6	use vars qw($VERSION @EXPORT @ISA $BASE_URL);
	1					2
	1					776
11
12							@ISA = qw(Exporter);
13
14							@EXPORT = qw( get_result $BASE_URL);
15							############################################################
16							# DEFAULT base url
17							############################################################
18							$BASE_URL = 'http://api.ocr.space/parse/image';
19
20							=head1 NAME
21
22							Apr-2020 @
23
24							OCR::OcrSpace - Perl Interface to access L
25
26							The free OCR API provides a simple way of parsing images and multi-page PDF documents (PDF OCR) and getting the extracted text results returned in a JSON format.
27
28							This module implements only the POST request.
29
30							Extracts text from images and PDFs via ocr.space.
31
32							=head1 VERSION
33
34							Version 0.01
35
36							=cut
37
38							our $VERSION = '0.01';
39
40							=head1 SYNOPSIS
41
42							#using object oriented interface
43							use OCR::OcrSpace;
44
45							my $ocrspace_obj = OCR::OcrSpace->new();
46
47							my $param = {
48							file => '/tmp/image.png', #full image path
49
50							or
51
52							url => 'http://imagedatabase.com/test.jpg' #image url to fetch from
53
54							or
55
56							base64Image => 'data:image/png;base64,iVBORw0KGgoAx7/7LNuCQS0posnocgEAFpySUVORK5CYII='
57
58							#the following parameters are optional unless marked mandatory
59							ocr_space_url => "https://api.ocr.space/parse/image",
60							apikey => 'XXXXXXXXXXXXXXXXXX', #API Key (mandatory)
61							isOverlayRequired =>'True', #optional
62							language =>'eng' , #optional
63							scale => 'True', #optional
64							isTable => 'True', #optional
65							OCREngine => 2, #optional
66							filetype => 'PNG', #optional
67							detectOrientation => 'False', #optional
68							isCreateSearchablePdf => 'True', #optional
69							isSearchablePdfHideTextLayer => 'True', #optional
70
71							};
72
73							print $ocrspace_obj->get_result( $param );
74
75
76							#using non-object oriented interface
77
78							use OCR::OcrSpace;
79							print get_result( $param );
80
81
82
83							#since ocr.space accepts both HTTP and HTTPS, you can set the following variable before the call to choose the default endpoint
84							$BASE_URL
85
86							=head1 EXPORT
87
88							#method
89							get_result
90
91							#variable
92							$BASE_URL
93
94							=head1 SUBROUTINES/METHODS
95
96							=head2 new
97
98							Constructor: creates an OCR::OcrSpace object for use with the object oriented interface.
99
100							=cut
101
102							sub new {
103	1			1	1	606	my ( $class, $params ) = ( @_ );
104
105	1					4	return ( bless( {}, $class ) );
106							}
107
108							=head2 get_result
109
110							params hash ref of following valid keys
111
112							=over 13
113
114							=item * B { optional but required when using object oriented interface }
115
116							=item * B { optional url if you want to use https mention url }
117
118							C scalar string ([Optional] Default L)
119
120							=item * B { scalar string }
121
122							C API Key (send in the header)
123
124							get your key from here L
125
126							=item * B
127
128							C
129							You can use three methods to upload the input image or PDF. We recommend the URL method for file sizes > 10 MB for faster upload speeds.
130
131							url: URL of remote image file (Make sure it has the right content type)
132
133							file: Multipart encoded image file with filename
134
135							base64Image: Image as Base64 encoded string
136
137
138							=item * B
139
140							C
141
142							[Optional]
143							Arabic=ara
144							Bulgarian=bul
145							Chinese(Simplified)=chs
146							Chinese(Traditional)=cht
147							Croatian = hrv
148							Czech = cze
149							Danish = dan
150							Dutch = dut
151							English = eng
152							Finnish = fin
153							French = fre
154							German = ger
155							Greek = gre
156							Hungarian = hun
157							Korean = kor
158							Italian = ita
159							Japanese = jpn
160							Polish = pol
161							Portuguese = por
162							Russian = rus
163							Slovenian = slv
164							Spanish = spa
165							Swedish = swe
166							Turkish = tur
167
168							Language used for OCR. If no language is specified, English eng is taken as default.
169
170							IMPORTANT: The language code always has 3 letters (not 2). So it is "eng" and not "en".
171
172							=item * B
173
174							C scalar string ([Optional] Boolean value)
175
176
177							Default = False
178							If true, returns the coordinates of the bounding boxes for each word. If false, the OCR'ed text is returned only as a text block (this makes the JSON response smaller). Overlay data can be used, for example, to show text over the image.
179
180
181							=item * B
182
183							C scalar string ([Optional] String value: PDF, GIF, PNG, JPG, TIF, BMP)
184
185							Overwrites the automatic file type detection based on content-type. Supported image file formats are png, jpg (jpeg), gif, tif (tiff) and bmp. For document ocr, the api supports the Adobe PDF format. Multi-page TIFF files are supported.
186
187
188							=item * B
189
190							C scalar string ([Optional] true/false)
191
192
193							If set to true, the API auto-rotates the image correctly and sets the TextOrientation parameter in the JSON response. If the image is not rotated, then TextOrientation=0, otherwise it is the degree of the rotation, e.g. "270".
194
195
196							=item * B
197
198							C scalar string ([Optional] Boolean value)
199
200							Default = False
201							If true, API generates a searchable PDF. This parameter automatically sets isOverlayRequired = true
202
203
204							=item * B
205
206							C scalar string ([Optional] Boolean value)
207
208							Default = False. If true, the text layer is hidden (not visible)
209
210
211							=item * B
212
213							C scalar string ([Optional] true/false)
214
215
216							If set to true, the api does some internal upscaling. This can improve the OCR result significantly, especially for low-resolution PDF scans. Note that the front page demo uses scale=true, but the API uses scale=false by default. See also this OCR forum post.
217
218
219							=item * B
220
221							C scalar string ([Optional] true/false)
222
223							If set to true, the OCR logic makes sure that the parsed text result is always returned line by line. This switch is recommended for table OCR, receipt OCR, invoice processing and all other type of input documents that have a table like structure.
224
225							=item * B
226
227							C scalar int ([Optional] 1 or 2)
228
229							The default is engine 1. OCR Engine 2 is a new image-processing method.
230
231
232							=back
233
234							=head2 Notes from L
235
236							Tip: When serving images from an Amazon AWS S3 bucket or a similar service for use with the "URL" parameter, make sure it has the right content type. It should not be "Content-Type:application/x-www-form-urlencoded" (which seems to be the default) but image/png or similar. Alternatively you can include the filetype parameter and tell the API directly what type of document you are sending (PNG, JPG, GIF, PDF).
237
238
239							New: If you need to detect the status of checkboxes, please contact us about the Optical Mark Recognition (OMR) (Beta) features.
240
241
242							Select the best OCR Engine
243
244							New: We implemented a second OCR engine with a different processing logic. It is better than the default engine (engine1) in certain cases. So we recommend that you try engine1 first (since it is faster), but if the OCR results are not perfect, please try the same document with engine2. You can use the new OCR engine with our free online OCR service on the front page, and with the API.
245
246							Features of OCR Engine 1:
247
248							- Supports more languages (including Asian languages like Chinese, Japanese and Korean)
249
250							- Faster
251
252							- Supports larger images
253
254							- PDF OCR and Searchable PDF creation support
255
256							- Multi-Page TIFF scan support
257
258							- Parameter: OCREngine=1
259
260							Features of OCR Engine 2:
261
262							- Western Latin Character languages only (English, German, French,...)
263
264							- Language auto-detect (so it does not really matter what OCR language you select, as long as it uses Latin characters)
265
266							- Usually better at single number OCR and alphanumeric OCR (e. g. SUDOKO, Dot Matrix OCR, MRZ OCR,... )
267
268							- Usually better at special characters OCR like @+-...
269
270							- Image size limit 5000px width and 5000px height
271
272							- Parameter: OCREngine=2
273
274							- No PDF OCR and Offline OCR yet. If you need this, please contact us for an internal beta.
275
276							The returned OCR result JSON response is identical for both engines! So you can easily switch between both engines as needed. If you have any question about using Engine 1 or 2, please ask in our OCR API Forum.
277
278
279							=cut
280
281							sub get_result {
282
283							#can be simply done by discarding the $self
284							# but keeping it like this to allow future maintaince if any
285	1			1		321	my ( $params, $raw_request, $result );
286	1	50				5	if ( scalar @_ > 1 ) {
287	0					0	my $self;
288	0					0	( $self, $params ) = ( @_ );
289
290							#validate the parameters and get
291	0					0	$params = $self->_validate( $params );
292
293							#Generate the request
294	0					0	$raw_request = $self->_generate_request( $params );
295
296							#send the request via gateway
297	0					0	$result = $self->_process_request( $raw_request );
298
299							} else {
300	1					2	$params = shift;
301
302	1					2	$params = _validate( $params );
303
304							#Generate the request
305	1					18	$raw_request = _generate_request( $params );
306
307							#send the request via gateway
308	1					8	$result = _process_request( $raw_request );
309							}
310
311							#retun
312	1		50			539	return $result // undef;
313
314							}
315
316							=head2 Sample Output success
317
318							{"ParsedResults":[{"TextOverlay":{"Lines":[{"LineText":"Current","Words":[{"WordText":"Current","Left":11.666666030883789,"Top":59.166664123535156,"Height":14.999999046325684,"Width":54.999996185302734}],"MaxHeight":14.999999046325684,"MinTop":59.166664123535156},{"LineText":"59","Words":[{"WordText":"59","Left":32.5,"Top":239.99998474121094,"Height":20.833332061767578,"Width":29.166666030883789}],"MaxHeight":20.833332061767578,"MinTop":239.99998474121094}],"HasOverlay":true,"Message":"Total lines: 2"},"TextOrientation":"0","FileParseExitCode":1,"ParsedText":"Current\t\r\n59\t\r\n","ErrorMessage":"","ErrorDetails":""}],"OCRExitCode":1,"IsErroredOnProcessing":false,"ProcessingTimeInMilliseconds":"437","SearchablePDFURL":""}
319
320							=head2 Sample Output error
321
322							{"OCRExitCode":99,"IsErroredOnProcessing":true,"ErrorMessage":["Parameter name 'attributes' is invalid. Valid parameters: apikey,url,language,isoverlayrequired,base64image,iscreatesearchablepdf,issearchablepdfhidetextlayer,filetype,addressparsing,scale,detectorientation,istable,ocrengine,detectcheckbox,checkboxtemplate,checkboxtemplateregex","Please check if you need to URL encode the URL passed in request parameters."],"ProcessingTimeInMilliseconds":"0"}
323
324							=cut
325
326							####################
327							# internal function
328							###################
329							sub _generate_request {
330	2	100		2		8	my $params = ( scalar( @_ ) > 1 ) ? $_[1] : shift;
331
332							my $request_hash = {
333							url => $params->{endpoint},
334							body_param => $params->{body_param},
335	2					7	};
336
337	2	50				7	$request_hash->{file_path} = $params->{file} if ( defined $params->{file} );
338
339	2					4	return $request_hash;
340							}
341
342							####################
343							# internal function
344							###################
345							sub _validate {
346	2	100		2		426	my $params = ( scalar( @_ ) > 1 ) ? $_[1] : shift;
347	2	50				8	carp "Required parameter `apikey` not passed" unless ( defined $params->{apikey} );
348							carp "Required parameter `url or file or base64Image` not passed"
349	2	50	33			14	unless ( defined( $params->{url} \|\| $params->{file} \|\| $params->{base64Image} ) );
350
351	2		33			8	my $valid_params = { endpoint => $params->{ocr_space_url} // $BASE_URL, };
352	2	100				6	$valid_params->{url} = $params->{url} if ( defined $params->{url} );
353	2	100				6	$valid_params->{base64Image} = $params->{base64Image} if ( defined $params->{base64Image} );
354	2	50				6	if ( defined $params->{file} ) {
355	0	0				0	if ( -f $params->{file} ) {
356	0					0	$valid_params->{file} = $params->{file};
357							} else {
358	0					0	carp "Unable to open file $params->{file} \n";
359							}
360							}
361
362							#add optional keys
363	2					6	foreach (
364							qw/
365							language isOverlayRequired filetype
366							detectOrientation isCreateSearchablePdf url
367							isSearchablePdfHideTextLayer scale base64Image
368							isTable OCREngine apikey/
369							)
370							{
371	24	100				61	$valid_params->{body_param}->{$_} = $params->{$_} if ( defined $params->{$_} );
372							}
373	2					6	return $valid_params;
374							}
375
376							####################
377							# internal function
378							###################
379							sub _process_request {
380	1	50		1		5	my $params = ( scalar( @_ ) > 1 ) ? $_[1] : shift;
381
382	1					2	my $file = $params->{file_path};
383	1					2	my $endpoint = $params->{url};
384
385	1					2	my ( $res, $body, $header, $content );
386
387	1	50	33			17	if ( defined $params->{body_param} && uc( ref( $params->{body_param} ) ) eq 'HASH' ) {
388	1					2	foreach ( keys %{ $params->{body_param} } ) {
	1					7
389	11					21	push( @$content, ( $_ => $params->{body_param}->{$_} ) );
390							}
391							}
392
393	1	50				4	if ( $file ) {
394	0					0	push( @$content, ( file => [$file] ) );
395							}
396
397	1					6	my $ua = LWP::UserAgent->new();
398
399	1					2963	$ua->env_proxy;
400
401	1	50				15654	if ( defined $params->{header} ) {
402	0					0	$header = $params->{header};
403	0					0	$ua->default_header( %$header );
404							}
405
406	1					5	foreach ( 1 .. 3 ) {
407	1					6	$res = $ua->post(
408							$endpoint,
409							Content_Type => 'multipart/form-data',
410							Content => $content,
411							);
412
413	1	50				593165	if ( $res->is_success ) {
414	0					0	return $res->content;
415							} else {
416	1					21	return $res->status_line;
417							}
418							}
419							}
420
421							=head1 AUTHOR
422
423							sushrut pajai, C<< >>
424
425							=head1 BUGS
426
427							Please report any bugs or feature requests to C, or through
428							the web interface at L. I will be notified, and then you'll
429							automatically be notified of progress on your bug as I make changes.
430
431
432							=head1 SUPPORT
433
434							You can find documentation for this module with the perldoc command.
435
436							perldoc OCR::OcrSpace
437
438
439							You can also look for information at:
440
441							=over 4
442
443							=item * RT: CPAN's request tracker (report bugs here)
444
445							L
446
447							=item * AnnoCPAN: Annotated CPAN documentation
448
449							L
450
451							=item * CPAN Ratings
452
453							L
454
455							=item * Search CPAN
456
457							L
458
459							=back
460
461
462							=head1 ACKNOWLEDGEMENTS
463
464
465							=head1 LICENSE AND COPYRIGHT
466
467							This software is copyright (c) 2020 by sushrut pajai.
468
469							This is free software; you can redistribute it and/or modify it under
470							the same terms as the Perl 5 programming language system itself.
471
472
473							=cut
474
475							1; # End of OCR::OcrSpace
476
477							__END__