File Coverage

blib/lib/Data/Binary.pm
Criterion Covered Total %
statement 30 30 100.0
branch 7 8 87.5
condition n/a
subroutine 7 7 100.0
pod 2 2 100.0
total 46 47 97.8


line stmt bran cond sub pod time code
1             package Data::Binary;
2              
3 2     2   70829 use strict;
  2         5  
  2         71  
4 2     2   9 use warnings;
  2         3  
  2         298  
5              
6             our $VERSION = 0.01;
7              
8 2     2   10 use base qw(Exporter);
  2         7  
  2         386  
9              
10 2     2   2429 use Encode qw(decode_utf8);
  2         38539  
  2         580  
11              
12             our @EXPORT_OK = qw(is_text is_binary);
13              
14             sub is_text {
15 8     8 1 31 my ($string) = @_;
16              
17 8 50       22 if (length($string) > 512) {
18 8         14 $string = substr($string, 0, 512);
19             }
20              
21 8 100       37 return '' if (index($string, "\c@") != -1);
22 6         9 my $length = length($string);
23 6         22 my $odd = ($string =~ tr/\x01\x02\x03\x04\x05\x06\x07\x09\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f//d);
24              
25             # Detecting >=128 and non-UTF-8 is interesting. Note that all UTF-8 >=128 has several bytes with
26             # >=128 set, so a quick test is possible by simply checking if any are >=128. However, the count
27             # from that is typically wrong, if this is binary data, it'll not have been decoded. So we do this
28             # in two steps.
29              
30 6         10 my $copy = $string;
31 6 100       49 if (($copy =~ tr[\x80-\xff][]d) > 0) {
32 2         13 my $modified = decode_utf8($string, Encode::FB_DEFAULT);
33 2     2   2117 my $substitions = ($modified =~ tr/\x{fffd}//d);
  2         22  
  2         29  
  2         218  
34 2         3 $odd += $substitions;
35             }
36              
37 6 100       32 return '' if ($odd * 3 > $length);
38              
39 2         13 return 1;
40             }
41              
42             sub is_binary {
43 4     4 1 6 my ($string) = @_;
44 4         10 return ! is_text($string);
45             }
46              
47             1;
48              
49             =head1 NAME
50              
51             Data::Binary - Simple detection of binary versus text in strings
52              
53             =head1 SYNOPSIS
54              
55             use Data::Binary qw(is_text is_binary);
56             my $text = File::Slurp::read_file("test1.doc");
57             my $is_text = is_text($text); # equivalent to -T "test1.doc"
58             my $is_binary = is_binary($text); # equivalent to -B "test1.doc"
59              
60             =head1 DESCRIPTION
61              
62             This simple module provides string equivalents to the -T / -B operators. Since
63             these only work on file names and file handles, this module provides the same
64             functions but on strings.
65              
66             Note that the actual implementation is currently different, basically because
67             the -T / -B functions are in C/XS, and this module is written in pure Perl.
68             For now, anyway.
69              
70             =head1 FUNCTIONS
71              
72             =head2 is_text($string)
73              
74             Uses the same kind of heuristics in -T, but applies them to a string. Returns true
75             if the string is basically text.
76              
77             =head2 is_binary($string)
78              
79             Uses the same kind of heuristics in -B, but applies them to a string. Returns true
80             if the string is basically binary.
81              
82             =head1 AUTHOR
83              
84             Stuart Watt, stuart@morungos.com
85              
86             =head1 COPYRIGHT
87              
88             Copyright (c) 2014 Stuart Watt. All rights reserved.
89              
90             =cut