File Coverage

blib/lib/Data/Binary.pm

Criterion	Covered	Total	%
statement	30	30	100.0
branch	7	8	87.5
condition			n/a
subroutine	7	7	100.0
pod	2	2	100.0
total	46	47	97.8

line	stmt	bran	sub	pod	time	code
1						package Data::Binary;
2
3	2		2		70829	use strict;
	2				5
	2				71
4	2		2		9	use warnings;
	2				3
	2				298
5
6						our $VERSION = 0.01;
7
8	2		2		10	use base qw(Exporter);
	2				7
	2				386
9
10	2		2		2429	use Encode qw(decode_utf8);
	2				38539
	2				580
11
12						our @EXPORT_OK = qw(is_text is_binary);
13
14						sub is_text {
15	8		8	1	31	my ($string) = @_;
16
17	8	50			22	if (length($string) > 512) {
18	8				14	$string = substr($string, 0, 512);
19						}
20
21	8	100			37	return '' if (index($string, "\c@") != -1);
22	6				9	my $length = length($string);
23	6				22	my $odd = ($string =~ tr/\x01\x02\x03\x04\x05\x06\x07\x09\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f//d);
24
25						# Detecting >=128 and non-UTF-8 is interesting. Note that all UTF-8 >=128 has several bytes with
26						# >=128 set, so a quick test is possible by simply checking if any are >=128. However, the count
27						# from that is typically wrong, if this is binary data, it'll not have been decoded. So we do this
28						# in two steps.
29
30	6				10	my $copy = $string;
31	6	100			49	if (($copy =~ tr[\x80-\xff][]d) > 0) {
32	2				13	my $modified = decode_utf8($string, Encode::FB_DEFAULT);
33	2		2		2117	my $substitions = ($modified =~ tr/\x{fffd}//d);
	2				22
	2				29
	2				218
34	2				3	$odd += $substitions;
35						}
36
37	6	100			32	return '' if ($odd * 3 > $length);
38
39	2				13	return 1;
40						}
41
42						sub is_binary {
43	4		4	1	6	my ($string) = @_;
44	4				10	return ! is_text($string);
45						}
46
47						1;
48
49						=head1 NAME
50
51						Data::Binary - Simple detection of binary versus text in strings
52
53						=head1 SYNOPSIS
54
55						use Data::Binary qw(is_text is_binary);
56						my $text = File::Slurp::read_file("test1.doc");
57						my $is_text = is_text($text); # equivalent to -T "test1.doc"
58						my $is_binary = is_binary($text); # equivalent to -B "test1.doc"
59
60						=head1 DESCRIPTION
61
62						This simple module provides string equivalents to the -T / -B operators. Since
63						these only work on file names and file handles, this module provides the same
64						functions but on strings.
65
66						Note that the actual implementation is currently different, basically because
67						the -T / -B functions are in C/XS, and this module is written in pure Perl.
68						For now, anyway.
69
70						=head1 FUNCTIONS
71
72						=head2 is_text($string)
73
74						Uses the same kind of heuristics in -T, but applies them to a string. Returns true
75						if the string is basically text.
76
77						=head2 is_binary($string)
78
79						Uses the same kind of heuristics in -B, but applies them to a string. Returns true
80						if the string is basically binary.
81
82						=head1 AUTHOR
83
84						Stuart Watt, stuart@morungos.com
85
86						=head1 COPYRIGHT
87
88						Copyright (c) 2014 Stuart Watt. All rights reserved.
89
90						=cut