File Coverage

lib/File/FormatIdentification/RandomSampling.pm

Criterion	Covered	Total	%
statement	122	124	98.3
branch	25	28	89.2
condition	3	3	100.0
subroutine	14	14	100.0
pod	8	9	88.8
total	172	178	96.6

line	stmt	bran	cond	sub	pod	time	code
1							package File::FormatIdentification::RandomSampling;
2							# ABSTRACT: methods to identify content of device o media files using random sampling
3							our $VERSION = '0.006'; # VERSION:
4							# (c) 2020/2021 by Andreas Romeyke
5							# licensed via GPL v3.0 or later
6
7
8	2			2		75367	use strict;
	2					5
	2					63
9	2			2		11	use warnings;
	2					5
	2					78
10	2			2		33	use feature qw(say);
	2					4
	2					247
11	2			2		1290	use Moose;
	2					1004143
	2					18
12
13							has 'bytegram' => (
14							is => 'rw',
15							isa => 'ArrayRef',
16							default => sub {[]},
17							);
18
19
20
21
22							sub init_bytegrams {
23	8			8	1	18	my $self = shift;
24	8					16	my $bytegram_ref = $self->{'bytegram'};
25	8					127	$bytegram_ref->[0] = [(0) x 256]; # onegram
26	8					16899	$bytegram_ref->[1] = [(0) x 65536]; #bigram
27	8					59	return 1;
28							}
29
30							sub BUILD {
31	1			1	0	2	my $self = shift;
32	1					4	$self->init_bytegrams();
33	1					36	return 1;
34							}
35
36
37
38							sub update_bytegram {
39	8			8	1	19	my $self = shift;
40	8					21	my $buffer = shift;
41	8	100				28	if (defined $buffer) {
42	7					21	my $bytegram_ref = $self->{'bytegram'};
43	7					125	my @bytes = unpack "C*", $buffer;
44	7					66	my @words = unpack "S*", $buffer;
45							# my @bytes = map{ ord($_)} split //, $buffer;
46	7	50				23	if (scalar @bytes > 0) {
47	7					13	my @onegram = @{$bytegram_ref->[0]};
	7					85
48	7					17	my @bigram = @{$bytegram_ref->[1]};
	7					10879
49	7					36	foreach my $byte (@bytes) {
50	2562					3065	$onegram[$byte]++;
51							}
52	7					17	foreach my $word (@words) {
53	1280					1547	$bigram[$word]++;
54							}
55	7					52	$bytegram_ref->[0] = \@onegram;
56	7					5194	$bytegram_ref->[1] = \@bigram;
57							}
58							}
59	8					62	return 1;
60							}
61
62
63							sub calc_histogram { # use only the most significant first 8 entries
64	5			5	1	14	my $self = shift;
65	5					18	my $bytegram_ref = $self->{'bytegram'};
66	5					126	my @bytes_sorted = sort {$bytegram_ref->[0]->[$b] <=> $bytegram_ref->[0]->[$a]} (0..255);
	1290					1651
67	5					10455	my @words_sorted = sort {$bytegram_ref->[1]->[$b] <=> $bytegram_ref->[1]->[$a]} (0 .. 65535);
	327675					406376
68							# show only 8 most onegrame bytes
69	5					5858	my @bytes_truncated = @bytes_sorted[0..7];
70	5					23	my @words_truncated = @words_sorted[0..7];
71	5					11	my %histogram;
72	5					30	foreach my $byte (@bytes_truncated) {
73	40					62	push @{$histogram{onegram}}, $byte; #$bytegram_ref->[0]->[$byte];
	40					137
74							}
75	5					17	foreach my $word (@words_truncated) {
76	40					51	push @{$histogram{bigram}}, $word; #$bytegram_ref->[1]->[$word];
	40					71
77							}
78	5					5409	return \%histogram;
79							}
80
81
82							sub is_uniform {
83	5			5	1	13	my $self = shift;
84							#say "is_uniform?";
85	5					13	my $bytegram_ref = $self->{'bytegram'};
86	5					9	my $sum = 0;
87	5					12	my $n = 0;
88	5					8	my @unigram = @{$bytegram_ref->[0]};
	5					54
89	5					17	foreach my $byte (0 .. 255) {
90	1280	100				1927	if ($unigram[$byte] > 0) {
91	285					352	$n += $unigram[$byte];
92	285					367	$sum += ($unigram[$byte] * $byte);
93							}
94							}
95	5	100				28	if ($n == 0) { return;}
	2					19
96	3					5	my $expected = (256)/2;
97	3					33	my $mean = ($sum/$n);
98							#say "expected=$expected, sum=$sum, mean=$mean";
99	3					37	return (abs($expected - $mean) < 4);
100							}
101
102
103							sub is_empty {
104	7			7	1	16	my $self = shift;
105							#say "is_empty?";
106	7					14	my $bytegram_ref = $self->{'bytegram'};
107	7					12	my $sum = 0;
108	7					10	my $n = 0;
109	7					13	my @unigram = @{$bytegram_ref->[0]};
	7					64
110	7					22	foreach my $byte (0 .. 255) {
111	1792	100				2712	if ($unigram[$byte] > 0) {
112	314					364	$n += $unigram[$byte];
113	314					408	$sum += ($unigram[$byte] * $byte);
114							}
115							}
116	7	100				20	if ($n == 0) { return;}
	2					20
117	5					11	my $expected = 0;
118	5					37	my $mean = ($sum/$n);
119							# say "expected=$expected, mean=$mean";
120	5					17	my $criteria = abs($expected - $mean) < 4;
121	5					52	return ( $criteria);
122							}
123
124
125							sub is_text {
126	6			6	1	14	my $self = shift;
127							#say "is_text?";
128	6					14	my $bytegram_ref = $self->{'bytegram'};
129							# many Bytes in range 32 .. 173
130	6					7	my $printable = 0;
131	6					9	my $non_printable = 0;
132	6					11	my @unigram = @{$bytegram_ref->[0]};
	6					55
133	6					18	foreach my $byte (0 .. 255) {
134							#say "bytegram[$byte] = ". $bytegram_ref->[0]->[$byte];
135	1536	100				2359	if ($unigram[$byte] > 0) {
136	313	100	100			708	if (($byte >= 32) && ($byte <= 173)) {
137	196					253	$printable += ($unigram[$byte]);
138							}
139							else {
140	117					187	$non_printable += ($unigram[$byte]);
141							}
142							}
143							}
144	6					17	my $ratio = $printable / ($printable + $non_printable + 1); # +1 to avoid division by zero
145							#say "ratio text = $ratio (print=$printable, nonprint=$non_printable";
146	6					62	return ($ratio > 0.9);
147							}
148
149
150							sub is_video { # quicktime
151	4			4	1	11	my $self = shift;
152							#say "is_video?";
153	4					9	my $bytegram_ref = $self->{'bytegram'};
154							# many Bytes with 0x6d, ratio > 1/256 per read Byte
155	4					7	my $mp_indicator = 0;
156	4					8	my $other = 0;
157	4					7	my @unigram = @{$bytegram_ref->[0]};
	4					50
158							# MPEG-TS: Synchrobyte = 0x47 5times with distance of 188bytes
159							# MP4/Quicktime: Atom 'mvhd'
160							# General: 0x6d value
161	4					13	foreach my $byte ( 0 .. 255) {
162	1024	100				1536	if ($unigram[$byte] > 0) {
163	257	100				347	if ($byte != 0x6d) {
164	256					314	$other += $unigram[$byte];
165							} else { # $byte = 0x6d
166	1					4	$mp_indicator += $unigram[$byte];
167							}
168							}
169							}
170	4					13	my $ratio = $mp_indicator / ($mp_indicator + $other + 1); # +1 to avoid division by zero
171							#say "ratio=$ratio ($mp_indicator / ".($mp_indicator + $other + 1).") 47=", chr(0x47);
172	4					45	return ($ratio > 2/256);
173							}
174
175
176
177							sub calc_type {
178	3			3	1	8	my $self = shift;
179	3					5	my $buffer = shift;
180
181	3					13	$self->init_bytegrams();
182	3					32	$self->update_bytegram($buffer);
183
184	3	100				11	if ($self->is_empty()) {
		100
		50
		50
185	1					15	return "empty";
186							}
187							elsif ($self->is_text()) {
188	1					8	return "text";
189							}
190							elsif ($self->is_video()) {
191	0					0	return "video/audio";
192							}
193							elsif ($self->is_uniform()) {
194	0					0	return "random/encrypted/compressed";
195							}
196	1					25	return "undef";
197							}
198
199	2			2		36029	no Moose;
	2					4
	2					17
200
201							__PACKAGE__->meta->make_immutable;
202
203							1;
204
205							__END__
206
207							=pod
208
209							=encoding UTF-8
210
211							=head1 NAME
212
213							File::FormatIdentification::RandomSampling - methods to identify content of device o media files using random sampling
214
215							=head1 VERSION
216
217							version 0.006
218
219							=head1 SYNOPSIS
220
221							This module is suitable to get a good estimation about the content of media (or files). It uses random sampling of sectors to obtain heuristics about the content types.
222
223							To check the base type of a given binary string:
224
225							my $ff = File::FormatIdentification::RandomSampling->new(); # basic instantiation
226							my $type = $ff->calc_type($buffer); # calc type of given binary string
227
228							=head1 NAME
229
230							File::FormatIdentification::RandomSampling
231
232							=head1 TOOLS
233
234							The following tools are supplied with this module and are presented below:
235
236							=head2 F<crazy_fast_image_scan.pl>
237
238							This script scans devices or images very fast using random sampling and reports wht kind of content could be found.
239
240							For a detailed documentation use the included POD there.
241
242							=head2 F<cfi_create_training_data.pl>
243
244							This script scans a bunch of files and calcs most frequent one- and bigrams and stores them in a CSV file.
245
246							=head2 F<cfi_learn_model.pl>
247
248							This script uses the CSV file and prints a new model module in style of L<File::FormatIdentification::RandomSampling::Model> using L<AI::DecisionTree>.
249
250							=head1 SOURCE
251
252							The actual development version is available at L<https://art1pirat.spdns.org/art1/crazy-fast-image-scan>
253
254							=head1 METHODS
255
256							=head2 init_bytegrams
257
258							resets the internal bytegram state. Also called if object will be instantiated
259
260							=head2 update_bytegram
261
262							=over 1
263
264							=item C<$buffer> - updates the internal bytegram states using C<$buffer>
265
266							=back
267
268							=head2 calc_histogram
269
270							uses the most significant first 8 bytegram entries to from a histogram, returned as hash reference
271
272							=head2 is_uniform
273
274							returns true, if 1-byte bytegrams are uniform
275
276							=head2 is_empty
277
278							returns true, if 1-byte bytegrams indicating empty buffers
279
280							=head2 is_text
281
282							returns true, if 1-byte bytegrams are typical for texts
283
284							=head2 is_video
285
286							returns true, if 1-byte bytegrams are typical for MPEG/Quicktime Videos
287
288							=head2 calc_type
289
290							returns string indicating type of a given buffer
291
292							=head1 AUTHOR
293
294							Andreas Romeyke <pause@andreas-romeyke.de>
295
296							=head1 COPYRIGHT AND LICENSE
297
298							This software is Copyright (c) 2020 by Andreas Romeyke.
299
300							This is free software, licensed under:
301
302							The GNU General Public License, Version 3, June 2007
303
304							=cut