File Coverage

lib/File/FormatIdentification/RandomSampling.pm
Criterion Covered Total %
statement 122 124 98.3
branch 25 28 89.2
condition 3 3 100.0
subroutine 14 14 100.0
pod 8 9 88.8
total 172 178 96.6


line stmt bran cond sub pod time code
1             package File::FormatIdentification::RandomSampling;
2             # ABSTRACT: methods to identify content of device o media files using random sampling
3             our $VERSION = '0.006'; # VERSION:
4             # (c) 2020/2021 by Andreas Romeyke
5             # licensed via GPL v3.0 or later
6              
7              
8 2     2   75367 use strict;
  2         5  
  2         63  
9 2     2   11 use warnings;
  2         5  
  2         78  
10 2     2   33 use feature qw(say);
  2         4  
  2         247  
11 2     2   1290 use Moose;
  2         1004143  
  2         18  
12              
13             has 'bytegram' => (
14             is => 'rw',
15             isa => 'ArrayRef',
16             default => sub {[]},
17             );
18              
19              
20              
21              
22             sub init_bytegrams {
23 8     8 1 18 my $self = shift;
24 8         16 my $bytegram_ref = $self->{'bytegram'};
25 8         127 $bytegram_ref->[0] = [(0) x 256]; # onegram
26 8         16899 $bytegram_ref->[1] = [(0) x 65536]; #bigram
27 8         59 return 1;
28             }
29              
30             sub BUILD {
31 1     1 0 2 my $self = shift;
32 1         4 $self->init_bytegrams();
33 1         36 return 1;
34             }
35              
36              
37              
38             sub update_bytegram {
39 8     8 1 19 my $self = shift;
40 8         21 my $buffer = shift;
41 8 100       28 if (defined $buffer) {
42 7         21 my $bytegram_ref = $self->{'bytegram'};
43 7         125 my @bytes = unpack "C*", $buffer;
44 7         66 my @words = unpack "S*", $buffer;
45             # my @bytes = map{ ord($_)} split //, $buffer;
46 7 50       23 if (scalar @bytes > 0) {
47 7         13 my @onegram = @{$bytegram_ref->[0]};
  7         85  
48 7         17 my @bigram = @{$bytegram_ref->[1]};
  7         10879  
49 7         36 foreach my $byte (@bytes) {
50 2562         3065 $onegram[$byte]++;
51             }
52 7         17 foreach my $word (@words) {
53 1280         1547 $bigram[$word]++;
54             }
55 7         52 $bytegram_ref->[0] = \@onegram;
56 7         5194 $bytegram_ref->[1] = \@bigram;
57             }
58             }
59 8         62 return 1;
60             }
61              
62              
63             sub calc_histogram { # use only the most significant first 8 entries
64 5     5 1 14 my $self = shift;
65 5         18 my $bytegram_ref = $self->{'bytegram'};
66 5         126 my @bytes_sorted = sort {$bytegram_ref->[0]->[$b] <=> $bytegram_ref->[0]->[$a]} (0..255);
  1290         1651  
67 5         10455 my @words_sorted = sort {$bytegram_ref->[1]->[$b] <=> $bytegram_ref->[1]->[$a]} (0 .. 65535);
  327675         406376  
68             # show only 8 most onegrame bytes
69 5         5858 my @bytes_truncated = @bytes_sorted[0..7];
70 5         23 my @words_truncated = @words_sorted[0..7];
71 5         11 my %histogram;
72 5         30 foreach my $byte (@bytes_truncated) {
73 40         62 push @{$histogram{onegram}}, $byte; #$bytegram_ref->[0]->[$byte];
  40         137  
74             }
75 5         17 foreach my $word (@words_truncated) {
76 40         51 push @{$histogram{bigram}}, $word; #$bytegram_ref->[1]->[$word];
  40         71  
77             }
78 5         5409 return \%histogram;
79             }
80              
81              
82             sub is_uniform {
83 5     5 1 13 my $self = shift;
84             #say "is_uniform?";
85 5         13 my $bytegram_ref = $self->{'bytegram'};
86 5         9 my $sum = 0;
87 5         12 my $n = 0;
88 5         8 my @unigram = @{$bytegram_ref->[0]};
  5         54  
89 5         17 foreach my $byte (0 .. 255) {
90 1280 100       1927 if ($unigram[$byte] > 0) {
91 285         352 $n += $unigram[$byte];
92 285         367 $sum += ($unigram[$byte] * $byte);
93             }
94             }
95 5 100       28 if ($n == 0) { return;}
  2         19  
96 3         5 my $expected = (256)/2;
97 3         33 my $mean = ($sum/$n);
98             #say "expected=$expected, sum=$sum, mean=$mean";
99 3         37 return (abs($expected - $mean) < 4);
100             }
101              
102              
103             sub is_empty {
104 7     7 1 16 my $self = shift;
105             #say "is_empty?";
106 7         14 my $bytegram_ref = $self->{'bytegram'};
107 7         12 my $sum = 0;
108 7         10 my $n = 0;
109 7         13 my @unigram = @{$bytegram_ref->[0]};
  7         64  
110 7         22 foreach my $byte (0 .. 255) {
111 1792 100       2712 if ($unigram[$byte] > 0) {
112 314         364 $n += $unigram[$byte];
113 314         408 $sum += ($unigram[$byte] * $byte);
114             }
115             }
116 7 100       20 if ($n == 0) { return;}
  2         20  
117 5         11 my $expected = 0;
118 5         37 my $mean = ($sum/$n);
119             # say "expected=$expected, mean=$mean";
120 5         17 my $criteria = abs($expected - $mean) < 4;
121 5         52 return ( $criteria);
122             }
123              
124              
125             sub is_text {
126 6     6 1 14 my $self = shift;
127             #say "is_text?";
128 6         14 my $bytegram_ref = $self->{'bytegram'};
129             # many Bytes in range 32 .. 173
130 6         7 my $printable = 0;
131 6         9 my $non_printable = 0;
132 6         11 my @unigram = @{$bytegram_ref->[0]};
  6         55  
133 6         18 foreach my $byte (0 .. 255) {
134             #say "bytegram[$byte] = ". $bytegram_ref->[0]->[$byte];
135 1536 100       2359 if ($unigram[$byte] > 0) {
136 313 100 100     708 if (($byte >= 32) && ($byte <= 173)) {
137 196         253 $printable += ($unigram[$byte]);
138             }
139             else {
140 117         187 $non_printable += ($unigram[$byte]);
141             }
142             }
143             }
144 6         17 my $ratio = $printable / ($printable + $non_printable + 1); # +1 to avoid division by zero
145             #say "ratio text = $ratio (print=$printable, nonprint=$non_printable";
146 6         62 return ($ratio > 0.9);
147             }
148              
149              
150             sub is_video { # quicktime
151 4     4 1 11 my $self = shift;
152             #say "is_video?";
153 4         9 my $bytegram_ref = $self->{'bytegram'};
154             # many Bytes with 0x6d, ratio > 1/256 per read Byte
155 4         7 my $mp_indicator = 0;
156 4         8 my $other = 0;
157 4         7 my @unigram = @{$bytegram_ref->[0]};
  4         50  
158             # MPEG-TS: Synchrobyte = 0x47 5times with distance of 188bytes
159             # MP4/Quicktime: Atom 'mvhd'
160             # General: 0x6d value
161 4         13 foreach my $byte ( 0 .. 255) {
162 1024 100       1536 if ($unigram[$byte] > 0) {
163 257 100       347 if ($byte != 0x6d) {
164 256         314 $other += $unigram[$byte];
165             } else { # $byte = 0x6d
166 1         4 $mp_indicator += $unigram[$byte];
167             }
168             }
169             }
170 4         13 my $ratio = $mp_indicator / ($mp_indicator + $other + 1); # +1 to avoid division by zero
171             #say "ratio=$ratio ($mp_indicator / ".($mp_indicator + $other + 1).") 47=", chr(0x47);
172 4         45 return ($ratio > 2/256);
173             }
174              
175              
176              
177             sub calc_type {
178 3     3 1 8 my $self = shift;
179 3         5 my $buffer = shift;
180              
181 3         13 $self->init_bytegrams();
182 3         32 $self->update_bytegram($buffer);
183              
184 3 100       11 if ($self->is_empty()) {
    100          
    50          
    50          
185 1         15 return "empty";
186             }
187             elsif ($self->is_text()) {
188 1         8 return "text";
189             }
190             elsif ($self->is_video()) {
191 0         0 return "video/audio";
192             }
193             elsif ($self->is_uniform()) {
194 0         0 return "random/encrypted/compressed";
195             }
196 1         25 return "undef";
197             }
198              
199 2     2   36029 no Moose;
  2         4  
  2         17  
200              
201             __PACKAGE__->meta->make_immutable;
202              
203             1;
204              
205             __END__
206              
207             =pod
208              
209             =encoding UTF-8
210              
211             =head1 NAME
212              
213             File::FormatIdentification::RandomSampling - methods to identify content of device o media files using random sampling
214              
215             =head1 VERSION
216              
217             version 0.006
218              
219             =head1 SYNOPSIS
220              
221             This module is suitable to get a good estimation about the content of media (or files). It uses random sampling of sectors to obtain heuristics about the content types.
222              
223             To check the base type of a given binary string:
224              
225             my $ff = File::FormatIdentification::RandomSampling->new(); # basic instantiation
226             my $type = $ff->calc_type($buffer); # calc type of given binary string
227              
228             =head1 NAME
229              
230             File::FormatIdentification::RandomSampling
231              
232             =head1 TOOLS
233              
234             The following tools are supplied with this module and are presented below:
235              
236             =head2 F<crazy_fast_image_scan.pl>
237              
238             This script scans devices or images very fast using random sampling and reports wht kind of content could be found.
239              
240             For a detailed documentation use the included POD there.
241              
242             =head2 F<cfi_create_training_data.pl>
243              
244             This script scans a bunch of files and calcs most frequent one- and bigrams and stores them in a CSV file.
245              
246             =head2 F<cfi_learn_model.pl>
247              
248             This script uses the CSV file and prints a new model module in style of L<File::FormatIdentification::RandomSampling::Model> using L<AI::DecisionTree>.
249              
250             =head1 SOURCE
251              
252             The actual development version is available at L<https://art1pirat.spdns.org/art1/crazy-fast-image-scan>
253              
254             =head1 METHODS
255              
256             =head2 init_bytegrams
257              
258             resets the internal bytegram state. Also called if object will be instantiated
259              
260             =head2 update_bytegram
261              
262             =over 1
263              
264             =item C<$buffer> - updates the internal bytegram states using C<$buffer>
265              
266             =back
267              
268             =head2 calc_histogram
269              
270             uses the most significant first 8 bytegram entries to from a histogram, returned as hash reference
271              
272             =head2 is_uniform
273              
274             returns true, if 1-byte bytegrams are uniform
275              
276             =head2 is_empty
277              
278             returns true, if 1-byte bytegrams indicating empty buffers
279              
280             =head2 is_text
281              
282             returns true, if 1-byte bytegrams are typical for texts
283              
284             =head2 is_video
285              
286             returns true, if 1-byte bytegrams are typical for MPEG/Quicktime Videos
287              
288             =head2 calc_type
289              
290             returns string indicating type of a given buffer
291              
292             =head1 AUTHOR
293              
294             Andreas Romeyke <pause@andreas-romeyke.de>
295              
296             =head1 COPYRIGHT AND LICENSE
297              
298             This software is Copyright (c) 2020 by Andreas Romeyke.
299              
300             This is free software, licensed under:
301              
302             The GNU General Public License, Version 3, June 2007
303              
304             =cut