File Coverage

blib/lib/Image/ExifTool/Text.pm
Criterion Covered Total %
statement 72 100 72.0
branch 38 80 47.5
condition 13 33 39.3
subroutine 4 4 100.0
pod 0 1 0.0
total 127 218 58.2


line stmt bran cond sub pod time code
1             #------------------------------------------------------------------------------
2             # File: Text.pm
3             #
4             # Description: Deduce characteristics of TXT and CSV files
5             #
6             # Revisions: 2019-11-01 - P. Harvey Created
7             # 2020-02-13 - PH Added CSV file support
8             #
9             # References: 1) https://github.com/file/file
10             #------------------------------------------------------------------------------
11              
12             package Image::ExifTool::Text;
13              
14 4     4   4526 use strict;
  4         12  
  4         142  
15 4     4   22 use vars qw($VERSION);
  4         10  
  4         174  
16 4     4   26 use Image::ExifTool qw(:DataAccess :Utils);
  4         14  
  4         6238  
17              
18             $VERSION = '1.04';
19              
20             # Text tags
21             %Image::ExifTool::Text::Main = (
22             VARS => { NO_ID => 1 },
23             GROUPS => { 0 => 'File', 1 => 'File', 2 => 'Document' },
24             NOTES => q{
25             Although basic text files contain no metadata, the following tags are
26             determined from a simple analysis of the data in TXT and CSV files.
27             Statistics are generated only for 8-bit encodings, but the L (-fast)
28             option may be used to limit processing to the first 64 kB in which case some
29             tags are not produced. To avoid long processing delays, ExifTool will issue
30             a minor warning and process only the first 64 kB of any file larger than 20
31             MB unless the L (-m)
32             option is used.
33             },
34             MIMEEncoding => { Groups => { 2 => 'Other' } },
35             Newlines => {
36             PrintConv => {
37             "\r\n" => 'Windows CRLF',
38             "\r" => 'Macintosh CR',
39             "\n" => 'Unix LF',
40             '' => '(none)',
41             },
42             },
43             ByteOrderMark => { PrintConv => { 0 => 'No', 1 => 'Yes' } },
44             LineCount => { },
45             WordCount => { },
46             Delimiter => { PrintConv => { '' => '(none)', ',' => 'Comma', ';' => 'Semicolon', "\t" => 'Tab' }},
47             Quoting => { PrintConv => { '' => '(none)', '"' => 'Double quotes', "'" => 'Single quotes' }},
48             RowCount => { },
49             ColumnCount => { },
50             );
51              
52             #------------------------------------------------------------------------------
53             # Extract some stats from a text file
54             # Inputs: 0) ExifTool ref, 1) dirInfo ref
55             # Returns: 1 on success, 0 if this wasn't a text file
56             sub ProcessTXT($$)
57             {
58 14     14 0 45 my ($et, $dirInfo) = @_;
59 14         45 my $dataPt = $$dirInfo{TestBuff};
60 14         33 my $raf = $$dirInfo{RAF};
61 14   50     53 my $fast = $et->Options('FastScan') || 0;
62 14         37 my ($buff, $enc, $isBOM, $isUTF8);
63 14         37 my $nl = '';
64              
65 14 50       56 return 0 unless length $$dataPt; # can't call it a text file if it has no text
66              
67             # read more from the file if necessary
68 14 50 33     81 if ($fast < 3 and length($$dataPt) == $Image::ExifTool::testLen) {
69 0 0       0 $raf->Read($buff, 65536) or return 0;
70 0         0 $dataPt = \$buff;
71             }
72             #
73             # make our best guess at the character encoding (EBCDIC is not supported)
74             #
75 14 100       67 if ($$dataPt =~ /([\0-\x06\x0e-\x1a\x1c-\x1f\x7f])/) {
76             # file contains weird control characters, could be multi-byte Unicode
77 1 50       12 if ($$dataPt =~ /^(\xff\xfe\0\0|\0\0\xfe\xff)/) {
    50          
78 0 0       0 if ($1 eq "\xff\xfe\0\0") {
79 0         0 $enc = 'utf-32le';
80 0 0       0 $nl = $1 if $$dataPt =~ /(\r\0\0\0\n|\r|\n)\0\0\0/;
81             } else {
82 0         0 $enc = 'utf-32be';
83 0 0       0 $nl = $1 if $$dataPt =~ /\0\0\0(\r\0\0\0\n|\r|\n)/;
84             }
85             } elsif ($$dataPt =~ /^(\xff\xfe|\xfe\xff)/) {
86 1 50       5 if ($1 eq "\xff\xfe") {
87 0         0 $enc = 'utf-16le';
88 0 0       0 $nl = $1 if $$dataPt =~ /(\r\0\n|\r|\n)\0/;
89             } else {
90 1         3 $enc = 'utf-16be';
91 1 50       11 $nl = $1 if $$dataPt =~ /\0(\r\0\n|\r|\n)/;
92             }
93             } else {
94 0         0 return 0; # probably not a text file
95             }
96 1         5 $nl =~ tr/\0//d; # remove nulls from newline sequence
97 1         2 $isBOM = 1; # (we don't recognize UTF-16/UTF-32 without one)
98             } else {
99 13         63 $isUTF8 = Image::ExifTool::IsUTF8($dataPt, 1);
100 13 100       74 if ($isUTF8 == 0) {
    100          
    100          
101 10         32 $enc = 'us-ascii';
102             } elsif ($isUTF8 > 0) {
103 1         3 $enc = 'utf-8';
104 1 50       5 $isBOM = ($$dataPt =~ /^\xef\xbb\xbf/ ? 1 : 0);
105             } elsif ($$dataPt !~ /[\x80-\x9f]/) {
106 1         4 $enc = 'iso-8859-1';
107             } else {
108 1         2 $enc = 'unknown-8bit';
109             }
110 13 100       102 $nl = $1 if $$dataPt =~ /(\r\n|\r|\n)/;
111             }
112              
113 14         51 my $tagTablePtr = GetTagTable('Image::ExifTool::Text::Main');
114              
115 14         79 $et->SetFileType();
116 14         72 $et->HandleTag($tagTablePtr, MIMEEncoding => $enc);
117              
118 14 50 33     114 return 1 if $fast == 3 or not $raf->Seek(0,0);
119              
120 14 100       85 $et->HandleTag($tagTablePtr, ByteOrderMark => $isBOM) if defined $isBOM;
121 14         65 $et->HandleTag($tagTablePtr, Newlines => $nl);
122              
123 14 100 66     107 return 1 if $fast or not defined $isUTF8;
124             #
125             # generate stats for CSV files
126             #
127 13 100       54 if ($$et{FileType} eq 'CSV') {
128 1         4 my ($delim, $quot, $ncols);
129 1         3 my $nrows = 0;
130 1         5 while ($raf->ReadLine($buff)) {
131 3 100       11 if (not defined $delim) {
    50          
132 1         6 my %count = ( ',' => 0, ';' => 0, "\t" => 0 );
133 1         10 ++$count{$_} foreach $buff =~ /[,;\t]/g;
134 1 50 33     9 if ($count{','} > $count{';'} and $count{','} > $count{"\t"}) {
    0          
    0          
135 1         3 $delim = ',';
136             } elsif ($count{';'} > $count{"\t"}) {
137 0         0 $delim = ';';
138             } elsif ($count{"\t"}) {
139 0         0 $delim = "\t";
140             } else {
141 0         0 $delim = '';
142 0         0 $ncols = 1;
143             }
144 1 50       4 unless ($ncols) {
145             # account for delimiters in quotes (simplistically)
146 1         62 while ($buff =~ /(^|$delim)(["'])(.*?)\2(?=$delim|$)/sg) {
147 0         0 $quot = $2;
148 0         0 my $field = $3;
149 0         0 $count{$delim} -= () = $field =~ /$delim/g;
150             }
151 1         5 $ncols = $count{$delim} + 1;
152             }
153             } elsif (not $quot) {
154 2 50       40 $quot = $2 if $buff =~ /(^|$delim)(["'])(.*?)\2(?=$delim|$)/sg;
155             }
156 3 50 33     15 if (++$nrows == 1000 and $et->Warn('Not counting rows past 1000', 2)) {
157 0         0 undef $nrows;
158 0         0 last;
159             }
160             }
161 1   50     19 $et->HandleTag($tagTablePtr, Delimiter => ($delim || ''));
162 1   50     19 $et->HandleTag($tagTablePtr, Quoting => ($quot || ''));
163 1         4 $et->HandleTag($tagTablePtr, ColumnCount => $ncols);
164 1 50       6 $et->HandleTag($tagTablePtr, RowCount => $nrows) if $nrows;
165 1         4 return 1;
166             }
167 12 50 33     111 return 1 if $$et{VALUE}{FileSize} and $$et{VALUE}{FileSize} > 20000000 and
      33        
168             $et->Warn('Not counting lines/words in text file larger than 20 MB', 2);
169             #
170             # count lines/words and check encoding of the rest of the file
171             #
172 12         40 my ($lines, $words) = (0, 0);
173 12         39 my $oldNL = $/;
174 12 100       43 $/ = $nl if $nl;
175 12         64 while ($raf->ReadLine($buff)) {
176 12         29 ++$lines;
177 12         129 ++$words while $buff =~ /\S+/g;
178 12 50 66     111 if (not $nl and $buff =~ /(\r\n|\r|\n)$/) {
179             # (the first line must have been longer than 64 kB)
180 0         0 $$et{VALUE}{Newlines} = $nl = $1;
181             }
182 12 50       50 next if $raf->Tell() < 65536;
183             # continue to check encoding after the first 64 kB
184 0 0 0     0 if ($isUTF8 >= 0) { # (if ascii or utf8)
    0          
185 0         0 $isUTF8 = Image::ExifTool::IsUTF8(\$buff);
186 0 0       0 if ($isUTF8 > 0) {
    0          
187 0         0 $enc = 'utf-8';
188             } elsif ($isUTF8 < 0) {
189 0 0       0 $enc = $buff =~ /[\x80-\x9f]/ ? 'unknown-8bit' : 'iso-8859-1';
190             }
191             } elsif ($enc eq 'iso-8859-1' and $buff =~ /[\x80-\x9f]/) {
192 0         0 $enc = 'unknown-8bit';
193             }
194             }
195 12 50       58 if ($$et{VALUE}{MIMEEncoding} ne $enc) {
196 0         0 $$et{VALUE}{MIMEEncoding} = $enc;
197 0         0 $et->VPrint(0," MIMEEncoding [override] = $enc\n");
198             }
199 12         52 $/ = $oldNL;
200 12         56 $et->HandleTag($tagTablePtr, LineCount => $lines);
201 12         67 $et->HandleTag($tagTablePtr, WordCount => $words);
202 12         43 return 1;
203             }
204              
205              
206             1; # end
207              
208             __END__