File Coverage

blib/lib/Image/ExifTool/Text.pm
Criterion Covered Total %
statement 75 103 72.8
branch 38 80 47.5
condition 13 33 39.3
subroutine 5 5 100.0
pod 0 1 0.0
total 131 222 59.0


line stmt bran cond sub pod time code
1             #------------------------------------------------------------------------------
2             # File: Text.pm
3             #
4             # Description: Deduce characteristics of TXT and CSV files
5             #
6             # Revisions: 2019-11-01 - P. Harvey Created
7             # 2020-02-13 - PH Added CSV file support
8             #
9             # References: 1) https://github.com/file/file
10             #------------------------------------------------------------------------------
11              
12             package Image::ExifTool::Text;
13              
14 4     4   4003 use strict;
  4         8  
  4         124  
15 4     4   26 use vars qw($VERSION);
  4         12  
  4         182  
16 4     4   31 use Image::ExifTool qw(:DataAccess :Utils);
  4         12  
  4         773  
17 4     4   889 use Image::ExifTool::XMP;
  4         16  
  4         4712  
18              
19             $VERSION = '1.03';
20              
# Text tags
# (these are not read from the file as stored metadata; ProcessTXT generates
#  each value itself and stores it by tag name through HandleTag)
%Image::ExifTool::Text::Main = (
    VARS => { NO_ID => 1 },
    GROUPS => { 0 => 'File', 1 => 'File', 2 => 'Document' },
    NOTES => q{
        Although basic text files contain no metadata, the following tags are
        determined from a simple analysis of the data in TXT and CSV files.
        Statistics are generated only for 8-bit encodings, but the
        L<FastScan|../ExifTool.html#FastScan> (-fast) option may be used to limit
        processing to the first 64 kB in which case some tags are not produced.  To
        avoid long processing delays, ExifTool will issue a minor warning and
        process only the first 64 kB of any file larger than 20 MB unless the
        L<IgnoreMinorErrors|../ExifTool.html#IgnoreMinorErrors> (-m) option is used.
    },
    MIMEEncoding => { Groups => { 2 => 'Other' } },
    # raw value is the literal newline sequence found in the file ('' if none)
    Newlines => {
        PrintConv => {
            "\r\n" => 'Windows CRLF',
            "\r"   => 'Macintosh CR',
            "\n"   => 'Unix LF',
            ''     => '(none)',
        },
    },
    ByteOrderMark => { PrintConv => { 0 => 'No', 1 => 'Yes' } },
    # statistics for plain text files
    LineCount   => { },
    WordCount   => { },
    # statistics for CSV files (raw values are the literal delimiter/quote
    # characters, or '' when none was detected)
    Delimiter   => { PrintConv => { '' => '(none)', ',' => 'Comma', ';' => 'Semicolon', "\t" => 'Tab' }},
    Quoting     => { PrintConv => { '' => '(none)', '"' => 'Double quotes', "'" => 'Single quotes' }},
    RowCount    => { },
    ColumnCount => { },
);
52              
#------------------------------------------------------------------------------
# Extract some stats from a text file
# Inputs: 0) ExifTool ref, 1) dirInfo ref (uses the TestBuff data reference,
#            presumably the first bytes of the file, and the RAF reader object)
# Returns: 1 on success, 0 if this wasn't a text file
# Notes: - UTF-16/UTF-32 are recognized only when a byte order mark is present
#        - FastScan 3 stops after MIMEEncoding; any other non-zero FastScan
#          stops after ByteOrderMark/Newlines (no line, word or CSV statistics)
#        - the input record separator ($/) is changed only through "local", so
#          the caller's value is restored no matter how this routine exits
sub ProcessTXT($$)
{
    my ($et, $dirInfo) = @_;
    my $dataPt = $$dirInfo{TestBuff};
    my $raf = $$dirInfo{RAF};
    my $fast = $et->Options('FastScan') || 0;
    my ($buff, $enc, $isBOM, $isUTF8);
    my $nl = '';

    return 0 unless length $$dataPt; # can't call it a text file if it has no text

    # read more from the file if necessary
    # (a completely full test buffer means the file may continue beyond it)
    if ($fast < 3 and length($$dataPt) == $Image::ExifTool::testLen) {
        $raf->Read($buff, 65536) or return 0;
        $dataPt = \$buff;
    }
#
# make our best guess at the character encoding (EBCDIC is not supported)
#
    if ($$dataPt =~ /([\0-\x06\x0e-\x1a\x1c-\x1f\x7f])/) {
        # file contains weird control characters, could be multi-byte Unicode
        # (the 4-byte UTF-32 BOM must be tested first because the UTF-32LE BOM
        #  begins with the same two bytes as the UTF-16LE BOM)
        if ($$dataPt =~ /^(\xff\xfe\0\0|\0\0\xfe\xff)/) {
            if ($1 eq "\xff\xfe\0\0") {
                $enc = 'utf-32le';
                $nl = $1 if $$dataPt =~ /(\r\0\0\0\n|\r|\n)\0\0\0/;
            } else {
                $enc = 'utf-32be';
                $nl = $1 if $$dataPt =~ /\0\0\0(\r\0\0\0\n|\r|\n)/;
            }
        } elsif ($$dataPt =~ /^(\xff\xfe|\xfe\xff)/) {
            if ($1 eq "\xff\xfe") {
                $enc = 'utf-16le';
                $nl = $1 if $$dataPt =~ /(\r\0\n|\r|\n)\0/;
            } else {
                $enc = 'utf-16be';
                $nl = $1 if $$dataPt =~ /\0(\r\0\n|\r|\n)/;
            }
        } else {
            return 0;       # probably not a text file
        }
        $nl =~ tr/\0//d;    # remove nulls from newline sequence
        $isBOM = 1;         # (we don't recognize UTF-16/UTF-32 without one)
    } else {
        # 8-bit data: IsUTF8 result is 0 for plain ASCII, >0 for UTF-8, and
        # negative for anything else (as used by the tests below)
        $isUTF8 = Image::ExifTool::XMP::IsUTF8($dataPt, 1);
        if ($isUTF8 == 0) {
            $enc = 'us-ascii';
        } elsif ($isUTF8 > 0) {
            $enc = 'utf-8';
            $isBOM = ($$dataPt =~ /^\xef\xbb\xbf/ ? 1 : 0);
        } elsif ($$dataPt !~ /[\x80-\x9f]/) {
            $enc = 'iso-8859-1';
        } else {
            $enc = 'unknown-8bit';
        }
        $nl = $1 if $$dataPt =~ /(\r\n|\r|\n)/;
    }

    my $tagTablePtr = GetTagTable('Image::ExifTool::Text::Main');

    $et->SetFileType();
    $et->HandleTag($tagTablePtr, MIMEEncoding => $enc);

    return 1 if $fast == 3 or not $raf->Seek(0,0);

    $et->HandleTag($tagTablePtr, ByteOrderMark => $isBOM) if defined $isBOM;
    $et->HandleTag($tagTablePtr, Newlines => $nl);

    # ($isUTF8 is defined only for 8-bit encodings, and statistics are
    #  generated only for these)
    return 1 if $fast or not defined $isUTF8;
#
# generate stats for CSV files
#
    if ($$et{FileType} eq 'CSV') {
        my ($delim, $quot, $ncols);
        my $nrows = 0;
        {
            # the default "\n" separator splits both LF and CRLF rows, but a
            # file using bare CR newlines would be read as a single huge row
            # (wrong row/column counts, and no protection from the row limit)
            local $/ = "\r" if $nl eq "\r";
            while ($raf->ReadLine($buff)) {
                if (not defined $delim) {
                    # first row: the most frequent candidate is the delimiter
                    my %count = ( ',' => 0, ';' => 0, "\t" => 0 );
                    ++$count{$_} foreach $buff =~ /[,;\t]/g;
                    if ($count{','} > $count{';'} and $count{','} > $count{"\t"}) {
                        $delim = ',';
                    } elsif ($count{';'} > $count{"\t"}) {
                        $delim = ';';
                    } elsif ($count{"\t"}) {
                        $delim = "\t";
                    } else {
                        $delim = '';
                        $ncols = 1;
                    }
                    unless ($ncols) {
                        # account for delimiters in quotes (simplistically)
                        while ($buff =~ /(^|$delim)(["'])(.*?)\2(?=$delim|$)/sg) {
                            $quot = $2;
                            my $field = $3;
                            $count{$delim} -= () = $field =~ /$delim/g;
                        }
                        $ncols = $count{$delim} + 1;
                    }
                } elsif (not $quot) {
                    # keep looking for a quoted field until one is found
                    $quot = $2 if $buff =~ /(^|$delim)(["'])(.*?)\2(?=$delim|$)/sg;
                }
                # stop at 1000 rows unless this minor warning is ignored
                if (++$nrows == 1000 and $et->Warn('Not counting rows past 1000', 2)) {
                    undef $nrows;   # (count is incomplete, so don't report it)
                    last;
                }
            }
        }
        $et->HandleTag($tagTablePtr, Delimiter => ($delim || ''));
        $et->HandleTag($tagTablePtr, Quoting => ($quot || ''));
        $et->HandleTag($tagTablePtr, ColumnCount => $ncols);
        $et->HandleTag($tagTablePtr, RowCount => $nrows) if $nrows;
        return 1;
    }
    return 1 if $$et{VALUE}{FileSize} and $$et{VALUE}{FileSize} > 20000000 and
        $et->Warn('Not counting lines/words in text file larger than 20 MB', 2);
#
# count lines/words and check encoding of the rest of the file
#
    my ($lines, $words) = (0, 0);
    {
        # read lines using the newline sequence detected above (scoped with
        # "local" so $/ is restored even if something in the loop dies)
        local $/ = $nl if $nl;
        while ($raf->ReadLine($buff)) {
            ++$lines;
            ++$words while $buff =~ /\S+/g;
            if (not $nl and $buff =~ /(\r\n|\r|\n)$/) {
                # (the first line must have been longer than 64 kB)
                $$et{VALUE}{Newlines} = $nl = $1;
            }
            next if $raf->Tell() < 65536;
            # continue to check encoding after the first 64 kB
            if ($isUTF8 >= 0) { # (if ascii or utf8)
                $isUTF8 = Image::ExifTool::XMP::IsUTF8(\$buff);
                if ($isUTF8 > 0) {
                    $enc = 'utf-8';
                } elsif ($isUTF8 < 0) {
                    $enc = $buff =~ /[\x80-\x9f]/ ? 'unknown-8bit' : 'iso-8859-1';
                }
            } elsif ($enc eq 'iso-8859-1' and $buff =~ /[\x80-\x9f]/) {
                $enc = 'unknown-8bit';
            }
        }
    }
    # replace the MIMEEncoding stored earlier if later data changed our guess
    if ($$et{VALUE}{MIMEEncoding} ne $enc) {
        $$et{VALUE}{MIMEEncoding} = $enc;
        $et->VPrint(0,"  MIMEEncoding [override] = $enc\n");
    }
    $et->HandleTag($tagTablePtr, LineCount => $lines);
    $et->HandleTag($tagTablePtr, WordCount => $words);
    return 1;
}
205              
206              
207             1; # end
208              
209             __END__