File Coverage

lib/Sisimai/String.pm
Criterion Covered Total %
statement 80 83 96.3
branch 22 28 78.5
condition 20 26 76.9
subroutine 12 12 100.0
pod 5 6 83.3
total 139 155 89.6


line stmt bran cond sub pod time code
1             package Sisimai::String;
2 85     85   69074 use feature ':5.10';
  85         153  
  85         6382  
3 85     85   481 use strict;
  85         163  
  85         1546  
4 85     85   377 use warnings;
  85         162  
  85         1925  
5 85     85   47681 use Encode;
  85         838973  
  85         5879  
6 85     85   41319 use Digest::SHA;
  85         250686  
  85         6711  
7              
8             my $EncodingsC = [qw/big5-eten gb2312/];
9             my $EncodingsE = [qw/iso-8859-1/];
10             my $EncodingsJ = [qw/7bit-jis iso-2022-jp euc-jp shiftjis/];
11 85     85   39375 use Encode::Guess; Encode::Guess->add_suspects(@$EncodingsC, @$EncodingsE, @$EncodingsJ);
  85         329215  
  85         314  
12 17     17 0 171 sub encodenames { return [@$EncodingsC, @$EncodingsE, @$EncodingsJ] };
13              
14             sub token {
15             # Create the message token from an addresser and a recipient
16             # @param [String] addr1 A sender's email address
17             # @param [String] addr2 A recipient's email address
18             # @param [Integer] epoch Machine time of the email bounce
19             # @return [String] Message token(MD5 hex digest) or empty string
20             # if the any argument is missing
21             # @see http://en.wikipedia.org/wiki/ASCII
22             # @see https://metacpan.org/pod/Digest::MD5
23 2968   50 2968 1 73702 my $class = shift || return '';
24 2968   100     5738 my $addr1 = shift || return '';
25 2967   100     5177 my $addr2 = shift || return '';
26 2966   100     5560 my $epoch = shift // return '';
27              
28             # Format: STX(0x02) Sender-Address RS(0x1e) Recipient-Address ETX(0x03)
29 2965         43063 return Digest::SHA::sha1_hex(sprintf("\x02%s\x1e%s\x1e%d\x03", lc $addr1, lc $addr2, $epoch));
30             }
31              
32             sub is_8bit {
33             # The argument is 8-bit text or not
34             # @param [String] argv1 Any string to be checked
35             # @return [Integer] 0: ASCII Characters only
36             # 1: Including 8-bit character
37 2715     2715 1 3790 my $class = shift;
38 2715   50     5348 my $argv1 = shift // return undef;
39              
40 2715 50       8015 return undef unless ref $argv1 eq 'SCALAR';
41 2715 100       9538 return 1 unless $$argv1 =~ /\A[\x00-\x7f]+\z/;
42 2675         7595 return 0;
43             }
44              
45             sub sweep {
46             # Clean the string out
47             # @param [String] argv1 String to be cleaned
48             # @return [Scalar] Cleaned out string
49             # @example Clean up text
50             # sweep(' neko ') #=> 'neko'
51 3115     3115 1 5163 my $class = shift;
52 3115   100     6104 my $argv1 = shift // return undef;
53              
54 3114         5407 chomp $argv1;
55 3114         8972 $argv1 =~ y/ //s;
56 3114         5104 $argv1 =~ y/\t//d;
57 3114 100       10654 $argv1 =~ s/\A //g if index($argv1, ' ') == 0;
58 3114 100       9508 $argv1 =~ s/ \z//g if substr($argv1, -1, 1) eq ' ';
59 3114         5990 $argv1 =~ s/ [-]{2,}[^ \t].+\z//;
60 3114         8651 return $argv1;
61             }
62              
63             sub to_plain {
64             # Convert given HTML text to plain text
65             # @param [Scalar] argv1 HTML text(reference to string)
66             # @param [Integer] loose Loose check flag
67             # @return [Scalar] Plain text(reference to string)
68 24     24 1 6710 my $class = shift;
69 24   50     81 my $argv1 = shift // return \'';
70 24   100     120 my $loose = shift // 0;
71 24 50       80 return \'' unless ref $argv1 eq 'SCALAR';
72              
73 24         55 my $plain = $$argv1;
74 24         69 state $match = {
75             'html' => qr|].+?|sim,
76             'body' => qr|.+.*].+|sim,
77             };
78              
79 24 100 100     1997 if( $loose || $plain =~ $match->{'html'} || $plain =~ $match->{'body'} ) {
      66        
80             # ...
81             # 1. Remove ...
82             # 2. Remove
83             # 3. ... to " http://... "
84             # 4. ... to " Value "
85 23         4134 $plain =~ s|.*?||gsim;
86 23         4565 $plain =~ s|.*?||gsim;
87 23         6155 $plain =~ s|(.*?)| [$2]($1) |gsim;
88 23         3446 $plain =~ s|(.*?)| [$2](mailto:$1) |gsim;
89              
90 23         12099 $plain =~ s/<[^<@>]+?>\s*/ /g; # Delete HTML tags except
91 23         100 $plain =~ s/</
92 23         57 $plain =~ s/>/>/g; # Convert to right angle brackets
93 23         59 $plain =~ s/&/&/g; # Convert to "&"
94 23         71 $plain =~ s/"/"/g; # Convert to '"'
95 23         49 $plain =~ s/'/'/g; # Convert to "'"
96 23         170 $plain =~ s/ / /g; # Convert to ' '
97              
98 23 50       105 if( length($$argv1) > length($plain) ) {
99 23         221 $plain =~ y/ //s;
100 23         54 $plain .= "\n"
101             }
102             }
103 24         132 return \$plain;
104             }
105              
106             sub to_utf8 {
107             # Convert given string to UTF-8
108             # @param [String] argv1 String to be converted
109             # @param [String] argv2 Encoding name before converting
110             # @return [String] UTF-8 Encoded string
111 60     60 1 153 my $class = shift;
112 60   50     252 my $argv1 = shift || return \'';
113 60         140 my $argv2 = shift;
114              
115 60         111 my $tobeutf8ed = $$argv1;
116 60   50     206 my $encodefrom = lc $argv2 || '';
117 60         95 my $hasencoded = undef;
118 60         515 my $hasguessed = Encode::Guess->guess($tobeutf8ed);
119 60 100       65507 my $encodingto = ref $hasguessed ? lc($hasguessed->name) : '';
120 60         135 state $dontencode = qr/\A(?>utf[-]?8|(?:us[-])?ascii)\z/;
121              
122 60 50       187 if( $encodefrom ) {
123             # The 2nd argument is a encoding name of the 1st argument
124 60         117 while(1) {
125             # Encode a given string when the encoding of the string is neigther
126             # utf8 nor ascii.
127 60 100       494 last if $encodefrom =~ $dontencode;
128 59 100       298 last if $encodingto =~ $dontencode;
129              
130 49         105 eval {
131             # Try to convert the string to UTF-8
132 49         214 Encode::from_to($tobeutf8ed, $encodefrom, 'utf8');
133 49         9312 $hasencoded = 1;
134             };
135 49         121 last;
136             }
137             }
138 60 100       337 return \$tobeutf8ed if $hasencoded;
139 11 50       37 return \$tobeutf8ed unless $encodingto;
140 11 50       116 return \$tobeutf8ed if $encodingto =~ $dontencode;
141              
142             # a. The 2nd argument was not given or failed to convert from $encodefrom to UTF-8
143             # b. Guessed encoding name is available, try to encode using it.
144             # c. Encode a given string when the encoding of the string is neigther utf8 nor ascii.
145 0           eval { Encode::from_to($tobeutf8ed, $encodingto, 'utf8') };
  0            
146 0           return \$tobeutf8ed;
147             }
148              
149             1;
150             __END__