File Coverage

blib/lib/Lingua/JA/Fold.pm
Criterion Covered Total %
statement 150 155 96.7
branch 32 38 84.2
condition 2 6 33.3
subroutine 19 19 100.0
pod 5 11 45.4
total 208 229 90.8


line stmt bran cond sub pod time code
1             package Lingua::JA::Fold;
2              
3 1     1   28078 use 5.008;
  1         3  
  1         36  
4 1     1   6 use strict;
  1         2  
  1         33  
5 1     1   5 use warnings;
  1         12  
  1         27  
6 1     1   5 use utf8;
  1         1  
  1         7  
7              
8             our $VERSION = '0.08'; # 2008-03-19 (since 2003-03-26)
9              
10 1     1   41 use Exporter;
  1         2  
  1         98  
11             our @ISA = qw(Exporter);
12             our @EXPORT = qw(
13             fold
14             );
15             our @EXPORT_OK = qw(
16             length_full length_half
17             tab2space kana_half2full
18             );
19              
20 1     1   1003 use Encode;
  1         12906  
  1         106  
21 1     1   9 use Carp;
  1         2  
  1         1577  
22              
23             =head1 NAME
24              
25             Lingua::JA::Fold - to fold a Japanese text.
26              
27             =head1 SYNOPSIS
28              
29             use utf8;
30             use Lingua::JA::Fold qw(fold tab2space kana_half2full);
31            
32             my $text = 'アイウエオ 漢字';
33            
34             # replace a [TAB] with four [SPACE]s.
35             $text = tab2space('text' => $text, 'space' => 4);
36             # convert a half-width 'Kana' character to a full-width one.
37             $text = kana_half2full($text);
38            
39             # fold a text under full-width two characters par a line.
40             $text = fold('text' => $text, 'length' => 2);
41            
42             # output the result
43             utf8::encode($text);
44             print $text;
45              
46             =head1 DESCRIPTION
47              
48             This module is used to fold a Japanese text and so on.
49              
50             The Japanese (the Chinese and the Korean would be the same) text has traditionally unique manner in representing. Basically those characters are used to be represented in two kind of size which is 'full-width' or 'half-width'. Width and Height of full-width characters are the same size (regular square). At the point, it is different from the alphabet characters which have normally variable (slim) width in representing. Roughly say, we call width of alphabet characters and Arabic numbers as a half, and do the width of other characters as a full. In a Japanese text which is mixed with alphabet and Arabic numbers, a character has a width, it would be full or half.
51              
52             For such reasons, to fold a Japanese text is rather complicate thing.
53              
54             =head1 FUNCTIONS
55              
56             =over
57              
58             =item fold('text' => $text, 'length' => $i [, 'mode' => $mode])
59              
60             Function. To fold a string within specified length of $i.
61              
62             The way in which to calculate length is differs by a mode.
63              
64             'full-width' : culculated for a full-width character.
65             'traditional': culculated for a full-width character; reflects traditional manner of composition.
66             (not given) : igore size difference between a full and a half.
67              
68             =cut
69              
70             sub fold {
71 8     8 1 4743 my %param = @_;
72            
73             # check parameters
74 8 50       29 unless ($param{'text'}) {
75 0         0 return undef;
76             }
77 8 50 33     56 if (not $param{'length'} or $param{'length'} =~ m/\D/) {
78 0         0 croak "length must be given as an integer value of more than 1";
79             }
80            
81             # UTF-8 flag on
82 8         37 utf8::decode( $param{'text'} );
83            
84             # newline character unification
85 8         158 $param{'text'} =~ s/\x0D\x0A|\x0D|\x0A/\n/g;
86            
87             # split a text to lines
88 8         14 my @line;
89 8         20 while ($param{'text'}) {
90 70 100       198 if ($param{'text'} !~ m/\n/) {
91             # single line; end without newline
92 3         6 push @line, $param{'text'};
93 3         6 last;
94             }
95             else {
96             # single line; end with newline
97             # multi line; end with/without newline
98 67         453 $param{'text'} =~ s/^[^\n]*?\n//s;
99 67         272 push @line, $&;
100             }
101             }
102            
103             # folding mode junction
104 8 100       39 if (not $param{'mode'}) {
    100          
    50          
105 1         6 &fold_1($param{'length'}, \@line);
106             }
107             elsif ($param{'mode'} eq 'full-width') {
108 2         9 &fold_2($param{'length'}, \@line);
109             }
110             elsif ($param{'mode'} eq 'traditional') {
111 5         19 &fold_3($param{'length'}, \@line);
112             }
113            
114 8         83 return join '', @line;
115             }
116              
117             sub fold_1 {
118 1     1 0 3 my($length, $ref) = @_;
119            
120             # fold each lines
121 1         2 foreach my $line ( @{$ref} ) {
  1         5  
122 12         16 my @folded;
123 12         29 while ($line) {
124 47 100       107 if (length($line) > $length) {
125 35         39 my $newfold;
126 35         64 ($newfold, $line) = cutter_1($length, $line);
127 35         142 push(@folded, $newfold);
128             }
129             else {
130 12         18 last;
131             }
132             }
133 12         33 my $folded = join("\n", @folded);
134 12 100       32 if ($folded) {
135 5         29 $line = "$folded\n$line";
136             }
137             }
138            
139 1         3 return 1;
140             }
141             sub cutter_1 {
142 35     35 0 66 my($length, $string) = @_;
143 35         104 my $folded = substr($string, 0, $length);
144 35         88 my $unfold = substr($string, $length);
145 35         114 return $folded, $unfold;
146             }
147              
148             sub fold_2 {
149 2     2 0 5 my($length, $ref) = @_;
150            
151             # fold each lines
152 2         4 foreach my $line ( @{$ref} ) {
  2         6  
153 13         16 my @folded;
154 13         27 while ($line) {
155 52 100       87 if (length_full($line) > $length) {
156 39         43 my $newfold;
157 39         65 ($newfold, $line) = cutter_2($length, $line);
158 39         120 push(@folded, $newfold);
159             }
160             else {
161 13         17 last;
162             }
163             }
164 13         38 my $folded = join("\n", @folded);
165 13 100       31 if ($folded) {
166 6         30 $line = "$folded\n$line";
167             }
168             }
169            
170 2         6 return 1;
171             }
172             sub cutter_2 {
173 39     39 0 64 my($length, $string) = @_;
174 39         45 my $chars = $length;
175 39         97 my $folded = substr($string, 0, $chars);
176 39         70 my $shortage = $length - length_full($folded);
177 39         97 while ($shortage != 0) {
178 35 100       57 if ($shortage < 0) {
179 11         13 $chars -= 1;
180 11         25 $folded = substr($string, 0, $chars);
181 11         13 last;
182             }
183             else {
184 24         36 $chars += int($shortage + 0.5);
185 24         45 $folded = substr($string, 0, $chars);
186 24         38 $shortage = $length - length_full($folded);
187 24         64 next;
188             }
189             }
190 39         167 my $unfold = substr($string, $chars);
191 39         123 return $folded, $unfold;
192             }
193              
194             =item The Japanese Traditional Manner of Composition
195              
196             This formal manner is another saying as the forbidden rule. The rule is: 1) a termination mark like Ten "," and Maru ".", 2) a closing mark -- brace or parenthesis or bracket -- like ")", "}", "]", ">" and etc., 3) a repeat mark, those should not be placed at the top of a line. If it would be occured, such mark should be moved to the next to the bottom of the previous line.
197              
198             Actually by this module what is detect as a forbidden mark are listed next:
199              
200             ’”、。〃々〉》」』】〕〟ゝゞヽヾ),.]}
201              
202             Note that all of these marks are full-width Japanese characters.
203              
204             =cut
205              
206             my $Forbidden = '’”、。〃々〉》」』】〕〟ゝゞヽヾ),.]}';
207             # my $Forbidden = '\x{2019}\x{201D}\x{3001}-\x{3003}\x{3005}\x{3009}\x{300B}\x{300D}\x{300F}\x{3011}\x{3015}\x{301F}\x{309D}\x{309E}\x{30FD}\x{30FE}\x{FF09}\x{FF0C}\x{FF0E}\x{FF3D}\x{FF5D}';
208              
209             sub fold_3 {
210 5     5 0 10 my($length, $ref) = @_;
211            
212             # fold each lines
213 5         6 foreach my $line ( @{$ref} ) {
  5         12  
214 45         59 my @folded;
215 45         92 while ($line) {
216 123 100       246 if (length_full($line) > $length) {
217 78         87 my $newfold;
218 78         153 ($newfold, $line) = cutter_3($length, $line);
219 78         263 push(@folded, $newfold);
220             }
221             else {
222 45         60 last;
223             }
224             }
225 45         102 my $folded = join "\n", @folded;
226 45 100       107 if ($folded) {
227 35 50       84 if ( length($line) ) {
228 35 100       73 if ($line eq "\n") {
229 26         97 $line = "$folded$line";
230             }
231             else {
232 9         39 $line = "$folded\n$line";
233             }
234             }
235             else {
236 0         0 $line = $folded;
237             }
238             }
239             }
240            
241 5         13 return 1;
242             }
243             sub cutter_3 {
244 78     78 0 136 my($length, $string) = @_;
245            
246 78         93 my $chars = $length;
247 78         223 my $folded = substr($string, 0, $chars);
248 78         166 my $shortage = $length - length_full($folded);
249 78         212 while ($shortage != 0) {
250 8 100       17 if ($shortage < 0) {
251 2         3 $chars -= 1;
252 2         7 $folded = substr($string, 0, $chars);
253 2         3 last;
254             }
255             else {
256 6         9 $chars += int($shortage + 0.5);
257 6         14 $folded = substr($string, 0, $chars);
258 6         12 $shortage = $length - length_full($folded);
259 6         16 next;
260             }
261             }
262 78         166 my $unfold = substr($string, $chars);
263            
264 78         161 while ($unfold) {
265 134         308 my $char_top = substr($unfold, 0, 1);
266 134 100       595 if ($char_top =~ /[$Forbidden]/) {
267 56         80 $folded .= $char_top;
268 56         113 $unfold = substr($unfold, 1);
269 56         142 next;
270             }
271             else {
272 78         135 last;
273             }
274             }
275            
276 78         280 return $folded, $unfold;
277             }
278              
279             =item length_half($text)
280              
281             Function. Exportable. This is for counting length of a text for a half-width character.
282              
283             =cut
284              
285             sub length_half ($) {
286 1     1 1 541 my $string = shift;
287            
288             # remove all ASCII controls except for [SPACE]
289 1         5 $string =~ tr/\x00-\x1F\x7F//d;
290            
291             # ascii: arabic numbers, alphabets, marks
292 1         3 my $ascii = $string =~ tr/\x20-\x7E//d;
293             # half-width characters in the Unicode compatibility area
294 1     1   7 my $halfwidth = $string =~ tr/\x{FF61}-\x{FF9F}\x{FFE0}-\x{FFE5}//d;
  1         3  
  1         18  
  1         15  
295             # the rest: full-width characters
296 1         4 my $rest = length($string);
297            
298 1         6 return $ascii + $halfwidth + $rest * 2;
299             }
300              
301             =item length_full($text)
302              
303             Function. Exportable. This is for counting length of a text for a full-width character.
304              
305             =cut
306              
307             sub length_full ($) {
308 323     323 1 1312 my $string = shift;
309            
310             # remove all ASCII controls except for [SPACE]
311 323         1011 $string =~ tr/\x00-\x1F\x7F//d;
312            
313             # ascii: arabic numbers, alphabets, marks
314 323         1039 my $ascii = $string =~ tr/\x20-\x7E//d;
315             # half-width characters in the Unicode compatibility area
316 323         2084 my $halfwidth = $string =~ tr/\x{FF61}-\x{FF9F}\x{FFE0}-\x{FFE5}//d;
317             # the rest: full-width characters
318 323         596 my $rest = length($string);
319            
320 323         969 return ($ascii + $halfwidth) * 0.5 + $rest;
321             }
322              
323             # sub _length_full_fixed {}
324              
325             =item tab2space('text' => $text, 'space' => $i)
326              
327             Function. Exportable. To replace a [TAB] character in a text with given number of [SPACE]s.
328              
329             =cut
330              
331             sub tab2space {
332 1     1 1 14 my %param = @_;
333            
334             # check parameters
335 1 50       5 unless ($param{'text'}) {
336 0         0 return undef;
337             }
338 1 50 33     13 if (not $param{'space'} or $param{'space'} =~ m/\D/) {
339 0         0 croak "space must be given as an integer value of more than 1";
340             }
341            
342 1         3 my $spaces = ' ';
343 1         4 $spaces x= $param{'space'};
344            
345             # replacement
346 1         9 $param{'text'} =~ s/\t/$spaces/g;
347            
348 1         5 return $param{'text'};
349             }
350              
351             =item kana_half2full($text)
352              
353             Function. Exportable. To convert a character in a text from half-width 'Kana' to full-width one.
354              
355             =cut
356              
357             sub kana_half2full {
358 1     1 1 800 my $text = shift;
359            
360 1         8 $text = encode('iso-2022-jp', $text);
361 1         15120 $text = decode('iso-2022-jp', $text);
362            
363 1         116 return $text;
364             }
365              
366             ########################################################################
367             1;
368             __END__