File Coverage

blib/lib/Lingua/JA/Jtruncate.pm

Criterion	Covered	Total	%
statement	35	35	100.0
branch	13	18	72.2
condition	3	6	50.0
subroutine	6	6	100.0
pod	1	2	50.0
total	58	67	86.5

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::JA::Jtruncate;
2							$Lingua::JA::Jtruncate::VERSION = '0.022';
3							#------------------------------------------------------------------------------
4							#
5							# Start of POD
6							#
7							#------------------------------------------------------------------------------
8
9							=head1 NAME
10
11							Lingua::JA::Jtruncate - module to truncate Japanese encoded text.
12
13							=head1 SYNOPSIS
14
15							use Lingua::JA::Jtruncate qw( jtruncate );
16							$truncated_jtext = jtruncate( $jtext, $length );
17
18							=head1 DESCRIPTION
19
20							The jtruncate function truncates text to a length $length less than bytes. It
21							is designed to cope with Japanese text which has been encoded using one of the
22							standard encoding schemes - EUC, JIS, and Shift-JIS.
23							It uses the L module to detect what encoding is being used.
24							If the text is none of the above Japanese encodings,
25							the text is just truncated using substr.
26							If it is detected as Japanese text, it tries to truncate the text as well as
27							possible without breaking the multi-byte encoding. It does this by detecting
28							the character encoding of the text, and recursively deleting Japanese (possibly
29							multi-byte) characters from the end of the text until it is underneath the
30							length specified. It should work for EUC, JIS and Shift-JIS encodings.
31
32							=head1 FUNCTIONS
33
34							=head2 jtruncate( $jtext, $length )
35
36							B takes some japanese text and a byte length as arguments, and
37							returns the japanese text truncated to that byte length.
38
39							$truncated_jtext = jtruncate( $jtext, $length );
40
41							=head1 SEE ALSO
42
43							L
44
45							=head1 REPOSITORY
46
47							L
48
49							=head1 AUTHOR
50
51							Originally written by Ave Wrigley (AWRIGLEY),
52							now maintained by Neil Bowers (NEILB).
53
54							=head1 COPYRIGHT
55
56							Copyright (c) 1997 Canon Research Centre Europe (CRE). All rights reserved.
57
58							This is free software; you can redistribute it and/or modify it under
59							the same terms as the Perl 5 programming language system itself.
60
61							=cut
62
63							#------------------------------------------------------------------------------
64							#
65							# End of POD
66							#
67							#------------------------------------------------------------------------------
68
69							#------------------------------------------------------------------------------
70							#
71							# Pragmas
72							#
73							#------------------------------------------------------------------------------
74
75							require 5.006;
76	2			2		7	use strict;
	2					6
	2					41
77	2			2		6	use warnings;
	2					2
	2					37
78
79							#==============================================================================
80							#
81							# Modules
82							#
83							#==============================================================================
84
85							# use Lingua::JA::Jcode;
86	2			2		847	use Jcode;
	2					51891
	2					164
87							require Exporter;
88
89							#==============================================================================
90							#
91							# Public globals
92							#
93							#==============================================================================
94
95	2					745	use vars qw(
96							@ISA
97							@EXPORT_OK
98							%euc_code_set
99							%sjis_code_set
100							%jis_code_set
101							%char_re
102	2			2		14	);
	2					3
103
104							@ISA = qw( Exporter );
105							@EXPORT_OK = qw( jtruncate );
106
107							%euc_code_set = (
108							ASCII_JIS_ROMAN => '[\x00-\x7f]',
109							JIS_X_0208_1997 => '[\xa1-\xfe][\xa1-\xfe]',
110							HALF_WIDTH_KATAKANA => '\x8e[\xa0-\xdf]',
111							JIS_X_0212_1990 => '\x8f[\xa1-\xfe][\xa1-\xfe]',
112							);
113
114							%sjis_code_set = (
115							ASCII_JIS_ROMAN => '[\x21-\x7e]',
116							HALF_WIDTH_KATAKANA => '[\xa1-\xdf]',
117							TWO_BYTE_CHAR => '[\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]',
118							);
119
120							%jis_code_set = (
121							TWO_BYTE_ESC =>
122							'(?:' .
123							join( '\|',
124							'\x1b\x24\x40',
125							'\x1b\x24\x42',
126							'\x1b\x26\x40\x1b\x24\x42',
127							'\x1b\x24\x28\x44',
128							) .
129							')'
130							,
131							TWO_BYTE_CHAR => '(?:[\x21-\x7e][\x21-\x7e])',
132							ONE_BYTE_ESC => '(?:\x1b\x28[\x4a\x48\x42\x49])',
133							ONE_BYTE_CHAR =>
134							'(?:' .
135							join( '\|',
136							'[\x21-\x5f]', # JIS7 Half width katakana
137							'\x0f[\xa1-\xdf]*\x0e', # JIS8 Half width katakana
138							'[\x21-\x7e]', # ASCII / JIS-Roman
139							) .
140							')'
141							);
142
143							%char_re = (
144							'euc' => '(?:' . join( '\|', values %euc_code_set ) . ')',
145							'sjis' => '(?:' . join( '\|', values %sjis_code_set ) . ')',
146							'jis' => '(?:' . join( '\|', values %jis_code_set ) . ')',
147							);
148
149							#==============================================================================
150							#
151							# Public exported functions
152							#
153							#==============================================================================
154
155							#------------------------------------------------------------------------------
156							#
157							# jtruncate( $text, $length )
158							#
159							# truncate a string safely (i.e. don't break japanese encoding)
160							#
161							#------------------------------------------------------------------------------
162
163							sub jtruncate
164							{
165	25			25	1	32	my $text = shift;
166	25					25	my $length = shift;
167
168							# sanity checks
169
170	25	50				58	return '' if $length == 0;
171	25	50				59	return undef if not defined $length;
172	25	50				54	return undef if $length < 0;
173	25	50				56	return $text if length( $text ) <= $length;
174
175							# save the original text, in case we need to bomb out with a substr
176
177	25					26	my $orig_text = $text;
178
179	25					848	my $encoding = Jcode::getcode( \$text );
180	25	100	33			2038	if ( not defined $encoding or $encoding !~ /^(?:euc\|s?jis)$/ )
181							{
182
183							# not euc/sjis/jis - just use substr
184
185	7					28	return substr( $text, 0, $length );
186							}
187
188	18					71	$text = chop_jchars( $text, $length, $encoding );
189	18	100				46	return substr( $orig_text, 0, $length ) unless defined $text;
190
191							# JIS encoding uses escape sequences to shift in and out of single-byte /
192							# multi-byte modes. If the truncation process leaves the text ending in
193							# multi-byte mode, we need to add the single-byte escape sequence.
194							# Therefore, we truncate (at least) 3 more bytes from JIS encoded
195							# string, so we have room to add the single-byte escape sequence without
196							# going over the $length limit
197
198	15	100	66			100	if ( $encoding eq 'jis' and $text =~ /$jis_code_set{ TWO_BYTE_CHAR }$/ )
199							{
200	6					18	$text = chop_jchars( $text, $length - 3, $encoding );
201	6	50				13	return substr( $orig_text, 0, $length ) unless defined $text;
202	6					11	$text .= "\x1b\x28\x42";
203							}
204	15					32	return $text;
205							}
206
207							sub chop_jchars
208							{
209	24			24	0	23	my $text = shift;
210	24					24	my $length = shift;
211	24					21	my $encoding = shift;
212
213	24					56	while( length( $text ) > $length )
214							{
215	1099	100				2991	return undef unless $text =~ s!$char_re{ $encoding }$!!o;
216							}
217
218	21					35	return $text;
219							}
220
221							#==============================================================================
222							#
223							# Return true
224							#
225							#==============================================================================
226
227							1;