File Coverage

blib/lib/ZMachine/ZSCII.pm
Criterion Covered Total %
statement 151 154 98.0
branch 42 58 72.4
condition 17 22 77.2
subroutine 17 17 100.0
pod 10 10 100.0
total 237 261 90.8


line stmt bran cond sub pod time code
1             package ZMachine::ZSCII 0.005;
2 1     1   134790 use 5.14.0;
  1         6  
3 1     1   6 use warnings;
  1         1  
  1         22  
4             # ABSTRACT: an encoder/decoder for Z-Machine text
5              
6 1     1   4 use Carp ();
  1         1  
  1         14  
7 1     1   3 use charnames ':full';
  1         2  
  1         15  
8              
9             #pod =head1 OVERVIEW
10             #pod
11             #pod ZMachine::ZSCII is a class for objects that are encoders/decoders of Z-Machine
12             #pod text. Right now, ZMachine::ZSCII only implements Version 5 (and thus 7 and 8),
13             #pod and even that partially. There is no abbreviation support yet.
14             #pod
15             #pod =head2 How Z-Machine Text Works
16             #pod
17             #pod The Z-Machine's text strings are composed of ZSCII characters. There are 1024
18             #pod ZSCII codepoints, although only bottom eight bits worth are ever used.
19             #pod Codepoints 0x20 through 0x7E are identical with the same codepoints in ASCII or
20             #pod Unicode.
21             #pod
22             #pod ZSCII codepoints are then encoded as strings of five-bit Z-characters. The
23             #pod most common ZSCII characters, the lowercase English alphabet, can be encoded
24             #pod with one Z-character. Uppercase letters, numbers, and common punctuation
25             #pod ZSCII characters require two Z-characters each. Any other ZSCII character can
26             #pod be encoded with four Z-characters.
27             #pod
28             #pod For storage on disk or in memory, the five-bit Z-characters are packed
29             #pod together, three in a word, and laid out in bytestrings. The last word in a
30             #pod string has its top bit set to mark the ending. When a bytestring would end
31             #pod with out enough Z-characters to pack a full word, it is padded.
32             #pod (ZMachine::ZSCII pads with Z-character 0x05, a shift character.)
33             #pod
34             #pod Later versions of the Z-Machine allow the mapping of ZSCII codepoints to
35             #pod Unicode codepoints to be customized. ZMachine::ZSCII does not yet support this
36             #pod feature.
37             #pod
38             #pod ZMachine::ZSCII I allow conversion between all four relevant
39             #pod representations: Unicode text, ZSCII text, Z-character strings, and packed
40             #pod Z-character bytestrings. All four forms are represented by Perl strings.
41             #pod
42             #pod =cut
43              
44             my %DEFAULT_ZSCII = (
45             chr(0x00) => "\N{NULL}",
46             chr(0x08) => "\N{DELETE}",
47             chr(0x0D) => "\x0D",
48             chr(0x1B) => "\N{ESCAPE}",
49              
50             (map {; chr $_ => chr $_ } (0x20 .. 0x7E)), # ASCII maps over
51              
52             # 0x09B - 0x0FB are the "extra characters" and need Unicode translation table
53             # 0x0FF - 0x3FF are undefined and never (?) used
54             );
55              
56             # We can use these characters below because they all (save for the magic A2-C6)
57             # are the same in Unicode/ASCII/ZSCII. -- rjbs, 2013-01-18
58             my $DEFAULT_ALPHABET = join(q{},
59             'a' .. 'z', # A0
60             'A' .. 'Z', # A1
61             ( # A2
62             "\0", # special: read 2 chars for 10-bit zscii character
63             "\x0D",
64             (0 .. 9),
65 1     1   13018 do { no warnings 'qw'; qw[ . , ! ? _ # ' " / \ - : ( ) ] },
  1         3  
  1         2551  
66             ),
67             );
68              
69             my @DEFAULT_EXTRA = map chr hex, qw(
70             E4 F6 FC C4 D6 DC DF BB AB EB EF FF CB CF E1 E9
71             ED F3 FA FD C1 C9 CD D3 DA DD E0 E8 EC F2 F9 C0
72             C8 CC D2 D9
73              
74             E2 EA EE F4 FB C2 CA CE D4 DB E5 C5 F8 D8 E3 F1
75             F5 C3 D1 D5 E6 C6 E7 C7 FE F0 DE D0 A3 153 152 A1
76             BF
77             );
78              
79             sub _validate_alphabet {
80 4     4   7 my (undef, $alphabet) = @_;
81              
82 4 50       8 Carp::croak("alphabet table was not 78 entries long")
83             unless length $alphabet == 78;
84              
85 4 50       12 Carp::carp("alphabet character 52 not set to 0x000")
86             unless substr($alphabet, 52, 1) eq chr(0);
87              
88             Carp::croak("alphabet table contains characters over 0xFF")
89 4 50       38 if grep {; ord > 0xFF } split //, $alphabet;
  312         389  
90             }
91              
92             sub _shortcuts_for {
93 4     4   9 my ($self, $alphabet) = @_;
94              
95 4         9 $self->_validate_alphabet($alphabet);
96              
97 4         20 my %shortcut = (q{ } => chr(0));
98              
99 4         12 for my $i (0 .. 2) {
100 12         18 my $offset = $i * 26;
101 12 100       21 my $prefix = $i ? chr(0x03 + $i) : '';
102              
103 12         15 for my $j (0 .. 25) {
104 312 100 100     506 next if $i == 2 and $j == 0; # that guy is magic! -- rjbs, 2013-01-18
105              
106 308         510 $shortcut{ substr($alphabet, $offset + $j, 1) } = $prefix . chr($j + 6);
107             }
108             }
109              
110 4         10 return \%shortcut;
111             }
112              
113             #pod =method new
114             #pod
115             #pod my $z = ZMachine::ZSCII->new;
116             #pod my $z = ZMachine::ZSCII->new(\%arg);
117             #pod my $z = ZMachine::ZSCII->new($version);
118             #pod
119             #pod This returns a new codec. If the only argument is a number, it is treated as a
120             #pod version specification. If no arguments are given, a Version 5 codec is made.
121             #pod
122             #pod Valid named arguments are:
123             #pod
124             #pod =begin :list
125             #pod
126             #pod = version
127             #pod
128             #pod The number of the Z-Machine targeted; at present, only 5, 7, or 8 are permitted
129             #pod values.
130             #pod
131             #pod = extra_characters
132             #pod
133             #pod This is a reference to an array of between 0 and 97 Unicode characters. These
134             #pod will be the characters to which ZSCII characters 155 through 251. They may not
135             #pod duplicate any characters represented by the default ZSCII set. No Unicode
136             #pod codepoint above U+FFFF is permitted, as it would not be representable in the
137             #pod Z-Machine Unicode substitution table.
138             #pod
139             #pod If no extra characters are given, the default table is used.
140             #pod
141             #pod = alphabet
142             #pod
143             #pod This is a string of 78 characters, representing the three 26-character
144             #pod alphabets used to encode ZSCII compactly into Z-characters. The first 26
145             #pod characters are alphabet 0, for the most common characters. The rest of the
146             #pod characters are alphabets 1 and 2.
147             #pod
148             #pod No character with a ZSCII value greater than 0xFF may be included in the
149             #pod alphabet. Character 52 (A2's first character) should be NUL.
150             #pod
151             #pod If no alphabet is given, the default alphabet is used.
152             #pod
153             #pod = alphabet_is_unicode
154             #pod
155             #pod By default, the values in the C are assumed to be ZSCII characters,
156             #pod so that the contents of the alphabet table from the Z-Machine's memory can be
157             #pod used directly. The C option specifies that the characters
158             #pod in the alphabet string are Unicode characters. They will be converted to ZSCII
159             #pod internally by the C method, and if characters appear in the
160             #pod alphabet that are not in the default ZSCII set or the extra characters, an
161             #pod exception will be raised.
162             #pod
163             #pod =end :list
164             #pod
165             #pod =cut
166              
167             sub new {
168 5     5 1 6523 my ($class, $arg) = @_;
169              
170 5 50       15 if (! defined $arg) {
171 0         0 $arg = { version => 5 };
172 5 100       13 } if (! ref $arg) {
173 2         5 $arg = { version => $arg };
174             }
175              
176 5         13 my $guts = { version => $arg->{version} };
177              
178             Carp::croak("only Version 5, 7, and 8 ZSCII are supported at present")
179             unless $guts->{version} == 5
180             or $guts->{version} == 7
181 5 50 66     96 or $guts->{version} == 8;
      66        
182              
183 4         220 $guts->{zscii} = { %DEFAULT_ZSCII };
184              
185             # Why is this an arrayref and not, like alphabets, a string?
186             # Alphabets are strings because they're guaranteed to fit in bytestrings.
187             # You can't put a ZSCII character over 0xFF in the alphabet, because it can't
188             # be put in the story file's alphabet table! By using a string, it's easy to
189             # just pass in the alphabet from memory to/from the codec. On the other
190             # hand, the Unicode translation table stores Unicode codepoint values packed
191             # into words, and it's not a good fit for use in the codec. Maybe a
192             # ZMachine::Util will be useful for packing/unpacking Unicode translation
193             # tables.
194             $guts->{extra} = $arg->{extra_characters}
195 4   100     24 || \@DEFAULT_EXTRA;
196              
197             Carp::confess("Unicode translation table exceeds maximum length of 97")
198 4 50       9 if @{ $guts->{extra} } > 97;
  4         11  
199              
200 4         6 for (0 .. $#{ $guts->{extra} }) {
  4         12  
201             Carp::confess("tried to add ambiguous Z->U mapping")
202 78 50       121 if exists $guts->{zscii}{ chr(155 + $_) };
203              
204 78         91 my $u_char = $guts->{extra}[$_];
205              
206             # Extra characters must go into the Unicode substitution table, which can
207             # only represent characters with codepoints between 0 and 0xFFFF. See
208             # Z-Machine Spec v1.1 ยง 3.8.4.2.1
209 78 50       109 Carp::confess("tried to add Unicode codepoint greater than U+FFFF")
210             if ord($u_char) > 0xFFFF;
211              
212 78         142 $guts->{zscii}{ chr(155 + $_) } = $u_char;
213             }
214              
215 4         12 $guts->{zscii_for} = { };
216 4         8 for my $zscii_char (sort keys %{ $guts->{zscii} }) {
  4         127  
217 474         494 my $unicode_char = $guts->{zscii}{$zscii_char};
218              
219             Carp::confess("tried to add ambiguous U->Z mapping")
220 474 50       587 if exists $guts->{zscii_for}{ $unicode_char };
221              
222 474         638 $guts->{zscii_for}{ $unicode_char } = $zscii_char;
223             }
224              
225 4         20 my $self = bless $guts => $class;
226              
227             # The default alphabet is entirely made up of characters that are the same in
228             # Unicode and ZSCII. If a user wants to put "extra characters" into the
229             # alphabet table, though, the alphabet should contain ZSCII values. When
230             # we're building a ZMachine::ZSCII using the contents of the story file's
231             # alphabet table, that's easy. If we're building a codec to *produce* a
232             # story file, it's less trivial, because we don't want to think about the
233             # specific ZSCII codepoints for the Unicode text we'll encode.
234             #
235             # We provide alphabet_is_unicode to let the user say "my alphabet is supplied
236             # in Unicode, please convert it to ZSCII during construction." -- rjbs,
237             # 2013-01-19
238 4   66     13 my $alphabet = $arg->{alphabet} || $DEFAULT_ALPHABET;
239              
240             # It's okay if the user supplies alphabet_is_unicode but not alphabet,
241             # because the default alphabet is all characters with the same value in both
242             # character sets! -- rjbs, 2013-01-20
243             $alphabet = $self->unicode_to_zscii($alphabet)
244 4 100       10 if $arg->{alphabet_is_unicode};
245              
246 4         10 $self->{alphabet} = $alphabet;
247 4         9 $self->{shortcut} = $class->_shortcuts_for( $self->{alphabet} );
248              
249 4         11 return $self;
250             }
251              
252             #pod =method encode
253             #pod
254             #pod my $packed_zchars = $z->encode( $unicode_text );
255             #pod
256             #pod This method takes a string of text and encodes it to a bytestring of packed
257             #pod Z-characters.
258             #pod
259             #pod Internally, it converts the Unicode text to ZSCII, then to Z-characters, and
260             #pod then packs them. Before this processing, any native newline characters (the
261             #pod value of C<\n>) are converted to C to match the Z-Machine's use of
262             #pod character 0x00D for newline.
263             #pod
264             #pod =cut
265              
266             sub encode {
267 3     3 1 1222 my ($self, $string) = @_;
268              
269 3         11 $string =~ s/\n/\x0D/g;
270              
271 3         9 my $zscii = $self->unicode_to_zscii($string);
272 3         7 my $zchars = $self->zscii_to_zchars($zscii);
273              
274 3         10 return $self->pack_zchars($zchars);
275             }
276              
277             #pod =method decode
278             #pod
279             #pod my $text = $z->decode( $packed_zchars );
280             #pod
281             #pod This method takes a bytestring of packed Z-characters and returns a string of
282             #pod text.
283             #pod
284             #pod Internally, it unpacks the Z-characters, converts them to ZSCII, and then
285             #pod converts those to Unicode. Any ZSCII characters 0x00D are converted to the
286             #pod value of C<\n>.
287             #pod
288             #pod =cut
289              
290             sub decode {
291 3     3 1 2010 my ($self, $bytestring) = @_;
292              
293 3         8 my $zchars = $self->unpack_zchars( $bytestring );
294 3         7 my $zscii = $self->zchars_to_zscii( $zchars );
295 3         7 my $unicode = $self->zscii_to_unicode( $zscii );
296              
297 3         10 $unicode =~ s/\x0D/\n/g;
298              
299 3         9 return $unicode;
300             }
301              
302             #pod =method unicode_to_zscii
303             #pod
304             #pod my $zscii_string = $z->unicode_to_zscii( $unicode_string );
305             #pod
306             #pod This method converts a Unicode string to a ZSCII string, using the dialect of
307             #pod ZSCII for the ZMachine::ZSCII's configuration.
308             #pod
309             #pod If the Unicode input contains any characters that cannot be mapped to ZSCII, an
310             #pod exception is raised.
311             #pod
312             #pod =cut
313              
314             sub unicode_to_zscii {
315 14     14 1 7310 my ($self, $unicode_text) = @_;
316              
317 14         22 my $zscii = '';
318 14         34 for (0 .. length($unicode_text) - 1) {
319 189         238 my $char = substr $unicode_text, $_, 1;
320              
321             Carp::croak(
322             sprintf "no ZSCII character available for Unicode U+%v05X <%s>",
323             $char,
324             charnames::viacode(ord $char),
325 189 100       310 ) unless defined( my $zscii_char = $self->{zscii_for}{ $char } );
326              
327 188         228 $zscii .= $zscii_char;
328             }
329              
330 13         34 return $zscii;
331             }
332              
333             #pod =method zscii_to_unicode
334             #pod
335             #pod my $unicode_string = $z->zscii_to_unicode( $zscii_string );
336             #pod
337             #pod This method converts a ZSCII string to a Unicode string, using the dialect of
338             #pod ZSCII for the ZMachine::ZSCII's configuration.
339             #pod
340             #pod If the ZSCII input contains any characters that cannot be mapped to Unicode, an
341             #pod exception is raised. I
342             #pod replacement character instead.>
343             #pod
344             #pod =cut
345              
346             sub zscii_to_unicode {
347 4     4 1 474 my ($self, $zscii) = @_;
348              
349 4         5 my $unicode = '';
350 4         10 for (0 .. length($zscii) - 1) {
351 49         52 my $char = substr $zscii, $_, 1;
352              
353             Carp::croak(
354             sprintf "no Unicode character available for ZSCII %#v05x", $char,
355 49 50       83 ) unless defined(my $unicode_char = $self->{zscii}{ $char });
356              
357 49         57 $unicode .= $unicode_char;
358             }
359              
360 4         6 return $unicode;
361             }
362              
363             #pod =method zscii_to_zchars
364             #pod
365             #pod my $zchars = $z->zscii_to_zchars( $zscii_string );
366             #pod
367             #pod Given a string of ZSCII characters, this method will return a (unpacked) string
368             #pod of Z-characters.
369             #pod
370             #pod It will raise an exception on ZSCII codepoints that cannot be represented as
371             #pod Z-characters, which should not be possible with legal ZSCII.
372             #pod
373             #pod =cut
374              
375             sub zscii_to_zchars {
376 10     10 1 5491 my ($self, $zscii) = @_;
377              
378 10 50       21 return '' unless length $zscii;
379              
380 10         14 my $zchars = '';
381 10         19 for (0 .. length($zscii) - 1) {
382 103         115 my $zscii_char = substr($zscii, $_, 1);
383 103 100       159 if (defined (my $shortcut = $self->{shortcut}{ $zscii_char })) {
384 92         90 $zchars .= $shortcut;
385 92         110 next;
386             }
387              
388 11         16 my $ord = ord $zscii_char;
389              
390 11 50       18 if ($ord >= 1024) {
391 0         0 Carp::croak(
392             sprintf "can't encode ZSCII codepoint %#v05x in Z-characters",
393             $zscii_char
394             );
395             }
396              
397 11         16 my $top = ($ord & 0b1111100000) >> 5;
398 11         13 my $bot = ($ord & 0b0000011111);
399              
400 11         14 $zchars .= "\x05\x06"; # The escape code for a ten-bit ZSCII character.
401 11         22 $zchars .= chr($top) . chr($bot);
402             }
403              
404 10         21 return $zchars;
405             }
406              
407             #pod =method zchars_to_zscii
408             #pod
409             #pod my $zscii = $z->zchars_to_zscii( $zchars_string, \%arg );
410             #pod
411             #pod Given a string of (unpacked) Z-characters, this method will return a string of
412             #pod ZSCII characters.
413             #pod
414             #pod It will raise an exception when the right thing to do can't be determined.
415             #pod Right now, that could mean lots of things.
416             #pod
417             #pod Valid arguments are:
418             #pod
419             #pod =begin :list
420             #pod
421             #pod = allow_early_termination
422             #pod
423             #pod If C is true, no exception is thrown if the
424             #pod Z-character string ends in the middle of a four z-character sequence. This is
425             #pod useful when dealing with dictionary words.
426             #pod
427             #pod =end :list
428             #pod
429             #pod =cut
430              
431             sub zchars_to_zscii {
432 7     7 1 2504 my ($self, $zchars, $arg) = @_;
433 7   100     30 $arg ||= {};
434              
435 7         10 my $text = '';
436 7         10 my $alphabet = 0;
437              
438 7         21 while (length( my $char = substr $zchars, 0, 1, '')) {
439 104         109 my $ord = ord $char;
440              
441 104 100       143 if ($ord == 0) { $text .= q{ }; next; }
  3         3  
  3         5  
442              
443 101 100       158 if ($ord == 0x04) { $alphabet = 1; next }
  7 100       7  
  7         12  
444 23         25 elsif ($ord == 0x05) { $alphabet = 2; next }
  23         35  
445              
446 71 100 100     132 if ($alphabet == 2 && $ord == 0x06) {
447 12         20 my $next_two = substr $zchars, 0, 2, '';
448 12 100       18 if (length $next_two != 2) {
449 2 100       6 last if $arg->{allow_early_termination};
450 1         124 Carp::croak("ten-bit ZSCII encoding segment terminated early")
451             }
452              
453 10         17 my $value = ord(substr $next_two, 0, 1) << 5
454             | ord(substr $next_two, 1, 1);
455              
456 10         12 $text .= chr $value;
457 10         11 $alphabet = 0;
458 10         19 next;
459             }
460              
461 59 50 33     125 if ($ord >= 0x06 && $ord <= 0x1F) {
462 59         102 $text .= substr $self->{alphabet}, (26 * $alphabet) + $ord - 6, 1;
463 59         58 $alphabet = 0;
464 59         96 next;
465             }
466              
467 0         0 Carp::croak("unknown zchar <$char> encountered in alphabet <$alphabet>");
468             }
469              
470 6         16 return $text;
471             }
472              
473             #pod =method make_dict_length
474             #pod
475             #pod my $zchars = $z->make_dict_length( $zchars_string )
476             #pod
477             #pod This method returns the Z-character string fit to dictionary length for the
478             #pod Z-machine version being handled. It will trim excess characters or pad with
479             #pod Z-character 5 to be the right length.
480             #pod
481             #pod When converting such strings back to ZSCII, you should pass the
482             #pod C to C, as a four-Z-character
483             #pod sequence may have been terminated early.
484             #pod
485             #pod =cut
486              
487             sub make_dict_length {
488 3     3 1 1424 my ($self, $zchars) = @_;
489              
490 3 50       8 my $length = $self->{version} >= 5 ? 9 : 6;
491 3         7 $zchars = substr $zchars, 0, $length;
492 3         7 $zchars .= "\x05" x ($length - length($zchars));
493              
494 3         6 return $zchars;
495             }
496              
497             #pod =method pack_zchars
498             #pod
499             #pod my $packed_zchars = $z->pack_zchars( $zchars_string );
500             #pod
501             #pod This method takes a string of unpacked Z-characters and packs them into a
502             #pod bytestring with three Z-characters per word. The final word will have its top
503             #pod bit set.
504             #pod
505             #pod =cut
506              
507             sub pack_zchars {
508 4     4 1 880 my ($self, $zchars) = @_;
509              
510 4         6 my $bytestring = '';
511              
512 4         13 while (my $substr = substr $zchars, 0, 3, '') {
513 31         48 $substr .= chr(5) until length $substr == 3;
514              
515 31         48 my $value = ord(substr($substr, 0, 1)) << 10
516             | ord(substr($substr, 1, 1)) << 5
517             | ord(substr($substr, 2, 1));
518              
519 31 100       44 $value |= (0x8000) if ! length $zchars;
520              
521 31         69 $bytestring .= pack 'n', $value;
522             }
523              
524 4         10 return $bytestring;
525             }
526              
527             #pod =method unpack_zchars
528             #pod
529             #pod my $zchars_string = $z->pack_zchars( $packed_zchars );
530             #pod
531             #pod Given a bytestring of packed Z-characters, this method will unpack them into a
532             #pod string of unpacked Z-characters that aren't packed anymore because they're
533             #pod unpacked instead of packed.
534             #pod
535             #pod Exceptions are raised if the input bytestring isn't made of an even number of
536             #pod octets, or if the string continues past the first word with its top bit set.
537             #pod
538             #pod =cut
539              
540             sub unpack_zchars {
541 5     5 1 1009 my ($self, $bytestring) = @_;
542              
543 5 50       16 Carp::croak("bytestring of packed zchars is not an even number of bytes")
544             if length($bytestring) % 2;
545              
546 5         12 my $terminate;
547 5         7 my $zchars = '';
548 5         15 while (my $word = substr $bytestring, 0, 2, '') {
549             # XXX: Probably allow this to warn and `last` -- rjbs, 2013-01-18
550 37 50       57 Carp::croak("input continues after terminating byte") if $terminate;
551              
552 37         52 my $n = unpack 'n', $word;
553 37         40 $terminate = $n & 0x8000;
554              
555 37         50 my $c1 = chr( ($n & 0b0111110000000000) >> 10 );
556 37         40 my $c2 = chr( ($n & 0b0000001111100000) >> 5 );
557 37         39 my $c3 = chr( ($n & 0b0000000000011111) );
558              
559 37         70 $zchars .= "$c1$c2$c3";
560             }
561              
562 5         15 return $zchars;
563             }
564              
565             1;
566              
567             __END__