File Coverage

blib/lib/OpenOffice/Wordlist.pm
Criterion Covered Total %
statement 57 61 93.4
branch 14 22 63.6
condition 2 4 50.0
subroutine 9 10 90.0
pod 5 5 100.0
total 87 102 85.2


line stmt bran cond sub pod time code
1             package OpenOffice::Wordlist;
2              
3 3     3   90216 use strict;
  3         8  
  3         100  
4 3     3   16 use warnings;
  3         8  
  3         151  
5              
6             =head1 NAME
7              
8             OpenOffice::Wordlist - Read/write OpenOffice.org wordlists
9              
10             =cut
11              
12             our $VERSION = '0.04';
13              
14             =head1 SYNOPSIS
15              
16             This module allows reading and writing of OpenOffice.org wordlists
17             (dictionaries).
18              
19             For example:
20              
21             use OpenOffice::Wordlist;
22              
23             my $dict = OpenOffice::Wordlist->new;
24             $dict->read(".openoffice.org/3/user/wordlist/standard.dic");
25              
26             # Print all words.
27             foreach my $word ( @{ $dict->words } ) {
28             print $word, "\n";
29             }
30              
31             # Add some words.
32             $dict->append( "openoffice", "great" );
33              
34             # Write a new dictionary.
35             $dict->write("new.dic");
36              
37             When used as a program this module will read all dictionaries given on
38             the command line and write the resultant list of words to standard
39             output. For example,
40              
41             $ perl OpenOffice/Wordlist.pm standard.dic
42              
43             =head1 METHODS
44              
45             =head2 $dict = new( [ type => 'WDSWG6', language => 2057, neg => 0 ] )
46              
47             Creates a new dict object.
48              
49             Optional arguments:
50              
51             type => 'WBSWG6' or 'WBSWG2' or 'WBSWG5'.
52              
53             'WBSWG6' (default) indicates a UTF-8 encoded dictionary, the others
54             indicate a ISO-8859.1 encoded dictionary.
55              
56             language => I
57              
58             The code for the language. I assume there's an extensive list of these
59             codes somewhere. Some values determined experimentally:
60              
61             255 All
62             1031 German (Germany)
63             1036 French (France)
64             1043 Dutch (Netherlands)
65             2047 English UK
66             2057 English USA
67              
68             neg => 0 or 1
69              
70             Whether the dictionary contains exceptions (neg = 1) or regular words
71             (neg = 0).
72              
73             If language and neg are not specified they are taken from the first
74             file read, if any.
75              
76             =cut
77              
78 3     3   12902 use Encode;
  3         52594  
  3         4109  
79              
80             sub new {
81 5     5 1 2217 my ( $pkg, %opts ) = @_;
82 5         35 my $self = bless { type => 'WBSWG6', words => [], %opts }, $pkg;
83 5         29 $self->_set_type( $self->{type} );
84 5         20 return $self;
85             }
86              
87             =head2 $dict->read( $file )
88              
89             Reads the contents of the indicated file.
90              
91             =cut
92              
93             sub read {
94 3     3 1 7 my ( $self, $file ) = @_;
95 3 50       109 open( my $dict, '<:raw', $file )
96             or die("$file: $!\n");
97              
98 3         4 my $data = do { local $/; <$dict> };
  3         8  
  3         69  
99 3 50       24 die( "$file: Invalid dict type\n")
100             unless substr( $data, 0, 8, '' ) =~ /\x06\x00(WBSWG[256])/;
101 3         8 my $type = $self->_set_type($1);
102              
103 3         6 my $lang = substr( $data, 0, 2, '' );
104 3 50       15 $self->{language} = unpack( "v", $lang )
105             unless defined $self->{language};
106              
107 3         7 my $neg = substr( $data, 0, 1, '' );
108 3 50       10 $self->{neg} = unpack( "C", $neg )
109             unless defined $self->{neg};;
110              
111 3         8 while ( $data ) {
112 27         709 my $length = substr( $data, 0, 2, '' );
113 27         42 $length = unpack( "v", $length );
114 27         29 push( @{$self->{words}},
  27         101  
115             decode( $self->{encoding}, substr( $data, 0, $length, '' ) ) );
116             }
117              
118 3 50       78 $self->_set_type($type) if $type;
119 3         38 return $self;
120             }
121              
122             # Internal.
123             sub _set_type {
124 13     13   25 my ( $self, $type ) = @_;
125 13         35 ( $self->{type}, $type) = ( $type, $self->{type} );
126 13 100       46 $self->{encoding} = $self->{type} eq 'WBSWG6' ? 'utf-8' : 'iso-8859-1';
127 13         21 return $type; # previous type
128             }
129              
130             =head2 $dict->append( @words )
131              
132             Append a list of words to the dictionary. To avoid unpleasant
133             surprises, the words must be encoded in Perl's internal encoding.
134              
135             The arguments may be constant strings or references to lists of strings.
136              
137             =cut
138              
139             sub append {
140 2     2 1 27 my ( $self, @words ) = @_;
141              
142 2         4 foreach my $word ( @words ) {
143 18 50       74 if ( UNIVERSAL::isa( $word, 'ARRAY' ) ) {
144 0         0 push( @{$self->{words}}, @$word );
  0         0  
145             }
146             else {
147 18         19 push( @{$self->{words}}, $word );
  18         46  
148             }
149             }
150              
151 2         6 return $self;
152             }
153              
154             =head2 $dict->words
155              
156             Returns a reference to the list of words in the dictionary,
157              
158             The words are encoded in Perl's internal encoding.
159              
160             =cut
161              
162             sub words {
163 0     0 1 0 my ( $self ) = @_;
164 0         0 $self->{words};
165             }
166              
167             =head2 $dict->write( $file [ , $type ] )
168              
169             Writes the contents of the object to a new dictionary.
170              
171             Arguments: The name of the file to be written, and (optionally) the
172             type of the file to be written (one of 'WBSWG6', 'WBSWG5', 'WBSWG2')
173             overriding the type of the dictionary as establised at create time.
174              
175             =cut
176              
177             sub write {
178 6     6 1 598 my ( $self, $file, $type ) = @_;
179              
180 6 100       20 $type = $self->_set_type($type) if $type;
181              
182 6 50       583 open( my $dict, '>:raw', $file )
183             or die("$file: $!\n");
184              
185 6   50     12 print { $dict } ( $self->__pfx( $self->{type} ),
  6   50     24  
186             pack( "v", $self->{language} || 0),
187             pack( "C", $self->{neg} || 0 ) );
188              
189 6         3392 require bytes;
190              
191 6         10 foreach ( @{$self->{words}} ) {
  6         20  
192 54         247 print { $dict } ( $self->__pfx($_) );
  54         119  
193             }
194 6 50       321 close($dict) or die("$file: $!\n");
195              
196 6 100       20 $type = $self->_set_type($type) if $type;
197              
198 6         23 return $self;
199             }
200              
201             # Internal.
202             sub __pfx {
203 60     60   84 my ( $self, $string ) = @_;
204 60         161 $string = encode( $self->{encoding}, $string );
205 60         1754 pack( "v", bytes::length($string) ) . $string;
206             }
207              
208             =head1 EXAMPLE
209              
210             This example reads all dictionaries that are supplied on the command
211             file, merges them, and writes a new dictionary.
212              
213             my $dict = OpenOffice::Wordlist->new( type => 'WBSWG6' );
214             $dict->read( shift );
215             foreach ( @ARGV ) {
216             my $extra = OpenOffice::Wordlist->new->read($_);
217             $dict->append( $extra->words );
218             }
219             $dict->write("new.dic");
220              
221             Settings like the language and exceptions are copied from the file
222             that is initially read.
223              
224             =head1 AUTHOR
225              
226             Johan Vromans, C<< >>
227              
228             =head1 BUGS
229              
230             There's currently no checking done on dictionary types arguments.
231              
232             Please report any bugs or feature requests to
233             C, or through the web
234             interface at
235             L.
236             I will be notified, and then you'll automatically be notified of
237             progress on your bug as I make changes.
238              
239             =head1 SUPPORT
240              
241             You can find documentation for this module with the perldoc command.
242              
243             perldoc OpenOffice::Wordlist
244              
245             You can also look for information at:
246              
247             =over 4
248              
249             =item * RT: CPAN's request tracker
250              
251             L
252              
253             =item * CPAN Ratings
254              
255             L
256              
257             =item * Search CPAN
258              
259             L
260              
261             =back
262              
263             =head1 ACKNOWLEDGEMENTS
264              
265             =head1 COPYRIGHT & LICENSE
266              
267             Copyright 2010 Johan Vromans, all rights reserved.
268              
269             This program is free software; you can redistribute it and/or modify it
270             under the same terms as Perl itself.
271              
272             =cut
273              
274             package main;
275              
276             unless ( caller ) {
277             binmode( STDOUT, ':encoding(utf-8)' );
278             my $dict = OpenOffice::Wordlist->new( type => 'WBSWG6' );
279             $dict->read( shift(@ARGV) );
280             foreach ( @ARGV ) {
281             my $extra = OpenOffice::Wordlist->new->read($_);
282             $dict->append( $extra->words );
283             }
284             foreach ( @{ $dict->words } ) {
285             print "$_\n";
286             }
287             }
288              
289             1;