File Coverage

buildlib/Lucy/Test/TestUtils.pm
Criterion Covered Total %
statement 133 134 99.2
branch 16 26 61.5
condition n/a
subroutine 24 24 100.0
pod 0 14 0.0
total 173 198 87.3


line stmt bran cond sub pod time code
1             # Licensed to the Apache Software Foundation (ASF) under one or more
2             # contributor license agreements. See the NOTICE file distributed with
3             # this work for additional information regarding copyright ownership.
4             # The ASF licenses this file to You under the Apache License, Version 2.0
5             # (the "License"); you may not use this file except in compliance with
6             # the License. You may obtain a copy of the License at
7             #
8             # http://www.apache.org/licenses/LICENSE-2.0
9             #
10             # Unless required by applicable law or agreed to in writing, software
11             # distributed under the License is distributed on an "AS IS" BASIS,
12             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13             # See the License for the specific language governing permissions and
14             # limitations under the License.
15              
16 47     47   503782 use strict;
  47         68  
  47         1167  
17 47     47   158 use warnings;
  47         48  
  47         2547  
18              
19             package Lucy::Test::TestUtils;
20              
21             our $VERSION = '0.006000_002';
22             $VERSION = eval $VERSION;
23              
24 47     47   160 use Exporter 'import';
  47         60  
  47         2235  
25             our @EXPORT_OK = qw(
26             working_dir
27             create_working_dir
28             remove_working_dir
29             uscon_dir
30             create_index
31             create_uscon_index
32             test_index_loc
33             persistent_test_index_loc
34             init_test_index_loc
35             get_uscon_docs
36             utf8_test_strings
37             test_analyzer
38             doc_ids_from_td_coll
39             modulo_set
40             );
41              
42 47     47   9671 use Lucy;
  47         72  
  47         1556  
43 47     47   12595 use Lucy::Test;
  47         76  
  47         1208  
44 47     47   16392 use File::Spec::Functions qw( catdir catfile curdir updir );
  47         22915  
  47         2817  
45 47     47   21656 use Encode qw( _utf8_off );
  47         333284  
  47         2707  
46 47     47   234 use File::Path qw( rmtree );
  47         52  
  47         2244  
47 47     47   192 use Carp;
  47         49  
  47         52446  
48              
49             my $working_dir = catfile( curdir(), 'lucy_test' );
50              
51             # Return a directory within the system's temp directory where we will put all
52             # testing scratch files.
53 3     3 0 45 sub working_dir {$working_dir}
54              
55             sub create_working_dir {
56 1 50   1 0 93 mkdir( $working_dir, 0700 ) or die "Can't mkdir '$working_dir': $!";
57             }
58              
59             # Verify that this user owns the working dir, then zap it. Returns true upon
60             # success.
61             sub remove_working_dir {
62 2 100   2 0 106 return unless -d $working_dir;
63 1         2002 rmtree $working_dir;
64 1         4 return 1;
65             }
66              
67             # Return a location for a test index to be used by a single test file. If
68             # the test file crashes it cannot clean up after itself, so we put the cleanup
69             # routine in a single test file to be run at or near the end of the test
70             # suite.
71             sub test_index_loc {
72 9     9 0 60 return catdir( $working_dir, 'test_index' );
73             }
74              
75             # Return a location for a test index intended to be shared by multiple test
76             # files. It will be cleaned as above.
77             sub persistent_test_index_loc {
78 3     3 0 720 return catdir( $working_dir, 'persistent_test_index' );
79             }
80              
81             # Destroy anything left over in the test_index location, then create the
82             # directory. Finally, return the path.
83             sub init_test_index_loc {
84 9     9 0 42461 my $dir = test_index_loc();
85 9         6635 rmtree $dir;
86 9 50       127 die "Can't clean up '$dir'" if -e $dir;
87 9 50       452 mkdir $dir or die "Can't mkdir '$dir': $!";
88 9         31 return $dir;
89             }
90              
91             # Build a RAM index, using the supplied array of strings as source material.
92             # The index will have a single field: "content".
93             sub create_index {
94 26     26 0 119630 my $folder = Lucy::Store::RAMFolder->new;
95 26         1095 my $indexer = Lucy::Index::Indexer->new(
96             index => $folder,
97             schema => Lucy::Test::TestSchema->new,
98             );
99 26         22099 $indexer->add_doc( { content => $_ } ) for @_;
100 26         133053 $indexer->commit;
101 26         1338 return $folder;
102             }
103              
104             sub uscon_dir {
105 4     4 0 430 my @dirs = (
106             catdir('sample', 'us_constitution'),
107             catdir(updir(), 'common', 'sample', 'us_constitution'),
108             );
109              
110 4         14 for my $dir (@dirs) {
111 4 50       72 return $dir if -d $dir;
112             }
113              
114 0         0 die("uscon source dir not found");
115             }
116              
117             # Slurp us constitition docs and build hashrefs.
118             sub get_uscon_docs {
119              
120 2     2 0 11 my $uscon_dir = uscon_dir();
121 2 50       99 opendir( my $uscon_dh, $uscon_dir )
122             or die "couldn't opendir '$uscon_dir': $!";
123 2         196 my @filenames = grep {/\.txt$/} sort readdir $uscon_dh;
  112         134  
124 2 50       46 closedir $uscon_dh or die "couldn't closedir '$uscon_dir': $!";
125              
126 2         4 my %docs;
127              
128 2         4 for my $filename (@filenames) {
129 104         247 my $filepath = catfile( $uscon_dir, $filename );
130 104 50       1798 open( my $fh, '<', $filepath )
131             or die "couldn't open file '$filepath': $!";
132 104         77 my $content = do { local $/; <$fh> };
  104         216  
  104         864  
133 104 50       368 $content =~ /\A(.+?)^\s+(.*)/ms
134             or die "Can't extract title/bodytext from '$filepath'";
135 104         137 my $title = $1;
136 104         136 my $bodytext = $2;
137 104         4382 $bodytext =~ s/\s+/ /sg;
138 104 50       234 my $category
    100          
    100          
139             = $filename =~ /art/ ? 'article'
140             : $filename =~ /amend/ ? 'amendment'
141             : $filename =~ /preamble/ ? 'preamble'
142             : confess "Can't derive category for $filename";
143              
144 104         796 $docs{$filename} = {
145             title => $title,
146             bodytext => $bodytext,
147             url => "/us_constitution/$filename",
148             category => $category,
149             };
150             }
151              
152 2         14 return \%docs;
153             }
154              
155             sub _uscon_schema {
156 1     1   25 my $schema = Lucy::Plan::Schema->new;
157 1         30 my $analyzer = Lucy::Analysis::EasyAnalyzer->new( language => 'en' );
158 1         19 my $title_type = Lucy::Plan::FullTextType->new( analyzer => $analyzer, );
159 1         6 my $content_type = Lucy::Plan::FullTextType->new(
160             analyzer => $analyzer,
161             highlightable => 1,
162             );
163 1         7 my $url_type = Lucy::Plan::StringType->new( indexed => 0, );
164 1         3 my $cat_type = Lucy::Plan::StringType->new;
165 1         12 $schema->spec_field( name => 'title', type => $title_type );
166 1         28 $schema->spec_field( name => 'content', type => $content_type );
167 1         4 $schema->spec_field( name => 'url', type => $url_type );
168 1         4 $schema->spec_field( name => 'category', type => $cat_type );
169 1         9 return $schema;
170             }
171              
172             sub create_uscon_index {
173 1     1 0 2 my $folder
174             = Lucy::Store::FSFolder->new( path => persistent_test_index_loc() );
175 1         5 my $indexer = Lucy::Index::Indexer->new(
176             schema => _uscon_schema(),
177             index => $folder,
178             truncate => 1,
179             create => 1,
180             );
181              
182 1         74750 $indexer->add_doc( { content => "zz$_" } ) for ( 0 .. 10000 );
183 1         73126 $indexer->commit;
184 1         89 undef $indexer;
185              
186 1         10 $indexer = Lucy::Index::Indexer->new( index => $folder );
187 1         5 my $source_docs = get_uscon_docs();
188             $indexer->add_doc( { content => $_->{bodytext} } )
189 1         12361 for values %$source_docs;
190 1         8771 $indexer->commit;
191 1         45 undef $indexer;
192              
193 1         12 $indexer = Lucy::Index::Indexer->new( index => $folder );
194 1         9 my @chars = ( 'a' .. 'z' );
195 1         5 for ( 0 .. 1000 ) {
196 1001         970 my $content = '';
197 1001         1543 for my $num_words ( 1 .. int( rand(20) ) ) {
198 9422         8723 for ( 1 .. ( int( rand(10) ) + 10 ) ) {
199 136869         106869 $content .= @chars[ rand(@chars) ];
200             }
201 9422         6123 $content .= ' ';
202             }
203 1001         37080 $indexer->add_doc( { content => $content } );
204             }
205 1         6 $indexer->optimize;
206 1         220907 $indexer->commit;
207             }
208              
209             # Return 3 strings useful for verifying UTF-8 integrity.
210             sub utf8_test_strings {
211 4     4 0 79395 my $smiley = "\x{263a}";
212 4         6 my $not_a_smiley = $smiley;
213 4         33 _utf8_off($not_a_smiley);
214 4         7 my $frowny = $not_a_smiley;
215 4         11 utf8::upgrade($frowny);
216 4         12 return ( $smiley, $not_a_smiley, $frowny );
217             }
218              
219             # Verify an Analyzer's transform, transform_text, and split methods.
220             sub test_analyzer {
221 11     11 0 2702 my ( $analyzer, $source, $expected, $message ) = @_;
222              
223 11         128 my $inversion = Lucy::Analysis::Inversion->new( text => $source );
224 11         214 $inversion = $analyzer->transform($inversion);
225 11         37 my @got;
226 11         141 while ( my $token = $inversion->next ) {
227 18         94 push @got, $token->get_text;
228             }
229 11         67 Test::More::is_deeply( \@got, $expected, "analyze: $message" );
230              
231 11         5554 $inversion = $analyzer->transform_text($source);
232 11         124 @got = ();
233 11         76 while ( my $token = $inversion->next ) {
234 18         85 push @got, $token->get_text;
235             }
236 11         43 Test::More::is_deeply( \@got, $expected, "transform_text: $message" );
237              
238 11         3989 @got = @{ $analyzer->split($source) };
  11         244  
239 11         61 Test::More::is_deeply( \@got, $expected, "split: $message" );
240             }
241              
242             # Extract all doc nums from a SortCollector. Return two sorted array refs:
243             # by_score and by_id.
244             sub doc_ids_from_td_coll {
245 1484     1484 0 255287 my $collector = shift;
246 1484         1398 my @by_score;
247 1484         15296 my $match_docs = $collector->pop_match_docs;
248 24842         30596 my @by_score_then_id = map { $_->get_doc_id }
249             sort {
250 1484 50       3897 $b->get_score <=> $a->get_score
  27290         87595  
251             || $a->get_doc_id <=> $b->get_doc_id
252             } @$match_docs;
253 1484         2488 my @by_id = sort { $a <=> $b } @by_score_then_id;
  50985         31460  
254 1484         14993 return ( \@by_score_then_id, \@by_id );
255             }
256              
257             # Use a modulus to generate a set of numbers.
258             sub modulo_set {
259 3973     3973 0 1641624 my ( $interval, $max ) = @_;
260 3973         2963 my @out;
261 3973         6485 for ( my $doc = $interval; $doc < $max; $doc += $interval ) {
262 53362         68676 push @out, $doc;
263             }
264 3973         6900 return \@out;
265             }
266              
267             1;
268              
269             __END__