|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
  
 
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Licensed to the Apache Software Foundation (ASF) under one or more  | 
| 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # contributor license agreements.  See the NOTICE file distributed with  | 
| 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # this work for additional information regarding copyright ownership.  | 
| 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # The ASF licenses this file to You under the Apache License, Version 2.0  | 
| 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # (the "License"); you may not use this file except in compliance with  | 
| 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # the License.  You may obtain a copy of the License at  | 
| 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #     http://www.apache.org/licenses/LICENSE-2.0  | 
| 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Unless required by applicable law or agreed to in writing, software  | 
| 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # distributed under the License is distributed on an "AS IS" BASIS,  | 
| 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # See the License for the specific language governing permissions and  | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # limitations under the License.  | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
16
 | 
47
 | 
 
 | 
 
 | 
  
47
  
 | 
 
 | 
503782
 | 
 use strict;  | 
| 
 
 | 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
68
 | 
    | 
| 
 
 | 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1167
 | 
    | 
| 
17
 | 
47
 | 
 
 | 
 
 | 
  
47
  
 | 
 
 | 
158
 | 
 use warnings;  | 
| 
 
 | 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
48
 | 
    | 
| 
 
 | 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2547
 | 
    | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package Lucy::Test::TestUtils;  | 
| 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
21
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $VERSION = '0.006000_002';  | 
| 
22
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 $VERSION = eval $VERSION;  | 
| 
23
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
24
 | 
47
 | 
 
 | 
 
 | 
  
47
  
 | 
 
 | 
160
 | 
 use Exporter 'import';  | 
| 
 
 | 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
60
 | 
    | 
| 
 
 | 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2235
 | 
    | 
| 
25
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our @EXPORT_OK = qw(  | 
| 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     working_dir  | 
| 
27
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     create_working_dir  | 
| 
28
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     remove_working_dir  | 
| 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     uscon_dir  | 
| 
30
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     create_index  | 
| 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     create_uscon_index  | 
| 
32
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     test_index_loc  | 
| 
33
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     persistent_test_index_loc  | 
| 
34
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     init_test_index_loc  | 
| 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     get_uscon_docs  | 
| 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     utf8_test_strings  | 
| 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     test_analyzer  | 
| 
38
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     doc_ids_from_td_coll  | 
| 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     modulo_set  | 
| 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 );  | 
| 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
42
 | 
47
 | 
 
 | 
 
 | 
  
47
  
 | 
 
 | 
9671
 | 
 use Lucy;  | 
| 
 
 | 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
72
 | 
    | 
| 
 
 | 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1556
 | 
    | 
| 
43
 | 
47
 | 
 
 | 
 
 | 
  
47
  
 | 
 
 | 
12595
 | 
 use Lucy::Test;  | 
| 
 
 | 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
76
 | 
    | 
| 
 
 | 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1208
 | 
    | 
| 
44
 | 
47
 | 
 
 | 
 
 | 
  
47
  
 | 
 
 | 
16392
 | 
 use File::Spec::Functions qw( catdir catfile curdir updir );  | 
| 
 
 | 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
22915
 | 
    | 
| 
 
 | 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2817
 | 
    | 
| 
45
 | 
47
 | 
 
 | 
 
 | 
  
47
  
 | 
 
 | 
21656
 | 
 use Encode qw( _utf8_off );  | 
| 
 
 | 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
333284
 | 
    | 
| 
 
 | 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2707
 | 
    | 
| 
46
 | 
47
 | 
 
 | 
 
 | 
  
47
  
 | 
 
 | 
234
 | 
 use File::Path qw( rmtree );  | 
| 
 
 | 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
52
 | 
    | 
| 
 
 | 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2244
 | 
    | 
| 
47
 | 
47
 | 
 
 | 
 
 | 
  
47
  
 | 
 
 | 
192
 | 
 use Carp;  | 
| 
 
 | 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
49
 | 
    | 
| 
 
 | 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
52446
 | 
    | 
| 
48
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 my $working_dir = catfile( curdir(), 'lucy_test' );  | 
| 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Return a directory within the system's temp directory where we will put all  | 
| 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # testing scratch files.  | 
| 
53
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
  
0
  
 | 
45
 | 
 sub working_dir {$working_dir}  | 
| 
54
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
55
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub create_working_dir {  | 
| 
56
 | 
1
 | 
  
 50
  
 | 
 
 | 
  
1
  
 | 
  
0
  
 | 
93
 | 
     mkdir( $working_dir, 0700 ) or die "Can't mkdir '$working_dir': $!";  | 
| 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
59
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Verify that this user owns the working dir, then zap it.  Returns true upon  | 
| 
60
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # success.  | 
| 
61
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub remove_working_dir {  | 
| 
62
 | 
2
 | 
  
100
  
 | 
 
 | 
  
2
  
 | 
  
0
  
 | 
106
 | 
     return unless -d $working_dir;  | 
| 
63
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2002
 | 
     rmtree $working_dir;  | 
| 
64
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
     return 1;  | 
| 
65
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
66
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
67
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Return a location for a test index to be used by a single test file.  If  | 
| 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # the test file crashes it cannot clean up after itself, so we put the cleanup  | 
| 
69
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # routine in a single test file to be run at or near the end of the test  | 
| 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # suite.  | 
| 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub test_index_loc {  | 
| 
72
 | 
9
 | 
 
 | 
 
 | 
  
9
  
 | 
  
0
  
 | 
60
 | 
     return catdir( $working_dir, 'test_index' );  | 
| 
73
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
74
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
75
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Return a location for a test index intended to be shared by multiple test  | 
| 
76
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # files.  It will be cleaned as above.  | 
| 
77
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub persistent_test_index_loc {  | 
| 
78
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
  
0
  
 | 
720
 | 
     return catdir( $working_dir, 'persistent_test_index' );  | 
| 
79
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
80
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
81
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Destroy anything left over in the test_index location, then create the  | 
| 
82
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # directory.  Finally, return the path.  | 
| 
83
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub init_test_index_loc {  | 
| 
84
 | 
9
 | 
 
 | 
 
 | 
  
9
  
 | 
  
0
  
 | 
42461
 | 
     my $dir = test_index_loc();  | 
| 
85
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6635
 | 
     rmtree $dir;  | 
| 
86
 | 
9
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
127
 | 
     die "Can't clean up '$dir'" if -e $dir;  | 
| 
87
 | 
9
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
452
 | 
     mkdir $dir or die "Can't mkdir '$dir': $!";  | 
| 
88
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
31
 | 
     return $dir;  | 
| 
89
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
90
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
91
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Build a RAM index, using the supplied array of strings as source material.  | 
| 
92
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # The index will have a single field: "content".  | 
| 
93
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub create_index {  | 
| 
94
 | 
26
 | 
 
 | 
 
 | 
  
26
  
 | 
  
0
  
 | 
119630
 | 
     my $folder  = Lucy::Store::RAMFolder->new;  | 
| 
95
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1095
 | 
     my $indexer = Lucy::Index::Indexer->new(  | 
| 
96
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         index  => $folder,  | 
| 
97
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         schema => Lucy::Test::TestSchema->new,  | 
| 
98
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     );  | 
| 
99
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
22099
 | 
     $indexer->add_doc( { content => $_ } ) for @_;  | 
| 
100
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
133053
 | 
     $indexer->commit;  | 
| 
101
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1338
 | 
     return $folder;  | 
| 
102
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
103
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub uscon_dir {  | 
| 
105
 | 
4
 | 
 
 | 
 
 | 
  
4
  
 | 
  
0
  
 | 
430
 | 
     my @dirs = (  | 
| 
106
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         catdir('sample', 'us_constitution'),  | 
| 
107
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         catdir(updir(), 'common', 'sample', 'us_constitution'),  | 
| 
108
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     );  | 
| 
109
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
110
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
14
 | 
     for my $dir (@dirs) {  | 
| 
111
 | 
4
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
72
 | 
         return $dir if -d $dir;  | 
| 
112
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
113
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
114
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     die("uscon source dir not found");  | 
| 
115
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
116
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
117
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Slurp us constitition docs and build hashrefs.  | 
| 
118
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub get_uscon_docs {  | 
| 
119
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
120
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
  
0
  
 | 
11
 | 
     my $uscon_dir = uscon_dir();  | 
| 
121
 | 
2
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
99
 | 
     opendir( my $uscon_dh, $uscon_dir )  | 
| 
122
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         or die "couldn't opendir '$uscon_dir': $!";  | 
| 
123
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
196
 | 
     my @filenames = grep {/\.txt$/} sort readdir $uscon_dh;  | 
| 
 
 | 
112
 | 
 
 | 
 
 | 
 
 | 
 
 | 
134
 | 
    | 
| 
124
 | 
2
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
46
 | 
     closedir $uscon_dh or die "couldn't closedir '$uscon_dir': $!";  | 
| 
125
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
126
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
     my %docs;  | 
| 
127
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
128
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
     for my $filename (@filenames) {  | 
| 
129
 | 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
247
 | 
         my $filepath = catfile( $uscon_dir, $filename );  | 
| 
130
 | 
104
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
1798
 | 
         open( my $fh, '<', $filepath )  | 
| 
131
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             or die "couldn't open file '$filepath': $!";  | 
| 
132
 | 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
77
 | 
         my $content = do { local $/; <$fh> };  | 
| 
 
 | 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
216
 | 
    | 
| 
 
 | 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
864
 | 
    | 
| 
133
 | 
104
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
368
 | 
         $content =~ /\A(.+?)^\s+(.*)/ms  | 
| 
134
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             or die "Can't extract title/bodytext from '$filepath'";  | 
| 
135
 | 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
137
 | 
         my $title    = $1;  | 
| 
136
 | 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
136
 | 
         my $bodytext = $2;  | 
| 
137
 | 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4382
 | 
         $bodytext =~ s/\s+/ /sg;  | 
| 
138
 | 
104
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
234
 | 
         my $category  | 
| 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
139
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             = $filename =~ /art/      ? 'article'  | 
| 
140
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             : $filename =~ /amend/    ? 'amendment'  | 
| 
141
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             : $filename =~ /preamble/ ? 'preamble'  | 
| 
142
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             :   confess "Can't derive category for $filename";  | 
| 
143
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
144
 | 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
796
 | 
         $docs{$filename} = {  | 
| 
145
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             title    => $title,  | 
| 
146
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             bodytext => $bodytext,  | 
| 
147
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             url      => "/us_constitution/$filename",  | 
| 
148
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             category => $category,  | 
| 
149
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         };  | 
| 
150
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
151
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
152
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
14
 | 
     return \%docs;  | 
| 
153
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
154
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
155
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _uscon_schema {  | 
| 
156
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
25
 | 
     my $schema     = Lucy::Plan::Schema->new;  | 
| 
157
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
30
 | 
     my $analyzer   = Lucy::Analysis::EasyAnalyzer->new( language => 'en' );  | 
| 
158
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
19
 | 
     my $title_type = Lucy::Plan::FullTextType->new( analyzer => $analyzer, );  | 
| 
159
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
     my $content_type = Lucy::Plan::FullTextType->new(  | 
| 
160
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         analyzer      => $analyzer,  | 
| 
161
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         highlightable => 1,  | 
| 
162
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     );  | 
| 
163
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
     my $url_type = Lucy::Plan::StringType->new( indexed => 0, );  | 
| 
164
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
     my $cat_type = Lucy::Plan::StringType->new;  | 
| 
165
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
     $schema->spec_field( name => 'title',    type => $title_type );  | 
| 
166
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
28
 | 
     $schema->spec_field( name => 'content',  type => $content_type );  | 
| 
167
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
     $schema->spec_field( name => 'url',      type => $url_type );  | 
| 
168
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
     $schema->spec_field( name => 'category', type => $cat_type );  | 
| 
169
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
     return $schema;  | 
| 
170
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
171
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
172
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub create_uscon_index {  | 
| 
173
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
  
0
  
 | 
2
 | 
     my $folder  | 
| 
174
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         = Lucy::Store::FSFolder->new( path => persistent_test_index_loc() );  | 
| 
175
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
     my $indexer = Lucy::Index::Indexer->new(  | 
| 
176
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         schema   => _uscon_schema(),  | 
| 
177
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         index    => $folder,  | 
| 
178
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         truncate => 1,  | 
| 
179
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         create   => 1,  | 
| 
180
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     );  | 
| 
181
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
182
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
74750
 | 
     $indexer->add_doc( { content => "zz$_" } ) for ( 0 .. 10000 );  | 
| 
183
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
73126
 | 
     $indexer->commit;  | 
| 
184
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
89
 | 
     undef $indexer;  | 
| 
185
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
186
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
     $indexer = Lucy::Index::Indexer->new( index => $folder );  | 
| 
187
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
     my $source_docs = get_uscon_docs();  | 
| 
188
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $indexer->add_doc( { content => $_->{bodytext} } )  | 
| 
189
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12361
 | 
         for values %$source_docs;  | 
| 
190
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8771
 | 
     $indexer->commit;  | 
| 
191
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
45
 | 
     undef $indexer;  | 
| 
192
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
193
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
     $indexer = Lucy::Index::Indexer->new( index => $folder );  | 
| 
194
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
     my @chars = ( 'a' .. 'z' );  | 
| 
195
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
     for ( 0 .. 1000 ) {  | 
| 
196
 | 
1001
 | 
 
 | 
 
 | 
 
 | 
 
 | 
970
 | 
         my $content = '';  | 
| 
197
 | 
1001
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1543
 | 
         for my $num_words ( 1 .. int( rand(20) ) ) {  | 
| 
198
 | 
9422
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8723
 | 
             for ( 1 .. ( int( rand(10) ) + 10 ) ) {  | 
| 
199
 | 
136869
 | 
 
 | 
 
 | 
 
 | 
 
 | 
106869
 | 
                 $content .= @chars[ rand(@chars) ];  | 
| 
200
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
201
 | 
9422
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6123
 | 
             $content .= ' ';  | 
| 
202
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
203
 | 
1001
 | 
 
 | 
 
 | 
 
 | 
 
 | 
37080
 | 
         $indexer->add_doc( { content => $content } );  | 
| 
204
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
205
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
     $indexer->optimize;  | 
| 
206
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
220907
 | 
     $indexer->commit;  | 
| 
207
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
208
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
209
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Return 3 strings useful for verifying UTF-8 integrity.  | 
| 
210
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub utf8_test_strings {  | 
| 
211
 | 
4
 | 
 
 | 
 
 | 
  
4
  
 | 
  
0
  
 | 
79395
 | 
     my $smiley       = "\x{263a}";  | 
| 
212
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
     my $not_a_smiley = $smiley;  | 
| 
213
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
33
 | 
     _utf8_off($not_a_smiley);  | 
| 
214
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
     my $frowny = $not_a_smiley;  | 
| 
215
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
     utf8::upgrade($frowny);  | 
| 
216
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
     return ( $smiley, $not_a_smiley, $frowny );  | 
| 
217
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
218
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
219
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Verify an Analyzer's transform, transform_text, and split methods.  | 
| 
220
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub test_analyzer {  | 
| 
221
 | 
11
 | 
 
 | 
 
 | 
  
11
  
 | 
  
0
  
 | 
2702
 | 
     my ( $analyzer, $source, $expected, $message ) = @_;  | 
| 
222
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
223
 | 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
128
 | 
     my $inversion = Lucy::Analysis::Inversion->new( text => $source );  | 
| 
224
 | 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
214
 | 
     $inversion = $analyzer->transform($inversion);  | 
| 
225
 | 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
37
 | 
     my @got;  | 
| 
226
 | 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
141
 | 
     while ( my $token = $inversion->next ) {  | 
| 
227
 | 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
94
 | 
         push @got, $token->get_text;  | 
| 
228
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
229
 | 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
67
 | 
     Test::More::is_deeply( \@got, $expected, "analyze: $message" );  | 
| 
230
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
231
 | 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5554
 | 
     $inversion = $analyzer->transform_text($source);  | 
| 
232
 | 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
124
 | 
     @got       = ();  | 
| 
233
 | 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
76
 | 
     while ( my $token = $inversion->next ) {  | 
| 
234
 | 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
85
 | 
         push @got, $token->get_text;  | 
| 
235
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
236
 | 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
43
 | 
     Test::More::is_deeply( \@got, $expected, "transform_text: $message" );  | 
| 
237
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
238
 | 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3989
 | 
     @got = @{ $analyzer->split($source) };  | 
| 
 
 | 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
244
 | 
    | 
| 
239
 | 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
61
 | 
     Test::More::is_deeply( \@got, $expected, "split: $message" );  | 
| 
240
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
241
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
242
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Extract all doc nums from a SortCollector.  Return two sorted array refs:  | 
| 
243
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # by_score and by_id.  | 
| 
244
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub doc_ids_from_td_coll {  | 
| 
245
 | 
1484
 | 
 
 | 
 
 | 
  
1484
  
 | 
  
0
  
 | 
255287
 | 
     my $collector = shift;  | 
| 
246
 | 
1484
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1398
 | 
     my @by_score;  | 
| 
247
 | 
1484
 | 
 
 | 
 
 | 
 
 | 
 
 | 
15296
 | 
     my $match_docs = $collector->pop_match_docs;  | 
| 
248
 | 
24842
 | 
 
 | 
 
 | 
 
 | 
 
 | 
30596
 | 
     my @by_score_then_id = map { $_->get_doc_id }  | 
| 
249
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         sort {  | 
| 
250
 | 
1484
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
3897
 | 
                $b->get_score <=> $a->get_score  | 
| 
 
 | 
27290
 | 
 
 | 
 
 | 
 
 | 
 
 | 
87595
 | 
    | 
| 
251
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             || $a->get_doc_id <=> $b->get_doc_id  | 
| 
252
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } @$match_docs;  | 
| 
253
 | 
1484
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2488
 | 
     my @by_id = sort { $a <=> $b } @by_score_then_id;  | 
| 
 
 | 
50985
 | 
 
 | 
 
 | 
 
 | 
 
 | 
31460
 | 
    | 
| 
254
 | 
1484
 | 
 
 | 
 
 | 
 
 | 
 
 | 
14993
 | 
     return ( \@by_score_then_id, \@by_id );  | 
| 
255
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
256
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
257
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Use a modulus to generate a set of numbers.  | 
| 
258
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub modulo_set {  | 
| 
259
 | 
3973
 | 
 
 | 
 
 | 
  
3973
  
 | 
  
0
  
 | 
1641624
 | 
     my ( $interval, $max ) = @_;  | 
| 
260
 | 
3973
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2963
 | 
     my @out;  | 
| 
261
 | 
3973
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6485
 | 
     for ( my $doc = $interval; $doc < $max; $doc += $interval ) {  | 
| 
262
 | 
53362
 | 
 
 | 
 
 | 
 
 | 
 
 | 
68676
 | 
         push @out, $doc;  | 
| 
263
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
264
 | 
3973
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6900
 | 
     return \@out;  | 
| 
265
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
266
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
267
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  | 
| 
268
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
269
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 __END__  |