File Coverage

blib/lib/FAQ/OMatic/Words.pm
Criterion Covered Total %
statement 7 71 9.8
branch 1 12 8.3
condition n/a
subroutine 3 11 27.2
pod 0 8 0.0
total 11 102 10.7


line stmt bran cond sub pod time code
1             ##############################################################################
2             # The Faq-O-Matic is Copyright 1997 by Jon Howell, all rights reserved. #
3             # #
4             # This program is free software; you can redistribute it and/or #
5             # modify it under the terms of the GNU General Public License #
6             # as published by the Free Software Foundation; either version 2 #
7             # of the License, or (at your option) any later version. #
8             # #
9             # This program is distributed in the hope that it will be useful, #
10             # but WITHOUT ANY WARRANTY; without even the implied warranty of #
11             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
12             # GNU General Public License for more details. #
13             # #
14             # You should have received a copy of the GNU General Public License #
15             # along with this program; if not, write to the Free Software #
16             # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.#
17             # #
18             # Jon Howell can be contacted at: #
19             # 6211 Sudikoff Lab, Dartmouth College #
20             # Hanover, NH 03755-3510 #
21             # jonh@cs.dartmouth.edu #
22             # #
23             # An electronic copy of the GPL is available at: #
24             # http://www.gnu.org/copyleft/gpl.html #
25             # #
26             ##############################################################################
27              
28 1     1   5 use strict;
  1         1  
  1         30  
29 1     1   733 use locale;
  1         199  
  1         4  
30              
31             ### Words.pm
32             ###
33             ### Support for extracting "words" from strings
34             ###
35             ### To change these routines to support other character sets,
36             ### copy this file to a location outside of the FAQ::OMatic tree and
37             ### add the following lines to the start of your cgi-bin/fom file:
38             ### use lib '/Whatever/your/directory/path/is';
39             ### require Words;
40             ### #existing use lib line
41             ### use FAQ::OMatic::Words
42             ### This will override the definitions in this file.
43              
44              
45             package FAQ::OMatic::Words;
46              
47             BEGIN {
48             # This code use Japanese environment only.
49             # see http://chasen.aist-nara.ac.jp/index.html.en
50             #
51 1 50   1   75 if (FAQ::OMatic::I18N::language() eq 'ja_JP.EUC') {
52 0           require Text::ChaSen; import Text::ChaSen;
  0            
53 0           &Text::ChaSen::getopt_argv('faq-omatic', '-j', '-F', '%m ');
54             }
55             }
56              
57             sub cannonical {
58 0     0 0   my $string = shift;
59              
60             # convert the input string into cannonical form.
61             #
62             # The default is to strip parenthesis and apostrophies, and
63             # convert to ASCII lower case.
64             #
65             # If you use another character set (e.g. ISO-8859-?), you'll want
66             # to override to do correct lower case handling.
67             #
68             # This routine is called both when the indicies are created and
69             # when the search pattern is formed, so things will be done
70             # consistantly.
71              
72             # convert
73             # timer(s) to timers
74             # timer's to timers
75             # e-mail to email
76 0           $string =~ s/[()'-]//g;
77 0           $string = lc($string); # convert to lower case
78              
79 0 0         if (FAQ::OMatic::I18N::language() eq 'hu') {
80             # Accentuated lc(),
81 0           $string =~ tr/\301\311\315\323\326\325\332\334\333/\341\351\355\363\366\365\372\374\373/;
82             }
83              
84 0           $string;
85             }
86              
87             sub getWords {
88 0     0 0   my $string = shift;
89 0           my $encode_lang = FAQ::OMatic::I18N::language();
90             #EUC-JP case
91 0 0         return getWordsEUCJP($string) if($encode_lang eq "ja_JP.EUC");
92             # Hungarian case
93 0 0         return getWordshu($string) if($encode_lang eq 'hu');
94             #normal case
95 0           return getWordsSB($string);
96             }
97              
98             sub getWordsSB {
99 0     0 0   my $string = shift;
100              
101             # given a user-input string, we break it into "legal" words
102             # and return an array of them
103              
104 0           $string = cannonical( $string );
105              
106 0           my $wordPattern = '[\w-]'; # alphanumeric + '_' + '-'
107              
108             #my @words = ($string =~ m/($wordPattern+)/gso);
109             # /gso seems to break in some circumstances. :v(
110 0           my @wordspl = split(/($wordPattern+)/, $string);
111 0           my @words=();
112 0           my $i;
113 0           for ($i=1; $i<@wordspl; $i+=2) {
114 0           push (@words, $wordspl[$i]);
115             }
116 0           return @words;
117              
118             }
119              
120             sub getWordsEUCJP {
121 0     0 0   require Text::ChaSen; import Text::ChaSen;
  0            
122 0           require NKF; import NKF;
  0            
123              
124 0           my $string = shift;
125              
126             # given a user-input string, we break it into "legal" words
127             # and return an array of them
128              
129 0           $string = nkf('-e', $string);
130 0           $string = cannonical( $string );
131              
132 0           my $wordPattern = '[\w-]'; # alphanumeric + '_' + '-'
133              
134 0           my $s = &Text::ChaSen::sparse_tostr($string);
135 0           chomp $s;
136 0           my @words = split / /, $s;
137 0           return @words;
138              
139             }
140              
141             sub getWordshu {
142 0     0 0   my $string = shift;
143              
144             # given a user-input string, we break it into "legal" words
145             # and return an array of them
146              
147 0           $string = cannonical( $string );
148              
149             # pattern for hungarian language:
150 0           my $wordPattern = '[\w\341\351\355\363\366\365\372\374\373-]';
151              
152             #my @words = ($string =~ m/($wordPattern+)/gso);
153             # /gso seems to break in some circumstances. :v(
154 0           my @wordspl = split(/($wordPattern+)/, $string);
155 0           my @words=();
156 0           my $i;
157 0           for ($i=1; $i<@wordspl; $i+=2) {
158 0           push (@words, $wordspl[$i]);
159             }
160 0           return @words;
161             }
162              
163             sub getPrefixes {
164 0     0 0   my $word = shift;
165 0           my $encode_lang = FAQ::OMatic::I18N::language();
166             #EUC-JP case
167 0 0         return getPrefixesEUCJP($word) if($encode_lang eq "ja_JP.EUC");
168             #normal case
169 0           return getPrefixesSB($word);
170             }
171              
172             sub getPrefixesSB {
173 0     0 0   my $word = shift;
174              
175             # given a word, return an array of prefixes which should be
176             # indexed.
177             #
178             # default routine returns all substrings
179 0           my @prefix=();
180 0           my $i = length( $word );
181 0           while( $i ) {
182 0           push @prefix, substr( $word, 0, $i-- );
183             }
184              
185 0           @prefix;
186             }
187              
188             ## Japanese EUC-JP multibyte extended getPrefixes by oota ##
189             sub getPrefixesEUCJP {
190 0     0 0   my $word = shift;
191              
192             # given a word, return an array of prefixes which should be
193             # indexed.
194             #
195             # default routine returns all substrings
196 0           my @prefix=();
197 0           my $i = 1;
198 0           while( $i <= length( $word )) {
199 0 0         if(ord(substr($word,$i-1,1)) >= 128) {
200 0           push @prefix, substr( $word, 0, $i+1 );
201 0           $i += 2;
202             } else {
203 0           push @prefix, substr( $word, 0, $i );
204 0           $i += 1;
205             }
206             }
207              
208 0           reverse @prefix;
209             }
210              
211             'true';
212