blib/lib/Search/Circa/Parser.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 48 | 197 | 24.3 |
branch | 2 | 102 | 1.9 |
condition | 5 | 87 | 5.7 |
subroutine | 10 | 15 | 66.6 |
pod | 7 | 7 | 100.0 |
total | 72 | 408 | 17.6 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package Search::Circa::Parser; | ||||||
2 | |||||||
3 | # module Circa::Parser : See Circa::Indexer | ||||||
4 | # Copyright 2000 A.Barbet alian@alianwebserver.com. All rights reserved. | ||||||
5 | |||||||
6 | # $Log: Parser.pm,v $ | ||||||
7 | # Revision 1.27 2003/01/02 00:32:40 alian | ||||||
8 | # Add url found with meta http-equiv refresh | ||||||
9 | # | ||||||
10 | # Revision 1.26 2002/12/31 09:58:52 alian | ||||||
11 | # Use hash in place of list in look_at | ||||||
12 | # Call analyse in each text call in place of global var TEXT | ||||||
13 | # Update POD doc | ||||||
14 | # | ||||||
15 | # Revision 1.25 2002/12/29 14:35:10 alian | ||||||
16 | # Some minor fixe suite to last update | ||||||
17 | # | ||||||
18 | # Revision 1.24 2002/12/29 03:18:37 alian | ||||||
19 | # Update POD documentation | ||||||
20 | # | ||||||
21 | # Revision 1.23 2002/12/29 00:36:30 alian | ||||||
22 | # Add undef %insite => dangerous global var ... | ||||||
23 | # | ||||||
24 | # Revision 1.22 2002/12/28 22:23:59 alian | ||||||
25 | # Some optimization after bench | ||||||
26 | # | ||||||
27 | # Revision 1.21 2002/12/28 12:36:02 alian | ||||||
28 | # Ajout phase pour ne pas analyser les mots d'un sommaire | ||||||
29 | # | ||||||
30 | # Revision 1.20 2002/12/27 12:55:43 alian | ||||||
31 | # Use ref in analyse, update stopwords | ||||||
32 | |||||||
33 | 8 | 8 | 14256 | use strict; | |||
8 | 21 | ||||||
8 | 318 | ||||||
34 | 8 | 8 | 24084 | use URI::URL; | |||
8 | 180790 | ||||||
8 | 625 | ||||||
35 | 8 | 8 | 102 | use URI::WithBase; | |||
8 | 19 | ||||||
8 | 175 | ||||||
36 | 8 | 8 | 2586 | use DBI; | |||
8 | 136433 | ||||||
8 | 338 | ||||||
37 | 8 | 8 | 10526 | use LWP::RobotUA; | |||
8 | 1499655 | ||||||
8 | 388 | ||||||
38 | 8 | 8 | 94 | use Carp qw/cluck/; | |||
8 | 18 | ||||||
8 | 719 | ||||||
39 | 8 | 6150 | use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION | ||||
40 | 8 | 8 | 48 | %links %inside $RM $DESCRIPTION $KEYWORDS $facteur_full_text); | |||
8 | 19 | ||||||
41 | |||||||
42 | require Exporter; | ||||||
43 | |||||||
44 | @ISA = qw(Exporter); | ||||||
45 | @EXPORT = qw(); | ||||||
46 | $VERSION = ('$Revision: 1.27 $ ' =~ /(\d+\.\d+)/)[0]; | ||||||
47 | |||||||
48 | # stopwords | ||||||
49 | my %bad = map {$_ => 1} qw ( | ||||||
50 | able about above according across actually after afterwards again against ago | ||||||
51 | all almost already also althought altogether always among amongst and another | ||||||
52 | any anyhow anyone anything anyway anywhere apart are aren around aside away | ||||||
53 | back because been before beforehand behind being below beneath beside besides | ||||||
54 | between beyond but came can cannot come could couldn currently did didn | ||||||
55 | directly does doing don done down downward during each easily else elsewhere | ||||||
56 | enough especially even ever every everybody everyone everything everywhere | ||||||
57 | exactly except far farther few fewer find five for formerly forth found four | ||||||
58 | frequently from full fully further generally get gets give given going gone | ||||||
59 | gonna got gotten had hardly has have having height hence her here hereafter | ||||||
60 | hereby herein hereupon hers herself him himself his hope how however | ||||||
61 | immediatly including indeed inside instead into inward isn its itself just | ||||||
62 | know largely last lately later latest least leave less lesser let lets like | ||||||
63 | liked likes likewise little lot lower made mainly make making many may maybe | ||||||
64 | means meantime meanwhile might mine more moreover most mostly mrs much must | ||||||
65 | myself namely near necessarily neither never nevertheless nine nobody none | ||||||
66 | nonetheless nor not nothing now nowhere often once one only onto other others | ||||||
67 | otherwise ought our ours ourself ourselves out outside over overall own per | ||||||
68 | perform perharps please previous previously prior probably provide providing | ||||||
69 | quickly quite rather read ready really recently require roughly said same say | ||||||
70 | see sent seven several shall shan she should shouldn simply since six slightly | ||||||
71 | some somebody somehow someone something sometime sometimes somewhat somewhere | ||||||
72 | soon still strictly such take ten than thanks that the their theirs them | ||||||
73 | themselves then thence there thereafter thereby therefore therein thereupon | ||||||
74 | these they think this those though three through thru thus thusly timely | ||||||
75 | together too took top toward towards truly two unable under unless unlike | ||||||
76 | unlikely until upon upward upwards use used using usually various very wanna | ||||||
77 | want was wasn well went were weren what whatever when whence whenever where | ||||||
78 | whereabouts whereafter whereas whereby wherefor wherein whereis whereupon | ||||||
79 | wherever whether which whichever while whither who whoever whole whom | ||||||
80 | whomever whose why will with within without worth worthy would wouldn yes | ||||||
81 | yet you your yours yourself yourselves | ||||||
82 | |||||||
83 | afin ailleurs ainsi ais ait alors aucun aucune aucunes aucuns auparavant auquel | ||||||
84 | assez aussi autour autre autres aux auxquelles avait avant avec avoir beaucoup | ||||||
85 | bien car ceci cela celle celui cependant certain certaine certaines certains | ||||||
86 | ces cet cette ceux chacun chacune chacunes chaque chez cinq combien comme | ||||||
87 | comment contre dans dedans depuis des desquelles desquels deux dire dit dix | ||||||
88 | doit donc dont duquel elle elles encore enfin entre environ est etc eux faire | ||||||
89 | fait faut fit fut huit ici ils jamais laquelle lequel lequels les lesquelles | ||||||
90 | lesquels leur leurs lors lorsque lui maintenant mais mes moi moins mon neuf | ||||||
91 | non nos notre nous ont oui par parce parfois pas peu peut plus plusieurs pour | ||||||
92 | pourquoi pourtant puis quand quant quatre que quel quelconque quelle quelles | ||||||
93 | quelque quelquefois quelques quels qui quoi quoique sans sept ses sinon six | ||||||
94 | soit son sont soudain sous suis sur tandis tant tel telle tels tes toi ton | ||||||
95 | toujours tous tout toute toutes toutefois toutes trois une vers veut voici | ||||||
96 | voir vos votre vous \$ { & com/ + www html htm file/); | ||||||
97 | |||||||
98 | #------------------------------------------------------------------------------ | ||||||
99 | # new | ||||||
100 | #------------------------------------------------------------------------------ | ||||||
101 | sub new { | ||||||
102 | 1 | 1 | 1 | 10 | my $class = shift; | ||
103 | 1 | 2 | my $self = {}; | ||||
104 | 1 | 2 | my $indexer = shift; | ||||
105 | 1 | 10 | $indexer->trace(5, "Search::Circa::Parser::new\n"); | ||||
106 | 1 | 3 | bless $self, $class; | ||||
107 | 1 | 6 | $self->{DBH} = $indexer->{DBH}; | ||||
108 | 1 | 2 | $self->{ConfigMoteur} = $indexer->{ConfigMoteur}; | ||||
109 | 1 | 3 | while (my ($n,$v)=each(%{$indexer->{ConfigMoteur}})) | ||||
11 | 34 | ||||||
110 | 10 | 30 | { $indexer->trace(4, "\t$n => $v"); } | ||||
111 | 1 | 3 | $self->{INDEXER} = $indexer; | ||||
112 | 1 | 3 | $facteur_full_text = $self->{ConfigMoteur}->{'facteur_full_text'}; | ||||
113 | # Ce module n'est presque jamais installé ! | ||||||
114 | # Evidemment cela demande une charge machine et un .so | ||||||
115 | # compilé pour cet environnement. Ca fait peur aux admin | ||||||
116 | # ISP ! On encapsule donc l'appel, si on echoue, on previent que | ||||||
117 | # tout appel au parser se soldera par une utilisation d'un parseur basic | ||||||
118 | # sans handicaper le reste de l'application | ||||||
119 | # Il vous reste plus qu'a faire alors une install mysql/circa en local | ||||||
120 | # pour faire l'indexation, et exporter les resultats sur le serveur final. | ||||||
121 | 1 | 3 | $self->{_parser_ok}=1; | ||||
122 | 1 | 2 | eval { require HTML::Parser }; | ||||
1 | 11 | ||||||
123 | 1 | 50 | 33 | 10 | if ($@ || $HTML::Parser::VERSION < 3.0) { | ||
124 | 0 | 0 | warn "Module HTML-Parser 3.0 ou superieur requis pour ". | ||||
125 | "utiliser les fonctionnalités optimales du parser.($@)\n"; | ||||||
126 | 0 | 0 | $self->{_parser_ok}=0; | ||||
127 | } | ||||||
128 | 8 | 8 | 9110 | else { use HTML::Entities; } | |||
8 | 87188 | ||||||
8 | 44736 | ||||||
129 | 1 | 5 | $self->{INDEXER}->trace(1,"Parser::new"); | ||||
130 | 1 | 4 | return $self; | ||||
131 | } | ||||||
132 | |||||||
133 | #------------------------------------------------------------------------------ | ||||||
134 | # tag | ||||||
135 | #------------------------------------------------------------------------------ | ||||||
136 | sub tag { | ||||||
137 | 0 | 0 | 1 | 0 | my($tag, $num, $att) = @_; # parametre | ||
138 | # Liens exterieurs | ||||||
139 | 0 | 0 | 0 | 0 | if ((lc($tag) eq 'a') and ($$att{href})) {$links{$$att{href}}=1;} | ||
0 | 0 | 0 | 0 | ||||
0 | 0 | 0 | 0 | ||||
0 | 0 | ||||||
140 | # Frame | ||||||
141 | elsif ((lc($tag) eq 'frame') and ($$att{src})) {$links{$$att{src}}=1;} | ||||||
142 | # On est dans le cas d'un meta | ||||||
143 | 0 | 0 | elsif (lc($tag) eq 'meta' and defined(%$att)) { | ||||
144 | 0 | 0 | 0 | 0 | if ($$att{name} and lc($$att{name}) eq 'description') {# Description | ||
0 | 0 | ||||||
0 | 0 | ||||||
145 | 0 | 0 | $DESCRIPTION =$$att{content};} | ||||
146 | elsif ((lc($$att{'http-equiv'}) eq 'keywords') or | ||||||
147 | (lc($$att{name}) eq 'keywords')) {# Mots-clefs | ||||||
148 | 0 | 0 | $KEYWORDS=$$att{content} ;} | ||||
149 | elsif ((lc($$att{'http-equiv'}) eq 'refresh') and | ||||||
150 | ($$att{content}=~/\d*;URL=(.*)$/)) {#url refresh | ||||||
151 | 0 | 0 | $links{$1}=1; } | ||||
152 | } | ||||||
153 | # Area | ||||||
154 | elsif (($tag eq 'area') and ($$att{href})) {$links{$$att{href}}=1;} | ||||||
155 | 0 | 0 | $inside{$tag} += $num; # Profondeur de la balise | ||||
156 | } | ||||||
157 | |||||||
158 | #------------------------------------------------------------------------------ | ||||||
159 | # text | ||||||
160 | #------------------------------------------------------------------------------ | ||||||
161 | sub text { | ||||||
162 | 0 | 0 | 0 | 0 | 1 | 0 | return if $inside{script} || $inside{style}; |
163 | 0 | 0 | analyse($_[0], $facteur_full_text); | ||||
164 | } | ||||||
165 | |||||||
166 | #------------------------------------------------------------------------------ | ||||||
167 | # look_at | ||||||
168 | #------------------------------------------------------------------------------ | ||||||
169 | sub look_at { | ||||||
170 | 0 | 0 | 1 | 0 | my($this, $rh)=@_; | ||
171 | # $url,$idc,$idr,$lastModif,$url_local,$categorieAuto,$niveau,$categorie | ||||||
172 | 0 | 0 | undef %links; $RM={}; undef %inside; | ||||
0 | 0 | ||||||
0 | 0 | ||||||
173 | 0 | 0 | 0 | $rh->{niveau} = 0 if (!$rh->{niveau}); | |||
174 | 0 | 0 | 0 | $rh->{categorie} = 0 if (!$rh->{categorie}); | |||
175 | 0 | 0 | my $buf_debug = "\tUrl => $rh->{url}\n\tIdc => $rh->{idc}\n"; | ||||
176 | 0 | 0 | 0 | $buf_debug.= "\tLast update => $rh->{lastModif}" | |||
177 | unless (!defined($rh->{lastModif})); | ||||||
178 | 0 | 0 | 0 | $buf_debug.= "\tUrl local => $rh->{url_local}" | |||
179 | unless (!defined($rh->{url_local})); | ||||||
180 | 0 | 0 | $this->{INDEXER}->trace(3, "Parser::look_at\n$buf_debug"); | ||||
181 | 0 | 0 | my ($url_orig,$racineFile,$racineUrl,$lastUpdate); | ||||
182 | 0 | 0 | 0 | 0 | if ($rh->{url_local} or URI->new($rh->{url})->scheme eq 'file') { | ||
183 | 0 | 0 | $this->set_agent(1); | ||||
184 | } else { | ||||||
185 | 0 | 0 | $this->set_agent(0); | ||||
186 | } | ||||||
187 | 0 | 0 | 0 | if ($rh->{url_local}) { | |||
188 | 0 | 0 | $this->{ConfigMoteur}->{'temporate'}=0; | ||||
189 | 0 | 0 | 0 | if ($rh->{url_local}=~/.*\/$/) { | |||
190 | 0 | 0 | chop($rh->{url_local}); | ||||
191 | 0 | 0 | 0 | if (-e "$rh->{url_local}/index.html") { | |||
0 | |||||||
0 | |||||||
192 | 0 | 0 | $rh->{url_local}.="/index.html";} | ||||
193 | elsif (-e "$rh->{url_local}/index.htm") { | ||||||
194 | 0 | 0 | $rh->{url_local}.="/index.htm";} | ||||
195 | 0 | 0 | elsif (-e "$rh->{url_local}/default.htm") { | ||||
196 | 0 | 0 | $rh->{url_local}.="/default.htm";} | ||||
197 | else {return (-1,0,0);} | ||||||
198 | } | ||||||
199 | 0 | 0 | $url_orig=$rh->{url}; | ||||
200 | 0 | 0 | $rh->{url}=$rh->{url_local}; | ||||
201 | 0 | 0 | ($racineFile,$racineUrl) = | ||||
202 | $this->{INDEXER}->fetch_first("select path,url from ". | ||||||
203 | $this->{INDEXER}->pre_tbl."local_url ". | ||||||
204 | "where id=$rh->{idr}"); | ||||||
205 | } | ||||||
206 | 0 | 0 | my ($nb,$nbwg,$nburl)=(0,0,0); | ||||
207 | 0 | 0 | 0 | if ($rh->{url_local}) {$this->{INDEXER}->set_host_indexed($rh->{url_local});} | |||
0 | 0 | ||||||
0 | 0 | ||||||
208 | else {$this->{INDEXER}->set_host_indexed($rh->{url});} | ||||||
209 | |||||||
210 | # Creation d'une requete | ||||||
211 | # On passe la requete à l'agent et on attend le résultat | ||||||
212 | 0 | 0 | my $res = $this->{AGENT}->request(new HTTP::Request('GET' => $rh->{url})); | ||||
213 | 0 | 0 | $this->{INDEXER}->trace(2, "HTTP::Request return ".$res->status_line); | ||||
214 | 0 | 0 | 0 | if ($res->is_success) { | |||
215 | # Langue | ||||||
216 | 0 | 0 | 0 | my $language = $res->content_language || 'unkno'; | |||
217 | 0 | 0 | 0 | if ($rh->{lastModif}) { | |||
218 | 0 | 0 | $this->{INDEXER}->trace(2,"Update url ".$rh->{lastModif}.' '. | ||||
219 | $res->last_modified); | ||||||
220 | } | ||||||
221 | # Fichier non modifie depuis la derniere indexation | ||||||
222 | 0 | 0 | 0 | 0 | if (($rh->{lastModif}) && ($res->last_modified) && | ||
0 | |||||||
223 | ($rh->{lastModif} >= $res->last_modified)) { | ||||||
224 | 0 | 0 | $this->{INDEXER}->trace(1,"No update on $rh->{url}"); | ||||
225 | 0 | 0 | $this->{INDEXER}->URL->update | ||||
226 | ($rh->{idr},('id'=>$rh->{idc}, 'last_check'=>"NOW()")); | ||||||
227 | 0 | 0 | return (0,0,0); | ||||
228 | } | ||||||
229 | 0 | 0 | 0 | if ($res->last_modified) { | |||
0 | 0 | ||||||
230 | 0 | 0 | my @date = localtime($res->last_modified); | ||||
231 | 0 | 0 | $lastUpdate = ($date[5]+1900).'-'.($date[4]+1).'-'. | ||||
232 | $date[3].' '.$date[2].':'.$date[1].':'.$date[0]; | ||||||
233 | } | ||||||
234 | else {$lastUpdate='0000-00-00';} | ||||||
235 | 0 | 0 | my $x = 72-length($rh->{url}); | ||||
236 | 0 | 0 | 0 | if ( $this->{inindex}) { | |||
237 | 0 | 0 | 0 | print $this->{inindex},'/',$this->{toindex}," ", | |||
238 | $rh->{url},($ENV{SERVER_NAME} ? " \n" : (" "x$x)."\n"); |
||||||
239 | } | ||||||
240 | |||||||
241 | # Il serait judicieux de mettre ca dans le constructeur, | ||||||
242 | # mais cela entraine 10 Mo de Ram supplementaire à | ||||||
243 | # l'utilisation. A voir avec les evolution du module | ||||||
244 | # HTML::Parser | ||||||
245 | 0 | 0 | 0 | if ($this->{_parser_ok}) { | |||
246 | 0 | 0 | $this->{INDEXER}->trace(3,"Use HTML::Parser ..."); | ||||
247 | 0 | 0 | my $parser = HTML::Parser->new | ||||
248 | (api_version => 3, | ||||||
249 | handlers => [start => [\&tag, "tagname, '+1', attr"], | ||||||
250 | end => [\&tag, "tagname, '-1', attr"], | ||||||
251 | text => [\&text, "dtext"], | ||||||
252 | ], | ||||||
253 | marked_sections => 1); | ||||||
254 | # parse du fichier | ||||||
255 | 0 | 0 | 0 | $parser->parse($res->content) | |||
256 | || print STDERR "Can't parse ".$res->content."::$!\n"; | ||||||
257 | } | ||||||
258 | else { | ||||||
259 | 0 | 0 | $this->{INDEXER}->trace(1,"Use a basic parser ..."); | ||||
260 | 0 | 0 | my $TEXT = $res->content; | ||||
261 | 0 | 0 | $TEXT=~s{ } { | ||||
262 | 0 | 0 | 0 | 0 | if ($1 || $3) {"";} }gesx; | ||
0 | 0 | ||||||
263 | 0 | 0 | $TEXT=~s{ <(?: [^>\'\"] * | ".*?" | '.*?' ) + > }{}gsx; | ||||
264 | 0 | 0 | analyse(decode_entities($TEXT), | ||||
265 | $this->{ConfigMoteur}->{'facteur_full_text'}); | ||||||
266 | } | ||||||
267 | |||||||
268 | # Mots clefs et description | ||||||
269 | 0 | 0 | 0 | my ($desc,$keyword)=($DESCRIPTION||' ',$KEYWORDS||' '); | |||
0 | |||||||
270 | 0 | 0 | undef $DESCRIPTION; undef $KEYWORDS; | ||||
0 | 0 | ||||||
271 | 0 | 0 | 0 | my $titre = $res->title || $rh->{url};# Titre | |||
272 | # Categorie | ||||||
273 | 0 | 0 | 0 | if ($rh->{categorieAuto}) { | |||
274 | 0 | 0 | $rh->{categorie} = $this->{INDEXER}->categorie->get($rh->{url}, | ||||
275 | $rh->{idr}); | ||||||
276 | } | ||||||
277 | 0 | 0 | 0 | if (!$rh->{categorie}) {$rh->{categorie}=0;} | |||
0 | 0 | ||||||
278 | # Mis a jour de l'url | ||||||
279 | 0 | 0 | 0 | if ($this->{INDEXER}->URL->update | |||
280 | ($rh->{idr}, | ||||||
281 | (parse => 1, | ||||||
282 | id => $rh->{idc}, | ||||||
283 | titre => $titre, | ||||||
284 | description => $desc, | ||||||
285 | last_update => $lastUpdate, | ||||||
286 | last_check => 'NOW()', | ||||||
287 | langue => $language, | ||||||
288 | categorie => $rh->{categorie} | ||||||
289 | ) | ||||||
290 | )) { | ||||||
291 | 0 | 0 | $this->{INDEXER}->trace(2, "$rh->{url} mis à jour avec success"); | ||||
292 | } | ||||||
293 | |||||||
294 | # Traitement des mots trouves | ||||||
295 | 0 | 0 | analyse($keyword,$this->{ConfigMoteur}->{'facteur_keyword'}); | ||||
296 | 0 | 0 | analyse($desc,$this->{ConfigMoteur}->{'facteur_description'}); | ||||
297 | 0 | 0 | analyse($titre,$this->{ConfigMoteur}->{'facteur_titre'}); | ||||
298 | 0 | 0 | analyse($rh->{url},$this->{ConfigMoteur}->{'facteur_url'}); | ||||
299 | 0 | 0 | $this->{INDEXER}->dbh->do | ||||
300 | ("delete from ".$this->{INDEXER}->pre_tbl.$rh->{idr}."relation ". | ||||||
301 | "where id_site = $rh->{idc}"); | ||||||
302 | |||||||
303 | # Chaque mot trouve plus de $ConfigMoteur{'nb_min_mots'} fois | ||||||
304 | # est enregistre | ||||||
305 | # On passe cette etape si le nombre de liens de la page est superieur | ||||||
306 | # a 50% le nombre de mots retenus, il s'agit alors | ||||||
307 | # d'un sommaire peut interessant à consulter | ||||||
308 | 0 | 0 | my $nbw = 0; | ||||
309 | 0 | 0 | 0 | if (scalar keys %links < (( scalar keys %$RM) * 0.5)) { | |||
310 | 0 | 0 | while (my ($mot,$nb)=each(%$RM)) { | ||||
311 | 0 | 0 | 0 | 0 | next if (!$nb or $nb < $this->{'ConfigMoteur'}->{'nb_min_mots'}); | ||
312 | 0 | 0 | my $requete = "insert into ". | ||||
313 | $this->{INDEXER}->pre_tbl.$rh->{idr}. | ||||||
314 | "relation (mot,id_site,facteur) ". | ||||||
315 | "values ('$mot',$rh->{idc},$nb)"; | ||||||
316 | 0 | 0 | 0 | $this->{INDEXER}->dbh->do($requete) && $nbwg++; | |||
317 | 0 | 0 | $this->{INDEXER}->trace(4,"\t\tStore words: ".$requete); | ||||
318 | } | ||||||
319 | 0 | 0 | $nbw=keys %$RM; | ||||
320 | } | ||||||
321 | else { | ||||||
322 | 0 | 0 | $this->{INDEXER}->trace | ||||
323 | (1,"Sommaire - ".(scalar keys %$RM). | ||||||
324 | " mots ignores pour ".(scalar keys %links)." liens"); | ||||||
325 | } | ||||||
326 | |||||||
327 | # On n'indexe pas les liens si on est au niveau max | ||||||
328 | 0 | 0 | 0 | if ($rh->{niveau} == $this->{ConfigMoteur}{'niveau_max'}) { | |||
329 | 0 | 0 | $this->{INDEXER}->trace(1,"Niveau max atteint. Liens suivants de ". | ||||
330 | "cette page ignorés "); |
||||||
331 | 0 | 0 | return (0,0,0); | ||||
332 | } | ||||||
333 | # Traitement des url trouves | ||||||
334 | 0 | 0 | my $base = $res->base; | ||||
335 | 0 | 0 | my @l = keys %links; undef %links; | ||||
0 | 0 | ||||||
336 | 0 | 0 | 0 | $this->{INDEXER}->trace(2, "Liens trouvés") if ($#l>0); | |||
337 | 0 | 0 | foreach my $var (@l) { | ||||
338 | 0 | 0 | $var = url($var,$base)->abs; # Url absolu | ||||
339 | 0 | 0 | $var = $this->check_links('a',$var); | ||||
340 | 0 | 0 | 0 | 0 | if (($rh->{url_local}) && ($var)) { | ||
0 | |||||||
341 | 0 | 0 | my $urlb = $var; | ||||
342 | 0 | 0 | $urlb=~s/$racineFile/$racineUrl/g; | ||||
343 | #print h1("Ajout site local:$$var[2] pour $racineFile"); | ||||||
344 | 0 | 0 | $this->{INDEXER}->trace(2, "\t".$urlb); | ||||
345 | 0 | 0 | 0 | if ($this->{INDEXER}->URL->add | |||
346 | ($rh->{idr}, | ||||||
347 | (url => $urlb, | ||||||
348 | urllocal => $var, | ||||||
349 | niveau => $rh->{niveau}+1, | ||||||
350 | categorie => $rh->{categorie}, | ||||||
351 | valide => 1, | ||||||
352 | browse_categorie=>$rh->{categorieAuto}))) | ||||||
353 | 0 | 0 | { $nburl++; } | ||||
0 | 0 | ||||||
354 | else {$this->{INDEXER}->trace | ||||||
355 | (2,"\tCan't add $urlb:\n\t$DBI::errstr");} | ||||||
356 | } | ||||||
357 | elsif ($var) { | ||||||
358 | 0 | 0 | $this->{INDEXER}->trace(2, "\t".$var); | ||||
359 | 0 | 0 | 0 | if ($this->{INDEXER}->URL->add | |||
360 | ($rh->{idr}, | ||||||
361 | (url => $var, | ||||||
362 | niveau => $rh->{niveau}+1, | ||||||
363 | categorie => $rh->{categorie}, | ||||||
364 | valide => 1))) | ||||||
365 | 0 | 0 | { $nburl++; } | ||||
366 | else | ||||||
367 | 0 | 0 | { $this->{INDEXER}->trace | ||||
368 | (2,"\tCan't add $var:\n\t$DBI::errstr");} | ||||||
369 | } | ||||||
370 | } | ||||||
371 | 0 | 0 | $this->{INDEXER}->trace(3, "---------------------------------\n"); | ||||
372 | 0 | 0 | return ($nburl,$nbw,$nbwg); | ||||
373 | } | ||||||
374 | # Sinon previent que URL defectueuse | ||||||
375 | 0 | 0 | else { print "*** ", $res->code," : $rh->{url}\n";return (-1,0,0);} | ||||
0 | 0 | ||||||
376 | } | ||||||
377 | |||||||
378 | #------------------------------------------------------------------------------ | ||||||
379 | # set_agent | ||||||
380 | #------------------------------------------------------------------------------ | ||||||
381 | sub set_agent | ||||||
382 | { | ||||||
383 | 0 | 0 | 1 | 0 | my ($self,$locale)=@_; | ||
384 | 0 | 0 | $self->{INDEXER}->trace(5, "Circa::Parser::set_agent $locale\n"); | ||||
385 | 0 | 0 | 0 | 0 | return if ($self->{AGENT} && $self->{_ROBOT}==$locale); # agent already set | ||
386 | 0 | 0 | $self->{_ROBOT}=$locale; | ||||
387 | 0 | 0 | 0 | 0 | if (($self->{ConfigMoteur}->{'temporate'}) && (!$locale)) { | ||
0 | 0 | ||||||
388 | 0 | 0 | $self->{'AGENT'} = LWP::RobotUA->new | ||||
389 | ("CircaParser $VERSION",$self->{'ConfigMoteur'}->{'author'}); | ||||||
390 | 0 | 0 | $self->{AGENT}->delay(1/120.0); | ||||
391 | } | ||||||
392 | else {$self->{AGENT} = new LWP::UserAgent; } | ||||||
393 | 0 | 0 | 0 | if ($self->{PROXY}) {$self->{AGENT}->proxy(['http', 'ftp'], $self->{PROXY});} | |||
0 | 0 | ||||||
394 | 0 | 0 | 0 | $self->{AGENT}->max_size($self->{INDEXER}->size_max) | |||
395 | if ($self->{INDEXER}->size_max); | ||||||
396 | 0 | 0 | $self->{AGENT}->timeout(25); # Set timeout to 25s (defaut 180) | ||||
397 | } | ||||||
398 | |||||||
399 | #------------------------------------------------------------------------------ | ||||||
400 | # analyse | ||||||
401 | #------------------------------------------------------------------------------ | ||||||
402 | sub analyse { | ||||||
403 | 0 | 0 | 1 | 0 | my $data = shift; | ||
404 | 0 | 0 | my $facteur = shift; | ||||
405 | 0 | 0 | my $e; | ||||
406 | 0 | 0 | 0 | 0 | return if (!$data or !$facteur); | ||
407 | # Ponctuation et mots recurents | ||||||
408 | 0 | 0 | $data=~s/http:\/\// /gm; | ||||
409 | 0 | 0 | $data=~tr/\n\t<>.;:,?!()\"\'[]#=\/_/ /; | ||||
410 | 0 | 0 | $data=~s/\s+/ /gm; | ||||
411 | 0 | 0 | foreach (split(/\s/,$data)) { | ||||
412 | 0 | 0 | 0 | next if !$_; | |||
413 | 0 | 0 | $e=lc($_); | ||||
414 | 0 | 0 | 0 | 0 | $$RM{$e}+=$facteur | ||
0 | |||||||
0 | |||||||
415 | if (($e =~/\w/)&&(length($e)>2)&&(!$bad{$e})&&($e !~/^\d*$/)); | ||||||
416 | } | ||||||
417 | } | ||||||
418 | |||||||
419 | #------------------------------------------------------------------------------ | ||||||
420 | # check_links | ||||||
421 | #------------------------------------------------------------------------------ | ||||||
422 | sub check_links | ||||||
423 | { | ||||||
424 | 16 | 16 | 1 | 30 | my($self,$tag,$links) = @_; | ||
425 | 16 | 44 | my $host = $self->{INDEXER}->host_indexed; | ||||
426 | 16 | 24 | my $li = "doc|zip|ps|gif|jpg|gz|pdf|eps|png|deb|xls|ppt|". | ||||
427 | "class|GIF|css|js|wav|mid"; | ||||||
428 | 16 | 151 | my $bad = qr/\.($li)$/i; | ||||
429 | 16 | 50 | 33 | 292 | if (($tag) && ($links) && ($tag eq 'a') | ||
33 | |||||||
33 | |||||||
33 | |||||||
430 | && ($links=~/\Q$host\E/) | ||||||
431 | && ($links !~ $bad)) | ||||||
432 | { | ||||||
433 | 0 | 0 | 0 | if ($links=~/^(.*?)#/) {$links=$1;} # Don't add anchor | |||
0 | 0 | ||||||
434 | 0 | 0 | 0 | 0 | if ((!$self->{ConfigMoteur}->{'indexCgi'})&&($links=~/^(.*?)\?/)) | ||
0 | 0 | ||||||
435 | {$links=$1;} | ||||||
436 | 0 | 0 | return $links; | ||||
437 | } | ||||||
438 | 16 | 85 | return 0; | ||||
439 | } | ||||||
440 | |||||||
441 | |||||||
442 | #------------------------------------------------------------------------------ | ||||||
443 | # POD DOCUMENTATION | ||||||
444 | #------------------------------------------------------------------------------ | ||||||
445 | |||||||
446 | =head1 NAME | ||||||
447 | |||||||
448 | Search::Circa::Parser - provide functions to parse HTML pages by Circa | ||||||
449 | |||||||
450 | =head1 SYNOPSIS | ||||||
451 | |||||||
452 | use Search::Circa::Indexer; | ||||||
453 | my $index = new Search::Circa::Indexer; | ||||||
454 | $index->connect(...); | ||||||
455 | $index->Parser->look_at({ url => url, | ||||||
456 | idr => account }); | ||||||
457 | |||||||
458 | =head1 DESCRIPTION | ||||||
459 | |||||||
460 | This module use HTML::Parser facilities. It's call by Search::Circa::Indexer | ||||||
461 | for index each document. Main method is C |
||||||
462 | |||||||
463 | =head1 Public Class Interface | ||||||
464 | |||||||
465 | =over | ||||||
466 | |||||||
467 | =item B |
||||||
468 | |||||||
469 | Create a new Circa::Parser object with indexer instance properties | ||||||
470 | |||||||
471 | =item B |
||||||
472 | |||||||
473 | Index an url. Job done is: | ||||||
474 | |||||||
475 | =over | ||||||
476 | |||||||
477 | =item * | ||||||
478 | |||||||
479 | Test if url used is valid. Return -1 else | ||||||
480 | |||||||
481 | =item * | ||||||
482 | |||||||
483 | Get the page and add each words found with weight set in constructor. | ||||||
484 | |||||||
485 | =item * | ||||||
486 | |||||||
487 | If maximum level of links is not reach, add each link found for the next | ||||||
488 | indexation | ||||||
489 | |||||||
490 | =back | ||||||
491 | |||||||
492 | Keys for refHashParameters: | ||||||
493 | |||||||
494 | =over | ||||||
495 | |||||||
496 | =item I |
||||||
497 | |||||||
498 | Url to read | ||||||
499 | |||||||
500 | =item I |
||||||
501 | |||||||
502 | Id of url in table links | ||||||
503 | |||||||
504 | =item I |
||||||
505 | |||||||
506 | Id of account's url | ||||||
507 | |||||||
508 | =item I |
||||||
509 | |||||||
510 | (optional) : If this parameter is set, Circa didn't make any job | ||||||
511 | on this page if it's older that the date. | ||||||
512 | |||||||
513 | =item I |
||||||
514 | |||||||
515 | (optional) Local url to reach the file | ||||||
516 | |||||||
517 | =item I |
||||||
518 | |||||||
519 | (optional) If $categorieAuto set to true, Circa will | ||||||
520 | create/set the category of url with syntax of directory found. Ex: | ||||||
521 | http://www.alianwebserver.com/societe/stvalentin/index.html will create | ||||||
522 | and set the category for this url to Societe / StValentin. | ||||||
523 | If $categorieAuto set to false, $categorie will be used. | ||||||
524 | |||||||
525 | =item I |
||||||
526 | |||||||
527 | (optional) Depth of actual link. | ||||||
528 | |||||||
529 | =item I |
||||||
530 | |||||||
531 | (optional) See $categorieAuto. | ||||||
532 | |||||||
533 | =back | ||||||
534 | |||||||
535 | Return (-1,0) if url isn't valide, number of word and number of links | ||||||
536 | found else | ||||||
537 | |||||||
538 | =item B |
||||||
539 | |||||||
540 | Set user agent for Circa robot. If local is set to 0 or | ||||||
541 | $self->{ConfigMoteur}->{'temporate'}==0, | ||||||
542 | LWP::UserAgent will be used. Else LWP::RobotUA is used. | ||||||
543 | |||||||
544 | =item B |
||||||
545 | |||||||
546 | Split data in words, and put them in global %$RM with score. | ||||||
547 | Hash structure is ('mots'=>facteur). | ||||||
548 | |||||||
549 | =over | ||||||
550 | |||||||
551 | =item I | ||||||
552 | |||||||
553 | Buffer to analyse | ||||||
554 | |||||||
555 | =item I |
||||||
556 | |||||||
557 | Basic score for each word | ||||||
558 | |||||||
559 | =back | ||||||
560 | |||||||
561 | =item B |
||||||
562 | |||||||
563 | Method call for each HTML tag find in HTML pages. | ||||||
564 | |||||||
565 | =item B |
||||||
566 | |||||||
567 | Method call for each content of tag in HTML pages | ||||||
568 | |||||||
569 | =item B |
||||||
570 | |||||||
571 | Check if url $links will be add to Circa. Url must begin with | ||||||
572 | $self->host_indexed, and his extension must be not doc,zip,ps,gif,jpg,gz, | ||||||
573 | pdf,eps,png,deb,xls,ppt,class,GIF,css,js,wav,mid. | ||||||
574 | |||||||
575 | If $links is accepted, return url. Else return 0. | ||||||
576 | |||||||
577 | =back | ||||||
578 | |||||||
579 | =head1 VERSION | ||||||
580 | |||||||
581 | $Revision: 1.27 $ | ||||||
582 | |||||||
583 | =head1 SEE ALSO | ||||||
584 | |||||||
585 | L |
||||||
586 | |||||||
587 | =head1 AUTHOR | ||||||
588 | |||||||
589 | Alain BARBET alian@alianwebserver.com | ||||||
590 | |||||||
591 | =cut |