| blib/lib/Search/Circa/Parser.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 48 | 197 | 24.3 |
| branch | 2 | 102 | 1.9 |
| condition | 5 | 87 | 5.7 |
| subroutine | 10 | 15 | 66.6 |
| pod | 7 | 7 | 100.0 |
| total | 72 | 408 | 17.6 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | package Search::Circa::Parser; | ||||||
| 2 | |||||||
| 3 | # module Circa::Parser : See Circa::Indexer | ||||||
| 4 | # Copyright 2000 A.Barbet alian@alianwebserver.com. All rights reserved. | ||||||
| 5 | |||||||
| 6 | # $Log: Parser.pm,v $ | ||||||
| 7 | # Revision 1.27 2003/01/02 00:32:40 alian | ||||||
| 8 | # Add url found with meta http-equiv refresh | ||||||
| 9 | # | ||||||
| 10 | # Revision 1.26 2002/12/31 09:58:52 alian | ||||||
| 11 | # Use hash in place of list in look_at | ||||||
| 12 | # Call analyse in each text call in place of global var TEXT | ||||||
| 13 | # Update POD doc | ||||||
| 14 | # | ||||||
| 15 | # Revision 1.25 2002/12/29 14:35:10 alian | ||||||
| 16 | # Some minor fixe suite to last update | ||||||
| 17 | # | ||||||
| 18 | # Revision 1.24 2002/12/29 03:18:37 alian | ||||||
| 19 | # Update POD documentation | ||||||
| 20 | # | ||||||
| 21 | # Revision 1.23 2002/12/29 00:36:30 alian | ||||||
| 22 | # Add undef %insite => dangerous global var ... | ||||||
| 23 | # | ||||||
| 24 | # Revision 1.22 2002/12/28 22:23:59 alian | ||||||
| 25 | # Some optimization after bench | ||||||
| 26 | # | ||||||
| 27 | # Revision 1.21 2002/12/28 12:36:02 alian | ||||||
| 28 | # Ajout phase pour ne pas analyser les mots d'un sommaire | ||||||
| 29 | # | ||||||
| 30 | # Revision 1.20 2002/12/27 12:55:43 alian | ||||||
| 31 | # Use ref in analyse, update stopwords | ||||||
| 32 | |||||||
| 33 | 8 | 8 | 14256 | use strict; | |||
| 8 | 21 | ||||||
| 8 | 318 | ||||||
| 34 | 8 | 8 | 24084 | use URI::URL; | |||
| 8 | 180790 | ||||||
| 8 | 625 | ||||||
| 35 | 8 | 8 | 102 | use URI::WithBase; | |||
| 8 | 19 | ||||||
| 8 | 175 | ||||||
| 36 | 8 | 8 | 2586 | use DBI; | |||
| 8 | 136433 | ||||||
| 8 | 338 | ||||||
| 37 | 8 | 8 | 10526 | use LWP::RobotUA; | |||
| 8 | 1499655 | ||||||
| 8 | 388 | ||||||
| 38 | 8 | 8 | 94 | use Carp qw/cluck/; | |||
| 8 | 18 | ||||||
| 8 | 719 | ||||||
| 39 | 8 | 6150 | use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION | ||||
| 40 | 8 | 8 | 48 | %links %inside $RM $DESCRIPTION $KEYWORDS $facteur_full_text); | |||
| 8 | 19 | ||||||
| 41 | |||||||
| 42 | require Exporter; | ||||||
| 43 | |||||||
| 44 | @ISA = qw(Exporter); | ||||||
| 45 | @EXPORT = qw(); | ||||||
| 46 | $VERSION = ('$Revision: 1.27 $ ' =~ /(\d+\.\d+)/)[0]; | ||||||
| 47 | |||||||
| 48 | # stopwords | ||||||
| 49 | my %bad = map {$_ => 1} qw ( | ||||||
| 50 | able about above according across actually after afterwards again against ago | ||||||
| 51 | all almost already also althought altogether always among amongst and another | ||||||
| 52 | any anyhow anyone anything anyway anywhere apart are aren around aside away | ||||||
| 53 | back because been before beforehand behind being below beneath beside besides | ||||||
| 54 | between beyond but came can cannot come could couldn currently did didn | ||||||
| 55 | directly does doing don done down downward during each easily else elsewhere | ||||||
| 56 | enough especially even ever every everybody everyone everything everywhere | ||||||
| 57 | exactly except far farther few fewer find five for formerly forth found four | ||||||
| 58 | frequently from full fully further generally get gets give given going gone | ||||||
| 59 | gonna got gotten had hardly has have having height hence her here hereafter | ||||||
| 60 | hereby herein hereupon hers herself him himself his hope how however | ||||||
| 61 | immediatly including indeed inside instead into inward isn its itself just | ||||||
| 62 | know largely last lately later latest least leave less lesser let lets like | ||||||
| 63 | liked likes likewise little lot lower made mainly make making many may maybe | ||||||
| 64 | means meantime meanwhile might mine more moreover most mostly mrs much must | ||||||
| 65 | myself namely near necessarily neither never nevertheless nine nobody none | ||||||
| 66 | nonetheless nor not nothing now nowhere often once one only onto other others | ||||||
| 67 | otherwise ought our ours ourself ourselves out outside over overall own per | ||||||
| 68 | perform perharps please previous previously prior probably provide providing | ||||||
| 69 | quickly quite rather read ready really recently require roughly said same say | ||||||
| 70 | see sent seven several shall shan she should shouldn simply since six slightly | ||||||
| 71 | some somebody somehow someone something sometime sometimes somewhat somewhere | ||||||
| 72 | soon still strictly such take ten than thanks that the their theirs them | ||||||
| 73 | themselves then thence there thereafter thereby therefore therein thereupon | ||||||
| 74 | these they think this those though three through thru thus thusly timely | ||||||
| 75 | together too took top toward towards truly two unable under unless unlike | ||||||
| 76 | unlikely until upon upward upwards use used using usually various very wanna | ||||||
| 77 | want was wasn well went were weren what whatever when whence whenever where | ||||||
| 78 | whereabouts whereafter whereas whereby wherefor wherein whereis whereupon | ||||||
| 79 | wherever whether which whichever while whither who whoever whole whom | ||||||
| 80 | whomever whose why will with within without worth worthy would wouldn yes | ||||||
| 81 | yet you your yours yourself yourselves | ||||||
| 82 | |||||||
| 83 | afin ailleurs ainsi ais ait alors aucun aucune aucunes aucuns auparavant auquel | ||||||
| 84 | assez aussi autour autre autres aux auxquelles avait avant avec avoir beaucoup | ||||||
| 85 | bien car ceci cela celle celui cependant certain certaine certaines certains | ||||||
| 86 | ces cet cette ceux chacun chacune chacunes chaque chez cinq combien comme | ||||||
| 87 | comment contre dans dedans depuis des desquelles desquels deux dire dit dix | ||||||
| 88 | doit donc dont duquel elle elles encore enfin entre environ est etc eux faire | ||||||
| 89 | fait faut fit fut huit ici ils jamais laquelle lequel lequels les lesquelles | ||||||
| 90 | lesquels leur leurs lors lorsque lui maintenant mais mes moi moins mon neuf | ||||||
| 91 | non nos notre nous ont oui par parce parfois pas peu peut plus plusieurs pour | ||||||
| 92 | pourquoi pourtant puis quand quant quatre que quel quelconque quelle quelles | ||||||
| 93 | quelque quelquefois quelques quels qui quoi quoique sans sept ses sinon six | ||||||
| 94 | soit son sont soudain sous suis sur tandis tant tel telle tels tes toi ton | ||||||
| 95 | toujours tous tout toute toutes toutefois toutes trois une vers veut voici | ||||||
| 96 | voir vos votre vous \$ { & com/ + www html htm file/); | ||||||
| 97 | |||||||
| 98 | #------------------------------------------------------------------------------ | ||||||
| 99 | # new | ||||||
| 100 | #------------------------------------------------------------------------------ | ||||||
| 101 | sub new { | ||||||
| 102 | 1 | 1 | 1 | 10 | my $class = shift; | ||
| 103 | 1 | 2 | my $self = {}; | ||||
| 104 | 1 | 2 | my $indexer = shift; | ||||
| 105 | 1 | 10 | $indexer->trace(5, "Search::Circa::Parser::new\n"); | ||||
| 106 | 1 | 3 | bless $self, $class; | ||||
| 107 | 1 | 6 | $self->{DBH} = $indexer->{DBH}; | ||||
| 108 | 1 | 2 | $self->{ConfigMoteur} = $indexer->{ConfigMoteur}; | ||||
| 109 | 1 | 3 | while (my ($n,$v)=each(%{$indexer->{ConfigMoteur}})) | ||||
| 11 | 34 | ||||||
| 110 | 10 | 30 | { $indexer->trace(4, "\t$n => $v"); } | ||||
| 111 | 1 | 3 | $self->{INDEXER} = $indexer; | ||||
| 112 | 1 | 3 | $facteur_full_text = $self->{ConfigMoteur}->{'facteur_full_text'}; | ||||
| 113 | # Ce module n'est presque jamais installé ! | ||||||
| 114 | # Evidemment cela demande une charge machine et un .so | ||||||
| 115 | # compilé pour cet environnement. Ca fait peur aux admin | ||||||
| 116 | # ISP ! On encapsule donc l'appel, si on echoue, on previent que | ||||||
| 117 | # tout appel au parser se soldera par une utilisation d'un parseur basic | ||||||
| 118 | # sans handicaper le reste de l'application | ||||||
| 119 | # Il vous reste plus qu'a faire alors une install mysql/circa en local | ||||||
| 120 | # pour faire l'indexation, et exporter les resultats sur le serveur final. | ||||||
| 121 | 1 | 3 | $self->{_parser_ok}=1; | ||||
| 122 | 1 | 2 | eval { require HTML::Parser }; | ||||
| 1 | 11 | ||||||
| 123 | 1 | 50 | 33 | 10 | if ($@ || $HTML::Parser::VERSION < 3.0) { | ||
| 124 | 0 | 0 | warn "Module HTML-Parser 3.0 ou superieur requis pour ". | ||||
| 125 | "utiliser les fonctionnalités optimales du parser.($@)\n"; | ||||||
| 126 | 0 | 0 | $self->{_parser_ok}=0; | ||||
| 127 | } | ||||||
| 128 | 8 | 8 | 9110 | else { use HTML::Entities; } | |||
| 8 | 87188 | ||||||
| 8 | 44736 | ||||||
| 129 | 1 | 5 | $self->{INDEXER}->trace(1,"Parser::new"); | ||||
| 130 | 1 | 4 | return $self; | ||||
| 131 | } | ||||||
| 132 | |||||||
| 133 | #------------------------------------------------------------------------------ | ||||||
| 134 | # tag | ||||||
| 135 | #------------------------------------------------------------------------------ | ||||||
| 136 | sub tag { | ||||||
| 137 | 0 | 0 | 1 | 0 | my($tag, $num, $att) = @_; # parametre | ||
| 138 | # Liens exterieurs | ||||||
| 139 | 0 | 0 | 0 | 0 | if ((lc($tag) eq 'a') and ($$att{href})) {$links{$$att{href}}=1;} | ||
| 0 | 0 | 0 | 0 | ||||
| 0 | 0 | 0 | 0 | ||||
| 0 | 0 | ||||||
| 140 | # Frame | ||||||
| 141 | elsif ((lc($tag) eq 'frame') and ($$att{src})) {$links{$$att{src}}=1;} | ||||||
| 142 | # On est dans le cas d'un meta | ||||||
| 143 | 0 | 0 | elsif (lc($tag) eq 'meta' and defined(%$att)) { | ||||
| 144 | 0 | 0 | 0 | 0 | if ($$att{name} and lc($$att{name}) eq 'description') {# Description | ||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 145 | 0 | 0 | $DESCRIPTION =$$att{content};} | ||||
| 146 | elsif ((lc($$att{'http-equiv'}) eq 'keywords') or | ||||||
| 147 | (lc($$att{name}) eq 'keywords')) {# Mots-clefs | ||||||
| 148 | 0 | 0 | $KEYWORDS=$$att{content} ;} | ||||
| 149 | elsif ((lc($$att{'http-equiv'}) eq 'refresh') and | ||||||
| 150 | ($$att{content}=~/\d*;URL=(.*)$/)) {#url refresh | ||||||
| 151 | 0 | 0 | $links{$1}=1; } | ||||
| 152 | } | ||||||
| 153 | # Area | ||||||
| 154 | elsif (($tag eq 'area') and ($$att{href})) {$links{$$att{href}}=1;} | ||||||
| 155 | 0 | 0 | $inside{$tag} += $num; # Profondeur de la balise | ||||
| 156 | } | ||||||
| 157 | |||||||
| 158 | #------------------------------------------------------------------------------ | ||||||
| 159 | # text | ||||||
| 160 | #------------------------------------------------------------------------------ | ||||||
| 161 | sub text { | ||||||
| 162 | 0 | 0 | 0 | 0 | 1 | 0 | return if $inside{script} || $inside{style}; |
| 163 | 0 | 0 | analyse($_[0], $facteur_full_text); | ||||
| 164 | } | ||||||
| 165 | |||||||
| 166 | #------------------------------------------------------------------------------ | ||||||
| 167 | # look_at | ||||||
| 168 | #------------------------------------------------------------------------------ | ||||||
| 169 | sub look_at { | ||||||
| 170 | 0 | 0 | 1 | 0 | my($this, $rh)=@_; | ||
| 171 | # $url,$idc,$idr,$lastModif,$url_local,$categorieAuto,$niveau,$categorie | ||||||
| 172 | 0 | 0 | undef %links; $RM={}; undef %inside; | ||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 173 | 0 | 0 | 0 | $rh->{niveau} = 0 if (!$rh->{niveau}); | |||
| 174 | 0 | 0 | 0 | $rh->{categorie} = 0 if (!$rh->{categorie}); | |||
| 175 | 0 | 0 | my $buf_debug = "\tUrl => $rh->{url}\n\tIdc => $rh->{idc}\n"; | ||||
| 176 | 0 | 0 | 0 | $buf_debug.= "\tLast update => $rh->{lastModif}" | |||
| 177 | unless (!defined($rh->{lastModif})); | ||||||
| 178 | 0 | 0 | 0 | $buf_debug.= "\tUrl local => $rh->{url_local}" | |||
| 179 | unless (!defined($rh->{url_local})); | ||||||
| 180 | 0 | 0 | $this->{INDEXER}->trace(3, "Parser::look_at\n$buf_debug"); | ||||
| 181 | 0 | 0 | my ($url_orig,$racineFile,$racineUrl,$lastUpdate); | ||||
| 182 | 0 | 0 | 0 | 0 | if ($rh->{url_local} or URI->new($rh->{url})->scheme eq 'file') { | ||
| 183 | 0 | 0 | $this->set_agent(1); | ||||
| 184 | } else { | ||||||
| 185 | 0 | 0 | $this->set_agent(0); | ||||
| 186 | } | ||||||
| 187 | 0 | 0 | 0 | if ($rh->{url_local}) { | |||
| 188 | 0 | 0 | $this->{ConfigMoteur}->{'temporate'}=0; | ||||
| 189 | 0 | 0 | 0 | if ($rh->{url_local}=~/.*\/$/) { | |||
| 190 | 0 | 0 | chop($rh->{url_local}); | ||||
| 191 | 0 | 0 | 0 | if (-e "$rh->{url_local}/index.html") { | |||
| 0 | |||||||
| 0 | |||||||
| 192 | 0 | 0 | $rh->{url_local}.="/index.html";} | ||||
| 193 | elsif (-e "$rh->{url_local}/index.htm") { | ||||||
| 194 | 0 | 0 | $rh->{url_local}.="/index.htm";} | ||||
| 195 | 0 | 0 | elsif (-e "$rh->{url_local}/default.htm") { | ||||
| 196 | 0 | 0 | $rh->{url_local}.="/default.htm";} | ||||
| 197 | else {return (-1,0,0);} | ||||||
| 198 | } | ||||||
| 199 | 0 | 0 | $url_orig=$rh->{url}; | ||||
| 200 | 0 | 0 | $rh->{url}=$rh->{url_local}; | ||||
| 201 | 0 | 0 | ($racineFile,$racineUrl) = | ||||
| 202 | $this->{INDEXER}->fetch_first("select path,url from ". | ||||||
| 203 | $this->{INDEXER}->pre_tbl."local_url ". | ||||||
| 204 | "where id=$rh->{idr}"); | ||||||
| 205 | } | ||||||
| 206 | 0 | 0 | my ($nb,$nbwg,$nburl)=(0,0,0); | ||||
| 207 | 0 | 0 | 0 | if ($rh->{url_local}) {$this->{INDEXER}->set_host_indexed($rh->{url_local});} | |||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 208 | else {$this->{INDEXER}->set_host_indexed($rh->{url});} | ||||||
| 209 | |||||||
| 210 | # Creation d'une requete | ||||||
| 211 | # On passe la requete à l'agent et on attend le résultat | ||||||
| 212 | 0 | 0 | my $res = $this->{AGENT}->request(new HTTP::Request('GET' => $rh->{url})); | ||||
| 213 | 0 | 0 | $this->{INDEXER}->trace(2, "HTTP::Request return ".$res->status_line); | ||||
| 214 | 0 | 0 | 0 | if ($res->is_success) { | |||
| 215 | # Langue | ||||||
| 216 | 0 | 0 | 0 | my $language = $res->content_language || 'unkno'; | |||
| 217 | 0 | 0 | 0 | if ($rh->{lastModif}) { | |||
| 218 | 0 | 0 | $this->{INDEXER}->trace(2,"Update url ".$rh->{lastModif}.' '. | ||||
| 219 | $res->last_modified); | ||||||
| 220 | } | ||||||
| 221 | # Fichier non modifie depuis la derniere indexation | ||||||
| 222 | 0 | 0 | 0 | 0 | if (($rh->{lastModif}) && ($res->last_modified) && | ||
| 0 | |||||||
| 223 | ($rh->{lastModif} >= $res->last_modified)) { | ||||||
| 224 | 0 | 0 | $this->{INDEXER}->trace(1,"No update on $rh->{url}"); | ||||
| 225 | 0 | 0 | $this->{INDEXER}->URL->update | ||||
| 226 | ($rh->{idr},('id'=>$rh->{idc}, 'last_check'=>"NOW()")); | ||||||
| 227 | 0 | 0 | return (0,0,0); | ||||
| 228 | } | ||||||
| 229 | 0 | 0 | 0 | if ($res->last_modified) { | |||
| 0 | 0 | ||||||
| 230 | 0 | 0 | my @date = localtime($res->last_modified); | ||||
| 231 | 0 | 0 | $lastUpdate = ($date[5]+1900).'-'.($date[4]+1).'-'. | ||||
| 232 | $date[3].' '.$date[2].':'.$date[1].':'.$date[0]; | ||||||
| 233 | } | ||||||
| 234 | else {$lastUpdate='0000-00-00';} | ||||||
| 235 | 0 | 0 | my $x = 72-length($rh->{url}); | ||||
| 236 | 0 | 0 | 0 | if ( $this->{inindex}) { | |||
| 237 | 0 | 0 | 0 | print $this->{inindex},'/',$this->{toindex}," ", | |||
| 238 | $rh->{url},($ENV{SERVER_NAME} ? " \n" : (" "x$x)."\n"); |
||||||
| 239 | } | ||||||
| 240 | |||||||
| 241 | # Il serait judicieux de mettre ca dans le constructeur, | ||||||
| 242 | # mais cela entraine 10 Mo de Ram supplementaire à | ||||||
| 243 | # l'utilisation. A voir avec les evolution du module | ||||||
| 244 | # HTML::Parser | ||||||
| 245 | 0 | 0 | 0 | if ($this->{_parser_ok}) { | |||
| 246 | 0 | 0 | $this->{INDEXER}->trace(3,"Use HTML::Parser ..."); | ||||
| 247 | 0 | 0 | my $parser = HTML::Parser->new | ||||
| 248 | (api_version => 3, | ||||||
| 249 | handlers => [start => [\&tag, "tagname, '+1', attr"], | ||||||
| 250 | end => [\&tag, "tagname, '-1', attr"], | ||||||
| 251 | text => [\&text, "dtext"], | ||||||
| 252 | ], | ||||||
| 253 | marked_sections => 1); | ||||||
| 254 | # parse du fichier | ||||||
| 255 | 0 | 0 | 0 | $parser->parse($res->content) | |||
| 256 | || print STDERR "Can't parse ".$res->content."::$!\n"; | ||||||
| 257 | } | ||||||
| 258 | else { | ||||||
| 259 | 0 | 0 | $this->{INDEXER}->trace(1,"Use a basic parser ..."); | ||||
| 260 | 0 | 0 | my $TEXT = $res->content; | ||||
| 261 | 0 | 0 | $TEXT=~s{ } { | ||||
| 262 | 0 | 0 | 0 | 0 | if ($1 || $3) {"";} }gesx; | ||
| 0 | 0 | ||||||
| 263 | 0 | 0 | $TEXT=~s{ <(?: [^>\'\"] * | ".*?" | '.*?' ) + > }{}gsx; | ||||
| 264 | 0 | 0 | analyse(decode_entities($TEXT), | ||||
| 265 | $this->{ConfigMoteur}->{'facteur_full_text'}); | ||||||
| 266 | } | ||||||
| 267 | |||||||
| 268 | # Mots clefs et description | ||||||
| 269 | 0 | 0 | 0 | my ($desc,$keyword)=($DESCRIPTION||' ',$KEYWORDS||' '); | |||
| 0 | |||||||
| 270 | 0 | 0 | undef $DESCRIPTION; undef $KEYWORDS; | ||||
| 0 | 0 | ||||||
| 271 | 0 | 0 | 0 | my $titre = $res->title || $rh->{url};# Titre | |||
| 272 | # Categorie | ||||||
| 273 | 0 | 0 | 0 | if ($rh->{categorieAuto}) { | |||
| 274 | 0 | 0 | $rh->{categorie} = $this->{INDEXER}->categorie->get($rh->{url}, | ||||
| 275 | $rh->{idr}); | ||||||
| 276 | } | ||||||
| 277 | 0 | 0 | 0 | if (!$rh->{categorie}) {$rh->{categorie}=0;} | |||
| 0 | 0 | ||||||
| 278 | # Mis a jour de l'url | ||||||
| 279 | 0 | 0 | 0 | if ($this->{INDEXER}->URL->update | |||
| 280 | ($rh->{idr}, | ||||||
| 281 | (parse => 1, | ||||||
| 282 | id => $rh->{idc}, | ||||||
| 283 | titre => $titre, | ||||||
| 284 | description => $desc, | ||||||
| 285 | last_update => $lastUpdate, | ||||||
| 286 | last_check => 'NOW()', | ||||||
| 287 | langue => $language, | ||||||
| 288 | categorie => $rh->{categorie} | ||||||
| 289 | ) | ||||||
| 290 | )) { | ||||||
| 291 | 0 | 0 | $this->{INDEXER}->trace(2, "$rh->{url} mis à jour avec success"); | ||||
| 292 | } | ||||||
| 293 | |||||||
| 294 | # Traitement des mots trouves | ||||||
| 295 | 0 | 0 | analyse($keyword,$this->{ConfigMoteur}->{'facteur_keyword'}); | ||||
| 296 | 0 | 0 | analyse($desc,$this->{ConfigMoteur}->{'facteur_description'}); | ||||
| 297 | 0 | 0 | analyse($titre,$this->{ConfigMoteur}->{'facteur_titre'}); | ||||
| 298 | 0 | 0 | analyse($rh->{url},$this->{ConfigMoteur}->{'facteur_url'}); | ||||
| 299 | 0 | 0 | $this->{INDEXER}->dbh->do | ||||
| 300 | ("delete from ".$this->{INDEXER}->pre_tbl.$rh->{idr}."relation ". | ||||||
| 301 | "where id_site = $rh->{idc}"); | ||||||
| 302 | |||||||
| 303 | # Chaque mot trouve plus de $ConfigMoteur{'nb_min_mots'} fois | ||||||
| 304 | # est enregistre | ||||||
| 305 | # On passe cette etape si le nombre de liens de la page est superieur | ||||||
| 306 | # a 50% le nombre de mots retenus, il s'agit alors | ||||||
| 307 | # d'un sommaire peut interessant à consulter | ||||||
| 308 | 0 | 0 | my $nbw = 0; | ||||
| 309 | 0 | 0 | 0 | if (scalar keys %links < (( scalar keys %$RM) * 0.5)) { | |||
| 310 | 0 | 0 | while (my ($mot,$nb)=each(%$RM)) { | ||||
| 311 | 0 | 0 | 0 | 0 | next if (!$nb or $nb < $this->{'ConfigMoteur'}->{'nb_min_mots'}); | ||
| 312 | 0 | 0 | my $requete = "insert into ". | ||||
| 313 | $this->{INDEXER}->pre_tbl.$rh->{idr}. | ||||||
| 314 | "relation (mot,id_site,facteur) ". | ||||||
| 315 | "values ('$mot',$rh->{idc},$nb)"; | ||||||
| 316 | 0 | 0 | 0 | $this->{INDEXER}->dbh->do($requete) && $nbwg++; | |||
| 317 | 0 | 0 | $this->{INDEXER}->trace(4,"\t\tStore words: ".$requete); | ||||
| 318 | } | ||||||
| 319 | 0 | 0 | $nbw=keys %$RM; | ||||
| 320 | } | ||||||
| 321 | else { | ||||||
| 322 | 0 | 0 | $this->{INDEXER}->trace | ||||
| 323 | (1,"Sommaire - ".(scalar keys %$RM). | ||||||
| 324 | " mots ignores pour ".(scalar keys %links)." liens"); | ||||||
| 325 | } | ||||||
| 326 | |||||||
| 327 | # On n'indexe pas les liens si on est au niveau max | ||||||
| 328 | 0 | 0 | 0 | if ($rh->{niveau} == $this->{ConfigMoteur}{'niveau_max'}) { | |||
| 329 | 0 | 0 | $this->{INDEXER}->trace(1,"Niveau max atteint. Liens suivants de ". | ||||
| 330 | "cette page ignorés "); |
||||||
| 331 | 0 | 0 | return (0,0,0); | ||||
| 332 | } | ||||||
| 333 | # Traitement des url trouves | ||||||
| 334 | 0 | 0 | my $base = $res->base; | ||||
| 335 | 0 | 0 | my @l = keys %links; undef %links; | ||||
| 0 | 0 | ||||||
| 336 | 0 | 0 | 0 | $this->{INDEXER}->trace(2, "Liens trouvés") if ($#l>0); | |||
| 337 | 0 | 0 | foreach my $var (@l) { | ||||
| 338 | 0 | 0 | $var = url($var,$base)->abs; # Url absolu | ||||
| 339 | 0 | 0 | $var = $this->check_links('a',$var); | ||||
| 340 | 0 | 0 | 0 | 0 | if (($rh->{url_local}) && ($var)) { | ||
| 0 | |||||||
| 341 | 0 | 0 | my $urlb = $var; | ||||
| 342 | 0 | 0 | $urlb=~s/$racineFile/$racineUrl/g; | ||||
| 343 | #print h1("Ajout site local:$$var[2] pour $racineFile"); | ||||||
| 344 | 0 | 0 | $this->{INDEXER}->trace(2, "\t".$urlb); | ||||
| 345 | 0 | 0 | 0 | if ($this->{INDEXER}->URL->add | |||
| 346 | ($rh->{idr}, | ||||||
| 347 | (url => $urlb, | ||||||
| 348 | urllocal => $var, | ||||||
| 349 | niveau => $rh->{niveau}+1, | ||||||
| 350 | categorie => $rh->{categorie}, | ||||||
| 351 | valide => 1, | ||||||
| 352 | browse_categorie=>$rh->{categorieAuto}))) | ||||||
| 353 | 0 | 0 | { $nburl++; } | ||||
| 0 | 0 | ||||||
| 354 | else {$this->{INDEXER}->trace | ||||||
| 355 | (2,"\tCan't add $urlb:\n\t$DBI::errstr");} | ||||||
| 356 | } | ||||||
| 357 | elsif ($var) { | ||||||
| 358 | 0 | 0 | $this->{INDEXER}->trace(2, "\t".$var); | ||||
| 359 | 0 | 0 | 0 | if ($this->{INDEXER}->URL->add | |||
| 360 | ($rh->{idr}, | ||||||
| 361 | (url => $var, | ||||||
| 362 | niveau => $rh->{niveau}+1, | ||||||
| 363 | categorie => $rh->{categorie}, | ||||||
| 364 | valide => 1))) | ||||||
| 365 | 0 | 0 | { $nburl++; } | ||||
| 366 | else | ||||||
| 367 | 0 | 0 | { $this->{INDEXER}->trace | ||||
| 368 | (2,"\tCan't add $var:\n\t$DBI::errstr");} | ||||||
| 369 | } | ||||||
| 370 | } | ||||||
| 371 | 0 | 0 | $this->{INDEXER}->trace(3, "---------------------------------\n"); | ||||
| 372 | 0 | 0 | return ($nburl,$nbw,$nbwg); | ||||
| 373 | } | ||||||
| 374 | # Sinon previent que URL defectueuse | ||||||
| 375 | 0 | 0 | else { print "*** ", $res->code," : $rh->{url}\n";return (-1,0,0);} | ||||
| 0 | 0 | ||||||
| 376 | } | ||||||
| 377 | |||||||
| 378 | #------------------------------------------------------------------------------ | ||||||
| 379 | # set_agent | ||||||
| 380 | #------------------------------------------------------------------------------ | ||||||
| 381 | sub set_agent | ||||||
| 382 | { | ||||||
| 383 | 0 | 0 | 1 | 0 | my ($self,$locale)=@_; | ||
| 384 | 0 | 0 | $self->{INDEXER}->trace(5, "Circa::Parser::set_agent $locale\n"); | ||||
| 385 | 0 | 0 | 0 | 0 | return if ($self->{AGENT} && $self->{_ROBOT}==$locale); # agent already set | ||
| 386 | 0 | 0 | $self->{_ROBOT}=$locale; | ||||
| 387 | 0 | 0 | 0 | 0 | if (($self->{ConfigMoteur}->{'temporate'}) && (!$locale)) { | ||
| 0 | 0 | ||||||
| 388 | 0 | 0 | $self->{'AGENT'} = LWP::RobotUA->new | ||||
| 389 | ("CircaParser $VERSION",$self->{'ConfigMoteur'}->{'author'}); | ||||||
| 390 | 0 | 0 | $self->{AGENT}->delay(1/120.0); | ||||
| 391 | } | ||||||
| 392 | else {$self->{AGENT} = new LWP::UserAgent; } | ||||||
| 393 | 0 | 0 | 0 | if ($self->{PROXY}) {$self->{AGENT}->proxy(['http', 'ftp'], $self->{PROXY});} | |||
| 0 | 0 | ||||||
| 394 | 0 | 0 | 0 | $self->{AGENT}->max_size($self->{INDEXER}->size_max) | |||
| 395 | if ($self->{INDEXER}->size_max); | ||||||
| 396 | 0 | 0 | $self->{AGENT}->timeout(25); # Set timeout to 25s (defaut 180) | ||||
| 397 | } | ||||||
| 398 | |||||||
| 399 | #------------------------------------------------------------------------------ | ||||||
| 400 | # analyse | ||||||
| 401 | #------------------------------------------------------------------------------ | ||||||
| 402 | sub analyse { | ||||||
| 403 | 0 | 0 | 1 | 0 | my $data = shift; | ||
| 404 | 0 | 0 | my $facteur = shift; | ||||
| 405 | 0 | 0 | my $e; | ||||
| 406 | 0 | 0 | 0 | 0 | return if (!$data or !$facteur); | ||
| 407 | # Ponctuation et mots recurents | ||||||
| 408 | 0 | 0 | $data=~s/http:\/\// /gm; | ||||
| 409 | 0 | 0 | $data=~tr/\n\t<>.;:,?!()\"\'[]#=\/_/ /; | ||||
| 410 | 0 | 0 | $data=~s/\s+/ /gm; | ||||
| 411 | 0 | 0 | foreach (split(/\s/,$data)) { | ||||
| 412 | 0 | 0 | 0 | next if !$_; | |||
| 413 | 0 | 0 | $e=lc($_); | ||||
| 414 | 0 | 0 | 0 | 0 | $$RM{$e}+=$facteur | ||
| 0 | |||||||
| 0 | |||||||
| 415 | if (($e =~/\w/)&&(length($e)>2)&&(!$bad{$e})&&($e !~/^\d*$/)); | ||||||
| 416 | } | ||||||
| 417 | } | ||||||
| 418 | |||||||
| 419 | #------------------------------------------------------------------------------ | ||||||
| 420 | # check_links | ||||||
| 421 | #------------------------------------------------------------------------------ | ||||||
| 422 | sub check_links | ||||||
| 423 | { | ||||||
| 424 | 16 | 16 | 1 | 30 | my($self,$tag,$links) = @_; | ||
| 425 | 16 | 44 | my $host = $self->{INDEXER}->host_indexed; | ||||
| 426 | 16 | 24 | my $li = "doc|zip|ps|gif|jpg|gz|pdf|eps|png|deb|xls|ppt|". | ||||
| 427 | "class|GIF|css|js|wav|mid"; | ||||||
| 428 | 16 | 151 | my $bad = qr/\.($li)$/i; | ||||
| 429 | 16 | 50 | 33 | 292 | if (($tag) && ($links) && ($tag eq 'a') | ||
| 33 | |||||||
| 33 | |||||||
| 33 | |||||||
| 430 | && ($links=~/\Q$host\E/) | ||||||
| 431 | && ($links !~ $bad)) | ||||||
| 432 | { | ||||||
| 433 | 0 | 0 | 0 | if ($links=~/^(.*?)#/) {$links=$1;} # Don't add anchor | |||
| 0 | 0 | ||||||
| 434 | 0 | 0 | 0 | 0 | if ((!$self->{ConfigMoteur}->{'indexCgi'})&&($links=~/^(.*?)\?/)) | ||
| 0 | 0 | ||||||
| 435 | {$links=$1;} | ||||||
| 436 | 0 | 0 | return $links; | ||||
| 437 | } | ||||||
| 438 | 16 | 85 | return 0; | ||||
| 439 | } | ||||||
| 440 | |||||||
| 441 | |||||||
| 442 | #------------------------------------------------------------------------------ | ||||||
| 443 | # POD DOCUMENTATION | ||||||
| 444 | #------------------------------------------------------------------------------ | ||||||
| 445 | |||||||
| 446 | =head1 NAME | ||||||
| 447 | |||||||
| 448 | Search::Circa::Parser - provide functions to parse HTML pages by Circa | ||||||
| 449 | |||||||
| 450 | =head1 SYNOPSIS | ||||||
| 451 | |||||||
| 452 | use Search::Circa::Indexer; | ||||||
| 453 | my $index = new Search::Circa::Indexer; | ||||||
| 454 | $index->connect(...); | ||||||
| 455 | $index->Parser->look_at({ url => url, | ||||||
| 456 | idr => account }); | ||||||
| 457 | |||||||
| 458 | =head1 DESCRIPTION | ||||||
| 459 | |||||||
| 460 | This module use HTML::Parser facilities. It's call by Search::Circa::Indexer | ||||||
| 461 | for index each document. Main method is C |
||||||
| 462 | |||||||
| 463 | =head1 Public Class Interface | ||||||
| 464 | |||||||
| 465 | =over | ||||||
| 466 | |||||||
| 467 | =item B |
||||||
| 468 | |||||||
| 469 | Create a new Circa::Parser object with indexer instance properties | ||||||
| 470 | |||||||
| 471 | =item B |
||||||
| 472 | |||||||
| 473 | Index an url. Job done is: | ||||||
| 474 | |||||||
| 475 | =over | ||||||
| 476 | |||||||
| 477 | =item * | ||||||
| 478 | |||||||
| 479 | Test if url used is valid. Return -1 else | ||||||
| 480 | |||||||
| 481 | =item * | ||||||
| 482 | |||||||
| 483 | Get the page and add each words found with weight set in constructor. | ||||||
| 484 | |||||||
| 485 | =item * | ||||||
| 486 | |||||||
| 487 | If maximum level of links is not reach, add each link found for the next | ||||||
| 488 | indexation | ||||||
| 489 | |||||||
| 490 | =back | ||||||
| 491 | |||||||
| 492 | Keys for refHashParameters: | ||||||
| 493 | |||||||
| 494 | =over | ||||||
| 495 | |||||||
| 496 | =item I |
||||||
| 497 | |||||||
| 498 | Url to read | ||||||
| 499 | |||||||
| 500 | =item I |
||||||
| 501 | |||||||
| 502 | Id of url in table links | ||||||
| 503 | |||||||
| 504 | =item I |
||||||
| 505 | |||||||
| 506 | Id of account's url | ||||||
| 507 | |||||||
| 508 | =item I |
||||||
| 509 | |||||||
| 510 | (optional) : If this parameter is set, Circa didn't make any job | ||||||
| 511 | on this page if it's older that the date. | ||||||
| 512 | |||||||
| 513 | =item I |
||||||
| 514 | |||||||
| 515 | (optional) Local url to reach the file | ||||||
| 516 | |||||||
| 517 | =item I |
||||||
| 518 | |||||||
| 519 | (optional) If $categorieAuto set to true, Circa will | ||||||
| 520 | create/set the category of url with syntax of directory found. Ex: | ||||||
| 521 | http://www.alianwebserver.com/societe/stvalentin/index.html will create | ||||||
| 522 | and set the category for this url to Societe / StValentin. | ||||||
| 523 | If $categorieAuto set to false, $categorie will be used. | ||||||
| 524 | |||||||
| 525 | =item I |
||||||
| 526 | |||||||
| 527 | (optional) Depth of actual link. | ||||||
| 528 | |||||||
| 529 | =item I |
||||||
| 530 | |||||||
| 531 | (optional) See $categorieAuto. | ||||||
| 532 | |||||||
| 533 | =back | ||||||
| 534 | |||||||
| 535 | Return (-1,0) if url isn't valide, number of word and number of links | ||||||
| 536 | found else | ||||||
| 537 | |||||||
| 538 | =item B |
||||||
| 539 | |||||||
| 540 | Set user agent for Circa robot. If local is set to 0 or | ||||||
| 541 | $self->{ConfigMoteur}->{'temporate'}==0, | ||||||
| 542 | LWP::UserAgent will be used. Else LWP::RobotUA is used. | ||||||
| 543 | |||||||
| 544 | =item B |
||||||
| 545 | |||||||
| 546 | Split data in words, and put them in global %$RM with score. | ||||||
| 547 | Hash structure is ('mots'=>facteur). | ||||||
| 548 | |||||||
| 549 | =over | ||||||
| 550 | |||||||
| 551 | =item I | ||||||
| 552 | |||||||
| 553 | Buffer to analyse | ||||||
| 554 | |||||||
| 555 | =item I |
||||||
| 556 | |||||||
| 557 | Basic score for each word | ||||||
| 558 | |||||||
| 559 | =back | ||||||
| 560 | |||||||
| 561 | =item B |
||||||
| 562 | |||||||
| 563 | Method call for each HTML tag find in HTML pages. | ||||||
| 564 | |||||||
| 565 | =item B |
||||||
| 566 | |||||||
| 567 | Method call for each content of tag in HTML pages | ||||||
| 568 | |||||||
| 569 | =item B |
||||||
| 570 | |||||||
| 571 | Check if url $links will be add to Circa. Url must begin with | ||||||
| 572 | $self->host_indexed, and his extension must be not doc,zip,ps,gif,jpg,gz, | ||||||
| 573 | pdf,eps,png,deb,xls,ppt,class,GIF,css,js,wav,mid. | ||||||
| 574 | |||||||
| 575 | If $links is accepted, return url. Else return 0. | ||||||
| 576 | |||||||
| 577 | =back | ||||||
| 578 | |||||||
| 579 | =head1 VERSION | ||||||
| 580 | |||||||
| 581 | $Revision: 1.27 $ | ||||||
| 582 | |||||||
| 583 | =head1 SEE ALSO | ||||||
| 584 | |||||||
| 585 | L |
||||||
| 586 | |||||||
| 587 | =head1 AUTHOR | ||||||
| 588 | |||||||
| 589 | Alain BARBET alian@alianwebserver.com | ||||||
| 590 | |||||||
| 591 | =cut |