File Coverage

blib/lib/Search/Circa/Parser.pm

Criterion	Covered	Total	%
statement	48	197	24.3
branch	2	102	1.9
condition	5	87	5.7
subroutine	10	15	66.6
pod	7	7	100.0
total	72	408	17.6

line	stmt	bran	cond	sub	pod	time	code
1							package Search::Circa::Parser;
2
3							# module Circa::Parser : See Circa::Indexer
4							# Copyright 2000 A.Barbet alian@alianwebserver.com. All rights reserved.
5
6							# $Log: Parser.pm,v $
7							# Revision 1.27 2003/01/02 00:32:40 alian
8							# Add url found with meta http-equiv refresh
9							#
10							# Revision 1.26 2002/12/31 09:58:52 alian
11							# Use hash in place of list in look_at
12							# Call analyse in each text call in place of global var TEXT
13							# Update POD doc
14							#
15							# Revision 1.25 2002/12/29 14:35:10 alian
16							# Some minor fixe suite to last update
17							#
18							# Revision 1.24 2002/12/29 03:18:37 alian
19							# Update POD documentation
20							#
21							# Revision 1.23 2002/12/29 00:36:30 alian
22							# Add undef %insite => dangerous global var ...
23							#
24							# Revision 1.22 2002/12/28 22:23:59 alian
25							# Some optimization after bench
26							#
27							# Revision 1.21 2002/12/28 12:36:02 alian
28							# Ajout phase pour ne pas analyser les mots d'un sommaire
29							#
30							# Revision 1.20 2002/12/27 12:55:43 alian
31							# Use ref in analyse, update stopwords
32
33	8			8		14256	use strict;
	8					21
	8					318
34	8			8		24084	use URI::URL;
	8					180790
	8					625
35	8			8		102	use URI::WithBase;
	8					19
	8					175
36	8			8		2586	use DBI;
	8					136433
	8					338
37	8			8		10526	use LWP::RobotUA;
	8					1499655
	8					388
38	8			8		94	use Carp qw/cluck/;
	8					18
	8					719
39	8					6150	use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION
40	8			8		48	%links %inside $RM $DESCRIPTION $KEYWORDS $facteur_full_text);
	8					19
41
42							require Exporter;
43
44							@ISA = qw(Exporter);
45							@EXPORT = qw();
46							$VERSION = ('$Revision: 1.27 $ ' =~ /(\d+\.\d+)/)[0];
47
48							# stopwords
49							my %bad = map {$_ => 1} qw (
50							able about above according across actually after afterwards again against ago
51							all almost already also althought altogether always among amongst and another
52							any anyhow anyone anything anyway anywhere apart are aren around aside away
53							back because been before beforehand behind being below beneath beside besides
54							between beyond but came can cannot come could couldn currently did didn
55							directly does doing don done down downward during each easily else elsewhere
56							enough especially even ever every everybody everyone everything everywhere
57							exactly except far farther few fewer find five for formerly forth found four
58							frequently from full fully further generally get gets give given going gone
59							gonna got gotten had hardly has have having height hence her here hereafter
60							hereby herein hereupon hers herself him himself his hope how however
61							immediatly including indeed inside instead into inward isn its itself just
62							know largely last lately later latest least leave less lesser let lets like
63							liked likes likewise little lot lower made mainly make making many may maybe
64							means meantime meanwhile might mine more moreover most mostly mrs much must
65							myself namely near necessarily neither never nevertheless nine nobody none
66							nonetheless nor not nothing now nowhere often once one only onto other others
67							otherwise ought our ours ourself ourselves out outside over overall own per
68							perform perharps please previous previously prior probably provide providing
69							quickly quite rather read ready really recently require roughly said same say
70							see sent seven several shall shan she should shouldn simply since six slightly
71							some somebody somehow someone something sometime sometimes somewhat somewhere
72							soon still strictly such take ten than thanks that the their theirs them
73							themselves then thence there thereafter thereby therefore therein thereupon
74							these they think this those though three through thru thus thusly timely
75							together too took top toward towards truly two unable under unless unlike
76							unlikely until upon upward upwards use used using usually various very wanna
77							want was wasn well went were weren what whatever when whence whenever where
78							whereabouts whereafter whereas whereby wherefor wherein whereis whereupon
79							wherever whether which whichever while whither who whoever whole whom
80							whomever whose why will with within without worth worthy would wouldn yes
81							yet you your yours yourself yourselves
82
83							afin ailleurs ainsi ais ait alors aucun aucune aucunes aucuns auparavant auquel
84							assez aussi autour autre autres aux auxquelles avait avant avec avoir beaucoup
85							bien car ceci cela celle celui cependant certain certaine certaines certains
86							ces cet cette ceux chacun chacune chacunes chaque chez cinq combien comme
87							comment contre dans dedans depuis des desquelles desquels deux dire dit dix
88							doit donc dont duquel elle elles encore enfin entre environ est etc eux faire
89							fait faut fit fut huit ici ils jamais laquelle lequel lequels les lesquelles
90							lesquels leur leurs lors lorsque lui maintenant mais mes moi moins mon neuf
91							non nos notre nous ont oui par parce parfois pas peu peut plus plusieurs pour
92							pourquoi pourtant puis quand quant quatre que quel quelconque quelle quelles
93							quelque quelquefois quelques quels qui quoi quoique sans sept ses sinon six
94							soit son sont soudain sous suis sur tandis tant tel telle tels tes toi ton
95							toujours tous tout toute toutes toutefois toutes trois une vers veut voici
96							voir vos votre vous \$ { & com/ + www html htm file/);
97
98							#------------------------------------------------------------------------------
99							# new
100							#------------------------------------------------------------------------------
101							sub new {
102	1			1	1	10	my $class = shift;
103	1					2	my $self = {};
104	1					2	my $indexer = shift;
105	1					10	$indexer->trace(5, "Search::Circa::Parser::new\n");
106	1					3	bless $self, $class;
107	1					6	$self->{DBH} = $indexer->{DBH};
108	1					2	$self->{ConfigMoteur} = $indexer->{ConfigMoteur};
109	1					3	while (my ($n,$v)=each(%{$indexer->{ConfigMoteur}}))
	11					34
110	10					30	{ $indexer->trace(4, "\t$n => $v"); }
111	1					3	$self->{INDEXER} = $indexer;
112	1					3	$facteur_full_text = $self->{ConfigMoteur}->{'facteur_full_text'};
113							# Ce module n'est presque jamais install� !
114							# Evidemment cela demande une charge machine et un .so
115							# compil� pour cet environnement. Ca fait peur aux admin
116							# ISP ! On encapsule donc l'appel, si on echoue, on previent que
117							# tout appel au parser se soldera par une utilisation d'un parseur basic
118							# sans handicaper le reste de l'application
119							# Il vous reste plus qu'a faire alors une install mysql/circa en local
120							# pour faire l'indexation, et exporter les resultats sur le serveur final.
121	1					3	$self->{_parser_ok}=1;
122	1					2	eval { require HTML::Parser };
	1					11
123	1	50	33			10	if ($@ \|\| $HTML::Parser::VERSION < 3.0) {
124	0					0	warn "Module HTML-Parser 3.0 ou superieur requis pour ".
125							"utiliser les fonctionnalit�s optimales du parser.($@)\n";
126	0					0	$self->{_parser_ok}=0;
127							}
128	8			8		9110	else { use HTML::Entities; }
	8					87188
	8					44736
129	1					5	$self->{INDEXER}->trace(1,"Parser::new");
130	1					4	return $self;
131							}
132
133							#------------------------------------------------------------------------------
134							# tag
135							#------------------------------------------------------------------------------
136							sub tag {
137	0			0	1	0	my($tag, $num, $att) = @_; # parametre
138							# Liens exterieurs
139	0	0	0			0	if ((lc($tag) eq 'a') and ($$att{href})) {$links{$$att{href}}=1;}
	0	0	0			0
	0	0	0			0
		0	0
140							# Frame
141							elsif ((lc($tag) eq 'frame') and ($$att{src})) {$links{$$att{src}}=1;}
142							# On est dans le cas d'un meta
143	0					0	elsif (lc($tag) eq 'meta' and defined(%$att)) {
144	0	0	0			0	if ($$att{name} and lc($$att{name}) eq 'description') {# Description
		0	0
		0	0
145	0					0	$DESCRIPTION =$$att{content};}
146							elsif ((lc($$att{'http-equiv'}) eq 'keywords') or
147							(lc($$att{name}) eq 'keywords')) {# Mots-clefs
148	0					0	$KEYWORDS=$$att{content} ;}
149							elsif ((lc($$att{'http-equiv'}) eq 'refresh') and
150							($$att{content}=~/\d;URL=(.)$/)) {#url refresh
151	0					0	$links{$1}=1; }
152							}
153							# Area
154							elsif (($tag eq 'area') and ($$att{href})) {$links{$$att{href}}=1;}
155	0					0	$inside{$tag} += $num; # Profondeur de la balise
156							}
157
158							#------------------------------------------------------------------------------
159							# text
160							#------------------------------------------------------------------------------
161							sub text {
162	0	0	0	0	1	0	return if $inside{script} \|\| $inside{style};
163	0					0	analyse($_[0], $facteur_full_text);
164							}
165
166							#------------------------------------------------------------------------------
167							# look_at
168							#------------------------------------------------------------------------------
169							sub look_at {
170	0			0	1	0	my($this, $rh)=@_;
171							# $url,$idc,$idr,$lastModif,$url_local,$categorieAuto,$niveau,$categorie
172	0					0	undef %links; $RM={}; undef %inside;
	0					0
	0					0
173	0	0				0	$rh->{niveau} = 0 if (!$rh->{niveau});
174	0	0				0	$rh->{categorie} = 0 if (!$rh->{categorie});
175	0					0	my $buf_debug = "\tUrl => $rh->{url}\n\tIdc => $rh->{idc}\n";
176	0	0				0	$buf_debug.= "\tLast update => $rh->{lastModif}"
177							unless (!defined($rh->{lastModif}));
178	0	0				0	$buf_debug.= "\tUrl local => $rh->{url_local}"
179							unless (!defined($rh->{url_local}));
180	0					0	$this->{INDEXER}->trace(3, "Parser::look_at\n$buf_debug");
181	0					0	my ($url_orig,$racineFile,$racineUrl,$lastUpdate);
182	0	0	0			0	if ($rh->{url_local} or URI->new($rh->{url})->scheme eq 'file') {
183	0					0	$this->set_agent(1);
184							} else {
185	0					0	$this->set_agent(0);
186							}
187	0	0				0	if ($rh->{url_local}) {
188	0					0	$this->{ConfigMoteur}->{'temporate'}=0;
189	0	0				0	if ($rh->{url_local}=~/.*\/$/) {
190	0					0	chop($rh->{url_local});
191	0	0				0	if (-e "$rh->{url_local}/index.html") {
		0
		0
192	0					0	$rh->{url_local}.="/index.html";}
193							elsif (-e "$rh->{url_local}/index.htm") {
194	0					0	$rh->{url_local}.="/index.htm";}
195	0					0	elsif (-e "$rh->{url_local}/default.htm") {
196	0					0	$rh->{url_local}.="/default.htm";}
197							else {return (-1,0,0);}
198							}
199	0					0	$url_orig=$rh->{url};
200	0					0	$rh->{url}=$rh->{url_local};
201	0					0	($racineFile,$racineUrl) =
202							$this->{INDEXER}->fetch_first("select path,url from ".
203							$this->{INDEXER}->pre_tbl."local_url ".
204							"where id=$rh->{idr}");
205							}
206	0					0	my ($nb,$nbwg,$nburl)=(0,0,0);
207	0	0				0	if ($rh->{url_local}) {$this->{INDEXER}->set_host_indexed($rh->{url_local});}
	0					0
	0					0
208							else {$this->{INDEXER}->set_host_indexed($rh->{url});}
209
210							# Creation d'une requete
211							# On passe la requete � l'agent et on attend le r�sultat
212	0					0	my $res = $this->{AGENT}->request(new HTTP::Request('GET' => $rh->{url}));
213	0					0	$this->{INDEXER}->trace(2, "HTTP::Request return ".$res->status_line);
214	0	0				0	if ($res->is_success) {
215							# Langue
216	0		0			0	my $language = $res->content_language \|\| 'unkno';
217	0	0				0	if ($rh->{lastModif}) {
218	0					0	$this->{INDEXER}->trace(2,"Update url ".$rh->{lastModif}.' '.
219							$res->last_modified);
220							}
221							# Fichier non modifie depuis la derniere indexation
222	0	0	0			0	if (($rh->{lastModif}) && ($res->last_modified) &&
			0
223							($rh->{lastModif} >= $res->last_modified)) {
224	0					0	$this->{INDEXER}->trace(1,"No update on $rh->{url}");
225	0					0	$this->{INDEXER}->URL->update
226							($rh->{idr},('id'=>$rh->{idc}, 'last_check'=>"NOW()"));
227	0					0	return (0,0,0);
228							}
229	0	0				0	if ($res->last_modified) {
	0					0
230	0					0	my @date = localtime($res->last_modified);
231	0					0	$lastUpdate = ($date[5]+1900).'-'.($date[4]+1).'-'.
232							$date[3].' '.$date[2].':'.$date[1].':'.$date[0];
233							}
234							else {$lastUpdate='0000-00-00';}
235	0					0	my $x = 72-length($rh->{url});
236	0	0				0	if ( $this->{inindex}) {
237	0	0				0	print $this->{inindex},'/',$this->{toindex}," ",
238							$rh->{url},($ENV{SERVER_NAME} ? " \n" : (" "x$x)."\n");
239							}
240
241							# Il serait judicieux de mettre ca dans le constructeur,
242							# mais cela entraine 10 Mo de Ram supplementaire �
243							# l'utilisation. A voir avec les evolution du module
244							# HTML::Parser
245	0	0				0	if ($this->{_parser_ok}) {
246	0					0	$this->{INDEXER}->trace(3,"Use HTML::Parser ...");
247	0					0	my $parser = HTML::Parser->new
248							(api_version => 3,
249							handlers => [start => [\&tag, "tagname, '+1', attr"],
250							end => [\&tag, "tagname, '-1', attr"],
251							text => [\&text, "dtext"],
252							],
253							marked_sections => 1);
254							# parse du fichier
255	0	0				0	$parser->parse($res->content)
256							\|\| print STDERR "Can't parse ".$res->content."::$!\n";
257							}
258							else {
259	0					0	$this->{INDEXER}->trace(1,"Use a basic parser ...");
260	0					0	my $TEXT = $res->content;
261	0					0	$TEXT=~s{ } {
262	0	0	0			0	if ($1 \|\| $3) {"";} }gesx;
	0					0
263	0					0	$TEXT=~s{ <(?: [^>\'\"] * \| ".?" \| '.?' ) + > }{}gsx;
264	0					0	analyse(decode_entities($TEXT),
265							$this->{ConfigMoteur}->{'facteur_full_text'});
266							}
267
268							# Mots clefs et description
269	0		0			0	my ($desc,$keyword)=($DESCRIPTION\|\|' ',$KEYWORDS\|\|' ');
			0
270	0					0	undef $DESCRIPTION; undef $KEYWORDS;
	0					0
271	0		0			0	my $titre = $res->title \|\| $rh->{url};# Titre
272							# Categorie
273	0	0				0	if ($rh->{categorieAuto}) {
274	0					0	$rh->{categorie} = $this->{INDEXER}->categorie->get($rh->{url},
275							$rh->{idr});
276							}
277	0	0				0	if (!$rh->{categorie}) {$rh->{categorie}=0;}
	0					0
278							# Mis a jour de l'url
279	0	0				0	if ($this->{INDEXER}->URL->update
280							($rh->{idr},
281							(parse => 1,
282							id => $rh->{idc},
283							titre => $titre,
284							description => $desc,
285							last_update => $lastUpdate,
286							last_check => 'NOW()',
287							langue => $language,
288							categorie => $rh->{categorie}
289							)
290							)) {
291	0					0	$this->{INDEXER}->trace(2, "$rh->{url} mis � jour avec success");
292							}
293
294							# Traitement des mots trouves
295	0					0	analyse($keyword,$this->{ConfigMoteur}->{'facteur_keyword'});
296	0					0	analyse($desc,$this->{ConfigMoteur}->{'facteur_description'});
297	0					0	analyse($titre,$this->{ConfigMoteur}->{'facteur_titre'});
298	0					0	analyse($rh->{url},$this->{ConfigMoteur}->{'facteur_url'});
299	0					0	$this->{INDEXER}->dbh->do
300							("delete from ".$this->{INDEXER}->pre_tbl.$rh->{idr}."relation ".
301							"where id_site = $rh->{idc}");
302
303							# Chaque mot trouve plus de $ConfigMoteur{'nb_min_mots'} fois
304							# est enregistre
305							# On passe cette etape si le nombre de liens de la page est superieur
306							# a 50% le nombre de mots retenus, il s'agit alors
307							# d'un sommaire peut interessant � consulter
308	0					0	my $nbw = 0;
309	0	0				0	if (scalar keys %links < (( scalar keys %$RM) * 0.5)) {
310	0					0	while (my ($mot,$nb)=each(%$RM)) {
311	0	0	0			0	next if (!$nb or $nb < $this->{'ConfigMoteur'}->{'nb_min_mots'});
312	0					0	my $requete = "insert into ".
313							$this->{INDEXER}->pre_tbl.$rh->{idr}.
314							"relation (mot,id_site,facteur) ".
315							"values ('$mot',$rh->{idc},$nb)";
316	0	0				0	$this->{INDEXER}->dbh->do($requete) && $nbwg++;
317	0					0	$this->{INDEXER}->trace(4,"\t\tStore words: ".$requete);
318							}
319	0					0	$nbw=keys %$RM;
320							}
321							else {
322	0					0	$this->{INDEXER}->trace
323							(1,"Sommaire - ".(scalar keys %$RM).
324							" mots ignores pour ".(scalar keys %links)." liens");
325							}
326
327							# On n'indexe pas les liens si on est au niveau max
328	0	0				0	if ($rh->{niveau} == $this->{ConfigMoteur}{'niveau_max'}) {
329	0					0	$this->{INDEXER}->trace(1,"Niveau max atteint. Liens suivants de ".
330							"cette page ignor�s ");
331	0					0	return (0,0,0);
332							}
333							# Traitement des url trouves
334	0					0	my $base = $res->base;
335	0					0	my @l = keys %links; undef %links;
	0					0
336	0	0				0	$this->{INDEXER}->trace(2, "Liens trouv�s") if ($#l>0);
337	0					0	foreach my $var (@l) {
338	0					0	$var = url($var,$base)->abs; # Url absolu
339	0					0	$var = $this->check_links('a',$var);
340	0	0	0			0	if (($rh->{url_local}) && ($var)) {
		0
341	0					0	my $urlb = $var;
342	0					0	$urlb=~s/$racineFile/$racineUrl/g;
343							#print h1("Ajout site local:$$var[2] pour $racineFile");
344	0					0	$this->{INDEXER}->trace(2, "\t".$urlb);
345	0	0				0	if ($this->{INDEXER}->URL->add
346							($rh->{idr},
347							(url => $urlb,
348							urllocal => $var,
349							niveau => $rh->{niveau}+1,
350							categorie => $rh->{categorie},
351							valide => 1,
352							browse_categorie=>$rh->{categorieAuto})))
353	0					0	{ $nburl++; }
	0					0
354							else {$this->{INDEXER}->trace
355							(2,"\tCan't add $urlb:\n\t$DBI::errstr");}
356							}
357							elsif ($var) {
358	0					0	$this->{INDEXER}->trace(2, "\t".$var);
359	0	0				0	if ($this->{INDEXER}->URL->add
360							($rh->{idr},
361							(url => $var,
362							niveau => $rh->{niveau}+1,
363							categorie => $rh->{categorie},
364							valide => 1)))
365	0					0	{ $nburl++; }
366							else
367	0					0	{ $this->{INDEXER}->trace
368							(2,"\tCan't add $var:\n\t$DBI::errstr");}
369							}
370							}
371	0					0	$this->{INDEXER}->trace(3, "---------------------------------\n");
372	0					0	return ($nburl,$nbw,$nbwg);
373							}
374							# Sinon previent que URL defectueuse
375	0					0	else { print "*** ", $res->code," : $rh->{url}\n";return (-1,0,0);}
	0					0
376							}
377
378							#------------------------------------------------------------------------------
379							# set_agent
380							#------------------------------------------------------------------------------
381							sub set_agent
382							{
383	0			0	1	0	my ($self,$locale)=@_;
384	0					0	$self->{INDEXER}->trace(5, "Circa::Parser::set_agent $locale\n");
385	0	0	0			0	return if ($self->{AGENT} && $self->{_ROBOT}==$locale); # agent already set
386	0					0	$self->{_ROBOT}=$locale;
387	0	0	0			0	if (($self->{ConfigMoteur}->{'temporate'}) && (!$locale)) {
	0					0
388	0					0	$self->{'AGENT'} = LWP::RobotUA->new
389							("CircaParser $VERSION",$self->{'ConfigMoteur'}->{'author'});
390	0					0	$self->{AGENT}->delay(1/120.0);
391							}
392							else {$self->{AGENT} = new LWP::UserAgent; }
393	0	0				0	if ($self->{PROXY}) {$self->{AGENT}->proxy(['http', 'ftp'], $self->{PROXY});}
	0					0
394	0	0				0	$self->{AGENT}->max_size($self->{INDEXER}->size_max)
395							if ($self->{INDEXER}->size_max);
396	0					0	$self->{AGENT}->timeout(25); # Set timeout to 25s (defaut 180)
397							}
398
399							#------------------------------------------------------------------------------
400							# analyse
401							#------------------------------------------------------------------------------
402							sub analyse {
403	0			0	1	0	my $data = shift;
404	0					0	my $facteur = shift;
405	0					0	my $e;
406	0	0	0			0	return if (!$data or !$facteur);
407							# Ponctuation et mots recurents
408	0					0	$data=~s/http:\/\// /gm;
409	0					0	$data=~tr/\n\t<>.;:,?!()\"\'[]#=\/_/ /;
410	0					0	$data=~s/\s+/ /gm;
411	0					0	foreach (split(/\s/,$data)) {
412	0	0				0	next if !$_;
413	0					0	$e=lc($_);
414	0	0	0			0	$$RM{$e}+=$facteur
			0
			0
415							if (($e =~/\w/)&&(length($e)>2)&&(!$bad{$e})&&($e !~/^\d*$/));
416							}
417							}
418
419							#------------------------------------------------------------------------------
420							# check_links
421							#------------------------------------------------------------------------------
422							sub check_links
423							{
424	16			16	1	30	my($self,$tag,$links) = @_;
425	16					44	my $host = $self->{INDEXER}->host_indexed;
426	16					24	my $li = "doc\|zip\|ps\|gif\|jpg\|gz\|pdf\|eps\|png\|deb\|xls\|ppt\|".
427							"class\|GIF\|css\|js\|wav\|mid";
428	16					151	my $bad = qr/\.($li)$/i;
429	16	50	33			292	if (($tag) && ($links) && ($tag eq 'a')
			33
			33
			33
430							&& ($links=~/\Q$host\E/)
431							&& ($links !~ $bad))
432							{
433	0	0				0	if ($links=~/^(.*?)#/) {$links=$1;} # Don't add anchor
	0					0
434	0	0	0			0	if ((!$self->{ConfigMoteur}->{'indexCgi'})&&($links=~/^(.*?)\?/))
	0					0
435							{$links=$1;}
436	0					0	return $links;
437							}
438	16					85	return 0;
439							}
440
441
442							#------------------------------------------------------------------------------
443							# POD DOCUMENTATION
444							#------------------------------------------------------------------------------
445
446							=head1 NAME
447
448							Search::Circa::Parser - provide functions to parse HTML pages by Circa
449
450							=head1 SYNOPSIS
451
452							use Search::Circa::Indexer;
453							my $index = new Search::Circa::Indexer;
454							$index->connect(...);
455							$index->Parser->look_at({ url => url,
456							idr => account });
457
458							=head1 DESCRIPTION
459
460							This module use HTML::Parser facilities. It's call by Search::Circa::Indexer
461							for index each document. Main method is C.
462
463							=head1 Public Class Interface
464
465							=over
466
467							=item B I
468
469							Create a new Circa::Parser object with indexer instance properties
470
471							=item B I
472
473							Index an url. Job done is:
474
475							=over
476
477							=item *
478
479							Test if url used is valid. Return -1 else
480
481							=item *
482
483							Get the page and add each words found with weight set in constructor.
484
485							=item *
486
487							If maximum level of links is not reach, add each link found for the next
488							indexation
489
490							=back
491
492							Keys for refHashParameters:
493
494							=over
495
496							=item I
497
498							Url to read
499
500							=item I
501
502							Id of url in table links
503
504							=item I
505
506							Id of account's url
507
508							=item I
509
510							(optional) : If this parameter is set, Circa didn't make any job
511							on this page if it's older that the date.
512
513							=item I
514
515							(optional) Local url to reach the file
516
517							=item I
518
519							(optional) If $categorieAuto set to true, Circa will
520							create/set the category of url with syntax of directory found. Ex:
521							http://www.alianwebserver.com/societe/stvalentin/index.html will create
522							and set the category for this url to Societe / StValentin.
523							If $categorieAuto set to false, $categorie will be used.
524
525							=item I
526
527							(optional) Depth of actual link.
528
529							=item I
530
531							(optional) See $categorieAuto.
532
533							=back
534
535							Return (-1,0) if url isn't valide, number of word and number of links
536							found else
537
538							=item B I
539
540							Set user agent for Circa robot. If local is set to 0 or
541							$self->{ConfigMoteur}->{'temporate'}==0,
542							LWP::UserAgent will be used. Else LWP::RobotUA is used.
543
544							=item B I
545
546							Split data in words, and put them in global %$RM with score.
547							Hash structure is ('mots'=>facteur).
548
549							=over
550
551							=item I
552
553							Buffer to analyse
554
555							=item I
556
557							Basic score for each word
558
559							=back
560
561							=item B
562
563							Method call for each HTML tag find in HTML pages.
564
565							=item B
566
567							Method call for each content of tag in HTML pages
568
569							=item B I
570
571							Check if url $links will be add to Circa. Url must begin with
572							$self->host_indexed, and his extension must be not doc,zip,ps,gif,jpg,gz,
573							pdf,eps,png,deb,xls,ppt,class,GIF,css,js,wav,mid.
574
575							If $links is accepted, return url. Else return 0.
576
577							=back
578
579							=head1 VERSION
580
581							$Revision: 1.27 $
582
583							=head1 SEE ALSO
584
585							L
586
587							=head1 AUTHOR
588
589							Alain BARBET alian@alianwebserver.com
590
591							=cut