File Coverage

blib/lib/Lingua/PT/PLN.pm

Criterion	Covered	Total	%
statement	57	190	30.0
branch	12	68	17.6
condition	0	15	0.0
subroutine	8	23	34.7
pod	12	12	100.0
total	89	308	28.9

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::PT::PLN;
2	4			4		92034	use strict;
	4					11
	4					144
3
4	4			4		4487	use Lingua::PT::PLNbase;
	4					181342
	4					44
5	4			4		10391	use Lingua::PT::PLN::Words2Sampa;
	4					45
	4					416
6
7							require Exporter;
8							our @ISA = qw(Exporter AutoLoader);
9
10							our @EXPORT =
11							(@Lingua::PT::PLNbase::EXPORT,
12							qw(syllable accent wordaccent oco initPhon toPhon));
13
14							our $VERSION = '0.21';
15
16	4			4		58	use POSIX qw(locale_h);
	4					7
	4					43
17							setlocale(&POSIX::LC_ALL, "pt_PT");
18	4			4		990	use locale;
	4					8
	4					33
19
20							# para transcri��o fon�tica
21							$INC{'Lingua/PT/PLN.pm'} =~ m!/PLN\.pm$!;
22							our $ptpho = "$`/PLN/ptpho";
23							our $naoacentuadas = "$`/PLN/nao_acentuadas";
24							our $dic;
25							our $no_accented;
26
27							our ($consoante, $vogal, $acento, %names);
28
29							my ($lmax,$maxlog,$magicF);
30
31							BEGIN {
32	4			4		10919	$consoante=qr{[bc�dfghjklm�npqrstvwyxz]}i;
33	4					23	$vogal=qr{[��aeiou��]}i;
34	4					12	$acento=qr{[��]}i;
35	4					430	setlocale(&POSIX::LC_ALL, "pt_PT");
36
37	4			4		1472	use POSIX;
	4					8
	4					21
38	4					24	POSIX::setlocale(LC_CTYPE,"pt_PT");
39
40	4					8	$lmax = 1000000;
41	4					7	$maxlog = 13.815;
42	4					12858	$magicF = $maxlog/log($lmax);
43							}
44
45
46
47							sub oco {
48							### { from => (file\|string),
49							### num => 1,
50							### log => 1, # logaritmic output
51							### alpha => 1,
52							### output => file,
53							### encoding => utf8,
54							### ignorexml => 1,
55							### ignorecase => 1}
56
57	2			2	1	1325	my %opt = (from => 'file', ignorecase => 0, ignorexml => 0, encoding => "latin1");
58	2	50				14	%opt = (%opt , %{shift(@_)}) if ref($_[0]) eq "HASH";
	2					38
59
60	2					11	local $\ = "\n"; # set output record separator
61
62	2					6	my $P="(?:[,;:?!]\|[.]+\|[-]+)"; # pontuacao a contar
63	2					4	my $A="[A-Z��a-z��_]";
64	2					3	my $I="[ \"(){}+*=<>\250\256\257\277\253\273]"; # car. a ignorar
65	2					5	my %oco=();
66	2					4	my $tot=0;
67
68	2	100				9	if ($opt{from} eq 'string') {
69	1					4	my (@str) = (@_);
70	1					3	for (@str) {
71	1	50				8	$_ = lc if $opt{ignorecase};
72	1	50				3	s/<[^>]+>//g if $opt{ignorexml};
73	1					133	for (/($A+(?:['-]$A+)*\|$P)/g) { $oco{$_}++; $tot++ }
	14					21
	14					20
74							}
75							} else {
76	1					3	my (@file) = (@_);
77	1					3	for(@file) {
78	1	50				42	open F,"< $_" or die "cant open $_: $!";
79	1	50				8	binmode(F, ":utf8") if $opt{encoding} =~ /utf8/i ;
80	1					34	while () {
81	500	50				1042	$_ = lc if $opt{ignorecase};
82	500	50				938	s/<[^>]+>//g if $opt{ignorexml};
83	500					14412	for (/($A+(?:['-]$A+)*\|$P)/g) { $oco{$_}++; $tot++}
	4854					7139
	4854					7898
84							}
85	1					15	close F;
86							}
87							}
88
89	2	50				9	if ($opt{log}){
90	0					0	print "total = $tot\n";
91	0					0	_setmax($tot);
92	0	0				0	_setmax($opt{log}) if($opt{log} > 1);
93	0					0	for (keys %oco){
94	0					0	$oco{$_}=_logit($oco{$_});
95							}
96							}
97
98	2	50				9	if ($opt{num}) { # imprime por ordem de quantidade de ocorrencias
		50
99
100							# TODO: n�o � port�vel
101	0	0				0	if (defined $opt{output}) {
102	0					0	open SORT,"\| sort -nr > $opt{output}"
103							} else {
104	0					0	open SORT,"\| sort -nr"
105							}
106
107	0					0	for my $i (keys %oco) {
108	0					0	print SORT "$oco{$i} $i"
109							}
110	0					0	close SORT;
111
112							} elsif ($opt{alpha}) { # imprime ordenado alfabeticamente
113
114	0	0				0	if (defined $opt{output}) {
115	0					0	open SORT ,"> $opt{output}";
116	0					0	for my $i (sort keys %oco ) {
117	0					0	print SORT "$i $oco{$i}";
118							}
119							} else {
120	0					0	for my $i (sort keys %oco ) {
121	0					0	print "$i $oco{$i}";
122							}
123							}
124							} else {
125	2					1448	return (%oco)
126							}
127							}
128
129							### syllabs, and accents
130
131							sub accent {
132	0			0	1		local $/ = ""; # input record separator=1 or more empty lines
133	0						my $p=shift;
134	0						$p =~ s/(\w+)/ wordaccent($1) /ge;
	0
135	0						$p
136							}
137
138							sub wordaccent {
139	0			0	1		my $p = syllable($_[0]);
140	0	0					my $flag = $_[1] or 0; # 0 (default) => use : after vowel; 1 => use " before syllable
141	0						for ($p) {
142	0	0	0				s{(\w*$acento)}{"$1}i or # word with an accent character
			0
143							# s{(\w*)([ua])(ir)$}{$1$2\|"$3}i or # word ending with air uir
144							s{(\w*([zlr]\|[iu]s?))$}{"$1}i or # word ending with z l r i u is us
145							s{(\w+\\|\w+)$}{"$1} or # accent in 2 syllable frm the end
146							s{(\w)}{"$1}; # accent in the only syllable
147
148	0	0					if(!$flag){
149	0						s{"(([qg]u\|$consoante)*($vogal\|[yw]))}{$1:}i; # accent in the 1.st vowel
150	0						s{:($acento)}{$1:}i; # mv accent after accents
151	0						s{"}{}g;
152							}
153
154							}
155							$p
156	0						}
157
158							my %syl = (
159							20 => " -.!?:;",
160							10 => "b�dfgjkpqtv",
161							8 => "sc",
162							7 => "m",
163							6 => "lzx",
164							5 => "nr",
165							4 => "h",
166							3 => "wy",
167							2 => "eao��",
168							1 => "iu",
169							breakpair =>
170							#"ie\|ia\|io\|ee\|oo\|oa\|sl\|sm\|sn\|sc\|sr\|rn\|bc\|lr\|lz\|bd\|bj\|bg\|bq\|bt\|bv\|pt\|pc\|dj\|p�\|ln\|nr\|mn\|tp\|bf\|bp",
171							"sl\|sm\|sn\|sc\|sr\|rn\|bc\|lr\|lz\|bd\|bj\|bg\|bq\|bt\|bv\|pt\|pc\|dj\|p�\|ln\|nr\|mn\|tp\|bf\|bp\|xc\|s�\|ss\|rr",
172							# d�grafos que se separam sempre: xc, s�, ss, rr, sc.
173							);
174
175							my %spri = ();
176
177							for my $pri (grep(/\d/, keys %syl)){
178							for(split(//,$syl{$pri})) { $spri{$_} = $pri}}
179
180							(my $sylseppair= $syl{breakpair}) =~ s/(\w)(\w)/(\?<=($1))(\?=($2))/g;
181
182							sub syllable{
183	0			0	1		my $p=shift;
184
185	0						for($p){
186	0						s/$sylseppair/\|/g;
187	0						s{(\w)(?=(\w)(\w))}
188	0	0	0				{if($spri{lc($1)}<$spri{lc($2)} && $spri{lc($2)}>=$spri{lc($3)}){"$1\|"}
	0
	0
189							else{$1}
190							}ge;
191
192	0						s{([a])(i[ru])}{$1\|$2}i; #ditongos and friends
193	0						s{([ioe�])([aoe])}{$1\|$2}ig;
194	0						s{u(ai\|ou\|a)}{u\|$1}i;
195	0						s{([^qg]u)(ei\|iu\|ir\|$acento\|e)}{$1\|$2}i; # continu\|e
196	0						s{([^q]u)(o)}{$1\|$2}i; # quo\|ta; v�cu\|o
197	0						s{([aeio])($acento)}{$1\|$2}i;
198	0						s{([��])($vogal)}{$1\|$2}i;
199	0						s{^a(o\|e)}{a\|$1}i; # a\|onde; a\|orta; a\|er�dromo
200
201	0						s{([qg]u)\\|([ei�])}{$1$2}i;
202	0						s{^($consoante)\\|}{$1}i;
203	0						s{�m$}{�\|_nhem}i;
204							}
205							$p
206	0						}
207
208							# carregar ficheiros necess�rios para transcri��o fon�tica
209							sub initPhon
210							{
211	0			0	1		our $dic = carregaDicionario($ptpho);
212	0						our $no_accented = chargeNoAccented($naoacentuadas);
213							}
214
215							# transcri��o fon�tica
216							sub toPhon {
217	0			0	1		my $word = shift;
218	0						my $prefix = undef;
219	0						my $res = undef;
220	0						$word = lc($word);
221
222	0	0					unless ($word =~ /,/) {
223	0						$res = gfdict($word,$dic); #$dic->{$word};
224							# $res = "$dic->{$1}S" if(!$res && $word =~ /(.*)s$/ );
225
226	0	0	0				unless ($res \|\| length($word)<3) {
227
228	0						$prefix = $word;
229	0		0				do {
230	0						$prefix =~ s{\*$}{}g;
231	0						$prefix =~ s{.$}{*};
232	0						$res = $dic->{$prefix};
233							} until ($res \|\| $prefix =~ m!^\w\*! );
234							}
235
236	0	0					if (defined($prefix)) {
237	0	0					if ($res) {
238	0						$prefix =~ s{\*$}{}g;
239	0						$res =~ s{\*$}{}g;
240	0						$word =~ s/^$prefix/$res/;
241	0						undef($res);
242							}
243							}
244							}
245	0	0					if ($res) {
		0
246	0	0					if ($res =~ /^!/) {
247	0						$res = toPhon2($');
248							}
249							} elsif ($no_accented->{$word}) {
250	0						$res = Lingua::PT::PLN::Words2Sampa::run($word, 0); # debug = 0
251							} else {
252	0						$res = toPhon2($word);
253							}
254	0						return $res;
255							}
256
257							sub toPhon2
258							{
259	0			0	1		my $word = shift;
260	0						my $t = wordaccent($word, 1);
261	0						$t =~ s/\\|//g;
262	0						return Lingua::PT::PLN::Words2Sampa::run($t, 0); # debug = 0
263							}
264
265							sub chargeNoAccented {
266	0			0	1		my $file = shift;
267	0						my $dic;
268	0	0					open F, $file or die ("cannot open dictionary file: $!");
269	0						while() {
270	0						chomp;
271	0						$dic->{$_}++;
272							}
273	0						close F;
274	0						return $dic;
275							}
276
277							sub carregaDicionario {
278	0			0	1		my $file = shift;
279	0						my $dic;
280	0	0					open F, $file or die ("cannot open dicionary file: $!");
281	0						while() {
282	0						chomp;
283	0						my ($a,$b) = split /=/;
284	0						$dic->{$a}=$b;
285							}
286	0						close F;
287	0						return $dic;
288							}
289
290							sub gfdict{
291	0			0	1		my ($word,$dic) = @_;
292	0	0					return "" unless ($word =~ /\w/);
293	0						my $res = $dic->{$word};
294	0	0					unless($res){ $res = $dic->{$1} if( $word =~ /^(.*)s$/ );
	0	0
295	0	0					return "" unless ($res);
296	0	0					if($res =~ /^!/) {$res .= "s"}
	0
	0
297							else {$res .= "S"}
298							}
299	0						$res;
300							}
301
302							sub compara {
303							# ordena pela lista de palavras invertida
304	0			0	1		join(" ", reverse(split(" ",$a))) cmp join(" ", reverse(split(" ",$b)));
305							}
306
307							sub compacta {
308	0			0	1		my $s;
309	0						my $p = shift;
310	0						my $r = $p;
311	0						my $q = $names{$p};
312	0						while ($s = shift) {
313	0	0					if ($s =~ (/^(.+) $p/))
314							{
315	0						$r = "($1) $r" ;
316	0						$q += $names{$s};
317							}
318							else
319							{
320	0						print "$r - $q";
321	0						$r = $s;
322	0						$q = $names{$s};
323							}
324	0						$p=$s;
325							}
326	0						print "$r - $q";
327							}
328
329							my %savit_p = ();
330							my $savit_n = 0;
331
332							sub _savit {
333	0			0			my $a = shift;
334	0						$savit_p{++$savit_n} = $a ;
335	0						" __MARCA__$savit_n "
336							}
337
338							sub _loadit {
339	0			0			my $a = shift;
340	0						$a =~ s/ ?__MARCA__(\d+) ?/$savit_p{$1}/g;
341	0						$savit_n = 0;
342	0						$a;
343							}
344
345							1;
346
347							#sub setlogmax{
348							# $maxlog = shift;
349							# $magicF=$maxlog/log($lmax);
350							### print "Debud .... Maxlog=$maxlog; magic=$magicF\n";
351							#}
352
353							sub _setmax{
354	0			0			$lmax = shift;
355	0						$magicF=$maxlog/log($lmax);
356							## print "Debud .... Max=$lmax; magic=$magicF\n";
357							}
358
359							sub _logit{
360	0			0			my $n=shift;
361	0	0					return 0 unless $n;
362							## print STDERR "...$n,", log($n*$magicF) ,"\n" ;
363	0						log($n)*$magicF
364							}
365
366							1;
367
368							__END__