File Coverage

blib/lib/Lingua/LO/NLP/Syllabify.pm
Criterion Covered Total %
statement 48 48 100.0
branch 6 6 100.0
condition n/a
subroutine 14 14 100.0
pod 3 3 100.0
total 71 71 100.0


line stmt bran cond sub pod time code
1             package Lingua::LO::NLP::Syllabify;
2 6     6   113435 use strict;
  6         21  
  6         190  
3 6     6   37 use warnings;
  6         15  
  6         178  
4 6     6   118 use 5.012000;
  6         24  
5 6     6   41 use utf8;
  6         13  
  6         58  
6 6     6   214 use feature 'unicode_strings';
  6         15  
  6         808  
7 6     6   246 use version 0.77; our $VERSION = version->declare('v1.0.1');
  6         1494  
  6         44  
8 6     6   658 use charnames qw/ :full lao /;
  6         14  
  6         64  
9 6     6   3962 use Carp;
  6         17  
  6         428  
10 6     6   2786 use Unicode::Normalize qw/ NFC /;
  6         11269  
  6         577  
11 6     6   1526 use Class::Accessor::Fast 'antlers';
  6         8874  
  6         55  
12 6     6   2550 use Lingua::LO::NLP::Data ':all';
  6         29  
  6         3551  
13              
14             =encoding utf8
15              
16             =head1 NAME
17              
18             Lingua::LO::NLP::Syllabify - Segment Lao or mixed-script text into syllables.
19              
20             =head1 FUNCTION
21              
22             This implements a purely regular expression based algorithm to segment Lao text
23             into syllables, based on the one described in PHISSAMAY et al:
24             I<Syllabification of Lao Script for Line Breaking>.
25              
26             =cut
27              
28             has text => (is => 'ro');  # read-only accessor for the text slot (antlers syntax of Class::Accessor::Fast)
29              
30             my $syl_re = Lingua::LO::NLP::Data::get_sylre_basic;  # basic pattern; get_fragments uses it to set the is_lao flag
31             my $complete_syl_re = Lingua::LO::NLP::Data::get_sylre_full;  # full pattern; used by get_syllables and to split text in get_fragments
32              
33             =head1 METHODS
34              
35             =head2 new
36              
37             C<new( $text, %options )>
38              
39             The constructor takes a mandatory argument containing the text to split, and
40             any number of hash-style named options. Currently, the only such option is
41             C<normalize> which takes a boolean argument and indicates whether to run the
42             text through a normalization function that swaps tone marks and vowels appearing
43             in the wrong order.
44              
45             Note that in any case text is passed through L<Unicode::Normalize/NFC> first
46             to obtain the Composed Normal Form. In pure Lao text, this affects only the
47             decomposed form of LAO VOWEL SIGN AM that will be transformed from C<U+0ECD>,
48             C<U+0EB2> to C<U+0EB3>.
49              
50             =cut
51              
52             sub new {  # Constructor: takes the text plus optional name/value options; croaks if the text is undefined
53 77     77 1 17497 my $class = shift;  # invocant; the new object is blessed into it
54 77         118 my $text = shift;  # mandatory text to segment
55 77 100       410 croak("`text' argument missing or undefined") unless defined $text;
56 76         157 my %opts = @_;  # everything after the text is read as option pairs
57 76         527 $text = NFC( $text );  # always convert to Unicode NFC, regardless of options
58 76 100       206 normalize_tone_marks($text) if $opts{normalize};  # return value is unused, so this presumably edits the text in place (helper not visible here - confirm)
59 76         333 return bless { text => $text }, $class  # object is a hash holding only the text slot
60             }
61              
62             =head2 get_syllables
63              
64             C<get_syllables()>
65              
66             Returns a list of Lao syllables found in the text passed to the constructor. If
67             there are any blanks, non-Lao parts etc. mixed in, they will be silently
68             dropped.
69              
70             =cut
71              
72             sub get_syllables {  # Returns all matches of the full syllable pattern found in the stored text
73 22     22 1 255 return shift->text =~ m/($complete_syl_re)/og;  # list context yields every captured syllable; scalar context yields only match success
74             }
75              
76             =head2 get_fragments
77              
78             C<get_fragments()>
79              
80             Returns a complete segmentation of the text passed to the constructor as a
81             list of hash references. Each hash has two keys:
82              
83             =over 4
84              
85             =item C<text>
86              
87             The text of the respective fragment
88              
89             =item C<is_lao>
90              
91             If true, the fragment is a single valid Lao syllable. If
92             false, it may be whitespace, non-Lao script, Lao characters that don't
93             constitute valid syllables - basically anything at all that's I<not> a valid
94             syllable.
95              
96             =back
97              
98             =cut
99              
100             sub get_fragments {  # Splits the stored text into consecutive fragments and returns them as a list of hash references
101 53     53 1 122 my $self = shift;
102 53         136 my $t = $self->text;  # local copy; the loop below tracks its match position on this variable
103 53         219 my @matches;  # collected fragments, in text order
104 53         2717 while($t =~ /\G($complete_syl_re | .+?(?=$complete_syl_re|$) )/oxgcs) {  # from the last position take one full syllable, else the shortest run up to the next syllable or the end
105 122 100       4756 unless($1 eq "\N{ZERO WIDTH SPACE}") {  # a fragment that is exactly one zero width space is dropped
106 120         221 my $match = $1;  # copy the capture before the next match below overwrites it
107 120         3527 push @matches, { text => $match, is_lao => scalar($match =~ /^$syl_re/) };  # is_lao is tested with the basic pattern anchored at the fragment start
108             }
109             }
110             return @matches  # a list, not an array reference
111 53         200 }
112              
113             1;