File Coverage

blib/lib/Lingua/LO/NLP/Syllabify.pm
Criterion Covered Total %
statement 48 48 100.0
branch 6 6 100.0
condition n/a
subroutine 14 14 100.0
pod 3 3 100.0
total 71 71 100.0


line stmt bran cond sub pod time code
1             package Lingua::LO::NLP::Syllabify;
2 6     6   125151 use strict;
  6         16  
  6         163  
3 6     6   28 use warnings;
  6         12  
  6         137  
4 6     6   87 use 5.012000;
  6         23  
5 6     6   30 use utf8;
  6         12  
  6         45  
6 6     6   168 use feature 'unicode_strings';
  6         11  
  6         513  
7 6     6   227 use version 0.77; our $VERSION = version->declare('v1.0.1');
  6         1589  
  6         33  
8 6     6   474 use charnames qw/ :full lao /;
  6         10  
  6         42  
9 6     6   2980 use Carp;
  6         14  
  6         326  
10 6     6   1826 use Unicode::Normalize qw/ NFC /;
  6         7629  
  6         406  
11 6     6   903 use Class::Accessor::Fast 'antlers';
  6         5922  
  6         43  
12 6     6   1996 use Lingua::LO::NLP::Data ':all';
  6         20  
  6         2726  
13              
14             =encoding utf8
15              
16             =head1 NAME
17              
18             Lingua::LO::NLP::Syllabify - Segment Lao or mixed-script text into syllables.
19              
20             =head1 FUNCTION
21              
22             This implements a purely regular expression based algorithm to segment Lao text
23             into syllables, based on the one described in PHISSAMAY et al:
24             I<Syllabification of Lao Script for Line Breaking>.
25              
26             =cut
27              
28             has text => (is => 'ro');
29              
30             my $syl_re = Lingua::LO::NLP::Data::get_sylre_basic;
31             my $complete_syl_re = Lingua::LO::NLP::Data::get_sylre_full;
32              
33             =head1 METHODS
34              
35             =head2 new
36              
37             C<< my $o = Lingua::LO::NLP::Syllabify->new( $text, %options ) >>
38              
39             The constructor takes a mandatory argument containing the text to split, and
40             any number of hash-style named options. Currently, the only such option is
41             C<normalize> which takes a boolean argument and indicates whether to run the
42             text through a normalization function that swaps tone marks and vowels appearing
43             in the wrong order.
44              
45             Note that in any case text is passed through L<Unicode::Normalize/NFC> first
46             to obtain the Composed Normal Form. In pure Lao text, this affects only the
47             decomposed form of LAO VOWEL SIGN AM that will be transformed from C<U+0ECD>,
48             C<U+0EB2> to C<U+0EB3>.
49              
50             =cut
51              
52             sub new {
53 77     77 1 22615 my $class = shift;
54 77         165 my $text = shift;
55 77 100       458 croak("`text' argument missing or undefined") unless defined $text;
56 76         204 my %opts = @_;
57 76         655 $text = NFC( $text );
58 76 100       261 normalize_tone_marks($text) if $opts{normalize};
59 76         391 return bless { text => $text }, $class
60             }
61              
62             =head2 get_syllables
63              
64             C<get_syllables()>
65              
66             Returns a list of Lao syllables found in the text passed to the constructor. If
67             there are any blanks, non-Lao parts etc. mixed in, they will be silently
68             dropped.
69              
70             =cut
71              
72             sub get_syllables {
73 22     22 1 890 return shift->text =~ m/($complete_syl_re)/og;
74             }
75              
76             =head2 get_fragments
77              
78             C<get_fragments()>
79              
80             Returns a complete segmentation of the text passed to the constructor as an
81             array of hashes. Each hash has two keys:
82              
83             =over 4
84              
85             =item C<text>
86              
87             The text of the respective fragment
88              
89             =item C<is_lao>
90              
91             If true, the fragment is a single valid Lao syllable. If
92             false, it may be whitespace, non-Lao script, Lao characters that don't
93             constitute valid syllables - basically anything at all that's I<not> a valid
94             syllable.
95              
96             =back
97              
98             =cut
99              
100             sub get_fragments {
101 53     53 1 153 my $self = shift;
102 53         1126 my $t = $self->text;
103 53         318 my @matches;
104 53         3201 while($t =~ /\G($complete_syl_re | .+?(?=$complete_syl_re|$) )/oxgcs) {
105 122 100       4732 unless($1 eq "\N{ZERO WIDTH SPACE}") {
106 120         268 my $match = $1;
107 120         4151 push @matches, { text => $match, is_lao => scalar($match =~ /^$syl_re/) };
108             }
109             }
110             return @matches
111 53         248 }
112              
113             1;