File Coverage

blib/lib/Lingua/LO/Transform/Syllables.pm
Criterion Covered Total %
statement 45 45 100.0
branch 3 4 75.0
condition n/a
subroutine 14 14 100.0
pod 3 3 100.0
total 65 66 98.4


line stmt bran cond sub pod time code
1             package Lingua::LO::Transform::Syllables;
2 3     3   223569 use strict;
  3         3  
  3         72  
3 3     3   10 use warnings;
  3         3  
  3         66  
4 3     3   54 use 5.012000;
  3         7  
5 3     3   9 use utf8;
  3         3  
  3         15  
6 3     3   54 use feature 'unicode_strings';
  3         5  
  3         238  
7 3     3   396 use version 0.77; our $VERSION = version->declare('v0.0.1');
  3         1338  
  3         13  
8 3     3   204 use charnames qw/ :full lao /;
  3         4  
  3         16  
9 3     3   1279 use Carp;
  3         3  
  3         181  
10 3     3   1160 use Unicode::Normalize qw/ NFC /;
  3         328268  
  3         273  
11 3     3   1534 use Class::Accessor::Fast 'antlers';
  3         6092  
  3         22  
12 3     3   1270 use Lingua::LO::Transform::Data;
  3         4  
  3         723  
13              
14             =encoding UTF-8
15              
16             =head1 NAME
17              
18             Lingua::LO::Transform::Syllables - Segment Lao or mixed-script text into syllables.
19              
20             =head1 FUNCTION
21              
22             This implements a purely regular expression based algorithm to segment Lao text into syllables, based
23             on the one described in PHISSAMAY et al: I.
24              
25             =cut
26              
27             has text => (is => 'ro');
28              
29             my $syl_re = Lingua::LO::Transform::Data::get_sylre_basic;
30             my $complete_syl_re = Lingua::LO::Transform::Data::get_sylre_full;
31              
32             =head1 METHODS
33              
34             =head2 new
35              
36             C $text, ... )>
37              
38             The constructor takes hash-style named arguments. The only one defined so far
39             is C whose value is obviously the text to be segmented.
40              
41             Note that text is passed through L<"Unicode::Normalize"/NFC> first to obtain the Composed Normal Form. In pure Lao text, this affects only the decomposed form of LAO VOWEL SIGN AM that will be transformed from C,C to C.
42              
43             =cut
44              
45             sub new {
46 42     42 1 11861 my $class = shift;
47 42         88 my %opts = @_;
48 42 50       89 croak("`text' key missing or undefined") unless defined $opts{text};
49             return bless {
50 42         96 text => NFC( $opts{text} ),
51             }, $class;
52             }
53              
54             =head2 get_syllables
55              
56             C
57              
58             Returns a list of Lao syllables found in the text passed to the constructor. If
59             there are any blanks, non-Lao parts etc. mixed in, they will be silently
60             dropped.
61              
62             =cut
63              
64             sub get_syllables {
65 20     20 1 3937 return shift->text =~ m/($complete_syl_re)/og;
66             }
67              
68             =head2 get_fragments
69              
70             C
71              
72             Returns a complete segmentation of the text passed to the constructor as an
73             array of hashes. Each hash has two keys:
74              
75             =over 4
76              
77             =item C: the text of the respective fragment
78              
79             =item C: if true, the fragment is a single valid Lao syllable. If
80             false, it may be whitespace, non-Lao script, Lao characters that don't
81             constitute valid syllables - basically anything at all that's I a valid
82             syllable.
83              
84             =back
85              
86             =cut
87              
88             sub get_fragments {
89 21     21 1 2612 my $self = shift;
90 21         45 my $t = $self->text;
91 21         76 my @matches;
92 21         1230 while($t =~ /\G($complete_syl_re | .+?(?=$complete_syl_re|$) )/oxgcs) {
93 50 100       2465 unless($1 eq "\N{ZERO WIDTH SPACE}") {
94 48         52 my $match = $1;
95 48         1432 push @matches, { text => $match, is_lao => scalar($match =~ /^$syl_re/) };
96             }
97             }
98             return @matches
99 21         93 }
100              
101             1;