File Coverage

blib/lib/Lingua/AR/Tashkeel.pm
Criterion Covered Total %
statement 36 36 100.0
branch n/a
condition n/a
subroutine 12 12 100.0
pod 3 4 75.0
total 51 52 98.0


line stmt bran cond sub pod time code
1 4     4   637043 use strict;
  4         6  
  4         106  
2 4     4   13 use warnings;
  4         4  
  4         200  
3             package Lingua::AR::Tashkeel;
4              
5             # ABSTRACT: Subroutines for operating on Arabic vowel marks
6             our $VERSION = '0.003'; # VERSION
7              
8 4     4   15 use Carp;
  4         4  
  4         219  
9 4     4   1737 use charnames ':full';
  4         101071  
  4         21  
10 4     4   1232 use Unicode::Normalize;
  4         161415  
  4         344  
11 4     4   26 use utf8;
  4         5  
  4         23  
12 4     4   2209 use Lingua::AR::Regexp;
  4         6502  
  4         134  
13 4     4   22 use Regexp::CharClasses::Helper;
  4         6  
  4         353  
14              
15              
16             =pod
17              
18             =encoding utf8
19              
20             =head1 NAME
21              
22             Lingua::AR::Tashkeel - Subroutines for handling Arabic vowels and vowel marks
23              
24              
25             =head1 SYNOPSIS
26              
27             use Lingua::AR::Tashkeel;
28              
29             # Strip all short vowels
30             Lingua::AR::Tashkeel->strip('مَكَرُونَة'); ‎# => مكرونة
31             # Heuristic for removing short vowels without causing ambiguity
32             Lingua::AR::Tashkeel->prune('فَتَّة'); ‎# => فتّة
33             # Heuristic for fixing mixed-up short and long vowels
34             Lingua::AR::Tashkeel->fix('ماحشي'); ‎# => مَحشي
35              
36              
37             =head1 DESCRIPTION
38            
39             Subroutines for working with Arabic long (حروف علة) and short vowels (حركات تشكيل).
40              
41             =head1 METHODS AND ARGUMENTS
42              
43             =over 4
44              
45             =item strip($string)
46              
47             Strips away all Arabic short vowels (Tashkeel).
48              
49             =cut
50              
51             sub strip {
52 34     34 1 15015 my $self = shift;
53 34         88 my $string = NFD shift;
54            
55             #$string =~ s/(?[ (\p{InArabic} & \p{Mn}) - \N{ARABIC HAMZA ABOVE} ])//g;
56 34         1495 $string =~ s/\p{Lingua::AR::Regexp::IsTashkeel}//gx;
57              
58 34         69 return NFC $string;
59             }
60              
61             =item prune($string)
62              
63             Heuristic for pruning the short vowels that a native speaker wouldn't write,
64             as leaving them out wouldn't introduce ambiguity.
65              
66             This is often preferable to strip, since Shaddas, or Dammas that indicate a passive verb, are useful clues that one might want to keep.
67              
68             =cut
69              
70             sub InOnesToKeepIn {
71 4     4 0 51673 return Regexp::CharClasses::Helper::fmt(
72             '+Lingua::AR::Regexp::IsTashkeel',
73             '-ARABIC SHADDA',
74             );
75             }
76              
77             sub prune {
78 4     4 1 1657 my $self = shift;
79 4         13 my $string = NFD shift;
80            
81 4         225 $string =~ s/\p{Lingua::AR::Tashkeel::InOnesToKeepIn}//g;
82              
83 4         9 return NFC $string;
84             }
85              
86             =item fix($string)
87              
88             Transliterating from a romanized representation of Arabic to actual Arabic script often gives incorrect results regarding short/long vowels.
89              
90             This subroutine implements a heuristic for fixing such mix-ups.
91              
92             =cut
93              
94             sub fix {
95 2     2 1 549 my $self = shift;
96 2         7 my $string = NFD shift;
97            
98 2         143 return $string;
99             }
100              
101             1;
102             __END__