File Coverage

blib/lib/Text/MostFreqKDistance.pm
Criterion Covered Total %
statement 62 62 100.0
branch 27 28 96.4
condition 2 3 66.6
subroutine 8 8 100.0
pod 2 2 100.0
total 101 103 98.0


line stmt bran cond sub pod time code
1             package Text::MostFreqKDistance;
2              
3             $Text::MostFreqKDistance::VERSION = '0.10';
4             $Text::MostFreqKDistance::AUTHORITY = 'cpan:MANWAR';
5              
6             =head1 NAME
7              
8             Text::MostFreqKDistance - Estimate strings similarity.
9              
10             =head1 VERSION
11              
12             Version 0.10
13              
14             =cut
15              
16 5     5   72300 use 5.006;
  5         37  
17 5     5   26 use strict; use warnings;
  5     5   8  
  5         112  
  5         30  
  5         39  
  5         155  
18 5     5   3200 use Data::Dumper;
  5         34063  
  5         368  
19 5     5   2173 use parent 'Exporter';
  5         1468  
  5         27  
20              
21             our @EXPORT = qw(MostFreqKHashing MostFreqKSDF);
22              
23             =head1 DESCRIPTION
24              
25             In information theory, MostFreqKDistance is a string metric technique for quickly
26             estimating how similar two ordered sets or strings are.The scheme was invented by
27             Sadi Evren SEKER(2014) and initially used in text mining applications like author
28             recognition.
29              
30             Source: L
31              
32             =head1 SYNOPSIS
33              
34             use strict; use warnings;
35             use Text::MostFreqKDistance;
36              
37             print MostFreqKHashing('seeking', 2), "\n";
38             print MostFreqKHashing('research', 2), "\n";
39             print MostFreqKSDF('seeking', 'research', 2, 10), "\n";
40              
41             =head1 METHODS
42              
43             =head2 MostFreqKSDF($str1, $str2, $k, $max_distance)
44              
45             The method is suitable for bioinformatics to compare the genetic strings like in
46             FASTA format.
47              
48             use strict; use warnings;
49             use Text::MostFreqKDistance;
50              
51             my $str1 = 'LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV';
52             my $str2 = 'EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG';
53              
54             print MostFreqKHashing($str1, 2), "\n";
55             print MostFreqKHashing($str2, 2), "\n";
56             print MostFreqKSDF($str1, $str2, 2, 10), "\n";
57              
58             =cut
59              
60             sub MostFreqKSDF {
61 12     12 1 5184 my ($a, $b, $k, $d) = @_;
62              
63 12 100       41 die "ERROR: Missing source string (first).\n" unless defined $a;
64 11 100       25 die "ERROR: Missing source string (second).\n" unless defined $b;
65 10 100       21 die "ERROR: Missing frequency value.\n" unless defined $k;
66 9 100       51 die "ERROR: Invalid frequency value [$k].\n" unless ($k =~ /^[0-9]+$/);
67 8 100       18 die "ERROR: Missing max distance value.\n" unless defined $d;
68 7 100       25 die "ERROR: Invalid max distance value [$d].\n" unless ($d =~ /^[0-9]+$/);
69              
70 6         16 my $MostFreqKHashing_a = _MostFreqKHashing($a, $k);
71 6         15 my $MostFreqKHashing_b = _MostFreqKHashing($b, $k);
72              
73 6         9 my $MostFreqKSDF = 0;
74 6         10 foreach my $_a (@$MostFreqKHashing_a) {
75 12 50       26 next if ($_a->{key} eq 'NULL');
76 12         19 foreach my $_b (@$MostFreqKHashing_b) {
77 24 100       42 if ($_a->{key} eq $_b->{key}) {
78 6 100       10 if ($_a->{value} == $_b->{value}) {
79 5         21 $MostFreqKSDF += $_a->{value};
80             }
81             else {
82 1         3 $MostFreqKSDF += ($_a->{value} + $_b->{value});
83             }
84             }
85             }
86             }
87              
88 6         36 return ($d - $MostFreqKSDF);
89             }
90              
91             =head2 MostFreqKHashing($str, $k)
92              
93             It simply gets an input C<$str> and an integer C<$k> value. It outputs the most
94             frequent C<$k> characters from the input string. The only condition during the
95             creation of output string is adding the first occurring character first, if the
96             frequencies of two characters are equal.
97              
98             use strict; use warnings;
99             use Text::MostFreqKDistance;
100              
101             print MostFreqKHashing('seeking', 2), "\n";
102             print MostFreqKHashing('research', 2), "\n";
103              
104             =cut
105              
106             sub MostFreqKHashing {
107 16     16 1 6237 my ($string, $k) = @_;
108              
109 16 100       43 die "ERROR: Missing source string.\n" unless defined $string;
110 15 100       30 die "ERROR: Missing frequency value.\n" unless defined $k;
111 14 100       73 die "ERROR: Invalid frequency value [$k].\n" unless ($k =~ /^[0-9]+$/);
112              
113 12         20 my $MostFreqKHashing = '';
114 12         16 foreach (@{_MostFreqKHashing($string, $k)}) {
  12         23  
115 24         66 $MostFreqKHashing .= sprintf("%s%d", $_->{key}, $_->{value});
116             }
117              
118 12         42 return $MostFreqKHashing;
119             }
120              
121             #
122             #
123             # PRIVATE METHODS
124              
125             sub _MostFreqKHashing {
126 24     24   42 my ($string, $k) = @_;
127              
128 24         34 my $seen = {};
129 24         37 my %chars = ();
130 24         33 my $chars = [];
131 24         30 my $i = 0;
132 24         85 foreach (split //,$string) {
133 418         570 $chars{$_}++;
134 418         657 $chars->[$i++] = $_;
135             }
136              
137 24         104 my @chars = sort { $chars{$b} <=> $chars{$a} } keys(%chars);
  359         490  
138 24         63 my $MostFreqKHashing = [];
139 24         76 foreach my $j (0..($k-1)) {
140 48         66 foreach (@$chars) {
141 136 100 66     356 next if (defined $seen && exists $seen->{$_});
142 106 100       176 if ($chars{$_} == $chars{$chars[$j]}) {
143 46         81 $seen->{$_} = 1;
144 46         103 push @$MostFreqKHashing, { key => $_, value => $chars{$_} };
145 46         78 last;
146             }
147             }
148             }
149              
150 24         54 foreach (1..($k-(keys %$seen))) {
151 2         7 push @$MostFreqKHashing, { key => 'NULL', value => 0 };
152             }
153              
154 24         115 return $MostFreqKHashing;
155             }
156              
157             =head1 AUTHOR
158              
159             Mohammad S Anwar, C<< >>
160              
161             =head1 REPOSITORY
162              
163             L
164              
165             =head1 BUGS
166              
167             Please report any bugs or feature requests to C,
168             or through the web interface at L.
169             I will be notified and then you'll automatically be notified of progress on your
170             bug as I make changes.
171              
172             =head1 SUPPORT
173              
174             You can find documentation for this module with the perldoc command.
175              
176             perldoc Text::MostFreqKDistance
177              
178             You can also look for information at:
179              
180             =over 4
181              
182             =item * RT: CPAN's request tracker (report bugs here)
183              
184             L
185              
186             =item * AnnoCPAN: Annotated CPAN documentation
187              
188             L
189              
190             =item * CPAN Ratings
191              
192             L
193              
194             =item * Search CPAN
195              
196             L
197              
198             =back
199              
200             =head1 LICENSE AND COPYRIGHT
201              
202             Copyright (C) 2015 - 2017 Mohammad S Anwar.
203              
204             This program is free software; you can redistribute it and / or modify it under
205             the terms of the the Artistic License (2.0). You may obtain a copy of the full
206             license at:
207              
208             L
209              
210             Any use, modification, and distribution of the Standard or Modified Versions is
211             governed by this Artistic License.By using, modifying or distributing the Package,
212             you accept this license. Do not use, modify, or distribute the Package, if you do
213             not accept this license.
214              
215             If your Modified Version has been derived from a Modified Version made by someone
216             other than you,you are nevertheless required to ensure that your Modified Version
217             complies with the requirements of this license.
218              
219             This license does not grant you the right to use any trademark, service mark,
220             tradename, or logo of the Copyright Holder.
221              
222             This license includes the non-exclusive, worldwide, free-of-charge patent license
223             to make, have made, use, offer to sell, sell, import and otherwise transfer the
224             Package with respect to any patent claims licensable by the Copyright Holder that
225             are necessarily infringed by the Package. If you institute patent litigation
226             (including a cross-claim or counterclaim) against any party alleging that the
227             Package constitutes direct or contributory patent infringement,then this Artistic
228             License to you shall terminate on the date that such litigation is filed.
229              
230             Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND
231             CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED
232             WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR
233             NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL LAW. UNLESS
234             REQUIRED BY LAW, NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT,
235             INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE
236             OF THE PACKAGE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
237              
238             =cut
239              
240             1; # End of Text-MostFreqKDistance