File Coverage

blib/lib/Text/MostFreqKDistance.pm
Criterion Covered Total %
statement 53 53 100.0
branch 9 10 90.0
condition 2 3 66.6
subroutine 8 8 100.0
pod 2 2 100.0
total 74 76 97.3


line stmt bran cond sub pod time code
1             package Text::MostFreqKDistance;
2              
3             $Text::MostFreqKDistance::VERSION = '0.08';
4             $Text::MostFreqKDistance::AUTHORITY = 'cpan:MANWAR';
5              
6             =head1 NAME
7              
8             Text::MostFreqKDistance - Estimate strings similarity.
9              
10             =head1 VERSION
11              
12             Version 0.08
13              
14             =cut
15              
16 5     5   46139 use 5.006;
  5         15  
17 5     5   21 use strict; use warnings;
  5     5   6  
  5         85  
  5         19  
  5         11  
  5         116  
18 5     5   1790 use Data::Dumper;
  5         25324  
  5         304  
19 5     5   1113 use parent 'Exporter';
  5         1145  
  5         25  
20              
21             our @EXPORT = qw(MostFreqKHashing MostFreqKSDF);
22              
23             =head1 DESCRIPTION
24              
25             In information theory, MostFreqKDistance is a string metric technique for quickly
26             estimating how similar two ordered sets or strings are.The scheme was invented by
27             Sadi Evren SEKER(2014) and initially used in text mining applications like author
28             recognition.
29              
30             Source: L
31              
32             =head1 SYNOPSIS
33              
34             use strict; use warnings;
35             use Text::MostFreqKDistance;
36              
37             print MostFreqKHashing('seeking', 2), "\n";
38             print MostFreqKHashing('research', 2), "\n";
39             print MostFreqKSDF('seeking', 'research', 2, 10), "\n";
40              
41             =head1 METHODS
42              
43             =head2 MostFreqKSDF($str1, $str2, $k, $max_distance)
44              
45             The method is suitable for bioinformatics to compare the genetic strings like in
46             FASTA format.
47              
48             use strict; use warnings;
49             use Text::MostFreqKDistance;
50              
51             my $str1 = 'LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV';
52             my $str2 = 'EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG';
53              
54             print MostFreqKHashing($str1, 2), "\n";
55             print MostFreqKHashing($str2, 2), "\n";
56             print MostFreqKSDF($str1, $str2, 2, 10), "\n";
57              
58             =cut
59              
60             sub MostFreqKSDF {
61 6     6 1 2598 my ($a, $b, $k, $d) = @_;
62              
63 6         14 my $MostFreqKHashing_a = _MostFreqKHashing($a, $k);
64 6         12 my $MostFreqKHashing_b = _MostFreqKHashing($b, $k);
65              
66 6         8 my $MostFreqKSDF = 0;
67 6         11 foreach my $_a (@$MostFreqKHashing_a) {
68 12 50       23 next if ($_a->{key} eq 'NULL');
69 12         15 foreach my $_b (@$MostFreqKHashing_b) {
70 24 100       51 if ($_a->{key} eq $_b->{key}) {
71 6 100       14 if ($_a->{value} == $_b->{value}) {
72 5         7 $MostFreqKSDF += $_a->{value};
73             }
74             else {
75 1         3 $MostFreqKSDF += ($_a->{value} + $_b->{value});
76             }
77             }
78             }
79             }
80              
81 6         29 return ($d - $MostFreqKSDF);
82             }
83              
84             =head2 MostFreqKHashing($str, $k)
85              
86             It simply gets an input C<$str> and an integer C<$k> value. It outputs the most
87             frequent C<$k> characters from the input string. The only condition during the
88             creation of output string is adding the first occurring character first, if the
89             frequencies of two characters are equal.
90              
91             use strict; use warnings;
92             use Text::MostFreqKDistance;
93              
94             print MostFreqKHashing('seeking', 2), "\n";
95             print MostFreqKHashing('research', 2), "\n";
96              
97             =cut
98              
99             sub MostFreqKHashing {
100 12     12 1 5074 my ($string, $k) = @_;
101              
102 12         19 my $MostFreqKHashing = '';
103 12         16 foreach (@{_MostFreqKHashing($string, $k)}) {
  12         22  
104 24         70 $MostFreqKHashing .= sprintf("%s%d", $_->{key}, $_->{value});
105             }
106              
107 12         47 return $MostFreqKHashing;
108             }
109              
110             #
111             #
112             # PRIVATE METHODS
113              
114             sub _MostFreqKHashing {
115 24     24   36 my ($string, $k) = @_;
116              
117 24         36 my $seen = {};
118 24         39 my %chars = ();
119 24         32 my $chars = [];
120 24         27 my $i = 0;
121 24         86 foreach (split //,$string) {
122 418         572 $chars{$_}++;
123 418         903 $chars->[$i++] = $_;
124             }
125              
126 24         104 my @chars = sort { $chars{$b} <=> $chars{$a} } keys(%chars);
  364         498  
127 24         43 my $MostFreqKHashing = [];
128 24         52 foreach my $j (0..($k-1)) {
129 48         73 foreach (@$chars) {
130 136 100 66     336 next if (defined $seen && exists $seen->{$_});
131 106 100       198 if ($chars{$_} == $chars{$chars[$j]}) {
132 46         67 $seen->{$_} = 1;
133 46         105 push @$MostFreqKHashing, { key => $_, value => $chars{$_} };
134 46         75 last;
135             }
136             }
137             }
138              
139 24         56 foreach (1..($k-(keys %$seen))) {
140 2         5 push @$MostFreqKHashing, { key => 'NULL', value => 0 };
141             }
142              
143 24         97 return $MostFreqKHashing;
144             }
145              
146             =head1 AUTHOR
147              
148             Mohammad S Anwar, C<< >>
149              
150             =head1 REPOSITORY
151              
152             L
153              
154             =head1 BUGS
155              
156             Please report any bugs or feature requests to C,
157             or through the web interface at L.
158             I will be notified and then you'll automatically be notified of progress on your
159             bug as I make changes.
160              
161             =head1 SUPPORT
162              
163             You can find documentation for this module with the perldoc command.
164              
165             perldoc Text::MostFreqKDistance
166              
167             You can also look for information at:
168              
169             =over 4
170              
171             =item * RT: CPAN's request tracker (report bugs here)
172              
173             L
174              
175             =item * AnnoCPAN: Annotated CPAN documentation
176              
177             L
178              
179             =item * CPAN Ratings
180              
181             L
182              
183             =item * Search CPAN
184              
185             L
186              
187             =back
188              
189             =head1 LICENSE AND COPYRIGHT
190              
191             Copyright (C) 2015 - 2017 Mohammad S Anwar.
192              
193             This program is free software; you can redistribute it and / or modify it under
194             the terms of the the Artistic License (2.0). You may obtain a copy of the full
195             license at:
196              
197             L
198              
199             Any use, modification, and distribution of the Standard or Modified Versions is
200             governed by this Artistic License.By using, modifying or distributing the Package,
201             you accept this license. Do not use, modify, or distribute the Package, if you do
202             not accept this license.
203              
204             If your Modified Version has been derived from a Modified Version made by someone
205             other than you,you are nevertheless required to ensure that your Modified Version
206             complies with the requirements of this license.
207              
208             This license does not grant you the right to use any trademark, service mark,
209             tradename, or logo of the Copyright Holder.
210              
211             This license includes the non-exclusive, worldwide, free-of-charge patent license
212             to make, have made, use, offer to sell, sell, import and otherwise transfer the
213             Package with respect to any patent claims licensable by the Copyright Holder that
214             are necessarily infringed by the Package. If you institute patent litigation
215             (including a cross-claim or counterclaim) against any party alleging that the
216             Package constitutes direct or contributory patent infringement,then this Artistic
217             License to you shall terminate on the date that such litigation is filed.
218              
219             Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND
220             CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED
221             WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR
222             NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL LAW. UNLESS
223             REQUIRED BY LAW, NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT,
224             INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE
225             OF THE PACKAGE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
226              
227             =cut
228              
229             1; # End of Text-MostFreqKDistance