File Coverage

blib/lib/Text/MostFreqKDistance.pm
Criterion Covered Total %
statement 56 56 100.0
branch 15 16 93.7
condition 2 3 66.6
subroutine 8 8 100.0
pod 2 2 100.0
total 83 85 97.6


line stmt bran cond sub pod time code
1             package Text::MostFreqKDistance;
2              
3             $Text::MostFreqKDistance::VERSION = '0.09';
4             $Text::MostFreqKDistance::AUTHORITY = 'cpan:MANWAR';
5              
6             =head1 NAME
7              
8             Text::MostFreqKDistance - Estimate strings similarity.
9              
10             =head1 VERSION
11              
12             Version 0.09
13              
14             =cut
15              
16 5     5   73226 use 5.006;
  5         40  
17 5     5   27 use strict; use warnings;
  5     5   8  
  5         106  
  5         24  
  5         29  
  5         207  
18 5     5   3174 use Data::Dumper;
  5         34993  
  5         388  
19 5     5   2110 use parent 'Exporter';
  5         1542  
  5         29  
20              
21             our @EXPORT = qw(MostFreqKHashing MostFreqKSDF);
22              
23             =head1 DESCRIPTION
24              
25             In information theory, MostFreqKDistance is a string metric technique for quickly
26             estimating how similar two ordered sets or strings are.The scheme was invented by
27             Sadi Evren SEKER(2014) and initially used in text mining applications like author
28             recognition.
29              
30             Source: L
31              
32             =head1 SYNOPSIS
33              
34             use strict; use warnings;
35             use Text::MostFreqKDistance;
36              
37             print MostFreqKHashing('seeking', 2), "\n";
38             print MostFreqKHashing('research', 2), "\n";
39             print MostFreqKSDF('seeking', 'research', 2, 10), "\n";
40              
41             =head1 METHODS
42              
43             =head2 MostFreqKSDF($str1, $str2, $k, $max_distance)
44              
45             The method is suitable for bioinformatics to compare the genetic strings like in
46             FASTA format.
47              
48             use strict; use warnings;
49             use Text::MostFreqKDistance;
50              
51             my $str1 = 'LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV';
52             my $str2 = 'EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG';
53              
54             print MostFreqKHashing($str1, 2), "\n";
55             print MostFreqKHashing($str2, 2), "\n";
56             print MostFreqKSDF($str1, $str2, 2, 10), "\n";
57              
58             =cut
59              
60             sub MostFreqKSDF {
61 6     6 1 3108 my ($a, $b, $k, $d) = @_;
62              
63 6         17 my $MostFreqKHashing_a = _MostFreqKHashing($a, $k);
64 6         14 my $MostFreqKHashing_b = _MostFreqKHashing($b, $k);
65              
66 6         11 my $MostFreqKSDF = 0;
67 6         12 foreach my $_a (@$MostFreqKHashing_a) {
68 12 50       23 next if ($_a->{key} eq 'NULL');
69 12         19 foreach my $_b (@$MostFreqKHashing_b) {
70 24 100       49 if ($_a->{key} eq $_b->{key}) {
71 6 100       14 if ($_a->{value} == $_b->{value}) {
72 5         10 $MostFreqKSDF += $_a->{value};
73             }
74             else {
75 1         3 $MostFreqKSDF += ($_a->{value} + $_b->{value});
76             }
77             }
78             }
79             }
80              
81 6         38 return ($d - $MostFreqKSDF);
82             }
83              
84             =head2 MostFreqKHashing($str, $k)
85              
86             It simply gets an input C<$str> and an integer C<$k> value. It outputs the most
87             frequent C<$k> characters from the input string. The only condition during the
88             creation of output string is adding the first occurring character first, if the
89             frequencies of two characters are equal.
90              
91             use strict; use warnings;
92             use Text::MostFreqKDistance;
93              
94             print MostFreqKHashing('seeking', 2), "\n";
95             print MostFreqKHashing('research', 2), "\n";
96              
97             =cut
98              
99             sub MostFreqKHashing {
100 16     16 1 7784 my ($string, $k) = @_;
101              
102 16 100       50 die "ERROR: Missing source string.\n" unless defined $string;
103 15 100       37 die "ERROR: Missing frequency value.\n" unless defined $k;
104 14 100       89 die "ERROR: Invalid frequency value [$k].\n" unless ($k =~ /^[0-9]+$/);
105              
106 12         26 my $MostFreqKHashing = '';
107 12         18 foreach (@{_MostFreqKHashing($string, $k)}) {
  12         28  
108 24         75 $MostFreqKHashing .= sprintf("%s%d", $_->{key}, $_->{value});
109             }
110              
111 12         53 return $MostFreqKHashing;
112             }
113              
114             #
115             #
116             # PRIVATE METHODS
117              
118             sub _MostFreqKHashing {
119 24     24   53 my ($string, $k) = @_;
120              
121 24         39 my $seen = {};
122 24         47 my %chars = ();
123 24         53 my $chars = [];
124 24         40 my $i = 0;
125 24         104 foreach (split //,$string) {
126 418         693 $chars{$_}++;
127 418         726 $chars->[$i++] = $_;
128             }
129              
130 24         126 my @chars = sort { $chars{$b} <=> $chars{$a} } keys(%chars);
  366         557  
131 24         54 my $MostFreqKHashing = [];
132 24         414 foreach my $j (0..($k-1)) {
133 48         81 foreach (@$chars) {
134 136 100 66     394 next if (defined $seen && exists $seen->{$_});
135 106 100       214 if ($chars{$_} == $chars{$chars[$j]}) {
136 46         80 $seen->{$_} = 1;
137 46         145 push @$MostFreqKHashing, { key => $_, value => $chars{$_} };
138 46         93 last;
139             }
140             }
141             }
142              
143 24         63 foreach (1..($k-(keys %$seen))) {
144 2         11 push @$MostFreqKHashing, { key => 'NULL', value => 0 };
145             }
146              
147 24         115 return $MostFreqKHashing;
148             }
149              
150             =head1 AUTHOR
151              
152             Mohammad S Anwar, C<< >>
153              
154             =head1 REPOSITORY
155              
156             L
157              
158             =head1 BUGS
159              
160             Please report any bugs or feature requests to C,
161             or through the web interface at L.
162             I will be notified and then you'll automatically be notified of progress on your
163             bug as I make changes.
164              
165             =head1 SUPPORT
166              
167             You can find documentation for this module with the perldoc command.
168              
169             perldoc Text::MostFreqKDistance
170              
171             You can also look for information at:
172              
173             =over 4
174              
175             =item * RT: CPAN's request tracker (report bugs here)
176              
177             L
178              
179             =item * AnnoCPAN: Annotated CPAN documentation
180              
181             L
182              
183             =item * CPAN Ratings
184              
185             L
186              
187             =item * Search CPAN
188              
189             L
190              
191             =back
192              
193             =head1 LICENSE AND COPYRIGHT
194              
195             Copyright (C) 2015 - 2017 Mohammad S Anwar.
196              
197             This program is free software; you can redistribute it and / or modify it under
198             the terms of the the Artistic License (2.0). You may obtain a copy of the full
199             license at:
200              
201             L
202              
203             Any use, modification, and distribution of the Standard or Modified Versions is
204             governed by this Artistic License.By using, modifying or distributing the Package,
205             you accept this license. Do not use, modify, or distribute the Package, if you do
206             not accept this license.
207              
208             If your Modified Version has been derived from a Modified Version made by someone
209             other than you,you are nevertheless required to ensure that your Modified Version
210             complies with the requirements of this license.
211              
212             This license does not grant you the right to use any trademark, service mark,
213             tradename, or logo of the Copyright Holder.
214              
215             This license includes the non-exclusive, worldwide, free-of-charge patent license
216             to make, have made, use, offer to sell, sell, import and otherwise transfer the
217             Package with respect to any patent claims licensable by the Copyright Holder that
218             are necessarily infringed by the Package. If you institute patent litigation
219             (including a cross-claim or counterclaim) against any party alleging that the
220             Package constitutes direct or contributory patent infringement,then this Artistic
221             License to you shall terminate on the date that such litigation is filed.
222              
223             Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND
224             CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED
225             WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR
226             NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL LAW. UNLESS
227             REQUIRED BY LAW, NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT,
228             INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE
229             OF THE PACKAGE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
230              
231             =cut
232              
233             1; # End of Text-MostFreqKDistance