File Coverage

blib/lib/String/Canonical.pm
Criterion Covered Total %
statement 30 32 93.7
branch 1 2 50.0
condition 4 8 50.0
subroutine 8 10 80.0
pod 4 4 100.0
total 47 56 83.9


line stmt bran cond sub pod time code
1             #!/usr/bin/perl
2              
3             =head1 NAME
4              
5             String::Canonical - Creates canonical strings.
6              
7             =head1 SYNOPSIS
8              
9             use String::Canonical qw/cstr/;
10             print cstr("one thousand maniacs");
11              
12             print String::Canonical::get("Second tier");
13              
14             =head1 DESCRIPTION
15              
16             This module generates a canonical string by converting roman numerals to digits, English descriptions of numbers to digits, stripping off all accents on characters (as well as handling oe = ö, ae = æ, etc.), replacing words with symbols (e.g. and = &, plus = +, etc.) and removing common variant endings.
17              
18             In short, this module generates the same signature for the following
19             strings:
20              
21             bjørk = björk = bjoerk = bjork
22             1,000 maniacs = one thousand maniacs = 1k maniacs
23             Boyz II Men = Boyz To Men = Boyz 2 Men
24             ACDC = AC/DC = AC-DC
25             Rubin and company = Rubin & Company = Rubin & Co.
26             Third Eye Blind = 3rd eye blind
27             Train runnin' = Train Running
28              
29             =cut
30              
31             # --- prologue ----------------------------------------------------------------
32              
33             package String::Canonical;
34              
35             require 5.000;
36              
37 1     1   5492 use warnings;
  1         3  
  1         39  
38 1     1   5 use strict;
  1         2  
  1         35  
39 1     1   5 use Exporter;
  1         5  
  1         47  
40              
41 1     1   2349 use Lingua::EN::Numericalize; # interpret English
  1         127  
  1         88  
42 1     1   723 use Text::Roman qw/roman2int/; # interpret Roman numbers
  1         1168  
  1         52  
43              
44 1     1   5 use vars qw/$VERSION @ISA @EXPORT_OK/;
  1         1  
  1         571  
45             $VERSION = substr q$Revision: 1.2 $, 10;
46             @ISA = qw/Exporter/;
47             @EXPORT_OK = qw/&cstr &cstr_cmp/;
48              
49             my @dx; # deletions
50             my %yx; # transliterations
51             my %sx; # replacements
52              
53             # --- module interface --------------------------------------------------------
54              
55             =head1 INTERFACE
56              
57             The following functions may be imported into the caller package by name:
58              
59             =head2 cstr/get [string = $_]
60              
61             Returns the canonical form of the string passed. If no string is passed, $_ is used. When called in void context the function will set $_. The functon may also be accessed as B but only B may be exported.
62              
63             =cut
64              
65 0     0 1 0 sub get { &cstr; }
66              
67             sub cstr {
68 30   50 30 1 122 my $s = lc(shift || $_) || return;
69 30 50       72 local $_ if defined wantarray();
70              
71 30         4478 $s =~ s/\Q$_\E/$sx{$_}/gi for keys %sx;
72 30         4337 eval "\$s =~ y/$_/$yx{$_}/" for keys %yx;
73 30         595 $s =~ s/\Q$_\E//g for @dx;
74              
75 30         147 ($_, $s) = (str2nbr($s), "");
76 30   66     53678 $s .= roman2int() || $_ for split;
77              
78 30         1218 $s =~ s/[_\W]//g;
79 30         175 $_ = $s;
80             }
81              
82             =head2 cstr_cmp/cmp [string = $_]
83              
84             Compares two strings. Note that if the second string is not provided, $_ is used.
85              
86             =cut
87              
88 0     0 1 0 sub cmp { &cstr_cmp; }
89              
90             sub cstr_cmp {
91 15     15 1 5795 my $s1 = shift;
92 15   33     49 my $s2 = shift || $_;
93              
94 15         36 cstr($s1) eq cstr($s2);
95             }
96              
97             # --- internal structures -----------------------------------------------------
98              
99             @dx = qw/the da/;
100              
101             %sx = (
102             "company" => "co",
103             "brother" => "bro",
104             "to" => 2,
105             "for" => 4,
106             "mister" => "mr",
107             "senior" => "sr",
108             "o'" => "of",
109             "ol'" => "old",
110             "in'" => "ing",
111             "oe" => "o",
112             "ae" => "a",
113             "@" => "at",
114             "&" => "and",
115             "'n" => "and",
116             " n'" => "and",
117             "'n'" => "and",
118             "#" => "no",
119             "nbr" => "no",
120             "number" => "no",
121             "%" => "pct",
122             "percent" => "pct",
123             "volume" => "vol",
124             "ß" => "ss",
125             "+" => "plus",
126             );
127              
128             %yx = (
129             "äÄàÀáÁåÅâÂãÃ" => "a",
130             "ëËèÈéÉêÊ" => "e",
131             "ïÏìÌíÍîÎ" => "i",
132             "öÖòÒóÓôÔõÕ" => "o",
133             "üÜùÙúÚûÛ" => "u",
134             "æÆøØçÇñÑðÐþÞýÝÿÿ"
135             => "aaooccnnddddyyyy",
136             );
137              
138             =head1 AUTHOR
139              
140             Erick Calder
141              
142             =head1 SUPPORT
143              
144             For help and thank you notes, e-mail the author directly. To report a bug, submit a patch or add to our wishlist please visit the CPAN bug manager at: F
145              
146             =head1 AVAILABILITY
147              
148             The latest version of the tarball, RPM and SRPM may always be found at: F Additionally the module is available from CPAN.
149              
150             =head1 LICENCE AND COPYRIGHT
151              
152             This utility is free and distributed under GPL, the Gnu Public License. A copy of this license was included in a file called LICENSE. If for some reason, this file was not included, please see F to obtain a copy of this license.
153              
154             $Id: Canonical.pm,v 1.2 2003/02/15 01:44:39 ekkis Exp $
155              
156             =cut