File Coverage

blib/lib/Finance/CompanyNames.pm

Criterion	Covered	Total	%
statement	152	175	86.8
branch	28	54	51.8
condition	18	39	46.1
subroutine	11	12	91.6
pod	0	9	0.0
total	209	289	72.3

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							Finance::CompanyNames - Functions for finding company names in English free text
4
5							=head1 SYNOPSIS
6
7							use Finance::CompanyNames;
8
9							my $corps = {
10							MSFT => 'Microsoft'
11							, INTC => 'Intel'
12							, etc...
13							};
14
15							Finance::CompanyNames::Init($corps)
16							$hashref = Finance::CompanyNames::Match($freetext);
17
18							=head1 DESCRIPTION
19
20							Finance::CompanyNames finds company names in English text. The user provides
21							a list of company names they wish to find, and the body of text to search.
22							The module then uses natural language processing techniques to find those
23							names or their variants in the text. For example, if a company is alternately
24							referred to as "XYZ", "XYZ Corp.", "XYZ Corporation", and "The XYZ Corporation",
25							Finance::CompanyNames will recognize all variants.
26
27							=head1 INTERFACE
28
29							=head2 Initialization
30
31							It is necessary to call Finance::CompanyNames::Init() before anything else.
32							The argument to this function is a reference to a hash. The canonical use
33							is to use stock tickers as the keys and company names as values. However, you
34							are free to use anything for the keys.
35
36							=head2 Searching
37
38							Finance::CompanyNames::Match searches a body of text for company names. The only
39							argument is a scalar containing the text. The return value is a reference to a hash
40							of references to hashes. The keys are the stock ticker symbols of company names
41							found in the text, or other keys you may have used in Init(). The values are hashes
42							with keys "freq" and "contexts". "freq" is the number of times the company was seen
43							in the text, and "contexts" is a reference to an array storing the bit of text
44							mentioning the company.
45
46							For example:
47
48							$rv = {
49							INTC => {
50							freq => 10
51							, contexts => [
52							"blah blah blah blah blah Intel blah blah blah blah"
53							, "blah Intel Corp. blah blah blah blah blah blah"
54							]
55							}
56							};
57
58							=head1 NOTE
59
60							Please note that Finance::CompanyNames allocates a massive amount of memory.
61							It loads a complete English wordlist as well as a list of English root words
62							and their affixes. This requires approximately 20MB of memory on the author's
63							computer. It is possible for a future version to behave differently. Please
64							mail the author if you have an improvement.
65
66							Also please note this module only works with English text, due to the included
67							word and stem lists.
68
69							=head1 AUTHORS
70
71							Finance::CompanyNames is a product of Gilder, Gagnon, Howe, & Co. LLC.
72							Mail GGHC Skunkworks regarding this software.
73
74							=head1 LICENSE
75
76							Finance::CompanyNames is distributed under the Artistic License, the same
77							terms under which Perl itself is distributed.
78
79							=cut
80
81							package Finance::CompanyNames;
82
83	1			1		954	use strict;
	1					2
	1					33
84	1			1		11724	use Finance::CompanyNames::TextSupport;
	1					4
	1					86
85
86	1			1		17	use vars qw(@ISA @EXPORT_OK $VERSION %quants %names %wholeNames %leaders %abbrevs $avgLength $stems $dict $threshold);
	1					2
	1					2081
87
88							$VERSION = 1.0;
89
90							require Exporter;
91							@ISA = qw(Exporter);
92							@EXPORT_OK = qw(Init Match);
93
94							#### Init Function -- initialize the set of names/tickers to be matched.
95
96							# parameter $data is a ref to hash of ticker => name
97
98							sub Init {
99	1			1	0	888	my $data = shift;
100
101	1					3	%quants = ();
102	1					4	%names = ();
103	1					3	%wholeNames = ();
104	1					2	%leaders = ();
105	1					3	%abbrevs = ();
106
107	1					3	$abbrevs{IBM} = "IBM";
108	1					4	$abbrevs{ATT} = "T";
109	1					3	$abbrevs{NEC} = "NIPNY";
110	1					2	$abbrevs{RCN} = "RCNC";
111
112	1					7	$stems = Finance::CompanyNames::TextSupport::pandkStems();
113	1					7	$dict = Finance::CompanyNames::TextSupport::linuxDict();
114
115	1					7	get_info($data);
116
117	1					5	$threshold = 0.65;
118
119							}
120
121
122							######### find matches in a string
123							# parameter is a string
124							# returns a ref to a hash where each key is the ticker of a match.
125							# the hash entry keyed by a symbol is a
126							# ref to a hash with the following members:
127							# -- freq : # of times that ticker's name matched in the string
128							# -- contexts: a ref to an array of strings that are the contexts
129							# in which those matches were found
130
131
132
133							sub Match {
134	1			1	0	795	my $s = shift;
135
136	1					6	my %utica = matches($s);
137
138	1					5	return \%utica;
139							}
140
141
142							############# HELPERS ---------------------------------------
143
144							sub matches
145							{
146	1			1	0	3	my ($str) = @_;
147	1					5	my @lexems = get_lexems($str);
148
149	1					3	my ($i, $incr, $context, $lexem, $lower, $allUpper);
150
151	1					3	my %utica = ();
152	1					3	my ($ms, $mtic, $abbrev, $uniq);
153	0					0	my ($kk, $nn, $ss);
154	0					0	my ($chainBegin, $chainEnd, $links, $wordBuff, $lastMatch, $mid, $contextLength);
155
156	1					2	$chainBegin = -1;
157
158	1					3	my $nLex = scalar(@lexems);
159	1					6	for ($i = 0, $incr = 1; $i < $nLex; $i += $incr)
160							{
161	6					10	$ms = 0;
162	6					7	$mtic = "";
163	6					11	$abbrev = 0;
164	6					15	($lexem,$incr) = aggregateLetters(\@lexems, $i);
165
166	6	50	66			35	if(length($lexem) >= 3 && exists $abbrevs{$lexem}) {
167	0					0	$ms = $threshold;
168	0					0	$mtic = $abbrevs{$lexem};
169	0					0	$abbrev = 1;
170							}
171
172							# Check if the lexem exists as a leader
173	6	100				18	if (exists $leaders{$lexem})
174							{
175	1					3	my $nLeaders = scalar(@{$leaders{$lexem}});
	1					3
176	1					3	my @score = ();
177
178	1					4	for (my $j = 0; $j < $nLeaders; $j++)
179							{
180	1					4	my $ticker = $leaders{$lexem}[$j];
181	1					4	my $parts = $names{$ticker};
182
183	1					2	$uniq = 0;
184
185	1					6	$score[$j] = match_lead($parts, \@lexems, $i + $incr, $ticker);
186
187	1					3	($kk, $nn, $ss) = @{$score[$j]};
	1					4
188
189	1					3	$lower = $lexem;
190	1					2	$lower =~ tr/[A-Z]/[a-z]/;
191
192	1					3	$_ = $lexem;
193	1					5	$allUpper = ! /[a-z]/;
194
195	1	50	33			27	if($kk == 1 && $quants{$lexem} == 1 && ! exists $stems->{$lower} && ! exists $dict->{$lower} && ! $allUpper) {
			33
			33
			33
196
197	1					2	$uniq = 1;
198	1					3	$kk++;
199
200							}
201
202	1					3	$ss = $kk / $nn;
203
204	1	50				8	if ($ms < $ss)
205							{
206	1					3	$ms = $ss;
207	1					5	$mtic = $ticker;
208							}
209							}
210							}
211
212
213							# Attempt to disambiguate #1: max score
214	6					11	my $isMatch = 0;
215	6	100				17	if($ms >= $threshold) { # match is found
216	1					3	$isMatch = 1;
217
218	1					2	$_ = $mtic;
219	1	50				5	if(/may/i) {
220							# print "found $mtic: $mtic, $kk, $nn (" . getContext($i, \@lexems, 10);
221							}
222	1	50				5	if($chainBegin == -1) {
223	1					3	$links = {};
224	1					2	$chainBegin = $i;
225							}
226	1					3	$lastMatch = $i + $incr - 1;
227	1					3	$wordBuff = int($kk + 1 + 0.5);
228
229	1	50				5	if(! exists $links->{$mtic}) {
230	1					4	$links->{$mtic} = 1;
231							}
232							else {
233	0					0	$links->{$mtic}++;
234							}
235							}
236
237	6	100	66			45	if(((!$isMatch) \|\| $i >= $nLex - 1) && $chainBegin != -1) {
			100
238
239	4	50	33			15	if(exists $quants{$lexem} && $quants{$lexem} < 0.25) {
240	0					0	$wordBuff+= $avgLength;
241							}
242
243
244
245	4	100	66			25	if($i > $lastMatch + $wordBuff \|\| $i >= $nLex - 1 ) {
246	1					3	$chainEnd = $lastMatch;
247	1					4	$mid = int( ($chainBegin + $chainEnd) / 2);
248	1					3	$contextLength = 20 + $mid - $chainBegin;
249	1					5	$context = getContext($mid, \@lexems, $contextLength);
250
251
252	1					6	foreach $mtic (keys %$links) {
253
254	1	50				5	if(! exists $utica{$mtic}) {
255	1					2	$utica{$mtic} = {};
256	1					4	$utica{$mtic}->{freq} = 0;
257	1					4	$utica{$mtic}->{contexts} = [];
258							}
259
260	1	50				5	if($utica{$mtic}->{freq} <= 5) {
261	1					2	push(@{$utica{$mtic}->{contexts}}, $context);
	1					3
262							}
263	1					4	$utica{$mtic}->{freq} += $links->{$mtic};
264							}
265	1					5	$chainBegin = -1;
266							}
267							}
268							}
269
270	1					8	return %utica;
271							}
272
273							sub match_lead
274							{
275	1			1	0	5	my ($parts, $lexems, $i, $ticker) = @_;
276
277	1					2	my $n = scalar(@$parts);
278
279
280	1					3	my $k = 1;
281	1					2	my $go = 1;
282
283	1					3	my $s = 0;
284	1					2	my ($incr, $lexem);
285
286	1		33			8	for (my $j = 1, $incr = 1; $j < $n && $i < scalar(@$lexems); $j++, $i += $incr)
287							{
288	0					0	($lexem, $incr) = aggregateLetters($lexems, $i);
289	0					0	my $part = $parts->[$j];
290							# fix Bank of Montreal
291
292	0	0				0	if ($part eq $lexem)
293							{
294	0	0				0	if(validWord($lexem)) {
295	0	0				0	$k++ if $go;
296							}
297							else {
298	0	0				0	$k += 0.5 if $go;
299							}
300							}
301							else
302							{
303	0					0	$go = 0;
304	0	0				0	if (exists($quants{$lexem}))
305							{
306	0					0	$s += $quants{$lexem};
307							}
308							}
309							}
310
311
312
313	1					5	return [$k, $n, $s];
314							}
315
316							sub validWord {
317	0			0	0	0	my ($lexem) = @_;
318
319	0					0	$_ = $lexem;
320	0					0	my $allUpper = ! /[a-z]/;
321
322	0		0			0	return ($allUpper \|\| length($lexem) > 2);
323							}
324
325							sub getContext {
326	1			1	0	3	my ($i, $lexems, $contextSize) = @_;
327
328	1					2	my($context, $min, $max);
329
330	1					3	$context = "...";
331	1					3	$min = $i - $contextSize;
332	1					2	$max = $i + $contextSize;
333	1	50				5	if($min < 0) {
334	1					2	$min = 0;
335							}
336	1	50				5	if($max > scalar(@$lexems)) {
337	1					3	$max = scalar(@$lexems);
338							}
339
340	1					5	for($i = $min; $i <= $max; $i++) {
341	7	100				17	if (defined $lexems->[$i]) {
342	6					20	$context = $context . " " . $lexems->[$i];
343							}
344							}
345
346	1					3	$context = $context . " ...";
347
348	1					3	return $context;
349							}
350
351							sub get_info
352							{
353	1			1	0	3	my $data = shift;
354
355	1					3	$avgLength = 0;
356
357	1					11	while(my ($ticker, $name) = each(%$data)) {
358
359	1	50	33			15	next if ($ticker !~ /[\w]+/ \|\| $name !~ /[\w]+/ );
360
361	1					746	my @particles = split /[^\w]+/, $name;
362
363	1	50				13	next if (!scalar(@particles));
364	1	50				9	shift @particles if ($particles[0] eq "");
365	1	50				5	next if (!scalar(@particles));
366
367	1					2	my (@agParticles, $w);
368
369
370	0					0	my($i, $incr);
371	1					9	for($i = 0, $incr = 1; $i < scalar(@particles); $i += $incr) {
372	1					6	($w, $incr) = aggregateLetters(\@particles, $i);
373	1					6	push(@agParticles, $w);
374							}
375
376	1					4	$names{$ticker} = \@agParticles;
377
378	1					3	$avgLength += scalar(@agParticles);
379
380	1					2	$wholeNames{$ticker} = $name;
381
382	1					3	my $word = $agParticles[0];
383	1					2	push(@{$leaders{$word}}, $ticker);
	1					5
384	1					5	for (my $k = 0; $k < @agParticles; $k++)
385							{
386	1					2	$word = $agParticles[$k];
387
388	1					9	$quants{$word}++;
389							}
390							}
391
392	1					8	while (my ($ww, $quant) = each(%quants))
393							{
394	1					6	$quants{$ww} = 1 / $quant;
395							}
396
397	1					6	$avgLength = int($avgLength / scalar(keys %names) + 0.5);
398
399							}
400
401							sub aggregateLetters {
402	7			7	0	13	my ($words, $i) = @_;
403
404	7					10	my $done = 0;
405	7					9	my $len = 1;
406	7					14	my $word = $words->[$i];
407	7					10	my $res = $word;
408
409	7	100				19	if(length($word) == 1) {
410	1		66			10	while((! $done) && ($i + $len) < scalar(@$words)) {
411	1					3	$word = $words->[$i + $len];
412	1	50				4	if(length($word) == 1) {
413	0					0	$res .= $word;
414	0					0	$len++;
415							}
416							else {
417	1					5	$done = 1;
418							}
419							}
420							}
421
422	7					22	return ($res, $len);
423
424							}
425
426							sub get_lexems
427							{
428	1			1	0	3	my $str = shift;
429
430	1					2	$_ = $str;
431
432	1					10	my @words = split /[^\w]+/;
433
434	1					7	return @words;
435							}
436
437
438							1;