File Coverage

blib/lib/Array/Suffix.pm

Criterion	Covered	Total	%
statement	256	328	78.0
branch	79	126	62.7
condition	11	15	73.3
subroutine	26	35	74.2
pod	16	31	51.6
total	388	535	72.5

line	stmt	bran	cond	sub	pod	time	code
1							#########################################################################
2							# PACKAGE: Array::Suffix
3							#
4							# Copyright (C), 2004-2007
5							# Bridget Thomson McInnes, bthomson@d.umn.edu
6							#
7							# University of Minnesota, Duluth
8							#
9							# USAGE:
10							# use Array::Suffix
11							#
12							# DESCRIPTION:
13							#
14							# The Array::Suffix module creates a suffix array data structure
15							# that has the ability to store and return variable length n-grams
16							# and their frequency. See perldoc Array::Suffix
17							#
18							#########################################################################
19							package Array::Suffix;
20
21	1			1		25366	use 5.008;
	1					4
	1					43
22	1			1		6	use strict;
	1					2
	1					35
23	1			1		1097	use bytes;
	1					17
	1					7
24
25							require Exporter;
26	1			1		930	use AutoLoader qw(AUTOLOAD);
	1					1909
	1					6
27
28							our @ISA = qw(Exporter);
29
30							# Items to export into callers namespace by default. Note: do not export
31							# names by default without a very good reason. Use EXPORT_OK instead.
32							# Do not simply export all your public functions/methods/constants.
33
34							# This allows declaration use Array::Suffix ':all';
35							# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
36							# will save memory.
37							our %EXPORT_TAGS = ( 'all' => [ qw() ] );
38
39							our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
40
41							our @EXPORT = qw();
42
43							our $VERSION = '.5';
44
45							#########################
46							# File Name Variables #
47							#########################
48							my $CORPUS_FILE = "";
49							my $VOCAB_FILE = "";
50							my $SNT_FILE = "";
51							my $SNTNGRAM_FILE = "";
52							my $NGRAM_FILE = "";
53							my $STOPLIST = "";
54							my $TOKEN_FILE = "";
55							my $NONTOKEN_FILE = "";
56
57							########################################
58							# User defined Suffix Array Variables #
59							########################################
60							my $max_ngram_size = 2; #default is 2
61							my $min_ngram_size = 2; #default is 2
62							my $frequency = 0; #default is 0
63
64							#########################
65							# Stop List Variables #
66							#########################
67							my $stop_mode = "AND"; #AND/OR default is AND
68							my $stop_regex = ""; #regex to store stop list
69
70
71							############################
72							# Token Option Variables #
73							############################
74							my $tokenizerRegex = "";
75							my $nontokenizerRegex = "";
76
77							####################
78							# Flag Variables #
79							####################
80							my $stop_flag = 0; #default is false
81							my $marginals = 0; #default is false
82							my $remove = 0; #default is false;
83							my $new_line = 0; #default is false
84
85							####################
86							# Hash Variables #
87							####################
88							my $cache = "";
89							my $unigrams = "";
90							my %remove_hash = ();
91
92							#####################
93							# Array Variables #
94							#####################
95							my @vocab_array = ();
96							my @window_array = ();
97
98							#################################
99							# Main Suffix Array Variables #
100							#################################
101							# VEC VARIABLES
102							my $corpus = ""; # corpus vec
103							my $suffix = ""; # the suffix vec
104
105							####################
106							# MISC VARIABLES #
107							####################
108							my $N = 0; # the length of the corpus
109							my $bit = 32; # the bit size for the vec array
110							my $ngram_count = 0; # the number of ngrams
111							my $win_bit = 1; # the bit size for the windowing
112							my $timestamp = ""; # the time stamp for the files
113
114							###############
115							# new method #
116							###############
117							my $location;
118							sub new
119							{
120							# First argument is class
121	16			16	0	1069075	my $class = shift;
122
123	16					274	my $self = {};
124
125	16					205	bless $self, $class;
126
127	16	50				310	$self->{dir} = shift if (defined(@_ > 0));
128	16	50				486	$self->{verbose} = @_ ? shift : 0;
129
130	16	50				122	warn "Dir = ", $self->{dir}, "\n" if ($self->{verbose});
131	16	50				63	warn "Verbose = ", $self->{verbose}, "\n" if ($self->{verbose});
132
133							# Initialize some variables at new
134	16					146	$CORPUS_FILE = ""; $VOCAB_FILE = ""; $SNT_FILE = ""; $SNTNGRAM_FILE = "";
	16					132
	16					38
	16					133
135	16					81	$NGRAM_FILE = ""; $STOPLIST = ""; $TOKEN_FILE = ""; $NONTOKEN_FILE = "";
	16					72
	16					49
	16					97
136	16					31	$stop_flag = 0; $marginals = 0; $remove = 0; $new_line = 0;
	16					30
	16					47
	16					27
137	16					35	$max_ngram_size = 2; $min_ngram_size = 2; $frequency = 0; $remove = 0;
	16					27
	16					24
	16					32
138
139	16					93	return $self;
140							}
141
142							#######################################
143							# Create the vocabulary and snt file #
144							#######################################
145							sub create_files
146							{
147	16			16	1	261	my $self = shift; my @files = @_; @vocab_array = ();
	16					157
	16					1040
148
149							# Open the corpus, vocab and snt files
150	16	50				3886	open(VOCAB , ">$VOCAB_FILE") \|\| die "Could not open the vocabfile: $!\n";
151	16	50				2076	open(SNT, ">$SNT_FILE") \|\| die "Could not open the sntfile : $!\n";
152
153							# Create the token and nontoken regular expression
154	16	50				73	if($NONTOKEN_FILE ne "") { set_nontoken_regex(); } set_token_regex();
	0					0
	16					191
155
156							################################################
157							# Index always starts at 2 because 1 is #
158							# considered a new line parameter if defined #
159							################################################
160
161	16					29	my $index = 2; my %vocab_hash = ();
	16					62
162
163	16					61	foreach (@files) {
164	16	50				878	open(CORPUS, $_) \|\| die "Could not find the corpus file: $_ \n";
165	16					752	while() {
166	248					335	chomp;
167
168	248					9981	s/$nontokenizerRegex//g;
169
170	248					1629	while( /$tokenizerRegex/g ) {
171	2217					4701	my $token = $&;
172
173	2217	100				5869	if (! exists $vocab_hash{$token} ) {
174	1263					5913	print SNT "$index ";
175	1263					1894	print VOCAB "$index\n"; print VOCAB "$token\n";
	1263					2762
176	1263					19492	$vocab_hash{$token} = $index++;
177							}
178							else {
179	954					7004	print SNT "$vocab_hash{$token} ";
180							}
181							}
182	248	50				448	print SNT "1" if $new_line;
183	248					5557	print SNT "\n";
184							}
185							}
186							}
187
188							######################
189							# Remove the files #
190							######################
191							sub remove_files
192							{
193	16			16	1	128	my $self = shift;
194
195	16					165377	system("rm -rf $VOCAB_FILE");
196	16					162824	system("rm -rf $SNT_FILE");
197	16					225248	system("rm -rf $SNTNGRAM_FILE");
198
199							}
200
201							###########################
202							# Remove the ngram file #
203							###########################
204							sub remove_ngram_file
205							{
206
207	0			0	0	0	my $self = shift;
208
209	0					0	system("rm -rf $NGRAM_FILE");
210
211							}
212
213							############################################
214							# Creates the token file #
215							# CODE obtained from NSP version 6.7 #
216							# http://www.d.umn.edu/~tpederse/nsp.html #
217							############################################
218							sub set_token_regex
219							{
220	16			16	0	31	my $self = shift; my @tokenRegex = (); $tokenizerRegex = "";
	16					42
	16					67
221
222	16	100				174	if(-e $TOKEN_FILE) {
223	6	50				227	open (TOKEN, $TOKEN_FILE) \|\| die "Couldnt open $TOKEN_FILE\n";
224
225	6					231	while() {
226	8					32	chomp; s/^\s//; s/\s$//;
	8					98
	8					77
227	8	50				34	if (length($_) <= 0) { next; }
	0					0
228	8	50	33			117	if (!(/^\//) \|\| !(/\/$/))
229							{
230	0					0	print STDERR "Ignoring regex with no delimiters: $_\n"; next;
	0					0
231							}
232	8					32	s/^\///; s/\/$//;
	8					46
233	8					76	push @tokenRegex, $_;
234							}
235	6					70	close TOKEN;
236							}
237							else {
238	10					67	push @tokenRegex, "\\w+"; push @tokenRegex, "[\.,;:\?!]";
	10					30
239							}
240
241							# create the complete token regex
242
243	16					100	foreach my $token (@tokenRegex)
244							{
245	28	100				116	if ( length($tokenizerRegex) > 0 )
246							{
247	12					34	$tokenizerRegex .= "\|";
248							}
249	28					44	$tokenizerRegex .= "(";
250	28					32	$tokenizerRegex .= $token;
251	28					110	$tokenizerRegex .= ")";
252							}
253
254							# if you dont have any tokens to work with, abort
255	16	50				87	if ( $#tokenRegex < 0 )
256							{
257	0					0	print STDERR "No token definitions to work with.\n";
258							#askHelp();
259	0					0	exit;
260							}
261							}
262
263							############################################
264							# Set the non token regular expression #
265							# CODE obtained from NSP version 6.7 #
266							# http://www.d.umn.edu/~tpederse/nsp.html #
267							############################################
268							sub set_nontoken_regex
269							{
270	0			0	0	0	$nontokenizerRegex = "";
271
272							#check if the file exists
273	0	0				0	if($NONTOKEN_FILE)
274							{
275							#open the non token file
276	0	0				0	open(NOTOK, $NONTOKEN_FILE) \|\| die "Couldn't open Nontoken file $NONTOKEN_FILE.\n";
277
278	0					0	while() {
279	0					0	chomp;
280	0					0	s/^\s+//; s/\s+$//;
	0					0
281
282							#handling a blank lines
283	0	0				0	if(/^\s*$/) { next; }
	0					0
284
285	0	0				0	if(!(/^\//)) {
286	0					0	print STDERR "Nontoken regular expression $_ should start with '/'\n"; exit;
	0					0
287							}
288
289	0	0				0	if(!(/\/$/)) {
290	0					0	print STDERR "Nontoken regular expression $_ should end with '/'\n"; exit;
	0					0
291							}
292
293							#removing the / s from the beginning and the end
294	0					0	s/^\///;
295	0					0	s/\/$//;
296
297							#form a single regex
298	0					0	$nontokenizerRegex .="(".$_.")\|";
299							}
300
301							# if no valid regexs are found in Nontoken file
302	0	0				0	if(length($nontokenizerRegex)<=0) {
303	0					0	print STDERR "No valid Perl Regular Experssion found in Nontoken file $NONTOKEN_FILE.\n";
304	0					0	exit;
305							}
306
307	0					0	chop $nontokenizerRegex;
308							}
309							else {
310	0					0	print STDERR "Nontoken file $NONTOKEN_FILE doesn't exist.\n";
311	0					0	exit;
312							}
313							}
314
315							##############################
316							# Create the stoplist hash #
317							# CODE obtained from NSP #
318							##############################
319							sub create_stop_list
320							{
321	3			3	1	55	my $self = shift;
322	3					18	my $file = shift;
323
324	3					15	$stop_regex = "";
325
326	3	50				206	open(FILE, $file) \|\| die "Could not open the Stoplist : $!\n";
327
328	3					170	while() {
329	15					24	chomp; # accepting Perl Regexs from Stopfile
330	15					66	s/^\s+//;
331	15					29	s/\s+$//;
332
333							#handling a blank lines
334	15	100				110	if(/^\s*$/) { next; }
	3					19
335
336							#check if a valid Perl Regex
337	12	50				54	if(!(/^\//)) {
338	0					0	print STDERR "Stop token regular expression <$_> should start with '/'\n";
339	0					0	exit;
340							}
341	12	50				51	if(!(/\/$/)) {
342	0					0	print STDERR "Stop token regular expression <$_> should end with '/'\n";
343	0					0	exit;
344							}
345
346							#remove the / s from beginning and end
347	12					55	s/^\///;
348	12					39	s/\/$//;
349
350							#form a single big regex
351	12					69	$stop_regex.="(".$_.")\|";
352							}
353
354	3	50				24	if(length($stop_regex)<=0) {
355	0					0	print STDERR "No valid Perl Regular Experssion found in Stop file.";
356	0					0	exit;
357							}
358
359	3					8	chop $stop_regex;
360
361							# Reset the stop flag to true
362	3					5	$stop_flag = 1;
363
364	3					50	close FILE;
365							}
366
367							###############################
368							# Load the vocabulary array #
369							###############################
370							sub load_vocab_array
371							{
372	16	50		16	0	1225	open(VOCAB, $VOCAB_FILE) \|\| die "Could not open the vocab file: $!\n";
373
374	16					48	@vocab_array = ();
375	16					411	while() {
376	1263					1312	chomp;
377	1263					1530	my $token = ; chomp $token;
	1263					1253
378	1263					3647	$vocab_array[$_] = $token;
379							}
380							}
381
382							##############################
383							# Set the remove parameter #
384							##############################
385							sub set_remove
386							{
387	2			2	1	42	my $self = shift;
388	2					8	$remove = shift;
389							}
390
391							################################
392							# Set the marginal parameter #
393							################################
394							sub set_marginals
395							{
396	15			15	1	133	my $self = shift;
397	15					37	$marginals = shift;
398
399							}
400
401							################################
402							# Set the new_line parameter #
403							################################
404							sub set_newline
405							{
406	0			0	1	0	my $self = shift;
407	0					0	$new_line = 1;
408
409							}
410
411							#######################
412							# Set the frequency #
413							#######################
414							sub set_frequency
415							{
416	2			2	1	53	my $self = shift;
417	2					7	$frequency = shift;
418							}
419
420							############################
421							# Set minimum ngram size #
422							############################
423							sub set_min_ngram_size
424							{
425	0			0	1	0	my $self = shift;
426	0					0	$min_ngram_size = shift;
427
428							}
429
430							############################
431							# Set maximum ngram size #
432							############################
433							sub set_max_ngram_size
434							{
435	0			0	1	0	my $self = shift;
436	0					0	my $max_ngram_size = shift;
437							}
438
439							####################################
440							# Set the min and max ngram size #
441							####################################
442							sub set_ngram_size
443							{
444	9			9	1	66	my $self = shift;
445	9					23	my $size = shift;
446
447	9					16	$max_ngram_size = $size;
448	9					22	$min_ngram_size = $size;
449							}
450
451							#######################
452							# Set the stop mode #
453							#######################
454							sub set_stop_mode
455							{
456
457	1			1	1	15	my $self = shift;
458	1					3	$stop_mode = shift;
459							}
460
461							########################
462							# Set the token file #
463							########################
464							sub set_token_file
465							{
466	6			6	1	38	my $self = shift;
467	6					31	$TOKEN_FILE = shift;
468							}
469
470							###########################
471							# Set the nontoken file #
472							###########################
473							sub set_nontoken_file
474							{
475	0			0	1	0	my $self = shift;
476	0					0	my $NONTOKEN_FILE = shift;
477							}
478
479							#############################
480							# Set the ngram file name #
481							#############################
482							sub set_destination_file
483							{
484	16			16	1	1060	my $self = shift;
485	16					173	my $file = shift;
486
487	16					91	$timestamp = time();
488
489							# Set the file names of the internal files
490							# that will be used by the perl module.
491	16					65	$VOCAB_FILE = $file . ".vocab." . $timestamp;
492	16					79	$SNT_FILE = $file . ".snt." . $timestamp;
493	16					37	$SNTNGRAM_FILE = $file . ".sntngram." . $timestamp;
494
495							# Set the ngram file
496	16					47	$NGRAM_FILE = $file;
497							}
498
499							#################################
500							# Return the number of ngrams #
501							#################################
502							sub get_ngram_count
503							{
504	16			16	1	45	return $ngram_count;
505							}
506
507							###########################
508							# Return the ngram file #
509							###########################
510							sub get_ngram_file
511							{
512	0			0	0	0	return $NGRAM_FILE;
513							}
514
515							#######################################################################
516							# METHOD THAT CALLS THE SUFFIX ARRAY FUNCTIONS TO OBTAIN THE NGRAMS #
517							#######################################################################
518							sub get_ngrams
519							{
520
521							# Set the ngram count to zero
522	16			16	1	84	$ngram_count = 0;
523
524							# Create the corpus array
525	16					132	corpus_array();
526
527							# Create the suffix array
528	16					53	suffix_array();
529
530							# Print the numerical ngrams to the snt ngram file
531	16					446	print_sntngrams();
532
533							# Print the ngrams
534	16					44	print_ngrams();
535
536	16					65	return 1;
537							}
538
539							##########################################
540							# Find the frequency of a given string #
541							##########################################
542							sub find_frequency
543							{
544	2063			2063	0	6142	my @ngram = split/\s+/, shift;
545
546							# Initialize the indexing variables
547	2063					2495	my $bottom = shift; my $top = $bottom+1;
	2063					2626
548
549	2063					2107	while(1) {
550	2199					4420	for (0..$#ngram) {
551	3143	100				9449	if(vec($corpus,vec($suffix, $top, $bit)+$_,$bit)!=$ngram[$_]) { return($top-$bottom); }
	2063					6047
552	136					376	} $top++;
553
554							}
555							}
556
557							###########################################
558							# Print the ngrams and their frequencies #
559							###########################################
560							sub print_sntngrams
561							{
562							# Open the SNTGRAM File #
563	16	50		16	0	2663	open(SNTNGRAM, ">$SNTNGRAM_FILE") \|\| die "Could not open the SNTNGRAM file : $!\n";
564
565							# Load the vocabulary array if not loaded
566	16	50				67	if(!@vocab_array) { load_vocab_array(); }
	16					63
567
568	16					42	my $i = 0; %remove_hash = ();
	16					311
569							# Continue for all the tokens in the vec
570	16					56	while($i <= $N) {
571
572							# Initialize variables
573	2081					2907	my @ngram =(); my @marginalFreqs = (); my $line = 0; my $doStop = 0; my @token_ngram = ();
	2081					2275
	2081					2279
	2081					1986
	2081					2394
574
575	2081					2658	my $l = vec($suffix, $i, $bit);
576							# Determine the ngram in its integer and token form
577	2081	100				3781	if($l+$min_ngram_size-1 <= $N) {
578	2063					3452	for(0..$min_ngram_size-1) {
579	4188					5562	push @ngram, vec($corpus, $l, $bit);
580	4188					7798	push @token_ngram, $vocab_array[vec($corpus, $l, $bit)];
581	4188					10713	$l++;
582							}
583
584							# Determine the frequency of the ngram and increment the ngram count.
585	2063					7493	my $freq = find_frequency( (join " ", @ngram), $i);
586
587							# If new line determine if the new line exists in the ngram
588	2063	0				4251	if($new_line) { map { if($_ == 1) { $line++; } } @ngram; }
	0	50				0
	0					0
	0					0
589
590							# If the stop list exists determine if tokens are in the stop list
591	2063	100				4127	if($stop_flag) {
592							# Set the doStop flag
593	87	100				305	if($stop_mode=~/OR\|or/) { $doStop = 0; } else { $doStop = 1; }
	29					39
	58					84
594
595	87					200	for $i(0..$#token_ngram) {
596							# if mode is OR, remove the current ngram if any word is a stop word
597	126	100				367	if($stop_mode=~/OR\|or/) { if($token_ngram[$i]=~/$stop_regex/) { $doStop=1; last; } }
	52	100				740
	12					16
	12					16
598
599							# if mode is AND, accept the current ngram if any word is not a stop word
600	74	100				422	else { if(!($token_ngram[$i]=~/$stop_regex/)) { $doStop=0; last; } }
	51					65
	51					65
601							}
602	87	100	66			270	if($doStop && $marginals) {
603	19					36	for (0..$#ngram) {
604	38	100				112	if(exists $remove_hash{$_ . ":" . $ngram[$_]}) {
605	6					18	$remove_hash{$_ . ":" . $ngram[$_]} += $freq;
606							}
607	32					122	else { $remove_hash{$_ . ":" . $ngram[$_]} = $freq; }
608							}
609							}
610							}
611
612							# If the ngram frequency is greater or equal to a specified frequency, a new
613							# line flag is false and the ngram is not elimanted by the stop list then print
614							# the ngram in its integer form with its frequency to the snt ngram file
615	2063	100	66			10770	if($line == 0 && $doStop == 0) {
616	2044	100				3661	if($remove <= $freq) {
617	1173					1220	$ngram_count+=$freq;
618	1173	100				2353	if($frequency <= $freq) { print SNTNGRAM "@ngram $freq\n"; }
	302					1061
619							}
620							else {
621	871					1594	for (0..$#ngram) {
622	1742	100				13095	if(exists $remove_hash{$_ . ":" . $ngram[$_]}) {
623	726					2362	$remove_hash{$_ . ":" . $ngram[$_]} += $freq;
624							}
625	1016					4358	else { $remove_hash{$_ . ":" . $ngram[$_]} = $freq; }
626							}
627							}
628	2063					9658	} $i += $freq;
629	18					48	} else { $i++; }
630							}
631
632							}
633
634							######################################################
635							# Print the ngrams with their marginal frequencies #
636							######################################################
637							sub print_ngrams
638							{
639							#open the SNTNGRAM file
640	16	50		16	0	3261	open(SNTNGRAM, $SNTNGRAM_FILE) \|\| die "Could not open the sntngram file: $!\n";
641
642							#open the ngram file
643	16	50				3420	open(NGRAM, ">$NGRAM_FILE") \|\| die "Could not open the ngram file: $! \n";
644
645							# Load the vocabulary array if not loaded
646	16	50				51	if(!@vocab_array) { load_vocab_array(); }
	0					0
647
648	16					68	my $count = get_ngram_count();
649	16					253	print NGRAM "$count\n";
650
651	16					280	while() {
652
653	302					406	chomp; my @ngram = split/\s+/; my @marginalFreqs = ();
	302					1211
	302					441
654
655							# Get the ngram size
656	302					403	my $freq = pop @ngram;
657
658							# Get the marginal counts for all ngrams
659	302	100				919	if($marginals) { @marginalFreqs = Marginals(@ngram); }
	279					612
660
661							#################################################################################
662							# Get the marginal counts for trigrams -> not set right now but it does work!! #
663							# This is an expensive operation!!!!! #
664							#if($marginals && $#ngram == 2) { @marginalFreqs = trigramMarginals(@ngram); } #
665							#################################################################################
666
667							# get the ngram in its token form and calculate the marginal frequencies
668	302					621	for (0..$#ngram) { print NGRAM "$vocab_array[$ngram[$_]]<>"; }
	666					2871
669
670							# print the frequencies
671	302					1925	print NGRAM "$freq @marginalFreqs \n";
672							}
673							}
674
675							# Gets the marginal counts for each individual word in the ngram
676							sub Marginals
677							{
678	279			279	0	371	my @marginalFreqs = ();
679
680	279					596	for my $i(0..$#_) {
681	643					2335	push @marginalFreqs, vec($unigrams, $_[$i], $bit);
682
683
684	643	100				1540	if($i == 0) {
685	279	100				746	if($_[$i] == vec($corpus, $N, $bit)) { $marginalFreqs[$#marginalFreqs] -= 1; }
	6					19
686							}
687	643	100				1227	if($i == $#_) {
688	279	100				608	if($_[$i] == vec($corpus, 0, $bit)) { $marginalFreqs[$#marginalFreqs] -= 1; }
	1					10
689							}
690
691	643	100	100			2810	if($stop_flag \|\| $remove > 0) {
692	186	100				780	if(exists $remove_hash{$i . ":" . $_[$i]}) {
693	51					169	$marginalFreqs[$#marginalFreqs] -= $remove_hash{$i . ":" . $_[$i]};
694							}
695							}
696							}
697	279					1493	return @marginalFreqs;
698							}
699
700
701							# Find the marginals for trigrams
702							sub trigramMarginals
703							{
704	0			0	0	0	my @marginalFreqs = bigramMarginals(@_);
705
706	0					0	for my $i(0..$#_-1) {
707	0					0	for my $j($i+1..$#_) {
708	0					0	my @ngram = $_[$i]; $ngram[1] = $_[$j]; my $split = $j - $i;
	0					0
	0					0
709	0					0	push @marginalFreqs, find_marginal($split, @ngram);
710
711	0	0				0	if($split == 1) {
712	0	0				0	if($ngram[0] == vec($corpus, 0, $bit)) { $marginalFreqs[$#marginalFreqs] -= 1; }
	0	0				0
713	0					0	elsif($ngram[$#ngram] == vec($corpus, $N, $bit)) { $marginalFreqs[$#marginalFreqs] -= 1; }
714							}
715							}
716							}
717	0					0	return @marginalFreqs;
718							}
719
720							##########################################
721							# Find the frequency of a given string #
722							##########################################
723							sub find_marginal
724							{
725	0			0	0	0	my $split = shift; my $bottom = vec($cache, $_[0], $bit);
	0					0
726
727	0					0	while( vec($corpus, vec($suffix, $bottom, $bit)+$split,$bit) != $_[1]) { $bottom++; }
	0					0
728
729	0					0	my $top = $bottom+1;
730	0					0	while(1) {
731	0	0				0	if(vec($corpus,vec($suffix, $top, $bit)+$split,$bit)!=$_[1]) { return($top-$bottom); }
	0					0
732	0					0	$top++;
733							}
734							}
735
736							#############################
737							# Create the corpus array #
738							#############################
739							sub corpus_array
740							{
741							#open SNTFILE
742	16	50		16	0	1725	open(SNT, $SNT_FILE) \|\| die "Could not open the sntfile: $!\n";
743
744							# Initialize the variables
745	16					47	my $offset = 0; $corpus = ""; $N = 0;
	16					56
	16					65
746
747	16					365	while(){
748	248					454	chomp;
749	248					1823	my @t = split/\s+/;
750	248					528	foreach (@t) { vec($corpus, $offset++, $bit) = $_; $N++; }
	2217					4123
	2217					5936
751							}
752
753							#decrement N by one to obtain the actual size of the corpus
754	16					45	$N--;
755							}
756
757							#############################
758							# Create the suffix array #
759							#############################
760							sub suffix_array
761							{
762	16			16	0	47	my %w = ();
763	16					49	for(0..$N) { push @{$w{vec($corpus, $_, $bit)}}, $_; }
	2217					7447
	2217					7988
764
765	16					57	my $count = 0;
766	16					892	foreach (sort keys %w) {
767	1263					1461	my $first = $count; my $temp;
	1263					1119
768							# store each possible ngram in sorted order
769	1263					1283	foreach my $elem (sort bysuffix @{$w{$_}}) {
	1263					4411
770	2217					4526	vec($suffix, $count++, $bit) = $elem; $temp = $elem;
	2217					6989
771							}
772							# if marginals cache the unigram counts as well as the
773							# first and last location of each word
774	1263	100				2886	if($marginals == 1) {
775	1240					3271	vec($unigrams, $_, $bit) = $count - $first;
776	1240					5300	vec($cache, $_, $bit) = $first;
777							}
778							}
779							}
780
781							############################
782							# Sort function bysuffix #
783							############################
784							sub bysuffix
785							{
786
787	2111			2111	0	2648	my $z = $a; my $x = $b; my $counter = 0;
	2111					2598
	2111					2318
788	2111		100			10845	while(vec($corpus, ++$z, $bit) == vec($corpus, ++$x, $bit) && ++$counter < $max_ngram_size) { ; }
789
790	2111	100				6898	return ( vec($corpus, $z, $bit) == vec($corpus, $x, $bit) ? 0 :
		100
791							(vec($corpus, $z, $bit) < vec($corpus, $x, $bit) ? -1 : 1) );
792							}
793
794							1;
795
796							__END__