line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
2
|
|
|
2
|
|
137028
|
use warnings; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
69
|
|
2
|
2
|
|
|
2
|
|
10
|
use strict; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
94
|
|
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
package Parse::GutenbergRoget; |
5
|
2
|
|
|
2
|
|
10
|
use base qw(Exporter); |
|
2
|
|
|
|
|
7
|
|
|
2
|
|
|
|
|
231
|
|
6
|
|
|
|
|
|
|
|
7
|
2
|
|
|
2
|
|
12
|
use Carp (); |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
53
|
|
8
|
2
|
|
|
2
|
|
2580
|
use Text::CSV_XS; |
|
2
|
|
|
|
|
24039
|
|
|
2
|
|
|
|
|
2990
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our @EXPORT = qw(parse_roget); ## no critic Export |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 NAME |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
Parse::GutenbergRoget - parse Project Gutenberg's Roget's Thesaurus |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 VERSION |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
version 0.022 |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
$Id$ |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=cut |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
our $VERSION = '0.022'; |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
=head1 SYNOPSIS |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
use Parse::GutenbergRoget |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
my %section = parse_roget("./roget15a.txt"); |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
print $section{1}[0][0]{text}; # existence |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=head1 DESCRIPTION |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
A Roget's Thesaurus is more than the simple synonym/antonym finder included in |
37
|
|
|
|
|
|
|
many dictionary sets. It organizes words into semantically realted categories, |
38
|
|
|
|
|
|
|
so that words with related meanings can be found in proximity to one another, |
39
|
|
|
|
|
|
|
with the level of proximity indicating the level of similarity. |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
Project Gutenberg has produced an etext of the 1911 edition of Roget's |
42
|
|
|
|
|
|
|
Thesaurus, and later began to revise it, in 1991. While it's not the best |
43
|
|
|
|
|
|
|
Roget-style thesaurus available, it's the best public domain electronic |
44
|
|
|
|
|
|
|
thesaurus datasource I've found. |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
This module parses the file's contents into a Perl data structure, which can |
47
|
|
|
|
|
|
|
then be stored in systems for searching and browsing it. This module does |
48
|
|
|
|
|
|
|
I implement those systems. |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
The code is not complete. This means that everything that can be parsed is not |
51
|
|
|
|
|
|
|
yet being parsed. It's important to realize that not everything is going to be |
52
|
|
|
|
|
|
|
parseable. There are too many typos and broken rules which, due to the lousy |
53
|
|
|
|
|
|
|
nature of the rules, create ambiguity. For a description of these rules see |
54
|
|
|
|
|
|
|
L"RULES"> below. |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=head1 FUNCTIONS |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
=head2 C<< parse_roget($filename) >> |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
This function, exported by default, will attempt to open, read, and parse the |
61
|
|
|
|
|
|
|
named file as a Project Gutenberg Roget's Thesaurus. It has only been tested |
62
|
|
|
|
|
|
|
with C, which is not included in the distribution, because it's |
63
|
|
|
|
|
|
|
too big. |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
It returns a hash with the following structure: |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
%section = ( |
68
|
|
|
|
|
|
|
... |
69
|
|
|
|
|
|
|
'100a' => { |
70
|
|
|
|
|
|
|
major => 100, # major and minor form section identity |
71
|
|
|
|
|
|
|
minor => 'a', |
72
|
|
|
|
|
|
|
name => 'Fraction', |
73
|
|
|
|
|
|
|
comments => [ 'Less than one' ], |
74
|
|
|
|
|
|
|
subsections => [ |
75
|
|
|
|
|
|
|
{ |
76
|
|
|
|
|
|
|
type => 'N', # these entries are nouns |
77
|
|
|
|
|
|
|
groups => [ |
78
|
|
|
|
|
|
|
{ entries => [ |
79
|
|
|
|
|
|
|
{ text => 'fraction' }, |
80
|
|
|
|
|
|
|
{ text => 'fractional part' } |
81
|
|
|
|
|
|
|
] }, |
82
|
|
|
|
|
|
|
{ entries => [ { text => 'part &c. 51' } ] } |
83
|
|
|
|
|
|
|
] |
84
|
|
|
|
|
|
|
}, |
85
|
|
|
|
|
|
|
{ |
86
|
|
|
|
|
|
|
type => 'Adj', |
87
|
|
|
|
|
|
|
groups => [ { entries => [ ... ] } ] |
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
] |
90
|
|
|
|
|
|
|
} |
91
|
|
|
|
|
|
|
... |
92
|
|
|
|
|
|
|
); |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
This structure isn't pretty or perfect, and is subject to change. All of its |
95
|
|
|
|
|
|
|
elements are shown here, except for one exception, which is the next likely |
96
|
|
|
|
|
|
|
subject for change: flags. Entries may have flags, in addition to text, which |
97
|
|
|
|
|
|
|
note things like "French" or "archaic". Entries (or possibly groups) will also |
98
|
|
|
|
|
|
|
gain cross reference attribues, replacing the ugly "&c. XX" text. I'd also |
99
|
|
|
|
|
|
|
like to deal with references to other subsections, which come in the form "&c. |
100
|
|
|
|
|
|
|
Adj." There isn't any reason for these to be needed, I think. |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=cut |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
sub parse_roget { |
105
|
0
|
|
|
0
|
1
|
|
my ($filename) = @_; |
106
|
0
|
|
|
|
|
|
my %section = parse_sections($filename); |
107
|
0
|
|
|
|
|
|
bloom_sections(\%section); |
108
|
0
|
|
|
|
|
|
return %section; |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=head2 C<< parse_sections($filename) >> |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
This function is used internally by C to read the named file, |
114
|
|
|
|
|
|
|
returning the above structure, parsed only to the section level. |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
=cut |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
sub parse_sections { |
119
|
0
|
|
|
0
|
1
|
|
my ($filename) = @_; |
120
|
|
|
|
|
|
|
|
121
|
0
|
0
|
|
|
|
|
open my $roget, '<', $filename |
122
|
|
|
|
|
|
|
or Carp::croak "couldn't open $filename: $!"; |
123
|
|
|
|
|
|
|
|
124
|
0
|
|
|
|
|
|
my $previous_section; |
125
|
|
|
|
|
|
|
my %section; |
126
|
|
|
|
|
|
|
|
127
|
0
|
|
|
|
|
|
my $peeked_line; |
128
|
0
|
|
|
|
|
|
my ($in_newheader, $in_longcomment); |
129
|
|
|
|
|
|
|
|
130
|
0
|
|
0
|
|
|
|
while (my $line = ($peeked_line || <$roget>)) { |
131
|
0
|
|
|
|
|
|
undef $peeked_line; |
132
|
|
|
|
|
|
|
|
133
|
0
|
|
|
|
|
|
chomp $line; |
134
|
0
|
0
|
|
|
|
|
next unless $line; |
135
|
0
|
0
|
|
|
|
|
next if ($line =~ /^#/); # comment |
136
|
|
|
|
|
|
|
|
137
|
0
|
0
|
|
|
|
|
if ($line =~ /^\s*<--/) { $in_longcomment = 1; } |
|
0
|
|
|
|
|
|
|
138
|
0
|
0
|
|
|
|
|
if ($line =~ /-->$/) { $in_longcomment = 0; next; } |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
139
|
0
|
0
|
|
|
|
|
next if $in_longcomment; |
140
|
|
|
|
|
|
|
|
141
|
0
|
0
|
|
|
|
|
if ($line =~ /^%/) { |
142
|
0
|
|
|
|
|
|
$in_newheader = not $in_newheader; |
143
|
0
|
|
|
|
|
|
next; |
144
|
|
|
|
|
|
|
} |
145
|
0
|
0
|
|
|
|
|
next if $in_newheader; |
146
|
|
|
|
|
|
|
|
147
|
0
|
|
|
|
|
|
$line =~ s/^\s+//; |
148
|
|
|
|
|
|
|
|
149
|
0
|
|
|
|
|
|
until ($peeked_line) { |
150
|
0
|
|
|
|
|
|
$peeked_line = <$roget>; |
151
|
0
|
0
|
|
|
|
|
last unless defined $peeked_line; |
152
|
0
|
|
|
|
|
|
chomp $peeked_line; |
153
|
0
|
0
|
0
|
|
|
|
if ($peeked_line and $peeked_line !~ /^\s{4}/ |
|
|
|
0
|
|
|
|
|
154
|
|
|
|
|
|
|
and $peeked_line !~ /^(?:#|%|<--)/) |
155
|
|
|
|
|
|
|
{ |
156
|
0
|
0
|
|
|
|
|
$line .= q{ } unless (substr($line, -1, 1) eq q{-}); |
157
|
0
|
|
|
|
|
|
$line .= $peeked_line; |
158
|
0
|
|
|
|
|
|
undef $peeked_line; |
159
|
0
|
0
|
|
|
|
|
if ($line =~ /[^,]+,[^.]+\.\s{4}/) { |
160
|
0
|
|
|
|
|
|
($line, $peeked_line) = split /\s{4}/, $line, 2; |
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
} |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
|
165
|
0
|
|
|
|
|
|
my ($sec, $title, $newline) = |
166
|
|
|
|
|
|
|
($line =~ /^#?(\d+[a-z]?). (.*?)(?:--(.*))?$/); |
167
|
0
|
0
|
0
|
|
|
|
$line = ($newline||'') if ($sec); |
168
|
|
|
|
|
|
|
|
169
|
0
|
0
|
|
|
|
|
if ($sec) { |
170
|
0
|
|
|
|
|
|
(my($comment_beginning), $title, my($comment_end)) = |
171
|
|
|
|
|
|
|
($title =~ /(?:\[(.+?)\.?\])?\s*([^.]+)\.?\s*(?:\[(.+?)\.?\])?/); |
172
|
0
|
|
|
|
|
|
$title =~ s/(^\s+|\s{2,}|\s+$)//g; |
173
|
0
|
|
|
|
|
|
$section{$sec} = { |
174
|
|
|
|
|
|
|
name => $title, |
175
|
|
|
|
|
|
|
subsections => [ { text => $line||'' } ], |
176
|
0
|
|
0
|
|
|
|
comments => [ grep { defined $_ } ($comment_beginning, $comment_end) ] |
177
|
|
|
|
|
|
|
}; |
178
|
0
|
|
|
|
|
|
@{$section{$sec}}{qw[major minor]} = ($sec =~ /^(\d+)(.*)$/); |
|
0
|
|
|
|
|
|
|
179
|
0
|
0
|
|
|
|
|
Carp::confess "couldn't parse section: $sec" unless $section{$sec}{major}; |
180
|
0
|
|
|
|
|
|
$previous_section = $sec; |
181
|
|
|
|
|
|
|
} else { |
182
|
0
|
|
0
|
|
|
|
$section{$previous_section}{subsections} ||= []; |
183
|
0
|
|
|
|
|
|
push @{$section{$previous_section}{subsections}}, { text => $line }; |
|
0
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
} |
185
|
|
|
|
|
|
|
} |
186
|
0
|
|
|
|
|
|
return %section; |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
} |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=head2 C<< bloom_sections(\%sections) >> |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
Given a reference to the section hash, this subroutine expands the sections |
193
|
|
|
|
|
|
|
into subsections, groups, and entries. |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=cut |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
sub bloom_sections { |
198
|
0
|
|
|
0
|
1
|
|
my ($section) = @_; |
199
|
|
|
|
|
|
|
|
200
|
0
|
|
|
|
|
|
my $decomma = Text::CSV_XS->new; |
201
|
0
|
|
|
|
|
|
my $desemi = Text::CSV_XS->new({sep_char => q{;}}); |
202
|
|
|
|
|
|
|
|
203
|
0
|
|
|
|
|
|
my $types = qr/(Adj|Adv|Int|N|Phr|Pron|V)/; |
204
|
|
|
|
|
|
|
|
205
|
0
|
|
|
|
|
|
for (values %$section) { |
206
|
0
|
|
|
|
|
|
my $previous_subsection; |
207
|
0
|
|
|
|
|
|
for my $subsection (@{$_->{subsections}}) { |
|
0
|
|
|
|
|
|
|
208
|
0
|
|
|
|
|
|
$subsection->{text} =~ s/\.$//; |
209
|
0
|
|
|
|
|
|
$subsection->{text} =~ s/ {2,}/ /g; |
210
|
0
|
|
|
|
|
|
$subsection->{text} =~ s/(^\s+|\s+$)//g; |
211
|
|
|
|
|
|
|
|
212
|
0
|
0
|
|
|
|
|
if (my ($type) = ($subsection->{text} =~ /^$types\./)) { |
|
|
0
|
|
|
|
|
|
213
|
0
|
|
|
|
|
|
$subsection->{text} =~ s/^$type\.//; |
214
|
0
|
|
|
|
|
|
$subsection->{type} = $type; |
215
|
|
|
|
|
|
|
} elsif ($previous_subsection) { |
216
|
0
|
|
|
|
|
|
$subsection->{type} = $previous_subsection->{type}; |
217
|
|
|
|
|
|
|
} else { |
218
|
0
|
|
|
|
|
|
$subsection->{type} = 'UNKNOWN'; |
219
|
|
|
|
|
|
|
} |
220
|
|
|
|
|
|
|
|
221
|
0
|
|
|
|
|
|
$desemi->parse(delete $subsection->{text}); |
222
|
0
|
|
|
|
|
|
$subsection->{groups} = [ map { { text => $_ } } $desemi->fields ]; |
|
0
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
|
224
|
0
|
|
|
|
|
|
for my $group (@{$subsection->{groups}}) { |
|
0
|
|
|
|
|
|
|
225
|
0
|
|
|
|
|
|
$decomma->parse(delete $group->{text}); |
226
|
0
|
|
|
|
|
|
$group->{entries} = [ map { { text => $_, flags => [] } } $decomma->fields ]; |
|
0
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
|
228
|
0
|
|
|
|
|
|
for (@{$group->{entries}}) { |
|
0
|
|
|
|
|
|
|
229
|
0
|
|
0
|
|
|
|
$_->{text}||= 'UNPARSED'; |
230
|
0
|
0
|
|
|
|
|
if ($_->{text} =~ s/\[obs3\]//) { |
231
|
0
|
|
|
|
|
|
push @{$_->{flags}}, 'archaic? (1991)'; |
|
0
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
} |
233
|
0
|
0
|
|
|
|
|
if ($_->{text} =~ s/|!//) { |
234
|
0
|
|
|
|
|
|
push @{$_->{flags}}, 'obsolete (1991)'; |
|
0
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
} |
236
|
0
|
0
|
|
|
|
|
if ($_->{text} =~ s/|//) { |
237
|
0
|
|
|
|
|
|
push @{$_->{flags}}, 'obsolete (1911)'; |
|
0
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
} |
239
|
0
|
|
|
|
|
|
$_->{text} =~ s/(^\s+|\s+$)//; |
240
|
|
|
|
|
|
|
} |
241
|
|
|
|
|
|
|
} |
242
|
0
|
|
|
|
|
|
$previous_subsection = $subsection; |
243
|
|
|
|
|
|
|
} |
244
|
|
|
|
|
|
|
} |
245
|
|
|
|
|
|
|
} |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
=head1 THE FILE |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
=over 4 |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
=item * The thesaurus file is plain text, in 7-bit ASCII. |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
=item * Lines with a C<#> as their first character are comments. |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
=item * Lines beginning with C<< <-- >> begin multi-line comments. |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
=item * Lines ending with C<< --> >> end multi-line comments. |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
These multi-line comments were originally used for including the page numbers |
260
|
|
|
|
|
|
|
from the original text. Later editors used them (instead of C<#> comments) to |
261
|
|
|
|
|
|
|
mark their editorial notes. |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
There exists one situation in C where the C<< <-- >> occurs |
264
|
|
|
|
|
|
|
outside of position zero in a line. |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
=item * A line containing only C<%> begins or ends new "supersection" data. |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
So, if we wanted to begin a new supersection for "States of Mind", we might |
269
|
|
|
|
|
|
|
have the following: |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
% |
272
|
|
|
|
|
|
|
STATES OF MIND |
273
|
|
|
|
|
|
|
% |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
Unfortunately, there is almost no consistency in how these supersections are |
276
|
|
|
|
|
|
|
organized. Some entries declare new sections "SECTION VII. CHANGE" and |
277
|
|
|
|
|
|
|
immediately begin new subsupersections, "1. SIMPLE CHANGE". Others just give |
278
|
|
|
|
|
|
|
headings: "Present Events" |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
Then there is this excerpt: |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
% |
283
|
|
|
|
|
|
|
CLASS II |
284
|
|
|
|
|
|
|
WORDS RELATING TO SPACE |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
SECTION I. SPACE IN GENERAL |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
1. ABSTRACT SPACE |
289
|
|
|
|
|
|
|
% |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
I think the only thing to do is ignore all this crap, so that's what is done. |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
=item * Lines do not exceed 79 characters in width. |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
=item * New lines begin in column 6. |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
Some, though, begin in column 5, 6, or 8. A line starting in column 4 or less |
298
|
|
|
|
|
|
|
is always a continuation, unless it's in 0. (I'm not considering comments.) |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
Also, new lines might begin in any column, when a period that does not follow |
301
|
|
|
|
|
|
|
the declaration of a subsection type is followed by at least four spaces. |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
=item * Lines in column 0 are continuation. |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
Unless the previous line ended with a period. |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
Continuations are appended to the continued line. Unless the continued line |
308
|
|
|
|
|
|
|
ended with a hyphen, a space is used to join the lines. |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
There is at least one case where a word is split without a hyphen, appearing as |
311
|
|
|
|
|
|
|
"crocu\ns". This word is left broken in the parsed text. |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
=item * New lines beginning with a C<#> (not in column 0) begin new sections. |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
The C<#> is omitted in one case, section 252a. |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
=item * The C<#> in a section heading is followed by the section identifier. |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
The section identifier is a series of numbers followed by an optional letter |
320
|
|
|
|
|
|
|
and terminated by a period. |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
=item * The section header is followed by a name and comments. |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
The comments are marked off by square brackets, and may occur before or after |
325
|
|
|
|
|
|
|
the name, or both. |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
The name is all the rest of the text between the comments and before a |
328
|
|
|
|
|
|
|
terminating C<-->. |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
=item * Once section headers are removed, every new line is a new subsection. |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
=item * Subsections may begin with a type declaration. |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
A type declaration indicates that all entries in the subsection are of one |
335
|
|
|
|
|
|
|
part of speech: adjective, adverb, noun, interjection, phrase, and so on. |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
=item * Subsections with no declared type are of the type of the previous subsection. |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
Despite this rule, some subsections have no type. I file them as UNKNOWN. |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
=item * Subsections are divided into groups by semicolons. |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
Semicolons don't divide subsections, if they occur within double quotes. The |
344
|
|
|
|
|
|
|
quotes can be part of the line, and need not quote the entire group. |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
In other words, this is a valid subsection, consisting of three groups: |
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
... |
349
|
|
|
|
|
|
|
Int. go for it; "go ahead; make my day" [Dirty Harry]; give it a try, |
350
|
|
|
|
|
|
|
take a shot. |
351
|
|
|
|
|
|
|
... |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
=item * Groups are divided into entries by commas. |
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
Commas don't divide groups, if they occur within double quotes. The quotes can |
356
|
|
|
|
|
|
|
be just part of the entry, and need not quote the entire entry. |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
Some groups include the entry-breaking comma inside the quotes, like this one: |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
Phr. it must go no further, it will go no further; don't tell a |
361
|
|
|
|
|
|
|
soul;"tell it not in Gath,"nobody the wiser; alitur vitium vivitque |
362
|
|
|
|
|
|
|
tegendo[Lat][obs3]; "let it be tenable in your silence still"[Hamlet]. |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
The comma after "Gath" should be on the outside. |
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
=item * Flags may follow the text of an entry. |
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
Flags are either text enclosed in square brackets, as "[Lat]" or the special |
369
|
|
|
|
|
|
|
identifiers C<|!> or C<|>. These flags provide metadata about the entry, like |
370
|
|
|
|
|
|
|
its language of origin or the domain of jargon in which it is relevant. C<|> |
371
|
|
|
|
|
|
|
indicates that a word was obsolete in the 1911 edition, and C<|!> indicates |
372
|
|
|
|
|
|
|
words that were no longer used as indicated by 1991. (A third obsolete status |
373
|
|
|
|
|
|
|
is indicated by the flag "[obs3]" meaning "entirely archaic as of 1991.") |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
Sometimes, flags occur outside of quotes, and sometimes they're inside. The |
376
|
|
|
|
|
|
|
example text above shows flags outside of the Hamlet quote, but this section |
377
|
|
|
|
|
|
|
(which really exists) is a counter-example: |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
Phr. noli me tangere[Lat]; nemo me impune lacessit[Lat]; don't tread |
380
|
|
|
|
|
|
|
on me; don't you dare; don't even think of it; "Go ahead, make my day!" |
381
|
|
|
|
|
|
|
[Dirty Harry]. |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
=item * Cross-references may follow the entry. |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
I haven't determined whether any entries have both a cross-reference I |
386
|
|
|
|
|
|
|
flags. |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
A cross-reference is in the form "c&. 123" where 123 is a section, or "c&. n." |
389
|
|
|
|
|
|
|
where "n" refers to the noun-type subsections for this section. |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
Unfortunately, cross-references are not always followed by commas or |
392
|
|
|
|
|
|
|
semicolons, meaning that they sometimes seem to appear in the middle of an |
393
|
|
|
|
|
|
|
entry, as follows: |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
miss, miss one's aim, miss the mark, miss one's footing, miss stays; |
396
|
|
|
|
|
|
|
slip, trip, stumble; make a slip &c., n. blunder &c. 495, make a mess of, |
397
|
|
|
|
|
|
|
... |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
The cross-reference after "make a slip" should have a comma after the "n." but |
400
|
|
|
|
|
|
|
does not, so it appears to be the middle of an entry beginning after "stumble;" |
401
|
|
|
|
|
|
|
and ending before "make a mess of." |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
=back |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
=head1 TODO |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
Well, a good first step would be a TODO section. |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
I'll write some tests that will only run if you put a C file in |
410
|
|
|
|
|
|
|
the right place. I'll also try the tests with previous revisions of the file. |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
I'm also tempted to produce newer revisions on my own, after I contact the |
413
|
|
|
|
|
|
|
address listed in the file. The changes would just be to eliminate anomalies |
414
|
|
|
|
|
|
|
that prevent parsing. Distraction by shiny objects may prevent this goal. |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
The flags and cross reference bits above will be implemented. |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
The need for Text::CSV_XS may be eliminated. |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
Entries with internal quoting (especially common in phrases) will no longer |
421
|
|
|
|
|
|
|
become UNPARSED. |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
I'll try to eliminate more UNKNOWN subsection types. |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
=head1 AUTHOR |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
Ricardo Signes, C<< >> |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
=head1 BUGS |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
Please report any bugs or feature requests to |
432
|
|
|
|
|
|
|
C, or through the web interface at |
433
|
|
|
|
|
|
|
L. I will be notified, and then you'll automatically be |
434
|
|
|
|
|
|
|
notified of progress on your bug as I make changes. |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
=head1 COPYRIGHT |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
Copyright 2004 Ricardo Signes, All Rights Reserved. |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
441
|
|
|
|
|
|
|
under the same terms as Perl itself. |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
=cut |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
1; |