| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Text::ParseAHD; |
|
2
|
|
|
|
|
|
|
#use base qw(BASE); |
|
3
|
1
|
|
|
1
|
|
33020
|
use Text::ParseAHD::Word; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
use Text::ParseAHD::Definition; |
|
5
|
|
|
|
|
|
|
use Class::Std; |
|
6
|
|
|
|
|
|
|
use Class::Std::Utils; |
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
use warnings; |
|
9
|
|
|
|
|
|
|
use strict; |
|
10
|
|
|
|
|
|
|
use Carp; |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
use version; our $VERSION = qv('0.0.2'); |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
{ |
|
15
|
|
|
|
|
|
|
my %html_of :ATTR( :get :set :default<''> :init_arg ); |
|
16
|
|
|
|
|
|
|
my %word_of :ATTR( :get :set :default<''> :init_arg ); |
|
17
|
|
|
|
|
|
|
my %pos_of :ATTR( :get :set :default<''> :init_arg ); |
|
18
|
|
|
|
|
|
|
my %syllables_of :ATTR( :get :set :default<''> :init_arg ); |
|
19
|
|
|
|
|
|
|
my %defs_of :ATTR( :get :set :default<''> :init_arg ); |
|
20
|
|
|
|
|
|
|
my %text_of :ATTR( :get :set :default<''> :init_arg ); |
|
21
|
|
|
|
|
|
|
my %word_obj_obj_of :ATTR( :get :set :default<''> :init_arg ); |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
sub START { |
|
25
|
|
|
|
|
|
|
my ($self, $ident, $arg_ref) = @_; |
|
26
|
|
|
|
|
|
|
#$html_of{$ident} = $arg_ref->{html}; |
|
27
|
|
|
|
|
|
|
#$word_of{$ident} = $arg_ref->{word}; |
|
28
|
|
|
|
|
|
|
#$Word_of{$ident} = Text::ParseAHD::Word->new({'word',$word_of{$ident}}); |
|
29
|
|
|
|
|
|
|
$self->set_word_obj( Text::ParseAHD::Word->new({word => $self->get_word() }) ); |
|
30
|
|
|
|
|
|
|
return; |
|
31
|
|
|
|
|
|
|
} |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
sub parse_html { |
|
34
|
|
|
|
|
|
|
my ( $self ) = @_; |
|
35
|
|
|
|
|
|
|
my $ident = ident $self; |
|
36
|
|
|
|
|
|
|
my $html = $html_of{$ident}; |
|
37
|
|
|
|
|
|
|
while ( $html =~ m/(.*?)/gsix ) { |
|
38
|
|
|
|
|
|
|
my $definition_text = $1; |
|
39
|
|
|
|
|
|
|
$definition_text =~ s/(.*?)<\/b>//i; |
|
40
|
|
|
|
|
|
|
my $word = $self->clean_word( $1 ); |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# print " STATUS: AHD4 definition " . ++$count . " found\n"; |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
if ($word eq $word_of{$ident}) { |
|
45
|
|
|
|
|
|
|
$definition_text =~ s/\n//g; # Remove newlines |
|
46
|
|
|
|
|
|
|
$definition_text =~ s/\ //g; # Remove nbsp |
|
47
|
|
|
|
|
|
|
$definition_text =~ s/ //g; # Remove br |
|
48
|
|
|
|
|
|
|
$definition_text =~ s/^(.*?)//six; # Remove leading |
|
49
|
|
|
|
|
|
|
$definition_text =~ s/<\/td>.*$//six; # Remove trailing |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
my @defs = $self->split_defs( $definition_text ); |
|
52
|
|
|
|
|
|
|
my $i=1; |
|
53
|
|
|
|
|
|
|
foreach my $def (@defs) { |
|
54
|
|
|
|
|
|
|
print $i . "\n\n" . $def . "\n\n\n\n"; |
|
55
|
|
|
|
|
|
|
$self->parse_definition( $def ); |
|
56
|
|
|
|
|
|
|
$i++; |
|
57
|
|
|
|
|
|
|
} |
|
58
|
|
|
|
|
|
|
} |
|
59
|
|
|
|
|
|
|
#print $definition_text."\n\n\n"; |
|
60
|
|
|
|
|
|
|
} |
|
61
|
|
|
|
|
|
|
$self->report_word(); |
|
62
|
|
|
|
|
|
|
return; |
|
63
|
|
|
|
|
|
|
} |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
sub clean_word { |
|
66
|
|
|
|
|
|
|
my ($self, $word) = @_; |
|
67
|
|
|
|
|
|
|
my $A = chr(194); |
|
68
|
|
|
|
|
|
|
my $bullet = chr(183); |
|
69
|
|
|
|
|
|
|
$word =~ s/://g; |
|
70
|
|
|
|
|
|
|
$word =~ s/$A//g; |
|
71
|
|
|
|
|
|
|
$word =~ s/$bullet//g; |
|
72
|
|
|
|
|
|
|
$word =~ s/.*?<\/sup>//; # RH 080719 |
|
73
|
|
|
|
|
|
|
$word =~ s/^\s+//; # RH 080719 |
|
74
|
|
|
|
|
|
|
$word =~ s/\s+$//; # RH 080719 |
|
75
|
|
|
|
|
|
|
return $word; |
|
76
|
|
|
|
|
|
|
} |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
sub split_defs { |
|
79
|
|
|
|
|
|
|
my ($self, $text) = @_; |
|
80
|
|
|
|
|
|
|
my @defs = split(//, $text ); shift @defs; |
|
81
|
|
|
|
|
|
|
return @defs; |
|
82
|
|
|
|
|
|
|
} |
|
83
|
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
sub parse_definition { |
|
85
|
|
|
|
|
|
|
my ($self, $text) = @_; |
|
86
|
|
|
|
|
|
|
my $ident = ident $self; |
|
87
|
|
|
|
|
|
|
my $pos = ''; |
|
88
|
|
|
|
|
|
|
my @word_forms = (); |
|
89
|
|
|
|
|
|
|
my @definitions = (); |
|
90
|
|
|
|
|
|
|
$text =~ s/(.*?)//six; |
|
91
|
|
|
|
|
|
|
$pos=$1; |
|
92
|
|
|
|
|
|
|
$text =~ s/(.*?)//six; |
|
93
|
|
|
|
|
|
|
my $subhead = $1; |
|
94
|
|
|
|
|
|
|
if (defined $subhead) { |
|
95
|
|
|
|
|
|
|
#while ($subhead =~ m/(.*?)<\/b>/gisx) { push @word_forms, $self->syllables( $1 ); } |
|
96
|
|
|
|
|
|
|
while ($subhead =~ m/(.*?)<\/b>/gisx) { push @word_forms, $1; } |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
if ($subhead =~ m/(.*?)<\/i>/i) { $pos = $1; $pos =~ s/\.//g; } # Subtype |
|
99
|
|
|
|
|
|
|
} |
|
100
|
|
|
|
|
|
|
#print "WORD FORMS: " . join(' ', @word_forms) ."\n\n\n"; |
|
101
|
|
|
|
|
|
|
while ($text =~ m/(.*?)/gsix) { |
|
102
|
|
|
|
|
|
|
my $def_text = $1; |
|
103
|
|
|
|
|
|
|
my $new_defs; |
|
104
|
|
|
|
|
|
|
if ($def_text =~ m/parse_list( $def_text, $pos ); } |
|
105
|
|
|
|
|
|
|
else { $new_defs = [$self->parse_single( $def_text )]; } |
|
106
|
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
foreach my $definition (@{ $new_defs }) {push @definitions, $definition; } |
|
108
|
|
|
|
|
|
|
} |
|
109
|
|
|
|
|
|
|
foreach my $definition (@definitions){ |
|
110
|
|
|
|
|
|
|
$self->get_word_obj()->add_definition($definition->{text}, $definition->{example}, $pos); |
|
111
|
|
|
|
|
|
|
} |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
} |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
sub parse_list { |
|
116
|
|
|
|
|
|
|
my ($self, $text, $pos ) = @_; |
|
117
|
|
|
|
|
|
|
$text =~ s//\[/ig; |
|
118
|
|
|
|
|
|
|
$text =~ s/<\/ol>/]/ig; |
|
119
|
|
|
|
|
|
|
$text =~ s//{ text => '/ig; |
|
120
|
|
|
|
|
|
|
$text =~ s/<\/li>/'}, /ig; |
|
121
|
|
|
|
|
|
|
$text =~ s/'\[/\[/ig; |
|
122
|
|
|
|
|
|
|
$text =~ s/,\s]/]/ig; |
|
123
|
|
|
|
|
|
|
$text =~ s/]'/]/ig; |
|
124
|
|
|
|
|
|
|
$text =~ s/Informal<\/i>//ig;#added by Nathan |
|
125
|
|
|
|
|
|
|
$text =~ s/Slang<\/i>//ig;#added by Nathan |
|
126
|
|
|
|
|
|
|
$text =~ s/text => '? ?\[{ text => '/text => '/ig; |
|
127
|
|
|
|
|
|
|
$text =~ s/{ text => '.*<\/i> \[{ text/{ text/ig; |
|
128
|
|
|
|
|
|
|
$text =~ s/}\]}/}/ig; |
|
129
|
|
|
|
|
|
|
$text =~ s/({ text => '[-\.\w\s]*)'([-\.\w\s]*'},)/$1$2/ig; |
|
130
|
|
|
|
|
|
|
#$text =~ s///ig; |
|
131
|
|
|
|
|
|
|
#$text =~ s/<\/ol>/]/ig; |
|
132
|
|
|
|
|
|
|
#$text =~ s//{ text => '/ig; |
|
133
|
|
|
|
|
|
|
#$text =~ s/<\/li>/'}, /ig; |
|
134
|
|
|
|
|
|
|
#$text =~ s/'\[/\[/ig; |
|
135
|
|
|
|
|
|
|
#$text =~ s/,\s]/]/ig; |
|
136
|
|
|
|
|
|
|
#$text =~ s/]'/]/ig; |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
print "LIST TEXT: $text\n"; |
|
139
|
|
|
|
|
|
|
my $definitions = eval "return $text;"; |
|
140
|
|
|
|
|
|
|
#my $definitions = [{ text => 'A domesticated carnivorous mammal (Canis familiaris) related to the foxes and wolves and raised in a wide variety of breeds.'}, { text => 'Any of various carnivorous mammals of the family Canidae, such as the dingo.'}, { text => 'A male animal of the family Canidae, especially of the fox or a domesticated breed.'}, { text => 'Any of various other animals, such as the prairie dog.'}, { text => 'Informal [{ text => 'A person: You won, you lucky dog.'}, { text => 'A person regarded as contemptible: You stole my watch, you dog.'}, { text => 'A person regarded as unattractive or uninteresting.'}, { text => 'Something of inferior or low quality: "The President had read the speech to some of his friends and they told him it was a dog" (John P. Roche).'}, { text => 'An investment that produces a low return or a loss.'}]}, { text => 'Slang [{ text => 'A person regarded as unattractive or uninteresting.'}, { text => 'Something of inferior or low quality: "The President had read the speech to some of his friends and they told him it was a dog" (John P. Roche).'}, { text => 'An investment that produces a low return or a loss.'}]}, { text => 'dogs Slang The feet.'}, { text => 'See andiron.'}, { text => 'Slang A hot dog; a wiener.'}, { text => 'Any of various hooked or U-shaped metallic devices used for gripping or holding heavy objects.'}, { text => 'Astronomy A sun dog.'}]; |
|
141
|
|
|
|
|
|
|
#while($text=~m/[ |
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
my @definitions; |
|
144
|
|
|
|
|
|
|
foreach my $definition ( @$definitions ) { |
|
145
|
|
|
|
|
|
|
#foreach my $definition ( @definitions ) { |
|
146
|
|
|
|
|
|
|
#print "HELLO: $definition->{text}\n\n"; |
|
147
|
|
|
|
|
|
|
if ( $definition->{text} =~ m/^ARRAY/ ) { |
|
148
|
|
|
|
|
|
|
foreach my $sub_list ( @{ $definition->{text} } ) { |
|
149
|
|
|
|
|
|
|
push @definitions, $self->parse_single( $sub_list->{text} ); |
|
150
|
|
|
|
|
|
|
} |
|
151
|
|
|
|
|
|
|
} else { |
|
152
|
|
|
|
|
|
|
push @definitions, $self->parse_single( $definition->{text} ); |
|
153
|
|
|
|
|
|
|
#$self->get_word_obj()->add_definition($definitions[-1]{text}, $definitions[-1]{example}, $pos); |
|
154
|
|
|
|
|
|
|
#my $list= $self->get_word_obj()->get_defs(); |
|
155
|
|
|
|
|
|
|
#my @list2 = @$list; |
|
156
|
|
|
|
|
|
|
#$self->report_word(-1); |
|
157
|
|
|
|
|
|
|
} |
|
158
|
|
|
|
|
|
|
} |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
return \@definitions; |
|
161
|
|
|
|
|
|
|
} |
|
162
|
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
sub report_word{ |
|
164
|
|
|
|
|
|
|
my ($self, $i)=@_; |
|
165
|
|
|
|
|
|
|
my $word = $self->get_word_obj(); |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
print "WORD: " . $word->get_word() . "\n"; |
|
168
|
|
|
|
|
|
|
my $list = $word->get_defs(); |
|
169
|
|
|
|
|
|
|
my @list2 = @$list; |
|
170
|
|
|
|
|
|
|
if($i eq ''){ |
|
171
|
|
|
|
|
|
|
$i=1; |
|
172
|
|
|
|
|
|
|
foreach my $def (@list2){ |
|
173
|
|
|
|
|
|
|
print "DEF#$i:\n text: " . $def->get_text() . "\n example: " . $def->get_example() . "\n pos: " . $def->get_pos() . "\n\n"; |
|
174
|
|
|
|
|
|
|
$i++; |
|
175
|
|
|
|
|
|
|
} |
|
176
|
|
|
|
|
|
|
}else{ |
|
177
|
|
|
|
|
|
|
my $def = $list2[$i]; |
|
178
|
|
|
|
|
|
|
print "DEF#$i:\n text: " . $def->get_text() . "\n example: " . $def->get_example() . "\n pos: " . $def->get_pos() . "\n\n"; |
|
179
|
|
|
|
|
|
|
} |
|
180
|
|
|
|
|
|
|
} |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
sub parse_single { |
|
183
|
|
|
|
|
|
|
my ($self, $text ) = @_; |
|
184
|
|
|
|
|
|
|
my $ident = ident $self; |
|
185
|
|
|
|
|
|
|
while ( $text =~ m/(.*?)<\/b>(.*?)<\/i>/gi ) { |
|
186
|
|
|
|
|
|
|
my ($word_form, $pos) = ($self->syllables( $1 ), $2); $pos =~ s/\.//g; $pos =~ s/ //g; |
|
187
|
|
|
|
|
|
|
#$self->_insert_word_form( $word_form, $pos ); |
|
188
|
|
|
|
|
|
|
} |
|
189
|
|
|
|
|
|
|
my (@keys) = (); |
|
190
|
|
|
|
|
|
|
my (@roots) = (); |
|
191
|
|
|
|
|
|
|
my (%word_forms) = (); |
|
192
|
|
|
|
|
|
|
$text =~ s/(.*?)<\/tt>//i; |
|
193
|
|
|
|
|
|
|
my $root=''; |
|
194
|
|
|
|
|
|
|
#$root = $1; if ($root eq $text) { $root = ''; } |
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
# Check for additional word forms |
|
197
|
|
|
|
|
|
|
while ( $text =~ s/(.*?)<\/b>(.*?)<\/i>//gi ) { |
|
198
|
|
|
|
|
|
|
my ($word_form, $pos) = ($self->syllables($1), $2); $pos =~ s/\.//g; $pos =~ s/ //g; |
|
199
|
|
|
|
|
|
|
$word_forms{$word_form} = $pos; |
|
200
|
|
|
|
|
|
|
} |
|
201
|
|
|
|
|
|
|
if (keys %word_forms) { |
|
202
|
|
|
|
|
|
|
foreach my $key (sort keys %word_forms) { $self->_insert_word_form( $key, $word_forms{$key} ); } |
|
203
|
|
|
|
|
|
|
} else { |
|
204
|
|
|
|
|
|
|
# No word forms, parse definition, root, example |
|
205
|
|
|
|
|
|
|
#print "SINGLE TEXT: $text \n"; |
|
206
|
|
|
|
|
|
|
my $example = ''; |
|
207
|
|
|
|
|
|
|
$text =~ m/()/ig; #resetting $1 |
|
208
|
|
|
|
|
|
|
$text =~ s/(.*?)<\/i>//i; |
|
209
|
|
|
|
|
|
|
$example = $1; |
|
210
|
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
#print "EXAMPLE: $example\n"; |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
if ($text eq $example) |
|
214
|
|
|
|
|
|
|
{ $example = ''; } |
|
215
|
|
|
|
|
|
|
$example =~ s/["']//i; |
|
216
|
|
|
|
|
|
|
$example =~ s/\.//i; |
|
217
|
|
|
|
|
|
|
$text =~ s/\[.*?]//i; |
|
218
|
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
if ($root) { |
|
220
|
|
|
|
|
|
|
if ($root =~ m/(.*?)<\/font>/g) { $root = $1; } |
|
221
|
|
|
|
|
|
|
while ($root =~ m/(.*?)<\/b>/g) { push @roots, $1; } |
|
222
|
|
|
|
|
|
|
if (@roots) { $root = join(',', @roots); } |
|
223
|
|
|
|
|
|
|
$root =~ s/'/:/g; } |
|
224
|
|
|
|
|
|
|
$text =~ s/<\/font>//i; |
|
225
|
|
|
|
|
|
|
$text =~ s/(.*?)<\/i>//i; |
|
226
|
|
|
|
|
|
|
$text =~ s/.*?<\/sup>//i; |
|
227
|
|
|
|
|
|
|
$text =~ s/://i; |
|
228
|
|
|
|
|
|
|
$text =~ s/["']//i; |
|
229
|
|
|
|
|
|
|
$text =~ s/See Synonyms at .*?<\/a>\.//i; |
|
230
|
|
|
|
|
|
|
$text =~ s/See .*?<\/a>//i; |
|
231
|
|
|
|
|
|
|
$text =~ s/.*?<\/b>//i; |
|
232
|
|
|
|
|
|
|
$text =~ s/.*?<\/font>//i; |
|
233
|
|
|
|
|
|
|
$text =~ s/.*?<\/i>//i; |
|
234
|
|
|
|
|
|
|
$text =~ s/\.//i; |
|
235
|
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
# push @keys, "word_id => '$word_id_of{$ident}'"; |
|
237
|
|
|
|
|
|
|
push @keys, "word => '$word_of{$ident}'"; |
|
238
|
|
|
|
|
|
|
if ($pos_of{$ident} && !$root) { push @keys, "pos => '$pos_of{$ident}'"; } |
|
239
|
|
|
|
|
|
|
if ( $root && $example) { push @keys, "root => '$root'"; push @keys, "root_def => '" . $self->_sql_escape($example) . "'"; $example = ''; } |
|
240
|
|
|
|
|
|
|
elsif ( $root ) { push @keys, "root => '$root'"; } |
|
241
|
|
|
|
|
|
|
#if ($example) { push @keys, "example => '" . $self->_sql_escape($example) . "'"; } |
|
242
|
|
|
|
|
|
|
if ($example) { push @keys, "example => '$example'"; } |
|
243
|
|
|
|
|
|
|
#if ($text) { push @keys, "text => '" . $self->_sql_escape($text) . "'"; } |
|
244
|
|
|
|
|
|
|
if ($text) { push @keys, "text => '$text'"; } |
|
245
|
|
|
|
|
|
|
} |
|
246
|
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
return eval "return { " . join(', ', @keys) . " };"; |
|
248
|
|
|
|
|
|
|
} |
|
249
|
|
|
|
|
|
|
} |
|
250
|
|
|
|
|
|
|
1; # Magic true value required at end of module |
|
251
|
|
|
|
|
|
|
__END__ |