| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Text::Mining::Base; |
|
2
|
6
|
|
|
6
|
|
54497
|
use Class::Std; |
|
|
6
|
|
|
|
|
113922
|
|
|
|
6
|
|
|
|
|
42
|
|
|
3
|
6
|
|
|
6
|
|
6929
|
use Class::Std::Utils; |
|
|
6
|
|
|
|
|
41756
|
|
|
|
6
|
|
|
|
|
42
|
|
|
4
|
6
|
|
|
6
|
|
3554
|
use DBIx::MySperqlOO; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
use File::Spec; |
|
6
|
|
|
|
|
|
|
use YAML qw(DumpFile LoadFile); |
|
7
|
|
|
|
|
|
|
use Module::Runtime qw(use_module); |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use warnings; |
|
10
|
|
|
|
|
|
|
use strict; |
|
11
|
|
|
|
|
|
|
use Carp; |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
use version; our $VERSION = qv('0.0.8'); |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
# Path of the YAML configuration file, relative to $ENV{HOME} (joined in get_config_filename).
our $config_filename = '.corpus/config'; |
|
16
|
|
|
|
|
|
|
# Name of the status log file, relative to $ENV{HOME} (joined in get_status_filename).
our $status_filename = '.tm-status'; |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
{ |
|
19
|
|
|
|
|
|
|
# Per-object library database handle; filled in BUILD from the config's 'library' entry.
my %library_dbh_of : ATTR(); |
|
20
|
|
|
|
|
|
|
# Per-object analysis database handle; filled in BUILD from the config's 'analysis' entry.
my %analysis_dbh_of : ATTR(); |
|
21
|
|
|
|
|
|
|
# Per-object root directory; filled in BUILD from the config's 'root_dir' entry.
my %root_dir_of : ATTR(); |
|
22
|
|
|
|
|
|
|
# Per-object root URL; filled in BUILD from the config's 'root_url' entry.
my %root_url_of : ATTR(); |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# Accessor: returns the library database handle stored for this object.
sub library {
    my ($self) = @_;
    my $object_key = ident $self;
    return $library_dbh_of{$object_key};
}
|
25
|
|
|
|
|
|
|
# Accessor: returns the analysis database handle stored for this object.
sub analysis {
    my ($self) = @_;
    my $object_key = ident $self;
    return $analysis_dbh_of{$object_key};
}
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# Accessor: returns the root URL stored for this object.
sub get_root_url {
    my ($self) = @_;
    my $object_key = ident $self;
    return $root_url_of{$object_key};
}
|
28
|
|
|
|
|
|
|
# Accessor: returns the root directory stored for this object.
sub get_root_dir {
    my ($self) = @_;
    my $object_key = ident $self;
    return $root_dir_of{$object_key};
}
|
29
|
|
|
|
|
|
|
# Builds the document directory path for a corpus:
#   <root_dir>/documents/corpus_<corpus_id>
#
#   $corpus_id - identifier appended to the "corpus_" directory prefix
#
# Returns the path as a string; nothing is created or checked on disk.
sub get_data_dir {
    my ( $self, $corpus_id ) = @_;
    my $root_dir = $self->get_root_dir();
    return $root_dir . "/documents/corpus_$corpus_id";
}
|
30
|
|
|
|
|
|
|
# Returns the configuration file path: $config_filename joined onto $ENV{HOME}.
sub get_config_filename {
    my @path_parts = ( $ENV{HOME}, $config_filename );
    return File::Spec->catfile(@path_parts);
}
|
31
|
|
|
|
|
|
|
# Returns the status log path: $status_filename joined onto $ENV{HOME}.
sub get_status_filename {
    my @path_parts = ( $ENV{HOME}, $status_filename );
    return File::Spec->catfile(@path_parts);
}
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# Class::Std initializer: loads the configuration and populates the
# per-object attributes.
#
#   $ident   - key used to index the per-object attribute hashes
#   $arg_ref - constructor arguments (not used here)
#
# The root directory and root URL are copied from the configuration, then
# one DBIx::MySperqlOO object is created for each of the 'library' and
# 'analysis' configuration entries.
sub BUILD {
    my ( $self, $ident, $arg_ref ) = @_;

    my $settings = $self->_load_config();

    my ( $root_dir, $root_url ) = @{$settings}{qw(root_dir root_url)};
    $root_dir_of{$ident} = $root_dir;
    $root_url_of{$ident} = $root_url;

    my $library_dbh = DBIx::MySperqlOO->new( $settings->{library} );
    $library_dbh_of{$ident} = $library_dbh;

    my $analysis_dbh = DBIx::MySperqlOO->new( $settings->{analysis} );
    $analysis_dbh_of{$ident} = $analysis_dbh;

    return;
}
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# Looks up a corpus id by its name in the library database.
#
#   $arg_ref->{corpus_name} - name to search for in corpuses.corpus_name
#
# Returns the first value of the result row produced by sqlexec, which is
# undef when the query yields nothing.
#
# The name is passed through _sql_escape before being placed in the
# statement, so quotes or backslashes in it cannot terminate the string
# literal and alter the query.
sub get_corpus_id_from_name {
    my ( $self, $arg_ref ) = @_;

    # A missing name becomes an empty string instead of an "uninitialized" warning.
    my $corpus_name = defined $arg_ref->{corpus_name} ? $arg_ref->{corpus_name} : '';
    my $safe_name   = $self->_sql_escape($corpus_name);

    my $sql = "select corpus_id from corpuses where corpus_name = '" . $safe_name . "'";
    my ($corpus_id) = $self->library()->sqlexec( $sql, '@' );
    return $corpus_id;
}
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
# Reads the configuration file named by get_config_filename() with
# YAML's LoadFile and returns whatever LoadFile produces.
sub _load_config {
    my ($self) = @_;
    my $config_path = $self->get_config_filename();
    return LoadFile($config_path);
}
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# Returns the last slash-separated segment of $url.
#
# split discards trailing empty fields, so a trailing slash is ignored
# ("a/b/" yields "b"); an empty string yields undef.
sub _parse_file_name {
    my ( $self, $url ) = @_;
    my @segments  = split( /\//, $url );
    my $file_name = $segments[-1];
    return $file_name;
}
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
# Downloads a URL into a directory with wget and reports the resulting size.
#
#   $arg_ref->{target_dir} - directory passed to wget as --directory-prefix
#   $arg_ref->{url}        - URL to fetch
#   $arg_ref->{tries}      - value for wget --tries (defaults to 2)
#
# Returns the size in bytes of "<target_dir>/<last URL segment>" after wget
# has run, or '0' when either argument is missing/false or the file does
# not exist (or is empty) afterwards.
#
# wget is started with a list-form pipe open, so no shell is involved and
# shell metacharacters in the URL or directory cannot run other commands.
# As with the earlier backtick call, wget's standard output is read and
# discarded and its exit status is ignored.
sub _download_file {
    my ( $self, $arg_ref ) = @_;

    my $target_dir = defined $arg_ref->{target_dir} ? $arg_ref->{target_dir} : '';
    my $url        = defined $arg_ref->{url}        ? $arg_ref->{url}        : '';
    my $tries      = defined $arg_ref->{tries}      ? $arg_ref->{tries}      : 2;

    return '0' if !( $target_dir && $url );

    my $file_name = $self->_parse_file_name($url);

    # '--' ends option parsing so a URL starting with '-' is not read as an option.
    my @command = ( 'wget', "--tries=$tries", "--directory-prefix=$target_dir", '--', $url );

    if ( open my $wget_out, '-|', @command ) {
        1 while <$wget_out>;    # drain and discard stdout
        close $wget_out;        # exit status intentionally ignored
    }

    my @stat = stat("$target_dir/$file_name");
    return $stat[7] || '0';
}
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# Backslash-escapes single quotes, double quotes and backslashes in $string.
#
# False values (undef, '', '0') are returned untouched.
sub _sql_escape {
    my ( $self, $string ) = @_;
    return $string if !$string;
    $string =~ s/(['"\\])/\\$1/g;
    return $string;
}
|
85
|
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
# Prepares HTML-derived text for use in SQL: the text is first passed
# through _html_unescape and the result through _sql_escape.
sub _html_to_sql {
    my ( $self, $string ) = @_;
    my $unescaped = $self->_html_unescape($string);
    my $escaped   = $self->_sql_escape($unescaped);
    return $escaped;
}
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
# Replaces quote characters with HTML character references so the text can
# be placed inside a quoted HTML attribute.
#
#   '  becomes  &#39;
#   "  becomes  &quot;
#
# Previously both substitutions replaced each quote with itself, so the
# text came back unchanged. undef is returned as-is.
sub _html_escape {
    my ( $self, $string ) = @_;
    return $string if !defined $string;
    $string =~ s/'/&#39;/g;
    $string =~ s/"/&quot;/g;
    return $string;
}
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
# Percent-encodes a small set of characters in $string:
#   space -> %20,  ' -> %27,  { -> %7B,  } -> %7D
# All other characters are left as they are.
sub _html_encode {
    my ( $self, $string ) = @_;
    for ($string) {
        s/ /%20/g;
        s/'/%27/g;
        s/\{/%7B/g;
        s/\}/%7D/g;
    }
    return $string;
}
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
# Decodes quote character references and %20 back to literal characters.
#
#   &#39; / &#x27; / &apos;  become  '
#   &quot; / &#34; / &#x22;  become  "
#   %20                      becomes a space
#
# Previously the two quote substitutions replaced each quote with itself,
# so encoded quotes were never decoded. undef is returned as-is.
sub _html_unescape {
    my ( $self, $string ) = @_;
    return $string if !defined $string;
    $string =~ s/&(?:#39|#x27|apos);/'/gi;
    $string =~ s/&(?:quot|#34|#x22);/"/gi;
    $string =~ s/%20/ /g;
    return $string;
}
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
# Formats the first run of ten consecutive digits in $string as
# "(XXX) XXX-XXXX". Text without such a run is returned unchanged.
sub _phone_format {
    my ( $self, $string ) = @_;
    ( my $formatted = $string ) =~ s/(\d{3})(\d{3})(\d{4})/($1) $2-$3/;
    return $formatted;
}
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
# Removes every non-digit character from $string and returns what is left.
sub _phone_unformat {
    my ( $self, $string ) = @_;
    ( my $digits_only = $string ) =~ s/[^\d]//g;
    return $digits_only;
}
|
128
|
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
# Inserts thousands separators into the integer part of a number
# (technique from Perl Cookbook recipe 2.17).
#
# The string is reversed so that digit groups can be counted from the
# right; the lookahead that rejects a following "." keeps commas out of
# the fractional part. The result is reversed back before returning.
sub _commify {
    my ( $self, $string ) = @_;
    my $reversed = scalar reverse $string;
    $reversed =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g;
    my $commified = reverse $reversed;
    return $commified;
}
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
# Lists the non-directory entries directly inside $root_dir.
#
#   $root_dir - directory to read
#
# Returns the entry names (without the directory prefix) in readdir order.
# Entries whose name starts with "." are skipped, as are entries that
# cannot be stat'ed (for example dangling symlinks). Returns 0 when the
# directory cannot be opened.
#
# Directories are recognised with the -d file test. The earlier check on
# the first decimal digit of st_mode gave the right answer only for some
# mode values.
sub _get_files {
    my ( $self, $root_dir ) = @_;

    opendir( my $dir_handle, $root_dir ) or return 0;
    my @nodes = readdir($dir_handle);
    closedir($dir_handle);

    my @files;
    NODE:
    foreach my $node (@nodes) {
        next NODE if $node =~ m/^\./;

        # Scalar-context stat reports success and primes the "_" handle for -d.
        next NODE if !stat( $root_dir . '/' . $node );
        next NODE if -d _;

        push @files, $node;
    }
    return @files;
}
|
158
|
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
# Lists the sub-directories of $path, optionally descending into them.
#
#   $path       - directory to read
#   $nestedflag - when true, each sub-directory is scanned recursively and
#                 its results are placed right after the directory itself
#
# Returns full paths ("$path/$name"), visiting entries in sorted order.
# Entries whose name starts with "." are skipped. Returns 0 when $path
# cannot be opened.
#
# Recursion goes through $self->_get_dirs; the earlier call to GetDirs
# named a subroutine this class does not define. Directories are
# recognised with -d rather than by the first decimal digit of st_mode,
# and a lexical handle is used so nested calls do not share one handle.
sub _get_dirs {
    my ( $self, $path, $nestedflag ) = @_;

    opendir( my $dir_handle, $path ) or return 0;
    my @nodes = readdir($dir_handle);
    closedir($dir_handle);
    @nodes = sort @nodes;

    my @dirs;
    NODE:
    foreach my $node (@nodes) {

        # Drop any dirs (or files) that start with a period
        next NODE if $node =~ m/^\./;

        my $newpath = $path . '/' . $node;
        next NODE if !-d $newpath;

        push @dirs, $newpath;
        next NODE if !$nestedflag;

        # An unreadable sub-directory makes the nested call return 0; every
        # real result contains "/" and is therefore true, so grep removes
        # only that failure marker.
        my @subnodes = grep { $_ } $self->_get_dirs( $newpath, $nestedflag );
        push @dirs, @subnodes;
    }
    return @dirs;
}
|
191
|
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
# Reads a whole file and returns its content as one string.
#
#   $path_file_name - file to read
#
# Returns undef when the file does not exist, cannot be opened, or is
# empty. A failed open or close is reported through $self->_status. After
# a failed open the sub returns at once rather than reading from the
# unopened handle.
sub _get_file_text {
    my ( $self, $path_file_name ) = @_;

    my $text;
    return $text if !-e $path_file_name;

    my $IN;
    if ( !open( $IN, '<', $path_file_name ) ) {
        $self->_status("(Get) Cannot open $path_file_name: $!");
        return $text;
    }

    while ( my $line = <$IN> ) {
        $text .= $line;
    }

    close($IN) || $self->_status("(Get) Cannot close $path_file_name: $!");
    return $text;
}
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
# Writes $text to $path_file_name, replacing any existing content.
#
#   $path_file_name - file to create or truncate
#   $text           - content to write
#
# Returns 1 on success. On a failed open or close the problem is reported
# through $self->_status and nothing (undef / empty list) is returned.
#
# The write check uses low-precedence "or" so it tests the result of
# print. With "||" the test applied to $text, so a false payload such as
# '0' or '' was never written and real write errors went unnoticed.
sub _set_file_text {
    my ( $self, $path_file_name, $text ) = @_;

    my $OUT;
    if ( !open( $OUT, '>', $path_file_name ) ) {
        $self->_status("(Set) Cannot open $path_file_name: $!");
        return;
    }

    print {$OUT} $text
        or $self->_status("(Set) Cannot write $path_file_name: $!");

    if ( !close($OUT) ) {
        $self->_status("(Set) Cannot close $path_file_name: $!");
        return;
    }
    return 1;
}
|
209
|
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
# Appends $text to $path_file_name, creating the file if needed.
#
#   $path_file_name - file to append to
#   $text           - content to append
#
# Returns 1 on success. On a failed open or close the problem is reported
# through $self->_status and nothing (undef / empty list) is returned.
#
# The write check uses low-precedence "or" so it tests the result of
# print. With "||" the test applied to $text, so a false payload such as
# '0' or '' was never written and real write errors went unnoticed.
sub _add_file_text {
    my ( $self, $path_file_name, $text ) = @_;

    my $OUT;
    if ( !open( $OUT, '>>', $path_file_name ) ) {
        $self->_status("(Add) Cannot open $path_file_name: $!");
        return;
    }

    print {$OUT} $text
        or $self->_status("(Add) Cannot write $path_file_name: $!");

    if ( !close($OUT) ) {
        $self->_status("(Add) Cannot close $path_file_name: $!");
        return;
    }
    return 1;
}
|
216
|
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
# Appends one " STATUS: <msg> " line to the status file named by
# get_status_filename().
#
#   $msg - message text to record
#
# Croaks when the status file cannot be opened, written or closed.
# Returns nothing.
#
# The write check uses low-precedence "or" so that it tests print's
# result. With "||" it tested the always-true message string, so a failed
# write could never raise an error.
sub _status {
    my ( $self, $msg ) = @_;
    my $status_file = $self->get_status_filename();

    open( my $OUT, '>>', $status_file )
        or croak("(Status) Cannot open $status_file: $!");
    print {$OUT} " STATUS: $msg \n"
        or croak("(Status) Cannot write $status_file: $!");
    close($OUT)
        or croak("(Status) Cannot close $status_file: $!");

    return;
}
|
225
|
|
|
|
|
|
|
} |
|
226
|
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
1; # Magic true value required at end of module |
|
228
|
|
|
|
|
|
|
__END__ |