File Coverage

blib/lib/Text/Mining.pm
Criterion Covered Total %
statement 3 3 100.0
branch n/a
condition n/a
subroutine 1 1 100.0
pod n/a
total 4 4 100.0


line stmt bran cond sub pod time code
1             package Text::Mining;
2 2     2   27104 use base qw(Text::Mining::Base);
  2         4  
  2         777  
3             use Class::Std;
4             use Class::Std::Utils;
5             use Text::Mining::Corpus;
6             use Text::Mining::Corpus::Document;
7             use Text::Mining::Shell;
8              
9             use warnings;
10             use strict;
11             use Carp;
12              
13             use version; our $VERSION = qv('0.0.8');
14              
15             {
16             my %attribute_of : ATTR( get => 'attribute', set => 'attribute' );
17            
18             sub BUILD {
19             my ($self, $ident, $arg_ref) = @_;
20            
21             # &DBConnect( %{ $self->_library_connect_parameters() } );
22              
23             return;
24             }
25              
26             sub shell { my $shell = Text::Mining::Shell->new(); $shell->cmdloop(); }
27             sub version { return "VERSION $VERSION"; }
28              
29             sub create_corpus { my ( $self, $arg_ref ) = @_; return Text::Mining::Corpus->new( $arg_ref ); }
30             sub get_corpus { my ( $self, $arg_ref ) = @_; return Text::Mining::Corpus->new( $arg_ref ); }
31             sub delete_corpus { my ( $self, $arg_ref ) = @_; my $corpus = Text::Mining::Corpus->new(); return $corpus->delete( $arg_ref ); }
32              
33             sub get_root_dir { my ( $self ) = @_; return $self->_get_root_dir(); }
34             sub get_root_url { my ( $self ) = @_; return $self->_get_root_url(); }
35             sub get_data_dir { my ( $self, $corpus_id ) = @_; return $self->_get_data_dir( $corpus_id ); }
36              
37             sub get_submitted_document { my ( $self, $arg_ref ) = @_; return Text::Mining::Corpus::Document->new( $arg_ref ); }
38             sub count_submitted_waiting { my ( $self ) = @_; my ( $count ) = $self->library()->sqlexec( "select count(*) from submitted_documents where exit_date = '0000-00-00 00:00:00'", '@' ); return $count; }
39             sub count_submitted_complete { my ( $self ) = @_; my ( $count ) = $self->library()->sqlexec( "select count(*) from submitted_documents where exit_date != '0000-00-00 00:00:00'", '@' ); return $count; }
40              
41             sub parse_document {
42             my ( $self, $arg_ref ) = @_;
43             my $document = defined $arg_ref->{document} ? $arg_ref->{document} : $self->_status( "No document to parse." );
44             my $algorithm = defined $arg_ref->{algorithm} ? $arg_ref->{algorithm} : $self->_status( "No algorithm defined." );
45              
46             return $document;
47             }
48              
49             sub get_all_corpuses {
50             my ( $self, @corpuses) = @_;
51             my $corpuses = $self->library()->sqlexec( "select corpus_id from corpuses order by corpus_id asc", '\@@' );
52             foreach my $corpus (@$corpuses) { push @corpuses, Text::Mining::Corpus->new({ corpus_id => $corpus->[0] }); }
53             return \@corpuses;
54             }
55              
56             sub get_corpus_id {
57             my ( $self, $arg_ref ) = @_;
58             my $corpus = Text::Mining::Corpus->new();
59             my ( $corpus_id ) = $self->library()->sqlexec( "select corpus_id from corpuses where corpus_name = '" . $arg_ref->{corpus_name} . "'", '@' );
60             return $corpus_id;
61             }
62              
63             sub process_urls {
64             my ( $self ) = @_;
65             my $corpuses = $self->get_all_corpuses();
66             foreach my $corpus( @$corpuses ) {
67             my $data_dir = $self->get_data_dir( $corpus->get_id() );
68             my $sql = "select submitted_url_id, corpus_id, submitted_by_user_id, submitted_url from submitted_urls where exit_date = '0000-00-00 00:00:00' and file_not_found = 0";
69             my $urls = $self->library()->sqlexec( $sql, '\@@' );
70             foreach my $url ( @$urls ) { $self->_download_url( $url, $data_dir ); }
71             }
72             }
73              
74             sub reprocess_urls {
75             my ( $self ) = @_;
76             my $corpuses = $self->get_all_corpuses();
77             foreach my $corpus( @$corpuses ) {
78             my ( $corpus_id ) = @$corpus;
79             my $data_dir = $self->get_data_dir( $corpus->get_id() );
80             my $sql = "select submitted_url_id, corpus_id, submitted_by_user_id, submitted_url from submitted_urls where file_not_found = 1";
81             my $urls = $self->library()->sqlexec( $sql, '\@@' );
82             foreach my $url ( @$urls ) { $self->_download_url( $url, $data_dir ); }
83             }
84             }
85              
86             sub _download_url {
87             my ( $self, $url_row, $data_dir ) = @_;
88             my ( $id, $corpus_id, $user_id, $url ) = @$url_row;
89             my $file_name = $self->_parse_file_name( $url );
90             my $path = $self->_build_directories( $url, $data_dir );
91             my $bytes = $self->_download_file({ target_dir => $data_dir . $path,
92             url => $url,
93             file_name => $file_name });
94             if ( $bytes ) {
95             my $sql = "insert into submitted_documents (submitted_url_id, corpus_id, submitted_by_user_id, document_path, document_file_name, bytes ) ";
96             $sql .= "values ('$id', '$corpus_id', '$user_id', '$path', '$file_name', '$bytes' )";
97             $self->library()->sqlexec( $sql );
98             $self->library()->sqlexec( "update submitted_urls set exit_date = now(), file_found = 1, file_not_found = 0 where submitted_url_id = '$id'" ); }
99             else {
100             $self->library()->sqlexec( "update submitted_urls set exit_date = now(), file_found = 0, file_not_found = 1 where submitted_url_id = '$id'" ); }
101             }
102              
103             sub _build_directories {
104             my ( $self, $url, $corpus_data_dir ) = @_;
105             my @path = split(/\//, $url); shift(@path); shift(@path); # Remove protocol
106             my $file = pop(@path);
107             my $path = '';
108            
109             foreach my $part (@path) {
110             $path .= '/' . $part;
111             if (! -e $corpus_data_dir . $path) { mkdir $corpus_data_dir . $path; }
112             }
113             return $path;
114             }
115            
116             }
117              
118             1; # Magic true value required at end of module
119             __END__