File Coverage

blib/lib/Hailo/Storage.pm
Criterion Covered Total %
statement 105 105 100.0
branch 16 16 100.0
condition 13 14 92.8
subroutine 20 20 100.0
pod 0 6 0.0
total 154 161 95.6


line stmt bran cond sub pod time code
1             package Hailo::Storage;
2             our $AUTHORITY = 'cpan:AVAR';
3             $Hailo::Storage::VERSION = '0.75';
4 29     29   98876 use v5.10.0;
  29         110  
5 29     29   620 use Moose;
  29         384481  
  29         176  
6 29     29   185080 use MooseX::StrictConstructor;
  29         26083  
  29         195  
7 29     29   94982 use MooseX::Types::Moose ':all';
  29         45371  
  29         406  
8 29     29   288148 use DBI;
  29         492085  
  29         1993  
9 29     29   15761 use Hailo::Storage::Schema;
  29         86  
  29         35493  
10              
11             has dbd => (
12             isa => Str,
13             is => 'ro',
14             lazy_build => 1,
15             documentation => "The DBD::* driver we're using",
16             );
17              
18             has dbd_options => (
19             isa => HashRef,
20             is => 'ro',
21             lazy_build => 1,
22             documentation => 'Options passed as the last argument to DBI->connect()',
23             );
24              
25             sub _build_dbd_options {
26 254     254   2656 my ($self) = @_;
27             return {
28 254         7719 RaiseError => 1
29             };
30             }
31              
32             has dbh => (
33             isa => 'DBI::db',
34             is => 'ro',
35             lazy_build => 1,
36             documentation => 'Our DBD object',
37             );
38              
39             sub _build_dbh {
40 249     249   508 my ($self) = @_;
41 249         6153 my $dbd_options = $self->dbi_options;
42              
43 249         5706 return DBI->connect($self->dbi_options);
44             };
45              
46             has dbi_options => (
47             isa => ArrayRef,
48             is => 'ro',
49             auto_deref => 1,
50             lazy_build => 1,
51             documentation => 'Options passed to DBI->connect()',
52             );
53              
54             sub _build_dbi_options {
55 249     249   484 my ($self) = @_;
56 249         5884 my $dbd = $self->dbd;
57 249         6025 my $dbd_options = $self->dbd_options;
58 249   100     6269 my $db = $self->brain // '';
59              
60 249         1212 my @options = (
61             "dbi:$dbd:dbname=$db",
62             '',
63             '',
64             $dbd_options,
65             );
66              
67 249         811 return \@options;
68             }
69              
70             has _engaged => (
71             isa => Bool,
72             is => 'rw',
73             default => 0,
74             documentation => 'Have we done setup work to get this database going?',
75             );
76              
77             has sth => (
78             isa => HashRef,
79             is => 'ro',
80             lazy_build => 1,
81             documentation => 'A HashRef of prepared DBI statement handles',
82             );
83              
84             sub _build_sth {
85 50     50   144 my ($self) = @_;
86 50         1258 return Hailo::Storage::Schema->sth($self->dbd, $self->dbh, $self->order);
87             }
88              
89             has _boundary_token_id => (
90             isa => Int,
91             is => 'rw',
92             );
93              
94             # bootstrap the database
95             sub _engage {
96 251     251   1518 my ($self) = @_;
97              
98 251 100       901 if ($self->initialized) {
99             # Check the order we've been given and retrieve it from the
100             # database if there's nothing odd going on.
101 209         813 $self->_engage_initialized_check_and_set_order;
102              
103             # Likewise for the Tokenizer
104 10         56 $self->_engage_initialized_check_and_set_tokenizer;
105              
106 8         214 $self->sth->{token_id}->execute(0, '');
107 8         263 my $id = $self->sth->{token_id}->fetchrow_array;
108 8         266 $self->_boundary_token_id($id);
109             }
110             else {
111 42         1176 Hailo::Storage::Schema->deploy($self->dbd, $self->dbh, $self->order);
112              
113             # Set metadata in the database for use by subsequent
114             # invocations
115             {
116             # Don't change order again
117 42         104 my $order = $self->order;
  42         1654  
118 42         1234 $self->sth->{set_info}->execute('markov_order', $order);
119              
120             # Warn if the tokenizer changes
121 42         1524 my $tokenizer = $self->tokenizer_class;
122 42         976 $self->sth->{set_info}->execute('tokenizer_class', $tokenizer);
123             }
124              
125 42         1125 $self->sth->{add_token}->execute(0, '');
126 42         1187 $self->sth->{last_token_rowid}->execute();
127 42         1081 my $id = $self->sth->{last_token_rowid}->fetchrow_array();
128 42         1292 $self->_boundary_token_id($id);
129             }
130              
131 50         1410 $self->_engaged(1);
132              
133 50         141 return;
134             }
135              
136             sub _engage_initialized_check_and_set_order {
137 209     209   481 my ($self) = @_;
138              
139 209         6517 my $sth = $self->dbh->prepare(qq[SELECT text FROM info WHERE attribute = ?;]);
140 209         25828 $sth->execute('markov_order');
141 209         3954 my $db_order = $sth->fetchrow_array();
142              
143 209         7735 my $my_order = $self->order;
144 209 100       635 if ($my_order != $db_order) {
145 200 100       4975 if ($self->hailo->{has_custom_order}->()) {
146 199         4302 die <<"DIE";
147             You've manually supplied an order of `$my_order' to Hailo but you're
148             loading a brain that has the order `$db_order'.
149              
150             Hailo will automatically load the order from existing brains, however
151             you've constructed Hailo and manually specified an order not
152             equivalent to the existing order of the database.
153              
154             Either supply the correct order or omit the order attribute
155             altogether. We could continue but I'd rather die since you're probably
156             expecting something I can't deliver.
157             DIE
158             }
159              
160 1         27 $self->order($db_order);
161 1         28 $self->hailo->{set_order}->($db_order);
162             }
163              
164 10         126 return;
165             }
166              
167             sub _engage_initialized_check_and_set_tokenizer {
168 10     10   26 my ($self) = @_;
169              
170 10         255 my $sth = $self->dbh->prepare(qq[SELECT text FROM info WHERE attribute = ?;]);
171 10         1020 $sth->execute('tokenizer_class');
172 10         152 my $db_tokenizer_class = $sth->fetchrow_array;
173 10         383 my $my_tokenizer_class = $self->tokenizer_class;
174              
175             # defined() because we can't count on old brains having this
176 10 100 100     75 if (defined $db_tokenizer_class
177             and $my_tokenizer_class ne $db_tokenizer_class) {
178 3 100       78 if ($self->hailo->{has_custom_tokenizer_class}->()) {
179 2         47 die <<"DIE";
180             You've manually supplied a tokenizer class `$my_tokenizer_class' to
181             Hailo, but you're loading a brain that has the tokenizer class
182             `$db_tokenizer_class'.
183              
184             Hailo will automatically load the tokenizer class from existing
185             brains, however you've constructed Hailo and manually specified an
186             tokenizer class not equivalent to the existing tokenizer class of the
187             database.
188              
189             Either supply the correct tokenizer class or omit the order attribute
190             altogether. We could continue but I'd rather die since you're probably
191             expecting something I can't deliver.
192             DIE
193             }
194              
195 1         28 $self->tokenizer_class($db_tokenizer_class);
196 1         26 $self->hailo->{set_tokenizer_class}->($db_tokenizer_class);
197             }
198              
199 8         96 return;
200             }
201              
202             sub start_training {
203 23     23 0 187 my ($self) = @_;
204 23 100       757 $self->_engage() unless $self->_engaged;
205 23         114 $self->start_learning();
206 23         75 return;
207             }
208              
209             sub stop_training {
210 19     19 0 596 my ($self) = @_;
211 19         204 $self->stop_learning();
212 19         52 return;
213             }
214              
215             sub start_learning {
216 1657     1657 0 2808 my ($self) = @_;
217 1657 100       39665 $self->_engage() unless $self->_engaged;
218              
219             # start a transaction
220 1657         36200 $self->dbh->begin_work;
221 1657         26750 return;
222             }
223              
224             sub stop_learning {
225 1652     1652 0 3379 my ($self) = @_;
226             # finish a transaction
227 1652         44482 $self->dbh->commit;
228 1652         6145 return;
229             }
230              
231             # See if SELECT count(*) FROM info; fails. If not we assume that we
232             # have an up and running database.
233             sub initialized {
234 214     214 0 2367 my ($self) = @_;
235 214         6007 my $dbh = $self->dbh;
236              
237 214         421 my ($err, $warn, $res);
238 214         354 eval {
239             # SQLite will warn 'no such table info'
240 214     5   1597 local $SIG{__WARN__} = sub { $err = $_[0] };
  5         774  
241              
242             # If it doesn't warn trust that it dies here
243 214         1510 local ($@, $!);
244 214         1217 $res = $dbh->do("SELECT count(*) FROM info;");
245             };
246              
247 214   66     39974 return (not $err and not $warn and defined $res);
248             }
249              
250             # return some statistics
251             sub totals {
252 29     29 0 76 my ($self) = @_;
253 29 100       751 $self->_engage() unless $self->_engaged;
254              
255 29         699 $self->sth->{token_total}->execute();
256 29         868 my $token = $self->sth->{token_total}->fetchrow_array - 1;
257 29         769 $self->sth->{expr_total}->execute();
258 29   100     818 my $expr = $self->sth->{expr_total}->fetchrow_array // 0;
259 29         747 $self->sth->{prev_total}->execute();
260 29   100     790 my $prev = $self->sth->{prev_total}->fetchrow_array // 0;
261 29         724 $self->sth->{next_total}->execute();
262 29   100     788 my $next = $self->sth->{next_total}->fetchrow_array // 0;
263              
264 29         187 return $token, $expr, $prev, $next;
265             }
266              
267             __PACKAGE__->meta->make_immutable;
268              
269             =encoding utf8
270              
271             =head1 NAME
272              
273             Hailo::Storage - A base class for L<Hailo> L<storage|Hailo::Role::Storage> backends
274              
275             =head1 METHODS
276              
277             The following two methods must to be implemented by subclasses:
278              
279             =head2 C<_build_dbd>
280              
281             Should return the name of the database driver (e.g. 'SQLite') which will be
282             passed to L<DBI|DBI>.
283              
284             =head2 C<_build_dbd_options>
285              
286             Subclasses can override this method to add options of their own. E.g:
287              
288             override _build_dbd_options => sub {
289             return {
290             %{ super() },
291             sqlite_unicode => 1,
292             };
293             };
294              
295             =head1 Comparison of backends
296              
297             This benchmark shows how the backends compare when training on the
298             small testsuite dataset as reported by the F<utils/hailo-benchmark>
299             utility (found in the distribution):
300              
301             Rate DBD::Pg DBD::mysql DBD::SQLite/file DBD::SQLite/memory
302             DBD::Pg 2.22/s -- -33% -49% -56%
303             DBD::mysql 3.33/s 50% -- -23% -33%
304             DBD::SQLite/file 4.35/s 96% 30% -- -13%
305             DBD::SQLite/memory 5.00/s 125% 50% 15% --
306              
307             Under real-world workloads SQLite is much faster than these results
308             indicate since the time it takes to train/reply is relative to the
309             existing database size. Here's how long it took to train on a 214,710
310             line IRC log on a Linode 1080 with Hailo 0.18:
311              
312             =over
313              
314             =item * SQLite
315              
316             real 8m38.285s
317             user 8m30.831s
318             sys 0m1.175s
319              
320             =item * MySQL
321              
322             real 48m30.334s
323             user 8m25.414s
324             sys 4m38.175s
325              
326             =item * PostgreSQL
327              
328             real 216m38.906s
329             user 11m13.474s
330             sys 4m35.509s
331              
332             =back
333              
334             In the case of PostgreSQL it's actually much faster to first train
335             with SQLite, dump that database and then import it with L<psql(1)>,
336             see L<failo's README|http://github.com/hinrik/failo> for how to do
337             that.
338              
339             However, replying with an existing database (using
340             F<utils/hailo-benchmark-replies>) yields different results. SQLite can
341             reply really quickly without being warmed up (which is the typical
342             usecase for chatbots) but once PostgreSQL and MySQL are warmed up they
343             start replying faster:
344              
345             Here's a comparison of doing 10 replies:
346              
347             Rate PostgreSQL MySQL SQLite-file SQLite-file-28MB SQLite-memory
348             PostgreSQL 71.4/s -- -14% -14% -29% -50%
349             MySQL 83.3/s 17% -- 0% -17% -42%
350             SQLite-file 83.3/s 17% 0% -- -17% -42%
351             SQLite-file-28MB 100.0/s 40% 20% 20% -- -30%
352             SQLite-memory 143/s 100% 71% 71% 43% --
353              
354             In this test MySQL uses around 28MB of memory (using Debian's
355             F<my-small.cnf>) and PostgreSQL around 34MB. Plain SQLite uses 2MB of
356             cache but it's also tested with 28MB of cache as well as with the
357             entire database in memory.
358              
359             But doing 10,000 replies is very different:
360              
361             Rate SQLite-file PostgreSQL SQLite-file-28MB MySQL SQLite-memory
362             SQLite-file 85.1/s -- -7% -18% -27% -38%
363             PostgreSQL 91.4/s 7% -- -12% -21% -33%
364             SQLite-file-28MB 103/s 21% 13% -- -11% -25%
365             MySQL 116/s 37% 27% 13% -- -15%
366             SQLite-memory 137/s 61% 50% 33% 18% --
367              
368             Once MySQL gets more memory (using Debian's F<my-large.cnf>) and a
369             chance to warm it starts yielding better results (I couldn't find out
370             how to make PostgreSQL take as much memory as it wanted):
371              
372             Rate MySQL SQLite-memory
373             MySQL 121/s -- -12%
374             SQLite-memory 138/s 14% --
375              
376             =head1 AUTHOR
377              
378             E<AElig>var ArnfjE<ouml>rE<eth> Bjarmason <avar@cpan.org>
379              
380             Hinrik E<Ouml>rn SigurE<eth>sson, hinrik.sig@gmail.com
381              
382             =head1 LICENSE AND COPYRIGHT
383              
384             Copyright 2010 E<AElig>var ArnfjE<ouml>rE<eth> Bjarmason and
385             Hinrik E<Ouml>rn SigurE<eth>sson
386              
387             This program is free software, you can redistribute it and/or modify
388             it under the same terms as Perl itself.
389              
390             =cut