File Coverage

blib/lib/Catalyst/Controller/SimpleCAS/Role/TextTranscode.pm
Criterion Covered Total %
statement 45 277 16.2
branch 0 120 0.0
condition 0 44 0.0
subroutine 15 33 45.4
pod 15 15 100.0
total 75 489 15.3


line stmt bran cond sub pod time code
1             package Catalyst::Controller::SimpleCAS::Role::TextTranscode;
2              
3 1     1   754 use strict;
  1         2  
  1         30  
4 1     1   5 use warnings;
  1         1  
  1         34  
5              
6 1     1   4 use MooseX::MethodAttributes::Role 0.29;
  1         29  
  1         6  
7             requires qw(Content fetch_content); # <-- methods of Catalyst::Controller::SimpleCAS
8              
9 1     1   35638 use Encode;
  1         2  
  1         105  
10 1     1   707 use HTML::Encoding 'encoding_from_html_document', 'encoding_from_byte_order_mark';
  1         7736  
  1         68  
11 1     1   420 use HTML::TokeParser::Simple;
  1         11013  
  1         42  
12 1     1   6 use Try::Tiny;
  1         1  
  1         54  
13 1     1   5 use Email::MIME;
  1         1  
  1         18  
14 1     1   418 use Email::MIME::CreateHTML;
  1         41688  
  1         44  
15 1     1   420 use Catalyst::Controller::SimpleCAS::CSS::Simple; #<-- hack/workaround CSS::Simple busted on CPAN
  1         2  
  1         30  
16 1     1   6 use String::Random;
  1         1  
  1         34  
17 1     1   4 use JSON;
  1         2  
  1         8  
18              
19 1     1   428 use Catalyst::Controller::SimpleCAS::MimeUriResolver;
  1         3  
  1         187  
20              
21             # FIXME - This is old and broken - file long gone from RapidApp ...
22             my $ISOLATE_CSS_RULE = ''; #'@import "/static/rapidapp/css/CssIsolation.css";';
23              
24             # Backend action for Ext.ux.RapidApp.Plugin.HtmlEditor.LoadHtmlFile
25             sub transcode_html :Chained('base') :PathPart('texttranscode/transcode_html') {
26 0     0 1 0 my ($self, $c) = @_;
27            
28 0 0       0 my $upload = $c->req->upload('Filedata') or die "no upload object";
29              
30 0         0 my $src_text = $self->normaliaze_rich_content($c,$upload,$upload->filename);
31            
32 0         0 my $rct= $c->stash->{requestContentType};
33 0 0 0     0 if ($rct eq 'JSON' || $rct eq 'text/x-rapidapp-form-response') {
34 0         0 $c->stash->{json}= { success => \1, content => $src_text };
35 0         0 return $c->forward('View::RapidApp::JSON');
36             }
37            
38             # find out what encoding the user wants, defaulting to utf8
39 0   0     0 my $dest_encoding= ($c->req->params->{dest_encoding} || 'utf-8');
40 0 0       0 my $out_codec= Encode::find_encoding($dest_encoding) or die "Unsupported encoding: $dest_encoding";
41 0         0 my $dest_octets= $out_codec->encode($src_text);
42            
43             # we need to set the charset here so that catalyst doesn't try to convert it further
44 0         0 $c->res->content_type('text/html; charset='.$dest_encoding);
45 0         0 return $c->res->body($dest_octets);
46 1     1   6 }
  1         1  
  1         9  
47              
48             # Backend action for Ext.ux.RapidApp.Plugin.HtmlEditor.SaveMhtml
49             sub generate_mhtml_download :Chained('base') :PathPart('texttranscode/generate_mhtml_download') {
50 0     0 1   my ($self, $c) = @_;
51 0 0         die "No html content supplied" unless ($c->req->params->{html_enc});
52 0           my $html = decode_json($c->req->params->{html_enc})->{data};
53              
54             # 'filename' param is optional and probably not supplied
55 0           $html = $self->normaliaze_rich_content($c,$html,$c->req->params->{filename});
56            
57 0   0       my $filename = $self->get_strip_orig_filename(\$html) || 'content.mht';
58 0           $filename =~ s/\"/\'/g; #<-- convert any " characters
59 0           my $disposition = 'attachment;filename="' . $filename . '"';
60            
61 0           my $MIME = $self->html_to_mhtml($c,$html);
62              
63 0           $c->response->header( $_ => $MIME->header($_) ) for ($MIME->header_names);
64 0           $c->response->header('Content-Disposition' => $disposition);
65 0           return $c->res->body( $MIME->as_string );
66 1     1   523 }
  1         1  
  1         3  
67              
68              
69             # extracts filename previously embedded by normaliaze_rich_content in html comment
70             sub get_strip_orig_filename {
71 0     0 1   my $self = shift;
72 0           my $htmlref = shift;
73 0 0         return undef unless (ref $htmlref);
74            
75 0           $$htmlref =~ /(\/\*----ORIGINAL_FILENAME:(.+)----\*\/)/;
76 0 0         my $comment = $1 or return undef;
77 0 0         my $filename = $2 or return undef;
78            
79             # strip comment:
80 0           $$htmlref =~ s/\Q${comment}\E//;
81            
82 0           return $filename;
83             }
84              
85             sub html_to_mhtml {
86 0     0 1   my $self = shift;
87 0           my $c = shift;
88 0           my $html = shift;
89            
90 0           my $style = $self->parse_html_get_styles(\$html,1);
91            
92 0 0         if($style) {
93            
94             # FIXME - this is broken:
95             # strip isolate css import rule:
96 0           $style =~ s/\Q$ISOLATE_CSS_RULE\E//g;
97            
98 0           my $Css = Catalyst::Controller::SimpleCAS::CSS::Simple->new;
99 0           $Css->read({ css => $style });
100            
101             #scream_color(BLACK.ON_RED,$Css->get_selectors);
102            
103             # undo the cas selector wrap applied during Load Html:
104 0           foreach my $selector ($Css->get_selectors) {
105 0           my $new_selector = $selector;
106 0 0         if($selector =~ /^\#cas\-selector\-wrap\-\w+$/){
107 0           $new_selector = 'body';
108             }
109             else {
110 0           my @parts = split(/\s+/,$selector);
111 0           my $first = shift @parts;
112 0 0         next unless ($first =~ /^\#cas\-selector\-wrap\-/);
113 0           $new_selector = join(' ',@parts);
114             }
115 0           $Css->modify_selector({
116             selector => $selector,
117             new_selector => $new_selector
118             });
119             }
120            
121 0           $style = $Css->write;
122             }
123            
124             # TODO/FIXME: remove RapidApp/TT dependency/entanglement
125 0           $html = $c->template_render('templates/rapidapp/xhtml_document.tt',{
126             style => $style,
127             body => $html
128             });
129            
130 0           my $UriResolver = Catalyst::Controller::SimpleCAS::MimeUriResolver->new({
131             Cas => $self,
132             base => ''
133             });
134            
135 0           my $MIME = Email::MIME->create_html(
136             header => [],
137             body_attributes => { charset => 'UTF-8', encoding => 'quoted-printable' },
138             body => encode('UTF-8', $html),
139             resolver => $UriResolver
140             );
141            
142             # Force wrap in a multipart/related
143 0 0         return Email::MIME->create(
144             attributes => {
145             content_type => "multipart/related",
146             disposition => "attachment"
147             },
148             parts => [ $MIME ]
149             ) unless ($MIME->subparts);
150            
151 0           return $MIME;
152             }
153              
154              
155             sub normaliaze_rich_content {
156 0     0 1   my $self = shift;
157 0           my $c = shift;
158 0           my $src_octets = shift;
159 0           my $filename = shift;
160            
161 0           my $upload;
162 0 0         if(ref($src_octets)) {
163 0           $upload = $src_octets;
164 0           $src_octets = $upload->slurp;
165             }
166            
167 0           my $content;
168            
169             # Try to determine what text encoding the file content came from, and then detect if it
170             # is MIME or HTML.
171             #
172             # Note that if the content came from a file upload/post an encode/decode phase happened
173             # during the HTTP transfer of this file, but it should have been taken care of by Catalyst
174             # and now we have the original file on disk in its native 8-bit encoding.
175              
176             # If MIME (MTHML):
177             my $MIME = try{
178             # This will frequently produce uninitialized value warnings from Email::Simple::Header,
179             # and I haven't been able to figure out how to stop it
180 0     0     Email::MIME->new($src_octets)
181 0           };
182 0 0 0       if($MIME && $MIME->subparts) {
183 0           $content = $self->convert_from_mhtml($c,$MIME);
184             }
185             # If HTML or binary:
186             else {
187 0 0 0       if(!$upload || $upload->type =~ /^text/){
188 0   0       my $src_encoding= encoding_from_html_document($src_octets) || 'utf-8';
189 0 0         my $in_codec= Encode::find_encoding($src_encoding) or die "Unsupported encoding: $src_encoding";
190 0 0         $content = (utf8::is_utf8($src_octets)) ? $src_octets : $in_codec->decode($src_octets);
191             }
192             # Binary
193             else {
194 0 0         my $checksum = $self->Store->add_content_file_mv($upload->tempname) or die "Failed to add content";
195 0           my $Content = $self->Content($checksum,$upload->filename);
196 0 0         return $Content->imglink if ($Content->imglink);
197 0           return $Content->filelink;
198             }
199             }
200             # TODO: Detect other content types and add fallback logic
201            
202 0           $content = $self->parse_html_get_style_body(\$content);
203 0           $self->convert_data_uri_scheme_links($c,\$content);
204            
205             # Use style tags just as a safe place to store the original filename
206             # (switched to this after having issues with html comments)
207 0 0         $content = '<style>/*----ORIGINAL_FILENAME:' .
208             $filename .
209             '----*/</style>' . "\n" . $content if ($filename);
210              
211 0           return $content;
212             }
213              
214              
215             sub convert_from_mhtml {
216 0     0 1   my $self = shift;
217 0           my $c = shift;
218 0           my $MIME = shift;
219              
220 0 0         my ($SubPart) = $MIME->subparts or return;
221            
222             ## -- Check for and remove extra outer MIME wrapper (exists in actual MIME EMails):
223 0 0 0       $MIME = $SubPart if (
224             $SubPart->content_type &&
225             $SubPart->content_type =~ /multipart\/related/
226             );
227             ## --
228            
229 0 0         my ($MainPart) = $MIME->subparts or return;
230              
231             ## ------
232             ## New: throw the kitchen sink at trying to figure out the charset/encoding
233             ##
234             ## This solves the long-standing problem where MHT files saved by Word 2010
235             ## would load garbled. These files are encoded as 'UTF-16LE', and the system
236             ## is not able to realize this out of the box (I think because it lists the
237             ## the charset ambiguously as ' charset="unicode" ' in the Content-Type
238             ## MIME header, but I'm no expert on Unicode). Below we're basically trying
239             ## all of the functions of HTML::Encoding until we find one that gives us
240             ## an answer, and if we do get an answer, we apply it to the MIME object before
241             ## calling ->body_str() which will then use it to decode to text.
242             ##
243 0           my $decoded = $MainPart->body; # <-- decodes from base64 (or whatever) to *bytes*
244              
245 0   0       my $char_set =
246             HTML::Encoding::encoding_from_html_document ($decoded) ||
247             HTML::Encoding::encoding_from_byte_order_mark ($decoded) ||
248             HTML::Encoding::encoding_from_meta_element ($decoded) ||
249             HTML::Encoding::xml_declaration_from_octets ($decoded) ||
250             HTML::Encoding::encoding_from_first_chars ($decoded) ||
251             HTML::Encoding::encoding_from_xml_declaration ($decoded) ||
252             HTML::Encoding::encoding_from_content_type ($decoded) ||
253             HTML::Encoding::encoding_from_xml_document ($decoded);
254              
255 0 0         $MainPart->charset_set( $char_set ) if ($char_set);
256             ## ------
257              
258 0           my $html = $MainPart->body_str; # <-- decodes to text using the character_set
259              
260 0   0       my $base_path = $self->parse_html_base_href(\$html) || $self->get_mime_part_base_path($MainPart);
261            
262 0           my %ndx = ();
263             $MIME->walk_parts(sub{
264 0     0     my $Part = shift;
265 0 0 0       return if ($Part == $MIME || $Part == $MainPart); #<-- ignore the outer and main/body parts
266            
267 0           my $content_id = $Part->header('Content-ID');
268 0 0         if ($content_id) {
269 0           $ndx{'cid:' . $content_id} = $Part;
270 0           $content_id =~ s/^\<//;
271 0           $content_id =~ s/\>$//;
272 0           $ndx{'cid:' . $content_id} = $Part;
273             }
274            
275 0           my $content_location = $Part->header('Content-Location');
276 0 0         if($content_location) {
277 0           $ndx{$content_location} = $Part;
278 0 0         if($base_path) {
279 0           $content_location =~ s/^\Q$base_path\E//;
280 0           $ndx{$content_location} = $Part;
281             }
282             }
283 0           });
284            
285 0           $self->convert_mhtml_links_parts($c,\$html,\%ndx);
286 0           return $html;
287             }
288              
289             # Try to extract the 'body' from html to prevent causing DOM/parsing issues on the client side
290             sub parse_html_get_style_body {
291 0     0 1   my $self = shift;
292 0           my $htmlref = shift;
293            
294 0 0         my $body = $self->parse_html_get_body($htmlref) or return $$htmlref;
295 0           my $style = $self->parse_html_get_styles($htmlref);
296            
297 0           my $auto_css_pre = 'cas-selector-wrap-';
298 0           my $auto_css_id = $auto_css_pre . String::Random->new->randregex('[a-z0-9]{8}');
299            
300 0 0         if($style) {
301 0           my $Css = Catalyst::Controller::SimpleCAS::CSS::Simple->new;
302 0           $Css->read({ css => $style });
303            
304             #scream_color(BLACK.ON_RED,$Css->get_selectors);
305            
306 0           foreach my $selector ($Css->get_selectors) {
307 0           my @parts = split(/\s+/,$selector);
308             # strip selector wrap from previous content processing (when the user imports +
309             # exports + imports multiple times)
310 0 0         shift @parts if ($parts[0] =~ /^\#${auto_css_pre}/);
311 0           unshift @parts, '#' . $auto_css_id;
312 0 0         pop @parts if (lc($selector) eq 'body'); #<-- any 'body' selectors are replaced by the new div wrap below
313            
314 0           $Css->modify_selector({
315             selector => $selector,
316             new_selector => join(' ',@parts)
317             });
318             }
319            
320 0           $style = $Css->write;
321             }
322              
323 0 0         if ($style) {
324             # minify:
325 0           $style =~ s/\r?\n/ /gm;
326 0           $style =~ s/\s+/ /gm;
327 0           $style = "\n<style type=\"text/css\">\n$style\n</style>";
328             }
329            
330 0   0       $style ||= '';
331 0           $style = "\n" . '<style type="text/css">' . "\n" .
332             " $ISOLATE_CSS_RULE\n" .
333             '</style>' . $style . "\n";
334              
335 0           return '<div class="isolate" id="' . $auto_css_id . '">' . "\n" .
336             $body . "\n" .
337             '</div>' . "\n$style";
338             }
339              
340              
341             # Try to extract the 'body' from html to prevent causing DOM/parsing issues on the client side
342             # Also strip html comments
343             sub parse_html_get_body {
344 0     0 1   my $self = shift;
345 0           my $htmlref = shift;
346 0           my $parser = HTML::TokeParser::Simple->new($htmlref);
347 0           my $in_body = 0;
348 0           my $inner = '';
349 0           while (my $tag = $parser->get_token) {
350 0 0 0       last if ($in_body && $tag->is_end_tag('body'));
351 0 0 0       $inner .= $tag->as_is if ($in_body && !$tag->is_comment);
352 0 0         $in_body = 1 if ($tag->is_start_tag('body'));
353             };
354 0 0         return undef if ($inner eq '');
355 0           return $inner;
356             }
357              
358             sub parse_html_get_styles {
359 0     0 1   my $self = shift;
360 0           my $htmlref = shift;
361 0           my $strip = shift;
362 0           my $parser = HTML::TokeParser::Simple->new($htmlref);
363 0           my $in_style = 0;
364 0           my $styles = '';
365 0           my $newhtml = '';
366 0           while (my $tag = $parser->get_token) {
367 0 0         if ($tag->is_end_tag('style')) {
368 0           $in_style = 0;
369 0           next;
370             }
371 0 0 0       $styles .= $tag->as_is and next if ($in_style);
372 0 0         if ($tag->is_start_tag('style')) {
373 0           $in_style = 1;
374 0           next;
375             }
376 0 0 0       $newhtml .= $tag->as_is if($strip && !$tag->is_tag('style'));
377             };
378 0 0         return undef if ($styles eq '');
379            
380 0 0         $$htmlref = $newhtml if ($strip);
381            
382             # Pull out html comment characters, ignored in css, but can interfere with CSS::Simple (rare cases)
383 0           $styles =~ s/\<\!\-\-//g;
384 0           $styles =~ s/\-\-\>//g;
385            
386 0           return $styles;
387             }
388              
389              
390              
391             # Extracts the base file path from the 'base' tag of the MHTML content
392             sub parse_html_base_href {
393 0     0 1   my $self = shift;
394 0           my $htmlref = shift;
395 0           my $parser = HTML::TokeParser::Simple->new($htmlref);
396 0           while (my $tag = $parser->get_tag) {
397 0 0         if($tag->is_tag('base')){
398 0 0         my $url = $tag->get_attr('href') or next;
399 0           return $url;
400             }
401             };
402 0           return undef;
403             }
404              
405             # alternative method to identify a base path from a Mime Part
406             sub get_mime_part_base_path {
407 0     0 1   my $self = shift;
408 0           my $Part = shift;
409            
410 0 0         my $content_location = $Part->header('Content-Location') or return undef;
411 0           my @parts = split(/\//,$content_location);
412 0           my $filename = pop @parts;
413 0           my $path = join('/',@parts) . '/';
414            
415 0           return $path;
416             }
417              
418              
419             sub convert_mhtml_links_parts {
420 0     0 1   my $self = shift;
421 0           my $c = shift;
422 0           my $htmlref = shift;
423 0           my $part_ndx = shift;
424            
425 0 0         die "convert_mhtml_links_parts(): Invalid arguments!!" unless (ref $part_ndx eq 'HASH');
426            
427 0           my $parser = HTML::TokeParser::Simple->new($htmlref);
428            
429 0           my $substitutions = {};
430            
431 0           while (my $tag = $parser->get_tag) {
432 0 0         next if($tag->is_tag('base')); #<-- skip the 'base' tag which we parsed earlier
433 0           for my $attr (qw(src href)){
434 0 0         my $url = $tag->get_attr($attr) or next;
435 0 0         my $Part = $part_ndx->{$url} or next;
436 0 0         my $cas_url = $self->mime_part_to_cas_url($c,$Part) or next;
437            
438 0           my $as_is = $tag->as_is;
439 0           $tag->set_attr( $attr => $cas_url );
440 0           $substitutions->{$as_is} = $tag->as_is;
441             }
442             }
443            
444 0           foreach my $find (keys %$substitutions) {
445 0           my $replace = $substitutions->{$find};
446 0           $$htmlref =~ s/\Q$find\E/$replace/gm;
447             }
448             }
449              
450              
451              
452             # See http://en.wikipedia.org/wiki/Data_URI_scheme
453             sub convert_data_uri_scheme_links {
454 0     0 1   my $self = shift;
455 0           my $c = shift;
456 0           my $htmlref = shift;
457            
458 0           my $parser = HTML::TokeParser::Simple->new($htmlref);
459            
460 0           my $substitutions = {};
461            
462 0           while (my $tag = $parser->get_tag) {
463            
464 0           my $attr;
465 0 0         if($tag->is_tag('img')) {
    0          
466 0           $attr = 'src';
467             }
468             elsif($tag->is_tag('a')) {
469 0           $attr = 'href';
470             }
471             else {
472 0           next;
473             }
474            
475 0 0         my $url = $tag->get_attr($attr) or next;
476            
477             # Support the special case where the src value is literal base64 data:
478 0 0         if ($url =~ /^data:/) {
479 0           my $newurl = $self->embedded_src_data_to_url($c,$url);
480 0 0         $substitutions->{$url} = $newurl if ($newurl);
481             }
482             }
483            
484 0           foreach my $find (keys %$substitutions) {
485 0           my $replace = $substitutions->{$find};
486 0           $$htmlref =~ s/\Q$find\E/$replace/gm;
487             }
488             }
489              
490             sub embedded_src_data_to_url {
491 0     0 1   my $self = shift;
492 0           my $c = shift;
493 0           my $url = shift;
494            
495 0           my ($pre,$content_type,$encoding,$base64_data) = split(/[\:\;\,]/,$url);
496            
497             # we only know how to handle base64 currently:
498 0 0         return undef unless (lc($encoding) eq 'base64');
499            
500 0     0     my $checksum = try{$self->Store->add_content_base64($base64_data)}
501 0 0         or return undef;
502            
503             # This is RapidApp-specific
504 0 0 0       my $pfx = $c->can('mount_url') ? $c->mount_url || '' : '';
505            
506 0           return join('/',$pfx,
507             $self->action_namespace($c),
508             'fetch_content', $checksum
509             );
510             }
511              
512             sub mime_part_to_cas_url {
513 0     0 1   my $self = shift;
514 0           my $c = shift;
515 0           my $Part = shift;
516            
517 0           my $data = $Part->body;
518 0           my $filename = $Part->filename(1);
519 0 0         my $checksum = $self->Store->add_content($data) or return undef;
520            
521             # This is RapidApp-specific
522 0 0 0       my $pfx = $c->can('mount_url') ? $c->mount_url || '' : '';
523            
524 0           return join('/',$pfx,
525             $self->action_namespace($c),
526             'fetch_content', $checksum, $filename
527             );
528             }
529              
530             1;
531              
532             __END__
533              
534             =head1 NAME
535              
536             Catalyst::Controller::SimpleCAS::Role::TextTranscode - Addl MHTML methods for SimpleCAS
537              
538             =head1 SYNOPSIS
539              
540             use Catalyst::Controller::SimpleCAS;
541             ...
542              
543             =head1 DESCRIPTION
544              
545             This is a Role which adds extra methods and functionality to L<Catalyst::Controller::SimpleCAS>.
546             This role is automatically loaded into the main controller class. The reason that this exists and
547             is structured this way is for historical reasons and will likely be refactored later.
548              
549              
550             =head1 PUBLIC ACTIONS
551              
552             =head2 transcode_html (texttranscode/transcode_html)
553              
554             =head2 generate_mhtml_download (texttranscode/generate_mhtml_download)
555              
556             =head1 METHODS
557              
558             =head2 convert_data_uri_scheme_links
559              
560             =head2 convert_from_mhtml
561              
562             =head2 convert_mhtml_links_parts
563              
564             =head2 embedded_src_data_to_url
565              
566             =head2 get_mime_part_base_path
567              
568             =head2 get_strip_orig_filename
569              
570             =head2 html_to_mhtml
571              
572             =head2 mime_part_to_cas_url
573              
574             =head2 normaliaze_rich_content
575              
576             =head2 parse_html_base_href
577              
578             =head2 parse_html_get_body
579              
580             =head2 parse_html_get_style_body
581              
582             =head2 parse_html_get_styles
583              
584             =head1 SEE ALSO
585              
586             =over
587              
588             =item *
589              
590             L<Catalyst::Controller::SimpleCAS>
591              
592             =back
593              
594             =head1 AUTHOR
595              
596             Henry Van Styn <vanstyn@cpan.org>
597              
598             =head1 COPYRIGHT AND LICENSE
599              
600             This software is copyright (c) 2014 by IntelliTree Solutions llc.
601              
602             This is free software; you can redistribute it and/or modify it under
603             the same terms as the Perl 5 programming language system itself.
604              
605             =cut