File Coverage

blib/lib/LaTeX/TOM.pm
Criterion Covered Total %
statement 11 11 100.0
branch n/a
condition n/a
subroutine 4 4 100.0
pod 1 1 100.0
total 16 16 100.0


line stmt bran cond sub pod time code
1             ###############################################################################
2             #
3             # LaTeX::TOM (TeX Object Model)
4             #
5             # Version 1.05
6             #
7             # ----------------------------------------------------------------------------
8             #
9             # originally written by Aaron Krowne (akrowne@vt.edu)
10             # July 2002
11             #
12             # Virginia Polytechnic Institute and State University
13             # Department of Computer Science
14             # Digital Libraries Research Laboratory
15             #
16             # now maintained by Steven Schubiger (schubiger@cpan.org)
17             # April 2008
18             #
19             # ----------------------------------------------------------------------------
20             #
21             # This module provides some decent semantic handling of LaTeX documents. It is
22             # inspired by XML::DOM, so users of that module should be able to acclimate
23             # themselves to this one quickly. Basically the subroutines in this package
24             # allow you to parse a LaTeX document into its logical structure, including
25             # groupings, commands, environments, and comments. These all go into a tree
26             # which is built as arrays of Perl hashes.
27             #
28             ###############################################################################
29              
30             package LaTeX::TOM;
31              
32 10     10   85050 use strict;
  10         73  
  10         307  
33 10     10   54 use base qw(LaTeX::TOM::Parser);
  10         16  
  10         5988  
34 10     10   79 use constant true => 1;
  10         23  
  10         4880  
35              
36             our $VERSION = '1.05';
37              
38             our (%INNERCMDS, %MATHENVS, %MATHBRACKETS,
39             %BRACELESS, %TEXTENVS, $PARSE_ERRORS_FATAL,
40             $DEBUG);
41              
42             # BEGIN CONFIG SECTION ########################################################
43              
44             # these are commands that can be "embedded" within a grouping to alter the
45             # environment of that grouping. For instance {\bf text}. Without listing the
46             # command names here, the parser will treat such sequences as plain text.
47             #
48             %INNERCMDS = map { $_ => true } (
49             'bf',
50             'md',
51             'em',
52             'up',
53             'sl',
54             'sc',
55             'sf',
56             'rm',
57             'it',
58             'tt',
59             'noindent',
60             'mathtt',
61             'mathbf',
62             'tiny',
63             'scriptsize',
64             'footnotesize',
65             'small',
66             'normalsize',
67             'large',
68             'Large',
69             'LARGE',
70             'huge',
71             'Huge',
72             'HUGE',
73             );
74              
75             # these commands put their environments into math mode
76             #
77             %MATHENVS = map { $_ => true } (
78             'align',
79             'equation',
80             'eqnarray',
81             'displaymath',
82             'ensuremath',
83             'math',
84             '$$',
85             '$',
86             '\[',
87             '\(',
88             );
89              
90             # these commands/environments put their children in text (non-math) mode
91             #
92             %TEXTENVS = map { $_ => true } (
93             'tiny',
94             'scriptsize',
95             'footnotesize',
96             'small',
97             'normalsize',
98             'large',
99             'Large',
100             'LARGE',
101             'huge',
102             'Huge',
103             'HUGE',
104             'text',
105             'textbf',
106             'textmd',
107             'textsc',
108             'textsf',
109             'textrm',
110             'textsl',
111             'textup',
112             'texttt',
113             'mbox',
114             'fbox',
115             'section',
116             'subsection',
117             'subsubsection',
118             'em',
119             'bf',
120             'emph',
121             'it',
122             'enumerate',
123             'description',
124             'itemize',
125             'trivlist',
126             'list',
127             'proof',
128             'theorem',
129             'lemma',
130             'thm',
131             'prop',
132             'lem',
133             'table',
134             'tabular',
135             'tabbing',
136             'caption',
137             'footnote',
138             'center',
139             'flushright',
140             'document',
141             'article',
142             'titlepage',
143             'title',
144             'author',
145             'titlerunninghead',
146             'authorrunninghead',
147             'affil',
148             'email',
149             'abstract',
150             'thanks',
151             'algorithm',
152             'nonumalgorithm',
153             'references',
154             'thebibliography',
155             'bibitem',
156             'verbatim',
157             'verbatimtab',
158             'quotation',
159             'quote',
160             );
161              
162             # these form sets of simple mode delimiters
163             #
164             %MATHBRACKETS = (
165             '$$' => '$$',
166             '$' => '$',
167             # '\[' => '\]', # these are problematic and handled separately now
168             # '\(' => '\)',
169             );
170              
171             # these commands require no braces, and their parameters are simply the
172             # "word" following the command declaration
173             #
174             %BRACELESS = map { $_ => true } (
175             'oddsidemargin',
176             'evensidemargin',
177             'topmargin',
178             'headheight',
179             'headsep',
180             'textwidth',
181             'textheight',
182             'input',
183             );
184              
185             # default value controlling how fatal parse errors are
186             #
187             # 0 = warn, 1 = die, 2 = silent
188             #
189             $PARSE_ERRORS_FATAL = 0;
190              
191             # debugging mode (internal use)
192             #
193             # 0 = off, 1 = messages, 2 = messages and code
194             #
195             $DEBUG = 0;
196              
197             # END CONFIG SECTION ##########################################################
198              
199             sub new {
200 9     9 1 1161 my $class = shift;
201              
202 9         71 return LaTeX::TOM::Parser->_new(@_);
203             }
204              
205             1;
206              
207             =head1 NAME
208              
209             LaTeX::TOM - A module for parsing, analyzing, and manipulating LaTeX documents.
210              
211             =head1 SYNOPSIS
212              
213             use LaTeX::TOM;
214              
215             $parser = LaTeX::TOM->new;
216              
217             $document = $parser->parseFile('mypaper.tex');
218              
219             $latex = $document->toLaTeX;
220              
221             $specialnodes = $document->getNodesByCondition(sub {
222             my $node = shift;
223             return (
224             $node->getNodeType eq 'TEXT'
225             && $node->getNodeText =~ /magic string/
226             );
227             });
228              
229             $sections = $document->getNodesByCondition(sub {
230             my $node = shift;
231             return (
232             $node->getNodeType eq 'COMMAND'
233             && $node->getCommandName =~ /section$/
234             );
235             });
236              
237             $indexme = $document->indexableText;
238              
239             $document->print;
240              
241             =head1 DESCRIPTION
242              
243             This module provides a parser which parses and interprets (though not fully)
244             LaTeX documents and returns a tree-based representation of what it finds.
245             This tree is a C<LaTeX::TOM::Tree>. The tree contains C<LaTeX::TOM::Node> nodes.
246              
247             This module should be especially useful to anyone who wants to do processing
248             of LaTeX documents that requires extraction of plain-text information, or
249             altering of the plain-text components (or alternatively, the math-text
250             components).
251              
252             =head1 COMPONENTS
253              
254             =head2 LaTeX::TOM::Parser
255              
256             The parser recognizes 3 parameters upon creation by C<< LaTeX::TOM->new >>.
257             The parameters, in order, are
258              
259             =over 4
260              
261             =item parse error handling (= B<0> || 1 || 2)
262              
263             Determines what happens when a parse error is encountered. C<0> results in a
264             warning. C<1> results in a die. C<2> results in silence. Note that particular
265             groupings in LaTeX (i.e. newcommands and the like) contain invalid TeX or
266             LaTeX, so you nearly always need this parameter to be C<0> or C<2> to completely
267             parse the document.
268              
269             =item read inputs flag (= 0 || B<1>)
270              
271             This flag determines whether a scan for C<\input> and C<\input-like> commands is
272             performed, and the resulting called files parsed and added to the parent
273             parse tree. C<0> means no, C<1> means do it. Note that this will happen recursively
274             if it is turned on. Also, bibliographies (F<.bbl> files) are detected and
275             included.
276              
277             =item apply mappings flag (= 0 || B<1>)
278              
279             This flag determines whether (most) user-defined mappings are applied. This
280             means C<\defs>, C<\newcommands>, and C<\newenvironments>. This is critical for
281             properly analyzing the content of the document, as this must be phrased in terms
282             of the semantics of the original TeX and LaTeX commands, not ad hoc user macros.
283             So, for instance, do not expect plain-text extraction to work properly with this
284             option off.
285              
286             =back
287              
288             The parser returns a C<LaTeX::TOM::Tree> ($document in the SYNOPSIS).
289              
290             =head2 LaTeX::TOM::Node
291              
292             Nodes may be of the following types:
293              
294             =over 4
295              
296             =item TEXT
297              
298             C<TEXT> nodes can be thought of as representing the plain-text portions of the
299             LaTeX document. This includes math and anything else that is not a recognized
300             TeX or LaTeX command, or user-defined command. In reality, C<TEXT> nodes contain
301             commands that this parser does not yet recognize the semantics of.
302              
303             =item COMMAND
304              
305             A C<COMMAND> node represents a TeX command. It always has child nodes in a tree,
306             though the tree might be empty if the command operates on zero parameters. An
307             example of a command is
308              
309             \textbf{blah}
310              
311             This would parse into a C<COMMAND> node for C<textbf>, which would have a subtree
312             containing the C<TEXT> node with text ``blah.''
313              
314             =item ENVIRONMENT
315              
316             Similarly, TeX environments parse into C<ENVIRONMENT> nodes, which have metadata
317             about the environment, along with a subtree representing what is contained in
318             the environment. For example,
319              
320             \begin{equation}
321             r = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}
322             \end{equation}
323              
324             Would parse into an C<ENVIRONMENT> node of the class ``equation'' with a child
325             tree containing the result of parsing C<``r = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}.''>
326              
327             =item GROUP
328              
329             A C<GROUP> is like an anonymous C<COMMAND>. Since you can put whatever you want in
330             curly-braces (C<{}>) in TeX in order to make semantically isolated regions, this
331             separation is preserved by the parser. A C<GROUP> is just the subtree of the
332             parsed contents of plain curly-braces.
333              
334             It is important to note that currently only the first C<GROUP> in a series of
335             C<GROUP>s following a LaTeX command will actually be parsed into a C<COMMAND> node.
336             The reason is that, for the initial purposes of this module, it was not
337             necessary to recognize additional C<GROUP>s as additional parameters to the
338             C<COMMAND>. However, this is something that this module really should do
339             eventually. Currently if you want all the parameters to a multi-parametered
340             command, you'll need to pick out all the following C<GROUP> nodes yourself.
341              
342             Eventually this will become something like a list which is stored in the
343             C<COMMAND> node, much like L<XML::DOM>'s treatment of attributes. These are, in a
344             sense, apart from the rest of the document tree. Then C<GROUP> nodes will become
345             much more rare.
346              
347             =item COMMENT
348              
349             A C<COMMENT> node is very similar to a C<TEXT> node, except it is specifically for
350             lines beginning with C<``%''> (the TeX comment delimiter) or the right-hand
351             portion of a line that has C<``%''> at some internal point.
352              
353             =back
354              
355             =head2 LaTeX::TOM::Trees
356              
357             As mentioned before, the Tree is the return result of a parse.
358              
359             The tree is nothing more than an arrayref of Nodes, some of which may contain
360             their own trees. This is useful knowledge at this point, since the user isn't
361             provided with a full suite of convenient tree-modification methods. However,
362             Trees do already have some very convenient methods, described in the next
363             section.
364              
365             =head1 METHODS
366              
367             =head2 LaTeX::TOM
368              
369             =head3 new
370              
371             =over 4
372              
373             =item C<>
374              
375             Instantiate a new parser object.
376              
377             =back
378              
379             In this section all of the methods for each of the components are listed and
380             described.
381              
382             =head2 LaTeX::TOM::Parser
383              
384             The methods for the parser are:
385              
386             =head3 parseFile (filename)
387              
388             =over 4
389              
390             =item C<>
391              
392             Read in the contents of I<filename> and parse them, returning a C<LaTeX::TOM::Tree>.
393              
394             =back
395              
396             =head3 parse (string)
397              
398             =over 4
399              
400             =item C<>
401              
402             Parse the string I<string> and return a C<LaTeX::TOM::Tree>.
403              
404             =back
405              
406             =head2 LaTeX::TOM::Tree
407              
408             This section contains methods for the Trees returned by the parser.
409              
410             =head3 copy
411              
412             =over 4
413              
414             =item C<>
415              
416             Duplicate a tree into new memory.
417              
418             =back
419              
420             =head3 print
421              
422             =over 4
423              
424             =item C<>
425              
426             A debug print of the structure of the tree.
427              
428             =back
429              
430             =head3 plainText
431              
432             =over 4
433              
434             =item C<>
435              
436             Returns an arrayref which is a list of strings representing the text of all
437             C<getNodePlainTextFlag = 1> C<TEXT> nodes, in an inorder traversal.
438              
439             =back
440              
441             =head3 indexableText
442              
443             =over 4
444              
445             =item C<>
446              
447             A method like the above but which goes one step further; it cleans all of the
448             returned text and concatenates it into a single string which one could consider
449             having all of the standard information retrieval value for the document,
450             making it useful for indexing.
451              
452             =back
453              
454             =head3 toLaTeX
455              
456             =over 4
457              
458             =item C<>
459              
460             Return a string representing the LaTeX encoded by the tree. This is especially
461             useful to get a normal document again, after modifying nodes of the tree.
462              
463             =back
464              
465             =head3 getTopLevelNodes
466              
467             =over 4
468              
469             =item C<>
470              
471             Return a list of C<LaTeX::TOM::Nodes> at the top level of the Tree.
472              
473             =back
474              
475             =head3 getAllNodes
476              
477             =over 4
478              
479             =item C<>
480              
481             Return an arrayref with B<all> nodes of the tree. This "flattens" the tree.
482              
483             =back
484              
485             =head3 getCommandNodesByName (name)
486              
487             =over 4
488              
489             =item C<>
490              
491             Return an arrayref with all C<COMMAND> nodes in the tree which have a name
492             matching I<name>.
493              
494             =back
495              
496             =head3 getEnvironmentsByName (name)
497              
498             =over 4
499              
500             =item C<>
501              
502             Return an arrayref with all C<ENVIRONMENT> nodes in the tree which have a class
503             matching I<name>.
504              
505             =back
506              
507             =head3 getNodesByCondition (code reference)
508              
509             =over 4
510              
511             =item C<>
512              
513             This is a catch-all search method which can be used to pull out nodes that
514             match pretty much any perl expression, without manually having to traverse the
515             tree. I<code reference> is a perl code reference which receives as its first
516             argument the node of the tree that is currently scrutinized and is expected to
517             return a boolean value. See the SYNOPSIS for examples.
518              
519             =back
520              
521             =head3 getFirstNode
522              
523             =over 4
524              
525             =item C<>
526              
527             Returns the first node of the tree. This is useful if you want to walk the tree
528             yourself, starting with the first node.
529              
530             =back
531              
532             =head2 LaTeX::TOM::Node
533              
534             This section contains the methods for nodes of the parsed Trees.
535              
536             =head3 getNodeType
537              
538             =over 4
539              
540             =item C<>
541              
542             Returns the type, one of C<TEXT>, C<COMMAND>, C<ENVIRONMENT>, C<GROUP>, or C<COMMENT>,
543             as described above.
544              
545             =back
546              
547             =head3 getNodeText
548              
549             =over 4
550              
551             =item C<>
552              
553             Applicable for C<TEXT> or C<COMMENT> nodes; this returns the document text they contain.
554             This is undef for other node types.
555              
556             =back
557              
558             =head3 setNodeText
559              
560             =over 4
561              
562             =item C<>
563              
564             Set the node text, also for C<TEXT> and C<COMMENT> nodes.
565              
566             =back
567              
568             =head3 getNodeStartingPosition
569              
570             =over 4
571              
572             =item C<>
573              
574             Get the starting character position in the document of this node. For C<TEXT>
575             and C<COMMENT> nodes, this will be where the text begins. For C<ENVIRONMENT>,
576             C<COMMAND>, or C<GROUP> nodes, this will be the position of the I<last> character of
577             the opening identifier.
578              
579             =back
580              
581             =head3 getNodeEndingPosition
582              
583             =over 4
584              
585             =item C<>
586              
587             Same as above, but for last character. For C<GROUP>, C<ENVIRONMENT>, or C<COMMAND>
588             nodes, this will be the I<first> character of the closing identifier.
589              
590             =back
591              
592             =head3 getNodeOuterStartingPosition
593              
594             =over 4
595              
596             =item C<>
597              
598             Same as getNodeStartingPosition, but for C<GROUP>, C<ENVIRONMENT>, or C<COMMAND> nodes,
599             this returns the I<first> character of the opening identifier.
600              
601             =back
602              
603             =head3 getNodeOuterEndingPosition
604              
605             =over 4
606              
607             =item C<>
608              
609             Same as getNodeEndingPosition, but for C<GROUP>, C<ENVIRONMENT>, or C<COMMAND> nodes,
610             this returns the I<last> character of the closing identifier.
611              
612             =back
613              
614             =head3 getNodeMathFlag
615              
616             =over 4
617              
618             =item C<>
619              
620             This applies to any node type. It is C<1> if the node sets, or is contained
621             within, a math mode region. C<0> otherwise. C<TEXT> nodes which have this flag as C<1>
622             can be assumed to be the actual mathematics contained in the document.
623              
624             =back
625              
626             =head3 getNodePlainTextFlag
627              
628             =over 4
629              
630             =item C<>
631              
632             This applies only to C<TEXT> nodes. It is C<1> if the node is non-math B<and> is
633             visible (in other words, will end up being a part of the output document). One
634             would only want to index C<TEXT> nodes with this property, for information
635             retrieval purposes.
636              
637             =back
638              
639             =head3 getEnvironmentClass
640              
641             =over 4
642              
643             =item C<>
644              
645             This applies only to C<ENVIRONMENT> nodes. Returns what class of environment the
646             node represents (the C<X> in C<\begin{X}> and C<\end{X}>).
647              
648             =back
649              
650             =head3 getCommandName
651              
652             =over 4
653              
654             =item C<>
655              
656             This applies only to C<COMMAND> nodes. Returns the name of the command (the C<X> in
657             C<\X{...}>).
658              
659             =back
660              
661             =head3 getChildTree
662              
663             =over 4
664              
665             =item C<>
666              
667             This applies only to C<COMMAND>, C<ENVIRONMENT>, and C<GROUP> nodes: it returns the
668             C<LaTeX::TOM::Tree> which is ``under'' the calling node.
669              
670             =back
671              
672             =head3 getFirstChild
673              
674             =over 4
675              
676             =item C<>
677              
678             This applies only to C<COMMAND>, C<ENVIRONMENT>, and C<GROUP> nodes: it returns the
679             first node from the first level of the child subtree.
680              
681             =back
682              
683             =head3 getLastChild
684              
685             =over 4
686              
687             =item C<>
688              
689             Same as above, but for the last node of the first level.
690              
691             =back
692              
693             =head3 getPreviousSibling
694              
695             =over 4
696              
697             =item C<>
698              
699             Return the prior node on the same level of the tree.
700              
701             =back
702              
703             =head3 getNextSibling
704              
705             =over 4
706              
707             =item C<>
708              
709             Same as above, but for following node.
710              
711             =back
712              
713             =head3 getParent
714              
715             =over 4
716              
717             =item C<>
718              
719             Get the parent node of this node in the tree.
720              
721             =back
722              
723             =head3 getNextGroupNode
724              
725             =over 4
726              
727             =item C<>
728              
729             This is an interesting function, and kind of a hack because of the way the
730             parser makes the current tree. Basically it will give you the next sibling
731             that is a C<GROUP> node, until it either hits the end of the tree level, a C<TEXT>
732             node which doesn't match C</^\s*$/>, or a C<COMMAND> node.
733              
734             This is useful for finding all C<GROUP>ed parameters after a C<COMMAND> node (see
735             comments for C<GROUP> in the C<COMPONENTS> / C<LaTeX::TOM::Node> section). You
736             can just have a while loop that calls this method until it gets C<undef>, and
737             you'll know you've found all the parameters to a command.
738              
739             Note: this may be bad, but C<TEXT> Nodes matching C</^\s*\[[0-9]+\]$/> (optional
740             parameter groups) are treated as if they were 'blank'.
741              
742             =back
743              
744             =head1 CAVEATS
745              
746             Due to the lack of tree-modification methods, currently this module is
747             mostly useful for minor modifications to the parsed document, for instance,
748             altering the text of C<TEXT> nodes but not deleting the nodes. Of course, the
749             user can still do this by breaking abstraction and directly modifying the Tree.
750              
751             Also note that the parsing is not complete. This module was not written with
752             the intention of being able to produce output documents the way ``latex'' does.
753             The intent was instead to be able to analyze and modify the document on a
754             logical level with regards to the content; it doesn't care about the document
755             formatting and outputting side of TeX/LaTeX.
756              
757             There is much work still to be done. See the F<TODO> list in the F<TOM.pm> source.
758              
759             =head1 BUGS
760              
761             Probably plenty. However, this module has performed fairly well on a set of
762             ~1000 research publications from the Computing Research Repository, so I
763             deemed it ``good enough'' to use for purposes similar to mine.
764              
765             Please let the maintainer know of parser errors if you discover any.
766              
767             =head1 CREDITS
768              
769             Thanks to those (in order of appearance) who have contributed valuable suggestions and patches:
770              
771             Otakar Smrz
772             Moritz Lenz
773             James Bowlin
774             Jesse S. Bangs
775             Cord Merrell
776             Debian Perl Group
777             Eli Billauer
778              
779             =head1 AUTHORS
780              
781             Written by Aaron Krowne <akrowne@vt.edu>
782              
783             Maintained by Steven Schubiger <schubiger@cpan.org>
784              
785             =head1 LICENSE
786              
787             This program is free software; you may redistribute it and/or
788             modify it under the same terms as Perl itself.
789              
790             See L<http://dev.perl.org/licenses/>
791              
792             =cut