#============================================================= -*-Perl-*-
#
# Template::Parser
#
# DESCRIPTION
#   This module implements a LALR(1) parser and assocated support 
#   methods to parse template documents into the appropriate "compiled"
#   format.  Much of the parser DFA code (see _parse() method) is based 
#   on Francois Desarmenien's Parse::Yapp module.  Kudos to him.
# 
# AUTHOR
#   Andy Wardley <abw@kfs.org>
#
# COPYRIGHT
#   Copyright (C) 1996-2000 Andy Wardley.  All Rights Reserved.
#   Copyright (C) 1998-2000 Canon Research Centre Europe Ltd.
#
#   This module is free software; you can redistribute it and/or
#   modify it under the same terms as Perl itself.
#
#   The following copyright notice appears in the Parse::Yapp 
#   documentation.  
#
#      The Parse::Yapp module and its related modules and shell
#      scripts are copyright (c) 1998 Francois Desarmenien,
#      France. All rights reserved.
#
#      You may use and distribute them under the terms of either
#      the GNU General Public License or the Artistic License, as
#      specified in the Perl README file.
# 
#----------------------------------------------------------------------------
#
# $Id: Parser.pm,v 2.4 2000/10/03 11:49:01 abw Exp $
#
#============================================================================

package Template::Parser;

require 5.004;

use strict;
use vars qw( $VERSION $DEBUG $ERROR );
use base qw( Template::Base );

use Template::Directive;
use Template::Grammar;

# parser state constants
use constant CONTINUE => 0;
use constant ACCEPT   => 1;
use constant ERROR    => 2;
use constant ABORT    => 3;

$VERSION = sprintf("%d.%02d", q$Revision: 2.4 $ =~ /(\d+)\.(\d+)/);
$DEBUG   = 0 unless defined $DEBUG;
$ERROR   = '';


#========================================================================
#                        -- COMMON TAG STYLES --
#========================================================================

my $TAG_STYLE   = {
    'default'   => [ '\[%',    '%\]'    ],
    'template'  => [ '\[%',    '%\]'    ],
    'template1' => [ '[\[%]%', '%[\]%]' ],
    'metatext'  => [ '%%',     '%%'     ],
    'html'      => [ '<!--',   '-->'    ],
    'mason'     => [ '<%',     '>'      ],
    'asp'       => [ '<%',     '%>'     ],
    'php'       => [ '<\?',    '\?>'    ],
};


#========================================================================
#                      -----  PUBLIC METHODS -----
#========================================================================

#------------------------------------------------------------------------
# new(\%config)
#
# Constructor method. 
#------------------------------------------------------------------------

sub new {
    my $class  = shift;
    my $config = ref $_[0] eq 'HASH' ? shift(@_) : { @_ };
    my ($tagstyle, $start, $end, $defaults, $grammar, $hash, $key, $udef);

    my $self = bless { 
	START_TAG   => undef,
	END_TAG     => undef,
	TAG_STYLE   => 'default',
	ANYCASE     => 0,
	INTERPOLATE => 0,
	PRE_CHOMP   => 0,
	POST_CHOMP  => 0,
	V1DOLLAR    => 0,
	EVAL_PERL   => 0,
	GRAMMAR     => undef,
	_ERROR      => '',
    }, $class;

    # update self with any relevant keys in config
    foreach $key (keys %$self) {
	$self->{ $key } = $config->{ $key } if defined $config->{ $key };
    }

    $grammar = $self->{ GRAMMAR } ||= do {
	require Template::Grammar;
	Template::Grammar->new();
    };
    $self->{ FACTORY } ||= 'Template::Directive';

    # determine START_TAG and END_TAG for specified (or default) TAG_STYLE
    $tagstyle = $self->{ TAG_STYLE } || 'default';
    return $class->error("Invalid tag style: $tagstyle")
	unless defined ($start = $TAG_STYLE->{ $tagstyle });
    ($start, $end) = @$start;

    $self->{ START_TAG } ||= $start;
    $self->{   END_TAG } ||= $end;

    # load grammar rules, states and lex table
    @$self{ qw( LEXTABLE STATES RULES ) } 
	= @$grammar{ qw( LEXTABLE STATES RULES ) };
    
    return $self;
}


#------------------------------------------------------------------------
# parse($text)
#
# Parses the text string, $text and returns a hash array representing
# the compiled template block(s) as Perl code, in the format expected
# by Template::Document.
#------------------------------------------------------------------------

sub parse {
    my $self = shift;
    my $text = shift;
    my ($tokens, $block);

    # store for blocks defined in the template (see define_block())
    my $defblock = $self->{ DEFBLOCK } = { };
    my $metadata = $self->{ METADATA } = [ ];

    $self->{ _ERROR }  = '';

    # split file into TEXT/DIRECTIVE chunks
    $tokens = $self->split_text($text)
	|| return undef;				    ## RETURN ##

    # parse chunks
    $block = $self->_parse($tokens)
	|| return undef;				    ## RETURN ##

    print STDERR "compiled main template document block:\n$block\n"
	if $DEBUG;

    return {
	BLOCK     => $block,
	DEFBLOCKS => $defblock,
	METADATA  => { @$metadata },
    };
}


#------------------------------------------------------------------------
# split_text($text)
#
# Split input template text into directives and raw text chunks.
#------------------------------------------------------------------------

sub split_text {
    my ($self, $text) = @_;
    my ($pre, $dir, $prelines, $dirlines, $postlines, $chomp, $tags, @tags);
    my ($start, $end, $prechomp, $postchomp, $interp ) = 
	@$self{ qw( START_TAG END_TAG PRE_CHOMP POST_CHOMP INTERPOLATE ) };

    my @tokens = ();
    my $line = 1;

    return \@tokens					    ## RETURN ##
	unless defined $text && length $text;

    # extract all directives from the text
    while ($text =~ s/
	   ^(.*?)               # $1 - start of line up to directive
	    (?:
		$start          # start of tag
		(.*?)           # $2 - tag contents
		$end            # end of tag
	    )
	    //sx) {

	($pre, $dir) = ($1, $2);
	$pre = '' unless defined $pre;
	$dir = '' unless defined $dir;
	
	$postlines = 0;                      # denotes lines chomped
	$prelines  = ($pre =~ tr/\n//);      # NULL - count only
	$dirlines  = ($dir =~ tr/\n//);      # ditto

	# the directive CHOMP options may modify the preceeding text
	for ($dir) {
	    # remove leading whitespace and check for a '-' chomp flag
	    s/^([-+\#])?\s*//s;
	    if ($1 && $1 eq '#') {
		# comment out entire directive
		$dir = '';
	    }
	    else {
		$chomp = ($1 && $1 eq '+') ? 0 : ($1 || $prechomp);

    		# chomp off whitespace and newline preceeding directive
    		$chomp and $pre =~ s/(\n|^)[ \t]*\Z//m
    		       and $1 eq "\n"
    		       and $prelines++;
	    }
    
	    # remove trailing whitespace and check for a '-' chomp flag
	    s/\s*([-+])?\s*$//s;
	    $chomp = ($1 && $1 eq '+') ? 0 : ($1 || $postchomp);

	    # only chomp newline if it's not the last character
	    $chomp and $text =~ s/^[ \t]*\n(.|\n)/$1/
		   and $postlines++;
	}

	# any text preceeding the directive can now be added
	if (length $pre) {
	    push(@tokens, $interp
		 ? [ $pre, $line, 'ITEXT' ]
		 : ('TEXT', $pre) );
	    $line += $prelines;
	}
	
	# and now the directive, along with line number information
	if (length $dir) {
	    # the TAGS directive is a compile-time switch
	    if ($dir =~ /TAGS\s+(.*)/i) {
		my @tags = split(/\s+/, $1);
		if (scalar @tags > 1) {
		    ($start, $end) = map { quotemeta($_) } @tags;
		}
		elsif ($tags = $TAG_STYLE->{ $tags[0] }) {
		    ($start, $end) = @$tags;
		}
		else {
		    warn "invalid TAGS style: $tags[0]\n";
		}
	    }
	    else {
		# DIRECTIVE is pushed as [ $dirtext, $line_no(s), \@tokens ]
		push(@tokens, [ $dir, 
				($dirlines 
				 ? sprintf("%d-%d", $line, $line + $dirlines)
				 : $line),
				$self->tokenise_directive($dir) ]);
	    }
	}

	# update line counter to include directive lines and any extra
	# newline chomped off the start of the following text
	$line += $dirlines + $postlines;
    }

    # anything remaining in the string is plain text 
    push(@tokens, $interp 
	 ? [ $text, $line, 'ITEXT' ]
	 : ( 'TEXT', $text) )
	if length $text;

    return \@tokens;					    ## RETURN ##
}


#------------------------------------------------------------------------
# interpolate_text($text, $line)
#
# Examines $text looking for any variable references embedded like
# $this or like ${ this }.
#------------------------------------------------------------------------

sub interpolate_text {
    my ($self, $text, $line) = @_;
    my @tokens  = ();
    my ($pre, $var, $dir);


    while ($text =~ 
	   /
	   ( (?: \\. | [^\$] )+ )   # escaped or non-'$' character [$1]
	   | 
	   ( \$ (?:		    # embedded variable	           [$2]
	     (?: \{ ([^\}]*) \} )   # ${ ... }                     [$3]
	     |
	     ([\w\.]+)		    # $word                        [$4]
	     )
	   )
	/gx) {
    
	($pre, $var, $dir) = ($1, $3 || $4, $2);

	# preceeding text
	if ($pre) {
	    $line += $pre =~ tr/\n//;
	    $pre =~ s/\\\$/\$/g;
	    push(@tokens, 'TEXT', $pre);
	}
	# $variable reference
        if ($var) {
	    $line += $dir =~ tr/\n/ /;
	    push(@tokens, [ $dir, $line, $self->tokenise_directive($var) ]);
	}
	# other '$' reference - treated as text
	elsif ($dir) {
	    $line += $dir =~ tr/\n//;
	    push(@tokens, 'TEXT', $dir);
	}
    }

    return \@tokens;
}


#------------------------------------------------------------------------
# tokenise_directive($text)
#
# Called by the private _parse() method when it encounters a DIRECTIVE
# token in the list provided by the split_text() or interpolate_text()
# methods.  The directive text is passed by parameter.
#
# The method splits the directive into individual tokens as recognised
# by the parser grammar (see Template::Grammar for details).  It
# constructs a list of tokens each represented by 2 elements, as per
# split_text() et al.  The first element contains the token type, the
# second the token itself.
#
# The method tokenises the string using a complex (but fast) regex.
# For a deeper understanding of the regex magic at work here, see
# Jeffrey Friedl's excellent book "Mastering Regular Expressions",
# from O'Reilly, ISBN 1-56592-257-3
#
# Returns a reference to the list of chunks (each one being 2 elements) 
# identified in the directive text.  On error, the internal _ERROR string 
# is set and undef is returned.
#------------------------------------------------------------------------

sub tokenise_directive {
    my ($self, $text, $line) = @_;
    my ($token, $uctoken, $type, $lookup);
    my ($lextable, $anycase, $start, $end) = 
	@$self{ qw( LEXTABLE ANYCASE START_TAG END_TAG ) };
    my @tokens = ( );

    while ($text =~ 
	    / 
		# strip out any comments
	        (\#[^\n]*)
	   |
		# a quoted phrase matches in $3
		(["'])                   # $2 - opening quote, " or '
		(                        # $3 - quoted text buffer
		    (?:                  # repeat group (no backreference)
			\\\\             # an escaped backslash \\
		    |                    # ...or...
			\\\2             # an escaped quote \" or \' (match $1)
		    |                    # ...or...
			.                # any other character
		    |	\n
		    )*?                  # non-greedy repeat
		)                        # end of $3
		\2                       # match opening quote
	    |
		# an unquoted number matches in $4
		(-?\d+(?:\.\d+)?)       # numbers
	    |
		# filename matches in $5
	    	( \/?\w+(?:(?:\/|::?)\w*)+ | \/\w+)
	    |
		# an identifier matches in $6
		(\w+)                    # variable identifier
	    |   
		# an unquoted word or symbol matches in $7
		(   [(){}\[\]:;,\/\\]    # misc parenthesis and symbols
#		|   \->                  # arrow operator (for future?)
		|   \+\-\*               # math operations
		|   \$\{?                # dollar with option left brace
		|   =>			 # like '='
		|   [=!<>]?= | [!<>]     # eqality tests
		|   &&? | \|\|?          # boolean ops
		|   \.\.?                # n..n sequence
 		|   \S+                  # something unquoted
		)                        # end of $7
	    /gmxo) {

	# ignore comments to EOL
	next if $1;

	# quoted string
	if (defined ($token = $3)) {
            # double-quoted string may include $variable references
	    if ($2 eq '"') {
	        if ($token =~ /[\$\\]/) {
		    $type = 'QUOTED';
		    # unescape " and \ but leave \$ escaped so that 
		    # interpolate_text() doesn't incorrectly treat it
		    # as a variable reference
#		    $token =~ s/\\([\\"])/$1/g;
		    $token =~ s/\\([^\$n])/$1/g;
		    $token =~ s/\\n/\n/g;
		    push(@tokens, ('"') x 2,
				  @{ $self->interpolate_text($token) },
				  ('"') x 2);
		    next;
		}
                else {
	            $type = 'LITERAL';
		    $token =~ s['][\\']g;
		    $token = "'$token'";
		}
	    } 
	    else {
		$type = 'LITERAL';
		$token = "'$token'";
	    }
	}
	# number
	elsif (defined ($token = $4)) {
	    $type = 'NUMBER';
	}
	elsif (defined($token = $5)) {
	    $type = 'FILENAME';
	}
	elsif (defined($token = $6)) {
	    # reserved words may be in lower case unless case sensitive
	    $uctoken = $anycase ? uc $token : $token;
	    if (defined ($type = $lextable->{ $uctoken })) {
		$token = $uctoken;
	    }
	    else {
		$type = 'IDENT';
	    }
	}
	elsif (defined ($token = $7)) {
	    # reserved words may be in lower case unless case sensitive
	    $uctoken = $anycase ? uc $token : $token;
	    unless (defined ($type = $lextable->{ $uctoken })) {
		$type = 'UNQUOTED';
	    }
	}

	push(@tokens, $type, $token);

#	print(STDERR " +[ $type, $token ]\n")
#	    if $DEBUG;
    }

#    print STDERR "tokenise directive() returning:\n  [ @tokens ]\n"
#	if $DEBUG;

    return \@tokens;					    ## RETURN ##
}


#------------------------------------------------------------------------
# define_block($name, $block)
#
# Called by the parser 'defblock' rule when a BLOCK definition is 
# encountered in the template.  The name of the block is passed in the 
# first parameter and a reference to the compiled block is passed in
# the second.  This method stores the block in the $self->{ DEFBLOCK }
# hash which has been initialised by parse() and will later be used 
# by the same method to call the store() method on the calling cache
# to define the block "externally".
#------------------------------------------------------------------------

sub define_block {
    my ($self, $name, $block) = @_;
    my $defblock = $self->{ DEFBLOCK } 
        || return undef;

    print STDERR "compiled block '$name':\n$block\n"
	if $DEBUG;

    $defblock->{ $name } = $block;
    
    return undef;
}


#------------------------------------------------------------------------
# add_metadata(\@setlist)
#------------------------------------------------------------------------

sub add_metadata {
    my ($self, $setlist) = @_;
    my $metadata = $self->{ METADATA } 
        || return undef;

    push(@$metadata, @$setlist);
    
    return undef;
}


#========================================================================
#                     -----  PRIVATE METHODS -----
#========================================================================

#------------------------------------------------------------------------
# _parse(\@tokens)
#
# Parses the list of input tokens passed by reference and returns a 
# Template::Directive::Block object which contains the compiled 
# representation of the template. 
#
# This is the main parser DFA loop.  See embedded comments for 
# further details.
#
# On error, undef is returned and the internal _ERROR field is set to 
# indicate the error.  This can be retrieved by calling the error() 
# method.
#------------------------------------------------------------------------

sub _parse {
    my ($self, $tokens) = @_;
    my ($token, $value, $text, $line, $inperl);
    my ($state, $stateno, $status, $action, $lookup, $coderet, @codevars);
    my ($lhs, $len, $code);	    # rule contents
    my $stack = [ [ 0, undef ] ];   # DFA stack

# DEBUG
#   local $" = ', ';

    # retrieve internal rule and state tables
    my ($states, $rules) = @$self{ qw( STATES RULES ) };

    # call the grammar set_factory method to install emitter factory
    $self->{ GRAMMAR }->install_factory($self->{ FACTORY });

    $line = $inperl = 0;
    $self->{ LINE   } = \$line;
    $self->{ INPERL } = \$inperl;

    $status = CONTINUE;

    while(1) {
	# get state number and state
	$stateno =  $stack->[-1]->[0];
	$state   = $states->[$stateno];

	# see if any lookaheads exist for the current state
	if (exists $state->{'ACTIONS'}) {

	    # get next token and expand any directives (i.e. token is an 
	    # array ref) onto the front of the token list
	    while (! defined $token && @$tokens) {
		$token = shift(@$tokens);
		if (ref $token) {
		    ($text, $line, $token) = @$token;
		    if (ref $token) {
			unshift(@$tokens, @$token, (';') x 2);
			$token = undef;  # force redo
		    }
		    elsif ($token eq 'ITEXT') {
			if ($inperl) {
			    # don't perform interpolation in PERL blocks
			    $token = 'TEXT';
			    $value = $text;
			}
			else {
			    unshift(@$tokens, 
				    @{ $self->interpolate_text($text, $line) });
			    $token = undef; # force redo
			}
		    }
		}
		else {
		    $value = shift(@$tokens);
		}
	    };
	    # clear undefined token to avoid 'undefined variable blah blah'
	    # warnings and let the parser logic pick it up in a minute
	    $token = '' unless defined $token;

	    # get the next state for the current lookahead token
	    $action = defined ($lookup = $state->{'ACTIONS'}->{ $token })
	              ? $lookup
		      : defined ($lookup = $state->{'DEFAULT'})
		        ? $lookup
		        : undef;
	}
	else {
	    # no lookahead actions
	    $action = $state->{'DEFAULT'};
	}

	# ERROR: no ACTION
	last unless defined $action;

	# - - - - - - - - - - - - - - - - - - - - - - - - - - - -
	# shift (+ive ACTION)
	# - - - - - - - - - - - - - - - - - - - - - - - - - - - -
	if ($action > 0) {
	    push(@$stack, [ $action, $value ]);
	    $token = $value = undef;
	    redo;
	};

	# - - - - - - - - - - - - - - - - - - - - - - - - - - - -
	# reduce (-ive ACTION)
	# - - - - - - - - - - - - - - - - - - - - - - - - - - - -
	($lhs, $len, $code) = @{ $rules->[ -$action ] };

	# no action imples ACCEPTance
	$action
	    or $status = ACCEPT;

	# use dummy sub if code ref doesn't exist
	$code = sub { $_[1] }
	    unless $code;

	@codevars = $len
		?   map { $_->[1] } @$stack[ -$len .. -1 ]
		:   ();

	$coderet = &$code( $self, @codevars );

	# reduce stack by $len
	splice(@$stack, -$len, $len);

	# ACCEPT
	return $coderet					    ## RETURN ##
	    if $status == ACCEPT;

	# ABORT
	return undef					    ## RETURN ##
	    if $status == ABORT;

	# ERROR
	last 
	    if $status == ERROR;
    }
    continue {
	push(@$stack, [ $states->[ $stack->[-1][0] ]->{'GOTOS'}->{ $lhs }, 
	      $coderet ]), 
    }

    # ERROR						    ## RETURN ##
    return $self->_parse_error('unexpected end of input')
	unless defined $value;

    # munge text of last directive to make it readable
#    $text =~ s/\n/\\n/g;

    return $self->_parse_error("unexpected end of directive", $text)
	if $value eq ';';   # end of directive SEPARATOR

    return $self->_parse_error("unexpected token ($value)", $text);
}


#------------------------------------------------------------------------
# _parse_error($msg, $dirtext)
#
# Method used to handle errors encountered during the parse process
# in the _parse() method.  
#------------------------------------------------------------------------

sub _parse_error {
    my ($self, $msg, $text) = @_;
    my $line = $self->{ LINE };
    $line = ref($line) ? $$line : $line;
    $line = 'unknown' unless $line;

    $msg .= "\n  [% $text %]"
	if defined $text;

    return $self->error("line $line: $msg");
}


#------------------------------------------------------------------------
# _dump()
# 
# Debug method returns a string representing the internal state of the 
# object.
#------------------------------------------------------------------------

sub _dump {
    my $self = shift;
    my $output = "$self:\n";
    foreach my $key (qw( START_TAG END_TAG TAG_STYLE ANYCASE INTERPOLATE 
			 PRE_CHOMP POST_CHOMP V1DOLLAR ) ) {
	
	$output .= sprintf("%-12s => %s\n", $key, $self->{ $key });
    }
    return $output;
}
    

1;