#ifndef lint
static char sccsid[] = "@(#)regexp.c     1.2 (LBL) 12/4/85";
static char rcsid[] =
   "$Id: regexp.c,v 1.3 1999/05/27 16:17:43 mike Exp $";
#endif

/*
 * Copyright %%\copyright%% 1980 The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Regular expression matching routines for lgrind/tgrind/tfontedpr.
 *
 * These routines were written by Dave Presotto (I think) for vgrind.
 * Minor mods & attempts to improve performance by Van Jacobson
 * (van@@lbl-rtsg) and Chris Torek (chris@@maryland).
 *
 * Modifications.
 * --------------
 *    Sep 91	George V Reilly	Fixed up some bogus uses of @NIL@ and @NULL@.
 * 30 Mar 85	Van & Chris	Changed @expmatch()@ to return pointer to
 *				start of what was matched in addition to
 *				pointer to match end.  Several changes to
 *				improve performance (too numerous to mention).
 * 11 Dec 84	Dave Presotto	Written.
 */


#include <stdio.h>
#include <malloc.h>
#include <ctype.h>
#include <string.h>
#include "regexp.h"

#define makelower(c) (isupper((c)) ? tolower((c)) : (c))

extern char *l_id;
static void expconv(void);	 /* forward declaration */

int    (*re_strncmp)(const char *, const char *, size_t);	 
			/* function used by @expmatch()@ to compare
			* strings.  The caller should make it point to
			* @strncmp()@ if case is significant &
			* @lc_strncmp()@ otherwise.
			*/


/*  @lc_strncmp()@ ---	like @strncmp()@ except that we convert the
 *			first string to lower case before comparing.
 */
int lc_strncmp(register const char *s1, register const char *s2, register size_t len)
{
   while (len-- > 0)
      if (*s2 - makelower(*s1))
	 return 1;
      else
	 s2++, s1++;
   
   return 0;
}


/*  The following routine converts an irregular expression to
 *  internal format.
 *
 *  Either meta symbols (%|\a|%, %|\d|%, or %|\p|%) or character strings
 *  or operations (alternation or parenthesizing) can be specified.
 *  Each starts with a descriptor byte.  The descriptor byte
 *  has @STR@ set for strings, @META@ set for meta symbols,
 *  and @OPER@ set for operations.  The descriptor byte can also have
 *  the @OPT@ bit set if the object defined is optional.  Also @ALT@
 *  can be set to indicate an alternation.
 *
 *  For metasymbols, the byte following the descriptor byte identities
 *  the meta symbol (containing an ASCII @'a'@, @'d'@, @'p'@, @'|'@,
 *  or @'('@).  For strings, the byte after the descriptor is a
 *  character count for the string:
 *@
 *	meta symbols :=	descriptor
 *			symbol
 *
 *	strings :=	descriptor
 *			character count
 *			the string
 *
 *	operations :=	descriptor
 *			symbol
 *			character count@
 */


/*
 *  handy macros for accessing parts of match blocks
 */
#define MSYM(A)	 (*(A+1))	/* symbol in a meta symbol block */
#define MNEXT(A) (A+2)		/* character following a metasymbol block */

#define OSYM(A)	 (*(A+1))	/* symbol in an operation block */
#define OCNT(A)	 (*(A+2))	/* character count */
#define ONEXT(A) (A+3)		/* next character after the operation */
#define OPTR(A)	 (A+*(A+2))	/* place pointed to by the operator */

#define SCNT(A)	 (*(A+1))	/* byte count of a string */
#define SSTR(A)	 (A+2)		/* address of the string */
#define SNEXT(A) (A+2+*(A+1))	/* character following the string */


/*
 *  bit flags in the descriptor 
 */
#define OPT	 1
#define STR	 2
#define META	 4
#define ALT	 8
#define OPER	16

char *ure;		/* pointer current position in unconverted exp */
char *ccre;		/* pointer to current position in converted exp */



/*
 * Convert an irregular expression to the internal form
 */
char *convexp(char *re)
/* re	 unconverted irregular expression */
{
   register char *cre;	/* pointer to converted regular expression */
   
   /* allocate room for the converted expression */
   if (re == NULL)
      return NULL;
   if (*re == '\0')
      return NULL;
   cre = (char*)malloc(4 * strlen(re) + 3);
   ccre = cre;
   ure = re;
   
   /* start the conversion with a %|\a|% */
   *cre = META | OPT;
   MSYM(cre) = 'a';
   ccre = MNEXT(cre);

   /* start the conversion (it's recursive) */
   expconv();
   *ccre = 0;
   return cre;
}


/*
 * Actually do the conversion
 */
static void expconv(void)
{
   register char *cs;	/* pointer to current symbol in converted exp */
   register char c;	/* character being processed */
   register char *acs;	/* pointer to last alternate */
   register int temp;

   /* let the conversion begin */
   acs = NULL;
   cs = NULL;
   while (*ure != '\0') {
	  switch (c = *ure++) {

	  case '\\':
	 switch (c = *ure++) {

		/* escaped characters are just characters */
	 default:
		if (cs == NULL || (*cs & STR) == 0) {
		   cs = ccre;
		   *cs = STR;
		   SCNT(cs) = 1;
		   ccre += 2;
		} else
		   SCNT(cs)++;
		*ccre++ = c;
		break;

		/* normal(?) metacharacters */
	 case 'a':
	 case 'd':
	 case 'e':
	 case 'p':
		if (acs != NULL && acs != cs) {
		   do {
		  temp = OCNT(acs);
		  OCNT(acs) = ccre - acs;
		  acs -= temp;
		   } while (temp != 0);
		   acs = NULL;
		}
		cs = ccre;
		*cs = META;
		MSYM(cs) = c;
	    ccre = MNEXT(cs);
	    break;
	 }
	 break;
	 
	 /* just put the symbol in */
      case '^':
      case '$':
	 if (acs != NULL && acs != cs) {
	    do {
	       temp = OCNT(acs);
	       OCNT(acs) = ccre - acs;
	       acs -= temp;
	    } while (temp != 0);
	    acs = NULL;
	 }
	 cs = ccre;
	 *cs = META;
	 MSYM(cs) = c;
	 ccre = MNEXT(cs);
	 break;
	 
	 /* mark the last match sequence as optional */
      case '?':
	 if (cs)
		*cs = *cs | OPT;
	 break;
	 
	 /* recurse and define a subexpression */
      case '(':
	 if (acs != NULL && acs != cs) {
	    do {
	       temp = OCNT(acs);
	       OCNT(acs) = ccre - acs;
	       acs -= temp;
	    } while (temp != 0);
	    acs = NULL;
	 }
	 cs = ccre;
	 *cs = OPER;
	 OSYM(cs) = '(';
	 ccre = ONEXT(cs);
	 expconv();
	 OCNT(cs) = ccre - cs;		/* offset to next symbol */
	 break;
	 
	 /* return from a recursion */
      case ')':
	 if (acs != NULL) {
	    do {
	       temp = OCNT(acs);
	       OCNT(acs) = ccre - acs;
	       acs -= temp;
	    } while (temp != 0);
	    acs = NULL;
	 }
	 cs = ccre;
	 *cs = META;
	 MSYM(cs) = c;
	 ccre = MNEXT(cs);
	 return;
	 
	 /* Mark the last match sequence as having an alternate. */
	 /* The third byte will contain an offset to jump over the */
	 /* alternate match in case the first did not fail */
      case '|':
	 if (acs != NULL && acs != cs)
	    OCNT(ccre) = ccre - acs;	/* make a back pointer */
	 else
	    OCNT(ccre) = 0;
	 *cs |= ALT;
	 cs = ccre;
	 *cs = OPER;
	 OSYM(cs) = '|';
	 ccre = ONEXT(cs);
	 acs = cs;	/* remember that the pointer is to be filled */
	 break;
	 
	 /* if it's not a metasymbol, just build a character string */
      default:
	 if (cs == NULL || (*cs & STR) == 0) {
	    cs = ccre;
	    *cs = STR;
	    SCNT(cs) = 1;
	    ccre = SSTR(cs);
	 } else
	    SCNT(cs)++;
	 *ccre++ = c;
	 break;
      }
   }
   if (acs != NULL) {
      do {
	 temp = OCNT(acs);
	 OCNT(acs) = ccre - acs;
	 acs -= temp;
      } while (temp != 0);
      acs = NULL;
   }
   return;
} /* end of @expconv()@ */


/*
 *  The following routine recognises an irregular expression
 *  with the following special characters:
 *
 *	%|\?|%		means last match was optional
 *	%|\a|%		matches any number of characters
 *	%|\d|%		matches any number of spaces and tabs
 *	%|\p|%		matches any number of alphanumeric characters.
 *			The characters matched will be copied into
 *			the area pointed to by @mstring@.
 *	%|\||%		alternation
 *	%|\( \)|%	grouping used mostly for alternation and
 *			optionality
 *
 *  The irregular expression must be translated to internal form
 *  prior to calling this routine
 *
 *  The value returned is the pointer to the last character matched.
 *  If @strtptr@ is non-@NULL@, a pointer to the start of the string matched
 *  (excluding %|\a|% matches) will be returned at @*strtptr@.  
 *  If @mstring@ is non-@NULL@, the string matched by a %|\p|% will be copied
 *  into @mstring@.
 */

boolean _escaped;		/* true if we are currently @_escaped@ */
char *_sstart;			/* start of string.  Set in @putScp()@. */


char *expmatch(
   register char *s,		/* string to check for a match in */
   register char *re,		/* a converted irregular expression */
   char **strtptr,		/* where to put ptr to start of match */
   char *mstring)		/* where to put whatever matches a %|\p|% */
{
   register char *cs;		/* the current symbol */
   register char *ptr, *s1;	/* temporary pointer */
   register char c;		/* temporary */
   boolean matched;		/* a temporary @boolean@ */
   
   /* initial conditions */
   if (strtptr)
      *strtptr = '\0';
   if (re == NULL)
      return NULL;
   cs = re;
   matched = FALSE;
   
   /* loop till expression string is exhausted (or at least pretty tired) */
   while (*cs) {
      switch (*cs & (OPER | STR | META)) {
	 
	 /* try to match a string */
      case STR:
	 matched = !((*re_strncmp)(s, SSTR(cs), SCNT(cs)));
	 if (matched) {
	    
	    /* hoorah: it matches */
	    s += SCNT(cs);
	    cs = SNEXT(cs);
	 } else if (*cs & ALT) {
	    
	    /* alternation: skip to next expression */
	    cs = SNEXT(cs);
	 } else if (*cs & OPT) {
	    
	    /* the match is optional */
	    cs = SNEXT(cs);
	    matched = 1;		/* indicate a successful match */
	 } else {
	    
	    /* no match: error return */
	    return NULL;
	 }
	 break;
	 
	 /* an operator: do something fancy */
      case OPER:
	 switch (OSYM(cs)) {
	    
	    /* this is an alternation */
	 case '|':
	    if (matched)
	       
	       /* last thing in the alternation was a match: skip ahead */
	       cs = OPTR(cs);
	    else
	       
	       /* no match: keep trying */
	       cs = ONEXT(cs);
	    break;
	    
	    /* this is a grouping: recurse */
	 case '(':
	    ptr = expmatch(s, ONEXT(cs), strtptr, mstring);
	    if (ptr != NULL) {
	       
	       /* the subexpression matched */
	       matched = 1;
	       s = ptr;
	    } else if (*cs & ALT) {
	       
	       /* alternation: skip to next expression */
	       matched = 0;
	    } else if (*cs & OPT) {
	       
	       /* the match is optional */
	       matched = 1;	/* indicate a successful match */
	    } else {
	       
	       /* no match: error return */
	       return NULL;
	    }
	    cs = OPTR(cs);
	    break;
	 }
	 break;
	 
	 /* try to match a metasymbol */
      case META:
	 switch (MSYM(cs)) {
	    
	    /* try to match anything and remember what was matched */
	 case 'p':
	    /*
	     *  This is really the same as trying the match the
	     *  remaining parts of the expression to any subset
	     *  of the string.
	     */
	    s1 = s;
	    do {
	       ptr = expmatch(s1, MNEXT(cs), strtptr, mstring);
	       if (ptr != NULL && s1 != s) {
		  
		  /* we have a match: remember the match */
		  if (mstring) {
		     strncpy(mstring, s, (size_t)(s1 - s));
		     mstring[(int)(s1 - s)] = '\0';
		  }
		  return ptr;
		  
	       } else if (ptr != NULL && (*cs & OPT)) {
		  
		  /* %|\p|% was optional, so no match is ok */
		  return ptr;
		  
	       } else if (ptr != NULL) {
		  
		  /* not optional and we still matched */
		  return NULL;
	       }
	       if (!isalnum(*s1) && !strchr(l_id, *s1))
		  return NULL;
	       if (*s1 == '\\')
		  _escaped = _escaped ? FALSE : TRUE;
	       else
		  _escaped = FALSE;
	    } while (*s1++);
	    return NULL;
	    
	    /* try to match anything */
	 case 'a':
	    /*
	     *  This is really the same as trying the match the
	     *  remaining parts of the expression to any subset
	     *  of the string.
	     */
	    s1 = s;
	    do {
	       /*
		* Hack for an important special case: if the next thing
		* in the pattern is a string, just gobble characters until
		* we find something that matches that string (this saves
		* the cost of a recursive call on @expmatch()@ while scanning
		* for the start of comments or strings).  Since many
		* patterns end with a string, we also treat that as a
		* special case.
		*/
	       if(*(ptr = MNEXT(cs)) == STR) {
		  c = *SSTR(ptr);
		  while(*s1 && *s1 != c)
		     s1++;
		  
		  if (*s1 == 0)
		     return NULL;
		  
		  if (SNEXT(ptr) == 0 && (s1 != s || *cs & OPT)) {
		     /* next item is a string, it's the last item and
		      * the %|\a|% match is ok --- just loop to try & match
		      * the string.
		      */
		     if (strtptr)
			*strtptr = s1;
		     
		     cs = ptr;
		     s = s1;
		     break;
		  }
	       }
	       ptr = expmatch(s1, MNEXT(cs), strtptr, mstring);
	       if (ptr != NULL && (s1 != s || *cs & OPT)) {
		  
		  /* we have a match */
		  if (strtptr)
		     *strtptr = s1;
		  
		  return ptr;
		  
	       } else if (ptr != NULL) {
		  
		  /* not optional and we still matched */
		  return NULL;
	       }
	       if (*s1 == '\\')
		  _escaped = _escaped ? FALSE : TRUE;
	       else
		  _escaped = FALSE;
	    } while (*s1++);
	    return NULL;
	    
	    /* fail if we are currently @_escaped@ */
	 case 'e':
	    if (_escaped)
	       return NULL;
	    cs = MNEXT(cs); 
	    break;
	    
	    /* match any number of tabs and spaces */
	 case 'd':
	    ptr = s;
	    while (*s == ' ' || *s == '\t')
	       s++;

	    if (s != ptr || s == _sstart) {
	       
	       /* match: be happy */
	       matched = 1;
	       cs = MNEXT(cs); 
	    } else if (*s == '\n' || *s == '\0') {
	       
	       /* match: be happy */
	       matched = 1;
	       cs = MNEXT(cs); 
	    } else if (*cs & ALT) {
	       
	       /* try the next part */
	       matched = 0;
	       cs = MNEXT(cs);
	    } else if (*cs & OPT) {
	       
	       /* doesn't matter */
	       matched = 1;
	       cs = MNEXT(cs);
	    } else
	       
	       /* no match: error return */
	       return NULL;

	    break;
	    
	    /* check for end of line */
	 case '$':
	    if (*s == '\0' || *s == '\n') {
	       
	       /* match: be happy */
	       /* s++; was here
		* removed; now $ is left in stream to be matched again, which
		* is not bad (EOL can end several things at once. Also it
		* leaves things like CE on the same line.
		*/
	       matched = 1;
	       cs = MNEXT(cs);
	    } else if (*cs & ALT) {
	       
	       /* try the next part */
	       matched = 0;
	       cs = MNEXT(cs);
	    } else if (*cs & OPT) {
	       
	       /* doesn't matter */
	       matched = 1;
	       cs = MNEXT(cs);
	    } else
	       
	       /* no match: error return */
	       return NULL;
	    break;
	    
	    /* check for start of line */
	 case '^':
	    if (s == _sstart) {
	       
	       /* match: be happy */
	       matched = 1;
	       cs = MNEXT(cs);
	    } else if (*cs & ALT) {
	       
	       /* try the next part */
	       matched = 0;
	       cs = MNEXT(cs);
	    } else if (*cs & OPT) {
	       
	       /* doesn't matter */
	       matched = 1;
	       cs = MNEXT(cs);
	    } else
	       
	       /* no match: error return */
	       return NULL;
	    break;
	    
	    /* end of a subexpression: return success */
	 case ')':
	    return s;
	 }
	 break;
      }
   }
   return s;
}