From d28f6aa761bd44cc51179f96423d9f3fe83605d4 Mon Sep 17 00:00:00 2001 From: Justin Seyster Date: Wed, 16 Feb 2011 20:58:20 -0500 Subject: [PATCH] Adds Russ Cox's awesome regexp DFA implementation. Lots of scary licensing text! I'm keeping this GPL3 just for the sake of simplicity/consistency. Nobody is going to want the changes I make here anyway, so it doesn't make much of a difference that I'm relicensing. --- src/nfa.c | 447 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 447 insertions(+) create mode 100644 src/nfa.c diff --git a/src/nfa.c b/src/nfa.c new file mode 100644 index 0000000..0c5826c --- /dev/null +++ b/src/nfa.c @@ -0,0 +1,447 @@ +/* This program is free software: you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation, either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License + and a copy of the GCC Runtime Library Exception along with this + program; see the files COPYING and COPYING.RUNTIME respectively. + If not, see . */ + +/* This file, originally provided under the MIT license, is relicensed + here under the GPL3 for distribution with InterAspect. The + original license is provided at the end of the file but does not + apply to any changes to this file or to any other part of the + InterAspect distribution. */ + +/* + * Regular expression implementation. + * Supports only ( | ) * + ?. No escapes. + * Compiles to NFA and then simulates NFA + * using Thompson's algorithm. + * + * See also http://swtch.com/~rsc/regexp/ and + * Thompson, Ken. Regular Expression Search Algorithm, + * Communications of the ACM 11(6) (June 1968), pp. 419-422. + * + * Copyright (c) 2007 Russ Cox. + */ +#include +#include +#include +#include + +/* + * Convert infix regexp re to postfix notation. + * Insert . as explicit concatenation operator. + * Cheesy parser, return static buffer. + */ +char* +re2post(char *re) +{ + int nalt, natom; + static char buf[8000]; + char *dst; + struct { + int nalt; + int natom; + } paren[100], *p; + + p = paren; + dst = buf; + nalt = 0; + natom = 0; + if(strlen(re) >= sizeof buf/2) + return NULL; + for(; *re; re++){ + switch(*re){ + case '(': + if(natom > 1){ + --natom; + *dst++ = '.'; + } + if(p >= paren+100) + return NULL; + p->nalt = nalt; + p->natom = natom; + p++; + nalt = 0; + natom = 0; + break; + case '|': + if(natom == 0) + return NULL; + while(--natom > 0) + *dst++ = '.'; + nalt++; + break; + case ')': + if(p == paren) + return NULL; + if(natom == 0) + return NULL; + while(--natom > 0) + *dst++ = '.'; + for(; nalt > 0; nalt--) + *dst++ = '|'; + --p; + nalt = p->nalt; + natom = p->natom; + natom++; + break; + case '*': + case '+': + case '?': + if(natom == 0) + return NULL; + *dst++ = *re; + break; + default: + if(natom > 1){ + --natom; + *dst++ = '.'; + } + *dst++ = *re; + natom++; + break; + } + } + if(p != paren) + return NULL; + while(--natom > 0) + *dst++ = '.'; + for(; nalt > 0; nalt--) + *dst++ = '|'; + *dst = 0; + return buf; +} + +/* + * Represents an NFA state plus zero or one or two arrows exiting. + * if c == Match, no arrows out; matching state. + * If c == Split, unlabeled arrows to out and out1 (if != NULL). + * If c < 256, labeled arrow with character c to out. + */ +enum +{ + Match = 256, + Split = 257 +}; +typedef struct State State; +struct State +{ + int c; + State *out; + State *out1; + int lastlist; +}; +State matchstate = { Match }; /* matching state */ +int nstate; + +/* Allocate and initialize State */ +State* +state(int c, State *out, State *out1) +{ + State *s; + + nstate++; + s = malloc(sizeof *s); + s->lastlist = 0; + s->c = c; + s->out = out; + s->out1 = out1; + return s; +} + +/* + * A partially built NFA without the matching state filled in. + * Frag.start points at the start state. + * Frag.out is a list of places that need to be set to the + * next state for this fragment. + */ +typedef struct Frag Frag; +typedef union Ptrlist Ptrlist; +struct Frag +{ + State *start; + Ptrlist *out; +}; + +/* Initialize Frag struct. */ +Frag +frag(State *start, Ptrlist *out) +{ + Frag n = { start, out }; + return n; +} + +/* + * Since the out pointers in the list are always + * uninitialized, we use the pointers themselves + * as storage for the Ptrlists. + */ +union Ptrlist +{ + Ptrlist *next; + State *s; +}; + +/* Create singleton list containing just outp. */ +Ptrlist* +list1(State **outp) +{ + Ptrlist *l; + + l = (Ptrlist*)outp; + l->next = NULL; + return l; +} + +/* Patch the list of states at out to point to start. */ +void +patch(Ptrlist *l, State *s) +{ + Ptrlist *next; + + for(; l; l=next){ + next = l->next; + l->s = s; + } +} + +/* Join the two lists l1 and l2, returning the combination. */ +Ptrlist* +append(Ptrlist *l1, Ptrlist *l2) +{ + Ptrlist *oldl1; + + oldl1 = l1; + while(l1->next) + l1 = l1->next; + l1->next = l2; + return oldl1; +} + +/* + * Convert postfix regular expression to NFA. + * Return start state. + */ +State* +post2nfa(char *postfix) +{ + char *p; + Frag stack[1000], *stackp, e1, e2, e; + State *s; + + // fprintf(stderr, "postfix: %s\n", postfix); + + if(postfix == NULL) + return NULL; + + #define push(s) *stackp++ = s + #define pop() *--stackp + + stackp = stack; + for(p=postfix; *p; p++){ + switch(*p){ + default: + s = state(*p, NULL, NULL); + push(frag(s, list1(&s->out))); + break; + case '.': /* catenate */ + e2 = pop(); + e1 = pop(); + patch(e1.out, e2.start); + push(frag(e1.start, e2.out)); + break; + case '|': /* alternate */ + e2 = pop(); + e1 = pop(); + s = state(Split, e1.start, e2.start); + push(frag(s, append(e1.out, e2.out))); + break; + case '?': /* zero or one */ + e = pop(); + s = state(Split, e.start, NULL); + push(frag(s, append(e.out, list1(&s->out1)))); + break; + case '*': /* zero or more */ + e = pop(); + s = state(Split, e.start, NULL); + patch(e.out, s); + push(frag(s, list1(&s->out1))); + break; + case '+': /* one or more */ + e = pop(); + s = state(Split, e.start, NULL); + patch(e.out, s); + push(frag(e.start, list1(&s->out1))); + break; + } + } + + e = pop(); + if(stackp != stack) + return NULL; + + patch(e.out, &matchstate); + return e.start; +#undef pop +#undef push +} + +typedef struct List List; +struct List +{ + State **s; + int n; +}; +List l1, l2; +static int listid; + +void addstate(List*, State*); +void step(List*, int, List*); + +/* Compute initial state list */ +List* +startlist(State *start, List *l) +{ + l->n = 0; + listid++; + addstate(l, start); + return l; +} + +/* Check whether state list contains a match. */ +int +ismatch(List *l) +{ + int i; + + for(i=0; in; i++) + if(l->s[i] == &matchstate) + return 1; + return 0; +} + +/* Add s to l, following unlabeled arrows. */ +void +addstate(List *l, State *s) +{ + if(s == NULL || s->lastlist == listid) + return; + s->lastlist = listid; + if(s->c == Split){ + /* follow unlabeled arrows */ + addstate(l, s->out); + addstate(l, s->out1); + return; + } + l->s[l->n++] = s; +} + +/* + * Step the NFA from the states in clist + * past the character c, + * to create next NFA state set nlist. + */ +void +step(List *clist, int c, List *nlist) +{ + int i; + State *s; + + listid++; + nlist->n = 0; + for(i=0; in; i++){ + s = clist->s[i]; + if(s->c == c) + addstate(nlist, s->out); + } +} + +/* Run NFA to determine whether it matches s. */ +int +match(State *start, char *s) +{ + int i, c; + List *clist, *nlist, *t; + + clist = startlist(start, &l1); + nlist = &l2; + for(; *s; s++){ + c = *s & 0xFF; + step(clist, c, nlist); + t = clist; clist = nlist; nlist = t; /* swap clist, nlist */ + } + return ismatch(clist); +} + +int +main(int argc, char **argv) +{ + int i; + char *post; + State *start; + + if(argc < 3){ + fprintf(stderr, "usage: nfa regexp string...\n"); + return 1; + } + + post = re2post(argv[1]); + if(post == NULL){ + fprintf(stderr, "bad regexp %s\n", argv[1]); + return 1; + } + + start = post2nfa(post); + if(start == NULL){ + fprintf(stderr, "error in post2nfa %s\n", post); + return 1; + } + + l1.s = malloc(nstate*sizeof l1.s[0]); + l2.s = malloc(nstate*sizeof l2.s[0]); + for(i=2; i