Adds Russ Cox's awesome regexp DFA implementation.
authorJustin Seyster <jseyster@cs.sunysb.edu>
Thu, 17 Feb 2011 01:58:20 +0000 (20:58 -0500)
committerJustin Seyster <jseyster@cs.sunysb.edu>
Thu, 17 Feb 2011 01:58:20 +0000 (20:58 -0500)
Lots of scary licensing text!  I'm keeping this GPL3 just for the sake
of simplicity/consistency.  Nobody is going to want the changes I make
here anyway, so it doesn't make much of a difference that I'm
relicensing.

src/nfa.c [new file with mode: 0644]

diff --git a/src/nfa.c b/src/nfa.c
new file mode 100644 (file)
index 0000000..0c5826c
--- /dev/null
+++ b/src/nfa.c
@@ -0,0 +1,447 @@
+/* This program is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License
+   and a copy of the GCC Runtime Library Exception along with this
+   program; see the files COPYING and COPYING.RUNTIME respectively.
+   If not, see <http://www.gnu.org/licenses/>. */
+
+/* This file, originally provided under the MIT license, is relicensed
+   here under the GPL3 for distribution with InterAspect.  The
+   original license is provided at the end of the file but does not
+   apply to any changes to this file or to any other part of the
+   InterAspect distribution. */
+
+/*
+ * Regular expression implementation.
+ * Supports only ( | ) * + ?.  No escapes.
+ * Compiles to NFA and then simulates NFA
+ * using Thompson's algorithm.
+ *
+ * See also http://swtch.com/~rsc/regexp/ and
+ * Thompson, Ken.  Regular Expression Search Algorithm,
+ * Communications of the ACM 11(6) (June 1968), pp. 419-422.
+ * 
+ * Copyright (c) 2007 Russ Cox.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/*
+ * Convert infix regexp re to postfix notation.
+ * Insert . as explicit concatenation operator.
+ * Cheesy parser, return static buffer.
+ */
+char*
+re2post(char *re)
+{
+       int nalt, natom;
+       static char buf[8000];
+       char *dst;
+       struct {
+               int nalt;
+               int natom;
+       } paren[100], *p;
+       
+       p = paren;
+       dst = buf;
+       nalt = 0;
+       natom = 0;
+       if(strlen(re) >= sizeof buf/2)
+               return NULL;
+       for(; *re; re++){
+               switch(*re){
+               case '(':
+                       if(natom > 1){
+                               --natom;
+                               *dst++ = '.';
+                       }
+                       if(p >= paren+100)
+                               return NULL;
+                       p->nalt = nalt;
+                       p->natom = natom;
+                       p++;
+                       nalt = 0;
+                       natom = 0;
+                       break;
+               case '|':
+                       if(natom == 0)
+                               return NULL;
+                       while(--natom > 0)
+                               *dst++ = '.';
+                       nalt++;
+                       break;
+               case ')':
+                       if(p == paren)
+                               return NULL;
+                       if(natom == 0)
+                               return NULL;
+                       while(--natom > 0)
+                               *dst++ = '.';
+                       for(; nalt > 0; nalt--)
+                               *dst++ = '|';
+                       --p;
+                       nalt = p->nalt;
+                       natom = p->natom;
+                       natom++;
+                       break;
+               case '*':
+               case '+':
+               case '?':
+                       if(natom == 0)
+                               return NULL;
+                       *dst++ = *re;
+                       break;
+               default:
+                       if(natom > 1){
+                               --natom;
+                               *dst++ = '.';
+                       }
+                       *dst++ = *re;
+                       natom++;
+                       break;
+               }
+       }
+       if(p != paren)
+               return NULL;
+       while(--natom > 0)
+               *dst++ = '.';
+       for(; nalt > 0; nalt--)
+               *dst++ = '|';
+       *dst = 0;
+       return buf;
+}
+
+/*
+ * Represents an NFA state plus zero or one or two arrows exiting.
+ * if c == Match, no arrows out; matching state.
+ * If c == Split, unlabeled arrows to out and out1 (if != NULL).
+ * If c < 256, labeled arrow with character c to out.
+ */
+enum
+{
+       Match = 256,
+       Split = 257
+};
+typedef struct State State;
+struct State
+{
+       int c;
+       State *out;
+       State *out1;
+       int lastlist;
+};
+State matchstate = { Match };  /* matching state */
+int nstate;
+
+/* Allocate and initialize State */
+State*
+state(int c, State *out, State *out1)
+{
+       State *s;
+       
+       nstate++;
+       s = malloc(sizeof *s);
+       s->lastlist = 0;
+       s->c = c;
+       s->out = out;
+       s->out1 = out1;
+       return s;
+}
+
+/*
+ * A partially built NFA without the matching state filled in.
+ * Frag.start points at the start state.
+ * Frag.out is a list of places that need to be set to the
+ * next state for this fragment.
+ */
+typedef struct Frag Frag;
+typedef union Ptrlist Ptrlist;
+struct Frag
+{
+       State *start;
+       Ptrlist *out;
+};
+
+/* Initialize Frag struct. */
+Frag
+frag(State *start, Ptrlist *out)
+{
+       Frag n = { start, out };
+       return n;
+}
+
+/*
+ * Since the out pointers in the list are always 
+ * uninitialized, we use the pointers themselves
+ * as storage for the Ptrlists.
+ */
+union Ptrlist
+{
+       Ptrlist *next;
+       State *s;
+};
+
+/* Create singleton list containing just outp. */
+Ptrlist*
+list1(State **outp)
+{
+       Ptrlist *l;
+       
+       l = (Ptrlist*)outp;
+       l->next = NULL;
+       return l;
+}
+
+/* Patch the list of states at out to point to start. */
+void
+patch(Ptrlist *l, State *s)
+{
+       Ptrlist *next;
+       
+       for(; l; l=next){
+               next = l->next;
+               l->s = s;
+       }
+}
+
+/* Join the two lists l1 and l2, returning the combination. */
+Ptrlist*
+append(Ptrlist *l1, Ptrlist *l2)
+{
+       Ptrlist *oldl1;
+       
+       oldl1 = l1;
+       while(l1->next)
+               l1 = l1->next;
+       l1->next = l2;
+       return oldl1;
+}
+
+/*
+ * Convert postfix regular expression to NFA.
+ * Return start state.
+ */
+State*
+post2nfa(char *postfix)
+{
+       char *p;
+       Frag stack[1000], *stackp, e1, e2, e;
+       State *s;
+       
+       // fprintf(stderr, "postfix: %s\n", postfix);
+
+       if(postfix == NULL)
+               return NULL;
+
+       #define push(s) *stackp++ = s
+       #define pop() *--stackp
+
+       stackp = stack;
+       for(p=postfix; *p; p++){
+               switch(*p){
+               default:
+                       s = state(*p, NULL, NULL);
+                       push(frag(s, list1(&s->out)));
+                       break;
+               case '.':       /* catenate */
+                       e2 = pop();
+                       e1 = pop();
+                       patch(e1.out, e2.start);
+                       push(frag(e1.start, e2.out));
+                       break;
+               case '|':       /* alternate */
+                       e2 = pop();
+                       e1 = pop();
+                       s = state(Split, e1.start, e2.start);
+                       push(frag(s, append(e1.out, e2.out)));
+                       break;
+               case '?':       /* zero or one */
+                       e = pop();
+                       s = state(Split, e.start, NULL);
+                       push(frag(s, append(e.out, list1(&s->out1))));
+                       break;
+               case '*':       /* zero or more */
+                       e = pop();
+                       s = state(Split, e.start, NULL);
+                       patch(e.out, s);
+                       push(frag(s, list1(&s->out1)));
+                       break;
+               case '+':       /* one or more */
+                       e = pop();
+                       s = state(Split, e.start, NULL);
+                       patch(e.out, s);
+                       push(frag(e.start, list1(&s->out1)));
+                       break;
+               }
+       }
+
+       e = pop();
+       if(stackp != stack)
+               return NULL;
+
+       patch(e.out, &matchstate);
+       return e.start;
+#undef pop
+#undef push
+}
+
+typedef struct List List;
+struct List
+{
+       State **s;
+       int n;
+};
+List l1, l2;
+static int listid;
+
+void addstate(List*, State*);
+void step(List*, int, List*);
+
+/* Compute initial state list */
+List*
+startlist(State *start, List *l)
+{
+       l->n = 0;
+       listid++;
+       addstate(l, start);
+       return l;
+}
+
+/* Check whether state list contains a match. */
+int
+ismatch(List *l)
+{
+       int i;
+
+       for(i=0; i<l->n; i++)
+               if(l->s[i] == &matchstate)
+                       return 1;
+       return 0;
+}
+
+/* Add s to l, following unlabeled arrows. */
+void
+addstate(List *l, State *s)
+{
+       if(s == NULL || s->lastlist == listid)
+               return;
+       s->lastlist = listid;
+       if(s->c == Split){
+               /* follow unlabeled arrows */
+               addstate(l, s->out);
+               addstate(l, s->out1);
+               return;
+       }
+       l->s[l->n++] = s;
+}
+
+/*
+ * Step the NFA from the states in clist
+ * past the character c,
+ * to create next NFA state set nlist.
+ */
+void
+step(List *clist, int c, List *nlist)
+{
+       int i;
+       State *s;
+
+       listid++;
+       nlist->n = 0;
+       for(i=0; i<clist->n; i++){
+               s = clist->s[i];
+               if(s->c == c)
+                       addstate(nlist, s->out);
+       }
+}
+
+/* Run NFA to determine whether it matches s. */
+int
+match(State *start, char *s)
+{
+       int i, c;
+       List *clist, *nlist, *t;
+
+       clist = startlist(start, &l1);
+       nlist = &l2;
+       for(; *s; s++){
+               c = *s & 0xFF;
+               step(clist, c, nlist);
+               t = clist; clist = nlist; nlist = t;    /* swap clist, nlist */
+       }
+       return ismatch(clist);
+}
+
+int
+main(int argc, char **argv)
+{
+       int i;
+       char *post;
+       State *start;
+
+       if(argc < 3){
+               fprintf(stderr, "usage: nfa regexp string...\n");
+               return 1;
+       }
+       
+       post = re2post(argv[1]);
+       if(post == NULL){
+               fprintf(stderr, "bad regexp %s\n", argv[1]);
+               return 1;
+       }
+
+       start = post2nfa(post);
+       if(start == NULL){
+               fprintf(stderr, "error in post2nfa %s\n", post);
+               return 1;
+       }
+       
+       l1.s = malloc(nstate*sizeof l1.s[0]);
+       l2.s = malloc(nstate*sizeof l2.s[0]);
+       for(i=2; i<argc; i++)
+               if(match(start, argv[i]))
+                       printf("%s\n", argv[i]);
+       return 0;
+}
+
+/* The original license for this file is provided here but does not
+   apply to any changes to the file or to any other part of the
+   InterAspect distribution. */
+
+/*
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated
+ * documentation files (the "Software"), to deal in the
+ * Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall
+ * be included in all copies or substantial portions of the
+ * Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+ * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+ * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
+ * PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS
+ * OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */