#include <string.h>
#include <unistd.h>
+/* Return 1 when c is a space, a tab, or a newline, and 0 for every
+ * other character. */
+static int
+is_whitespace(char c)
+{
+	switch (c) {
+	case ' ':
+	case '\t':
+	case '\n':
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/* Return 1 when c is one of the six regexp operator characters
+ * (open paren, close paren, bar, star, plus, question mark) and 0
+ * otherwise. */
+static int
+is_operator(char c)
+{
+	switch (c) {
+	case '(':
+	case ')':
+	case '|':
+	case '*':
+	case '+':
+	case '?':
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/* Return 1 when c may appear in an identifier, i.e. when it is an
+ * underscore, a digit 0-9, or a letter a-z or A-Z (_0-9a-zA-Z);
+ * return 0 otherwise. */
+static int
+is_symbol_char(char c)
+{
+	if (c == '_')
+		return 1;
+	if ('0' <= c && c <= '9')
+		return 1;
+	if ('a' <= c && c <= 'z')
+		return 1;
+	if ('A' <= c && c <= 'Z')
+		return 1;
+
+	return 0;
+}
+
+/* Pull the next token off the front of the regexp *re and advance
+ * *re past it.  Leading spaces, tabs, and newlines are skipped first.
+ *
+ * Return value:
+ *   '\0'       nothing is left to parse (*len is set to 0)
+ *   operator   one of the characters accepted by is_operator; the
+ *              token is that single character (*len is set to 1)
+ *   '!'        error: the next character starts neither an operator
+ *              nor an identifier (*len is set to 0 and *re is left
+ *              pointing at the offending character)
+ *   otherwise  the first character of an identifier, which is a run
+ *              of characters accepted by is_symbol_char
+ *
+ * For operators and identifiers, *token points at the first character
+ * of the token inside the input string (it is not null-terminated)
+ * and *len holds its length.  *token is not written in the '\0' and
+ * '!' cases. */
+static char
+pop_token(const char** re, const char** token, int* len)
+{
+	const char *cur = *re;
+	const char *start;
+
+	/* Skip leading whitespace. */
+	while (*cur != '\0' && is_whitespace(*cur))
+		cur++;
+
+	if (*cur == '\0') {
+		*re = cur;
+		*len = 0;
+		return '\0';
+	}
+
+	if (is_operator(*cur)) {
+		*token = cur;
+		*len = 1;
+		*re = cur + 1;
+		return *cur;
+	}
+
+	if (!is_symbol_char(*cur)) {
+		*re = cur;
+		*len = 0;
+		return '!'; /* ! for error. */
+	}
+
+	/* The first character is known to be a symbol character, so
+	 * consume it and then everything that continues the run. */
+	start = cur;
+	do {
+		cur++;
+	} while (is_symbol_char(*cur));
+
+	*token = start;
+	*len = (int)(cur - start);
+	*re = cur;
+
+	return *start;
+}
+
+/* Find the index of token (which has length len and need not be
+ * null-terminated) in the list of names.
+ *
+ * A name matches only when it is exactly len characters long and
+ * those characters equal the token; a name that merely begins with
+ * the token does not match.  Returns the index of the first matching
+ * entry, or -1 when there is none (including when len is negative). */
+static int
+lookup_name(const char* token, int len, const char** names, int num_names)
+{
+	size_t token_len;
+	int i;
+
+	if (len < 0)
+		return -1;
+	token_len = (size_t)len;
+
+	for (i = 0; i < num_names; i++) {
+		/* Comparing only the first len characters would accept
+		 * any name that has the token as a prefix, so require the
+		 * lengths to agree before comparing the bytes. */
+		if (strlen(names[i]) == token_len
+		    && memcmp(token, names[i], token_len) == 0)
+			return i;
+	}
+
+	return -1;
+}
+
/*
* Convert infix regexp re to postfix notation.
* Insert . as explicit concatenation operator.
* Cheesy parser, return static buffer.
*/
-char*
-re2post(char *re)
+static char*
+re2post(const char* re, const char** names, int num_names)
{
int nalt, natom;
+ char c;
+ const char *token;
+ int len;
+ int sym;
static char buf[8000];
char *dst;
struct {
natom = 0;
if(strlen(re) >= sizeof buf/2)
return NULL;
- for(; *re; re++){
- switch(*re){
+ for(c = pop_token(&re, &token, &len); c; c = pop_token(&re, &token, &len)){
+ switch(c){
+ case '!':
+ return NULL;
case '(':
if(natom > 1){
--natom;
case '?':
if(natom == 0)
return NULL;
- *dst++ = *re;
+ *dst++ = c;
break;
default:
if(natom > 1){
--natom;
*dst++ = '.';
}
- *dst++ = *re;
+ sym = lookup_name(token, len, names, num_names);
+ if(sym < 0){
+ return NULL;
+ }
+ *dst++ = (char)(sym + 'A');
natom++;
break;
}
int c;
State *out;
State *out1;
- int lastlist;
};
-State matchstate = { Match }; /* matching state */
-int nstate;
+static State matchstate = { Match }; /* matching state */
+
+/* A compiled regular expression: the entry state of its state graph
+ * plus a count of its states. */
+typedef struct NFA
+{
+	State* start;	/* state where matching begins */
+	int num_states;	/* number of states, counting the shared
+			 * matching state once */
+} NFA;
/* Allocate and initialize State */
State*
{
State *s;
- nstate++;
s = malloc(sizeof *s);
- s->lastlist = 0;
s->c = c;
s->out = out;
s->out1 = out1;
};
/* Initialize Frag struct. */
-Frag
+static Frag
frag(State *start, Ptrlist *out)
{
Frag n = { start, out };
};
/* Create singleton list containing just outp. */
-Ptrlist*
+static Ptrlist*
list1(State **outp)
{
Ptrlist *l;
}
/* Patch the list of states at out to point to start. */
-void
+static void
patch(Ptrlist *l, State *s)
{
Ptrlist *next;
}
/* Join the two lists l1 and l2, returning the combination. */
-Ptrlist*
+static Ptrlist*
append(Ptrlist *l1, Ptrlist *l2)
{
Ptrlist *oldl1;
/*
* Convert postfix regular expression to NFA.
- * Return start state.
*/
-State*
+static NFA*
post2nfa(char *postfix)
{
char *p;
Frag stack[1000], *stackp, e1, e2, e;
State *s;
+ NFA *nfa;
// fprintf(stderr, "postfix: %s\n", postfix);
if(postfix == NULL)
return NULL;
+ nfa = malloc(sizeof(NFA));
+ if (nfa == NULL)
+ return NULL;
+ nfa->num_states = 1; /* The final (matching) state. */
+
#define push(s) *stackp++ = s
#define pop() *--stackp
switch(*p){
default:
s = state(*p, NULL, NULL);
+ nfa->num_states++;
push(frag(s, list1(&s->out)));
break;
case '.': /* catenate */
e2 = pop();
e1 = pop();
s = state(Split, e1.start, e2.start);
+ nfa->num_states++;
push(frag(s, append(e1.out, e2.out)));
break;
case '?': /* zero or one */
e = pop();
s = state(Split, e.start, NULL);
+ nfa->num_states++;
push(frag(s, append(e.out, list1(&s->out1))));
break;
case '*': /* zero or more */
e = pop();
s = state(Split, e.start, NULL);
+ nfa->num_states++;
patch(e.out, s);
push(frag(s, list1(&s->out1)));
break;
case '+': /* one or more */
e = pop();
s = state(Split, e.start, NULL);
+ nfa->num_states++;
patch(e.out, s);
push(frag(e.start, list1(&s->out1)));
break;
return NULL;
patch(e.out, &matchstate);
- return e.start;
+ nfa->start = e.start;
+ return nfa;
#undef pop
#undef push
}
State **s;
int n;
};
-List l1, l2;
-static int listid;
-void addstate(List*, State*);
-void step(List*, int, List*);
+/* The set of NFA states that are currently active while matching. */
+typedef struct NState
+{
+	struct List l;	/* the active states */
+	int num_states;	/* state count copied from the NFA; used to
+			 * size state arrays */
+} NState;
-/* Compute initial state list */
-List*
-startlist(State *start, List *l)
+/* Return 1 if the state s is one of the first l->n entries of the
+ * list l (entries are compared by pointer), and 0 otherwise. */
+static int
+inlist(List *l, State *s)
{
- l->n = 0;
- listid++;
- addstate(l, start);
- return l;
+ int i;
+ for (i = 0; i < l->n; i++)
+ if (l->s[i] == s)
+ return 1;
+
+ return 0;
}
/* Check whether state list contains a match. */
int
-ismatch(List *l)
+ismatch(NState *ns)
{
- int i;
-
- for(i=0; i<l->n; i++)
- if(l->s[i] == &matchstate)
- return 1;
- return 0;
+ /* Returns 1 when the shared matchstate is among the states in
+ * ns->l, and 0 otherwise. */
+ return inlist(&ns->l, &matchstate);
}
/* Add s to l, following unlabeled arrows. */
-void
+static void
addstate(List *l, State *s)
{
- if(s == NULL || s->lastlist == listid)
+ if(s == NULL || inlist(l, s))
return;
- s->lastlist = listid;
if(s->c == Split){
/* follow unlabeled arrows */
addstate(l, s->out);
* to create next NFA state set nlist.
*/
void
-step(List *clist, int c, List *nlist)
+step(NState *ns, int c)
{
 int i;
 State *s;
+ List *srclist;
+ List dstlist;
- listid++;
- nlist->n = 0;
- for(i=0; i<clist->n; i++){
- s = clist->s[i];
+ /* Advance the state set ns in place by one input event c. The
+ * next set is built in a scratch list sized for ns->num_states
+ * entries and then copied back over ns->l. */
+
+ /* Note that we encode input events as characters starting
+ * from A. (Yeah, it's kind of a hack.) */
+ c += 'A';
+
+ srclist = &ns->l;
+ dstlist.n = 0;
+ dstlist.s = alloca(ns->num_states * sizeof(State*));
+
+ /* Hack: The first state in the state list is _always_ the
+ * start state. That way, we match any input that has a
+ * matching suffix. The list is only consulted when it is
+ * non-empty, so an empty set never reads an unset slot. */
+ if (srclist->n > 0)
+  addstate(&dstlist, srclist->s[0]);
+
+ for(i=0; i<srclist->n; i++){
+ s = srclist->s[i];
 if(s->c == c)
- addstate(nlist, s->out);
+ addstate(&dstlist, s->out);
+ }
+
+ /* Copy dstlist back to srclist. Only the dstlist.n entries
+ * that were actually filled in are copied; the rest of the
+ * scratch array was never written. */
+ srclist->n = dstlist.n;
+ memcpy(srclist->s, dstlist.s, dstlist.n * sizeof(State*));
+}
+
+/* Allocate the initial state set for nfa: an NState whose list has
+ * room for nfa->num_states entries and is seeded from nfa->start.
+ *
+ * Returns NULL when nfa is NULL or when either allocation fails.
+ * A non-NULL result is released with freenstate. */
+NState*
+getstartstate(NFA *nfa)
+{
+	NState *start;
+	State **s;
+
+	if (nfa == NULL)
+		return NULL;
+
+	/* Size the allocation from the object itself: an NState is
+	 * larger than the List it embeds, so sizeof(List) would leave
+	 * num_states outside the allocated block. */
+	start = malloc(sizeof *start);
+	s = malloc(nfa->num_states * sizeof *s);
+	if (start == NULL || s == NULL){
+		free(start);
+		free(s);
+		return NULL;
 }
+	start->l.n = 0;
+	start->l.s = s;
+	start->num_states = nfa->num_states;
+	addstate(&start->l, nfa->start);
+	return start;
}
-/* Run NFA to determine whether it matches s. */
-int
-match(State *start, char *s)
+/* Free the state set ns: first its state array (ns->l.s), then the
+ * NState itself. Passing NULL does nothing. */
+void
+freenstate(NState *ns)
{
- int i, c;
- List *clist, *nlist, *t;
-
- clist = startlist(start, &l1);
- nlist = &l2;
- for(; *s; s++){
- c = *s & 0xFF;
- step(clist, c, nlist);
- t = clist; clist = nlist; nlist = t; /* swap clist, nlist */
+ if (ns != NULL){
+ free(ns->l.s);
+ free(ns);
 }
- return ismatch(clist);
}
-int
-main(int argc, char **argv)
+/* Compile the regexp re into an NFA. The expression is first
+ * converted to postfix form by re2post, which is given names and
+ * num_names to resolve identifiers, and the postfix string is then
+ * handed to post2nfa.
+ *
+ * Returns the NFA produced by post2nfa, or NULL when either re2post
+ * or post2nfa returns NULL. */
+NFA*
+compile_re(const char* re, const char** names, int num_names)
{
- int i;
 char *post;
- State *start;
+ NFA *nfa;
- if(argc < 3){
- fprintf(stderr, "usage: nfa regexp string...\n");
- return 1;
- }
-
- post = re2post(argv[1]);
- if(post == NULL){
- fprintf(stderr, "bad regexp %s\n", argv[1]);
- return 1;
+ post = re2post(re, names, num_names);
+ if (post == NULL){
+ return NULL;
 }
- start = post2nfa(post);
- if(start == NULL){
- fprintf(stderr, "error in post2nfa %s\n", post);
- return 1;
- }
-
- l1.s = malloc(nstate*sizeof l1.s[0]);
- l2.s = malloc(nstate*sizeof l2.s[0]);
- for(i=2; i<argc; i++)
- if(match(start, argv[i]))
- printf("%s\n", argv[i]);
- return 0;
+ nfa = post2nfa(post);
+ if (nfa == NULL)
+ return NULL;
+
+ return nfa;
}
/* The original license for this file is provided here but does not
--- /dev/null
+/* Copyright (c) 2010 Justin Seyster
+ Copyright (c) 2010 Erez Zadok
+ Copyright (c) 2010 Stony Brook University
+ Copyright (c) 2010 The Research Foundation of SUNY
+
+ This program is free software: you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License
+ and a copy of the GCC Runtime Library Exception along with this
+ program; see the files COPYING and COPYING.RUNTIME respectively.
+ If not, see <http://www.gnu.org/licenses/>. */
+
+/* The interface exposed to tracecut.c from our modified dfa1.c */
+
+/* Both types are only forward-declared here, so users of this header
+ * handle them through pointers. */
+struct NFA;
+struct NState;
+
+/* Returns 1 when the state set ns contains the matching state, and 0
+ * otherwise. */
+extern int ismatch (struct NState *ns);
+/* Advances the state set ns in place by one input event c. The
+ * implementation adds 'A' to c before comparing it with state
+ * labels. */
+extern void step (struct NState *ns, int c);
+/* Allocates the initial state set for nfa. Returns NULL when an
+ * allocation fails; release the result with freenstate. */
+extern struct NState *getstartstate(struct NFA *nfa);
+/* Frees a state set and its state array. NULL is accepted. */
+extern void freenstate (struct NState *ns);
+/* Compiles the regexp re, resolving identifiers against the
+ * num_names entries of names. Returns NULL when the expression
+ * cannot be converted. */
+extern struct NFA *compile_re(const char *re, const char **names,
+ int num_names);