diff options
Diffstat (limited to 'circuitpython/lib/re1.5')
-rw-r--r-- | circuitpython/lib/re1.5/charclass.c | 33 | ||||
-rw-r--r-- | circuitpython/lib/re1.5/compilecode.c | 275 | ||||
-rw-r--r-- | circuitpython/lib/re1.5/dumpcode.c | 65 | ||||
-rw-r--r-- | circuitpython/lib/re1.5/re1.5.h | 156 | ||||
-rw-r--r-- | circuitpython/lib/re1.5/recursiveloop.c | 87 |
5 files changed, 616 insertions, 0 deletions
diff --git a/circuitpython/lib/re1.5/charclass.c b/circuitpython/lib/re1.5/charclass.c new file mode 100644 index 0000000..7f6388c --- /dev/null +++ b/circuitpython/lib/re1.5/charclass.c @@ -0,0 +1,33 @@ +#include "re1.5.h" + +int _re1_5_classmatch(const char *pc, const char *sp) +{ + // pc points to "cnt" byte after opcode + int is_positive = (pc[-1] == Class); + int cnt = *pc++; + while (cnt--) { + if (*sp >= *pc && *sp <= pc[1]) return is_positive; + pc += 2; + } + return !is_positive; +} + +int _re1_5_namedclassmatch(const char *pc, const char *sp) +{ + // pc points to name of class + int off = (*pc >> 5) & 1; + if ((*pc | 0x20) == 'd') { + if (!(*sp >= '0' && *sp <= '9')) { + off ^= 1; + } + } else if ((*pc | 0x20) == 's') { + if (!(*sp == ' ' || (*sp >= '\t' && *sp <= '\r'))) { + off ^= 1; + } + } else { // w + if (!((*sp >= 'A' && *sp <= 'Z') || (*sp >= 'a' && *sp <= 'z') || (*sp >= '0' && *sp <= '9') || *sp == '_')) { + off ^= 1; + } + } + return off; +} diff --git a/circuitpython/lib/re1.5/compilecode.c b/circuitpython/lib/re1.5/compilecode.c new file mode 100644 index 0000000..936f6ed --- /dev/null +++ b/circuitpython/lib/re1.5/compilecode.c @@ -0,0 +1,275 @@ +// Copyright 2014 Paul Sokolovsky. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "re1.5.h" + +#define INSERT_CODE(at, num, pc) \ + ((code ? memmove(code + at + num, code + at, pc - at) : 0), pc += num) +#define REL(at, to) (to - at - 2) +#define EMIT(at, byte) (code ? (code[at] = byte) : (at)) +#define EMIT_CHECKED(at, byte) (_emit_checked(at, code, byte, &err)) +#define PC (prog->bytelen) + +static char unescape(char c) { + switch (c) { + case 'a': + return '\a'; + case 'b': + return '\b'; + case 'f': + return '\f'; + case 'n': + return '\n'; + case 'r': + return '\r'; + case 't': + return '\t'; + case 'v': + return '\v'; + default: + return c; + } +} + + +static void _emit_checked(int at, char *code, int val, bool *err) { + *err |= val != (int8_t)val; + if (code) { + code[at] = val; + } +} + +static const char *_compilecode(const char *re, ByteProg *prog, int sizecode) +{ + char *code = sizecode ? NULL : prog->insts; + bool err = false; + int start = PC; + int term = PC; + int alt_label = 0; + + for (; *re && *re != ')'; re++) { + switch (*re) { + case '\\': + re++; + if (!*re) return NULL; // Trailing backslash + term = PC; + if ((*re | 0x20) == 'd' || (*re | 0x20) == 's' || (*re | 0x20) == 'w') { + EMIT(PC++, NamedClass); + EMIT(PC++, *re); + } else { + EMIT(PC++, Char); + EMIT(PC++, unescape(*re)); + } + prog->len++; + break; + default: + term = PC; + EMIT(PC++, Char); + EMIT(PC++, *re); + prog->len++; + break; + case '.': + term = PC; + EMIT(PC++, Any); + prog->len++; + break; + case '[': { + int cnt; + term = PC; + re++; + if (*re == '^') { + EMIT(PC++, ClassNot); + re++; + } else { + EMIT(PC++, Class); + } + PC++; // Skip # of pair byte + prog->len++; + for (cnt = 0; *re != ']'; re++, cnt++) { + if (!*re) return NULL; + const char *b = re; + if (*re == '\\') { + re += 1; + if (!*re) return NULL; // Trailing backslash + EMIT(PC++, unescape(*re)); + } else { + EMIT(PC++, *re); + } + if (re[1] == '-' && re[2] != ']') { + re += 2; + } else { + re = b; + } + if (*re == '\\') { + re += 1; + if (!*re) return NULL; // Trailing backslash + EMIT(PC++, unescape(*re)); + } else { + EMIT(PC++, *re); + } + } + EMIT_CHECKED(term + 1, cnt); + break; + } + case '(': { + term = PC; + int sub = 0; + int capture = re[1] != '?' || re[2] != ':'; + + if (capture) { + sub = ++prog->sub; + EMIT(PC++, Save); + EMIT_CHECKED(PC++, 2 * sub); + prog->len++; + } else { + re += 2; + } + + re = _compilecode(re + 1, prog, sizecode); + if (re == NULL || *re != ')') return NULL; // error, or no matching paren + + if (capture) { + EMIT(PC++, Save); + EMIT_CHECKED(PC++, 2 * sub + 1); + prog->len++; + } + + break; + } + case '?': + if (PC == term) return NULL; // nothing to repeat + INSERT_CODE(term, 2, PC); + if (re[1] == '?') { + EMIT(term, RSplit); + re++; + } else { + EMIT(term, Split); + } + EMIT_CHECKED(term + 1, REL(term, PC)); + prog->len++; + term = PC; + break; + case '*': + if (PC == term) return NULL; // nothing to repeat + INSERT_CODE(term, 2, PC); + EMIT(PC, Jmp); + EMIT_CHECKED(PC + 1, REL(PC, term)); + PC += 2; + if (re[1] == '?') { + EMIT(term, RSplit); + re++; + } else { + EMIT(term, Split); + } + EMIT_CHECKED(term + 1, REL(term, PC)); + prog->len += 2; + term = PC; + break; + case '+': + if (PC == term) return NULL; // nothing to repeat + if (re[1] == '?') { + EMIT(PC, Split); + re++; + } else { + EMIT(PC, RSplit); + } + EMIT_CHECKED(PC + 1, REL(PC, term)); + PC += 2; + prog->len++; + term = PC; + break; + case '|': + if (alt_label) { + EMIT_CHECKED(alt_label, REL(alt_label, PC) + 1); + } + INSERT_CODE(start, 2, PC); + EMIT(PC++, Jmp); + alt_label = PC++; + EMIT(start, Split); + EMIT_CHECKED(start + 1, REL(start, PC)); + prog->len += 2; + term = PC; + break; + case '^': + EMIT(PC++, Bol); + prog->len++; + term = PC; + break; + case '$': + EMIT(PC++, Eol); + prog->len++; + term = PC; + break; + } + } + + if (alt_label) { + EMIT_CHECKED(alt_label, REL(alt_label, PC) + 1); + } + return err ? NULL : re; +} + +int re1_5_sizecode(const char *re) +{ + ByteProg dummyprog = { + // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code + .bytelen = 5 + NON_ANCHORED_PREFIX + }; + + if (_compilecode(re, &dummyprog, /*sizecode*/1) == NULL) return -1; + + return dummyprog.bytelen; +} + +int re1_5_compilecode(ByteProg *prog, const char *re) +{ + prog->len = 0; + prog->bytelen = 0; + prog->sub = 0; + + // Add code to implement non-anchored operation ("search"), + // for anchored operation ("match"), this code will be just skipped. + // TODO: Implement search in much more efficient manner + prog->insts[prog->bytelen++] = RSplit; + prog->insts[prog->bytelen++] = 3; + prog->insts[prog->bytelen++] = Any; + prog->insts[prog->bytelen++] = Jmp; + prog->insts[prog->bytelen++] = -5; + prog->len += 3; + + prog->insts[prog->bytelen++] = Save; + prog->insts[prog->bytelen++] = 0; + prog->len++; + + re = _compilecode(re, prog, /*sizecode*/0); + if (re == NULL || *re) return 1; + + prog->insts[prog->bytelen++] = Save; + prog->insts[prog->bytelen++] = 1; + prog->len++; + + prog->insts[prog->bytelen++] = Match; + prog->len++; + + return 0; +} + +#if defined(DEBUG_COMPILECODE) +#include <assert.h> +void re1_5_fatal(char *x) { + fprintf(stderr, "%s\n", x); + abort(); +} + +int main(int argc, char *argv[]) +{ + char *re_str = argv[1]; + int size = re1_5_sizecode(re_str); + ByteProg *code = malloc(sizeof(ByteProg) + size); + int ret = re1_5_compilecode(code, re_str); + if (ret == 0) { + re1_5_dumpcode(code); + } +} +#endif diff --git a/circuitpython/lib/re1.5/dumpcode.c b/circuitpython/lib/re1.5/dumpcode.c new file mode 100644 index 0000000..d7781d8 --- /dev/null +++ b/circuitpython/lib/re1.5/dumpcode.c @@ -0,0 +1,65 @@ +// Copyright 2014 Paul Sokolovsky. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "re1.5.h" + +void re1_5_dumpcode(ByteProg *prog) +{ + int pc = 0; + char *code = prog->insts; + while (pc < prog->bytelen) { + printf("%2d: ", pc); + switch(code[pc++]) { + default: + assert(0); +// re1_5_fatal("printprog"); + case Split: + printf("split %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]); + pc++; + break; + case RSplit: + printf("rsplit %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]); + pc++; + break; + case Jmp: + printf("jmp %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]); + pc++; + break; + case Char: + printf("char %c\n", code[pc++]); + break; + case Any: + printf("any\n"); + break; + case Class: + case ClassNot: { + int num = code[pc]; + printf("class%s %d", (code[pc - 1] == ClassNot ? "not" : ""), num); + pc++; + while (num--) { + printf(" 0x%02x-0x%02x", code[pc], code[pc + 1]); + pc += 2; + } + printf("\n"); + break; + } + case NamedClass: + printf("namedclass %c\n", code[pc++]); + break; + case Match: + printf("match\n"); + break; + case Save: + printf("save %d\n", (unsigned char)code[pc++]); + break; + case Bol: + printf("assert bol\n"); + break; + case Eol: + printf("assert eol\n"); + break; + } + } + printf("Bytes: %d, insts: %d\n", prog->bytelen, prog->len); +} diff --git a/circuitpython/lib/re1.5/re1.5.h b/circuitpython/lib/re1.5/re1.5.h new file mode 100644 index 0000000..995e2d4 --- /dev/null +++ b/circuitpython/lib/re1.5/re1.5.h @@ -0,0 +1,156 @@ +// Copyright 2007-2009 Russ Cox. All Rights Reserved. +// Copyright 2014 Paul Sokolovsky. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef _RE1_5_REGEXP__H +#define _RE1_5_REGEXP__H + +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdarg.h> +#include <assert.h> + +#define nil ((void*)0) +#define nelem(x) (sizeof(x)/sizeof((x)[0])) + +typedef struct Regexp Regexp; +typedef struct Prog Prog; +typedef struct ByteProg ByteProg; +typedef struct Inst Inst; +typedef struct Subject Subject; + +struct Regexp +{ + int type; + int n; + int ch; + Regexp *left; + Regexp *right; +}; + +enum /* Regexp.type */ +{ + Alt = 1, + Cat, + Lit, + Dot, + Paren, + Quest, + Star, + Plus, +}; + +Regexp *parse(char*); +Regexp *reg(int type, Regexp *left, Regexp *right); +void printre(Regexp*); +#ifndef re1_5_fatal +void re1_5_fatal(char*); +#endif +#ifndef re1_5_stack_chk +#define re1_5_stack_chk() +#endif +void *mal(int); + +struct Prog +{ + Inst *start; + int len; +}; + +struct ByteProg +{ + int bytelen; + int len; + int sub; + char insts[0]; +}; + +struct Inst +{ + int opcode; + int c; + int n; + Inst *x; + Inst *y; + int gen; // global state, oooh! +}; + +enum /* Inst.opcode */ +{ + // Instructions which consume input bytes (and thus fail if none left) + CONSUMERS = 1, + Char = CONSUMERS, + Any, + Class, + ClassNot, + NamedClass, + + ASSERTS = 0x50, + Bol = ASSERTS, + Eol, + + // Instructions which take relative offset as arg + JUMPS = 0x60, + Jmp = JUMPS, + Split, + RSplit, + + // Other (special) instructions + Save = 0x7e, + Match = 0x7f, +}; + +#define inst_is_consumer(inst) ((inst) < ASSERTS) +#define inst_is_jump(inst) ((inst) & 0x70 == JUMPS) + +Prog *compile(Regexp*); +void printprog(Prog*); + +extern int gen; + +enum { + MAXSUB = 20 +}; + +typedef struct Sub Sub; + +struct Sub +{ + int ref; + int nsub; + const char *sub[MAXSUB]; +}; + +Sub *newsub(int n); +Sub *incref(Sub*); +Sub *copy(Sub*); +Sub *update(Sub*, int, const char*); +void decref(Sub*); + +struct Subject { + const char *begin; + const char *end; +}; + + +#define NON_ANCHORED_PREFIX 5 +#define HANDLE_ANCHORED(bytecode, is_anchored) ((is_anchored) ? (bytecode) + NON_ANCHORED_PREFIX : (bytecode)) + +int re1_5_backtrack(ByteProg*, Subject*, const char**, int, int); +int re1_5_pikevm(ByteProg*, Subject*, const char**, int, int); +int re1_5_recursiveloopprog(ByteProg*, Subject*, const char**, int, int); +int re1_5_recursiveprog(ByteProg*, Subject*, const char**, int, int); +int re1_5_thompsonvm(ByteProg*, Subject*, const char**, int, int); + +int re1_5_sizecode(const char *re); +int re1_5_compilecode(ByteProg *prog, const char *re); +void re1_5_dumpcode(ByteProg *prog); +void cleanmarks(ByteProg *prog); +int _re1_5_classmatch(const char *pc, const char *sp); +int _re1_5_namedclassmatch(const char *pc, const char *sp); + +#endif /*_RE1_5_REGEXP__H*/ diff --git a/circuitpython/lib/re1.5/recursiveloop.c b/circuitpython/lib/re1.5/recursiveloop.c new file mode 100644 index 0000000..f8cb926 --- /dev/null +++ b/circuitpython/lib/re1.5/recursiveloop.c @@ -0,0 +1,87 @@ +// Copyright 2007-2009 Russ Cox. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "re1.5.h" + +static int +recursiveloop(char *pc, const char *sp, Subject *input, const char **subp, int nsubp) +{ + const char *old; + int off; + + re1_5_stack_chk(); + + for(;;) { + if(inst_is_consumer(*pc)) { + // If we need to match a character, but there's none left, it's fail + if(sp >= input->end) + return 0; + } + switch(*pc++) { + case Char: + if(*sp != *pc++) + return 0; + MP_FALLTHROUGH + case Any: + sp++; + continue; + case Class: + case ClassNot: + if (!_re1_5_classmatch(pc, sp)) + return 0; + pc += *(unsigned char*)pc * 2 + 1; + sp++; + continue; + case NamedClass: + if (!_re1_5_namedclassmatch(pc, sp)) + return 0; + pc++; + sp++; + continue; + case Match: + return 1; + case Jmp: + off = (signed char)*pc++; + pc = pc + off; + continue; + case Split: + off = (signed char)*pc++; + if(recursiveloop(pc, sp, input, subp, nsubp)) + return 1; + pc = pc + off; + continue; + case RSplit: + off = (signed char)*pc++; + if(recursiveloop(pc + off, sp, input, subp, nsubp)) + return 1; + continue; + case Save: + off = (unsigned char)*pc++; + if(off >= nsubp) { + continue; + } + old = subp[off]; + subp[off] = sp; + if(recursiveloop(pc, sp, input, subp, nsubp)) + return 1; + subp[off] = old; + return 0; + case Bol: + if(sp != input->begin) + return 0; + continue; + case Eol: + if(sp != input->end) + return 0; + continue; + } + re1_5_fatal("recursiveloop"); + } +} + +int +re1_5_recursiveloopprog(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored) +{ + return recursiveloop(HANDLE_ANCHORED(prog->insts, is_anchored), input->begin, input, subp, nsubp); +} |