aboutsummaryrefslogtreecommitdiff
path: root/circuitpython/lib/re1.5
diff options
context:
space:
mode:
authorRaghuram Subramani <raghus2247@gmail.com>2022-06-19 19:47:51 +0530
committerRaghuram Subramani <raghus2247@gmail.com>2022-06-19 19:47:51 +0530
commit4fd287655a72b9aea14cdac715ad5b90ed082ed2 (patch)
tree65d393bc0e699dd12d05b29ba568e04cea666207 /circuitpython/lib/re1.5
parent0150f70ce9c39e9e6dd878766c0620c85e47bed0 (diff)
add circuitpython code
Diffstat (limited to 'circuitpython/lib/re1.5')
-rw-r--r--circuitpython/lib/re1.5/charclass.c33
-rw-r--r--circuitpython/lib/re1.5/compilecode.c275
-rw-r--r--circuitpython/lib/re1.5/dumpcode.c65
-rw-r--r--circuitpython/lib/re1.5/re1.5.h156
-rw-r--r--circuitpython/lib/re1.5/recursiveloop.c87
5 files changed, 616 insertions, 0 deletions
diff --git a/circuitpython/lib/re1.5/charclass.c b/circuitpython/lib/re1.5/charclass.c
new file mode 100644
index 0000000..7f6388c
--- /dev/null
+++ b/circuitpython/lib/re1.5/charclass.c
@@ -0,0 +1,33 @@
+#include "re1.5.h"
+
+int _re1_5_classmatch(const char *pc, const char *sp)
+{
+ // pc points to "cnt" byte after opcode
+ int is_positive = (pc[-1] == Class);
+ int cnt = *pc++;
+ while (cnt--) {
+ if (*sp >= *pc && *sp <= pc[1]) return is_positive;
+ pc += 2;
+ }
+ return !is_positive;
+}
+
+int _re1_5_namedclassmatch(const char *pc, const char *sp)
+{
+ // pc points to name of class
+ int off = (*pc >> 5) & 1;
+ if ((*pc | 0x20) == 'd') {
+ if (!(*sp >= '0' && *sp <= '9')) {
+ off ^= 1;
+ }
+ } else if ((*pc | 0x20) == 's') {
+ if (!(*sp == ' ' || (*sp >= '\t' && *sp <= '\r'))) {
+ off ^= 1;
+ }
+ } else { // w
+ if (!((*sp >= 'A' && *sp <= 'Z') || (*sp >= 'a' && *sp <= 'z') || (*sp >= '0' && *sp <= '9') || *sp == '_')) {
+ off ^= 1;
+ }
+ }
+ return off;
+}
diff --git a/circuitpython/lib/re1.5/compilecode.c b/circuitpython/lib/re1.5/compilecode.c
new file mode 100644
index 0000000..936f6ed
--- /dev/null
+++ b/circuitpython/lib/re1.5/compilecode.c
@@ -0,0 +1,275 @@
+// Copyright 2014 Paul Sokolovsky.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "re1.5.h"
+
+#define INSERT_CODE(at, num, pc) \
+ ((code ? memmove(code + at + num, code + at, pc - at) : 0), pc += num)
+#define REL(at, to) (to - at - 2)
+#define EMIT(at, byte) (code ? (code[at] = byte) : (at))
+#define EMIT_CHECKED(at, byte) (_emit_checked(at, code, byte, &err))
+#define PC (prog->bytelen)
+
+static char unescape(char c) {
+ switch (c) {
+ case 'a':
+ return '\a';
+ case 'b':
+ return '\b';
+ case 'f':
+ return '\f';
+ case 'n':
+ return '\n';
+ case 'r':
+ return '\r';
+ case 't':
+ return '\t';
+ case 'v':
+ return '\v';
+ default:
+ return c;
+ }
+}
+
+
+static void _emit_checked(int at, char *code, int val, bool *err) {
+ *err |= val != (int8_t)val;
+ if (code) {
+ code[at] = val;
+ }
+}
+
+static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
+{
+ char *code = sizecode ? NULL : prog->insts;
+ bool err = false;
+ int start = PC;
+ int term = PC;
+ int alt_label = 0;
+
+ for (; *re && *re != ')'; re++) {
+ switch (*re) {
+ case '\\':
+ re++;
+ if (!*re) return NULL; // Trailing backslash
+ term = PC;
+ if ((*re | 0x20) == 'd' || (*re | 0x20) == 's' || (*re | 0x20) == 'w') {
+ EMIT(PC++, NamedClass);
+ EMIT(PC++, *re);
+ } else {
+ EMIT(PC++, Char);
+ EMIT(PC++, unescape(*re));
+ }
+ prog->len++;
+ break;
+ default:
+ term = PC;
+ EMIT(PC++, Char);
+ EMIT(PC++, *re);
+ prog->len++;
+ break;
+ case '.':
+ term = PC;
+ EMIT(PC++, Any);
+ prog->len++;
+ break;
+ case '[': {
+ int cnt;
+ term = PC;
+ re++;
+ if (*re == '^') {
+ EMIT(PC++, ClassNot);
+ re++;
+ } else {
+ EMIT(PC++, Class);
+ }
+ PC++; // Skip # of pair byte
+ prog->len++;
+ for (cnt = 0; *re != ']'; re++, cnt++) {
+ if (!*re) return NULL;
+ const char *b = re;
+ if (*re == '\\') {
+ re += 1;
+ if (!*re) return NULL; // Trailing backslash
+ EMIT(PC++, unescape(*re));
+ } else {
+ EMIT(PC++, *re);
+ }
+ if (re[1] == '-' && re[2] != ']') {
+ re += 2;
+ } else {
+ re = b;
+ }
+ if (*re == '\\') {
+ re += 1;
+ if (!*re) return NULL; // Trailing backslash
+ EMIT(PC++, unescape(*re));
+ } else {
+ EMIT(PC++, *re);
+ }
+ }
+ EMIT_CHECKED(term + 1, cnt);
+ break;
+ }
+ case '(': {
+ term = PC;
+ int sub = 0;
+ int capture = re[1] != '?' || re[2] != ':';
+
+ if (capture) {
+ sub = ++prog->sub;
+ EMIT(PC++, Save);
+ EMIT_CHECKED(PC++, 2 * sub);
+ prog->len++;
+ } else {
+ re += 2;
+ }
+
+ re = _compilecode(re + 1, prog, sizecode);
+ if (re == NULL || *re != ')') return NULL; // error, or no matching paren
+
+ if (capture) {
+ EMIT(PC++, Save);
+ EMIT_CHECKED(PC++, 2 * sub + 1);
+ prog->len++;
+ }
+
+ break;
+ }
+ case '?':
+ if (PC == term) return NULL; // nothing to repeat
+ INSERT_CODE(term, 2, PC);
+ if (re[1] == '?') {
+ EMIT(term, RSplit);
+ re++;
+ } else {
+ EMIT(term, Split);
+ }
+ EMIT_CHECKED(term + 1, REL(term, PC));
+ prog->len++;
+ term = PC;
+ break;
+ case '*':
+ if (PC == term) return NULL; // nothing to repeat
+ INSERT_CODE(term, 2, PC);
+ EMIT(PC, Jmp);
+ EMIT_CHECKED(PC + 1, REL(PC, term));
+ PC += 2;
+ if (re[1] == '?') {
+ EMIT(term, RSplit);
+ re++;
+ } else {
+ EMIT(term, Split);
+ }
+ EMIT_CHECKED(term + 1, REL(term, PC));
+ prog->len += 2;
+ term = PC;
+ break;
+ case '+':
+ if (PC == term) return NULL; // nothing to repeat
+ if (re[1] == '?') {
+ EMIT(PC, Split);
+ re++;
+ } else {
+ EMIT(PC, RSplit);
+ }
+ EMIT_CHECKED(PC + 1, REL(PC, term));
+ PC += 2;
+ prog->len++;
+ term = PC;
+ break;
+ case '|':
+ if (alt_label) {
+ EMIT_CHECKED(alt_label, REL(alt_label, PC) + 1);
+ }
+ INSERT_CODE(start, 2, PC);
+ EMIT(PC++, Jmp);
+ alt_label = PC++;
+ EMIT(start, Split);
+ EMIT_CHECKED(start + 1, REL(start, PC));
+ prog->len += 2;
+ term = PC;
+ break;
+ case '^':
+ EMIT(PC++, Bol);
+ prog->len++;
+ term = PC;
+ break;
+ case '$':
+ EMIT(PC++, Eol);
+ prog->len++;
+ term = PC;
+ break;
+ }
+ }
+
+ if (alt_label) {
+ EMIT_CHECKED(alt_label, REL(alt_label, PC) + 1);
+ }
+ return err ? NULL : re;
+}
+
+int re1_5_sizecode(const char *re)
+{
+ ByteProg dummyprog = {
+ // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
+ .bytelen = 5 + NON_ANCHORED_PREFIX
+ };
+
+ if (_compilecode(re, &dummyprog, /*sizecode*/1) == NULL) return -1;
+
+ return dummyprog.bytelen;
+}
+
+int re1_5_compilecode(ByteProg *prog, const char *re)
+{
+ prog->len = 0;
+ prog->bytelen = 0;
+ prog->sub = 0;
+
+ // Add code to implement non-anchored operation ("search"),
+ // for anchored operation ("match"), this code will be just skipped.
+ // TODO: Implement search in much more efficient manner
+ prog->insts[prog->bytelen++] = RSplit;
+ prog->insts[prog->bytelen++] = 3;
+ prog->insts[prog->bytelen++] = Any;
+ prog->insts[prog->bytelen++] = Jmp;
+ prog->insts[prog->bytelen++] = -5;
+ prog->len += 3;
+
+ prog->insts[prog->bytelen++] = Save;
+ prog->insts[prog->bytelen++] = 0;
+ prog->len++;
+
+ re = _compilecode(re, prog, /*sizecode*/0);
+ if (re == NULL || *re) return 1;
+
+ prog->insts[prog->bytelen++] = Save;
+ prog->insts[prog->bytelen++] = 1;
+ prog->len++;
+
+ prog->insts[prog->bytelen++] = Match;
+ prog->len++;
+
+ return 0;
+}
+
+#if defined(DEBUG_COMPILECODE)
+#include <assert.h>
+void re1_5_fatal(char *x) {
+ fprintf(stderr, "%s\n", x);
+ abort();
+}
+
+int main(int argc, char *argv[])
+{
+ char *re_str = argv[1];
+ int size = re1_5_sizecode(re_str);
+ ByteProg *code = malloc(sizeof(ByteProg) + size);
+ int ret = re1_5_compilecode(code, re_str);
+ if (ret == 0) {
+ re1_5_dumpcode(code);
+ }
+}
+#endif
diff --git a/circuitpython/lib/re1.5/dumpcode.c b/circuitpython/lib/re1.5/dumpcode.c
new file mode 100644
index 0000000..d7781d8
--- /dev/null
+++ b/circuitpython/lib/re1.5/dumpcode.c
@@ -0,0 +1,65 @@
+// Copyright 2014 Paul Sokolovsky.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "re1.5.h"
+
+void re1_5_dumpcode(ByteProg *prog)
+{
+ int pc = 0;
+ char *code = prog->insts;
+ while (pc < prog->bytelen) {
+ printf("%2d: ", pc);
+ switch(code[pc++]) {
+ default:
+ assert(0);
+// re1_5_fatal("printprog");
+ case Split:
+ printf("split %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]);
+ pc++;
+ break;
+ case RSplit:
+ printf("rsplit %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]);
+ pc++;
+ break;
+ case Jmp:
+ printf("jmp %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]);
+ pc++;
+ break;
+ case Char:
+ printf("char %c\n", code[pc++]);
+ break;
+ case Any:
+ printf("any\n");
+ break;
+ case Class:
+ case ClassNot: {
+ int num = code[pc];
+ printf("class%s %d", (code[pc - 1] == ClassNot ? "not" : ""), num);
+ pc++;
+ while (num--) {
+ printf(" 0x%02x-0x%02x", code[pc], code[pc + 1]);
+ pc += 2;
+ }
+ printf("\n");
+ break;
+ }
+ case NamedClass:
+ printf("namedclass %c\n", code[pc++]);
+ break;
+ case Match:
+ printf("match\n");
+ break;
+ case Save:
+ printf("save %d\n", (unsigned char)code[pc++]);
+ break;
+ case Bol:
+ printf("assert bol\n");
+ break;
+ case Eol:
+ printf("assert eol\n");
+ break;
+ }
+ }
+ printf("Bytes: %d, insts: %d\n", prog->bytelen, prog->len);
+}
diff --git a/circuitpython/lib/re1.5/re1.5.h b/circuitpython/lib/re1.5/re1.5.h
new file mode 100644
index 0000000..995e2d4
--- /dev/null
+++ b/circuitpython/lib/re1.5/re1.5.h
@@ -0,0 +1,156 @@
+// Copyright 2007-2009 Russ Cox. All Rights Reserved.
+// Copyright 2014 Paul Sokolovsky.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef _RE1_5_REGEXP__H
+#define _RE1_5_REGEXP__H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <assert.h>
+
+#define nil ((void*)0)
+#define nelem(x) (sizeof(x)/sizeof((x)[0]))
+
+typedef struct Regexp Regexp;
+typedef struct Prog Prog;
+typedef struct ByteProg ByteProg;
+typedef struct Inst Inst;
+typedef struct Subject Subject;
+
+struct Regexp
+{
+ int type;
+ int n;
+ int ch;
+ Regexp *left;
+ Regexp *right;
+};
+
+enum /* Regexp.type */
+{
+ Alt = 1,
+ Cat,
+ Lit,
+ Dot,
+ Paren,
+ Quest,
+ Star,
+ Plus,
+};
+
+Regexp *parse(char*);
+Regexp *reg(int type, Regexp *left, Regexp *right);
+void printre(Regexp*);
+#ifndef re1_5_fatal
+void re1_5_fatal(char*);
+#endif
+#ifndef re1_5_stack_chk
+#define re1_5_stack_chk()
+#endif
+void *mal(int);
+
+struct Prog
+{
+ Inst *start;
+ int len;
+};
+
+struct ByteProg
+{
+ int bytelen;
+ int len;
+ int sub;
+ char insts[0];
+};
+
+struct Inst
+{
+ int opcode;
+ int c;
+ int n;
+ Inst *x;
+ Inst *y;
+ int gen; // global state, oooh!
+};
+
+enum /* Inst.opcode */
+{
+ // Instructions which consume input bytes (and thus fail if none left)
+ CONSUMERS = 1,
+ Char = CONSUMERS,
+ Any,
+ Class,
+ ClassNot,
+ NamedClass,
+
+ ASSERTS = 0x50,
+ Bol = ASSERTS,
+ Eol,
+
+ // Instructions which take relative offset as arg
+ JUMPS = 0x60,
+ Jmp = JUMPS,
+ Split,
+ RSplit,
+
+ // Other (special) instructions
+ Save = 0x7e,
+ Match = 0x7f,
+};
+
+#define inst_is_consumer(inst) ((inst) < ASSERTS)
+#define inst_is_jump(inst) ((inst) & 0x70 == JUMPS)
+
+Prog *compile(Regexp*);
+void printprog(Prog*);
+
+extern int gen;
+
+enum {
+ MAXSUB = 20
+};
+
+typedef struct Sub Sub;
+
+struct Sub
+{
+ int ref;
+ int nsub;
+ const char *sub[MAXSUB];
+};
+
+Sub *newsub(int n);
+Sub *incref(Sub*);
+Sub *copy(Sub*);
+Sub *update(Sub*, int, const char*);
+void decref(Sub*);
+
+struct Subject {
+ const char *begin;
+ const char *end;
+};
+
+
+#define NON_ANCHORED_PREFIX 5
+#define HANDLE_ANCHORED(bytecode, is_anchored) ((is_anchored) ? (bytecode) + NON_ANCHORED_PREFIX : (bytecode))
+
+int re1_5_backtrack(ByteProg*, Subject*, const char**, int, int);
+int re1_5_pikevm(ByteProg*, Subject*, const char**, int, int);
+int re1_5_recursiveloopprog(ByteProg*, Subject*, const char**, int, int);
+int re1_5_recursiveprog(ByteProg*, Subject*, const char**, int, int);
+int re1_5_thompsonvm(ByteProg*, Subject*, const char**, int, int);
+
+int re1_5_sizecode(const char *re);
+int re1_5_compilecode(ByteProg *prog, const char *re);
+void re1_5_dumpcode(ByteProg *prog);
+void cleanmarks(ByteProg *prog);
+int _re1_5_classmatch(const char *pc, const char *sp);
+int _re1_5_namedclassmatch(const char *pc, const char *sp);
+
+#endif /*_RE1_5_REGEXP__H*/
diff --git a/circuitpython/lib/re1.5/recursiveloop.c b/circuitpython/lib/re1.5/recursiveloop.c
new file mode 100644
index 0000000..f8cb926
--- /dev/null
+++ b/circuitpython/lib/re1.5/recursiveloop.c
@@ -0,0 +1,87 @@
+// Copyright 2007-2009 Russ Cox. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "re1.5.h"
+
+static int
+recursiveloop(char *pc, const char *sp, Subject *input, const char **subp, int nsubp)
+{
+ const char *old;
+ int off;
+
+ re1_5_stack_chk();
+
+ for(;;) {
+ if(inst_is_consumer(*pc)) {
+ // If we need to match a character, but there's none left, it's fail
+ if(sp >= input->end)
+ return 0;
+ }
+ switch(*pc++) {
+ case Char:
+ if(*sp != *pc++)
+ return 0;
+ MP_FALLTHROUGH
+ case Any:
+ sp++;
+ continue;
+ case Class:
+ case ClassNot:
+ if (!_re1_5_classmatch(pc, sp))
+ return 0;
+ pc += *(unsigned char*)pc * 2 + 1;
+ sp++;
+ continue;
+ case NamedClass:
+ if (!_re1_5_namedclassmatch(pc, sp))
+ return 0;
+ pc++;
+ sp++;
+ continue;
+ case Match:
+ return 1;
+ case Jmp:
+ off = (signed char)*pc++;
+ pc = pc + off;
+ continue;
+ case Split:
+ off = (signed char)*pc++;
+ if(recursiveloop(pc, sp, input, subp, nsubp))
+ return 1;
+ pc = pc + off;
+ continue;
+ case RSplit:
+ off = (signed char)*pc++;
+ if(recursiveloop(pc + off, sp, input, subp, nsubp))
+ return 1;
+ continue;
+ case Save:
+ off = (unsigned char)*pc++;
+ if(off >= nsubp) {
+ continue;
+ }
+ old = subp[off];
+ subp[off] = sp;
+ if(recursiveloop(pc, sp, input, subp, nsubp))
+ return 1;
+ subp[off] = old;
+ return 0;
+ case Bol:
+ if(sp != input->begin)
+ return 0;
+ continue;
+ case Eol:
+ if(sp != input->end)
+ return 0;
+ continue;
+ }
+ re1_5_fatal("recursiveloop");
+ }
+}
+
+int
+re1_5_recursiveloopprog(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored)
+{
+ return recursiveloop(HANDLE_ANCHORED(prog->insts, is_anchored), input->begin, input, subp, nsubp);
+}