#ifndef SLC_H #define SLC_H #include // uint8_t, uint32_t.. #include // memcpy, strlen.. #include // NULL #include // isspace /* Define this if you want to debug the engine doing its thing */ /* #define SLC_DEBUG */ /* Session offset type - defaults to 64 bit because of union types enable storage for it often, but you can override */ #ifndef SLOFFS_T #define SLOFFS_T uint64_t #endif /* Maximum length of words - defaults but you can override; Adds +1 to this for null terminator */ #ifndef SL_MAX_WORD_NAME #define SL_MAX_WORD_NAME 255 #endif union word_body { SLOFFS_T offset; void *ptr; }; typedef union word_body word_body; /** Possible word types */ enum SLC_WORDTYP { /** Still in plain text */ SLC_WORDTYP_TEXT = 0, /** Native code, use get_word_storage_offset to get what to run (relative pointer or array of pointers) */ SLC_WORDTYP_NATIVE = 1, /** "Threaded code" (utf16-like word offsets) and encoded parentheses IN-PLACE inlined where text was before */ SLC_WORDTYP_THREADED_INLINE = 2, /** "Threaded code" that did not fit in-place and is thus stored in session storage, word offset tells where */ SLC_WORDTYP_THREADED_SESSION = 3 }; typedef enum SLC_WORDTYP SLC_WORDTYP; /** Gets padding bytes for a memory address to be padded to alignment */ static inline int get_padding(uint8_t *ptr, int alignment) { // return (alignment - (ptr % alignment)) % alignment; return (ptrdiff_t)(ptr + alignment - 1) / alignment * alignment; } /** Tells if c ends a line (that is either \n or \r) */ static inline char endsline(char c) { return (c == '\n') || (c == '\r'); } /** * A word definition starts right after this. After processing it, we inline overwrite random parts of it in memory... * * Examples: * * #: just_code * #dup * #inc * #swap * # * * : with_vars @a; @b; @c; * @a * inc * @a(.) * ; * * #builtin: to_prefix * #swap * #dup * * ^^This built-in generates threaded code / text while processing in a way so that the * to_prefix becomes prefixed with the "right" prefix and '#' exchanged with the current! * * ^^The above always needs starting '#builtin' at the definition and inside. That is exchanged to real prefix... * This is used for implementation implementing built-ins with FORTH-like code instead of native (saves native interpret. space) * * #: structural ( * #parse_num * #dup * #inc * #swap * #print * #print * ) [ #parse_num #print ] { #parse_num #print } * # */ struct word { /** Defines how to understand this word */ uint8_t flags; uint8_t reserved; // Padding to ensure alignment of vars (4byte) */ uint16_t var_count; // can be zero /* XXX: name is stored in symbol table only, not stored here */ uint32_t first_var; /* uint32_t ..vars[]; // The local variables memories for the word. Like "@a @b @c". Can be empty! */ /* Possibly overlapping with first_var (var_count == 0) possibly using that as padding... */ /* EITHER: */ /* word_body processed_body; // Where to find the body data (ptr or offset) if its not "inline_data[]" */ /* OR: */ /* uint8_t inline_data[]; // The "body" - either text src (contains "ender") or inline threaded code (zero term) */ }; typedef struct word word; /** Gets the wordtyp from a flags field - see word */ static inline SLC_WORDTYP word_type(uint8_t flags) { return (SLC_WORDTYP) (flags >> 6); } /** Gets the variable array of the given word */ static inline uint32_t *word_vars(word *w) { return &(w->first_var); } /** Gets the (inline or processed - same addr) "data" of the word. */ static inline uint8_t *word_inline_data(word *w) { uint32_t *vars = (uint32_t *)word_vars(w); uint16_t vcnt = w->var_count; // Might be overlapping "first_var" in case we had no vars uint8_t *after_wars = (uint8_t *)(vars + vcnt); int padding = get_padding(after_wars, 8); after_wars += padding; return after_wars; // no more wars } /** Gets the processed body of the word. */ static word_body word_processed_body(word *w) { word_body *body = (word_body *)word_inline_data(w); return *body; } enum SLC_SYM_OP { SLC_SYM_SET, SLC_SYM_GET, SLC_SYM_ERASE }; typedef enum SLC_SYM_OP SLC_SYM_OP; enum SLC_STACK_OP { SLC_STACK_PUSH, SLC_STACK_POP, SLC_STACK_AT, SLC_STACK_COUNT, SLC_STACK_ERASE }; typedef enum SLC_STACK_OP SLC_STACK_OP; enum SLC_SESSION_OP { SLC_SESSION_ALLOC, SLC_SESSION_ERASE, SLC_SESSION_PUSH, SLC_SESSION_GET, SLC_SESSION_SET, SLC_SESSION_GET32, SLC_SESSION_SET32, SLC_SESSION_PROCESS }; typedef enum SLC_SESSION_OP SLC_SESSION_OP; enum SLC_IO_OP { SLC_IO_OPEN, SLC_IO_OPEN_TMP, SLC_IO_REMOVE_TMPS, SLC_IO_CLOSE, SLC_IO_READ, SLC_IO_WRITE, SLC_IO_LOCK, SLC_IO_UNLOCK, SLC_IO_CMD, }; typedef enum SLC_IO_OP SLC_IO_OP; union symptr { uint32_t *varp; word *worp; }; typedef union symptr symptr; struct do_not_save_charptr { char *ptr; }; typedef struct do_not_save_charptr do_not_save_charptr; /** * Function-abstraction for a "symbol-table". * * Operations: * * SLC_SYM_SET Saves a mapping from key->word in symbol table. word==NULL removes mapping. Returns ptr back or NULL on errors. * SLC_SYM_GET Gets the symbol at key (the word parameter is unused). Returns NULL if there is no word for the key. * SLC_SYM_ERASE Erases the symbol table so it becomes empty again. Can never fail, returns NULL. * * Rem.: On GET we return a word* in case the key is to a word and a regular uint32_t* if its a variable name! * * @param op Defines which operation the caller wants. * @param key The key (both for SET and GET). This pointer can get easily invalidated so you might need a copy or you do Trie, etc. * @param ptr When adding a found word/variable to the symbol table, the key will point to this word* or uint32_t* * @returns The word/var definition stored for the key, or NULL when it is not stored yet or op is SET and there was an error. */ typedef symptr (*sym)(SLC_SYM_OP op, do_not_save_charptr key, symptr ptr); /** * Function-abstraction for an integer "stack". * * Operations: * * SLC_STACK_PUSH pushes the "elem" to the stack. Returns 1 if succeeded, otherwise 0. * SLC_STACK_POP pops the stack - does not return meaningful value, beware of underflowing! * SLC_STACK_AT returns the "param"th element down from the top of the stack * SLC_STACK_COUNT returns the number of elements in the stack * SLC_STACK_ERASE Makes the stack empty. Basically as if you would POP the COUNT times. * * @param op Defines which operation the caller wants. * @param param On SLC_STACK_PUSH, this is the element to push onto the stack, in case of SLC_STACK_AT, its the index. * @return The element at the given stack location in case of SLC_STACK_AT or the count in case of SLC_STACK_COUNT. Can show error! */ typedef uint32_t (*stack)(SLC_STACK_OP op, uint32_t param); /** * Function-abstraction for a "session-storage". * * Operations: * SLC_SESSION_ALLOC allocates parameter amount of memory and returns an accessor index. * SLC_SESSION_ERASE erase the session storage (all of it) - all parameters are unused * SLC_SESSION_PUSH adds the given byte (value in i) to the end of the session storage (by growing it) - j unused * SLC_SESSION_GET gets byte at the ith accessor index - j unused * SLC_SESSION_SET gets byte at the ith accessor index to be of (byte)j * SLC_SESSION_GET32 gets uint32_t at the ith accessor index - j unused. XXX: Beware, architectures unaligned access crash! * SLC_SESSION_SET32 gets uint32_t at the ith accessor index to be of j. XXX: Beware, architectures unaligned access crash! * SLC_SESSION_PROCESS gets the last j bytes and moves them overriding bytes at index i, then "shrinks" the storage by j. * * @param op Defines which operation the caller wants. * @param i Used on SESSION_GET and is the accessor index, in case of SESSIN_ALLOC it is the amount to allocate. * @param j Used on SESSION_SET as the byte value and on SESSION_PROCESS as the amount of bytes to "process" / shrink. * @returns The accessor index in case of ALLOC (0xFFFFFFFF == -1 means error), on get it returns the store BYTE as uint32_t */ typedef uint32_t (*session)(SLC_SESSION_OP op, uint32_t i, uint32_t j); union iores { /** Either the terminal cmd result or the handle pointer. NULL means some kind of error */ const char *ptr; /** The read character */ char c; }; typedef union iores iores; /** * Function-abstraction for io connectors. * * Operations: * * SLC_IO_OPEN Opens a PERSISTENT file with the given name. Returns the handle pointer (or NULL on error). * SLC_IO_CLOSE Closes a PERSISTENT file with the given handle. * SLC_IO_OPEN_TMP Opens a TEMPORARY file with the given name. Returns the handle pointer. * SLC_IO_CLOSE_TMP Removes the TEMPORARY file with the given handle. * SLC_IO_READ Reads a character from the given file handle. Returns '\0' on EOF and being out of data! * SLC_IO_WRITE Writes a character to the given file handle. The 'param' points to the character to write (1 byte) * SLC_IO_LOCK Locks the given file handle for exclusive reads and writes (others need to use lock/unlock too) * SLC_IO_UNLOCK Locks the given file handle for exclusive reads and writes (others need to use lock/unlock too) * SLC_IO_CMD Runs the given command on the operating system. The 'param' is the command (+args) and returned is std output. * * @param op Defines which operation the caller wants. * @param param The name or temporary name or command or the handle pointer parameter depending on op. * @returns A handle pointer or pointer to character to read / written or closed/unlocked handle (NULL on errors). Also cmd stdout. */ typedef iores (*ioconn)(SLC_IO_OP op, const char *param); /** * Function-abstraction for reading the source code char-by-char. */ typedef char (*coderead)(); /** States the main state-engine can pick up - use characters for debugging better */ enum slc_state : uint8_t { /** Before things */ SLC_START = 's', /** In a comment */ SLC_COMMENT = 'c', /** In multi-line comment */ SLC_MULTILINE_COMMENT = 'm', /** Name part of word-definition (after ':') - whitespace ends it */ SLC_DEF_NAME = 'd', /** Variable-listing part of word-definition - endline, '(', '[' or '{' ends it */ SLC_DEF_VAR = 'D', /** Raw body part of the word definition - these can contain local variable accesses + words, depth counted by vars */ SLC_DEF_BODY = 'b', /** Name part of a word "call" (non-definition). Ends by whitespace, '@' (in case of variable) or various parentheses */ SLC_WORD_NAME = 'w', /** Variable call (MYWORD@MYVAR) - we get to be here from SLC_WORD_NAME or from START */ SLC_WORD_VAR = 'W', /** Syntax error state - recovers by newlines */ SLC_SYN_ERROR = 'e', }; typedef enum slc_state slc_state; static inline slc_state slc_comment_statechange_in( slc_state current_state, char c, const char *singleline_comment, const char *multiline_comment_opener, int *comment_i, int *multiline_i){ char s = singleline_comment[*comment_i]; char m = multiline_comment_opener[*multiline_i]; /* Check if we have finished processing */ if(s == 0) { *comment_i = 0; return SLC_COMMENT; } if(m == 0) { *multiline_i = 0; return SLC_MULTILINE_COMMENT; } /* Single-line comment progress */ if(c == s) { ++(*comment_i); } else { *comment_i = 0; } /* Multi-line comment progress */ if(c == m) { ++(*multiline_i); } else { *multiline_i = 0; } return current_state; } static inline slc_state slc_multiline_comment_statechange_out( slc_state current_state, char c, const char *multiline_comment_closer, int *multiline_i){ char m = multiline_comment_closer[*multiline_i]; /* Check if we have finished processing */ if(m == 0) { *multiline_i = 0; return SLC_START; } /* Multi-line comment progress */ if(c == m) { ++(*multiline_i); } else { *multiline_i = 0; } return current_state; } /** Handles state change into word definitions */ static inline slc_state slc_def_name_statechange( slc_state current_state, char prevc, char c, const char *prefix, int *prefix_i) { /* If not a whitespace currently, check the prefix, otherwise check ending ':' */ if(!isspace(c)) { /* Early exit for not-a-definition sub-state */ if(*prefix_i < 0) { return current_state; } /* Read prefix */ if(prefix[*prefix_i] != 0) { if(prefix[*prefix_i] == c) { ++(*prefix_i); } else { *prefix_i = -1; } } else { if(c == ':') { *prefix_i = 0; /* XXX: restarts scan */ return SLC_WORD_NAME; } else { *prefix_i = -1; } } return current_state; } else { /* Not Found: Probably a word occurence */ *prefix_i = 0; /* XXX: restarts scan */ return current_state; } } /** Handles state change into word occurences - shared wordname with def_name_statechange! */ static inline slc_state slc_word_statechange( slc_state current_state, char c, int *wordname_i, const char *wordname) { // FIXME: Implement return current_state; } #define SET_SLC_START \ comment_i = 0; \ multiline_i = 0; \ prefix_i = 0; \ wordname_i = 0; \ wordname[0] = 0; \ state = SLC_START; /** * This function runs the main slc engine over a snippet of code. * * The code_src is the entry of what we start interpreting, but you can do (un)buffered reads in it * because we will use session_storage to store the source code data into memory while processing... * Rem.: This also helps with the "include" directives using io_connector and do some kind of recursion maybe. * Rem.: This architecture also let us try to immediately "threaded_code" optimize the newly added word definition, * which is possible if it relies on no forward references - this is the most happy case in my opinion! * * The session_storage is where we process the words data (possibly introducing threaded code as early as possible). * This should also let the code "allocate" some random memory too and get an offset for it (there will be no other way). * * The symbol_table not only store "words", but direct access offsets for: * * - words themselves (direct offset) * - variables of the words (direct offset) * * The code stack is what the interpreter uses for return addresses, the data stack however is FORTH-style usual stack. * The "insert_stack" collects things that we will read instead of reading the code_src AFTER a return from current word. * A \0 value should be there at the end of it until we RETURN from the word - from when we start processing. * * The io_connector is needed so that the engine have connection for temporary and real files and things simulating those. * * The prefix, ender and varprefix strings really just help when you use SLC to define a compiler and you need these. * * @param code_src The input source code to interpret / run. Code ends either with \0 or EOF. * @param session_storage Can allocate and use arbitrary memory with this. * @param symbol_table The symbol table to use while processing. * @param code_stack The code stack (return addresses) to use. * @param nesting_stack The stack used for the state-machine of the nested words. * @param data_stack The data stack (forth-like stack) to use. * @param insert_stack Used for temporarily expanding the input stream (one word level above current) with further words. * @param io_connector The engine uses this to open/close pipes/files and write/read them. * @param singleline_comment Like "//" - the character string that makes the rest of the line being comment. Can be "" (no NULL). * @param multiline_comment_opener The character string that starts a multiline comment. Like / and * for C. Can be "" (no NULL). * @param multiline_comment_closer The character string that ends a multiline comment. Like * and / for C. Can be "" (no NULL). * @param prefix The prefix added to the lookup of built-ins. Useful when you write a compiler with SLC. Defaults to "" (empty). * @param ender The character string that ends a word definition. Defaults to ";". * @param varprefix The character string that prefixes variable declarations. Defaults to "@". */ static inline void slc( coderead code_src, session session_storage, sym symbol_table, stack code_stack, stack nesting_stack, stack data_stack, stack insert_stack, ioconn io_connector, const char *singleline_comment, const char *multiline_comment_opener, const char *multiline_comment_closer, const char *prefix, const char *ender, const char *varprefix) { char last_is_endl = 0; int line = 0; int col = -1; char is_indenting = 1; int indent = 0; slc_state state; int comment_i; int multiline_i; int prefix_i; int wordname_i; char wordname[SL_MAX_WORD_NAME + 1]; SET_SLC_START char prevc = 0; char c = 0; while(((c = code_src()) != 0)) { /* Handle lines and columns, parts of indenting */ if(endsline(c)) { /* Handles \n, \r, \r\n and \n\r this way and counts empty lines properly */ /* De-Morgan (!a || b) == (a => b) so (last_is_endl => (prevc == c)) */ if(!last_is_endl || (prevc == c)) { ++line; col = 0; /* Indent part */ is_indenting = 1; indent = 0; } last_is_endl = 1; } else { last_is_endl = 0; ++col; } /* Handle indenting */ if((c == ' ') || (c == ' ')) { indent += is_indenting; } else { /* Defends against state-changer endline */ is_indenting = endsline(c); } process_char: #ifdef SLC_DEBUG fprintf(stderr, "%c state:%c @ line:%d col:%d indent:%d\n", c, state, line, col, indent); #endif switch(state) { case SLC_START: /* state -> comment | multiline_comment */ state = slc_comment_statechange_in( state, c, singleline_comment, multiline_comment_opener, &comment_i, &multiline_i); /* state -> def_name */ if(state == SLC_START) { state = slc_def_name_statechange( state, prevc, c, prefix, &prefix_i); } /* state -> word_name | word_var */ if(state == SLC_START) { state = slc_word_statechange( state, c, &wordname_i, wordname); } break; case SLC_COMMENT: if(endsline(c)) { SET_SLC_START } break; case SLC_MULTILINE_COMMENT: state = slc_multiline_comment_statechange_out( state, c, multiline_comment_closer, &multiline_i); if(state == SLC_START) { SET_SLC_START } break; case SLC_DEF_NAME: // TODO break; case SLC_DEF_VAR: break; case SLC_DEF_BODY: break; case SLC_WORD_NAME: break; case SLC_WORD_VAR: break; case SLC_SYN_ERROR: /* Recover from slc syntax errors at endlines for now */ if(endsline(c)) { SET_SLC_START } break; } prevc = c; } } #endif /* SLC_H */