ed81751999-12-11Martin Stjernholm //! RXML parser and compiler framework. //! //! Created 1999-07-30 by Martin Stjernholm. //!
e213f01999-12-13Martin Stjernholm //! $Id: module.pmod,v 1.2 1999/12/13 22:12:40 mast Exp $
ed81751999-12-11Martin Stjernholm  //! Kludge: Must use "RXML.refs" somewhere for the whole module to be //! loaded correctly. //! WARNING: This API is not yet set in stone; expect incompatible //! changes. #pragma strict_types #if !constant (RequestID) class RequestID {} #endif class Tag //! Interface class for the static information about a tag. { //! Interface. //!string name; //! The name of the tag. Required and considered constant. int flags; //! Various bit flags that affect parsing; see the FLAG_* constants. //! RXML.Frame.flags is initialized from this. mapping(string:Type) req_arg_types; mapping(string:Type) opt_arg_types; //! Define to declare the names and types of the required and //! optional arguments. If a type specifies a parser, it'll be used //! on the argument value. Note that the order in which arguments //! are parsed is arbitrary. Type content_type = t_text (PHtml); //! The handled type of the content, if the tag is used as a //! container. It's taken from the actual result type if set to //! zero. //! //! This default says it's text, but the HTML parser is used to read //! it, which means that the content is preparsed with HTML syntax. //! Use t_text directly with no parser to get the raw text. array(Type) result_types = ({t_text}); //! The possible types of the result, in order of precedence. string scope_name; //! RXML.Frame.scope_name is initialized from this. TagSet additional_tags, local_tags; //! RXML.Frame.additional_tags and RXML.Frame.local_tags are //! initialized from these. //!function(:object)|program frame; //! This function should return an object to be used as a frame. The //! frame object must (in practice) inherit RXML.Frame. //! Services. inline Frame `() (mapping(string:mixed) args, void|mixed|PCode content) //! Make an initialized frame for the tag. Typically useful when //! returning generated tags from e.g. RXML.Frame.do_return(). The //! argument values and the content are not parsed; see //! RXML.Frame.do_return() for details. Note: Never reuse the same //! frame object. { Tag this = [object(Tag)]/*HMM*/ this_object(); Frame frame = [object(Frame)] this->frame(); frame->tag = this; frame->flags = flags; if (scope_name) frame->scope_name = scope_name; if (additional_tags) frame->additional_tags = additional_tags; if (local_tags) frame->local_tags = local_tags; frame->args = args; if (!zero_type (content)) frame->content = content; return frame; } // Internals. array handle_tag (TagSetParser parser, mapping(string:string) args, void|string content) // Callback for tag set parsers. Returns a sequence of result values // to be added to the result queue. { // FIXME: P-code generation. Frame frame = `() (args, Void); frame->_eval (parser, args, content); return frame->result == Void ? ({}) : ({frame->result}); } string _sprintf() { return "Tag(" + [string] this_object()->name + ")"; } } class TagSet //! Contains a set of tags. Tag sets can import other tag sets, and //! later changes are propagated. Parser instances (contexts) to parse //! data are also created from this. TagSet objects may somewhat //! safely be destructed explicitly; the tags in a destructed tag set //! will not be active in parsers that are instantiated later, but //! will work in current instances. { string prefix; //! A prefix that may precede the tags. If zero, it's up to the //! importing tag set(s). int prefix_required; //! The prefix must precede the tags. array(TagSet) imported = ({}); //! Other tag sets that will be used. The precedence is local tags //! first, then imported from left to right. It's not safe to //! destructively change entries in this array. int generation = 1; //! A number that is increased every time something changes in this //! object or in some tag set it imports. mapping(string:mixed) low_tags, low_containers, low_entities; //! Passed directly to Parser.HTML. Note: Changes in these aren't //! tracked; changed() must be called. void create (void|array(Tag) _tags) //! { if (_tags) tags = mkmapping ([array(string)] _tags->name, _tags); } void add_tag (Tag tag) //! { tags[tag->name] = tag; changed(); } void add_tags (array(Tag) _tags) //! { tags += mkmapping (/*[array(string)]HMM*/ _tags->name, _tags); changed(); } void remove_tag (string|Tag tag) //! { if (stringp (tag)) m_delete (tags, tag); else for (string n; !zero_type (n = search (tags, tag));) m_delete (tags, n); changed(); } Tag get_tag (string name) //! { Tag tag; if ((tag = tags[name])) return tag; foreach (imported, TagSet tag_set) if ((tag = [object(Tag)]/*HMM*/ tag_set->get_tag (name))) return tag; return 0; } Tag get_local_tag (string name) //! { return tags[name]; } array(Tag) get_local_tags() //! { return values (tags); } mixed `->= (string var, mixed val) { switch (var) { case "imported": (imported - ({0}))->dont_notify (changed); imported = [array(TagSet)] val; imported->do_notify (changed); break; default: ::`->= (var, val); } changed(); return val; } mixed `[]= (string var, mixed val) {return `->= (var, val);} Parser `() (Type top_level_type, void|RequestID id) //! Creates a new context for parsing content of the specified type, //! and returns the parser object for it. id is put into the //! context. { return Context (this_object(), id)->new_parser (top_level_type); } void changed() //! Should be called whenever something is changed. Done //! automatically most of the time, however. { generation++; (notify_funcs -= ({0}))(); set_weak_flag (notify_funcs, 1); } // Internals. void do_notify (function(:void) func) { notify_funcs |= ({func}); set_weak_flag (notify_funcs, 1); } void dont_notify (function(:void) func) { notify_funcs -= ({func}); set_weak_flag (notify_funcs, 1); } void destroy() { catch (changed()); } private mapping(string:Tag) tags = ([]); // Private since we want to track changes in this. private array(function(:void)) notify_funcs = ({}); // Weak (when nonempty). } TagSet empty_tag_set; //! The empty tag set. class Context //! A parser context. This contains the current variable bindings and //! so on. The current context can always be retrieved with //! get_context(). //! //! Note: Don't keep pointers to this object since that will likely //! introduce circular references. It can be retrieved easily through //! get_context() or parser->context. { Frame frame; //! The currently evaluating frame. RequestID id; //! int type_check; //! Whether to do type checking. TagSet tag_set; //! The current tag set that will be inherited by subparsers. int tag_set_is_local; //! Nonzero if tag_set is a copy local to this context. A local tag //! set that imports the old tag_set might be created whenever //! needed. mixed get_var (string var, void|string scope_name) //! Returns the value a variable in the specified scope, or the //! current scope if none is given. Returns zero with zero_type 1 if //! there's no such variable. { if (mapping(string:mixed) vars = scopes[scope_name || ""]) { mixed val; if (zero_type (val = vars[var])) return ([])[0]; else if (objectp (val) && val->eval) return val->eval (this_object(), var, scope_name); else return val; } else if (scope_name) error ("Unknown scope %O.\n", scope_name); else error ("No current scope.\n"); } mixed set_var (string var, mixed val, void|string scope_name) //! Sets the value of a variable in the specified scope, or the //! current scope if none is given. Returns val. { if (mapping(string:mixed) vars = scopes[scope_name || ""]) return vars[var] = val; else if (scope_name) error ("Unknown scope %O.\n", scope_name); else error ("No current scope.\n"); } void delete_var (string var, void|string scope_name) //! Removes a variable in the specified scope, or the current scope //! if none is given. { if (mapping(string:mixed) vars = scopes[scope_name || ""]) m_delete (vars, var); else if (scope_name) error ("Unknown scope %O.\n", scope_name); else error ("No current scope.\n"); } array(string) list_var (void|string scope_name) //! Returns the names of all variables in the specified scope, or //! the current scope if none is given. { if (mapping(string:mixed) vars = scopes[scope_name || ""]) return indices (vars); else if (scope_name) error ("Unknown scope %O.\n", scope_name); else error ("No current scope.\n"); } void add_runtime_tag (Tag tag) //! Adds a tag that will exist from this point forward in the //! current context only. { if (tag_set_is_local) make_tag_set_local(); tag_set->add_tag (tag); } void remove_runtime_tag (string|Tag tag) //! Removes a tag added by add_runtime_tag(). { if (tag_set_is_local) make_tag_set_local(); tag_set->remove_tag (tag); } array(string) list_scopes() //! Returns the names of all defined scopes. { return indices (scopes) - ({""}); } void add_scope (string scope_name, mapping(string:mixed) vars) //! Adds or replaces the specified scope at the global level. { if (scopes[scope_name]) if (scope_name == "") { mapping(string:mixed) inner = scopes[""]; while (mapping(string:mixed) outer = hidden[inner]) inner = outer; hidden[inner] = vars; } else { Frame outermost; for (Frame f = frame; f; f = f->up) if (f->scope_name == scope_name) outermost = f; if (outermost) hidden[outermost] = vars; else scopes[scope_name] = vars; } else scopes[scope_name] = vars; } void remove_scope (string scope_name) //! Removes the named scope from the global level, if it exists. { #ifdef MODULE_DEBUG if (scope_name == "") error ("Cannot remove current scope.\n"); #endif Frame outermost; for (Frame f = frame; f; f = f->up) if (f->scope_name == scope_name) outermost = f; if (outermost) m_delete (hidden, outermost); else m_delete (scopes, scope_name); } string current_scope() //! Returns the name of the current scope, if it has any. { if (mapping(string:mixed) vars = scopes[""]) { string scope_name; while (scope_name = search (scopes, vars, scope_name)) if (scope_name != "") return scope_name; } return 0; } void error (string msg, mixed... args) //! Throws an error with a dump of the parser stack. { if (sizeof (args)) msg = sprintf (msg, @args); msg = "RXML parser error: " + msg; for (Frame f = frame; f; f = f->up) { if (f->tag) msg += "<" + f->tag->name; else if (!f->up) break; else msg += "<(unknown tag)"; if (f->args) foreach (sort (indices (f->args)), string arg) { mixed val = f->args[arg]; msg += " " + arg + "="; if (arrayp (val)) msg += map (val, error_print_val) * ","; else msg += error_print_val (val); } else msg += " (no argmap)"; msg += ">\n"; } array b = backtrace(); throw (({msg, b[..sizeof (b) - 2]})); } // Internals. private string error_print_val (mixed val) { if (arrayp (val)) return "array"; else if (mappingp (val)) return "mapping"; else if (multisetp (val)) return "multiset"; else return sprintf ("%O", val); } mapping(string:mapping(string:mixed)) scopes = ([]); // The variable mappings for every currently visible scope. A // special entry "" points to the current local scope. mapping(mapping(string:mixed)|Frame:mapping(string:mixed)) hidden = ([]); // The currently hidden variable mappings in scopes. The old "" // entries are indexed by the replacing variable mapping. The old // named scope entries are indexed by the frame object which // replaced them. void enter_scope (Frame frame) { mapping(string:mixed) vars; #ifdef DEBUG if (!frame->vars) error ("Internal error: Frame has no variables.\n"); #endif if ((vars = [mapping(string:mixed)] frame->vars) != scopes[""]) { hidden[vars] = scopes[""]; scopes[""] = vars; if (string scope_name = [string] frame->scope_name) { hidden[frame] = scopes[scope_name]; scopes[scope_name] = vars; } } } void leave_scope (Frame frame) { if (string scope_name = [string] frame->scope_name) if (hidden[frame]) { scopes[scope_name] = hidden[frame]; m_delete (hidden, frame); } mapping(string:mixed) vars; if (hidden[vars = [mapping(string:mixed)] frame->vars]) { scopes[""] = hidden[vars]; m_delete (hidden, vars); } } #define ENTER_SCOPE(ctx, frame) (frame->vars && ctx->enter_scope (frame)) #define LEAVE_SCOPE(ctx, frame) (frame->vars && ctx->leave_scope (frame)) void make_tag_set_local() { if (!tag_set_is_local) { TagSet new_tag_set = TagSet(); // FIXME: Cache this? new_tag_set->imported = ({tag_set}); tag_set = new_tag_set; tag_set_is_local = 1; } } Parser new_parser (Type top_level_type) // Returns a new parser object to start parsing with this context. // Normally TagSet.`() should be used instead of this. { #ifdef MODULE_DEBUG if (in_use || frame) error ("Context already in use.\n"); #endif return top_level_type->get_parser (this_object()); } void create (TagSet _tag_set, void|RequestID _id) // Normally TagSet.`() should be used instead of this. { tag_set = _tag_set; id = _id; } mapping(string:mixed)|mapping(Frame:array) unwind_state; // If this is a mapping, we have an unwound stack state. It contains // strings with arbitrary exception info, and frames with arrays // containing the extra state info needed by Frame._eval(). The // first entry in these arrays are always the subframe. The special // entries are: // // "top": ({Frame (top frame), Parser (parser for frame)}) // "up_frames": int (Go up these many frames and continue.) // "stream_piece": mixed (When continuing, do a streaming // do_return() with this stream piece.) // "exec_left": array (Exec array left to evaluate. Only used // between Frame._exec_array() and Frame._eval().) #ifdef MODULE_DEBUG int in_use; #endif } //! Current context. //! It's set before any function in RXML.Tag or RXML.Frame is called. #if constant (thread_create) private object _context = thread_local(); inline void set_context (Context ctx) {_context->set (ctx);} inline Context get_context() {return [object(Context)] _context->get();} #else private Context _context; inline void set_context (Context ctx) {_context = ctx;} inline Context get_context() {return _context;} #endif #ifdef MODULE_DEBUG // Got races in this debug check, but looks like we have to live with that. :\ #define ENTER_CONTEXT(ctx) \ Context __old_ctx = get_context(); \ set_context (ctx); \ if (ctx) { \ if (ctx->in_use && __old_ctx != ctx) \ parse_error ("Attempt to use context asynchronously.\n"); \ ctx->in_use = 1; \ } #define LEAVE_CONTEXT() \ if (Context ctx = get_context()) \ if (__old_ctx != ctx) ctx->in_use = 0; \ set_context (__old_ctx); #else #define ENTER_CONTEXT(ctx) \ Context __old_ctx = get_context(); \ set_context (ctx); #define LEAVE_CONTEXT() \ set_context (__old_ctx); #endif void parse_error (string msg, mixed... args) //! Tries to throw an error with error() in the current context to //! include the frame stack. { Context ctx = get_context(); if (ctx && ctx->error) ctx->error (msg, @args); else { if (sizeof (args)) msg = sprintf (msg, @args); msg = "RXML parser error (no context): " + msg; array b = backtrace(); throw (({msg, b[..sizeof (b) - 2]})); } } //! Constants for the bit field RXML.Frame.flags. //! Static flags (i.e. tested in the Tag object). constant FLAG_CONTAINER = 0x00000001; //! If set, the tag accepts non-empty content. E.g. with the standard //! HTML parser this defines whether the tag is a container or not. //! The rest of the flags are dynamic (i.e. tested in the Frame object). constant FLAG_PARENT_SCOPE = 0x00000100; //! If set, the array from do_return() and cached_return() will be //! interpreted in the scope of the parent tag, rather than in the //! current one. constant FLAG_NO_IMPLICIT_ARGS = 0x00000200; //! If set, the parser won't apply any implicit arguments. FIXME: Not //! yet implemented. constant FLAG_STREAM_RESULT = 0x00000400; //! If set, the do_return() function will be called repeatedly until //! it returns 0 or no more content is wanted. constant FLAG_STREAM_CONTENT = 0x00000800; //! If set, the tag supports getting its content in streaming mode: //! do_return() will be called repeatedly with successive parts of the //! content then. Can't be changed from do_return(). //! Note: It might be obvious, but using streaming is significantly //! less effective than nonstreaming, so it should only be done when //! big delays are expected. constant FLAG_STREAM = FLAG_STREAM_RESULT | FLAG_STREAM_CONTENT; //! The following flags specifies whether certain conditions must be //! met for a cached frame to be considered (if RXML.Frame.is_valid() //! is defined). They may be read directly after do_return() returns. //! The tag name is always the same. FIXME: These are ideas only; not //! yet implemented. constant FLAG_CACHE_DIFF_ARGS = 0x00010000; //! If set, the arguments to the tag need not be the same (using //! equal()) as the cached args. constant FLAG_CACHE_DIFF_CONTENT = 0x00020000; //! If set, the content need not be the same. constant FLAG_CACHE_DIFF_RESULT_TYPE = 0x00040000; //! If set, the result type need not be the same. (Typically //! not useful unless cached_return() is used.) constant FLAG_CACHE_DIFF_VARS = 0x00080000; //! If set, the variables with external scope in vars (i.e. normally //! those that has been accessed with get_var()) need not have the //! same values (using equal()) as the actual variables. constant FLAG_CACHE_SAME_STACK = 0x00100000; //! If set, the stack of call frames needs to be the same. constant FLAG_CACHE_EXECUTE_RESULT = 0x00200000; //! If set, an array to execute will be stored in the frame instead of //! the final result. On a cache hit it'll be executed like the return //! value from do_return() to produce the result. class Frame //! A tag instance. { constant is_RXML_Frame = 1; //! Interface. Frame up; //! The parent frame. This frame is either created from the content //! inside the up frame, or it's in the array returned from //! do_return() in the up frame. Tag tag; //! The RXML.Tag object this frame was created from. int flags; //! Various bit flags that affect parsing. See the FLAG_* constants. mapping(string:mixed) args; //! The arguments passed to the tag. Set before //! do_enter()/do_return() are called. Type content_type; //! The type of the content. mixed content = Void; //! The content. Set before do_return() is called, but only when the //! tag is actually used with container syntax. Type result_type; //! The required result type. Set before do_enter()/do_return() are //! called. do_return() should produce a result of this type. mixed result = Void; //! The result. //!mapping(string:mixed) vars; //! Set this to introduce a new variable scope that will be active //! during parsing of the content and return values (but see also //! FLAG_PARENT_SCOPE). Don't replace or remove the mapping later. //!string scope_name; //! The scope name for the variables. Don't change this later. //!TagSet additional_tags; //! If set, the tags in this tag set will be used in addition to the //! tags inherited from the surrounding parser. The additional tags //! will in turn be inherited by subparsers. //!TagSet local_tags; //! If set, the tags in this tag set will be used in the parser for //! the content, instead of the one inherited from the surrounding //! parser. The tags are not inherited by subparsers. //!int|function(RequestID:int|function) do_enter (RequestID id); //! Called before the content (if any) is processed. This function //! typically only initializes vars. Return values: //! //! int - Do this many passes through the content. do_return() //! will be called after each pass. //! function(RequestID:int|function) - A function that is handled //! just like do_enter(), only repeatedly until it returns //! 0 or another function. //! //! If this function is missing, one pass is done. //!array do_return (RequestID id, void|mixed piece); //! Called after the content (if any) has been processed. //! //! The result_type variable is set to the type of result the parser //! wants. It's any type that is valid by tag->result_type. If the //! result type is sequential, it's spliced into the surrounding //! content, otherwise it replaces the previous value of the //! content, if any. //! //! Return values: //! //! array - A so-called execution array to be handled by the parser: //! //! string - Added or put into the result. If the result type has //! a parser, the string will be parsed with it before //! it's assigned to the result variable and passed on. //! RXML.Frame - Already initialized frame to process. Neither //! arguments nor content will be parsed. It's result is //! added or put into the result of this tag. //! mapping(string:mixed) - Fields to merge into the headers. //! FIXME: Not yet implemented. FIXME: Somehow represent //! removal of headers? //! object - Treated as a file object to read in blocking or //! nonblocking mode. FIXME: Not yet implemented, details //! not decided. //! multiset(mixed) - Should only contain one element that'll be //! added or put into the result. Normally not necessary; //! assign it directly to the result variable instead. //! //! 0 - Do nothing special. Ends the stream when //! FLAG_STREAM_RESULT is set. //! //! Note that the intended use is not to postparse by returning //! strings, but instead to return an array with literal strings and //! RXML.Frame objects where parsing (or, more accurately, //! evaluation) needs to be done. //! //! piece is used when the tag is operating in streaming mode (i.e. //! FLAG_STREAM_CONTENT is set). It's then set to each successive //! part of the content in the stream, and the content variable is //! never touched. do_return() is also called "normally" with no //! piece argument afterwards. Note that tags that support streaming //! mode might still be used nonstreaming (it might also vary //! between iterations). //! //! As long as FLAG_STREAM_RESULT is set, do_return() will be called //! repeatedly until it returns 0. It's only the result piece from //! the execution array that is propagated after each turn; the //! result variable only accumulates all these pieces. //! //! If this function is an array, it's executed as above. If it's //! zero, the value in the result variable is simply used. If the //! result variable is Void, content is used as result if it's of a //! compatible type. //!int|function(:int) is_valid; //! When defined, the frame may be cached. First the name of the tag //! must be the same. Then the conditions specified by the cache //! bits in flag are checked. Then, if this is a function, it's //! called. If it returns 1, the frame is reused. FIXME: Not yet //! implemented. //!array cached_return (Context ctx, void|mixed piece); //! If defined, this will be called to get the value from a cached //! frame (that's still valid) instead of using the cached result. //! It's otherwise handled like do_return(). Note that the cached //! frame may be used from several threads. FIXME: Not yet //! implemented. //! Services. void error (string msg, mixed... args) //! Throws an error with a backtrace from the current context. { parse_error (msg, @args); } void terminate() //! Makes the parser abort. The data parsed so far will be returned. //! Does not return; throws a special exception instead. { // FIXME } void suspend() //! Used together with resume() for nonblocking mode. May be called //! from do_enter() or do_return() to suspend the parser: The parser //! will just stop, leaving the context intact. If it returns, the //! parser is used in a place that doesn't support nonblocking, so //! just go ahead and block. { // FIXME } void resume() //! Makes the parser continue where it left off. The function that //! called suspend() will be called again. { // FIXME } // Internals. mixed _exec_array (Context ctx, array exec) { Frame this = [object(Frame)]/*HMM*/ this_object(); int i = 0; mixed res = Void; Parser subparser = 0; mixed err = catch { if (flags & FLAG_PARENT_SCOPE) LEAVE_SCOPE (ctx, this); for (; i < sizeof (exec); i++) { mixed elem = exec[i], piece = Void; switch (sprintf ("%t", elem)) { case "string": if (result_type->_parser_prog == PNone) piece = elem; else { subparser = result_type->get_parser (ctx); subparser->finish (elem); // May unwind. piece = subparser->eval(); // May unwind. subparser = 0; } break; case "object": if (elem->is_RXML_Frame) { elem->_eval (0); // May unwind. piece = elem->result; } else if (elem->is_RXML_Parser) { // The subparser above unwound. elem->finish(); // May unwind. piece = elem->eval(); // May unwind. } else error ("File objects not yet implemented.\n"); break; case "mapping": error ("Header mappings not yet implemented.\n"); break; case "multiset": if (sizeof (elem) == 1) piece = ((array) elem)[0]; else if (sizeof (elem) > 1) error (sizeof (elem) + " values in multiset in exec array.\n"); else error ("No value in multiset in exec array.\n"); break; default: error ("Invalid type %t in exec array.\n", elem); } if (result_type->sequential) res += piece; else if (piece != Void) result = res = piece; } if (result_type->sequential) result += res; if (flags & FLAG_PARENT_SCOPE) ENTER_SCOPE (ctx, this); return res; }; if (result_type->sequential) result += res; if (objectp (err) && err->is_RXML_Frame) { mapping(string:mixed)|mapping(Frame:array) ustate; if ((ustate = ctx->unwind_state) && !zero_type (ustate->stream_piece)) // Subframe wants to stream. Update stream_piece and send it on. if (result_type->sequential) ustate->stream_piece = res + ustate->stream_piece; else if (ustate->stream_piece == Void) ustate->stream_piece = res; ustate->exec_left = exec[i..]; // Left to execute. if (subparser) // Replace the string with the subparser object so that we'll // continue in it later. It's done here so that the original // exec array isn't touched. ustate->exec_left[0] = subparser; } throw (err); } void _eval (TagSetParser parser, void|mapping(string:string) raw_args, void|string raw_content) // Note: It might be somewhat tricky to override this function. { Frame this = [object(Frame)]/*HMM*/ this_object(); Context ctx = parser->context; #ifdef DEBUG if (ctx != get_context()) error ("Internal error: Context not current.\n"); if (!parser->tag_set_eval) error ("Internal error: Calling _eval() with non-tag set parser.\n"); #endif // Unwind state data. int|function(RequestID:int|function) fn, iter; //string raw_content; Parser subparser; mixed piece; Frame subframe; array exec; int tags_added; // Flag that we added additional_tags to ctx->tag_set. #define PRE_INIT_ERROR(X) (ctx->frame = this, error (X)) if (array state = ctx->unwind_state && ctx->unwind_state[this]) { #ifdef DEBUG if (!up) PRE_INIT_ERROR ("Internal error: Resuming frame without up pointer.\n"); if (raw_args || raw_content) PRE_INIT_ERROR ("Internal error: Can't feed new arguments or content " "when resuming parse.\n"); #endif [subframe, fn, iter, raw_content, subparser, piece, exec, tags_added] = state; m_delete (ctx->unwind_state, this); if (!sizeof (ctx->unwind_state)) ctx->unwind_state = 0; #ifdef DEBUG if (piece && subframe && exec && (!sizeof (exec) || exec[0] != subframe)) PRE_INIT_ERROR ("Internal error: Subframe ambiguity " "when handling a stream piece.\n"); #endif #ifdef MODULE_DEBUG if (piece && !(flags & FLAG_STREAM_CONTENT)) PRE_INIT_ERROR ("The subframe failed to notice that this frame doesn't support " "streaming - flags did probably change.\n"); #endif } else { #ifdef MODULE_DEBUG if (up && up != ctx->frame) PRE_INIT_ERROR ("Reuse of frame in different context.\n"); #endif up = ctx->frame; piece = Void; } #undef PRE_INIT_ERROR ctx->frame = this; int tag_set_gen = [int]/*HMM*/ parser->tag_set->generation; if (raw_args) { args = ([]); mapping(string:Type) atypes; if (tag->req_arg_types) { atypes = [mapping(string:Type)] (raw_args & tag->req_arg_types); if (sizeof (atypes) < sizeof (tag->req_arg_types)) { array(string) missing = sort (indices (tag->req_arg_types - atypes)); parse_error ("Required " + (sizeof (missing) > 1 ? "arguments " + String.implode_nicely (missing) + " are" : "argument " + missing[0] + " is") + " missing.\n"); } } if (tag->opt_arg_types) if (atypes) atypes += /*[mapping(string:Type)]HMM*/ (raw_args & tag->opt_arg_types); else atypes = [mapping(string:Type)] (raw_args & tag->opt_arg_types); if (atypes) if (mixed err = catch { foreach (indices (atypes), string arg) args[arg] = atypes[arg]->eval (raw_args[arg], ctx, 0, 1); // May currently NOT unwind. }) { if (objectp (err) && err->is_RXML_Frame) error ("Can't save parser state when evaluating arguments.\n"); throw (err); } } #ifdef DEBUG if (!args) error ("Internal error: args not set.\n"); #endif if (TagSet add_tags = raw_content && [object(TagSet)] this->additional_tags) { if (!ctx->tag_set_is_local) ctx->make_tag_set_local(); if (search (ctx->tag_set->imported, add_tags) < 0) { ctx->tag_set->imported = ({add_tags}) + ctx->tag_set->imported; tags_added = 1; } } if (!result_type) { Type ptype = [object(Type)]/*HMM*/ parser->type; foreach (tag->result_types, Type rtype) if (rtype->subtype_of (ptype)) {result_type = rtype; break;} if (!result_type) // Sigh.. error ("Tag returns " + String.implode_nicely (tag->result_types->name, "or") + " but " + parser->type->name + " is expected.\n"); } if (!content_type) content_type = tag->content_type || result_type; mixed err = catch { if (!fn) fn = this->do_enter ? [int|function(RequestID:int|function)] this->do_enter (ctx->id) : // May unwind. 1; do { if (!iter) { iter = fn; while (functionp (iter)) { // Got a function from do_enter. int|function(RequestID:int|function) newiter = [int|function(mixed/*HMM*/:int|function)] iter (ctx->id); // May unwind. fn = iter, iter = newiter; } } ENTER_SCOPE (ctx, this); for (; iter > 0; iter--) { if (raw_content) { // Got nested parsing to do. int finished = 0; if (!subparser) { // The nested content is not yet parsed. subparser = content_type->get_parser (ctx, this->local_tags); subparser->finish (raw_content); // May unwind. finished = 1; } do { if (flags & FLAG_STREAM_CONTENT && subparser->read) { // Handle a stream piece. // Squeeze out any free text from the subparser first. mixed res = subparser->read(); if (content_type->sequential) piece = res + piece; else if (piece == Void) piece = res; if (piece != Void) { array|function(RequestID,mixed:array) do_return; if ((do_return = [array|function(RequestID,mixed:array)] this->do_return) && !arrayp (do_return)) { if (!exec) exec = do_return (ctx->id, piece); // May unwind. if (exec) { mixed res = _exec_array (ctx, exec); // May unwind. if (flags & FLAG_STREAM_RESULT) { #ifdef DEBUG if (!zero_type (ctx->unwind_state->stream_piece)) error ("Internal error: " "Clobbering unwind_state->stream_piece.\n"); #endif ctx->unwind_state->stream_piece = res; throw (this); } exec = 0; } else if (flags & FLAG_STREAM_RESULT) { // do_return() finished the stream. Ignore remaining content. ctx->unwind_state = 0; piece = Void; break; } } piece = Void; } if (finished) break; } else { // The frame doesn't handle streamed content. piece = Void; if (finished) { mixed res = subparser->eval(); // May unwind. if (content_type->sequential) content += res; else if (res != Void) content = res; break; } } // subparser has unwound if we get here; nested content // is half parsed. if (subframe && subparser->tag_set_eval) { // A subframe unwound. Since the subparser already // processed it, we got to handle it ourselves. subframe->_eval (subparser); // May unwind. if (subframe->result != Void) subparser->write_out (subframe->result); subframe = 0; } subparser->finish(); // May unwind. finished = 1; } while (1); // Only loops when an unwound subparser has been recovered. subparser = 0; } if (array|function(RequestID,mixed:array) do_return = [array|function(RequestID,mixed:array)] this->do_return) { if (!exec) exec = arrayp (do_return) ? [array] do_return : do_return (ctx->id); // May unwind. if (exec) { mixed res = _exec_array (ctx, exec); // May unwind. if (flags & FLAG_STREAM_RESULT) { #ifdef DEBUG if (ctx->unwind_state) error ("Internal error: Clobbering unwind_state to do streaming.\n"); if (piece != Void) error ("Internal error: Thanks, we think about how nice it must " "be to play the harmonica...\n"); #endif ctx->unwind_state = (["stream_piece": res]); throw (this); } } } else if (result == Void && content_type->subtype_of (result_type)) result = content; } } while (fn); }; LEAVE_SCOPE (ctx, this); if (tag_set_gen != parser->tag_set->generation && ctx->tag_set == parser->tag_set) parser->recheck_tags(); if (err) { string action; if (objectp (err) && err->is_RXML_Frame) { mapping(string:mixed)|mapping(Frame:array) ustate = ctx->unwind_state; if (!ustate) ustate = ctx->unwind_state = ([]); #ifdef DEBUG if (ustate[this]) error ("Internal error: Frame already has an unwind state.\n"); #endif if (ustate->exec_left) { exec = [array] ustate->exec_left; m_delete (ustate, "exec_left"); } if (err == this || exec && sizeof (exec) && err == exec[0]) { // This frame or a frame in the exec array wants to stream. // Rethrow to continue in parent since we've already done // the appropriate do_return stuff in this frame in either // case. if (err == this) err = 0; if (tags_added) { ctx->tag_set->imported -= ({/*[object(TagSet)]HMM*/ this->additional_tags}); tags_added = 0; } action = "break"; } else if (!zero_type (ustate->stream_piece)) { // Got a stream piece from a subframe. We handle it above; // store the state and tail recurse. piece = ustate->stream_piece; m_delete (ustate, "stream_piece"); action = "continue"; } else action = "break"; // Some other reason - back up to the top. ustate[this] = ({err, fn, iter, raw_content, subparser, piece, exec, tags_added}); } else action = "throw"; switch (action) { case "break": // Throw and handle in parent frame. throw (this); case "continue": // Continue in this frame through tail recursion. _eval (parser); return; case "throw": // Any old exception. throw (err); default: error ("Internal error: Don't you come here and %O on me!\n", action); } } else { if (tags_added) ctx->tag_set->imported -= ({/*[object(TagSet)]HMM*/ this->additional_tags}); ctx->frame = up; } } string _sprintf() { return "Frame(" + (tag && [string] tag->name) + ")"; } } //! Parsers. class Parser //! Interface class for a syntax parser that scans, parses and //! evaluates an input stream. Access to a parser object is assumed to //! be done in a thread safe way except where noted. { constant is_RXML_Parser = 1; //! Services. function(Parser:void) data_callback; //! A function to be called when data is likely to be available from //! eval(). It's always called when the source stream closes. //! write() and write_end() are the functions to use from outside //! the parser system, not feed() or finish(). int write (string in) //! Writes some source data to the parser. Returns nonzero if there //! might be data available in eval(). { int res; ENTER_CONTEXT (context); mixed err = catch { if (context && context->unwind_state) _handle_rewind(); // May unwind. if (feed (in)) res = 1; // May unwind. if (res && data_callback) data_callback (this_object()); }; LEAVE_CONTEXT(); if (err) _handle_unwind (err); return res; } void write_end (void|string in) //! Closes the source data stream, optionally with a last bit of //! data. { int res; ENTER_CONTEXT (context); mixed err = catch { if (context && context->unwind_state) _handle_rewind(); // May unwind. finish (in); // May unwind. if (data_callback) data_callback (this_object()); }; LEAVE_CONTEXT(); if (err) _handle_unwind (err); } //! Interface. Context context; //! The context to do evaluation in. It's assumed to never be //! modified asynchronously during the time the parser is working on //! an input stream. Type type; //! The expected result type of the current stream. (The parser //! should not do any type checking on this.) int compile; //! Must be set to nonzero before a stream is fed which should be //! compiled to p-code. mixed feed (string in); //! Feeds some source data to the parse stream. The parser may do //! scanning and parsing before returning. If context is set, it may //! also do evaluation in that context. Returns nonzero if there //! could be new data to get from eval(). //! //! Note: If write_out() is given, this function may be interrupted //! by throw() and must be able to continue after that; don't store //! state in local variables. void finish (void|string in); //! Like feed(), but also finishes the parse stream. A last bit of //! data may be given. It should work to call this on an already //! finished stream if no argument is given to it. //!void write_out (mixed data); //! Define to allow continuation after broken parsing. Takes the //! evaluated result from some subexpression (e.g. a tag or entity) //! and adds it to the accumulated result. //!mixed read(); //! Define to allow streaming operation. Returns the evaluated //! result so far, but does not do any evaluation. Returns Void if //! there's no data (for sequential types the empty value is also //! ok). //!mixed eval(); //! Evaluates the data fed so far and returns the result. The result //! returned by previous eval() calls should not be returned again //! as (part of) this return value. Returns Void if there's no data //! (for sequential types the empty value is also ok). //!PCode p_compile() //! Define this to return a p-code representation of the current //! stream, which always is finished. //!void reset (Context ctx, Type type, mixed... args); //! Define to support reuse of a parser object. It'll be called //! instead of making a new object for a new stream. It keeps the //! static configuration, i.e. the type. //!Parser clone (Context ctx, Type type, mixed... args); //! Define to create new parser objects by cloning instead of //! creating from scratch. It returns a new instance of this parser //! with the same static configuration, i.e. the type. void create (Context ctx, Type _type /*, mixed... args*/) { context = ctx; type = _type; } // Internals. Parser _next_free; // Used to link together unused parser objects for reuse. void _handle_rewind() { Parser this = [object(Parser)]/*HMM*/ this_object(); mapping(string:mixed)|mapping(Frame:array) ustate; if ((ustate = context->unwind_state) && ustate->top && this->write_out) { #ifdef MODULE_DEBUG if (ustate->top[1] != this) context->error ("Resuming parse state with different parser.\n"); #endif Frame top = [object(Frame)] ustate->top[0]; m_delete (ustate, "top"); if (!sizeof (ustate)) context->unwind_state = ustate = 0; top->_eval (this); // May unwind. if (top->result != Void) this->write_out (top->result); } } void _handle_unwind (mixed err) { if (context && objectp (err) && err->is_RXML_Frame) { mapping(string:mixed)|mapping(Frame:array) ustate = context->unwind_state; if (!ustate) ustate = context->unwind_state = ([]); #ifdef DEBUG if (ustate->exec_left || ustate->stream_piece || ustate->top) error ("Internal error: Unexpected unwind_state at top level: %O\n", ustate); #endif ustate->top = ({err, this_object()}); } else throw (err); } } class TagSetParser //! Interface class for parsers that evaluates using the tag set. It //! provides the evaluation and compilation functionality. The parser //! should call Tag.handle_tag() from feed() and finish() for every //! encountered tag, and Context.get_var() for encountered variable //! references. The parser must provide a result queue with //! write_out() and read(). It must be able to continue cleanly after //! throw() from Tag.handle_tag(). { inherit Parser; constant tag_set_eval = 1; // Interface. TagSet tag_set; //! The tag set used for parsing. void write_out (mixed data); mixed read(); //! Must be defined for tag set parsers. //!void reset (Context ctx, Type type, TagSet tag_set, mixed... args); //!Parser clone (Context ctx, Type type, TagSet tag_set, mixed... args); void create (Context ctx, Type type, TagSet _tag_set /*, mixed... args*/) { ::create (ctx, type); tag_set = _tag_set; } //! In addition to the type, the tag set is part of the static //! configuration. void recheck_tags(); //! Called when the tags in tag_set have changed during the //! evaluation and need to take effect immediately. Only the local //! tags in tag_set needs to be checked for changes. // Services. mixed eval() { return read(); } } class PNone //! The identity parser. It only returns its input. { inherit Parser; string data = ""; int evalpos = 0; int feed (string in) { data += in; return 1; } void finish (void|string in) { if (in) data += in; } string eval() { string res = data[evalpos..]; evalpos = sizeof (data); return res; } string byte_compile() { return data; } string byte_interpret (string byte_code, Context ctx) { return byte_code; } void reset (Context ctx) { context = ctx; data = ""; evalpos = 0; } } mixed simple_parse (string in, void|program parser) //! A convenience function to parse a string with no type info, no tag //! set, and no variable references. The parser defaults to PExpr. { // FIXME: Recycle contexts? return t_any (parser || PExpr)->eval (in, Context (empty_tag_set)); } //! Types. class Type //! A static type definition. It does type checking and specifies some //! properties of the type. It may also contain a Parser program that //! will be used to read text and evaluate values of this type. Note //! that the parser is not relevant for type checking. { constant is_RXML_Type = 1; //! Interface. //!string name; //! Unique type identifier. Required and considered constant. Type //! hierarchies are currently implemented with glob patterns, e.g. //! "image/png" is a subtype of "image/*", and "array(string)" is a //! subtype of "array(*)". //!mixed sequential; //! Nonzero if data of this type is sequential, defined as: //! o One or more data items can be concatenated with `+. //! o (Sane) parsers are homomorphic on the type, i.e. //! eval ("da") + eval ("ta") == eval ("da" + "ta") //! and //! eval ("data") + eval ("") == eval ("data")
e213f01999-12-13Martin Stjernholm  //!mixed empty_value;
ed81751999-12-11Martin Stjernholm  //! The empty value for sequential data types, i.e. what eval ("") //! would produce.
e213f01999-12-13Martin Stjernholm  //!mixed free_text;
ed81751999-12-11Martin Stjernholm  //! Nonzero if the type keeps the free text between parsed tokens, //! e.g. the plain text between tags in HTML. The type must be //! sequential and use strings. void type_check (mixed val); //! Checks whether the given value is a valid one of this type. //! Errors are thrown with parse_error(). Type clone() //! Returns a copy of the type. { Type newtype = [object(Type)]/*HMM*/ object_program ([object(Type)]/*HMM*/ this_object())(); newtype->_parser_prog = _parser_prog; newtype->_parser_args = _parser_args; newtype->_t_obj_cache = _t_obj_cache; return newtype; } //! Services. int `== (mixed other) //! { return objectp (other) && other->is_RXML_Type && other->name == this_object()->name; } int subtype_of (Type other) //! { return glob ([string] other->name, [string] ([object(Type)]/*HMM*/ this_object())->name); } Type `() (program newparser, mixed... parser_args) //! Returns a type identical to this one, but which has the given //! parser. parser_args is passed as extra arguments to the //! create()/reset()/clone() functions. { Type newtype; if (sizeof (parser_args)) { // Can't cache this. newtype = clone(); newtype->_parser_args = parser_args; if (newparser->tag_set_eval) newtype->_p_cache = ([]); } else { if (!_t_obj_cache) _t_obj_cache = ([]); if (!(newtype = _t_obj_cache[newparser])) if (newparser == _parser_prog) _t_obj_cache[newparser] = newtype = [object(Type)]/*HMM*/ this_object(); else { _t_obj_cache[newparser] = newtype = clone(); newtype->_parser_prog = newparser; if (newparser->tag_set_eval) newtype->_p_cache = ([]); } } return newtype; } inline Parser get_parser (Context ctx, void|TagSet tag_set) //! Returns a parser instance initialized with the given context. { Parser p; if (_p_cache) { // It's a tag set parser. TagSet tset; // vvv Using interpreter lock from here. PCacheObj pco = _p_cache[tset = tag_set || ctx->tag_set]; if (pco && pco->tag_set_gen == tset->generation) { if ((p = pco->free_parser)) { pco->free_parser = p->_next_free; // ^^^ Using interpreter lock to here. p->data_callback = p->compile = 0; p->reset (ctx, this_object(), @_parser_args); } else // ^^^ Using interpreter lock to here. if (pco->clone_parser) p = [object(Parser)] pco->clone_parser->clone ( ctx, this_object(), @_parser_args); else if ((p = [object(Parser)]/*HMM*/ _parser_prog ( ctx, this_object(), @_parser_args))->clone) // pco->clone_parser might already be initialized here due // to race, but that doesn't matter. p = [object(Parser)] (pco->clone_parser = p)->clone ( ctx, this_object(), @_parser_args); } else { // ^^^ Using interpreter lock to here. pco = PCacheObj(); pco->tag_set_gen = [int]/*HMM*/ tset->generation; _p_cache[tset] = pco; // Might replace an object due to race, but that's ok. if ((p = [object(Parser)]/*HMM*/ _parser_prog ( ctx, this_object(), @_parser_args))->clone) // pco->clone_parser might already be initialized here due // to race, but that doesn't matter. p = [object(Parser)] (pco->clone_parser = p)->clone ( ctx, this_object(), @_parser_args); } } else { if ((p = free_parser)) { // Relying on interpreter lock here. free_parser = p->_next_free; p->data_callback = p->compile = 0; p->reset (ctx, this_object(), @_parser_args); } else if (clone_parser) // Relying on interpreter lock here. p = [object(Parser)] clone_parser->clone ( ctx, this_object(), @_parser_args); else if ((p = [object(Parser)]/*HMM*/ _parser_prog ( ctx, this_object(), @_parser_args))->clone) // clone_parser might already be initialized here due to race, // but that doesn't matter. p = [object(Parser)] (clone_parser = p)->clone ( ctx, this_object(), @_parser_args); } return p; } mixed eval (string in, void|Context ctx, void|TagSet tag_set, void|int dont_switch_ctx) //! Convenience function to parse and evaluate the value in the //! given string. If a context isn't given, the current one is used. //! The current context and ctx are assumed to be the same if //! dont_switch_ctx is nonzero. { mixed res; if (!ctx) ctx = get_context(); if (_parser_prog == PNone) res = in; else { Parser p = get_parser (ctx, tag_set); if (dont_switch_ctx) p->finish (in); // Optimize the job in p->write_end(). else p->write_end (in); res = p->eval(); if (p->reset) if (_p_cache) { // Relying on interpreter lock in this block. PCacheObj pco = _p_cache[tag_set || ctx->tag_set]; p->_next_free = pco->free_parser; pco->free_parser = p; } else { // Relying on interpreter lock in this block. p->_next_free = free_parser; free_parser = p; } } if (ctx->type_check) type_check (res); return res; } // Internals. program/*(Parser)HMM*/ _parser_prog = PNone; // The parser to use. Should never be changed in a type object. private array(mixed) _parser_args = ({}); /*private*/ mapping(program:Type) _t_obj_cache; // To avoid creating new type objects all the time in `(). // Cache used for parsers that doesn't depend on the tag set. private Parser clone_parser; // Used with Parser.clone(). private Parser free_parser; // The list of objects to reuse with Parser.reset(). // Cache used for parsers that depend on the tag set. private class PCacheObj { int tag_set_gen; Parser clone_parser; Parser free_parser; } /*private*/ mapping(TagSet:PCacheObj) _p_cache; } Type t_text = class //! The standard type for generic document text. { inherit Type; constant name = "text/*"; constant sequential = 1; constant empty_value = ""; constant free_text = 1; }(); Type t_any = class //! A completely unspecified nonsequential type. { inherit Type; constant name = "*"; }(); // P-code compilation and evaluation. class VarRef //! A helper for representing variable reference tokens. { constant is_RXML_VarRef = 1; string scope, var; void create (string _scope, string _var) {scope = _scope, var = _var;} int valid (Context ctx) {return !!ctx->scopes[scope];} mixed get (Context ctx) {return ctx->scopes[scope][var];} mixed set (Context ctx, mixed val) {return ctx->scopes[scope][var] = val;} void remove (Context ctx) {m_delete (ctx->scopes[scope], var);} string name() {return scope + "." + var;} } class PCode //! Holds p-code and evaluates it. P-code is the intermediate form //! after parsing and before evaluation. { constant is_RXML_PCode = 1; array p_code = ({}); mixed eval (Context ctx) //! Evaluates the p-code in the given context. { // FIXME } //!function(Context:mixed) compile() //! Returns a compiled function for doing the evaluation. The //! function will receive a context to do the evaluation in. } //! Some parser tools. static class VoidType { mixed `+ (mixed... vals) {return sizeof (vals) ? predef::`+ (@vals) : this_object();} mixed ``+ (mixed val) {return val;} int `!() {return 1;} string _sprintf (string flag) {return flag == "O" && "Void";} }; VoidType Void = VoidType(); //! An object representing the void value. Works as initializer for //! sequences, since Void + anything == anything + Void == anything. class ScanStream //! A helper class for the input and scanner stage in a parser. It's a //! stream that takes unparsed strings and splits them into tokens //! which are queued. Intended to be inherited in a Parser class. { private class Link { array data; Link next; } private Link head = Link(); // Last link is an empty eof marker. private Link tail = head; private int next_token = 0; private string end = ""; private int fin = 0; array scan (string in, int finished); //! The scanner function. It gets an unparsed string and should //! return an array of tokens. If the second argument is nonzero, //! there won't be any more data later. If the second argument is //! zero, the last item in the returned array is handled as unparsed //! data that will be passed back to the scanner later. Tokens may //! be of any type. Use VarRef objects for variables. void feed (string in) //! { #ifdef MODULE_DEBUG if (fin) error ("Cannot feed data to a finished stream.\n"); #endif array tokens = scan (end + in, 0); end = [string] tokens[-1]; if (sizeof (tokens) > 1) { tail->data = tokens[..sizeof (tokens) - 2]; tail = tail->next = Link(); } } void finish (void|string in) //! { if (in || !fin && sizeof (end)) { #ifdef MODULE_DEBUG if (in && fin) error ("Cannot feed data to a finished stream.\n"); #endif fin = 1; if (in) end += in; tail->data = scan (end, 1); tail = tail->next = Link(); } } void reset() //! { head = Link(); tail = head; next_token = 0; end = ""; fin = 0; } mixed read() //! Returns the next token, or Void if there's no more data. { while (head->next) if (next_token >= sizeof (head->data)) { next_token = 0; head = head->next; } else return head->data[next_token++]; return Void; } void unread (mixed... put_back) //! Puts back tokens and variable references at the beginning of the //! stream so that the leftmost argument will be read first. { int i = sizeof (put_back); while (i) head->data[--next_token] = put_back[--i]; if (i) { Link l = Link(); l->next = head, head = l; l->data = allocate (next_token = [int]/*HMM*/ max (i - 32, 0)) + put_back[..--i]; } } array read_all() //! { array data; if (next_token) { data = head->data[next_token..]; head = head->next; next_token = 0; } else data = ({}); while (head->next) { data += head->data; head = head->next; } return data; } int finished() //! Returns nonzero if the write end is finished. { return fin; } } // Various internal stuff. // Argh! static program PHtml; static program PExpr; void _fix_module_ref (string name, mixed val) { mixed err = catch { switch (name) { case "PHtml": PHtml = [program] val; break; case "PExpr": PExpr = [program] val; break; case "empty_tag_set": empty_tag_set = [object(TagSet)] val; break; default: error ("Herk\n"); } }; if (err) werror (describe_backtrace (err)); }