Friday, April 25, 2008

JavaScript Creole 1.0 Wiki Markup Parser

Update. If you are looking for a lightweight server-side wiki engine, you may be interested in my PHP port of this wiki parser.

Source code

This program is free software; you can redistribute it and/or modify it under the MIT/X11 License.

/*
 * JavaScript Creole 1.0 Wiki Markup Parser
 * $Id: creole.js 14 2009-03-21 16:15:08Z ifomichev $
 *
 * Copyright (c) 2009 Ivan Fomichev
 *
 * Portions Copyright (c) 2007 Chris Purcell
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

// Create the Parse.Simple namespace, reusing any Parse / Parse.Simple
// object that already exists.
var Parse = Parse || {};
Parse.Simple = Parse.Simple || {};

/**
 * Base parser: stores a grammar and default options.
 *
 * @param grammar  object whose `root` property describes the root rule;
 *                 `root` is replaced in place by a rule instance created
 *                 with this.ruleConstructor
 * @param options  default options, used by parse() when the caller
 *                 supplies none (or to fill in missing ones)
 */
Parse.Simple.Base = function(grammar, options) {
    // An argument-less call (e.g. `new Parse.Simple.Base()` used to set up
    // a subclass prototype) leaves the instance uninitialised.
    if (arguments.length === 0) {
        return;
    }

    this.grammar = grammar;
    grammar.root = new this.ruleConstructor(grammar.root);
    this.options = options;
};

Parse.Simple.Base.prototype = {
    // Constructor used to wrap grammar.root into a rule instance.
    ruleConstructor: null,
    // Grammar object passed to the constructor (its root is a rule instance).
    grammar: null,
    // Default options passed to the constructor.
    options: null,

    /**
     * Parses `data` and appends the result to `node`.
     *
     * @param node     DOM node that receives the generated content
     * @param data     source text; "\r\n" and lone "\r" are normalised
     *                 to "\n" before parsing
     * @param options  optional per-call options. Any key that is undefined
     *                 here is filled in from this.options (the passed object
     *                 is modified in place). When omitted, this.options is
     *                 used as is.
     */
    parse: function(node, data, options) {
        if (options) {
            // `var` keeps the loop variable local; without it the loop
            // assigned to an undeclared global (a ReferenceError in strict
            // mode code).
            for (var i in this.options) {
                if (typeof options[i] == 'undefined') { options[i] = this.options[i]; }
            }
        }
        else {
            options = this.options;
        }
        data = data.replace(/\r\n?/g, '\n');
        this.grammar.root.apply(node, data, options);
        // With the forIE option, line breaks in the generated markup are
        // rewritten as "\r\n".
        if (options && options.forIE) { node.innerHTML = node.innerHTML.replace(/\r?\n/g, '\r\n'); }
    }
};

// Replacing the prototype object dropped the default `constructor` link.
Parse.Simple.Base.prototype.constructor = Parse.Simple.Base;

/**
 * A single grammar rule.
 *
 * @param params  rule description; every enumerable property is copied
 *                onto the new rule. A rule without `children` gets an
 *                empty list.
 */
Parse.Simple.Base.Rule = function(params) {
    // Argument-less construction leaves the instance untouched.
    if (arguments.length === 0) {
        return;
    }

    for (var key in params) {
        this[key] = params[key];
    }

    if (!this.children) {
        this.children = [];
    }
};

// Rules created by Parse.Simple.Base use this constructor.
Parse.Simple.Base.prototype.ruleConstructor = Parse.Simple.Base.Rule;

Parse.Simple.Base.Rule.prototype = {
    // Defaults; values copied from the rule description shadow these.
    regex: null,
    capture: null,
    replaceRegex: null,
    replaceString: null,
    tag: null,
    attrs: null,
    children: null,

    /**
     * Returns the result of matching this rule's regex against `data`
     * (a match array, or null when there is no match).
     */
    match: function(data, options) {
        return data.match(this.regex);
    },

    /**
     * Emits the output for one match `r` of this rule into `node`.
     *
     * When the rule has a `tag`, an element of that type is appended to
     * `node` and becomes the target; otherwise `node` itself is the target.
     * A non-empty captured group is (optionally) rewritten with
     * replaceRegex/replaceString and parsed into the target by apply().
     * Finally the rule's `attrs` are set on the target.
     */
    build: function(node, r, options) {
        var captured = (this.capture === null) ? void 0 : r[this.capture];

        var target = node;
        if (this.tag) {
            target = document.createElement(this.tag);
            node.appendChild(target);
        }

        if (captured) {
            var content = this.replaceRegex
                ? captured.replace(this.replaceRegex, this.replaceString)
                : captured;
            this.apply(target, content, options);
        }

        var attrs = this.attrs;
        if (attrs) {
            for (var name in attrs) {
                target.setAttribute(name, attrs[name]);
                // With forIE the class is additionally set via className.
                if (options && options.forIE && name == 'class') {
                    target.className = attrs[name];
                }
            }
        }

        return this;
    },

    /**
     * Parses `data` into `node` using this rule's children.
     *
     * Repeatedly picks the child whose match starts earliest (the first
     * child wins ties; a match at index 0 ends the search), sends the text
     * before that match to the fallback, lets the child build its output,
     * and continues with the text after the match. Text left over when no
     * child matches goes to the fallback as well.
     */
    apply: function(node, data, options) {
        var rest = '' + data;
        var cache = [];          // per-child match against `rest`
        var kids = this.children;

        // A fallback given as a plain description is turned into a rule
        // on first use.
        if (!this.fallback.apply) {
            this.fallback = new this.constructor(this.fallback);
        }

        for (;;) {
            var best = false;
            var rule = false;

            for (var i = 0; i < kids.length; i++) {
                if (typeof cache[i] == 'undefined') {
                    // Children given as plain descriptions are turned
                    // into rules on first use.
                    if (!kids[i].match) {
                        kids[i] = new this.constructor(kids[i]);
                    }
                    cache[i] = kids[i].match(rest, options);
                }

                var candidate = cache[i];
                if (candidate && (!best || best.index > candidate.index)) {
                    best = candidate;
                    rule = kids[i];
                    if (best.index == 0) { break; }
                }
            }

            // Everything in front of the best match (or the whole rest,
            // if nothing matched) is handled by the fallback.
            var pos = best ? best.index : rest.length;
            if (pos > 0) {
                this.fallback.apply(node, rest.substring(0, pos), options);
            }

            if (!best) { break; }

            if (!rule.build) { rule = new this.constructor(rule); }
            rule.build(node, best, options);

            var chopped = best.index + best[0].length;
            rest = rest.substring(chopped);

            // Cached matches that start at or after the consumed part are
            // shifted; the others are dropped so they get recomputed.
            for (var j = 0; j < kids.length; j++) {
                var cached = cache[j];
                if (!cached) { continue; }

                if (cached.index >= chopped) {
                    cached.index -= chopped;
                }
                else {
                    cache[j] = void 0;
                }
            }
        }

        return this;
    },

    // Default fallback: appends the text as a text node.
    fallback: {
        apply: function(node, data, options) {
            // workaround for bad IE
            var text = (options && options.forIE)
                ? data.replace(/\n/g, ' \r')
                : data;
            node.appendChild(document.createTextNode(text));
        }
    }
};

// Replacing the prototype object dropped the default `constructor` link;
// apply() relies on this.constructor to create rules.
Parse.Simple.Base.Rule.prototype.constructor = Parse.Simple.Base.Rule;

/**
 * Creole 1.0 wiki markup parser.
 *
 * Builds the Creole grammar (a table of rule descriptions wired together
 * through their `children` lists) and passes it, together with `options`,
 * to Parse.Simple.Base.
 *
 * Options read by this constructor and its rule builders:
 *   strict           - when truthy, the image regex requires the "|text"
 *                      part and the table-cell regex omits the image
 *                      alternative
 *   defaultImageText - alt text for images without a "|text" part
 *   linkFormat       - function, string or [prefix, suffix] array applied
 *                      to the target of wiki links
 *   interwiki        - object mapping an interwiki prefix to a link format
 *                      (same shapes as linkFormat)
 *   isPlainUri       - when truthy, the text of URI links is inserted as a
 *                      plain text node instead of being parsed
 */
Parse.Simple.Creole = function(options) {
    var rx = {};
    rx.link = '[^\\]|~\\n]*(?:(?:\\](?!\\])|~.)[^\\]|~\\n]*)*';
    rx.linkText = '[^\\]~\\n]*(?:(?:\\](?!\\])|~.)[^\\]~\\n]*)*';
    rx.uriPrefix = '\\b(?:(?:https?|ftp)://|mailto:)';
    rx.uri = rx.uriPrefix + rx.link;
    rx.rawUri = rx.uriPrefix + '\\S*[^\\s!"\',.:;?]';
    rx.interwikiPrefix = '[\\w.]+:';
    rx.interwikiLink = rx.interwikiPrefix + rx.link;
    // Unless options.strict is set, the "|text" part of an image is optional.
    rx.img = '\\{\\{((?!\\{)[^|}\\n]*(?:}(?!})[^|}\\n]*)*)' +
             (options && options.strict ? '' : '(?:') +
             '\\|([^}~\\n]*((}(?!})|~.)[^}~\\n]*)*)' +
             (options && options.strict ? '' : ')?') +
             '}}';

    // Applies a link format to a link target: a function is called with the
    // target, an array is used as [prefix, suffix], anything else as prefix.
    // The format itself is never modified.
    var formatLink = function(link, format) {
        if (format instanceof Function) {
            return format(link);
        }

        var parts = format instanceof Array ? format : [ format ];
        var suffix = typeof parts[1] == 'undefined' ? '' : parts[1];
        return parts[0] + link + suffix;
    };

    // Returns a shallow copy of `options` with isPlainUri set. A copy is
    // used so that the flag does not stick to the options object shared by
    // all rules, which would stop later [[uri|text]] links from having
    // their text parsed.
    var withPlainUri = function(options) {
        var copy = {};
        if (options) {
            for (var key in options) { copy[key] = options[key]; }
        }
        copy.isPlainUri = true;
        return copy;
    };

    var g = {
        hr: { tag: 'hr', regex: /(^|\n)\s*----\s*(\n|$)/ },

        br: { tag: 'br', regex: /\\\\/ },
        
        preBlock: { tag: 'pre', capture: 2,
            regex: /(^|\n)\{\{\{\n((.*\n)*?)\}\}\}(\n|$)/,
            replaceRegex: /^ ([ \t]*\}\}\})/gm,
            replaceString: '$1' },
        tt: { tag: 'tt',
            regex: /\{\{\{(.*?\}\}\}+)/, capture: 1,
            replaceRegex: /\}\}\}$/, replaceString: '' },

        ulist: { tag: 'ul', capture: 0,
            regex: /(^|\n)([ \t]*\*[^*#].*(\n|$)([ \t]*[^\s*#].*(\n|$))*([ \t]*[*#]{2}.*(\n|$))*)+/ },
        olist: { tag: 'ol', capture: 0,
            regex: /(^|\n)([ \t]*#[^*#].*(\n|$)([ \t]*[^\s*#].*(\n|$))*([ \t]*[*#]{2}.*(\n|$))*)+/ },
        li: { tag: 'li', capture: 0,
            regex: /[ \t]*([*#]).+(\n[ \t]*[^*#\s].*)*(\n[ \t]*\1[*#].+)*/,
            replaceRegex: /(^|\n)[ \t]*[*#]/g, replaceString: '$1' },

        table: { tag: 'table', capture: 0,
            regex: /(^|\n)(\|.*?[ \t]*(\n|$))+/ },
        tr: { tag: 'tr', capture: 2, regex: /(^|\n)(\|.*?)\|?[ \t]*(\n|$)/ },
        th: { tag: 'th', regex: /\|+=([^|]*)/, capture: 1 },
        td: { tag: 'td', capture: 1,
            regex: '\\|+([^|~\\[{]*((~(.|(?=\\n)|$)|' +
                   '\\[\\[' + rx.link + '(\\|' + rx.linkText + ')?\\]\\]' +
                   (options && options.strict ? '' : '|' + rx.img) +
                   '|[\\[{])[^|~]*)*)' },

        singleLine: { regex: /.+/, capture: 0 },
        paragraph: { tag: 'p', capture: 0,
            regex: /(^|\n)([ \t]*\S.*(\n|$))+/ },
        text: { capture: 0, regex: /(^|\n)([ \t]*[^\s].*(\n|$))+/ },

        strong: { tag: 'strong', capture: 1,
            regex: /\*\*([^*~]*((\*(?!\*)|~(.|(?=\n)|$))[^*~]*)*)(\*\*|\n|$)/ },
        em: { tag: 'em', capture: 1,
            regex: '\\/\\/(((?!' + rx.uriPrefix + ')[^\\/~])*' +
                   '((' + rx.rawUri + '|\\/(?!\\/)|~(.|(?=\\n)|$))' +
                   '((?!' + rx.uriPrefix + ')[^\\/~])*)*)(\\/\\/|\\n|$)' },

        // r[1] is the image source, r[2] the optional text (with "~x"
        // escapes reduced to "x").
        img: { regex: rx.img,
            build: function(node, r, options) {
                var img = document.createElement('img');
                img.src = r[1];
                img.alt = r[2] === undefined
                    ? (options && options.defaultImageText ? options.defaultImageText : '')
                    : r[2].replace(/~(.)/g, '$1');
                node.appendChild(img);
            } },

        // r[1] is the URI, r[2] the link text.
        namedUri: { regex: '\\[\\[(' + rx.uri + ')\\|(' + rx.linkText + ')\\]\\]',
            build: function(node, r, options) {
                var link = document.createElement('a');
                link.href = r[1];
                if (options && options.isPlainUri) {
                    link.appendChild(document.createTextNode(r[2]));
                }
                else {
                    this.apply(link, r[2], options);
                }
                node.appendChild(link);
            } },

        // r[1] is the link target ("~x" escapes reduced to "x", then run
        // through options.linkFormat if given), r[2] the link text.
        namedLink: { regex: '\\[\\[(' + rx.link + ')\\|(' + rx.linkText + ')\\]\\]',
            build: function(node, r, options) {
                var link = document.createElement('a');
                
                link.href = options && options.linkFormat
                    ? formatLink(r[1].replace(/~(.)/g, '$1'), options.linkFormat)
                    : r[1].replace(/~(.)/g, '$1');
                this.apply(link, r[2], options);
                
                node.appendChild(link);
            } },

        // The 'dummy' builders are replaced right after this table.
        unnamedUri: { regex: '\\[\\[(' + rx.uri + ')\\]\\]',
            build: 'dummy' },
        unnamedLink: { regex: '\\[\\[(' + rx.link + ')\\]\\]',
            build: 'dummy' },
        unnamedInterwikiLink: { regex: '\\[\\[(' + rx.interwikiLink + ')\\]\\]',
            build: 'dummy' },

        rawUri: { regex: '(' + rx.rawUri + ')',
            build: 'dummy' },

        escapedSequence: { regex: '~(' + rx.rawUri + '|.)', capture: 1,
            tag: 'span', attrs: { 'class': 'escaped' } },
        escapedSymbol: { regex: /~(.)/, capture: 1,
            tag: 'span', attrs: { 'class': 'escaped' } }
    };

    // Unnamed and raw URIs are built like named URIs whose text is the URI
    // itself, inserted as plain text.
    g.unnamedUri.build = g.rawUri.build = function(node, r, options) {
        g.namedUri.build.call(this, node, [ r[0], r[1], r[1] ], withPlainUri(options));
    };
    // Unnamed links are built like named links whose text is the target.
    g.unnamedLink.build = function(node, r, options) {
        g.namedLink.build.call(this, node, [ r[0], r[1], r[1] ], options);
    };
    g.namedInterwikiLink = { regex: '\\[\\[(' + rx.interwikiLink + ')\\|(' + rx.linkText + ')\\]\\]',
        build: function(node, r, options) {
            var m, f;
            if (options && options.interwiki) {
                // Split "Prefix:target" at the first colon.
                m = r[1].match(/(.*?):(.*)/);
                // Only the map's own entries count as interwiki prefixes,
                // so inherited members such as "constructor" are ignored.
                if (m && Object.prototype.hasOwnProperty.call(options.interwiki, m[1])) {
                    f = options.interwiki[m[1]];
                }
            }

            // Unknown prefix (or no interwiki map): treat as ordinary link.
            if (typeof f == 'undefined') {
                if (!g.namedLink.apply) {
                    g.namedLink = new this.constructor(g.namedLink);
                }
                return g.namedLink.build.call(g.namedLink, node, r, options);
            }

            var link = document.createElement('a');
            link.href = formatLink(m[2].replace(/~(.)/g, '$1'), f);

            this.apply(link, r[2], options);

            node.appendChild(link);
        }
    };
    g.unnamedInterwikiLink.build = function(node, r, options) {
        g.namedInterwikiLink.build.call(this, node, [ r[0], r[1], r[1] ], options);
    };
    // Inside link text only escapes and images are recognised.
    g.namedUri.children = g.unnamedUri.children = g.rawUri.children =
            g.namedLink.children = g.unnamedLink.children =
            g.namedInterwikiLink.children = g.unnamedInterwikiLink.children =
        [ g.escapedSymbol, g.img ];

    // Headings h1..h6: i equals signs followed by a space or tab.
    for (var i = 1; i <= 6; i++) {
        g['h' + i] = { tag: 'h' + i, capture: 2,
            regex: '(^|\\n)[ \\t]*={' + i + '}[ \\t]' +
                   '([^~]*?(~(.|(?=\\n)|$))*)[ \\t]*=*\\s*(\\n|$)'
        };
    }

    g.ulist.children = g.olist.children = [ g.li ];
    g.li.children = [ g.ulist, g.olist ];
    g.li.fallback = g.text;

    g.table.children = [ g.tr ];
    g.tr.children = [ g.th, g.td ];
    g.td.children = [ g.singleLine ];
    g.th.children = [ g.singleLine ];

    // Inline markup shared by headings, paragraphs, text and emphasis.
    g.h1.children = g.h2.children = g.h3.children =
            g.h4.children = g.h5.children = g.h6.children =
            g.singleLine.children = g.paragraph.children =
            g.text.children = g.strong.children = g.em.children =
        [ g.escapedSequence, g.strong, g.em, g.br, g.rawUri,
            g.namedUri, g.namedInterwikiLink, g.namedLink,
            g.unnamedUri, g.unnamedInterwikiLink, g.unnamedLink,
            g.tt, g.img ];

    // Top-level rules; text matched by none of them falls back to
    // paragraphs.
    g.root = {
        children: [ g.h1, g.h2, g.h3, g.h4, g.h5, g.h6,
            g.hr, g.ulist, g.olist, g.preBlock, g.table ],
        fallback: { children: [ g.paragraph ] }
    };

    Parse.Simple.Base.call(this, g, options);
};

// Inherit from Parse.Simple.Base; the argument-less call makes the Base
// constructor return without initialising anything.
Parse.Simple.Creole.prototype = new Parse.Simple.Base();

Parse.Simple.Creole.prototype.constructor = Parse.Simple.Creole;

31 comments:

Don Schuy said...

Hi Ivan,

Really appreciate the post. I'm getting some good use out of it to build an internal wiki for the consulting company I work for.

I've added two extensions.

The first extension is a "dot" command. From this I add a variety of custom features that go against the grain of a standard wiki creole syntax. For example:

.alert
my alert text
.

The above displays the "my alert text" in a red box. Using jquery.chili-2.2.js, I've added colored syntax highlighting for code examples like this:

.js
alert('hi');
.

The second extension is I added a "#" command. In the wiki I'm creating, I use a JQuery accordion menu to navigate between pages. I was able to use your code to allow the accordion menu to be editable. I use the "#" command to include this menu source on another page sort of like a C/C++ include directive. For example:

#nav

inserts the contents of the nav page onto the current page.

This is all great fun when editing the page is real-time. Thanks to you and Chris Purcell for the wiki engine!

Anonymous said...

it'd be neat if this were put in a repository

codeholic said...

It is! See http://jscreole.svn.sourceforge.net/viewvc/jscreole/

codeholic said...

BTW, here's original Chris Purcell's parser.

sbarkdull said...

Hi Ivan,

Thank you for the parser! Nice job.

It looks like the image element is broken. When I paste this:
{{http://www.performancebike.com/images/hd_teamp_07.gif}}

into the creole markup, it doesn't generate an HTML image.

codeholic said...

sbarkdull, I've supposed that Creole requires a title for an image to be set, since there's no example for an image w/o a title anywhere on WikiCreole. Perhaps, it should be clarified.

For now, you can insert an image with an empty title instead, for instance, {{http://www.performancebike.com/images/hd_teamp_07.gif|}}

prologic said...

Hi, Is it possible to extend this JavaScript creole parser to include support for creole additions such as:
* Plugins ?

--JamesMills

codeholic said...

Yes, I'm planning to implement additions. Shall I let you know when it is available?

Or did you mean something else by "* Plugins"?

prologic said...

I meant the "Plugin" addition - yes. Of the form:
<<Plugin(arg1=foo arg2=bar)>>

I've managed to extend your parser to do this in the simplest possible way by adding the following:

plugin: { regex: '\\<\\<((?!\\<)[^ >\\n]*(?:}(?!})[^|}\\n]*)*)>>',
build: function(node, r, options) {
if (options && typeof(options.plugin) != "undefined") {
options.plugin(node, r, options);
}
} },

My low-level JavaScript ain't that great though and I'm unfamiliar with your parser design (not necessary recursive descent parsers).

I look forward to your improvements :)

--JamesMills (prologic)

prologic said...

FYI, I'm developing a heavily client-side (using jquery) based wiki with a restful backend and mercurial storage using circuits.web

Demo: http://124.171.217.91:8000/ (may be intermittently offline)

cheers
James

codeholic said...

The only problem with extensions, and the only reason I still have not implemented them yet, is that according to the specification, "the content of the extension element may contain any text, including line breaks, except for the opening and closing extension element markup-codes".

The parser used in this Creole markup implementation requires that you describe all possible content that elements may contain. (That is why Chris Purcell called this parser a "regular language" recursive descent parser.)

According to the spec, "the extension element markup may be nested in any other markup". It means it may contain text identical to the closing markup codes of other elements.

So, in order to implement the extension element in accordance with the spec, you have to assume in your regexp, that virtually any container element may contain extension elements, which may contain any text, including closing markup for the parent element. It's quite a task to implement, you know.

I had enough puzzles working with this parser engine, and sometimes I thought that it would be easier to implement a toothier parser engine than to describe a grammar that this parser engine can chew. Do you perhaps know a simple implementation of a context-free grammar parser generator in any imperative programming language? I would translate it to JavaScript then. Please don't recommend JS/CC, it's overkill. I don't like such solutions.

prologic said...

I'm a Python developer (hence my project circuits and circuits.web). Have you looked at pyparsing ?

cheers
James

prologic said...

Also in regards to your earlier comment ... Can we not make it simpler and define some assumptions (if it's too hard to implement the plugin extension according to the spec ?)

For example, in Trac's wiki I don't believe it's possible to have [[FooMacro(...)]] in any other markup.
Trac's wiki also allows you to use those same macros in processors:

{{{
#!FooMacro
...
}}}

cheers
James

codeholic said...

Thanks, I'll take a look.

codeholic said...

> Can we not make it simpler and define some assumptions

Yes, I thought about it too...

codeholic said...

pyparsing is too complicated. I found Spark, that's almost what I looked for. I'll try to translate it into JavaScript.

prologic said...

Okay I look forward to it!
I've just finished the latest version of my wiki (I rewrite the UI in mootools): http://124.171.217.91:8000/

Source: http://124.171.217.91:8000/source/

I look forward to a new and flexible creole parser - I especially need Plugin support :)

Would you mind taking a look at my wiki engine ... I (as yet) don't have any idea how I'm going to implement Plugin extensions

--JamesMills

prologic said...

Oh one more thing. I think your code could benefit (very much so) if you were to use the mootools library. Just the core mootools library that provides classes and far nicer ways of extending and writing interfaces.

I don't know enough about your code to help you, but I'll be integrating it into a mootools Class and adding extra bits and pieces to it.

cheers
JamesMills

codeholic said...

An easy way to implement extensions is like this:

if (options && options.extension instanceof Function) {
g.extension = { capture: 1, regex: /<<(.+?)>>/,
build: options.extension };
g.root.children.push(g.extension);
}

Then you can use it like this:

options: {
extension: function(node, r, options) {
data = eval(r[1]);
if (options && options.forIE) {
// workaround for bad IE
data = data.replace(/\n/g, ' \r');
}
node.appendChild(document.createTextNode(data));
}
}

input: "<<'some' + 'text';>>"

output: 'sometext'

prologic said...

I did something very similar :)

prologic said...

I've spent a few hours this morning adding more improvements to your creole parser. Mainly I've added the ability to have pre-processors, eg:
{{{
#!html
<p>Hello World</p>
}}}

etc...

Can we converse via email instead of clogging up your blog ? :)

My email is prologic at shortcircuit dot net dot au

cheers
JamesMills

Mikeumus said...

Wow, this is awesome. I hope more wikis adopt the WikiCreole standard.

Thanks for sharing!

~Mikeumus

Chris said...

Hello !
Heading tags don't seem to work !
Thank you !

Chris said...

In fact it does... sorry !
keep up the good work !

thor said...

I've just come across this parser and it's working great for me. I would like to be able to customise it by adding DIV's though - 3 simple div's with set classes, but having looked at the code for a while I'm still none the wiser as to how to implement that...

Anybody able to give me a few pointers in the right direction?

What I want is:

Markup: << content <<
HTML: <div class="left_panel">content</div>

Markup: >> content >>
HTML: <div class="right_panel">content</div>

Markup: ^^ content ^^
HTML: <div class="horz_panel">content</div>

Any help would be greatly appreciated!

Thanks

codeholic said...

Hello, thor! You can either make a subclass of Parse.Simple.Creole or just make a Parse.Simple.Creole instance and override some grammar rules, e. g.:

g = this.grammar;
g.leftPanel = { tag: 'div', attrs: { class: 'left_panel' }, regex: /(^|\n)[ \t]*<<([\S\s]*?)<</, capture: 2 };
// other rules here ...
g.root.children.push(g.leftPanel, g.rightPanel, g.horzPanel);

DISCLAIMER. The code above hasn't been tested and is not supported. Particularly, it clashes against some proposed WikiCreole standards, see http://wikicreole.org/wiki/CreoleAdditions

thor, you are welcome to publish your final code here (or give a link to an external page) to illustrate how one can add his own grammar rules.

Creoledit said...

Hi folks,

this is a very nice parser for Creole 1.0. I'm using it too and I've written an online editor for the Creole markup language. If you want to add it to your Wiki- or CMS-Software please feel free to visit the Creoledit homepage to get an equivalent editor.

JL said...

Great script! Makes the mercurial wiki setup a lot more worth...

Robert said...

Wow, a level 3 heading in this demo renders as a post title.

Die Reise genItalien said...
This comment has been removed by the author.
What a blog said...

Hey everybody, great tool by the way!

Has any of you created a 'Table of Contents' function?


Would be great :-)