Implement code spans. - md4c - C Markdown parser. Fast. SAX-like interface. Compliant to CommonMark specification.

commit a284a382ea5026c456e3cf52834c525adf619fbc
parent 479a117937dc869b2ef3cc685698b53c1f875945
Author: Martin Mitas <mity@morous.org>
Date:   Thu,  6 Oct 2016 23:50:56 +0200

Implement code spans.

Diffstat:
M README.md  | 2 +-
M md2html/md2html.c  | 12 ++++++++++++
M md4c/md4c.c  | 236 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
M md4c/md4c.h  | 2 +-

4 files changed, 214 insertions(+), 38 deletions(-)
diff --git a/README.md b/README.md
@@ -99,7 +99,7 @@ more or less forms our to do list.
 - **Inlines:**
   - [x] 6.1 Backslash escapes
   - [ ] 6.2 Entity and numeric character references
-  - [ ] 6.3 Code spans
+  - [x] 6.3 Code spans
   - [ ] 6.4 Emphasis and strong emphasis
   - [ ] 6.5 Links
   - [ ] 6.6 Images
diff --git a/md2html/md2html.c b/md2html/md2html.c
@@ -181,12 +181,24 @@ leave_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata)
 static int
 enter_span_callback(MD_SPANTYPE type, void* detail, void* userdata)
 {
+    struct membuffer* out = (struct membuffer*) userdata;
+
+    switch(type) {
+        case MD_SPAN_CODE:      MEMBUF_APPEND_LITERAL(out, "<code>"); break;
+    }
+
     return 0;
 }
 
 static int
 leave_span_callback(MD_SPANTYPE type, void* detail, void* userdata)
 {
+    struct membuffer* out = (struct membuffer*) userdata;
+
+    switch(type) {
+        case MD_SPAN_CODE:      MEMBUF_APPEND_LITERAL(out, "</code>"); break;
+    }
+
     return 0;
 }
 
diff --git a/md4c/md4c.c b/md4c/md4c.c
@@ -305,8 +305,8 @@ md_str_case_eq(const CHAR* s1, const CHAR* s2, SZ n)
 /* Structure marking an offset which needs special attention. The type
  * of the attention is determined by the member ch:
  *
- * '\\': Escape sequence.
- *       (beg points to '\\'; beg+1 to the escaped char.)
+ * '\\': Maybe escape sequence.
+ *  '`': Maybe code span start/end.
  *
  * Note that not all instances of these chars in the text imply creation of the
  * structure. Only those which have (or may have, after we see more context)
@@ -315,14 +315,20 @@ md_str_case_eq(const CHAR* s1, const CHAR* s2, SZ n)
 struct MD_MARK_tag {
     OFF beg;
     OFF end;
+
+    /* Index of another mark. Before resolving the member may be used for
+     * arbitrary purpose during the analyzes phase.
+     * For resolved openers, it has to point to the corresponding closer. */
+    int next;
+
     MD_CHAR ch;
     unsigned short flags;
 };
 
 /* Mark flags. */
-#define MD_MARK_RESOLVED    0x0001
-#define MD_MARK_OPENER      0x0002
-#define MD_MARK_CLOSER      0x0004
+#define MD_MARK_RESOLVED    0x0001  /* Yes, the special meaning is indeed recognized. */
+#define MD_MARK_OPENER      0x0002  /* This opens (or potentially may open) a span. */
+#define MD_MARK_CLOSER      0x0004  /* This closes (or potentially may close) a span. */
 
 
 static MD_MARK*
@@ -374,9 +380,6 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
     int ret = 0;
     MD_MARK* mark;
 
-    /* Reset the previously collected stack of marks. */
-    ctx->n_marks = 0;
-
     for(i = 0; i < n_lines; i++) {
         const MD_LINE* line = &lines[i];
         OFF off = line->beg;
@@ -384,14 +387,53 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
 
         while(off < end) {
             CHAR ch = CH(off);
-            /* Analyze backslash escapes.
-             * Note it can go beyond line->end as it may involve
-             * escaped new line to form a hard break. */
+            /* A backslash escape.
+             * It can go beyond line->end as it may involve escaped new
+             * line to form a hard break. */
             if(ch == _T('\\')  &&  off+1 < ctx->size  &&  (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
                 /* Hard-break cannot be on the last line of the block. */
                 if(!ISNEWLINE(off+1)  ||  i+1 < n_lines)
                     PUSH(ch, off, off+2, MD_MARK_RESOLVED);
-                off += 2;
+
+                /* If '`' follows, we need both marks as the backslash may be
+                 * inside a code span. */
+                if(CH(off+1) == _T('`'))
+                    off++;
+                else
+                    off += 2;
+                continue;
+            }
+
+            /* Turn non-trivial whitespace into single space. */
+            if(ISWHITESPACE_(ch)) {
+                OFF tmp = off+1;
+
+                while(tmp < end  &&  ISWHITESPACE(tmp))
+                    tmp++;
+
+                if(tmp - end > 1  ||  ch != _T(' ')) {
+                    PUSH(ch, off, tmp, MD_MARK_RESOLVED);
+                    off = tmp;
+                    continue;
+                }
+            }
+
+            /* A potential code span start/end. */
+            if(ch == _T('`')) {
+                unsigned flags;
+                OFF tmp = off+1;
+
+                /* It may be opener only if it is not escaped. */
+                if(ctx->n_marks > 0  &&  ctx->marks[ctx->n_marks-1].beg == off-1  &&  CH(off-1) == _T('\\'))
+                    flags = MD_MARK_CLOSER;
+                else
+                    flags = MD_MARK_OPENER | MD_MARK_CLOSER;
+
+                while(tmp < end  &&  CH(tmp) == _T('`'))
+                    tmp++;
+                PUSH(ch, off, tmp, flags);
+
+                off = tmp;
                 continue;
             }
 
@@ -409,23 +451,115 @@ abort:
     return ret;
 }
 
+
+/* Table of precedence of various span types. */
+static const CHAR* md_precedence_table[] = {
+    _T("`"),        /* Code spans. */
+    _T("\\")        /* Backslash escapes. */
+};
+
+
+static void
+md_analyze_backtick(MD_CTX* ctx, int mark_index, int* p_unresolved_openers)
+{
+    MD_MARK* mark = &ctx->marks[mark_index];
+    int opener = *p_unresolved_openers;
+
+    /* Try to find unresolved opener of the same length. If we find it,
+     * we form a code span. */
+    while(opener >= 0) {
+        MD_MARK* op = &ctx->marks[opener];
+
+        if(op->end - op->beg == mark->end - mark->beg) {
+            /* Resolve the span. */
+            op->flags = MD_MARK_OPENER | MD_MARK_RESOLVED;
+            mark->flags = MD_MARK_CLOSER | MD_MARK_RESOLVED;
+
+            /* Shorten the list of unresolved openers. */
+            *p_unresolved_openers = op->next;
+
+            /* Make the opener point to us as its closer. */
+            op->next = mark_index;
+
+            /* Cancel any escapes inside the code span. */
+            if(mark_index - opener > 1)
+                memset(ctx->marks + opener + 1, 0, sizeof(MD_MARK) * (mark_index - opener - 1));
+
+            /* Append any space or new line inside the span into the mark itself
+             * to swallow it. */
+            while(CH(op->end) == _T(' ')  ||  ISNEWLINE(op->end))
+                op->end++;
+            while(CH(mark->beg-1) == _T(' ')  ||  ISNEWLINE(mark->beg-1))
+                mark->beg--;
+
+            /* Done. */
+            return;
+        }
+
+        opener = ctx->marks[opener].next;
+    }
+
+    /* We didn't find any matching opener, remember it as a potential opener. */
+    if(mark->flags & MD_MARK_OPENER) {
+        mark->next = *p_unresolved_openers;
+        *p_unresolved_openers = mark_index;
+    }
+}
+
+static void
+md_analyze_marks(MD_CTX* ctx, int precedence_level)
+{
+    const CHAR* mark_chars = md_precedence_table[precedence_level];
+    /* Chain of potential/unresolved code span openers. */
+    int code_span_unresolved_openers = -1;
+    int i = 0;
+
+    while(i < ctx->n_marks) {
+        MD_MARK* mark = &ctx->marks[i];
+
+        /* Skip resolved spans. */
+        if(mark->flags & MD_MARK_RESOLVED) {
+            if(mark->flags & MD_MARK_OPENER)
+                i = mark->next + 1;
+            else
+                i++;
+            continue;
+        }
+
+        /* Skip marks we do not want to deal with. */
+        if(!ISANYOF_(mark->ch, mark_chars)) {
+            i++;
+            continue;
+        }
+
+        /* Analyze the mark. */
+        switch(mark->ch) {
+            case _T('`'):
+                md_analyze_backtick(ctx, i, &code_span_unresolved_openers);
+                break;
+        }
+
+        i++;
+    }
+}
+
 /* Analyze marks (build ctx->marks). */
-static int
+static void
 md_analyze_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
 {
-    int ret = 0;
-
-    MD_CHECK(md_collect_marks(ctx, lines, n_lines));
+    int i;
 
-abort:
-    return ret;
+    for(i = 0; i < SIZEOF_ARRAY(md_precedence_table); i++)
+        md_analyze_marks(ctx, i);
 }
 
 /* Render the output, accordingly to the analyzed ctx->marks. */
 static int
 md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
 {
+    MD_TEXTTYPE text_type;
     const MD_LINE* line = lines;
+    const MD_MARK* prev_mark = NULL;
     const MD_MARK* mark;
     OFF off = lines[0].beg;
     OFF end = lines[n_lines-1].end;
@@ -440,28 +574,45 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
     while(!(mark->flags & MD_MARK_RESOLVED))
         mark++;
 
+    text_type = MD_TEXT_NORMAL;
+
     while(1) {
         /* Process the text up to the next mark or end-of-line. */
         OFF tmp = (line->end < mark->beg ? line->end : mark->beg);
         if(tmp > off) {
-            MD_TEXT(MD_TEXT_NORMAL, STR(off), tmp - off);
+            MD_TEXT(text_type, STR(off), tmp - off);
             off = tmp;
         }
 
         /* If reached the mark, process it and move to next one. */
         if(off >= mark->beg) {
             switch(mark->ch) {
-            case _T('\\'):      /* Backslash escape. */
-                if(ISNEWLINE(mark->beg+1))
-                    enforce_hardbreak = 1;
-                else
-                    MD_TEXT(MD_TEXT_NORMAL, STR(mark->beg+1), 1);
-                break;
+                case _T('\\'):      /* Backslash escape. */
+                    if(ISNEWLINE(mark->beg+1))
+                        enforce_hardbreak = 1;
+                    else
+                        MD_TEXT(text_type, STR(mark->beg+1), 1);
+                    break;
+
+                case _T(' '):       /* Non-trivial space. */
+                    MD_TEXT(text_type, _T(" "), 1);
+                    break;
+
+                case _T('`'):       /* Code span. */
+                    if(mark->flags & MD_MARK_OPENER) {
+                        MD_ENTER_SPAN(MD_SPAN_CODE, NULL);
+                        text_type = MD_TEXT_CODE;
+                    } else {
+                        MD_LEAVE_SPAN(MD_SPAN_CODE, NULL);
+                        text_type = MD_TEXT_NORMAL;
+                    }
+                    break;
             }
 
             off = mark->end;
 
             /* Move to next resolved mark. */
+            prev_mark = mark;
             mark++;
             while(!(mark->flags & MD_MARK_RESOLVED))
                 mark++;
@@ -475,12 +626,23 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
             if(off >= end)
                 break;
 
-            /* Output soft or hard line break. */
-            if(enforce_hardbreak  ||  (CH(line->end) == _T(' ') && CH(line->end+1) == _T(' ')))
-                break_type = MD_TEXT_BR;
-            else
-                break_type = MD_TEXT_SOFTBR;
-            MD_TEXT(break_type, _T("\n"), 1);
+            if(text_type == MD_TEXT_CODE) {
+                /* Inside code spans, new lines are transformed into single
+                 * spaces. */
+                MD_ASSERT(prev_mark != NULL);
+                MD_ASSERT(prev_mark->ch == _T('`')  &&  (prev_mark->flags & MD_MARK_OPENER));
+                MD_ASSERT(mark->ch == _T('`')  &&  (mark->flags & MD_MARK_CLOSER));
+
+                if(prev_mark->end < off  &&  off < mark->beg)
+                    MD_TEXT(MD_SPAN_CODE, _T(" "), 1);
+            } else {
+                /* Output soft or hard line break. */
+                if(enforce_hardbreak  ||  (CH(line->end) == _T(' ') && CH(line->end+1) == _T(' ')))
+                    break_type = MD_TEXT_BR;
+                else
+                    break_type = MD_TEXT_SOFTBR;
+                MD_TEXT(break_type, _T("\n"), 1);
+            }
 
             /* Switch to the following line. */
             line++;
@@ -504,7 +666,13 @@ md_process_normal_block(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
 {
     int ret;
 
-    MD_CHECK(md_analyze_inlines(ctx, lines, n_lines));
+    /* Reset the previously collected stack of marks. */
+    ctx->n_marks = 0;
+
+    MD_CHECK(md_collect_marks(ctx, lines, n_lines));
+
+    md_analyze_inlines(ctx, lines, n_lines);
+
     MD_CHECK(md_process_inlines(ctx, lines, n_lines));
 
 abort:
@@ -616,8 +784,6 @@ md_is_opening_code_fence(MD_CTX* ctx, OFF beg, OFF* p_end)
 {
     OFF off = beg;
 
-    MD_ASSERT(CH(beg) == _T('`') || CH(beg) == _T('~'));
-
     while(off < ctx->size && CH(off) == CH(beg))
         off++;
 
@@ -728,8 +894,6 @@ md_is_html_block_start_condition(MD_CTX* ctx, OFF beg)
     OFF off = beg + 1;
     int i;
 
-    MD_ASSERT(CH(beg) == _T('<'));
-
     /* Check for type 1: <script, <pre, or <style */
     for(i = 0; t1[i].name != NULL; i++) {
         if(off + t1[i].len < ctx->size) {
diff --git a/md4c/md4c.h b/md4c/md4c.h
@@ -89,7 +89,7 @@ enum MD_BLOCKTYPE_tag {
  * like paragraph or list item. */
 typedef enum MD_SPANTYPE_tag MD_SPANTYPE;
 enum MD_SPANTYPE_tag {
-    MD_SPAN_DUMMY = 0       /* not yet used... */
+    MD_SPAN_CODE
 };
 
 /* Text is the actual textual contents of span. */

	md4c C Markdown parser. Fast. SAX-like interface. Compliant to CommonMark specification.
	git clone https://noulin.net/git/md4c.git
	Log \| Files \| Refs \| README \| LICENSE

M	README.md	\|	2	+-
M	md2html/md2html.c	\|	12	++++++++++++
M	md4c/md4c.c	\|	236	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
M	md4c/md4c.h	\|	2	+-