sources/Google.Solutions.Mvvm/Format/MarkdownDocument.cs (538 lines of code) (raw):
//
// Copyright 2023 Google LLC
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
namespace Google.Solutions.Mvvm.Format
{
/// <summary>
/// A Markdown document.
/// </summary>
internal class MarkdownDocument
{
public DocumentNode Root { get; }
private static readonly char[] NonLineBreakingWhitespace = new char[] { ' ', '\t' };
private static readonly char[] UnorderedListBullets = new char[] { '*', '-', '+' };
public static MarkdownDocument Parse(TextReader reader)
{
//
// There's no proper grammar for Markdown, so we can't use a classic
// lexer/parser architecture to read Markdown.
//
// CommonMark suggests a 2-phase parsing approach [1], which is what
// we're using here:
//
// 1. Block-level parsing: We break a document into its major
// blocks: Headings, List items, Spans. Spans are blocks
// that contain formatted text, and these need additional
// processing.
// 2. Span-level-parsing: Each span is parsed to identify links,
// emphasized text, etc.
//
// Note that we're only supporting a subset of Markdown syntax
// features here.
//
// [1] https://spec.commonmark.org/0.30/#appendix-a-parsing-strategy
//
return new MarkdownDocument(DocumentNode.Parse(reader));
}
public static MarkdownDocument Parse(string markdown)
{
using (var reader = new StringReader(markdown))
{
return Parse(reader);
}
}
private MarkdownDocument(DocumentNode root)
{
this.Root = root;
}
public override string ToString()
{
return this.Root.ToString();
}
//---------------------------------------------------------------------
//
// Block-level parsing
//
//---------------------------------------------------------------------
/// <summary>
/// A node in a Markdown document tree.
/// </summary>
public abstract class Node
{
private Node? next;
private Node? firstChild;
protected Node? lastChild;
/// <summary>
/// List direct children of this node.
/// </summary>
public virtual IEnumerable<Node> Children
{
get
{
if (this.firstChild == null)
{
yield break;
}
else
{
for (var block = this.firstChild; block != null; block = block.next)
{
yield return block;
}
}
}
}
/// <summary>
/// Append a child to this node.
/// </summary>
/// <param name="block"></param>
protected void AppendNode(Node block)
{
Debug.Assert((this.firstChild == null) == (this.lastChild == null));
if (this.firstChild == null)
{
Debug.Assert(this.lastChild == null);
this.firstChild = block;
this.lastChild = this.firstChild;
}
else
{
Debug.Assert(this.lastChild != null);
this.lastChild!.next = block;
this.lastChild = block;
}
}
/// <summary>
/// Create a new node for a given line. Only nodes that
/// are allowed as children are considered.
protected virtual Node CreateNode(string line)
{
if (UnorderedListItemNode.IsUnorderedListItemNode(line))
{
return new UnorderedListItemNode(line);
}
else if (OrderedListItemNode.IsOrderedListItemNode(line))
{
return new OrderedListItemNode(line);
}
else
{
return new SpanNode(line);
}
}
/// <summary>
/// Try to extend the node by consuming an additional line.
/// <returns>true if line was consumed, false otherwise.</returns>
protected virtual bool TryConsume(string line)
{
if (this.lastChild != null && this.lastChild.TryConsume(line))
{
//
// Continuation of last block.
//
return true;
}
else if (string.IsNullOrWhiteSpace(line))
{
//
// An empty line always ends a block, but does
// not start a new one yet.
//
AppendNode(new ParagraphBreak());
return false;
}
else
{
//
// Last block is closed, append a new block.
//
AppendNode(CreateNode(line));
return true;
}
}
/// <summary>
/// Summary string describing this node.
/// </summary>
protected abstract string Summary { get; }
/// <summary>
/// Create string representation of this node and its
/// children.
/// </summary>
public override string ToString()
{
var buffer = new StringBuilder();
void Visit(Node block, int level)
{
buffer.Append(new string(' ', level));
buffer.Append(block.Summary);
buffer.Append('\n');
foreach (var child in block.Children)
{
Visit(child, level + 1);
}
}
Visit(this, 0);
return buffer.ToString(); ;
}
}
/// <summary>
/// A break between two pararaphs, typically created by an
/// empty line.
/// </summary>
public class ParagraphBreak : Node
{
protected override string Summary => "[ParagraphBreak]";
protected override bool TryConsume(string line)
{
return false;
}
}
/// <summary>
/// Heading.
/// </summary>
public class HeadingNode : Node
{
public int Level { get; }
public string Text { get; }
public static bool IsHeadingNode(string line)
{
var index = line.IndexOfAny(NonLineBreakingWhitespace);
return index > 0 && line.Substring(0, index).All(c => c == '#');
}
public HeadingNode(string line)
{
Debug.Assert(IsHeadingNode(line));
var whitespaceIndex = line.IndexOfAny(NonLineBreakingWhitespace);
this.Level = line.Substring(0, whitespaceIndex).Count();
this.Text = line.Substring(whitespaceIndex).Trim();
}
protected override bool TryConsume(string line)
{
//
// Headings are always single-line.
//
return false;
}
protected override string Summary => $"[Heading level={this.Level}] {this.Text}";
}
/// <summary>
/// Ordered list item.
/// </summary>
public class OrderedListItemNode : Node
{
private readonly string indent;
public static bool IsOrderedListItemNode(string line)
{
var dotIndex = line.IndexOf('.');
return dotIndex > 0 &&
dotIndex < line.Length - 1 &&
line[dotIndex + 1] == ' ' &&
line.Substring(0, dotIndex).All(char.IsDigit);
}
public OrderedListItemNode(string line)
{
Debug.Assert(IsOrderedListItemNode(line));
//
// Determine the necessary indent for subsequent
// lines.
//
var indent = line.IndexOf(' ');
while (indent < line.Length && line[indent] == ' ')
{
indent++;
}
this.indent = new string(' ', indent);
AppendNode(new SpanNode(line.Substring(indent)));
}
protected override bool TryConsume(string line)
{
if (string.IsNullOrEmpty(line))
{
AppendNode(new ParagraphBreak());
return true;
}
else if (!line.StartsWith(this.indent))
{
//
// Line doesn't have the minimum amount of indentation,
// so it can't be a continuation.
//
// NB. We don't support lazy continations.
//
return false;
}
else
{
return base.TryConsume(line.Substring(this.indent.Length));
}
}
protected override string Summary
=> $"[OrderedListItem indent={this.indent.Length}]";
}
/// <summary>
/// Unodered list item.
/// </summary>
public class UnorderedListItemNode : Node
{
private readonly string indent;
private readonly char bullet;
public static bool IsUnorderedListItemNode(string line)
{
return line.Length >= 3 &&
UnorderedListBullets.Contains(line[0]) &&
NonLineBreakingWhitespace.Contains(line[1]);
}
public UnorderedListItemNode(string line)
{
Debug.Assert(IsUnorderedListItemNode(line));
this.bullet = line[0];
//
// Determine the necessary indent for subsequent
// lines.
//
var indent = 1;
while (indent < line.Length && line[indent] == ' ')
{
indent++;
}
this.indent = new string(' ', indent);
AppendNode(new SpanNode(line.Substring(indent)));
}
protected override bool TryConsume(string line)
{
if (string.IsNullOrWhiteSpace(line))
{
AppendNode(new ParagraphBreak());
return true;
}
else if (!line.StartsWith(this.indent))
{
//
// Line doesn't have the minimum amount of indentation,
// so it can't be a continuation.
//
// NB. We don't support lazy continations.
//
return false;
}
else
{
return base.TryConsume(line.Substring(this.indent.Length));
}
}
protected override string Summary
=> $"[UnorderedListItem bullet={this.bullet} indent={this.indent.Length}]";
}
/// <summary>
/// Document, this forms the root of the tree.
/// </summary>
public class DocumentNode : Node
{
protected override Node CreateNode(string line)
{
if (HeadingNode.IsHeadingNode(line))
{
return new HeadingNode(line);
}
else
{
return base.CreateNode(line);
}
}
protected override string Summary => "[Document]";
public static DocumentNode Parse(TextReader reader)
{
var document = new DocumentNode();
while (true)
{
var line = reader.ReadLine();
if (line == null)
{
break;
}
else
{
document.TryConsume(line);
}
}
return document;
}
}
//---------------------------------------------------------------------
//
// Span-level parsing
//
//---------------------------------------------------------------------
internal enum TokenType
{
Text,
Delimiter
}
/// <summary>
/// A token within a text span. Tokens are separated by delimiters.
/// Delimiters may indicate a new kind of span, but sometimes they
/// can just be text (for ex, a * in the middle of a sentence doesn't
/// begin an emphasis).
/// </summary>
internal class Token
{
public TokenType Type { get; }
public string Value { get; }
internal Token(TokenType type, string value)
{
Debug.Assert(type != TokenType.Delimiter || value.Length <= 2);
this.Type = type;
this.Value = value;
}
public static IEnumerable<Token> Tokenize(string text)
{
var textStart = -1;
for (var i = 0; i < text.Length; i++)
{
switch (text[i])
{
case '*':
{
if (textStart >= 0 && i - textStart > 0)
{
//
// Flush previous text token, if non-empty.
//
yield return new Token(TokenType.Text, text.Substring(textStart, i - textStart));
textStart = -1;
}
if (i + 1 < text.Length && text[i + 1] == '*')
{
i++;
yield return new Token(TokenType.Delimiter, "**");
}
else
{
yield return new Token(TokenType.Delimiter, "*");
}
break;
}
case '_':
case '`':
case '[':
case ']':
case '(':
case ')':
//
// Delimiter.
//
if (textStart >= 0 && i - textStart > 0)
{
//
// Flush previous text token, if non-empty.
//
yield return new Token(TokenType.Text, text.Substring(textStart, i - textStart));
textStart = -1;
}
yield return new Token(TokenType.Delimiter, text[i].ToString());
break;
default:
//
// Text.
//
if (textStart == -1)
{
textStart = i;
}
break;
}
}
if (textStart >= 0)
{
yield return new Token(TokenType.Text, text.Substring(textStart));
}
}
public override string ToString()
{
return $"{this.Type}: {this.Value}";
}
public override bool Equals(object obj)
{
return obj is Token token &&
token.Type == this.Type &&
token.Value == this.Value;
}
public static bool operator ==(Token lhs, Token rhs)
{
if (lhs is null)
{
return rhs is null;
}
else
{
return lhs.Equals(rhs);
}
}
public static bool operator !=(Token lhs, Token rhs) => !(lhs == rhs);
public override int GetHashCode()
{
return this.Value.GetHashCode();
}
}
/// <summary>
/// Container for formatted text.
/// </summary>
public class SpanNode : Node
{
protected override string Summary => $"[Span]";
internal SpanNode(string text)
{
TryConsume(text);
}
protected SpanNode()
{
}
protected SpanNode CreateSpanNode(Token token, IEnumerable<Token> remainder)
{
if (token.Type == TokenType.Text)
{
return new TextNode(token.Value, true);
}
if ((token.Value == "_" || token.Value == "*" || token.Value == "**" || token.Value == "`") &&
remainder.FirstOrDefault() is Token next &&
next != null! &&
next.Type == TokenType.Text &&
next.Value.Length >= 1 &&
!NonLineBreakingWhitespace.Contains(next.Value[0]))
{
return new EmphasisNode(token.Value);
}
else if (token.Value == "[" &&
remainder
.SkipWhile(t => t != new Token(TokenType.Delimiter, "]"))
.Skip(1)
.FirstOrDefault() == new Token(TokenType.Delimiter, "("))
{
return new LinkNode();
}
else
{
return new TextNode(token.Value, false);
}
}
protected override sealed bool TryConsume(string line)
{
if (this.lastChild != null && string.IsNullOrWhiteSpace(line))
{
//
// A blank line indicates a new paragraph, so we must
// not consume this.
//
return false;
}
else if (this.lastChild != null)
{
//
// Inject a space to compensate for the line break.
//
line = " " + line;
}
//
// Break the line into tokens and build a tree
// that represents the formatting.
//
var tokens = Token.Tokenize(line);
while (tokens.Any())
{
var token = tokens.First();
var remainder = tokens.Skip(1);
TryConsumeToken(token, remainder);
tokens = remainder;
}
return true;
}
protected virtual bool TryConsumeToken(Token token, IEnumerable<Token> remainder)
{
if (this.lastChild != null &&
((SpanNode)this.lastChild).TryConsumeToken(token, remainder))
{
//
// Continuation of last span.
//
return true;
}
else
{
//
// Last block is closed, append a new block.
//
AppendNode(CreateSpanNode(token, remainder));
return true;
}
}
}
/// <summary>
/// Unformatted text.
/// </summary>
public class TextNode : SpanNode
{
private readonly bool space;
public string Text { get; protected set; }
protected override string Summary => $"[Text] {this.Text}";
public TextNode(string text, bool space)
{
this.Text = text;
this.space = space;
}
protected override bool TryConsumeToken(Token token, IEnumerable<Token> remainder)
{
if (token.Type == TokenType.Delimiter)
{
return false;
}
else
{
if (this.space)
{
this.Text = this.Text + token.Value;
}
else
{
this.Text += token.Value;
}
return true;
}
}
}
/// <summary>
/// Strong or normal emphasis
/// </summary>
public class EmphasisNode : SpanNode
{
private readonly string delimiter;
private bool bodyCompleted = false;
public string Text { get; protected set; } = string.Empty;
public bool IsStrong => this.delimiter == "**";
public bool IsCode => this.delimiter == "`";
public EmphasisNode(string delimiter)
{
this.delimiter = delimiter;
}
protected override string Summary => $"[Emphasis delimiter={this.delimiter}] {this.Text}";
protected override bool TryConsumeToken(Token token, IEnumerable<Token> remainder)
{
if (this.bodyCompleted)
{
return false;
}
else if (token.Type == TokenType.Delimiter && token.Value == this.delimiter)
{
//
// Eat the delimiter.
//
this.bodyCompleted = true;
return true;
}
else if (token.Type == TokenType.Text)
{
this.Text += token.Value;
return true;
}
else
{
return false;
}
}
}
/// <summary>
/// Link. Links can contain formatted text.
/// </summary>
public class LinkNode : SpanNode
{
private bool linkBodyCompleted = false;
private bool linkHrefCompleted = false;
protected override string Summary => $"[Link href={this.Href}]";
public string Href { get; protected set; } = string.Empty;
protected override bool TryConsumeToken(Token token, IEnumerable<Token> remainder)
{
if (this.linkHrefCompleted)
{
//
// Link completed.
//
return false;
}
else if (this.linkBodyCompleted)
{
//
// Building the link href.
//
if (this.Href == string.Empty && token == new Token(TokenType.Delimiter, "("))
{
return true;
}
else if (token == new Token(TokenType.Delimiter, ")"))
{
this.linkHrefCompleted = true;
return true;
}
else
{
this.Href += token.Value;
return true;
}
}
else
{
//
// Building the link body/text.
//
if (token == new Token(TokenType.Delimiter, "]"))
{
this.linkBodyCompleted = true;
return true;
}
else
{
return base.TryConsumeToken(token, remainder);
}
}
}
}
}
}