//
// Copyright (c) 2024, Brian Frank and Andy Frank
// Licensed under the Academic Free License version 3.0
//
// History:
// 08 Oct 2024 Matthew Giannini Creation
//
**
** Block parsing state.
**
** Exposes the parser's position within the line currently being processed
** and the deepest open block parser through abstract accessors.
**
@Js
mixin ParserState
{
** The current source line being parsed (full line)
abstract SourceLine? line()
** The current index within the line (0-based)
abstract Int index()
** Get the index of the next non-space character starting from `index`
** (may be the same) (0-based)
abstract Int nextNonSpaceIndex()
** The column is the position within the line after tab characters have been
** processed as 4-space tab stops. If the line doesn't contain any tabs, it's
** the same as the `index`. If the line starts with a tab, followed by text, then
** the column for the first character of the text is 4 (the index is 1)
**
** Returns the current column within the line (0-based)
abstract Int column()
** Get the indentation in columns (either by spaces or tab stop of 4), starting
** from `column`.
abstract Int indent()
** Return true if the current line is blank starting from the `index`
abstract Bool isBlank()
** Get the deepest open block parser
abstract BlockParser activeBlockParser()
}
**
** Parses text into a `Document` AST
**
@Js
internal class DocumentParser : ParserState
{
** The block types built into the parser, listed in the same order in
** which their factories are registered in `core_factories`.
static const Type[] core_block_types := [
  BlockQuote#, Heading#, FencedCode#, HtmlBlock#,
  ThematicBreak#, ListBlock#, IndentedCode#
]

** Lookup from each core block type to the factory that starts that block.
** Populated once by the static initializer below.
static const [Type:BlockParserFactory] core_factories

static
{
  // An ordered map keeps iteration in registration order.
  registry := [Type:BlockParserFactory][:] { ordered = true }
  registry.set(BlockQuote#, BlockQuoteParser.factory)
  registry.set(Heading#, HeadingParser.factory)
  registry.set(FencedCode#, FencedCodeParser.factory)
  registry.set(HtmlBlock#, HtmlBlockParser.factory)
  registry.set(ThematicBreak#, ThematicBreakParser.factory)
  registry.set(ListBlock#, ListBlockParser.factory)
  registry.set(IndentedCode#, IndentedCodeParser.factory)
  DocumentParser.core_factories = registry.toImmutable
}
//////////////////////////////////////////////////////////////////////////
// Constructor
//////////////////////////////////////////////////////////////////////////
** Create a document parser that takes its configuration from `parser`.
new make(Parser parser)
{
  // Configuration is copied from the owning parser.
  this.parser = parser
  this.includeSourceSpans = parser.includeSourceSpans
  this.blockParserFactories = parser.blockParserFactories

  // The document block parser is the first entry of the open-parser
  // stack; it is opened with a source index of 0.
  root := DocumentBlockParser()
  this.documentBlockParser = root
  activateBlockParser(OpenBlockParser(root, 0))
}
** Block parser for the root document; created in the constructor and
** the source of the `Document` returned by 'finalizeAndProcess'
private DocumentBlockParser documentBlockParser
** Definitions collected from block parsers via 'addDefinitionsFrom' and
** handed to the inline parser context in 'processInLines'
private Definitions definitions := Definitions()
//////////////////////////////////////////////////////////////////////////
// Config
//////////////////////////////////////////////////////////////////////////
** The parser this document parser was created from
const Parser parser
** Block parser factories consulted (in list order) when looking for a
** block start; copied from `parser` at construction
const BlockParserFactory[] blockParserFactories
** Source span setting; copied from `parser` at construction
const IncludeSourceSpans includeSourceSpans
** Build the factory list for a parser: all `custom` factories first,
** followed by the core factory of every type in `enabled` (in the order
** given). Returns a new list; the arguments are not modified.
static BlockParserFactory[] calculateBlockParserFactories(BlockParserFactory[] custom, Type[] enabled)
{
  // Custom factories go first so that extensions are able to change the
  // behavior of core syntax.
  factories := BlockParserFactory[,].addAll(custom)
  for (i := 0; i < enabled.size; ++i)
    factories.add(core_factories[enabled[i]])
  return factories
}
** Verify that every entry of `types` is a core block type.
** Throws ArgErr for the first type that has no core factory.
static Void checkEnabledBlockTypes(Type[] types)
{
  for (i := 0; i < types.size; ++i)
  {
    type := types[i]
    if (core_factories.containsKey(type)) continue
    throw ArgErr("Can't enable block type ${type}, possible options are ${core_block_types}")
  }
}
//////////////////////////////////////////////////////////////////////////
// State
//////////////////////////////////////////////////////////////////////////
** The current source line; null until 'setLine' installs the first line
override SourceLine? line := null { private set }
** Current line index (0-based)
private Int lineIndex := -1
** Current index within the line (0-based); reset to 0 by 'setLine'
override Int index := 0 { private set }
** Current column within the line (0-based, tabs expanded to tab stops);
** reset to 0 by 'setLine'
override Int column := 0 { private set }
** Is the current column within a tab character (partially consumed tab)
private Bool columnIsInTab := false
** Index of the next non-space character; updated by 'findNextNonSpace'
override Int nextNonSpaceIndex := 0 { private set } // nextNonSpace
** Column of the next non-space character; updated by 'findNextNonSpace'
private Int nextNonSpaceColumn := 0
** Columns between `column` and the next non-space character; updated by
** 'findNextNonSpace'
override Int indent := 0 { private set }
** True if only spaces/tabs remain from `index`; updated by 'findNextNonSpace'
override Bool isBlank := false { private set }
** The block parser of the last (deepest) entry of the open-parser stack.
override BlockParser activeBlockParser()
{
  deepest := openBlockParsers.last
  return deepest.blockParser
}
** Block parsers that are currently in the "open" state. Used as a stack:
** the last entry is the deepest (active) block parser.
private OpenBlockParser[] openBlockParsers := [,]
** Block parsers that have been closed, in the order they were closed;
** remembered so 'processInLines' can run inline parsing on them.
private BlockParser[] allBlockParsers := [,]
//////////////////////////////////////////////////////////////////////////
// Parse
//////////////////////////////////////////////////////////////////////////
** Parse the input into a document AST.
**
** Reads `in` one line at a time until the end of the stream, feeding every
** line to the block parser. It then closes all open blocks and runs inline
** parsing. This method does not close the stream.
Document parse(InStream in)
{
  Str? line := null
  // Pass null as the max line length. InStream.readLine defaults to 4096
  // characters and returns the remainder on the next call, which would
  // split one long Markdown line into several lines.
  while ((line = in.readLine(null)) != null)
    parseLine(line)
  return finalizeAndProcess
}
** Inspect the current line and update the document accordingly.
**
** Works in three phases:
**  1. Ask each open block parser (outermost first, skipping the document)
**     whether it can continue on this line; stop at the first that cannot.
**  2. Starting from the last matched parser, repeatedly ask the block parser
**     factories whether a new block starts at the current position.
**  3. Hand whatever remains of the line to the appropriate block as text
**     (lazy continuation, existing leaf block, or a new paragraph).
private Void parseLine(Str ln)
{
setLine(ln)
// For each containing block, try to parse the associated line start.
// The document will always match, so we can skip the first block parser
// and start at 1 matches
matches := 1
for (i := 1; i < openBlockParsers.size; ++i)
{
openBlockParser := openBlockParsers[i]
blockParser := openBlockParser.blockParser
findNextNonSpace
blockContinue := blockParser.tryContinue(this)
// we found the first block parser that could *not*
// continue parsing from the current state
if (blockContinue == null) break
// this open block parser matched
// record the index at which it matched on the current line
openBlockParser.sourceIndex = this.index
if (blockContinue.finalize)
{
// the parser asked to be finalized: close it together with every parser
// opened after it, and stop processing this line
addSourceSpans
closeBlockParsers(openBlockParsers.size - i)
return
}
else
{
// move past whatever the continuing block consumed (index takes
// precedence over column; -1 means "not set")
if (blockContinue.newIndex != -1) setNewIndex(blockContinue.newIndex)
else if (blockContinue.newColumn != -1) setNewColumn(blockContinue.newColumn)
++matches
}
}
// number of open parsers that did not continue; they are not closed yet
// because the line may still turn out to be a lazy continuation
unmatchedBlocks := openBlockParsers.size - matches
// the deepest parser that did match
blockParser := openBlockParsers[matches-1].blockParser
startedNewBlock := false
lastIndex := this.index
// unless last matched container is a code block, try new container starts,
// adding children to the last matched container:
tryBlockStarts := blockParser.block is Paragraph || blockParser.isContainer
while (tryBlockStarts)
{
lastIndex = this.index
findNextNonSpace
// this is a little performance optimization
// (skip the factories for a blank line, or for a line indented less than
// a code block whose first non-space character is a letter)
if (isBlank || (indent < Parsing.code_block_indent && Chars.isLetter(this.line.content, nextNonSpaceIndex)))
{
setNewIndex(nextNonSpaceIndex)
break
}
blockStart := findBlockStart(blockParser)
if (blockStart == null)
{
// no factory started a block here: the rest of the line is text
setNewIndex(nextNonSpaceIndex)
break
}
startedNewBlock = true
// index where the new block starts, captured before the position is advanced
sourceIndex := this.index
// We're starting a new block. If we have any previous blocks that need to
// be closed, we need to do it now.
if (unmatchedBlocks > 0)
{
closeBlockParsers(unmatchedBlocks)
unmatchedBlocks = 0
}
// move past whatever the block start consumed (-1 means "not set")
if (blockStart.newIndex != -1)
setNewIndex(blockStart.newIndex)
else if (blockStart.newColumn != -1)
setNewColumn(blockStart.newColumn)
SourceSpan[]? replacedSourceSpans := null
if (blockStart.isReplaceActiveBlockParser)
{
// the new block replaces the active one; keep the replaced block's
// source spans so they can be carried over to the new block(s)
replacedBlock := prepareActiveBlockParserForReplacement
replacedSourceSpans = replacedBlock.sourceSpans
}
blockStart.blockParsers.each |newBlockParser|
{
addChild(OpenBlockParser(newBlockParser, sourceIndex))
if (replacedSourceSpans != null)
newBlockParser.block.setSourceSpans(replacedSourceSpans)
// continue looking for nested block starts only inside containers
blockParser = newBlockParser
tryBlockStarts = newBlockParser.isContainer
}
}
// What remains at the offset is a text line.
// Add the text to the appropriate block.
// First check for a lazy continuation line
if (!startedNewBlock && !isBlank && activeBlockParser.canHaveLazyContinuationLines)
{
// lazy paragraph continuation
openBlockParsers.last.sourceIndex = lastIndex
addLine
}
else
{
// finalize any blocks not matched
if (unmatchedBlocks > 0)
closeBlockParsers(unmatchedBlocks)
if (!blockParser.isContainer)
{
// a non-container block takes the line as content
addLine
}
else if (!isBlank)
{
// create paragraph container for line
paragraphParser := ParagraphParser()
addChild(OpenBlockParser(paragraphParser, lastIndex))
addLine
}
else
{
// This can happen for a list item like this:
// ```
// *
// list item
// ```
//
// The first line does not start a paragraph yet, but we still want to
// record the source position
addSourceSpans
}
}
}
** Move to the next line of input: bump the line counter, rewind the
** cursor to the start of the line and install `ln` (after 'prepareLine')
** as the current source line.
private Void setLine(Str ln)
{
  // Advance the line counter and reset the position within the line.
  this.lineIndex = this.lineIndex + 1
  this.columnIsInTab = false
  this.column = 0
  this.index = 0

  // A source span covering the whole line is only attached when source
  // spans are enabled.
  text := prepareLine(ln)
  SourceSpan? span := includeSourceSpans == IncludeSourceSpans.none ? null : SourceSpan(lineIndex, 0, text.size)
  this.line = SourceLine(text, span)
}
** Scan forward from the current `index`/`column` over spaces and tabs and
** update `nextNonSpaceIndex`, `nextNonSpaceColumn`, `indent` and `isBlank`.
** The current position itself is not moved.
private Void findNextNonSpace()
{
  text := line.content
  size := text.size
  pos := this.index
  col := this.column
  blank := true

  while (pos < size)
  {
    ch := text[pos]
    if (ch == ' ')
    {
      col++
    }
    else if (ch == '\t')
    {
      // a tab moves the column to the next tab stop
      col += Parsing.columnsToNextTabStop(col)
    }
    else
    {
      // found a non-space character, so the line is not blank
      blank = false
      break
    }
    pos++
  }

  this.isBlank = blank
  this.nextNonSpaceIndex = pos
  this.nextNonSpaceColumn = col
  this.indent = col - this.column
}
** Advance the current position to `newIndex` (or to the end of the line
** if that comes first), keeping `column` in sync with tab stops.
private Void setNewIndex(Int newIndex)
{
  if (nextNonSpaceIndex <= newIndex)
  {
    // The target is at or beyond the next non-space character, so jump
    // there directly instead of recalculating tab stops.
    this.column = nextNonSpaceColumn
    this.index = nextNonSpaceIndex
  }

  end := line.content.size
  while (true)
  {
    if (index >= newIndex || index == end) break
    advance
  }

  // Moving to an index (as opposed to a column) never lands inside a tab.
  this.columnIsInTab = false
}
** Advance the current position to column `newColumn` (or to the end of
** the line if that comes first). If the target column falls inside a tab,
** the index stays on that tab and `columnIsInTab` is set.
private Void setNewColumn(Int newColumn)
{
  if (nextNonSpaceColumn <= newColumn)
  {
    // The target is at or beyond the next non-space character, so jump
    // there directly instead of recalculating tab stops.
    this.column = this.nextNonSpaceColumn
    this.index = this.nextNonSpaceIndex
  }

  end := line.content.size
  while (true)
  {
    if (column >= newColumn || index == end) break
    advance
  }

  // If the last character consumed was a tab we may have gone past the
  // target column; step back onto the tab and pin the column to the target.
  overshot := column > newColumn
  if (overshot)
  {
    this.index = this.index - 1
    this.column = newColumn
  }
  this.columnIsInTab = overshot
}
** Consume the character at `index`: move `index` forward by one and
** `column` forward by one, or to the next tab stop when the character
** is a tab.
private Void advance()
{
  ch := line.content[index]
  this.index = index + 1
  step := ch == '\t' ? Parsing.columnsToNextTabStop(column) : 1
  this.column = column + step
}
** Hand the rest of the current line (from `index`) to the active block
** parser. Callers must already have checked that the active block parser
** can accept lines.
private Void addLine()
{
  text := line.content
  Str? content := null
  if (columnIsInTab)
  {
    // The column sits inside a partially consumed tab: replace what is
    // left of that tab (up to the next tab stop) with spaces, then append
    // everything after the tab character.
    pad := Parsing.columnsToNextTabStop(column)
    content = Str.spaces(pad) + text[(index + 1)..-1]
  }
  else
  {
    content = index == 0 ? text : text[index..<text.size]
  }

  // Note that inside a partially consumed tab the length used here is that
  // of the expanded content, not of the actual source text. No test case
  // has been found where that matters (yet).
  SourceSpan? span := includeSourceSpans === IncludeSourceSpans.blocks_and_inlines ? SourceSpan(lineIndex, index, content.size) : null

  activeBlockParser.addLine(SourceLine(content, span))
  addSourceSpans
}
** Record source spans for the current line on the open block parsers.
**
** Does nothing when source spans are disabled ('IncludeSourceSpans.none').
** Recording spans is not implemented yet, so any other setting fails with
** UnsupportedErr the first time a line is processed.
private Void addSourceSpans()
{
  if (includeSourceSpans === IncludeSourceSpans.none) return
  // TODO: add a source span for the current line to each open block parser
  throw UnsupportedErr("Source spans are not implemented yet (includeSourceSpans=${includeSourceSpans}); use IncludeSourceSpans.none")
}
** Ask each block parser factory, in list order, whether a new block starts
** at the current position. Returns the first non-null result, or null if
** no factory starts a block. `blockParser` is the last matched block parser.
private BlockStart? findBlockStart(BlockParser blockParser)
{
  matched := MatchedBlockParser(blockParser)
  for (i := 0; i < blockParserFactories.size; ++i)
  {
    start := blockParserFactories[i].tryStart(this, matched)
    if (start != null) return start
  }
  return null
}
** Run inline parsing: every remembered block parser is given one shared
** inline parser so it can turn its string content into inline nodes
** where appropriate.
private Void processInLines()
{
  inlineParser := DefaultInlineParser(InlineParserContext(parser, definitions))
  for (i := 0; i < allBlockParsers.size; ++i)
    allBlockParsers[i].parseInlines(inlineParser)
}
** Attach the block of `openBlockParser` as a child of the active block and
** make it the new active block parser. Active block parsers that cannot
** contain the new block are closed, one at a time, until one can.
private Void addChild(OpenBlockParser openBlockParser)
{
  child := openBlockParser.blockParser.block
  while (true)
  {
    if (activeBlockParser.canContain(child)) break
    closeBlockParsers(1)
  }
  activeBlockParser.block.appendChild(child)
  activateBlockParser(openBlockParser)
}
** Push `openBlockParser` onto the open-parser stack, making it the active one.
private Void activateBlockParser(OpenBlockParser openBlockParser)
{
  openBlockParsers.push(openBlockParser)
}
** Pop the active (last) entry off the open-parser stack and return it.
private OpenBlockParser deactivateBlockParser()
{
  top := openBlockParsers.pop
  return top
}
** Remove the active block parser because a new block is replacing it.
** The replaced block is closed, unlinked and returned; it is not
** remembered for inline parsing since it is being replaced.
private Block prepareActiveBlockParserForReplacement()
{
  replaced := deactivateBlockParser.blockParser

  // Keep any link reference definitions of a replaced paragraph. The
  // replacement happens after a block parser got the current paragraph
  // content through MatchedBlockParser; if the paragraph started with link
  // reference definitions they were parsed and stripped before that, and
  // we still want them. (When no replacement happens, definitions are
  // collected as part of finalizing blocks.)
  paragraph := replaced as ParagraphParser
  if (paragraph != null) addDefinitionsFrom(paragraph)

  // Close the block so that its source positions are calculated; they are
  // carried over to the replacing block.
  replaced.closeBlock
  replaced.block.unlink
  return replaced.block
}
** Replace every NUL character (U+0000) in `line` with the replacement
** character U+FFFD. Lines without a NUL are returned as-is.
** See § 2.3 - Insecure Characters.
private static Str prepareLine(Str line)
{
  return line.containsChar('\u0000') ? line.replace("\u0000", "\uFFFD") : line
}
//////////////////////////////////////////////////////////////////////////
// Finalize
//////////////////////////////////////////////////////////////////////////
** Close every block parser that is still open, run inline parsing, and
** return the document built by the document block parser.
private Document finalizeAndProcess()
{
  stillOpen := openBlockParsers.size
  closeBlockParsers(stillOpen)
  processInLines
  return documentBlockParser.block
}
** Close the `count` most recently opened block parsers, deepest first.
** Each one is popped, finalized and remembered for inline parsing.
private Void closeBlockParsers(Int count)
{
  for (n := 0; n < count; ++n)
  {
    closed := deactivateBlockParser.blockParser
    finalize(closed)
    // Remember for inline parsing. Note that a lot of blocks don't need
    // inline parsing. We could have a separate mixin (e.g.
    // BlockParserWithInlines) so that we only have to remember those that
    // actually have inlines to parse.
    allBlockParsers.add(closed)
  }
}
** Finalize a block parser: first collect the definitions it exposes
** (e.g. link reference definitions from paragraphs), then close its block.
private Void finalize(BlockParser blockParser)
{
  blockParser.definitions.each |DefinitionMap found|
  {
    definitions.addDefinitions(found)
  }
  blockParser.closeBlock
}
** Merge every definition map exposed by `blockParser` into this
** parser's `definitions`.
private Void addDefinitionsFrom(BlockParser blockParser)
{
  found := blockParser.definitions
  found.each |DefinitionMap entry| { definitions.addDefinitions(entry) }
}
//////////////////////////////////////////////////////////////////////////
// Debug
//////////////////////////////////////////////////////////////////////////
// @NoDoc Void dumpState()
// {
// s := StrBuf()
// s.add("line=${line.content} index=${index} col=${column} nextNonSpaceIndex=${nextNonSpaceIndex} nextNonSpaceColumn=${nextNonSpaceColumn} indent=${indent} isBlank=${isBlank}")
// echo(s)
// }
}
**************************************************************************
** OpenBlockParser
**************************************************************************
@Js
internal class OpenBlockParser
{
  ** Pair `blockParser` with the source index recorded for it.
  new make(BlockParser blockParser, Int sourceIndex)
  {
    this.sourceIndex = sourceIndex
    this.blockParser = blockParser
  }

  ** The wrapped block parser; can only be set from within this class.
  BlockParser blockParser { private set }

  ** Index within the current line associated with this parser. Mutable:
  ** DocumentParser updates it as lines are processed.
  Int sourceIndex

  ** Debug representation showing the block parser and source index.
  override Str toStr()
  {
    return "OpenBlockParser(${blockParser} ${sourceIndex})"
  }
}
**************************************************************************
** MatchedBlockParser
**************************************************************************
**
** Open block parser that was last matched during the continue phase. This
** is different from the currently active block parser, as an unmatched block
** is only closed when a new block is started.
**
@Js
class MatchedBlockParser
{
  ** Wrap the block parser that was last matched.
  new make(BlockParser matchedBlockParser)
  {
    this.matchedBlockParser = matchedBlockParser
  }

  ** The matched block parser; can only be set from within this class.
  BlockParser matchedBlockParser { private set }

  ** The paragraph lines of the matched block parser when it is a
  ** ParagraphParser; otherwise `SourceLines.empty`.
  SourceLines paragraphLines()
  {
    paragraph := matchedBlockParser as ParagraphParser
    return paragraph == null ? SourceLines.empty : paragraph.paragraphLines
  }
}
**************************************************************************
** Parsing
**************************************************************************
@Js
@NoDoc class Parsing
{
  ** Value that `indent` is compared against when deciding whether a line
  ** is indented less than a code block.
  static const Int code_block_indent := 4

  ** Number of columns from `column` to the next tab stop. Tab stops are
  ** every 4 columns; a column already on a tab stop yields 4.
  static Int columnsToNextTabStop(Int column)
  {
    tabStop := 4
    used := column % tabStop
    return tabStop - used
  }
}