diff --git a/Sources/CodeParserCollection/Markdown/MarkdownConstructState.swift b/Sources/CodeParserCollection/Markdown/MarkdownConstructState.swift index a09e578..126e974 100644 --- a/Sources/CodeParserCollection/Markdown/MarkdownConstructState.swift +++ b/Sources/CodeParserCollection/Markdown/MarkdownConstructState.swift @@ -1,51 +1,25 @@ import CodeParserCore import Foundation -/// Main construction state for Markdown language with line-based processing +/// Minimal construction state for Markdown language +/// Only contains state that cannot be derived from the AST (context.current) public class MarkdownConstructState: CodeConstructState { public typealias Node = MarkdownNodeElement public typealias Token = MarkdownTokenElement - // Current token index in the line - public var position: Int = 0 - // Flag indicates if the block builders should run another round on the same line. - public var refreshed: Bool = false - // Flag indicates if the current line is being reprocessed after partial consumption - public var isPartialLine: Bool = false - - // Fenced code block state - public var openFence: OpenFenceInfo? - - // HTML block state - public var openHTMLBlock: OpenHTMLBlockInfo? - - /// Stack for nested list processing - public var listStack: [ListNode] = [] - public var currentDefinitionList: DefinitionListNode? - - /// Enhanced list context tracking for better indentation and nesting management - public var listContextStack: [ListContextInfo] = [] - - /// Indicates the last consumed line break formed a blank line (two or more consecutive newlines) - public var lastWasBlankLine: Bool = false - - /// When a quoted blank line (`>\\n`) is seen inside a blockquote, the next quoted - /// content should start a new paragraph inside the same blockquote instead of - /// merging into the previous one. - public var pendingBlockquoteParagraphSplit: Bool = false - - /// True when the previous quoted line (inside a blockquote) began with a token - /// that could start a block (e.g., `#`, `-`, `*`, `+`, number.). We use this to - /// prevent merging the next quoted line into the same paragraph, matching CommonMark - /// semantics where block-starting constructs introduce a new block. - public var prevBlockquoteLineWasBlockStart: Bool = false - /// Reference link definitions storage for resolving reference links /// Key is normalized reference identifier (case-insensitive, whitespace collapsed) + /// Note: This cannot be derived from AST since reference definitions may appear + /// anywhere in the document and need to be available for link resolution public var referenceDefinitions: [String: (url: String, title: String)] = [:] - - /// Pending reference link definition being parsed across multiple lines - public var pendingReference: PendingReferenceDefinition? + + /// Current line tokens being processed - builders can modify these + /// This allows builders to consume their part and leave remaining tokens for further processing + public var tokens: [any CodeToken] = [] + + /// Flag indicating if current line has been fully processed by a builder + /// When false, MarkdownBlockBuilder should continue processing the remaining tokens + public var currentLineProcessed: Bool = true public init() {} @@ -72,85 +46,3 @@ public class MarkdownConstructState: CodeConstructState { .trimmingCharacters(in: .whitespacesAndNewlines) } } - -/// Information about a pending reference link definition being parsed across multiple lines -public struct PendingReferenceDefinition { - public let identifier: String - public let referenceNode: ReferenceNode - public var hasDestination: Bool - public var hasTitle: Bool - public let originalLineTokens: [any CodeToken] // For fallback to paragraph - - public init(identifier: String, referenceNode: ReferenceNode, originalLineTokens: [any CodeToken]) { - self.identifier = identifier - self.referenceNode = referenceNode - self.hasDestination = false - self.hasTitle = false - self.originalLineTokens = originalLineTokens - } -} - -/// Information about an open fenced code block -public struct OpenFenceInfo { - public let character: String - public let length: Int - public let indentation: Int - public let codeBlock: CodeBlockNode - - public init(character: String, length: Int, indentation: Int, codeBlock: CodeBlockNode) { - self.character = character - self.length = length - self.indentation = indentation - self.codeBlock = codeBlock - } -} - -/// Information about an open HTML block -public struct OpenHTMLBlockInfo { - public let type: Int // HTML block type (1-7) - public let endCondition: String? // What string ends this block - public let htmlBlock: HTMLBlockNode - - public init(type: Int, endCondition: String?, htmlBlock: HTMLBlockNode) { - self.type = type - self.endCondition = endCondition - self.htmlBlock = htmlBlock - } -} - -/// Information about detected HTML block type -public struct HTMLBlockTypeInfo { - public let type: Int - public let name: String - public let closedOnSameLine: Bool - public let endCondition: String? - - public init(type: Int, name: String, closedOnSameLine: Bool, endCondition: String? = nil) { - self.type = type - self.name = name - self.closedOnSameLine = closedOnSameLine - self.endCondition = endCondition - } -} - -/// Enhanced list context information for better nesting and indentation management -public struct ListContextInfo { - /// The list node itself - public let list: ListNode - /// The parent list item that contains this list (nil for top-level lists) - public let parentListItem: ListItemNode? - /// The calculated indentation level for content in this list context - public let contentIndent: Int - /// The nesting level (1 for top-level, 2 for first nested, etc.) - public let level: Int - /// The marker type for compatibility checking - public let markerType: String - - public init(list: ListNode, parentListItem: ListItemNode?, contentIndent: Int, level: Int, markerType: String) { - self.list = list - self.parentListItem = parentListItem - self.contentIndent = contentIndent - self.level = level - self.markerType = markerType - } -} diff --git a/Sources/CodeParserCollection/Markdown/MarkdownNodes.swift b/Sources/CodeParserCollection/Markdown/MarkdownNodes.swift index 85ab00d..fa7bdab 100644 --- a/Sources/CodeParserCollection/Markdown/MarkdownNodes.swift +++ b/Sources/CodeParserCollection/Markdown/MarkdownNodes.swift @@ -110,13 +110,16 @@ public class DocumentNode: MarkdownNodeBase { } // MARK: - Block Elements -public class ParagraphNode: MarkdownNodeBase { +public class ParagraphNode: MarkdownNodeBase, MarkdownBlockNode { + public var blockType: String { "paragraph" } + public init(range: Range) { super.init(element: .paragraph) } } -public class HeaderNode: MarkdownNodeBase { +public class HeaderNode: MarkdownNodeBase, MarkdownBlockNode { + public var blockType: String { "heading" } public var level: Int public init(level: Int) { @@ -130,7 +133,8 @@ public class HeaderNode: MarkdownNodeBase { } } -public class ThematicBreakNode: MarkdownNodeBase { +public class ThematicBreakNode: MarkdownNodeBase, MarkdownBlockNode { + public var blockType: String { "thematic_break" } public var marker: String public init(marker: String = "---") { @@ -144,8 +148,14 @@ public class ThematicBreakNode: MarkdownNodeBase { } } -public class BlockquoteNode: MarkdownNodeBase { +public class BlockquoteNode: MarkdownNodeBase, MarkdownBlockNode { + public var blockType: String { "blockquote" } public var level: Int + + // Package-level indentation properties for nested block parsing + package var indent: Int = 0 // Number of spaces before the '>' marker + package var markerColumn: Int = 0 // Column position of the '>' marker + package var contentColumn: Int = 0 // Column position where content starts after '> ' public init(level: Int = 1) { self.level = level @@ -198,11 +208,17 @@ public class UnorderedListNode: ListNode { } } -public class ListItemNode: MarkdownNodeBase { +public class ListItemNode: MarkdownNodeBase, MarkdownBlockNode { + public var blockType: String { "list_item" } public var marker: String // indentation before marker and content indent column for continuation public var markerIndent: Int = 0 public var contentIndent: Int = 0 + + // Package-level properties for enhanced nested block parsing + package var markerColumn: Int = 0 // Exact column position of the marker + package var contentColumn: Int = 0 // Exact column position where content starts + package var markerLength: Int = 0 // Length of the marker (e.g., "1." = 2, "-" = 1) public init(marker: String) { self.marker = marker @@ -215,9 +231,13 @@ public class ListItemNode: MarkdownNodeBase { } } -public class CodeBlockNode: MarkdownNodeBase { +public class CodeBlockNode: MarkdownNodeBase, MarkdownBlockNode { + public var blockType: String { "code_block" } public var language: String? public var source: String + + // Package-level indentation properties for nested block parsing + package var indent: Int = 0 // Number of spaces before the code block public init(source: String, language: String? = nil) { self.language = language @@ -635,3 +655,12 @@ public class ContentNode: MarkdownNodeBase { super.init(element: .content) } } + +// MARK: - Type Aliases for Block Builders +public typealias MarkdownHeading = HeaderNode +public typealias MarkdownThematicBreak = ThematicBreakNode +public typealias MarkdownText = TextNode +public typealias MarkdownParagraph = ParagraphNode +public typealias MarkdownBlockquote = BlockquoteNode +public typealias MarkdownLineBreak = LineBreakNode +public typealias MarkdownListItem = ListItemNode diff --git a/Sources/CodeParserCollection/Markdown/Nodes/MarkdownATXHeadingBuilder.swift b/Sources/CodeParserCollection/Markdown/Nodes/MarkdownATXHeadingBuilder.swift index cf3f341..3123089 100644 --- a/Sources/CodeParserCollection/Markdown/Nodes/MarkdownATXHeadingBuilder.swift +++ b/Sources/CodeParserCollection/Markdown/Nodes/MarkdownATXHeadingBuilder.swift @@ -1,150 +1,196 @@ import CodeParserCore import Foundation -/// Handles ATX headings (# through ######) -/// CommonMark Spec: https://spec.commonmark.org/0.31.2/#atx-headings -public class MarkdownATXHeadingBuilder: CodeNodeBuilder { - public typealias Node = MarkdownNodeElement - public typealias Token = MarkdownTokenElement - +/// Builder for ATX headings (# heading, ## heading, etc.) +/// Implements CommonMark specification for ATX headings (Spec 011) +public class MarkdownATXHeadingBuilder: MarkdownBlockBuilderProtocol { + + private let inlineProcessor = MarkdownInlineProcessor() + public init() {} - - public func build(from context: inout CodeConstructContext) -> Bool { - guard context.state is MarkdownConstructState else { + + public func canStart(line: MarkdownLine) -> Bool { + // ATX headings can be indented 0-3 spaces + let leadingSpaces = line.leadingWhitespace + if leadingSpaces > 3 { return false } - - // In phased pipeline, builders receive the suffix tokens; always start at local 0 - let startIndex = 0 - guard startIndex < context.tokens.count else { - return false + + // Check tokens directly for escaped content + var hashCount = 0 + var tokenIndex = 0 + + // Skip leading whitespace token + if tokenIndex < line.tokens.count && line.tokens[tokenIndex].element == .whitespaces { + tokenIndex += 1 } - - // Check for optional indentation (0-3 spaces only) - var currentIndex = startIndex - var indentationSpaces = 0 - - if currentIndex < context.tokens.count, - context.tokens[currentIndex].element == .whitespaces { - // Count spaces in the whitespace token - for char in context.tokens[currentIndex].text { - if char == " " { - indentationSpaces += 1 - } else if char == "\t" { - // Tab counts as up to 4 spaces for indentation - indentationSpaces += 4 - } - } - - // ATX headings allow 0-3 spaces of indentation - // 4 or more spaces creates an indented code block instead - if indentationSpaces >= 4 { + + // Check for escaped hash at start + if tokenIndex < line.tokens.count { + let token = line.tokens[tokenIndex] + // If it's a characters token starting with #, it was likely escaped + if token.element == .characters && token.text.hasPrefix("#") { return false } - - // Move past the whitespace token - currentIndex += 1 } - - // Check for opening hash sequence - var hashCount = 0 - - // Count consecutive # characters - while currentIndex < context.tokens.count, - context.tokens[currentIndex].element == .punctuation, - context.tokens[currentIndex].text == "#" { - hashCount += 1 - currentIndex += 1 - - // ATX headings support levels 1-6 only - if hashCount > 6 { - return false + + // Count hash tokens + while tokenIndex < line.tokens.count { + let token = line.tokens[tokenIndex] + if token.element == .punctuation && token.text == "#" { + hashCount += 1 + tokenIndex += 1 + } else { + break } } - - // Must have at least one # and at most 6 - guard hashCount >= 1 && hashCount <= 6 else { + + // Must have 1-6 # characters + if hashCount < 1 || hashCount > 6 { return false } - - // Check what follows the hashes - if currentIndex >= context.tokens.count { - // End of line - valid heading with empty content - } else if context.tokens[currentIndex].element == .newline { - // Newline after hashes - valid empty heading - } else if context.tokens[currentIndex].element == .whitespaces { - // Space after hashes - consume it - currentIndex += 1 - } else { - // No space and not end of line - not a valid ATX heading - return false + + // After the hashes, must be either end of line or whitespace + if tokenIndex >= line.tokens.count { + return true // Just hashes at end of line, valid empty heading } - - // If we're in a paragraph context, close it first (ATX headings can interrupt paragraphs) - if context.current.element == .paragraph { - if let parent = context.current.parent { - context.current = parent + + // Check next token after hashes + let nextToken = line.tokens[tokenIndex] + if nextToken.element == .whitespaces || nextToken.element == .newline || nextToken.element == .eof { + return true + } + + // If next token is not whitespace, it's not a valid heading + return false + } + + public func canContinue(block: any MarkdownBlockNode, line: MarkdownLine) -> Bool { + // ATX headings are single-line blocks - they cannot continue + return false + } + + public func createBlock(from line: MarkdownLine) -> (any MarkdownBlockNode)? { + // Extract level by counting hash tokens + var level = 0 + var tokenIndex = 0 + + // Skip leading whitespace token + if tokenIndex < line.tokens.count && line.tokens[tokenIndex].element == .whitespaces { + tokenIndex += 1 + } + + // Count hash tokens + while tokenIndex < line.tokens.count { + let token = line.tokens[tokenIndex] + if token.element == .punctuation && token.text == "#" { + level += 1 + tokenIndex += 1 + } else { + break } } - + + guard level >= 1 && level <= 6 else { return nil } + // Create heading node - let heading = HeaderNode(level: hashCount) - context.current.append(heading) - - // Collect content tokens (everything after opening sequence, excluding newline) - var contentTokens: [any CodeToken] = [] - - // Find end of content (before newline or EOF) - var contentEnd = context.tokens.count - for i in currentIndex.. 0 && remainingTokens[endIndex - 1].element == .whitespaces { - endIndex -= 1 + + return heading + } + + /// Extract content tokens after hash markers and leading whitespace + private func extractContentTokens(from tokens: [any CodeToken], level: Int) -> [any CodeToken] { + var resultTokens: [any CodeToken] = [] + var hashCount = 0 + var index = 0 + + // Skip hash tokens + while index < tokens.count && hashCount < level { + let token = tokens[index] + if token.element == .punctuation && token.text == "#" { + hashCount += 1 + index += 1 + } else { + break } - - // Then look for trailing # characters - var trailingHashStart = endIndex - while trailingHashStart > 0, - remainingTokens[trailingHashStart - 1].element == .punctuation, - remainingTokens[trailingHashStart - 1].text == "#" { - trailingHashStart -= 1 + } + + // Skip one whitespace token if present + if index < tokens.count && tokens[index].element == .whitespaces { + index += 1 + } + + // Collect remaining tokens (except EOF) + while index < tokens.count { + let token = tokens[index] + if token.element != .eof && token.element != .newline { + resultTokens.append(token) } - - // If we found trailing hashes, check if they're preceded by whitespace or at start - if trailingHashStart < endIndex { - if trailingHashStart == 0 { - // All remaining content is hashes - empty heading - contentTokens = [] - } else if remainingTokens[trailingHashStart - 1].element == .whitespaces { - // Whitespace before trailing hashes - remove the whitespace and hashes - contentTokens = Array(remainingTokens[0..<(trailingHashStart - 1)]) - } else { - // No whitespace before hashes - they're part of content, include everything up to endIndex - contentTokens = Array(remainingTokens[0..]) -> [any CodeToken] { + var result = tokens + + // Work backwards to find trailing hash tokens + var foundClosingHashes = false + var trailingHashCount = 0 + + // First pass: count trailing hashes + var index = result.count - 1 + while index >= 0 { + let token = result[index] + if token.element == .punctuation && token.text == "#" { + trailingHashCount += 1 + foundClosingHashes = true + index -= 1 + } else if token.element == .whitespaces && foundClosingHashes { + // Whitespace before closing hashes + index -= 1 + break } else { - // No trailing hashes, but strip trailing whitespace - contentTokens = Array(remainingTokens[0..= 0 && trailingHashCount > 0 { + // Check if the content before the whitespace and hashes is valid + let beforeWhitespace = index + if beforeWhitespace >= 0 { + // Remove the trailing hashes and the whitespace before them + let removeCount = trailingHashCount + 1 // +1 for whitespace + let newCount = max(0, result.count - removeCount) + result = Array(result[0.. Bool { + // ATX headings are single-line blocks, no processing needed + return false } } \ No newline at end of file diff --git a/Sources/CodeParserCollection/Markdown/Nodes/MarkdownBlockBuilder.swift b/Sources/CodeParserCollection/Markdown/Nodes/MarkdownBlockBuilder.swift index 1518b96..3fe0719 100644 --- a/Sources/CodeParserCollection/Markdown/Nodes/MarkdownBlockBuilder.swift +++ b/Sources/CodeParserCollection/Markdown/Nodes/MarkdownBlockBuilder.swift @@ -1,189 +1,412 @@ import CodeParserCore import Foundation -/// Main block-level builder that handles line-by-line processing following CommonMark - GFM spec -/// Organizes tokens into logical lines and delegates to specialized CodeNodeBuilder instances +/// MarkdownBlockBuilder - The main CodeNodeBuilder implementation for Markdown +/// +/// This class processes Markdown tokens using the AST (context.current) as the editable single source of truth. +/// It directly consumes tokens and modifies the AST structure, delegating block-specific logic to pluggable builders. +/// +/// Individual block builders are pluggable through MarkdownBlockBuilderProtocol and contain no grammar-related logic. public class MarkdownBlockBuilder: CodeNodeBuilder { public typealias Node = MarkdownNodeElement public typealias Token = MarkdownTokenElement - - // Phased block parsing - private enum BlockPhase: CaseIterable { case openContainer, leafOnLine, postParagraph } - - private struct BlockRule { - let builder: any CodeNodeBuilder - let phase: BlockPhase - let priority: Int + + private let blockBuilders: [MarkdownBlockBuilderProtocol] + + /// Initialize with custom block builders (pluggable architecture) + public init(blockBuilders: [MarkdownBlockBuilderProtocol]) { + self.blockBuilders = blockBuilders } - - private let rulesByPhase: [BlockPhase: [BlockRule]] - - public init() { - // Declare rules with explicit phase and priority (lower number runs earlier within phase) - let rules: [BlockRule] = [ - // Open containers first (strip markers, reprocess line) - .init(builder: MarkdownBlockQuoteBuilder(), phase: .openContainer, priority: 10), - .init(builder: MarkdownUnifiedListBuilder(), phase: .openContainer, priority: 20), - - // Leaf on line - .init(builder: MarkdownEOFBuilder(), phase: .leafOnLine, priority: 0), - .init(builder: MarkdownReferenceLinkDefinitionBuilder(), phase: .leafOnLine, priority: 5), - .init(builder: MarkdownFencedCodeBlockBuilder(), phase: .leafOnLine, priority: 10), - .init(builder: MarkdownATXHeadingBuilder(), phase: .leafOnLine, priority: 20), - .init(builder: MarkdownThematicBreakBuilder(), phase: .leafOnLine, priority: 30), - .init(builder: MarkdownSetextHeadingBuilder(), phase: .leafOnLine, priority: 32), - .init(builder: MarkdownHTMLBlockBuilder(), phase: .leafOnLine, priority: 35), - .init(builder: MarkdownIndentedCodeBlockBuilder(), phase: .leafOnLine, priority: 40), - .init(builder: MarkdownParagraphBuilder(), phase: .leafOnLine, priority: 1000), // fallback - - // Post paragraph (needs previous paragraph context) - .init(builder: MarkdownSetextHeadingBuilder(), phase: .postParagraph, priority: 10), - ] - - var grouped: [BlockPhase: [BlockRule]] = [:] - for r in rules { - grouped[r.phase, default: []].append(r) - } - // Sort each phase by priority while preserving declaration order as tie-breaker (stable sort) - self.rulesByPhase = Dictionary( - uniqueKeysWithValues: grouped.map { phase, arr in - ( - phase, - arr.sorted { (a, b) in - if a.priority == b.priority { return true } // keep stable - return a.priority < b.priority - } - ) - }) + + /// Initialize with default block builders + public convenience init() { + self.init(blockBuilders: Self.createDefaultBuilders()) } - + + /// Main CodeNodeBuilder implementation - processes tokens and directly edits AST public func build(from context: inout CodeConstructContext) -> Bool { - guard context.consuming < context.tokens.count else { - return false - } - - let lines = lines(from: context) - guard !lines.isEmpty else { return false } - - for line in lines { - process(line: line, context: &context) + guard context.consuming < context.tokens.count else { return false } + + // Process tokens line by line, directly editing the AST + while context.consuming < context.tokens.count { + // Extract one line of tokens starting from current position + let lineTokens = extractNextLine(from: &context) + guard !lineTokens.isEmpty else { break } + + // Create a line representation + let line = MarkdownLine(tokens: lineTokens, lineNumber: getCurrentLineNumber(context)) + + // Process this line by directly editing the AST + processLineIntoAST(line, context: &context) } - - // Consume all tokens since we processed all lines - context.consuming = context.tokens.count - - // Return true to prevent further processing + + // Finalize all blocks (important for blocks like blockquotes that need recursive parsing) + finalizeBlocksInAST(node: context.current) + return true } - - private func process( - line: [any CodeToken], context: inout CodeConstructContext - ) { - guard let state = context.state as? MarkdownConstructState else { + + /// Extract the next line of tokens from the current position + private func extractNextLine(from context: inout CodeConstructContext) -> [any CodeToken] { + var lineTokens: [any CodeToken] = [] + + // Collect tokens until we hit a newline or EOF + while context.consuming < context.tokens.count { + let token = context.tokens[context.consuming] + lineTokens.append(token) + context.consuming += 1 + + // Stop at newline or EOF (include them in the line) + if token.element == .newline || token.element == .eof { + break + } + } + + return lineTokens + } + + /// Get current line number (approximate) + private func getCurrentLineNumber(_ context: CodeConstructContext) -> Int { + // Count newlines up to current position + var lineNumber = 0 + for i in 0..) { + // Skip blank lines - they typically close blocks or are ignored + if line.isBlank { + closeOpenBlocks(context: &context) return } - - // Ensure the state is initialized - state.position = 0 - state.isPartialLine = false - - repeat { - state.refreshed = false - - // Ensure position doesn't exceed line bounds, but allow empty lines for EOF processing - guard state.position < line.count || (line.isEmpty && state.position == 0) else { break } - - let tokens = - state.position < line.count - ? line.suffix(from: state.position) : ArraySlice>() - - // Run phases in order - var handledInAnyPhase = false - for phase in [BlockPhase.openContainer, .leafOnLine, .postParagraph] { - guard let rules = rulesByPhase[phase] else { continue } - - var handledInPhase = false - for rule in rules { - var ctx = CodeConstructContext( - root: context.root, - current: context.current, - tokens: Array(tokens), - state: context.state - ) - - if rule.builder.build(from: &ctx) { - handledInPhase = true - handledInAnyPhase = true - // Update context - context.current = ctx.current - - if state.refreshed { - // The builder refreshed tokens (container stripped etc.), reprocess from start - state.isPartialLine = true + + // Store the current line in state for builders to process + guard var state = context.state as? MarkdownConstructState else { return } + state.tokens = line.tokens + state.currentLineProcessed = false + + // Process with yield-back pattern: keep processing until line is fully consumed + var iterations = 0 + let maxIterations = 10 + while !state.currentLineProcessed && !state.tokens.isEmpty && iterations < maxIterations { + iterations += 1 + let currentLine = MarkdownLine(tokens: state.tokens, lineNumber: line.lineNumber) + state.currentLineProcessed = true // Will be set to false if a builder yields back + + // Store tokens count to detect infinite loops + let tokensBeforeProcessing = state.tokens.count + + // Check if any existing block can continue with current tokens + if let continuingBlock = findBlockThatCanContinue(currentLine, in: context.current) { + // Check if continuation is valid before processing + if canContinueBlock(continuingBlock, with: currentLine) { + processLineWithBuilder(currentLine, for: continuingBlock, state: &state) + + // If this is a container block and tokens were yielded back, process them in the container's context + if !state.currentLineProcessed && isContainerBlock(continuingBlock) { + processYieldedTokensInContainer(continuingBlock, state: &state, lineNumber: line.lineNumber) + } + } else { + // Block cannot continue, close it and try new block + closeBlock(continuingBlock, context: &context) + _ = tryCreateNewBlockWithLine(currentLine, context: &context, state: &state) + } + } else { + // No continuing block, check for interruption and try new block + if canLineInterruptExistingBlocks(currentLine) { + closeInterruptibleBlocks(context: &context) + } + + let newBlockCreated = tryCreateNewBlockWithLine(currentLine, context: &context, state: &state) + + // If a container block was created and tokens were yielded back, process them in the container's context + if !state.currentLineProcessed && newBlockCreated != nil && isContainerBlock(newBlockCreated!) { + processYieldedTokensInContainer(newBlockCreated!, state: &state, lineNumber: line.lineNumber) + } + } + + // Safety check: if tokens weren't consumed and line isn't processed, break to prevent infinite loop + if state.tokens.count == tokensBeforeProcessing && !state.currentLineProcessed { + state.currentLineProcessed = true // Force completion to avoid infinite loop + break + } + } + + // Update context state + context.state = state + } + + /// Find an existing block in the AST that can continue with this line + private func findBlockThatCanContinue(_ line: MarkdownLine, in node: CodeNode) -> (any MarkdownBlockNode)? { + // First, check if any immediate children can continue + for child in node.children.reversed() { // Check from last to first (most recent) + if let markdownChild = child as? MarkdownNodeBase { + if let blockNode = markdownChild as? any MarkdownBlockNode { + // Check if any builder can continue this block + for builder in blockBuilders { + if builder.canContinue(block: blockNode, line: line) { + return blockNode + } + } + } + + // Recursively check nested structures (for open container blocks) + if let nestedBlock = findBlockThatCanContinue(line, in: markdownChild) { + return nestedBlock + } + } + } + + return nil + } + + /// Check if this line can interrupt existing blocks + private func canLineInterruptExistingBlocks(_ line: MarkdownLine) -> Bool { + // Check if any builder can start an interrupting block type + for builder in blockBuilders { + if builder.canStart(line: line) { + let builderType = type(of: builder) + if builderType is MarkdownATXHeadingBuilder.Type || + builderType is MarkdownThematicBreakBuilder.Type || + builderType is MarkdownFencedCodeBlockBuilder.Type { + return true + } + } + } + return false + } + + /// Close blocks that can be interrupted + private func closeInterruptibleBlocks(context: inout CodeConstructContext) { + // For now, close paragraphs when interrupted + // In the future, this could be more sophisticated + closeOpenBlocks(context: &context) + } + + /// Close all open blocks + private func closeOpenBlocks(context: inout CodeConstructContext) { + // Walk the AST and finalize any blocks that need closing + finalizeBlocksInAST(node: context.current) + } + + /// Recursively finalize blocks in the AST + private func finalizeBlocksInAST(node: CodeNode) { + for child in node.children { + if let markdownChild = child as? MarkdownNodeBase { + if let blockNode = markdownChild as? any MarkdownBlockNode { + // Close this block + for builder in blockBuilders { + if canBuilderHandle(builder, blockType: blockNode.blockType) { + builder.closeBlock(block: blockNode) break - } else { - // If we're still in openContainer phase, allow proceeding to leafOnLine on same line - if phase == .openContainer { - // Continue to next phase without returning; break out of builder loop - break - } else if phase == .leafOnLine { - // For leafOnLine phase, allow proceeding to postParagraph phase - break - } else { - // For postParagraph phase, we're done with this line - return - } } } } - - if state.refreshed { break } // restart outer repeat - - // If openContainer phase consumed and didn't refresh, proceed to next phase naturally - if handledInPhase && phase == .openContainer { - // fallthrough to next phase - continue + // Recursively process children + finalizeBlocksInAST(node: markdownChild) + } + } + } + + /// Try to create a new block for this line + private func tryCreateNewBlock(for line: MarkdownLine) -> (any MarkdownBlockNode)? { + // Try each builder in order (most specific first) + for builder in blockBuilders { + if builder.canStart(line: line) { + return builder.createBlock(from: line) + } + } + return nil + } + + /// Check if a block can continue with the given line + private func canContinueBlock(_ block: any MarkdownBlockNode, with line: MarkdownLine) -> Bool { + for builder in blockBuilders { + if builder.canContinue(block: block, line: line) { + return true + } + } + return false + } + + /// Process a line with the appropriate builder for the given block + private func processLineWithBuilder(_ line: MarkdownLine, for block: any MarkdownBlockNode, state: inout MarkdownConstructState) { + for builder in blockBuilders { + if builder.canContinue(block: block, line: line) { + _ = builder.processLine(block: block, line: line, state: &state) + return + } + } + } + + /// Try to create a new block with the current line + private func tryCreateNewBlockWithLine(_ line: MarkdownLine, context: inout CodeConstructContext, state: inout MarkdownConstructState) -> (any MarkdownBlockNode)? { + // Try each plugged builder to see if it can start a new block + for builder in blockBuilders { + if builder.canStart(line: line) { + if let newBlock = builder.createBlock(from: line) { + // Determine where to add the new block based on current context + let targetNode = findTargetNodeForNewBlock(in: context.current) + targetNode.append(newBlock as! MarkdownNodeBase) + + // Process the opening line with the builder + _ = builder.processLine(block: newBlock, line: line, state: &state) + return newBlock } } - - // If nothing handled in any phase, break to avoid infinite loop - if !handledInAnyPhase { break } - } while state.refreshed + } + + // Fallback to paragraph if no builder can handle the line + createAndProcessParagraph(for: line, context: &context, state: &state) + return nil } - - private func lines(from context: CodeConstructContext) -> [[any CodeToken< - MarkdownTokenElement - >]] { - var result: [[any CodeToken]] = [] - var line: [any CodeToken] = [] - var index = context.consuming - - while index < context.tokens.count { - let token = context.tokens[index] - - if token.element == .eof { - // Handle EOF: if not after newline, insert newline and treat EOF as blank line - if !line.isEmpty { - // Add current line with synthetic newline - line.append(MarkdownToken(element: .newline, text: token.text, range: token.range)) - result.append(line) + + /// Close a specific block + private func closeBlock(_ block: any MarkdownBlockNode, context: inout CodeConstructContext) { + for builder in blockBuilders { + if canBuilderHandle(builder, blockType: block.blockType) { + builder.closeBlock(block: block) + return + } + } + } + + /// Process yielded-back tokens within a container block's context + private func processYieldedTokensInContainer(_ containerBlock: any MarkdownBlockNode, state: inout MarkdownConstructState, lineNumber: Int) { + guard !state.tokens.isEmpty else { + state.currentLineProcessed = true + return + } + + // Create a sub-context where context.current points to the container block + let containerNode = containerBlock as! MarkdownNodeBase + + // Process the remaining tokens as a new line within the container + let containerLine = MarkdownLine(tokens: state.tokens, lineNumber: lineNumber) + + // First, check if any existing block within the container can continue + if let continuingBlock = findBlockThatCanContinue(containerLine, in: containerNode) { + if canContinueBlock(continuingBlock, with: containerLine) { + processLineWithBuilder(containerLine, for: continuingBlock, state: &state) + state.currentLineProcessed = true + return + } + } + + // Try each plugged builder to see if it can start a new block within the container + for builder in blockBuilders { + if builder.canStart(line: containerLine) { + if let newBlock = builder.createBlock(from: containerLine) { + // Add the new block to the container + containerNode.append(newBlock as! MarkdownNodeBase) + + // Process the opening line with the builder + _ = builder.processLine(block: newBlock, line: containerLine, state: &state) + + // Mark as processed since we handled the yielded tokens + state.currentLineProcessed = true + return } - // Add empty line for EOF - result.append([]) - break - } else if token.element == .newline { - // Include newline token at end of line and preserve empty lines - line.append(token) - result.append(line) - line = [] - index += 1 - } else { - line.append(token) - index += 1 } } - - return result + + // Fallback to paragraph within the container + let paragraph = createParagraphBlock() + containerNode.append(paragraph as! MarkdownNodeBase) + + // Process the line into the paragraph + for builder in blockBuilders { + if builder is MarkdownParagraphBuilder { + _ = builder.processLine(block: paragraph, line: containerLine, state: &state) + state.currentLineProcessed = true + return + } + } + + // Ensure we mark as processed + state.currentLineProcessed = true } - -} + /// This ensures blocks are added in the correct container context + private func findTargetNodeForNewBlock(in node: CodeNode) -> CodeNode { + // For now, find the deepest open container block or return the root + if let lastChild = node.children.last as? MarkdownNodeBase { + if let blockNode = lastChild as? any MarkdownBlockNode { + if isContainerBlock(blockNode) { + // This is a container block, add content to it + return lastChild + } + } + } + + // Default to the current node + return node + } + + /// Create and process a paragraph for this line + private func createAndProcessParagraph(for line: MarkdownLine, context: inout CodeConstructContext, state: inout MarkdownConstructState) { + // Create a new paragraph + let paragraph = createParagraphBlock() + + // Add paragraph to the appropriate target node + let targetNode = findTargetNodeForNewBlock(in: context.current) + targetNode.append(paragraph as! MarkdownNodeBase) + + // Process the line into the paragraph + for builder in blockBuilders { + if builder is MarkdownParagraphBuilder { + _ = builder.processLine(block: paragraph, line: line, state: &state) + return + } + } + } + + /// Create a default paragraph block + private func createParagraphBlock() -> any MarkdownBlockNode { + // Use a dummy range - the range will be updated when content is added + let dummyString = "" + let range = dummyString.startIndex.. Bool { + switch block.blockType { + case "blockquote": return true + case "list_item": return true + default: return false + } + } + + /// Check if a builder can handle a specific block type + private func canBuilderHandle(_ builder: MarkdownBlockBuilderProtocol, blockType: String) -> Bool { + switch blockType { + case "paragraph": return builder is MarkdownParagraphBuilder + case "heading": return builder is MarkdownATXHeadingBuilder + case "thematic_break": return builder is MarkdownThematicBreakBuilder + case "code_block": return builder is MarkdownIndentedCodeBlockBuilder + case "fenced_code_block": return builder is MarkdownFencedCodeBlockBuilder + case "blockquote": return builder is MarkdownBlockquoteBuilder + case "list_item": return builder is MarkdownListItemBuilder + default: return false + } + } + + /// Create default set of block builders + public static func createDefaultBuilders() -> [MarkdownBlockBuilderProtocol] { + return [ + // Order matters: more specific builders should come first + MarkdownATXHeadingBuilder(), + MarkdownThematicBreakBuilder(), + MarkdownFencedCodeBlockBuilder(), + MarkdownListItemBuilder(), + MarkdownBlockquoteBuilder(), + MarkdownIndentedCodeBlockBuilder(), + MarkdownParagraphBuilder() // Paragraph should be last as it's the fallback + ] + } +} \ No newline at end of file diff --git a/Sources/CodeParserCollection/Markdown/Nodes/MarkdownBlockBuilderProtocol.swift b/Sources/CodeParserCollection/Markdown/Nodes/MarkdownBlockBuilderProtocol.swift new file mode 100644 index 0000000..856f6d4 --- /dev/null +++ b/Sources/CodeParserCollection/Markdown/Nodes/MarkdownBlockBuilderProtocol.swift @@ -0,0 +1,101 @@ +import CodeParserCore +import Foundation + +/// Protocol for pluggable Markdown block builders that work within the CommonMark parsing algorithm +/// These builders are NOT CodeNodeBuilders - they work with line-based processing within MarkdownBlockBuilder +public protocol MarkdownBlockBuilderProtocol { + + /// Check if this builder can start a new block with the given line + /// - Parameter line: The line tokens to examine + /// - Returns: True if this builder can handle this line as a new block start + func canStart(line: MarkdownLine) -> Bool + + /// Check if this builder can continue an existing block with the given line + /// - Parameters: + /// - block: The existing block being processed + /// - line: The line tokens to examine + /// - Returns: True if this builder can continue the block with this line + func canContinue(block: any MarkdownBlockNode, line: MarkdownLine) -> Bool + + /// Create a new block from the given line + /// - Parameter line: The line tokens to process + /// - Returns: The created block node, or nil if creation failed + func createBlock(from line: MarkdownLine) -> (any MarkdownBlockNode)? + + /// Process a line for an existing block + /// - Parameters: + /// - block: The existing block to add content to + /// - line: The line tokens to process + /// - state: The construction state that can be modified by the builder + /// - Returns: True if the line was successfully processed + func processLine(block: any MarkdownBlockNode, line: MarkdownLine, state: inout MarkdownConstructState) -> Bool + + /// Close and finalize a block (post-processing) + /// - Parameter block: The block to finalize + func closeBlock(block: any MarkdownBlockNode) +} + +/// Represents a line of tokens for block processing +public struct MarkdownLine { + public let tokens: [any CodeToken] + public let lineNumber: Int + + public init(tokens: [any CodeToken], lineNumber: Int) { + self.tokens = tokens + self.lineNumber = lineNumber + } + + /// Get the content of this line as a string + public var content: String { + return tokens.map { $0.text }.joined() + } + + /// Check if this line is blank (only whitespace/newline) + public var isBlank: Bool { + return tokens.allSatisfy { token in + token.element == .whitespaces || token.element == .newline || token.element == .eof + } + } + + /// Get leading whitespace count (converts tabs to equivalent spaces according to CommonMark) + public var leadingWhitespace: Int { + guard let firstToken = tokens.first, + firstToken.element == .whitespaces else { + return 0 + } + + // Convert tabs to spaces according to CommonMark tab expansion rules + return expandTabsToSpaceCount(firstToken.text) + } + + /// Expand tabs to equivalent space count according to CommonMark spec + /// Tabs expand to the next 4-character tab stop + private func expandTabsToSpaceCount(_ text: String) -> Int { + var column = 0 + + for char in text { + if char == "\t" { + // Add spaces until next 4-character boundary + let spacesToAdd = 4 - (column % 4) + column += spacesToAdd + } else { + column += 1 + } + } + + return column + } +} + +/// Base protocol for Markdown block nodes +public protocol MarkdownBlockNode: AnyObject { + var blockType: String { get } +} + +/// Extension to add default implementations +extension MarkdownBlockBuilderProtocol { + /// Default implementation that returns false - override if the builder needs closing logic + public func closeBlock(block: any MarkdownBlockNode) { + // Default: no special closing logic needed + } +} \ No newline at end of file diff --git a/Sources/CodeParserCollection/Markdown/Nodes/MarkdownBlockQuoteBuilder.swift b/Sources/CodeParserCollection/Markdown/Nodes/MarkdownBlockQuoteBuilder.swift deleted file mode 100644 index 1ee4006..0000000 --- a/Sources/CodeParserCollection/Markdown/Nodes/MarkdownBlockQuoteBuilder.swift +++ /dev/null @@ -1,87 +0,0 @@ -import CodeParserCore -import Foundation - -/// Handles block quotes starting with > characters -/// CommonMark Spec: https://spec.commonmark.org/0.31.2/#block-quotes -/// This is a container builder that uses position/refreshed mechanism for nested content -public class MarkdownBlockQuoteBuilder: CodeNodeBuilder { - public typealias Node = MarkdownNodeElement - public typealias Token = MarkdownTokenElement - - public init() {} - - public func build(from context: inout CodeConstructContext) -> Bool { - guard let state = context.state as? MarkdownConstructState else { - return false - } - - // Don't process blockquotes when inside a fenced code block - if state.openFence != nil { - return false - } - - // In phased pipeline, builders receive the suffix tokens; always start at local 0 - let startIndex = 0 - guard startIndex < context.tokens.count else { - return false - } - - var index = startIndex - - // Skip leading whitespace (up to 3 spaces allowed before >) - var leadingSpaces = 0 - while index < context.tokens.count, - let token = context.tokens[index] as? any CodeToken, - token.element == .whitespaces { - let spaceCount = token.text.count - if leadingSpaces + spaceCount > 3 { - return false - } - leadingSpaces += spaceCount - index += 1 - } - - // Must have > character - guard index < context.tokens.count, - let token = context.tokens[index] as? any CodeToken, - token.element == .punctuation, - token.text == ">" else { - return false - } - - index += 1 // consume the > - - // Optionally consume one space after > - if index < context.tokens.count, - let nextToken = context.tokens[index] as? any CodeToken, - nextToken.element == .whitespaces, - nextToken.text == " " { - index += 1 - } - - // Create or reuse blockquote - let blockquote: BlockquoteNode - if let currentBlockquote = context.current as? BlockquoteNode { - // We're already inside a blockquote, continue using it - blockquote = currentBlockquote - } else { - // Check if the last child is a blockquote we can continue - if let lastChild = context.current.children.last as? BlockquoteNode { - blockquote = lastChild - } else { - // Create new blockquote - blockquote = BlockquoteNode() - context.current.append(blockquote) - } - } - - // Set current context to the blockquote for nested content - context.current = blockquote - - // Update state to process remaining tokens as nested content in 3-phase approach - state.position += index - state.refreshed = true - - return true - } -} \ No newline at end of file diff --git a/Sources/CodeParserCollection/Markdown/Nodes/MarkdownBlockquoteBuilder.swift b/Sources/CodeParserCollection/Markdown/Nodes/MarkdownBlockquoteBuilder.swift new file mode 100644 index 0000000..d8644fa --- /dev/null +++ b/Sources/CodeParserCollection/Markdown/Nodes/MarkdownBlockquoteBuilder.swift @@ -0,0 +1,97 @@ +import CodeParserCore +import Foundation + +/// Builder for blockquotes (> quoted text) +/// Implements CommonMark specification for blockquotes (Spec 024) +public class MarkdownBlockquoteBuilder: MarkdownBlockBuilderProtocol { + + public init() {} + + public func canStart(line: MarkdownLine) -> Bool { + // Blockquotes can be indented 0-3 spaces + let (leadingSpaces, _, _) = MarkdownIndentation.calculateIndentation(from: line.tokens) + if leadingSpaces > 3 { + return false + } + + // Look for '>' marker after whitespace + let (found, _, _) = MarkdownIndentation.findMarkerPosition(tokens: line.tokens, marker: ">", afterWhitespace: true) + return found + } + + public func canContinue(block: any MarkdownBlockNode, line: MarkdownLine) -> Bool { + guard block.blockType == "blockquote" else { return false } + + // Blockquotes can continue with lines that start with '>' + // or with lazy continuation (lines without '>') + let (leadingSpaces, _, _) = MarkdownIndentation.calculateIndentation(from: line.tokens) + if leadingSpaces > 3 { + return false + } + + // Can continue with '>' lines + let (found, _, _) = MarkdownIndentation.findMarkerPosition(tokens: line.tokens, marker: ">", afterWhitespace: true) + if found { + return true + } + + // Can continue with lazy continuation (non-blank lines) + if !line.isBlank { + return true + } + + // Blank lines generally end blockquotes + return false + } + + public func createBlock(from line: MarkdownLine) -> (any MarkdownBlockNode)? { + guard canStart(line: line) else { return nil } + + let blockquote = MarkdownBlockquote(level: 1) + + // Set package-level indentation properties + let (leadingSpaces, _, _) = MarkdownIndentation.calculateIndentation(from: line.tokens) + let (found, markerColumn, _) = MarkdownIndentation.findMarkerPosition(tokens: line.tokens, marker: ">", afterWhitespace: true) + + if found { + blockquote.indent = leadingSpaces + blockquote.markerColumn = markerColumn + blockquote.contentColumn = MarkdownIndentation.findContentColumn(tokens: line.tokens, afterMarkerAt: markerColumn) + } + + return blockquote + } + + public func processLine(block: any MarkdownBlockNode, line: MarkdownLine, state: inout MarkdownConstructState) -> Bool { + guard let blockquote = block as? MarkdownBlockquote else { return false } + + // Find content tokens after the '>' marker using package-level properties + var contentTokens: [any CodeToken] = [] + + let (found, _, _) = MarkdownIndentation.findMarkerPosition(tokens: state.tokens, marker: ">", afterWhitespace: true) + if found { + // Remove content up to the content column (after '> ') + contentTokens = MarkdownIndentation.removeIndentation(from: state.tokens, upToColumn: blockquote.contentColumn) + } else { + // Lazy continuation - use tokens after the blockquote's indent + contentTokens = MarkdownIndentation.removeIndentation(from: state.tokens, upToColumn: blockquote.indent) + } + + // Update state with remaining content tokens for MarkdownBlockBuilder to process + state.tokens = contentTokens + + // Only signal for more processing if there are actually tokens to process + if !contentTokens.isEmpty { + state.currentLineProcessed = false // Signal that remaining tokens need processing within this blockquote + } else { + state.currentLineProcessed = true // No more tokens to process + } + + return true + } + + /// Close the block - no special processing needed as content is parsed recursively by MarkdownBlockBuilder + public func closeBlock(block: any MarkdownBlockNode) { + // No special closing logic needed - the recursive parsing is handled by MarkdownBlockBuilder + } +} \ No newline at end of file diff --git a/Sources/CodeParserCollection/Markdown/Nodes/MarkdownCodeSpanBuilder.swift b/Sources/CodeParserCollection/Markdown/Nodes/MarkdownCodeSpanBuilder.swift new file mode 100644 index 0000000..d8620cf --- /dev/null +++ b/Sources/CodeParserCollection/Markdown/Nodes/MarkdownCodeSpanBuilder.swift @@ -0,0 +1,140 @@ +import CodeParserCore +import Foundation + +/// Builder for processing inline code spans (`code`) +public class MarkdownCodeSpanBuilder { + + public init() {} + + /// Process code spans from tokens - backticks have higher precedence than emphasis + public func processCodeSpans(in tokens: [any CodeToken]) -> [ProcessedCodeSpan] { + var codeSpans: [ProcessedCodeSpan] = [] + var index = 0 + + while index < tokens.count { + let token = tokens[index] + + // Look for opening backticks - must be punctuation backtick + guard token.element == .punctuation && token.text == "`" else { + index += 1 + continue + } + + // Count consecutive backticks for opening delimiter + var openingBackticks = 0 + let openingStart = index + while index < tokens.count && + tokens[index].element == .punctuation && + tokens[index].text == "`" { + openingBackticks += 1 + index += 1 + } + let openingEnd = index - 1 + + // Look for matching closing backticks (same count) + var closingStart: Int? = nil + var searchIndex = index + + // Special case: if we're at the end after reading opening backticks, + // check if we can split them into opening and closing for empty code span + if index >= tokens.count && openingBackticks % 2 == 0 && openingBackticks >= 2 { + let delimiterLength = openingBackticks / 2 + // Split the backticks: first half is opening, second half is closing + let realOpeningEnd = openingStart + delimiterLength - 1 + let closingStart = realOpeningEnd + 1 + let closingEnd = openingEnd + + let range = openingStart...closingEnd + let codeSpan = ProcessedCodeSpan(range: range, backtickCount: delimiterLength) + codeSpans.append(codeSpan) + break + } + + if index >= tokens.count { + // No content, no closing - not a valid code span + index = openingEnd + 1 + continue + } + + while searchIndex < tokens.count { + // Look for start of a backtick run + if tokens[searchIndex].element == .punctuation && tokens[searchIndex].text == "`" { + let runStart = searchIndex + var runLength = 0 + + // Count consecutive backticks in this run + while searchIndex < tokens.count && + tokens[searchIndex].element == .punctuation && + tokens[searchIndex].text == "`" { + runLength += 1 + searchIndex += 1 + } + + // If this run matches our opening length, we found the closing + if runLength == openingBackticks { + closingStart = runStart + break + } + } else { + searchIndex += 1 + } + } + + if let closingStart = closingStart { + // Found matching closing backticks + let closingEnd = closingStart + openingBackticks - 1 + let range = openingStart...closingEnd + let codeSpan = ProcessedCodeSpan(range: range, backtickCount: openingBackticks) + codeSpans.append(codeSpan) + + // Continue from after the closing backticks + index = closingEnd + 1 + } else { + // No matching closing backticks found, continue from next character + // Reset index to just after the opening backticks we couldn't match + index = openingEnd + 1 + } + } + + return codeSpans + } + + /// Extract content from code span, handling whitespace normalization + public func extractCodeContent(from tokens: [any CodeToken], in range: ClosedRange, backtickCount: Int) -> String { + // Skip backtick tokens at the beginning and end + let contentStart = range.lowerBound + backtickCount + let contentEnd = range.upperBound - backtickCount + + guard contentStart <= contentEnd else { + return "" + } + + let contentTokens = Array(tokens[contentStart...contentEnd]) + var content = contentTokens.map { $0.text }.joined() + + // Normalize whitespace according to CommonMark spec: + // - Single spaces at beginning and end are stripped if there are non-space characters + // - Line endings are converted to spaces + + // Convert line endings to spaces first + content = content.replacingOccurrences(of: "\n", with: " ") + content = content.replacingOccurrences(of: "\r\n", with: " ") + content = content.replacingOccurrences(of: "\r", with: " ") + content = content.replacingOccurrences(of: "__SOFT_LINE_BREAK__", with: " ") + content = content.replacingOccurrences(of: "__HARD_LINE_BREAK__", with: " ") + + // Strip single leading and trailing spaces if there are non-space characters + if content.count > 2 && content.hasPrefix(" ") && content.hasSuffix(" ") && + content.dropFirst().dropLast().contains(where: { $0 != " " }) { + content = String(content.dropFirst().dropLast()) + } + + return content + } +} + +/// Processed code span range information +public struct ProcessedCodeSpan { + let range: ClosedRange + let backtickCount: Int +} \ No newline at end of file diff --git a/Sources/CodeParserCollection/Markdown/Nodes/MarkdownContentBuilder.swift b/Sources/CodeParserCollection/Markdown/Nodes/MarkdownContentBuilder.swift deleted file mode 100644 index 647bb9e..0000000 --- a/Sources/CodeParserCollection/Markdown/Nodes/MarkdownContentBuilder.swift +++ /dev/null @@ -1,299 +0,0 @@ -import CodeParserCore -import Foundation - -/// ContentBuilder that dispatches inline markdown via a phase-based processor pipeline -public class MarkdownContentBuilder: CodeNodeBuilder { - public typealias Node = MarkdownNodeElement - public typealias Token = MarkdownTokenElement - - private let scanPhaseProcessors: [MarkdownInlinePhaseProcessor] - private let rebuildPhaseProcessors: [MarkdownInlinePhaseProcessor] - - public init() { - // Assemble phase-based inline processors with priorities - let inlineProcessors: [MarkdownInlinePhaseProcessor] = [ - // prefer native scan processors first - EmphasisDelimiterScanProcessor(priority: -300), - StrikethroughDelimiterScanProcessor(priority: -295), - CodeSpanDelimiterScanProcessor(priority: -290), - BracketDelimiterScanProcessor(priority: -285), - AutolinkDelimiterScanProcessor(priority: -280), - // rebuild-phase processors - HardLineBreakRebuildProcessor(priority: 0), - UnmatchedDelimiterInlineProcessor(priority: 0), - // pair processors - ReferenceLinkPairProcessor(priority: 3), - AutolinkPairProcessor(priority: 4), - LinkImagePairProcessor(priority: 5), - CodeSpanPairProcessor(priority: 8), // Higher precedence than emphasis/strong - EmphasisStrongPairProcessor(priority: 10), - StrikethroughPairProcessor(priority: 10), - ] - self.scanPhaseProcessors = inlineProcessors.filter { $0.phase == .scan }.sorted { $0.priority < $1.priority } - self.rebuildPhaseProcessors = inlineProcessors.filter { $0.phase == .rebuild }.sorted { $0.priority < $1.priority } - } - - public func build(from context: inout CodeConstructContext) -> Bool { - // Store reference to construct state for processors that need access to reference definitions - let markdownState = context.state as? MarkdownConstructState - - // Traverse the AST to parse all the content nodes - context.root.dfs { node in - if let node = node as? ContentNode { - let inlined = process(node.tokens, constructState: markdownState) - finalize(node: node, with: inlined) - } - } - return true - } - - /// Process tokens into inline nodes using the configured processors - /// Internal so processors can reuse it to parse nested content between delimiters. - func process(_ tokens: [any CodeToken], constructState: MarkdownConstructState? = nil) -> [MarkdownNodeBase] { - var context = MarkdownContentContext(tokens: tokens, constructState: constructState) - - // Process all tokens via scan-phase processors - while context.current < tokens.count { - let token = tokens[context.current] - var handled = false - for p in scanPhaseProcessors { - if p.canHandle(token: token, at: context.current, context: context) { - if p.handle(token: token, at: context.current, context: &context) { - handled = true - break - } - } - } - if !handled { - // Fallback: plain text, whitespace, entities, soft line breaks - switch token.element { - case .characters, .punctuation, .whitespaces: - context.add(token.text) - case .newline: - context.add(LineBreakNode(variant: .soft)) - case .charef: - context.add(token.text) - case .eof: - break - } - context.current += 1 - } - } - - // Finalize processing by matching delimiter pairs and creating nodes - finalizeDelimiters(context: &context) - - return context.inlined - } - - /// Finalize delimiter processing by matching pairs and creating nodes - private func finalizeDelimiters(context: inout MarkdownContentContext) { - // Process delimiter pairs following CommonMark algorithm - var currentDelimiterNode = context.delimiters.forward(from: nil) - var processedRanges: [ProcessedRange] = [] - - while let closerNode = currentDelimiterNode.next() { - guard closerNode.run.closable, closerNode.run.isActive else { - continue - } - - // Collect all pair processors that can handle this delimiter, in priority order - let pairHandlers = rebuildPhaseProcessors.filter { $0.canHandlePair(for: closerNode.run.delimiter) } - - // Look for matching opener - if let openerNode = context.delimiters.opener(for: closerNode.run.delimiter, before: closerNode) { - guard openerNode !== closerNode else { continue } - - // Get content tokens between delimiters - let openerTokenIndex = openerNode.run.index - let closerTokenIndex = closerNode.run.index - let contentStart = openerTokenIndex + openerNode.run.length - let contentEnd = closerTokenIndex - - guard contentStart <= contentEnd else { continue } - - // Get content tokens - let contentTokens = context.tokens[contentStart.. context.tokens.count { safeCloserEnd = context.tokens.count } - - // Store the processed range - processedRanges.append(ProcessedRange( - openerStart: openerTokenIndex, - openerEnd: openerTokenIndex + openerNode.run.length, - closerStart: closerTokenIndex, - closerEnd: safeCloserEnd, - node: built.node - )) - - // Mark delimiters as processed and remove only the matched pair - openerNode.run.isActive = false - closerNode.run.isActive = false - - // Remove closer then opener to keep links valid - context.delimiters.remove(closerNode) - context.delimiters.remove(openerNode) - - // Restart from the beginning to find further pairs (including outers) - currentDelimiterNode = context.delimiters.forward(from: nil) - } - } - } - - // Sort processed ranges to ensure deterministic rebuild and avoid overlaps - let orderedRanges = processedRanges.sorted { lhs, rhs in - if lhs.openerStart != rhs.openerStart { return lhs.openerStart < rhs.openerStart } - // If same start, consume the longer range first - return (lhs.closerEnd - lhs.openerStart) > (rhs.closerEnd - rhs.openerStart) - } - - // Rebuild content with processed ranges - rebuildContentWithProcessedRanges(context: &context, processedRanges: orderedRanges) - } - - /// Helper struct for tracking processed delimiter ranges - private struct ProcessedRange { - let openerStart: Int - let openerEnd: Int - let closerStart: Int - let closerEnd: Int - let node: MarkdownNodeBase - } - - // No legacy processor lookup; all inline semantics are handled by phase processors - - /// Rebuild content incorporating processed delimiter ranges - private func rebuildContentWithProcessedRanges( - context: inout MarkdownContentContext, - processedRanges: [ProcessedRange] - ) { - // Clear existing content - context.inlined.removeAll() - - var tokenIndex = 0 - - while tokenIndex < context.tokens.count { - // Check if we're at the start of a processed range - if let range = processedRanges.first(where: { $0.openerStart == tokenIndex }) { - // Insert the processed node - context.add(range.node) - // Skip all tokens covered by this range - tokenIndex = range.closerEnd - continue - } - - // Check if this token is part of any processed range - let isPartOfProcessedRange = processedRanges.contains { range in - tokenIndex >= range.openerStart && tokenIndex < range.closerEnd - } - - if !isPartOfProcessedRange { - // Check if this token is an unmatched delimiter - if let delimiterNode = findDelimiterAtTokenIndex(tokenIndex, in: context.delimiters) { - if delimiterNode.run.isActive { - var handled = false - for p in rebuildPhaseProcessors { - if p.canHandleUnmatchedDelimiter(run: delimiterNode.run, at: tokenIndex, context: context) { - if p.handleUnmatchedDelimiter(run: delimiterNode.run, at: tokenIndex, context: &context) { - handled = true - break - } - } - } - if !handled { - // Fallback: reconstruct text from original tokens - let start = max(0, delimiterNode.run.index) - let end = min(context.tokens.count, delimiterNode.run.index + delimiterNode.run.length) - if start < end { - let text = context.tokens[start.. MarkdownDelimiterStackNode? { - var current = delimiterStack.forward(from: nil) - while let delimiterNode = current.next() { - if delimiterNode.run.index == index { - return delimiterNode - } - } - return nil - } - - - private func finalize(node: ContentNode, with inlined: [MarkdownNodeBase]) { - guard let parent = node.parent as? MarkdownNodeBase else { - return - } - - let index = parent.children.firstIndex { $0 === node } ?? 0 - node.remove() - - for (i, inlineNode) in inlined.enumerated() { - parent.insert(inlineNode, at: index + i) - } - } - -} \ No newline at end of file diff --git a/Sources/CodeParserCollection/Markdown/Nodes/MarkdownEOFBuilder.swift b/Sources/CodeParserCollection/Markdown/Nodes/MarkdownEOFBuilder.swift deleted file mode 100644 index 81629f7..0000000 --- a/Sources/CodeParserCollection/Markdown/Nodes/MarkdownEOFBuilder.swift +++ /dev/null @@ -1,231 +0,0 @@ -import CodeParserCore -import Foundation - -/// Handles end-of-file processing and triggers inline content processing -/// This builder runs when EOF is encountered and processes all ContentNodes in the AST -public class MarkdownEOFBuilder: CodeNodeBuilder { - public typealias Node = MarkdownNodeElement - public typealias Token = MarkdownTokenElement - - private let contentBuilder = MarkdownContentBuilder() - - public init() {} - - public func build(from context: inout CodeConstructContext) -> Bool { - // Check if this is an empty line (which indicates EOF processing) - guard context.tokens.isEmpty else { - return false - } - - // Close any open blocks when we reach EOF - while context.current.parent != nil { - context.current = context.current.parent! - } - - // Now we should be at document root for EOF processing - guard context.current === context.root else { - return false - } - - // Handle any pending reference definition - if let state = context.state as? MarkdownConstructState, - let pending = state.pendingReference { - // Add the pending reference to the AST - context.current.append(pending.referenceNode) - state.pendingReference = nil - } - - // Validate and process all reference definitions - if let state = context.state as? MarkdownConstructState { - validateReferenceDefinitions(context: &context, state: state) - } - - // Clean up trailing whitespace in code blocks before final processing - if let rootNode = context.root as? MarkdownNodeBase { - stripTrailingWhitespaceFromCodeBlocks(rootNode) - } - - // Process all ContentNodes in the AST using the ContentBuilder - // This must happen after all block parsing is complete - var contentContext = CodeConstructContext( - root: context.root, - current: context.root, - tokens: [], - state: context.state - ) - - _ = contentBuilder.build(from: &contentContext) - - context.consuming = context.tokens.count - return true - } - - /// Strips trailing whitespace and blank lines from code blocks - private func stripTrailingWhitespaceFromCodeBlocks(_ node: MarkdownNodeBase) { - // Recursively process all child nodes - for child in node.children { - if let childNode = child as? MarkdownNodeBase { - stripTrailingWhitespaceFromCodeBlocks(childNode) - } - } - - // Process code blocks - if let codeBlock = node as? CodeBlockNode { - codeBlock.source = stripTrailingWhitespace(from: codeBlock.source) - } - } - - /// Strips trailing whitespace and blank lines from a string - private func stripTrailingWhitespace(from source: String) -> String { - let lines = source.components(separatedBy: .newlines) - var processedLines: [String] = [] - - // Process each line - preserve trailing spaces, only remove trailing newlines - for line in lines { - // Only trim trailing newlines, preserve trailing spaces - processedLines.append(line.trimmingCharacters(in: .newlines)) - } - - // Check if the entire content is blank (only empty lines) - let isAllBlank = processedLines.allSatisfy { $0.trimmingCharacters(in: .whitespaces).isEmpty } - - if !isAllBlank { - // Remove trailing empty lines only if there's non-blank content - while !processedLines.isEmpty && processedLines.last?.isEmpty == true { - processedLines.removeLast() - } - } - - return processedLines.joined(separator: "\n") - } - - /// Validates all reference definitions in the AST and handles duplicates and invalid references - private func validateReferenceDefinitions( - context: inout CodeConstructContext, - state: MarkdownConstructState - ) { - guard let rootNode = context.root as? MarkdownNodeBase else { return } - - var validReferences: [String: (url: String, title: String)] = [:] - var invalidNodes: [(node: ReferenceNode, parent: MarkdownNodeBase)] = [] - - // Process all reference nodes and validate them - collectAndValidateReferences( - node: rootNode, - validReferences: &validReferences, - invalidNodes: &invalidNodes, - state: state - ) - } - - /// Recursively collect and validate reference definitions - private func collectAndValidateReferences( - node: MarkdownNodeBase, - validReferences: inout [String: (url: String, title: String)], - invalidNodes: inout [(node: ReferenceNode, parent: MarkdownNodeBase)], - state: MarkdownConstructState - ) { - var invalidIndices: [Int] = [] - - // Process children in forward order to preserve "first wins" rule - for (index, child) in node.children.enumerated() { - if let referenceNode = child as? ReferenceNode { - let normalizedId = normalizeReferenceIdentifier(referenceNode.identifier) - - // Validate the reference definition - if isValidReferenceDefinition(referenceNode) { - // Check if this is the first occurrence (first one wins) - if validReferences[normalizedId] == nil { - validReferences[normalizedId] = (url: referenceNode.url, title: referenceNode.title) - state.addReferenceDefinition(identifier: referenceNode.identifier, url: referenceNode.url, title: referenceNode.title) - } - // Note: duplicate definitions are kept in AST but not used for resolution - } else { - // Invalid reference - mark for conversion - invalidIndices.append(index) - } - } else if let childNode = child as? MarkdownNodeBase { - // Recursively process child nodes - collectAndValidateReferences( - node: childNode, - validReferences: &validReferences, - invalidNodes: &invalidNodes, - state: state - ) - } - } - - // Convert invalid references in reverse order to maintain indices - for index in invalidIndices.reversed() { - if let referenceNode = node.children[index] as? ReferenceNode { - convertInvalidReferenceToParagraphInPlace(referenceNode, parent: node, at: index) - } - } - } - - /// Check if a reference definition is valid according to CommonMark spec - private func isValidReferenceDefinition(_ reference: ReferenceNode) -> Bool { - // Must have non-empty identifier - if reference.identifier.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { - return false - } - - // Empty URL is valid if it was explicitly provided as <> - // We can't distinguish between missing destination and explicit <> here, - // so we need to be more permissive and let the parsing logic handle this - - // Check for invalid URL patterns - let url = reference.url.trimmingCharacters(in: .whitespacesAndNewlines) - - // URL starting with [ indicates it's likely malformed (confused with another reference) - if url.hasPrefix("[") { - return false - } - - return true - } - - /// Convert an invalid reference node back to paragraph text in place - private func convertInvalidReferenceToParagraphInPlace(_ referenceNode: ReferenceNode, parent: MarkdownNodeBase, at index: Int) { - // Create paragraph text from the reference syntax - let range = "".startIndex..<"".endIndex // Synthetic range - let paragraph = ParagraphNode(range: range) - - // Reconstruct the reference syntax as text - let referenceText = "[\(referenceNode.identifier)]:" - let tokens: [any CodeToken] = [ - MarkdownToken(element: .characters, text: referenceText, range: range) - ] - - let contentNode = ContentNode(tokens: tokens) - paragraph.append(contentNode) - - // Replace the reference node with the paragraph at the same position - parent.children[index] = paragraph - } - - /// Convert an invalid reference node back to paragraph text - private func convertInvalidReferenceToParagraph(_ referenceNode: ReferenceNode, parent: MarkdownNodeBase) { - // Create paragraph text from the reference syntax - let range = "".startIndex..<"".endIndex // Synthetic range - let paragraph = ParagraphNode(range: range) - - // Reconstruct the reference syntax as text - let referenceText = "[\(referenceNode.identifier)]:" - let tokens: [any CodeToken] = [ - MarkdownToken(element: .characters, text: referenceText, range: range) - ] - - let contentNode = ContentNode(tokens: tokens) - paragraph.append(contentNode) - parent.append(paragraph) - } - - /// Normalize reference identifier according to CommonMark spec - private func normalizeReferenceIdentifier(_ identifier: String) -> String { - return identifier - .lowercased() - .replacingOccurrences(of: #"\s+"#, with: " ", options: .regularExpression) - .trimmingCharacters(in: .whitespacesAndNewlines) - } -} diff --git a/Sources/CodeParserCollection/Markdown/Nodes/MarkdownFencedCodeBlockBuilder.swift b/Sources/CodeParserCollection/Markdown/Nodes/MarkdownFencedCodeBlockBuilder.swift index 0c30a3d..70473d9 100644 --- a/Sources/CodeParserCollection/Markdown/Nodes/MarkdownFencedCodeBlockBuilder.swift +++ b/Sources/CodeParserCollection/Markdown/Nodes/MarkdownFencedCodeBlockBuilder.swift @@ -1,316 +1,243 @@ import CodeParserCore import Foundation -/// Handles fenced code blocks with ``` or ~~~ delimiters -/// CommonMark Spec: https://spec.commonmark.org/0.31.2/#fenced-code-blocks -public class MarkdownFencedCodeBlockBuilder: CodeNodeBuilder { - public typealias Node = MarkdownNodeElement - public typealias Token = MarkdownTokenElement - +/// Builder for fenced code blocks (```code``` or ~~~code~~~) +/// Implements CommonMark specification for fenced code blocks (Spec 018) +public class MarkdownFencedCodeBlockBuilder: MarkdownBlockBuilderProtocol { + public init() {} - - public func build(from context: inout CodeConstructContext) -> Bool { - guard let state = context.state as? MarkdownConstructState else { + + public func canStart(line: MarkdownLine) -> Bool { + // Fenced code blocks can be indented 0-3 spaces + let (leadingSpaces, _, _) = MarkdownIndentation.calculateIndentation(from: line.tokens) + if leadingSpaces > 3 { return false } - - let startIndex = 0 - guard startIndex < context.tokens.count else { - return false - } - - // Check if we're currently inside a fenced code block - if let currentFence = state.openFence { - return handleFencedContent(currentFence: currentFence, context: &context, state: state) - } else { - return handleFenceOpening(context: &context, state: state, startIndex: startIndex) - } - } - - private func handleFenceOpening( - context: inout CodeConstructContext, - state: MarkdownConstructState, - startIndex: Int - ) -> Bool { - var index = startIndex - // Skip leading whitespace (up to 3 spaces allowed) - var leadingSpaces = 0 - while index < context.tokens.count, - let token = context.tokens[index] as? any CodeToken, - token.element == .whitespaces { - let spaceCount = token.text.count - if leadingSpaces + spaceCount > 3 { - return false - } - leadingSpaces += spaceCount - index += 1 + // Work directly with tokens - skip leading whitespace + var tokenIndex = 0 + while tokenIndex < line.tokens.count && line.tokens[tokenIndex].element == .whitespaces { + tokenIndex += 1 } - - // Check for fence characters - guard index < context.tokens.count else { return false } - let fenceChar: String - if let firstToken = context.tokens[index] as? any CodeToken, - firstToken.element == .punctuation { - switch firstToken.text { - case "`", "~": - fenceChar = firstToken.text - default: - return false - } - } else { - return false + guard tokenIndex < line.tokens.count else { + return false } - - // Count consecutive fence characters (must be at least 3) - var fenceLength = 0 - while index < context.tokens.count, - let token = context.tokens[index] as? any CodeToken, - token.element == .punctuation, - token.text == fenceChar { - fenceLength += 1 - index += 1 - } - - guard fenceLength >= 3 else { - return false - } - - // Save the starting position after the opening fence for later checking - let afterOpeningFenceIndex = index - // Extract info string (language specification) after the fence - var infoString = "" - var foundNonWhitespace = false + // Check for fence start using tokens directly + let (isFence, _, fenceLength) = checkFencePattern(tokens: line.tokens, startIndex: tokenIndex) - while index < context.tokens.count { - let token = context.tokens[index] - - if token.element == .newline { - break - } else if token.element == .whitespaces { - if foundNonWhitespace { - infoString += token.text + if isFence && fenceLength >= 3 { + // For backticks, check that info string doesn't contain backticks + if let firstFenceToken = line.tokens[tokenIndex].text.first, + firstFenceToken == "`" { + // Check remaining tokens AFTER the fence for backticks in info string + let infoStartIndex = tokenIndex + fenceLength // Skip past all fence tokens + for i in infoStartIndex.. Bool { + guard let codeBlock = block as? MarkdownFencedCodeBlock, + block.blockType == "fenced_code_block" else { return false } - // The correct approach: Only apply same-line detection for backtick fences - // since backtick info strings cannot contain backticks, so any backticks found are closing fences + // If already closed, cannot continue + if codeBlock.isClosed { + return false + } - if fenceChar == "`" { - // For backtick fences, info string cannot contain backticks, so any backticks are closing fences - for checkIndex in afterOpeningFenceIndex..= codeBlock.fenceLength { + // Skip past fence tokens to check for trailing content + tokenIndex += fenceLength - if token.element == .punctuation && token.text == fenceChar { - // Found potential start of closing fence on same line - check if it's valid - var closingFenceLength = 0 - var closingIndex = checkIndex - - // Count consecutive fence characters - while closingIndex < index, - closingIndex < context.tokens.count, - let closingToken = context.tokens[closingIndex] as? any CodeToken, - closingToken.element == .punctuation, - closingToken.text == fenceChar { - closingFenceLength += 1 - closingIndex += 1 + // Check that the rest of the line only contains whitespace + var isValidClosing = true + while tokenIndex < line.tokens.count { + let token = line.tokens[tokenIndex] + if token.element == .newline || token.element == .eof { + break } - - // Check if this is a valid closing fence (at least as long as opening fence) - if closingFenceLength >= fenceLength { - // Check if rest of line is whitespace only or end of line - var isValidClosing = true - var remainingIndex = closingIndex - - while remainingIndex < index { - let remainingToken = context.tokens[remainingIndex] - if remainingToken.element != .whitespaces { - isValidClosing = false - break - } - remainingIndex += 1 - } - - if isValidClosing { - // Valid closing fence found on same line - this is not a fenced code block - return false - } + if token.element != .whitespaces { + isValidClosing = false + break } + tokenIndex += 1 + } + + if isValidClosing { + // This closes the fence + return false } } } - // For tilde fences, do NOT check for same-line closing since tildes can appear in info string - - // Fenced code blocks can interrupt paragraphs - close paragraph if we're in one - if context.current.element == .paragraph { - if let parent = context.current.parent { - context.current = parent + + // If not a closing fence, the block continues + return true + } + + public func createBlock(from line: MarkdownLine) -> (any MarkdownBlockNode)? { + guard canStart(line: line) else { return nil } + + // Work directly with tokens - skip leading whitespace + var tokenIndex = 0 + while tokenIndex < line.tokens.count && line.tokens[tokenIndex].element == .whitespaces { + tokenIndex += 1 + } + + guard tokenIndex < line.tokens.count else { return nil } + + let (isFence, fenceChar, fenceLength) = checkFencePattern(tokens: line.tokens, startIndex: tokenIndex) + guard isFence && fenceLength >= 3 else { return nil } + + // Calculate indentation properties + let (leadingSpaces, _, _) = MarkdownIndentation.calculateIndentation(from: line.tokens) + let fenceColumn = leadingSpaces // For now, assume fence starts after leading whitespace + + // Skip past the fence tokens + tokenIndex += fenceLength + + // Extract info string from remaining tokens + var language: String? = nil + var infoStringParts: [String] = [] + + while tokenIndex < line.tokens.count { + let token = line.tokens[tokenIndex] + if token.element == .newline || token.element == .eof { + break + } + if token.element != .whitespaces || !infoStringParts.isEmpty { + infoStringParts.append(token.text) } + tokenIndex += 1 } - - // Create fenced code block - let language = infoString.isEmpty ? nil : infoString.components(separatedBy: .whitespaces).first - let codeBlock = CodeBlockNode(source: "", language: language) - context.current.append(codeBlock) - - // Store the open fence info for subsequent lines - state.openFence = OpenFenceInfo( - character: fenceChar, - length: fenceLength, - indentation: leadingSpaces, - codeBlock: codeBlock + + if !infoStringParts.isEmpty { + let infoString = infoStringParts.joined().trimmingCharacters(in: .whitespaces) + language = infoString.split(separator: " ").first.map(String.init) + } + + let codeBlock = MarkdownFencedCodeBlock( + fenceChar: fenceChar, + fenceLength: fenceLength, + language: language ) - - return true + + // Set package-level indentation properties + codeBlock.indent = leadingSpaces + codeBlock.fenceIndent = leadingSpaces + codeBlock.fenceColumn = fenceColumn + + return codeBlock } - - private func handleFencedContent( - currentFence: OpenFenceInfo, - context: inout CodeConstructContext, - state: MarkdownConstructState - ) -> Bool { - let startIndex = 0 - - // Check if this line is a closing fence - if let closingFenceLength = checkClosingFence( - character: currentFence.character, - minLength: currentFence.length, - tokens: context.tokens, - startIndex: startIndex - ) { - // This is a closing fence - close the code block - state.openFence = nil + + public func processLine(block: any MarkdownBlockNode, line: MarkdownLine, state: inout MarkdownConstructState) -> Bool { + guard let codeBlock = block as? MarkdownFencedCodeBlock else { return false } + + // Check if this is a closing fence + if !canContinue(block: block, line: line) { + // This is a closing fence, don't add it to content + codeBlock.isClosed = true return true } - - // In 3-phase architecture, container handling is done by container builders - // Fenced code blocks just handle content and closing - - // This is content - add it to the code block - var lineContent = "" - var index = startIndex - - // Include everything in this line, including newline - var contentEnd = context.tokens.count - - // Remove equivalent indentation from this line - var remainingIndentationToRemove = currentFence.indentation - // Skip leading whitespace up to the fence's indentation level - while index < contentEnd && remainingIndentationToRemove > 0 { - let token = context.tokens[index] - if token.element == .whitespaces { - let spaceCount = token.text.count - if spaceCount <= remainingIndentationToRemove { - // Skip this entire whitespace token - remainingIndentationToRemove -= spaceCount - index += 1 - } else { - // Partially use this whitespace token - let remainingSpaces = spaceCount - remainingIndentationToRemove - lineContent += String(repeating: " ", count: remainingSpaces) - remainingIndentationToRemove = 0 - index += 1 - } - } else { - // Non-whitespace token, stop indentation removal + // Remove up to the fence indentation from the content line + let contentTokens = MarkdownIndentation.removeIndentation(from: line.tokens, upToColumn: codeBlock.fenceIndent) + + // Convert tokens to content + var contentParts: [String] = [] + for token in contentTokens { + if token.element == .newline || token.element == .eof { break } + contentParts.append(token.text) } - - // Extract remaining content tokens including newline - while index < contentEnd { - let token = context.tokens[index] - switch token.element { - case .characters, .punctuation, .whitespaces, .charef, .newline: - lineContent += token.text - default: - break - } - index += 1 + let content = contentParts.joined() + + if !codeBlock.source.isEmpty { + codeBlock.source += "\n" } - - // Add content to the code block (lineContent already includes newline) - currentFence.codeBlock.source += lineContent - + codeBlock.source += content + return true } - - private func checkClosingFence( - character: String, - minLength: Int, - tokens: [any CodeToken], - startIndex: Int - ) -> Int? { - var index = startIndex - - // Skip leading whitespace (up to 3 spaces allowed) - var leadingSpaces = 0 - while index < tokens.count, - let token = tokens[index] as? any CodeToken, - token.element == .whitespaces { - let spaceCount = token.text.count - if leadingSpaces + spaceCount > 3 { - return nil - } - leadingSpaces += spaceCount - index += 1 - } - - // Count fence characters + + /// Check if tokens form a fence pattern starting at given index + /// Returns (isFence, fenceChar, fenceLength) + private func checkFencePattern(tokens: [any CodeToken], startIndex: Int) -> (Bool, Character, Int) { + guard startIndex < tokens.count else { return (false, " ", 0) } + + let firstToken = tokens[startIndex] + guard firstToken.element == .punctuation else { return (false, " ", 0) } + + // Check for backtick or tilde fence - each character is a separate token + let firstChar = firstToken.text.first + guard firstChar == "`" || firstChar == "~" else { return (false, " ", 0) } + + // Count consecutive fence characters var fenceLength = 0 - while index < tokens.count, - let token = tokens[index] as? any CodeToken, - token.element == .punctuation, - token.text == character { - fenceLength += 1 - index += 1 - } - - // Must have at least as many characters as opening fence - guard fenceLength >= minLength else { - return nil - } - - // Skip remaining whitespace until end of line - while index < tokens.count, - let token = tokens[index] as? any CodeToken, - token.element == .whitespaces { - index += 1 - } - - // Must reach end of line or newline - if index < tokens.count { + var index = startIndex + + while index < tokens.count { let token = tokens[index] - if token.element != .newline { - return nil + if token.element == .punctuation && token.text.first == firstChar { + fenceLength += 1 + index += 1 + } else { + break } } - - return fenceLength + + return (fenceLength >= 3, firstChar!, fenceLength) + } +} + +/// Specialized code block for fenced code blocks +public class MarkdownFencedCodeBlock: CodeBlockNode { + public override var blockType: String { "fenced_code_block" } + public var fenceChar: Character + public var fenceLength: Int + public var isClosed: Bool = false + + // Package-level properties for enhanced nested block parsing + package var fenceIndent: Int = 0 // Number of spaces before the opening fence + package var fenceColumn: Int = 0 // Column position of the opening fence + + public init(fenceChar: Character, fenceLength: Int, language: String? = nil) { + self.fenceChar = fenceChar + self.fenceLength = fenceLength + // Use empty source initially, will be populated during processing + super.init(source: "", language: language) + } + + public override func hash(into hasher: inout Hasher) { + super.hash(into: &hasher) + hasher.combine(fenceChar) + hasher.combine(fenceLength) } } \ No newline at end of file diff --git a/Sources/CodeParserCollection/Markdown/Nodes/MarkdownHTMLBlockBuilder.swift b/Sources/CodeParserCollection/Markdown/Nodes/MarkdownHTMLBlockBuilder.swift deleted file mode 100644 index cf8cb9f..0000000 --- a/Sources/CodeParserCollection/Markdown/Nodes/MarkdownHTMLBlockBuilder.swift +++ /dev/null @@ -1,232 +0,0 @@ -import CodeParserCore -import Foundation - -/// Handles HTML blocks according to CommonMark specification (all 7 types) -/// CommonMark Spec: https://spec.commonmark.org/0.31.2/#html-blocks -public class MarkdownHTMLBlockBuilder: CodeNodeBuilder { - public typealias Node = MarkdownNodeElement - public typealias Token = MarkdownTokenElement - - public init() {} - - public func build(from context: inout CodeConstructContext) -> Bool { - guard let state = context.state as? MarkdownConstructState else { return false } - guard !context.tokens.isEmpty else { return false } - - // In phased pipeline, builders receive the suffix tokens; always start at local 0 - let startIndex = 0 - guard startIndex < context.tokens.count else { return false } - - // If we have an open HTML block, handle content continuation - if let openHTML = state.openHTMLBlock { - return handleHTMLBlockContent(openHTML: openHTML, context: &context, state: state) - } - - // Reconstruct the raw line (excluding trailing newline) - var line = "" - for t in context.tokens { - if t.element == .newline { break } - switch t.element { - case .characters, .punctuation, .whitespaces, .charef: - line += t.text - default: - break - } - } - - let trimmed = line.trimmingCharacters(in: .whitespaces) - - // Check for HTML block types (1-7 per CommonMark spec) - guard let htmlType = detectHTMLBlockType(line: trimmed) else { return false } - - // HTML blocks can interrupt paragraphs - if context.current.element == .paragraph, let parent = context.current.parent { - context.current = parent - } - - // Place at document level if inside container structures (HTML blocks break out of containers) - if isInsideContainer(context: context) { - context.current = findDocumentLevel(context: context) - } - - // For type 2-5 (closed on same line), create simple HTML block - if htmlType.closedOnSameLine { - let html = HTMLBlockNode(name: htmlType.name, content: trimmed) - context.current.append(html) - return true - } - - // For type 1, 6, 7 (multi-line), start HTML block and set state - let html = HTMLBlockNode(name: htmlType.name, content: line + "\n") - context.current.append(html) - - // Set state to continue collecting HTML content - state.openHTMLBlock = OpenHTMLBlockInfo( - type: htmlType.type, - endCondition: htmlType.endCondition, - htmlBlock: html - ) - - return true - } - - private func isInsideContainer(context: CodeConstructContext) -> Bool { - var current: MarkdownNodeBase? = context.current as? MarkdownNodeBase - while let node = current { - if node is BlockquoteNode || node is ListItemNode || node is ListNode { - return true - } - current = node.parent() - } - return false - } - - private func findDocumentLevel(context: CodeConstructContext) -> CodeNode { - var current = context.current - while let parent = current.parent { - if let markdownParent = parent as? MarkdownNodeBase, - !(markdownParent is BlockquoteNode) && !(markdownParent is ListItemNode) && !(markdownParent is ListNode) { - return parent - } - current = parent - } - return current - } - - /// Handles content for an already open HTML block - private func handleHTMLBlockContent( - openHTML: OpenHTMLBlockInfo, - context: inout CodeConstructContext, - state: MarkdownConstructState - ) -> Bool { - // Reconstruct the raw line (including newline) - var line = "" - for t in context.tokens { - switch t.element { - case .characters, .punctuation, .whitespaces, .charef, .newline: - line += t.text - default: - break - } - } - - // Check if this line ends the HTML block - if let endCondition = openHTML.endCondition { - if line.contains(endCondition) { - // Add this line to the HTML block content and close it - openHTML.htmlBlock.content += line - state.openHTMLBlock = nil - return true - } - } else { - // For type 6 and 7, HTML blocks end at blank line - let trimmed = line.trimmingCharacters(in: .whitespaces) - if trimmed.isEmpty { - // Blank line ends the HTML block (don't include the blank line) - state.openHTMLBlock = nil - return false // Let other builders handle the blank line - } - } - - // Add line to HTML block content - openHTML.htmlBlock.content += line - return true - } - - /// Detects HTML block type according to CommonMark specification - private func detectHTMLBlockType(line: String) -> HTMLBlockTypeInfo? { - let lowercaseLine = line.lowercased() - - // Type 1: