Refactor GFM parser (#11)

* Define options with squiggly heredoc literals
* Use case expressions to reduce attribute lookups
* DRY repetitions with local variable assignments
* Test for truthy instead of negating nil-check
* Create regex from list of items with Regexp.union
* Use attr_reader to define trivial reader methods
* Use whitespace to improve readability
* initialize hard_line_break just once per instance
* Improve readability of conditional expression
* check if last line is empty first
* No point in pushing an empty substring
* extract complex conditional body to private method
* Avoid unnecessary array allocations
* Freeze mutable constants
kramdown · Jan 28, 2019 · b780566 · b780566
1 parent deb0177
commit b780566
Show file tree

Hide file tree

Showing 2 changed files with 108 additions and 84 deletions.
diff --git a/.rubocop.yml b/.rubocop.yml
@@ -5,7 +5,7 @@ AllCops:
 
 
 Metrics/LineLength:
-  Max: 110 # Preferred length is 100
   Max: 120 # Preferred length is 100
 
 Metrics/ParameterLists:
   CountKeywordArgs: false
@@ -90,6 +90,8 @@ Style/YodaCondition:
 Style/EmptyElse:
   EnforcedStyle: empty
 
 Style/Documentation:
   Enabled: false
 
 Style/GuardClause:
   Enabled: false # false alarms
@@ -103,9 +105,6 @@ Style/Next:
 Style/ParallelAssignment:
   Enabled: false # not really needed
 
-Style/TrivialAccessors:
-  Enabled: false
-
 Style/NestedTernaryOperator:
   Enabled: false # compact nested ternary operators are okay
 
@@ -136,9 +135,6 @@ Style/SpecialGlobalVars:
 Style/MultipleComparison:
   Enabled: false # why should an array be created? especially if only two items are compared
 
-Style/AccessModifierDeclarations:
-  Enabled: false
-
 Style/WhileUntilModifier:
   Enabled: false # I prefer to use either one or the other, depending on context
 
@@ -150,7 +146,8 @@ Layout/AlignHash:
   EnforcedLastArgumentHashStyle: ignore_implicit
 
 Layout/SpaceInsideBlockBraces:
-  SpaceBeforeBlockParameters: false
   EnforcedStyle: space
   SpaceBeforeBlockParameters: space
 
 Layout/SpaceInsideHashLiteralBraces:
   EnforcedStyle: no_space
@@ -180,11 +177,16 @@ Layout/IndentAssignment:
   Enabled: false # false alarms
 
 
 Lint/LiteralAsCondition:
   Enabled: false # we use while true
 
 
 Naming/FileName:
   Exclude:
     - 'lib/kramdown-parser-gfm.rb'
 
 Naming/HeredocDelimiterNaming:
   Enabled: false # we like our delimiters short and obvious
 
 Naming/UncommunicativeMethodParamName:
   Enabled: false # for points the names x,y are perfectly reasonable
-
-Lint/LiteralAsCondition:
-  Enabled: false # we use while true
diff --git a/lib/kramdown/parser/gfm.rb b/lib/kramdown/parser/gfm.rb
@@ -13,49 +13,49 @@
 module Kramdown
   module Options
 
-    define(:hard_wrap, Boolean, true, <<EOF)
-Interprets line breaks literally
     define(:hard_wrap, Boolean, true, <<~EOF)
       Interprets line breaks literally
 
-Insert HTML `<br />` tags inside paragraphs where the original Markdown
-document had newlines (by default, Markdown ignores these newlines).
       Insert HTML `<br />` tags inside paragraphs where the original Markdown
       document had newlines (by default, Markdown ignores these newlines).
 
-Default: true
-Used by: GFM parser
-EOF
       Default: true
       Used by: GFM parser
     EOF
 
-    define(:gfm_quirks, Object, [:paragraph_end], <<EOF) do |val|
-Enables a set of GFM specific quirks
     define(:gfm_quirks, Object, [:paragraph_end], <<~EOF) do |val|
       Enables a set of GFM specific quirks
 
-The way how GFM is transformed on Github often differs from the way
-kramdown does things. Many of these differences are negligible but
-others are not.
       The way how GFM is transformed on Github often differs from the way
       kramdown does things. Many of these differences are negligible but
       others are not.
 
-This option allows one to enable/disable certain GFM quirks, i.e. ways
-in which GFM parsing differs from kramdown parsing.
       This option allows one to enable/disable certain GFM quirks, i.e. ways
       in which GFM parsing differs from kramdown parsing.
 
-The value has to be a list of quirk names that should be enabled,
-separated by commas. Possible names are:
       The value has to be a list of quirk names that should be enabled,
       separated by commas. Possible names are:
 
-* paragraph_end
       * paragraph_end
 
-  Disables the kramdown restriction that at least one blank line has to
-  be used after a paragraph before a new block element can be started.
         Disables the kramdown restriction that at least one blank line has to
         be used after a paragraph before a new block element can be started.
 
-  Note that if this quirk is used, lazy line wrapping does not fully
-  work anymore!
         Note that if this quirk is used, lazy line wrapping does not fully
         work anymore!
 
-* no_auto_typographic
       * no_auto_typographic
 
-  Disables automatic conversion of some characters into their
-  corresponding typographic symbols (like `--` to em-dash etc).
-  This helps to achieve results closer to what GitHub Flavored
-  Markdown produces.
         Disables automatic conversion of some characters into their
         corresponding typographic symbols (like `--` to em-dash etc).
         This helps to achieve results closer to what GitHub Flavored
         Markdown produces.
 
-Default: paragraph_end
-Used by: GFM parser
-EOF
       Default: paragraph_end
       Used by: GFM parser
     EOF
       val = simple_array_validator(val, :gfm_quirks)
-      val.map! {|v| str_to_sym(v.to_s)}
       val.map! { |v| str_to_sym(v.to_s) }
       val
     end
 
@@ -68,19 +68,22 @@ class GFM < Kramdown::Parser::Kramdown
 
       VERSION = '1.0.1'
 
       attr_reader :paragraph_end
 
       def initialize(source, options)
         super
         @options[:auto_id_stripping] = true
         @id_counter = Hash.new(-1)
 
-        @span_parsers.delete(:line_break) if @options[:hard_wrap]
         @span_parsers.delete(:line_break)       if @options[:hard_wrap]
         @span_parsers.delete(:typographic_syms) if @options[:gfm_quirks].include?(:no_auto_typographic)
 
         if @options[:gfm_quirks].include?(:paragraph_end)
           atx_header_parser = :atx_header_gfm_quirk
-          @paragraph_end = self.class::PARAGRAPH_END_GFM
           @paragraph_end    = self.class::PARAGRAPH_END_GFM
         else
           atx_header_parser = :atx_header_gfm
-          @paragraph_end = self.class::PARAGRAPH_END
           @paragraph_end    = self.class::PARAGRAPH_END
         end
 
         {codeblock_fenced: :codeblock_fenced_gfm,
@@ -93,6 +96,8 @@ def initialize(source, options)
         i = @span_parsers.index(:escaped_chars)
         @span_parsers[i] = :escaped_chars_gfm if i
         @span_parsers << :strikethrough_gfm
 
         @hard_line_break = "#{@options[:hard_wrap] ? '' : '\\'}\n"
       end
 
       def parse
@@ -102,20 +107,8 @@ def parse
 
       def update_elements(element)
         element.children.map! do |child|
-          if child.type == :text &&
-              child.value.include?(hard_line_break = "#{@options[:hard_wrap] ? '' : '\\'}\n")
-            children = []
-            lines = child.value.split(hard_line_break, -1)
-            omit_trailing_br = (Kramdown::Element.category(element) == :block &&
-                                element.children[-1] == child && lines[-1].empty?)
-            lines.each_with_index do |line, index|
-              new_element_options = {location: child.options[:location] + index}
-
-              children << Element.new(:text, (index > 0 ? "\n#{line}" : line), nil, new_element_options)
-              children << Element.new(:br, nil, nil, new_element_options) if index < lines.size - 2 ||
-                (index == lines.size - 2 && !omit_trailing_br)
-            end
-            children
           if child.type == :text && child.value.include?(@hard_line_break)
             update_text_type(element, child)
           elsif child.type == :html_element
             child
           elsif child.type == :header && @options[:auto_ids] && !child.attr.key?('id')
@@ -133,41 +126,46 @@ def update_raw_text(item)
         raw_text =  ''
 
         append_text = lambda do |child|
-          if child.type == :text || child.type == :codespan || child.type == :math
           case child.type
           when :text, :codespan, :math
             raw_text << child.value
-          elsif child.type == :entity
           when :entity
             raw_text << child.value.char
-          elsif child.type == :smart_quote
           when :smart_quote
             raw_text << ::Kramdown::Utils::Entities.entity(child.value.to_s).char
-          elsif child.type == :typographic_sym
-            raw_text << if child.value == :laquo_space
           when :typographic_sym
             raw_text << case child.value
                         when :laquo_space
                           "« "
-                        elsif child.value == :raquo_space
                         when :raquo_space
                           " »"
                         else
                           ::Kramdown::Utils::Entities.entity(child.value.to_s).char
                         end
           else
-            child.children.each {|c| append_text.call(c) }
             child.children.each { |c| append_text.call(c) }
           end
         end
 
         append_text.call(item)
         item.options[:raw_text] = raw_text
       end
 
-      NON_WORD_RE = /[^\p{Word}\- \t]/
       NON_WORD_RE = /[^\p{Word}\- \t]/.freeze
 
       def generate_gfm_header_id(text)
         result = text.downcase
         result.gsub!(NON_WORD_RE, '')
         result.tr!(" \t", '-')
 
         @id_counter[result] += 1
-        result << (@id_counter[result] > 0 ? "-#{@id_counter[result]}" : '')
         counter_result = @id_counter[result]
         result << "-#{counter_result}" if counter_result > 0
 
         @options[:auto_id_prefix] + result
       end
 
-      ATX_HEADER_START = /^(?<level>\#{1,6})[\t ]+(?<contents>.*)\n/
       ATX_HEADER_START = /^(?<level>\#{1,6})[\t ]+(?<contents>.*)\n/.freeze
       define_parser(:atx_header_gfm, ATX_HEADER_START, nil, 'parse_atx_header')
       define_parser(:atx_header_gfm_quirk, ATX_HEADER_START)
 
@@ -176,16 +174,17 @@ def parse_atx_header_gfm_quirk
         text, id = parse_header_contents
         text.sub!(/[\t ]#+\z/, '') && text.rstrip!
         return false if text.empty?
 
         add_header(@src["level"].length, text, id)
         true
       end
 
-      FENCED_CODEBLOCK_START = /^[ ]{0,3}[~`]{3,}/
-      FENCED_CODEBLOCK_MATCH = /^[ ]{0,3}(([~`]){3,})\s*?((\S+?)(?:\?\S*)?)?\s*?\n(.*?)^[ ]{0,3}\1\2*\s*?\n/m
       FENCED_CODEBLOCK_START = /^[ ]{0,3}[~`]{3,}/.freeze
       FENCED_CODEBLOCK_MATCH = /^[ ]{0,3}(([~`]){3,})\s*?((\S+?)(?:\?\S*)?)?\s*?\n(.*?)^[ ]{0,3}\1\2*\s*?\n/m.freeze
       define_parser(:codeblock_fenced_gfm, FENCED_CODEBLOCK_START, nil, 'parse_codeblock_fenced')
 
-      STRIKETHROUGH_DELIM = /~~/
-      STRIKETHROUGH_MATCH = /#{STRIKETHROUGH_DELIM}(?!\s|~).*?[^\s~]#{STRIKETHROUGH_DELIM}/m
       STRIKETHROUGH_DELIM = /~~/.freeze
       STRIKETHROUGH_MATCH = /#{STRIKETHROUGH_DELIM}(?!\s|~).*?[^\s~]#{STRIKETHROUGH_DELIM}/m.freeze
       define_parser(:strikethrough_gfm, STRIKETHROUGH_MATCH, '~~')
 
       def parse_strikethrough_gfm
@@ -204,24 +203,29 @@ def parse_strikethrough_gfm
         el
       end
 
       LIST_TYPES = [:ul, :ol].freeze
 
       # To handle task-lists we override the parse method for lists, converting matching text into
       # checkbox input elements where necessary (as well as applying classes to the ul/ol and li
       # elements).
       def parse_list
         super
-        current_list = @tree.children.select {|element| [:ul, :ol].include?(element.type) }.last
         current_list = @tree.children.select { |element| LIST_TYPES.include?(element.type) }.last
 
-        is_tasklist = false
         is_tasklist   = false
         box_unchecked = '<input type="checkbox" class="task-list-item-checkbox" disabled="disabled" />'
-        box_checked = '<input type="checkbox" class="task-list-item-checkbox" ' \
         box_checked   = '<input type="checkbox" class="task-list-item-checkbox" ' \
           'disabled="disabled" checked="checked" />'
 
         current_list.children.each do |li|
-          next unless !li.children.empty? && li.children[0].type == :p
           list_items = li.children
           next unless !list_items.empty? && list_items[0].type == :p
 
           # li -> p -> raw_text
-          checked = li.children[0].children[0].value.gsub!(/\A\s*\[ \]\s+/, box_unchecked)
-          unchecked = li.children[0].children[0].value.gsub!(/\A\s*\[x\]\s+/i, box_checked)
-          is_tasklist ||= (!checked.nil? || !unchecked.nil?)
           descendant = list_items[0].children[0].value
           checked    = descendant.gsub!(/\A\s*\[ \]\s+/,  box_unchecked)
           unchecked  = descendant.gsub!(/\A\s*\[x\]\s+/i, box_checked)
           is_tasklist ||= checked || unchecked
 
           li.attr['class'] = 'task-list-item' if is_tasklist
         end
@@ -231,14 +235,32 @@ def parse_list
         true
       end
 
-      ESCAPED_CHARS_GFM = /\\([\\.*_+`<>()\[\]{}#!:\|"'\$=\-~])/
       ESCAPED_CHARS_GFM = /\\([\\.*_+`<>()\[\]{}#!:\|"'\$=\-~])/.freeze
       define_parser(:escaped_chars_gfm, ESCAPED_CHARS_GFM, '\\\\', :parse_escaped_chars)
 
-      PARAGRAPH_END_GFM = /#{LAZY_END}|#{LIST_START}|#{ATX_HEADER_START}|
-                           #{DEFINITION_LIST_START}|#{BLOCKQUOTE_START}|#{FENCED_CODEBLOCK_START}/x
       PARAGRAPH_END_GFM = Regexp.union(
         LAZY_END, LIST_START, ATX_HEADER_START, DEFINITION_LIST_START,
         BLOCKQUOTE_START, FENCED_CODEBLOCK_START
       )
 
       private
 
       def update_text_type(element, child)
         children = []
         lines = child.value.split(@hard_line_break, -1)
         omit_trailing_br = (lines[-1].empty? && Kramdown::Element.category(element) == :block &&
                             element.children[-1] == child)
 
         lines.each_with_index do |line, index|
           new_element_options = {location: child.options[:location] + index}
           children << Element.new(:text, (index > 0 ? "\n#{line}" : line), nil, new_element_options)
 
           if index < lines.size - 2 || (index == lines.size - 2 && !omit_trailing_br)
             children << Element.new(:br, nil, nil, new_element_options)
           end
         end
 
-      def paragraph_end
-        @paragraph_end
         children
       end
 
     end