deal with markup segments

2025-01-20 06:37:59 +01:00 · 2019-11-03 15:32:55 -05:00 · 2019-11-03 15:32:55 -05:00 · 32ac705eee
commit 32ac705eee
parent 3e29e3785d
3 changed files with 233 additions and 168 deletions
--- a/lib/textile/helpers.ex
+++ b/lib/textile/helpers.ex
@ -0,0 +1,41 @@
+defmodule Textile.Helpers do
+  import NimbleParsec
+
+  # Collapses a single-token list back into the bare value it carries,
+  # discarding the tag name.
+  def unwrap([{_tag, payload}]), do: payload
+
+  # Matches one whitespace character, including assorted unicode spaces.
+  def space do
+    common = utf8_char('\n\r\f \t\u00a0\u1680\u180e\u202f\u205f\u3000')
+    en_quad_to_hair_space = utf8_char([0x2000..0x200a])
+
+    choice([common, en_quad_to_hair_space])
+  end
+
+  # Matches one character allowed directly before or after the main
+  # markup characters: any space, or one of the listed punctuation marks.
+  def special_characters do
+    punctuation = utf8_char('#$%&(),-./:;<=?[\\]^`|~\'')
+
+    choice([space(), punctuation])
+  end
+
+  # Builds the opening tag `str`, tagged as :"<tag_name>_open". The tag only
+  # matches when the next input is neither a space nor `char`; `char`
+  # defaults to the first codepoint of `str`.
+  def markup_open_tag(str, char \\ nil, tag_name) do
+    repeated = string(char || binary_head(str))
+    tag = :"#{tag_name}_open"
+
+    str
+    |> string()
+    |> lookahead_not(choice([space(), repeated]))
+    |> unwrap_and_tag(tag)
+  end
+
+  # First UTF-8 codepoint of a binary, returned as a binary.
+  defp binary_head(<<codepoint::utf8, _tail::binary>>) do
+    <<codepoint::utf8>>
+  end
+end
--- a/lib/textile/lexer.ex
+++ b/lib/textile/lexer.ex
@ -1,16 +1,14 @@
 defmodule Textile.Lexer do
  import NimbleParsec
+  import Textile.Helpers
+  import Textile.MarkupLexer

-  defp unwrap([{_name, value}]),
-    do: value

-  # Lots of extra unicode space characters
-  space =
-    choice([
-      utf8_char('\n\r\f \t\u00a0\u1680\u180e\u202f\u205f\u3000'),
-      utf8_char([0x2000..0x200a])
-    ])
+  # Structural tags

+
+  # Literals enclosed via [== ==]
+  # Will never contain any markup
  bracketed_literal =
    ignore(string("[=="))
    |> repeat(lookahead_not(string("==]")) |> utf8_char([]))
@ -18,175 +16,44 @@ defmodule Textile.Lexer do
    |> reduce({List, :to_string, []})
    |> unwrap_and_tag(:bracketed_literal)

-  link_text_with_title =
-    ignore(string("\""))
-    |> times(utf8_char(not: ?(), min: 1)
-    |> reduce({List, :to_string, []})
-    |> unwrap_and_tag(:link_text)
-    |> ignore(string("("))
-    |> concat(
-      times(utf8_char(not: ?), not: ?"), min: 1)
-      |> reduce({List, :to_string, []})
-      |> unwrap_and_tag(:link_title)
-      |> ignore(string(")\":"))
-    )
-
-  link_text_without_title =
-    ignore(string("\""))
-    |> times(utf8_char(not: ?"), min: 1)
-    |> ignore(string("\":"))
-    |> reduce({List, :to_string, []})
-    |> unwrap_and_tag(:link_text)
-
-  link_text =
-    choice([
-      link_text_with_title,
-      link_text_without_title
+  blockquote_cite =
+    lookahead_not(string("\""))
+    |> choice([
+      bracketed_literal |> reduce(:unwrap),
+      utf8_char([])
    ])
+    |> repeat()

-  link_protocol =
-    choice([
-      string("/"), string("https://"), string("http://"), string("data:image/")
-    ])
-
-  uri_ending_at_space =
-    link_protocol
-    |> times(lookahead_not(space) |> utf8_char([]), min: 1)
-    |> reduce({List, :to_string, []})
-
-  uri_ending_at_bracket =
-    link_protocol
-    |> times(lookahead_not(string("]")) |> utf8_char([]), min: 1)
-    |> reduce({List, :to_string, []})
-
-  uri_ending_at_lparen =
-    link_protocol
-    |> times(lookahead_not(string("(")) |> utf8_char([]), min: 1)
-    |> reduce({List, :to_string, []})
-
-  uri_ending_at_bang =
-    link_protocol
-    |> times(lookahead_not(string("!")) |> utf8_char([]), min: 1)
-    |> reduce({List, :to_string, []})
-
-  unbracketed_link =
-    link_text
-    |> concat(uri_ending_at_space |> unwrap_and_tag(:link_url))
-
-  bracketed_link =
-    ignore(string("["))
-    |> concat(link_text)
-    |> concat(uri_ending_at_bracket |> unwrap_and_tag(:link_url))
-    |> ignore(string("]"))
-
-  link =
-    choice([
-      bracketed_link,
-      unbracketed_link
-    ])
-
-  image_url_with_title =
-    ignore(string("!"))
-    |> concat(uri_ending_at_lparen |> unwrap_and_tag(:image_url))
-    |> ignore(string("("))
-    |> concat(
-      times(utf8_char(not: ?), not: ?!), min: 1)
-      |> reduce({List, :to_string, []})
-      |> unwrap_and_tag(:image_title)
-      |> ignore(string(")!"))
-    )
-
-  image_url_without_title =
-    ignore(string("!"))
-    |> concat(uri_ending_at_bang |> unwrap_and_tag(:image_url))
-    |> ignore(string("!"))
-
-  image_url =
-    choice([
-      image_url_with_title,
-      image_url_without_title
-    ])
-
-  unbracketed_image =
-    image_url
-    |> optional(
-      ignore(string(":"))
-      |> concat(uri_ending_at_space)
-      |> unwrap_and_tag(:image_link_url)
-    )
-
-  bracketed_image =
-    ignore(string("["))
-    |> concat(image_url)
-    |> optional(
-      ignore(string(":"))
-      |> concat(uri_ending_at_bracket)
-      |> unwrap_and_tag(:image_link_url)
-    )
-    |> ignore(string("]"))
-
-  image =
-    choice([
-      bracketed_image,
-      unbracketed_image
-    ])
-
-  literal =
-    ignore(string("=="))
-    |> repeat(lookahead_not(string("==")) |> utf8_char([]))
-    |> reduce({List, :to_string, []})
-    |> unwrap_and_tag(:literal)
-    |> ignore(string("=="))
-
-  blockquote_author =
-    repeat(
-      lookahead_not(string("\"]"))
-      |> choice([
-        bracketed_literal,
-        literal,
-        utf8_char([])
-      ])
-    )
-    |> reduce(:unwrap)
-
-  l_bq_author =
+  # Blockquote opening tag with cite: [bq="the author"]
+  # Cite can contain bracketed literals or text
+  blockquote_open_cite =
    ignore(string("[bq=\""))
-    |> concat(blockquote_author)
+    |> concat(blockquote_cite)
    |> ignore(string("\"]"))
    |> reduce({List, :to_string, []})
-    |> unwrap_and_tag(:l_bq_author)
+    |> unwrap_and_tag(:blockquote_open_cite)

-  l_bq = string("[bq]") |> unwrap_and_tag(:l_bq)
-  r_bq = string("[/bq]") |> unwrap_and_tag(:r_bq)
+  # Blockquote opening tag
+  blockquote_open =
+    string("[bq]")
+    |> unwrap_and_tag(:blockquote_open)

-  l_spoiler = string("[spoiler]") |> unwrap_and_tag(:l_spoiler)
-  r_spoiler = string("[/spoiler]") |> unwrap_and_tag(:r_spoiler)
+  # Blockquote closing tag
+  blockquote_close =
+    string("[/bq]")
+    |> unwrap_and_tag(:blockquote_close)

-  stop_words =
-    choice([
-      bracketed_literal,
-      bracketed_link,
-      bracketed_image,
-      link,
-      image,
-      l_bq_author,
-      l_bq,
-      r_bq,
-      l_spoiler,
-      r_spoiler,
-    ])
+  # Spoiler open tag
+  spoiler_open =
+    string("[spoiler]")
+    |> unwrap_and_tag(:spoiler_open)

-  defcombinatorp :top_level,
-    choice([
-      stop_words,
-      times(lookahead_not(stop_words) |> utf8_char([]), min: 1)
-      |> reduce({List, :to_string, []})
-      |> unwrap_and_tag(:text)
-    ])
+  # Spoiler close tag
+  spoiler_close =
+    string("[/spoiler]")
+    |> unwrap_and_tag(:spoiler_close)

-  textile =
-    repeat(parsec(:top_level))
-    |> eos()
+  markup = markup_segment(eos())

-  defparsec :lex, textile
+  defparsec :markup, markup
 end
--- a/lib/textile/markup_lexer.ex
+++ b/lib/textile/markup_lexer.ex
@ -0,0 +1,157 @@
+defmodule Textile.MarkupLexer do
+  import NimbleParsec
+  import Textile.Helpers
+
+  # Markup tags
+
+  # Builds the combinator for one run of inline markup: an optional opening
+  # tag at the very start, then markup elements for as long as
+  # `ending_sequence` does not match. `ending_sequence` is only used inside
+  # lookaheads, so this combinator never consumes it.
+  def markup_segment(ending_sequence) do
+    # The literal tag is special, because
+    # 1. It needs to capture everything inside it as a distinct token.
+    # 2. It can be surrounded by markup on all sides.
+    # 3. If it successfully tokenizes, it will always be in the output.
+
+    literal_open_stops = choice([space(), ending_sequence, string("=")])
+
+    literal_close_stops =
+      lookahead_not(
+        choice([
+          ending_sequence,
+          string("\n\n"),
+          string("="),
+          space() |> concat(string("="))
+        ])
+      )
+      |> utf8_char([])
+
+    literal =
+      ignore(string("=="))
+      |> lookahead_not(literal_open_stops)
+      |> repeat(literal_close_stops)
+      |> ignore(string("=="))
+      |> reduce({List, :to_string, []})
+      |> unwrap_and_tag(:literal)
+
+    # Opening tags. The optional middle argument is the character that, like a
+    # space, must not directly follow the tag; it defaults to the tag's first one.
+    b_open         = markup_open_tag("**", "*", :b)
+    i_open         = markup_open_tag("__", "_", :i)
+
+    strong_open    = markup_open_tag("*", :strong)
+    em_open        = markup_open_tag("_", :em)
+    code_open      = markup_open_tag("@", :code)
+    ins_open       = markup_open_tag("+", :ins)
+    sup_open       = markup_open_tag("^", :sup)
+    del_open       = markup_open_tag("-", :del)
+    sub_open       = markup_open_tag("~", :sub)
+
+    b_b_open       = markup_open_tag("[**", "*", :b_b)
+    b_i_open       = markup_open_tag("[__", "_", :b_i)
+
+    b_strong_open  = markup_open_tag("[*", "*", :b_strong)
+    b_em_open      = markup_open_tag("[_", "_", :b_em)
+    b_code_open    = markup_open_tag("[@", "@", :b_code)
+    b_ins_open     = markup_open_tag("[+", "+", :b_ins)
+    b_sup_open     = markup_open_tag("[^", "^", :b_sup)
+    b_del_open     = markup_open_tag("[-", "-", :b_del)
+    b_sub_open     = markup_open_tag("[~", "~", :b_sub)
+
+    b_b_close      = string("**]") |> unwrap_and_tag(:b_b_close)
+    b_i_close      = string("__]") |> unwrap_and_tag(:b_i_close)
+
+    b_strong_close = string("*]") |> unwrap_and_tag(:b_strong_close)
+    b_em_close     = string("_]") |> unwrap_and_tag(:b_em_close)
+    b_code_close   = string("@]") |> unwrap_and_tag(:b_code_close)
+    b_ins_close    = string("+]") |> unwrap_and_tag(:b_ins_close)
+    b_sup_close    = string("^]") |> unwrap_and_tag(:b_sup_close)
+    b_del_close    = string("-]") |> unwrap_and_tag(:b_del_close)
+    b_sub_close    = string("~]") |> unwrap_and_tag(:b_sub_close)
+
+    b_close        = string("**") |> unwrap_and_tag(:b_close)
+    i_close        = string("__") |> unwrap_and_tag(:i_close)
+
+    strong_close   = string("*") |> unwrap_and_tag(:strong_close)
+    em_close       = string("_") |> unwrap_and_tag(:em_close)
+    code_close     = string("@") |> unwrap_and_tag(:code_close)
+    ins_close      = string("+") |> unwrap_and_tag(:ins_close)
+    sup_close      = string("^") |> unwrap_and_tag(:sup_close)
+    del_close      = string("-") |> unwrap_and_tag(:del_close)
+    sub_close      = string("~") |> unwrap_and_tag(:sub_close)
+
+    bracketed_markup_opening_tags =
+      choice([
+        b_b_open,
+        b_i_open,
+        b_strong_open,
+        b_em_open,
+        b_code_open,
+        b_ins_open,
+        b_sup_open,
+        b_del_open,
+        b_sub_open
+      ])
+
+    markup_opening_tags =
+      choice([
+        b_open,
+        i_open,
+        strong_open,
+        em_open,
+        code_open,
+        ins_open,
+        sup_open,
+        del_open,
+        sub_open
+      ])
+
+    # NOTE(review): b_close/i_close sit here, so they skip the trailing lookahead - confirm.
+    bracketed_markup_closing_tags =
+      choice([
+        b_b_close,
+        b_i_close,
+        b_strong_close,
+        b_em_close,
+        b_code_close,
+        b_ins_close,
+        b_sup_close,
+        b_del_close,
+        b_sub_close,
+        b_close,
+        i_close,
+      ])
+
+    markup_closing_tags =
+      choice([
+        strong_close,
+        em_close,
+        code_close,
+        ins_close,
+        sup_close,
+        del_close,
+        sub_close
+      ])
+
+    markup_at_start = choice([markup_opening_tags, bracketed_markup_opening_tags])
+
+    markup_element =
+      lookahead_not(ending_sequence)
+      |> choice([
+        special_characters() |> concat(markup_opening_tags),
+        bracketed_markup_opening_tags,
+        # A non-space character, then a closing tag, then a special character or the end
+        lookahead_not(space())
+        |> utf8_char([])
+        |> concat(markup_closing_tags)
+        |> lookahead(choice([special_characters(), ending_sequence])),
+        utf8_char([]) |> concat(bracketed_markup_closing_tags),
+        literal,
+        utf8_char([])
+      ])
+
+    optional(markup_at_start)
+    |> repeat(markup_element)
+  end
+end