diff --git a/lib/textile/helpers.ex b/lib/textile/helpers.ex
new file mode 100644
index 00000000..fcd05151
--- /dev/null
+++ b/lib/textile/helpers.ex
@@ -0,0 +1,46 @@
+defmodule Textile.Helpers do
+  import NimbleParsec
+
+  # Helper to "undo" a tokenization and convert it back
+  # to a string
+  def unwrap([{_name, value}]), do: value
+
+  # Lots of extra unicode space characters
+  def space do
+    choice([
+      utf8_char('\n\r\f \t\u00a0\u1680\u180e\u202f\u205f\u3000'),
+      utf8_char([0x2000..0x200a])
+    ])
+  end
+
+  # Characters which are valid before and after the main markup characters.
+  def special_characters do
+    choice([
+      space(),
+      utf8_char('#$%&(),-./:;<=?[\\]^`|~\'')
+    ])
+  end
+
+  # Simple tag for a markup element that must
+  # be succeeded immediately by a non-space character.
+  # `char` is the character that may not directly follow `str`; when it is
+  # omitted, the first codepoint of `str` is used. The resulting token is
+  # tagged :"<tag_name>_open".
+  def markup_open_tag(str, char \\ nil, tag_name) do
+    char = char || binary_head(str)
+
+    open_stops =
+      choice([
+        space(),
+        string(char)
+      ])
+
+    string(str)
+    |> lookahead_not(open_stops)
+    |> unwrap_and_tag(:"#{tag_name}_open")
+  end
+
+  # Returns the first UTF-8 codepoint of a binary, re-encoded as a binary,
+  # so that it can be passed to string/1.
+  defp binary_head(<<c::utf8, _rest::binary>>), do: <<c::utf8>>
+end
\ No newline at end of file
diff --git a/lib/textile/lexer.ex b/lib/textile/lexer.ex
index d22b5edc..72b1a472 100644
--- a/lib/textile/lexer.ex
+++ b/lib/textile/lexer.ex
@@ -1,16 +1,14 @@
 defmodule Textile.Lexer do
   import NimbleParsec
+  import Textile.Helpers
+  import Textile.MarkupLexer
 
-  defp unwrap([{_name, value}]),
-    do: value
 
-  # Lots of extra unicode space characters
-  space =
-    choice([
-      utf8_char('\n\r\f \t\u00a0\u1680\u180e\u202f\u205f\u3000'),
-      utf8_char([0x2000..0x200a])
-    ])
+  # Structural tags
+
+  # Literals enclosed via [== ==]
+  # Will never contain any markup
 
   bracketed_literal =
     ignore(string("[=="))
     |> repeat(lookahead_not(string("==]")) |> utf8_char([]))
@@ -18,175 +16,44 @@ defmodule Textile.Lexer do
     |> reduce({List, :to_string, []})
     |> unwrap_and_tag(:bracketed_literal)
 
-  link_text_with_title =
-    ignore(string("\""))
-    |> times(utf8_char(not: ?(), min: 1)
-    |> reduce({List, :to_string, []})
-    |> unwrap_and_tag(:link_text)
-    |> ignore(string("("))
-    |> concat(
-      times(utf8_char(not: ?), not: ?"), min: 1)
-      |> reduce({List, :to_string, []})
-      |> unwrap_and_tag(:link_title)
-      |> ignore(string(")\":"))
-    )
-
-  link_text_without_title =
-    ignore(string("\""))
-    |> times(utf8_char(not: ?"), min: 1)
-    |> ignore(string("\":"))
-    |> reduce({List, :to_string, []})
-    |> unwrap_and_tag(:link_text)
-
-  link_text =
-    choice([
-      link_text_with_title,
-      link_text_without_title
+  blockquote_cite =
+    lookahead_not(string("\""))
+    |> choice([
+      bracketed_literal |> reduce(:unwrap),
+      utf8_char([])
     ])
+    |> repeat()
 
-  link_protocol =
-    choice([
-      string("/"), string("https://"), string("http://"), string("data:image/")
-    ])
-
-  uri_ending_at_space =
-    link_protocol
-    |> times(lookahead_not(space) |> utf8_char([]), min: 1)
-    |> reduce({List, :to_string, []})
-
-  uri_ending_at_bracket =
-    link_protocol
-    |> times(lookahead_not(string("]")) |> utf8_char([]), min: 1)
-    |> reduce({List, :to_string, []})
-
-  uri_ending_at_lparen =
-    link_protocol
-    |> times(lookahead_not(string("(")) |> utf8_char([]), min: 1)
-    |> reduce({List, :to_string, []})
-
-  uri_ending_at_bang =
-    link_protocol
-    |> times(lookahead_not(string("!")) |> utf8_char([]), min: 1)
-    |> reduce({List, :to_string, []})
-
-  unbracketed_link =
-    link_text
-    |> concat(uri_ending_at_space |> unwrap_and_tag(:link_url))
-
-  bracketed_link =
-    ignore(string("["))
-    |> concat(link_text)
-    |> concat(uri_ending_at_bracket |> unwrap_and_tag(:link_url))
-    |> ignore(string("]"))
-
-  link =
-    choice([
-      bracketed_link,
-      unbracketed_link
-    ])
-
-  image_url_with_title =
-    ignore(string("!"))
-    |> concat(uri_ending_at_lparen |> unwrap_and_tag(:image_url))
-    |> ignore(string("("))
-    |> concat(
-      times(utf8_char(not: ?), not: ?!), min: 1)
-      |> reduce({List, :to_string, []})
-      |> unwrap_and_tag(:image_title)
-      |> ignore(string(")!"))
-    )
-
-  image_url_without_title =
-    ignore(string("!"))
-    |> concat(uri_ending_at_bang |> unwrap_and_tag(:image_url))
-    |> ignore(string("!"))
-
-  image_url =
-    choice([
-      image_url_with_title,
-      image_url_without_title
-    ])
-
-  unbracketed_image =
-    image_url
-    |> optional(
-      ignore(string(":"))
-      |> concat(uri_ending_at_space)
-      |> unwrap_and_tag(:image_link_url)
-    )
-
-  bracketed_image =
-    ignore(string("["))
-    |> concat(image_url)
-    |> optional(
-      ignore(string(":"))
-      |> concat(uri_ending_at_bracket)
-      |> unwrap_and_tag(:image_link_url)
-    )
-    |> ignore(string("]"))
-
-  image =
-    choice([
-      bracketed_image,
-      unbracketed_image
-    ])
-
-  literal =
-    ignore(string("=="))
-    |> repeat(lookahead_not(string("==")) |> utf8_char([]))
-    |> reduce({List, :to_string, []})
-    |> unwrap_and_tag(:literal)
-    |> ignore(string("=="))
-
-  blockquote_author =
-    repeat(
-      lookahead_not(string("\"]"))
-      |> choice([
-        bracketed_literal,
-        literal,
-        utf8_char([])
-      ])
-    )
-    |> reduce(:unwrap)
-
-  l_bq_author =
+  # Blockquote opening tag with cite: [bq="the author"]
+  # Cite can contain bracketed literals or text
+  blockquote_open_cite =
     ignore(string("[bq=\""))
-    |> concat(blockquote_author)
+    |> concat(blockquote_cite)
     |> ignore(string("\"]"))
     |> reduce({List, :to_string, []})
-    |> unwrap_and_tag(:l_bq_author)
+    |> unwrap_and_tag(:blockquote_open_cite)
 
-  l_bq = string("[bq]") |> unwrap_and_tag(:l_bq)
-  r_bq = string("[/bq]") |> unwrap_and_tag(:r_bq)
+  # Blockquote opening tag
+  blockquote_open =
+    string("[bq]")
+    |> unwrap_and_tag(:blockquote_open)
 
-  l_spoiler = string("[spoiler]") |> unwrap_and_tag(:l_spoiler)
-  r_spoiler = string("[/spoiler]") |> unwrap_and_tag(:r_spoiler)
+  # Blockquote closing tag
+  blockquote_close =
+    string("[/bq]")
+    |> unwrap_and_tag(:blockquote_close)
 
-  stop_words =
-    choice([
-      bracketed_literal,
-      bracketed_link,
-      bracketed_image,
-      link,
-      image,
-      l_bq_author,
-      l_bq,
-      r_bq,
-      l_spoiler,
-      r_spoiler,
-    ])
+  # Spoiler open tag
+  spoiler_open =
+    string("[spoiler]")
+    |> unwrap_and_tag(:spoiler_open)
 
-  defcombinatorp :top_level,
-    choice([
-      stop_words,
-      times(lookahead_not(stop_words) |> utf8_char([]), min: 1)
-      |> reduce({List, :to_string, []})
-      |> unwrap_and_tag(:text)
-    ])
+  # Spoiler close tag
+  spoiler_close =
+    string("[/spoiler]")
+    |> unwrap_and_tag(:spoiler_close)
 
-  textile =
-    repeat(parsec(:top_level))
-    |> eos()
+  markup = markup_segment(eos())
 
-  defparsec :lex, textile
+  defparsec :markup, markup
 end
\ No newline at end of file
diff --git a/lib/textile/markup_lexer.ex b/lib/textile/markup_lexer.ex
new file mode 100644
index 00000000..d77e8f7f
--- /dev/null
+++ b/lib/textile/markup_lexer.ex
@@ -0,0 +1,166 @@
+defmodule Textile.MarkupLexer do
+  import NimbleParsec
+  import Textile.Helpers
+
+  # Markup tags
+
+  # Builds the combinator for one run of inline markup.
+  # `ending_sequence` is a combinator marking where the run stops. It is only
+  # ever used inside lookaheads here, so this combinator never consumes it.
+  def markup_segment(ending_sequence) do
+
+    # The literal tag is special, because
+    # 1. It needs to capture everything inside it as a distinct token.
+    # 2. It can be surrounded by markup on all sides.
+    # 3. If it successfully tokenizes, it will always be in the output.
+
+    literal_open_stops =
+      choice([
+        space(),
+        ending_sequence,
+        string("=")
+      ])
+
+    literal_close_stops =
+      lookahead_not(
+        choice([
+          ending_sequence,
+          string("\n\n"),
+          string("="),
+          space() |> concat(string("="))
+        ])
+      )
+      |> utf8_char([])
+
+    literal =
+      ignore(string("=="))
+      |> lookahead_not(literal_open_stops)
+      |> repeat(literal_close_stops)
+      |> ignore(string("=="))
+      |> reduce({List, :to_string, []})
+      |> unwrap_and_tag(:literal)
+
+    # Opening tags. The optional middle argument is the character that may
+    # not directly follow the tag; it must match the tag's own markup char.
+    b_open = markup_open_tag("**", "*", :b)
+    i_open = markup_open_tag("__", "_", :i)
+
+    strong_open = markup_open_tag("*", :strong)
+    em_open = markup_open_tag("_", :em)
+    code_open = markup_open_tag("@", :code)
+    ins_open = markup_open_tag("+", :ins)
+    sup_open = markup_open_tag("^", :sup)
+    del_open = markup_open_tag("-", :del)
+    sub_open = markup_open_tag("~", :sub)
+
+    b_b_open = markup_open_tag("[**", "*", :b_b)
+    b_i_open = markup_open_tag("[__", "_", :b_i)
+
+    b_strong_open = markup_open_tag("[*", "*", :b_strong)
+    b_em_open = markup_open_tag("[_", "_", :b_em)
+    b_code_open = markup_open_tag("[@", "@", :b_code)
+    b_ins_open = markup_open_tag("[+", "+", :b_ins)
+    b_sup_open = markup_open_tag("[^", "^", :b_sup)
+    b_del_open = markup_open_tag("[-", "-", :b_del)
+    b_sub_open = markup_open_tag("[~", "~", :b_sub)
+
+    b_b_close = string("**]") |> unwrap_and_tag(:b_b_close)
+    b_i_close = string("__]") |> unwrap_and_tag(:b_i_close)
+
+    b_strong_close = string("*]") |> unwrap_and_tag(:b_strong_close)
+    b_em_close = string("_]") |> unwrap_and_tag(:b_em_close)
+    b_code_close = string("@]") |> unwrap_and_tag(:b_code_close)
+    b_ins_close = string("+]") |> unwrap_and_tag(:b_ins_close)
+    b_sup_close = string("^]") |> unwrap_and_tag(:b_sup_close)
+    b_del_close = string("-]") |> unwrap_and_tag(:b_del_close)
+    b_sub_close = string("~]") |> unwrap_and_tag(:b_sub_close)
+
+    b_close = string("**") |> unwrap_and_tag(:b_close)
+    i_close = string("__") |> unwrap_and_tag(:i_close)
+
+    strong_close = string("*") |> unwrap_and_tag(:strong_close)
+    em_close = string("_") |> unwrap_and_tag(:em_close)
+    code_close = string("@") |> unwrap_and_tag(:code_close)
+    ins_close = string("+") |> unwrap_and_tag(:ins_close)
+    sup_close = string("^") |> unwrap_and_tag(:sup_close)
+    del_close = string("-") |> unwrap_and_tag(:del_close)
+    sub_close = string("~") |> unwrap_and_tag(:sub_close)
+
+    bracketed_markup_opening_tags =
+      choice([
+        b_b_open,
+        b_i_open,
+        b_strong_open,
+        b_em_open,
+        b_code_open,
+        b_ins_open,
+        b_sup_open,
+        b_del_open,
+        b_sub_open
+      ])
+
+    markup_opening_tags =
+      choice([
+        b_open,
+        i_open,
+        strong_open,
+        em_open,
+        code_open,
+        ins_open,
+        sup_open,
+        del_open,
+        sub_open
+      ])
+
+    bracketed_markup_closing_tags =
+      choice([
+        b_b_close,
+        b_i_close,
+        b_strong_close,
+        b_em_close,
+        b_code_close,
+        b_ins_close,
+        b_sup_close,
+        b_del_close,
+        b_sub_close,
+        b_close,
+        i_close,
+      ])
+
+    markup_closing_tags =
+      choice([
+        strong_close,
+        em_close,
+        code_close,
+        ins_close,
+        sup_close,
+        del_close,
+        sub_close
+      ])
+
+    markup_at_start =
+      choice([
+        markup_opening_tags,
+        bracketed_markup_opening_tags
+      ])
+
+    markup_element =
+      lookahead_not(ending_sequence)
+      |> choice([
+        special_characters() |> concat(markup_opening_tags),
+        bracketed_markup_opening_tags,
+        # utf8 char which is not a space, followed by a closing tag, followed by
+        # a special or the end; the space check must run before the char is consumed
+        lookahead_not(space())
+        |> utf8_char([])
+        |> concat(markup_closing_tags)
+        |> lookahead(choice([special_characters(), ending_sequence])),
+        utf8_char([]) |> concat(bracketed_markup_closing_tags),
+        literal,
+        utf8_char([])
+      ])
+
+    optional(markup_at_start)
+    |> repeat(markup_element)
+  end
+end
\ No newline at end of file