mirror of
https://github.com/philomena-dev/philomena.git
synced 2024-11-23 20:18:00 +01:00
deal with markup segments
This commit is contained in:
parent
3e29e3785d
commit
32ac705eee
3 changed files with 233 additions and 168 deletions
41
lib/textile/helpers.ex
Normal file
41
lib/textile/helpers.ex
Normal file
|
@ -0,0 +1,41 @@
|
||||||
|
defmodule Textile.Helpers do
  @moduledoc """
  Small NimbleParsec building blocks shared by the Textile lexer modules.

  Every public function except `unwrap/1` returns a combinator and is meant
  to be called at compile time while assembling a parser.
  """

  import NimbleParsec

  @doc """
  Reverses a tokenization step.

  Expects a list holding exactly one `{name, value}` pair and returns the
  bare `value`. Any other shape raises a `FunctionClauseError`.
  """
  def unwrap([{_tag, inner}]), do: inner

  @doc """
  Combinator matching one whitespace character.

  Besides newline, carriage return, form feed, space and tab, this accepts a
  number of Unicode space characters (U+00A0, U+1680, U+180E, U+202F, U+205F,
  U+3000 and the U+2000..U+200A range).
  """
  def space do
    listed_spaces = utf8_char('\n\r\f \t\u00a0\u1680\u180e\u202f\u205f\u3000')
    u2000_range = utf8_char([0x2000..0x200a])

    choice([listed_spaces, u2000_range])
  end

  @doc """
  Combinator matching one character that is valid directly before or after
  the main markup characters: any `space/0` character or one of the listed
  punctuation characters.
  """
  def special_characters do
    punctuation = utf8_char('#$%&(),-./:;<=?[\\]^`|~\'')

    choice([space(), punctuation])
  end

  @doc """
  Combinator for a simple markup opening tag.

  Matches the literal `str`, but only when it is not immediately followed by
  a space character or by `char`. When `char` is omitted (or `nil`), the
  first codepoint of `str` is used instead. The matched text is tagged as
  `{:<tag_name>_open, str}`.
  """
  def markup_open_tag(str, char \\ nil, tag_name) do
    stop_char = if char, do: char, else: leading_codepoint(str)

    # The tag does not open anything when followed by one of these.
    forbidden_next = choice([space(), string(stop_char)])

    str
    |> string()
    |> lookahead_not(forbidden_next)
    |> unwrap_and_tag(:"#{tag_name}_open")
  end

  # Returns the first UTF-8 codepoint of a binary, re-encoded as a binary.
  defp leading_codepoint(<<first::utf8, _tail::binary>>), do: <<first::utf8>>
end
|
|
@ -1,16 +1,14 @@
|
||||||
defmodule Textile.Lexer do
|
defmodule Textile.Lexer do
|
||||||
import NimbleParsec
|
import NimbleParsec
|
||||||
|
import Textile.Helpers
|
||||||
|
import Textile.MarkupLexer
|
||||||
|
|
||||||
defp unwrap([{_name, value}]),
|
|
||||||
do: value
|
|
||||||
|
|
||||||
# Lots of extra unicode space characters
|
# Structural tags
|
||||||
space =
|
|
||||||
choice([
|
|
||||||
utf8_char('\n\r\f \t\u00a0\u1680\u180e\u202f\u205f\u3000'),
|
|
||||||
utf8_char([0x2000..0x200a])
|
|
||||||
])
|
|
||||||
|
|
||||||
|
|
||||||
|
# Literals enclosed via [== ==]
|
||||||
|
# Will never contain any markup
|
||||||
bracketed_literal =
|
bracketed_literal =
|
||||||
ignore(string("[=="))
|
ignore(string("[=="))
|
||||||
|> repeat(lookahead_not(string("==]")) |> utf8_char([]))
|
|> repeat(lookahead_not(string("==]")) |> utf8_char([]))
|
||||||
|
@ -18,175 +16,44 @@ defmodule Textile.Lexer do
|
||||||
|> reduce({List, :to_string, []})
|
|> reduce({List, :to_string, []})
|
||||||
|> unwrap_and_tag(:bracketed_literal)
|
|> unwrap_and_tag(:bracketed_literal)
|
||||||
|
|
||||||
link_text_with_title =
|
blockquote_cite =
|
||||||
ignore(string("\""))
|
lookahead_not(string("\""))
|
||||||
|> times(utf8_char(not: ?(), min: 1)
|
|> choice([
|
||||||
|> reduce({List, :to_string, []})
|
bracketed_literal |> reduce(:unwrap),
|
||||||
|> unwrap_and_tag(:link_text)
|
utf8_char([])
|
||||||
|> ignore(string("("))
|
|
||||||
|> concat(
|
|
||||||
times(utf8_char(not: ?), not: ?"), min: 1)
|
|
||||||
|> reduce({List, :to_string, []})
|
|
||||||
|> unwrap_and_tag(:link_title)
|
|
||||||
|> ignore(string(")\":"))
|
|
||||||
)
|
|
||||||
|
|
||||||
link_text_without_title =
|
|
||||||
ignore(string("\""))
|
|
||||||
|> times(utf8_char(not: ?"), min: 1)
|
|
||||||
|> ignore(string("\":"))
|
|
||||||
|> reduce({List, :to_string, []})
|
|
||||||
|> unwrap_and_tag(:link_text)
|
|
||||||
|
|
||||||
link_text =
|
|
||||||
choice([
|
|
||||||
link_text_with_title,
|
|
||||||
link_text_without_title
|
|
||||||
])
|
])
|
||||||
|
|> repeat()
|
||||||
|
|
||||||
link_protocol =
|
# Blockquote opening tag with cite: [bq="the author"]
|
||||||
choice([
|
# Cite can contain bracketed literals or text
|
||||||
string("/"), string("https://"), string("http://"), string("data:image/")
|
blockquote_open_cite =
|
||||||
])
|
|
||||||
|
|
||||||
uri_ending_at_space =
|
|
||||||
link_protocol
|
|
||||||
|> times(lookahead_not(space) |> utf8_char([]), min: 1)
|
|
||||||
|> reduce({List, :to_string, []})
|
|
||||||
|
|
||||||
uri_ending_at_bracket =
|
|
||||||
link_protocol
|
|
||||||
|> times(lookahead_not(string("]")) |> utf8_char([]), min: 1)
|
|
||||||
|> reduce({List, :to_string, []})
|
|
||||||
|
|
||||||
uri_ending_at_lparen =
|
|
||||||
link_protocol
|
|
||||||
|> times(lookahead_not(string("(")) |> utf8_char([]), min: 1)
|
|
||||||
|> reduce({List, :to_string, []})
|
|
||||||
|
|
||||||
uri_ending_at_bang =
|
|
||||||
link_protocol
|
|
||||||
|> times(lookahead_not(string("!")) |> utf8_char([]), min: 1)
|
|
||||||
|> reduce({List, :to_string, []})
|
|
||||||
|
|
||||||
unbracketed_link =
|
|
||||||
link_text
|
|
||||||
|> concat(uri_ending_at_space |> unwrap_and_tag(:link_url))
|
|
||||||
|
|
||||||
bracketed_link =
|
|
||||||
ignore(string("["))
|
|
||||||
|> concat(link_text)
|
|
||||||
|> concat(uri_ending_at_bracket |> unwrap_and_tag(:link_url))
|
|
||||||
|> ignore(string("]"))
|
|
||||||
|
|
||||||
link =
|
|
||||||
choice([
|
|
||||||
bracketed_link,
|
|
||||||
unbracketed_link
|
|
||||||
])
|
|
||||||
|
|
||||||
image_url_with_title =
|
|
||||||
ignore(string("!"))
|
|
||||||
|> concat(uri_ending_at_lparen |> unwrap_and_tag(:image_url))
|
|
||||||
|> ignore(string("("))
|
|
||||||
|> concat(
|
|
||||||
times(utf8_char(not: ?), not: ?!), min: 1)
|
|
||||||
|> reduce({List, :to_string, []})
|
|
||||||
|> unwrap_and_tag(:image_title)
|
|
||||||
|> ignore(string(")!"))
|
|
||||||
)
|
|
||||||
|
|
||||||
image_url_without_title =
|
|
||||||
ignore(string("!"))
|
|
||||||
|> concat(uri_ending_at_bang |> unwrap_and_tag(:image_url))
|
|
||||||
|> ignore(string("!"))
|
|
||||||
|
|
||||||
image_url =
|
|
||||||
choice([
|
|
||||||
image_url_with_title,
|
|
||||||
image_url_without_title
|
|
||||||
])
|
|
||||||
|
|
||||||
unbracketed_image =
|
|
||||||
image_url
|
|
||||||
|> optional(
|
|
||||||
ignore(string(":"))
|
|
||||||
|> concat(uri_ending_at_space)
|
|
||||||
|> unwrap_and_tag(:image_link_url)
|
|
||||||
)
|
|
||||||
|
|
||||||
bracketed_image =
|
|
||||||
ignore(string("["))
|
|
||||||
|> concat(image_url)
|
|
||||||
|> optional(
|
|
||||||
ignore(string(":"))
|
|
||||||
|> concat(uri_ending_at_bracket)
|
|
||||||
|> unwrap_and_tag(:image_link_url)
|
|
||||||
)
|
|
||||||
|> ignore(string("]"))
|
|
||||||
|
|
||||||
image =
|
|
||||||
choice([
|
|
||||||
bracketed_image,
|
|
||||||
unbracketed_image
|
|
||||||
])
|
|
||||||
|
|
||||||
literal =
|
|
||||||
ignore(string("=="))
|
|
||||||
|> repeat(lookahead_not(string("==")) |> utf8_char([]))
|
|
||||||
|> reduce({List, :to_string, []})
|
|
||||||
|> unwrap_and_tag(:literal)
|
|
||||||
|> ignore(string("=="))
|
|
||||||
|
|
||||||
blockquote_author =
|
|
||||||
repeat(
|
|
||||||
lookahead_not(string("\"]"))
|
|
||||||
|> choice([
|
|
||||||
bracketed_literal,
|
|
||||||
literal,
|
|
||||||
utf8_char([])
|
|
||||||
])
|
|
||||||
)
|
|
||||||
|> reduce(:unwrap)
|
|
||||||
|
|
||||||
l_bq_author =
|
|
||||||
ignore(string("[bq=\""))
|
ignore(string("[bq=\""))
|
||||||
|> concat(blockquote_author)
|
|> concat(blockquote_cite)
|
||||||
|> ignore(string("\"]"))
|
|> ignore(string("\"]"))
|
||||||
|> reduce({List, :to_string, []})
|
|> reduce({List, :to_string, []})
|
||||||
|> unwrap_and_tag(:l_bq_author)
|
|> unwrap_and_tag(:blockquote_open_cite)
|
||||||
|
|
||||||
l_bq = string("[bq]") |> unwrap_and_tag(:l_bq)
|
# Blockquote opening tag
|
||||||
r_bq = string("[/bq]") |> unwrap_and_tag(:r_bq)
|
blockquote_open =
|
||||||
|
string("[bq]")
|
||||||
|
|> unwrap_and_tag(:blockquote_open)
|
||||||
|
|
||||||
l_spoiler = string("[spoiler]") |> unwrap_and_tag(:l_spoiler)
|
# Blockquote closing tag
|
||||||
r_spoiler = string("[/spoiler]") |> unwrap_and_tag(:r_spoiler)
|
blockquote_close =
|
||||||
|
string("[/bq]")
|
||||||
|
|> unwrap_and_tag(:blockquote_close)
|
||||||
|
|
||||||
stop_words =
|
# Spoiler open tag
|
||||||
choice([
|
spoiler_open =
|
||||||
bracketed_literal,
|
string("[spoiler]")
|
||||||
bracketed_link,
|
|> unwrap_and_tag(:spoiler_open)
|
||||||
bracketed_image,
|
|
||||||
link,
|
|
||||||
image,
|
|
||||||
l_bq_author,
|
|
||||||
l_bq,
|
|
||||||
r_bq,
|
|
||||||
l_spoiler,
|
|
||||||
r_spoiler,
|
|
||||||
])
|
|
||||||
|
|
||||||
defcombinatorp :top_level,
|
# Spoiler close tag
|
||||||
choice([
|
spoiler_close =
|
||||||
stop_words,
|
string("[/spoiler]")
|
||||||
times(lookahead_not(stop_words) |> utf8_char([]), min: 1)
|
|> unwrap_and_tag(:spoiler_close)
|
||||||
|> reduce({List, :to_string, []})
|
|
||||||
|> unwrap_and_tag(:text)
|
|
||||||
])
|
|
||||||
|
|
||||||
textile =
|
markup = markup_segment(eos())
|
||||||
repeat(parsec(:top_level))
|
|
||||||
|> eos()
|
|
||||||
|
|
||||||
defparsec :lex, textile
|
defparsec :markup, markup
|
||||||
end
|
end
|
157
lib/textile/markup_lexer.ex
Normal file
157
lib/textile/markup_lexer.ex
Normal file
|
@ -0,0 +1,157 @@
|
||||||
|
defmodule Textile.MarkupLexer do
  @moduledoc """
  Builds the NimbleParsec combinator that tokenizes inline Textile markup
  (bold, italic, strong, emphasis, code, ins, sup, del, sub and `==` literals).
  """

  import NimbleParsec
  import Textile.Helpers

  # Markup tags

  @doc """
  Returns a combinator that tokenizes one segment of inline markup.

  `ending_sequence` is a combinator marking where the segment stops. It is
  only ever used inside lookaheads, so it is never consumed here.

  The resulting combinator optionally matches an opening tag at the very
  start of the segment and then repeats over markup elements. Tags are
  emitted as `{:<name>_open, text}` / `{:<name>_close, text}` tuples, literals
  as `{:literal, text}`, and every other character as a bare codepoint.
  """
  def markup_segment(ending_sequence) do
    # The literal tag is special, because
    # 1. It needs to capture everything inside it as a distinct token.
    # 2. It can be surrounded by markup on all sides.
    # 3. If it successfully tokenizes, it will always be in the output.

    # An opening "==" does not start a literal when followed by any of these.
    literal_open_stops =
      choice([
        space(),
        ending_sequence,
        string("=")
      ])

    # One character of literal content; stops before the end of the segment,
    # a blank line, an "=", or a space that is directly followed by "=".
    literal_close_stops =
      lookahead_not(
        choice([
          ending_sequence,
          string("\n\n"),
          string("="),
          space() |> concat(string("="))
        ])
      )
      |> utf8_char([])

    literal =
      ignore(string("=="))
      |> lookahead_not(literal_open_stops)
      |> repeat(literal_close_stops)
      |> ignore(string("=="))
      |> reduce({List, :to_string, []})
      |> unwrap_and_tag(:literal)

    b_open = markup_open_tag("**", "*", :b)
    # The stop character must be "_" here (as for b_i_open below): a "__"
    # opener must not be followed by another "_".
    i_open = markup_open_tag("__", "_", :i)

    strong_open = markup_open_tag("*", :strong)
    em_open = markup_open_tag("_", :em)
    code_open = markup_open_tag("@", :code)
    ins_open = markup_open_tag("+", :ins)
    sup_open = markup_open_tag("^", :sup)
    del_open = markup_open_tag("-", :del)
    sub_open = markup_open_tag("~", :sub)

    b_b_open = markup_open_tag("[**", "*", :b_b)
    b_i_open = markup_open_tag("[__", "_", :b_i)

    b_strong_open = markup_open_tag("[*", "*", :b_strong)
    b_em_open = markup_open_tag("[_", "_", :b_em)
    b_code_open = markup_open_tag("[@", "@", :b_code)
    b_ins_open = markup_open_tag("[+", "+", :b_ins)
    b_sup_open = markup_open_tag("[^", "^", :b_sup)
    b_del_open = markup_open_tag("[-", "-", :b_del)
    b_sub_open = markup_open_tag("[~", "~", :b_sub)

    b_b_close = string("**]") |> unwrap_and_tag(:b_b_close)
    b_i_close = string("__]") |> unwrap_and_tag(:b_i_close)

    b_strong_close = string("*]") |> unwrap_and_tag(:b_strong_close)
    b_em_close = string("_]") |> unwrap_and_tag(:b_em_close)
    b_code_close = string("@]") |> unwrap_and_tag(:b_code_close)
    b_ins_close = string("+]") |> unwrap_and_tag(:b_ins_close)
    b_sup_close = string("^]") |> unwrap_and_tag(:b_sup_close)
    b_del_close = string("-]") |> unwrap_and_tag(:b_del_close)
    b_sub_close = string("~]") |> unwrap_and_tag(:b_sub_close)

    b_close = string("**") |> unwrap_and_tag(:b_close)
    i_close = string("__") |> unwrap_and_tag(:i_close)

    strong_close = string("*") |> unwrap_and_tag(:strong_close)
    em_close = string("_") |> unwrap_and_tag(:em_close)
    code_close = string("@") |> unwrap_and_tag(:code_close)
    ins_close = string("+") |> unwrap_and_tag(:ins_close)
    sup_close = string("^") |> unwrap_and_tag(:sup_close)
    del_close = string("-") |> unwrap_and_tag(:del_close)
    sub_close = string("~") |> unwrap_and_tag(:sub_close)

    # Order matters inside each choice: longer tags are listed before the
    # shorter tags that are their prefixes ("[**" before "[*", "**" before "*").
    bracketed_markup_opening_tags =
      choice([
        b_b_open,
        b_i_open,
        b_strong_open,
        b_em_open,
        b_code_open,
        b_ins_open,
        b_sup_open,
        b_del_open,
        b_sub_open
      ])

    markup_opening_tags =
      choice([
        b_open,
        i_open,
        strong_open,
        em_open,
        code_open,
        ins_open,
        sup_open,
        del_open,
        sub_open
      ])

    # NOTE(review): the unbracketed "**" and "__" closers are grouped with the
    # bracketed closers, so they are matched without the trailing
    # special-character lookahead - presumably intentional; confirm.
    bracketed_markup_closing_tags =
      choice([
        b_b_close,
        b_i_close,
        b_strong_close,
        b_em_close,
        b_code_close,
        b_ins_close,
        b_sup_close,
        b_del_close,
        b_sub_close,
        b_close,
        i_close
      ])

    markup_closing_tags =
      choice([
        strong_close,
        em_close,
        code_close,
        ins_close,
        sup_close,
        del_close,
        sub_close
      ])

    # At the very start of a segment there is no preceding character, so an
    # opening tag is accepted without a leading special character.
    markup_at_start =
      choice([
        markup_opening_tags,
        bracketed_markup_opening_tags
      ])

    markup_element =
      lookahead_not(ending_sequence)
      |> choice([
        special_characters() |> concat(markup_opening_tags),
        bracketed_markup_opening_tags,
        # utf8 char which is not a space followed by a closing tag followed by a special or the end.
        # The space check has to run before the character is consumed; placed
        # after it, the check would inspect the closing tag instead.
        lookahead_not(space())
        |> utf8_char([])
        |> concat(markup_closing_tags)
        |> lookahead(choice([special_characters(), ending_sequence])),
        utf8_char([]) |> concat(bracketed_markup_closing_tags),
        literal,
        utf8_char([])
      ])

    optional(markup_at_start)
    |> repeat(markup_element)
  end
end
|
Loading…
Reference in a new issue