philomena/lib/textile/lexer.ex
2019-11-02 21:47:54 -04:00

192 lines
No EOL
4.4 KiB
Elixir

defmodule Textile.Lexer do
  @moduledoc """
  Tokenizer for Textile-style markup, built from NimbleParsec combinators.

  `defparsec :lex` at the bottom of this module generates `lex/2`. On success
  it returns NimbleParsec's usual `{:ok, tokens, rest, context, position,
  byte_offset}` tuple, where `tokens` is a list of `{tag, binary}` pairs with
  the following tags:

    * `:bracketed_literal`  - contents of `[== ... ==]`
    * `:link_text`, `:link_title`, `:link_url` - pieces of `"text(title)":url`
    * `:image_url`, `:image_title`, `:image_link_url` - pieces of
      `!url(title)!:link`
    * `:l_bq_author` - the author of `[bq="author"]`
    * `:l_bq`, `:r_bq`, `:l_spoiler`, `:r_spoiler` - the literal tag text
    * `:text` - any run of characters that starts none of the above
  """

  import NimbleParsec

  # Reducer for `blockquote_author`: receives everything matched between
  # `[bq="` and `"]`, which is a mix of raw codepoints and tagged literal
  # tokens (or an empty list), and flattens it into a single binary. Tagged
  # tokens contribute only their value; the tag is dropped.
  defp unwrap(tokens) do
    tokens
    |> Enum.map(fn
      {_name, value} -> value
      codepoint when is_integer(codepoint) -> codepoint
    end)
    |> List.to_string()
  end

  # ASCII whitespace plus the extra Unicode space characters.
  space =
    utf8_char([
      ?\n,
      ?\r,
      ?\f,
      ?\s,
      ?\t,
      0x00A0,
      0x1680,
      0x180E,
      0x202F,
      0x205F,
      0x3000,
      0x2000..0x200A
    ])

  # [== anything ==] ; the delimiters are discarded.
  bracketed_literal =
    ignore(string("[=="))
    |> repeat(lookahead_not(string("==]")) |> utf8_char([]))
    |> ignore(string("==]"))
    |> reduce({List, :to_string, []})
    |> unwrap_and_tag(:bracketed_literal)

  # "text(title)": ; the text may contain neither `(` nor `"`. Excluding the
  # quote keeps the text from running across an earlier, untitled link up to
  # some later `(title)":`, and matches what `link_text_without_title` allows.
  link_text_with_title =
    ignore(string("\""))
    |> times(utf8_char(not: ?(, not: ?"), min: 1)
    |> reduce({List, :to_string, []})
    |> unwrap_and_tag(:link_text)
    |> ignore(string("("))
    |> concat(
      times(utf8_char(not: ?), not: ?"), min: 1)
      |> reduce({List, :to_string, []})
      |> unwrap_and_tag(:link_title)
      |> ignore(string(")\":"))
    )

  # "text":
  link_text_without_title =
    ignore(string("\""))
    |> times(utf8_char(not: ?"), min: 1)
    |> ignore(string("\":"))
    |> reduce({List, :to_string, []})
    |> unwrap_and_tag(:link_text)

  # The titled form is tried first; it fails fast when no `(` follows the text.
  link_text =
    choice([
      link_text_with_title,
      link_text_without_title
    ])

  # Only these URL prefixes are recognised as link/image targets.
  link_protocol =
    choice([
      string("/"),
      string("https://"),
      string("http://"),
      string("data:image/")
    ])

  # The four URI flavours differ only in the terminator they stop before.
  # Each requires at least one character after the protocol prefix.
  uri_ending_at_space =
    link_protocol
    |> times(lookahead_not(space) |> utf8_char([]), min: 1)
    |> reduce({List, :to_string, []})

  uri_ending_at_bracket =
    link_protocol
    |> times(lookahead_not(string("]")) |> utf8_char([]), min: 1)
    |> reduce({List, :to_string, []})

  # NOTE(review): this accepts every character except `(`, including
  # whitespace and `!`, so a titled image can reach past an earlier closing
  # `!` to a later `(title)!`. Left as-is; confirm whether `!` should also
  # terminate the URL here.
  uri_ending_at_lparen =
    link_protocol
    |> times(lookahead_not(string("(")) |> utf8_char([]), min: 1)
    |> reduce({List, :to_string, []})

  uri_ending_at_bang =
    link_protocol
    |> times(lookahead_not(string("!")) |> utf8_char([]), min: 1)
    |> reduce({List, :to_string, []})

  # "text":url  (URL runs until whitespace or end of input)
  unbracketed_link =
    link_text
    |> concat(uri_ending_at_space |> unwrap_and_tag(:link_url))

  # ["text":url]  (URL runs until the closing bracket)
  bracketed_link =
    ignore(string("["))
    |> concat(link_text)
    |> concat(uri_ending_at_bracket |> unwrap_and_tag(:link_url))
    |> ignore(string("]"))

  link =
    choice([
      bracketed_link,
      unbracketed_link
    ])

  # !url(title)!
  image_url_with_title =
    ignore(string("!"))
    |> concat(uri_ending_at_lparen |> unwrap_and_tag(:image_url))
    |> ignore(string("("))
    |> concat(
      times(utf8_char(not: ?), not: ?!), min: 1)
      |> reduce({List, :to_string, []})
      |> unwrap_and_tag(:image_title)
      |> ignore(string(")!"))
    )

  # !url!
  image_url_without_title =
    ignore(string("!"))
    |> concat(uri_ending_at_bang |> unwrap_and_tag(:image_url))
    |> ignore(string("!"))

  image_url =
    choice([
      image_url_with_title,
      image_url_without_title
    ])

  # !url!  optionally followed by :link-url
  unbracketed_image =
    image_url
    |> optional(
      ignore(string(":"))
      |> concat(uri_ending_at_space)
      |> unwrap_and_tag(:image_link_url)
    )

  # [!url!]  optionally with :link-url before the closing bracket
  bracketed_image =
    ignore(string("["))
    |> concat(image_url)
    |> optional(
      ignore(string(":"))
      |> concat(uri_ending_at_bracket)
      |> unwrap_and_tag(:image_link_url)
    )
    |> ignore(string("]"))

  image =
    choice([
      bracketed_image,
      unbracketed_image
    ])

  # == anything ==
  # NOTE(review): this combinator is only used inside `blockquote_author`; it
  # is not one of the top-level stop words, so a bare `==...==` in the input
  # is lexed as :text. Confirm whether that is intended.
  literal =
    ignore(string("=="))
    |> repeat(lookahead_not(string("==")) |> utf8_char([]))
    |> reduce({List, :to_string, []})
    |> unwrap_and_tag(:literal)
    |> ignore(string("=="))

  # Everything up to the closing `"]`. Literal spans are recognised so their
  # delimiters are stripped from the author; `unwrap/1` then joins the pieces.
  blockquote_author =
    repeat(
      lookahead_not(string("\"]"))
      |> choice([
        bracketed_literal,
        literal,
        utf8_char([])
      ])
    )
    |> reduce(:unwrap)

  # [bq="author"]
  l_bq_author =
    ignore(string("[bq=\""))
    |> concat(blockquote_author)
    |> ignore(string("\"]"))
    |> reduce({List, :to_string, []})
    |> unwrap_and_tag(:l_bq_author)

  l_bq = string("[bq]") |> unwrap_and_tag(:l_bq)
  r_bq = string("[/bq]") |> unwrap_and_tag(:r_bq)
  l_spoiler = string("[spoiler]") |> unwrap_and_tag(:l_spoiler)
  r_spoiler = string("[/spoiler]") |> unwrap_and_tag(:r_spoiler)

  # Every construct that interrupts a run of plain text. `link` and `image`
  # already try their bracketed forms first, so those are not listed again.
  stop_words =
    choice([
      bracketed_literal,
      link,
      image,
      l_bq_author,
      l_bq,
      r_bq,
      l_spoiler,
      r_spoiler
    ])

  # One token: either a stop word, or the longest run of characters at which
  # no stop word begins, emitted as a single :text token.
  defcombinatorp(
    :top_level,
    choice([
      stop_words,
      times(lookahead_not(stop_words) |> utf8_char([]), min: 1)
      |> reduce({List, :to_string, []})
      |> unwrap_and_tag(:text)
    ])
  )

  # The whole input must be consumed as a sequence of top-level tokens.
  textile =
    repeat(parsec(:top_level))
    |> eos()

  # Generates the public entry point `lex/2`.
  defparsec(:lex, textile)
end