From 3e29e3785deef6fac912164e07bb3af34d60ea94 Mon Sep 17 00:00:00 2001
From: "byte[]"
Date: Sat, 2 Nov 2019 21:47:54 -0400
Subject: [PATCH] initial, bad attempt at lexing

---
 lib/textile/lexer.ex  | 223 ++++++++++++++++++++++++++++++++++++++++++
 lib/textile/parser.ex |   3 +
 2 files changed, 226 insertions(+)
 create mode 100644 lib/textile/lexer.ex
 create mode 100644 lib/textile/parser.ex

diff --git a/lib/textile/lexer.ex b/lib/textile/lexer.ex
new file mode 100644
index 00000000..d22b5edc
--- /dev/null
+++ b/lib/textile/lexer.ex
@@ -0,0 +1,223 @@
+defmodule Textile.Lexer do
+  @moduledoc """
+  NimbleParsec-based lexer that splits Textile-style markup into tagged tokens.
+
+  `lex` is generated by `defparsec` at the bottom of the module. Markup is emitted
+  as tagged tuples (`:bracketed_literal`, `:link_text`, `:link_title`, `:link_url`,
+  `:image_url`, `:image_title`, `:image_link_url`, `:l_bq_author`, `:l_bq`, `:r_bq`,
+  `:l_spoiler`, `:r_spoiler`); any other run of characters becomes `:text`.
+  """
+
+  import NimbleParsec
+
+  # Strips the tag from a single tagged token: [literal: "x"] -> "x".
+  defp unwrap([{_name, value}]),
+    do: value
+
+  # Lots of extra unicode space characters
+  space =
+    choice([
+      utf8_char('\n\r\f \t\u00a0\u1680\u180e\u202f\u205f\u3000'),
+      utf8_char([0x2000..0x200a])
+    ])
+
+  # [==raw text==] - everything between the markers is kept verbatim.
+  bracketed_literal =
+    ignore(string("[=="))
+    |> repeat(lookahead_not(string("==]")) |> utf8_char([]))
+    |> ignore(string("==]"))
+    |> reduce({List, :to_string, []})
+    |> unwrap_and_tag(:bracketed_literal)
+
+  # "text(title)": - link text followed by a parenthesized title.
+  link_text_with_title =
+    ignore(string("\""))
+    |> times(utf8_char(not: ?(), min: 1)
+    |> reduce({List, :to_string, []})
+    |> unwrap_and_tag(:link_text)
+    |> ignore(string("("))
+    |> concat(
+      times(utf8_char(not: ?), not: ?"), min: 1)
+      |> reduce({List, :to_string, []})
+      |> unwrap_and_tag(:link_title)
+      |> ignore(string(")\":"))
+    )
+
+  # "text": - link text without a title.
+  link_text_without_title =
+    ignore(string("\""))
+    |> times(utf8_char(not: ?"), min: 1)
+    |> ignore(string("\":"))
+    |> reduce({List, :to_string, []})
+    |> unwrap_and_tag(:link_text)
+
+  # The titled form goes first; the untitled form would absorb "(title)" into the text.
+  link_text =
+    choice([
+      link_text_with_title,
+      link_text_without_title
+    ])
+
+  # Prefixes a URL must start with.
+  link_protocol =
+    choice([
+      string("/"), string("https://"), string("http://"), string("data:image/")
+    ])
+
+  # A URL is a protocol prefix plus one or more characters up to a terminator;
+  # the four variants below differ only in which terminator ends the URL.
+  uri_ending_at_space =
+    link_protocol
+    |> times(lookahead_not(space) |> utf8_char([]), min: 1)
+    |> reduce({List, :to_string, []})
+
+  uri_ending_at_bracket =
+    link_protocol
+    |> times(lookahead_not(string("]")) |> utf8_char([]), min: 1)
+    |> reduce({List, :to_string, []})
+
+  uri_ending_at_lparen =
+    link_protocol
+    |> times(lookahead_not(string("(")) |> utf8_char([]), min: 1)
+    |> reduce({List, :to_string, []})
+
+  uri_ending_at_bang =
+    link_protocol
+    |> times(lookahead_not(string("!")) |> utf8_char([]), min: 1)
+    |> reduce({List, :to_string, []})
+
+  # "text":url - the URL ends at the first whitespace character.
+  unbracketed_link =
+    link_text
+    |> concat(uri_ending_at_space |> unwrap_and_tag(:link_url))
+
+  # ["text":url] - the URL ends at the closing bracket.
+  bracketed_link =
+    ignore(string("["))
+    |> concat(link_text)
+    |> concat(uri_ending_at_bracket |> unwrap_and_tag(:link_url))
+    |> ignore(string("]"))
+
+  link =
+    choice([
+      bracketed_link,
+      unbracketed_link
+    ])
+
+  # !url(title)! - image URL followed by a parenthesized title.
+  image_url_with_title =
+    ignore(string("!"))
+    |> concat(uri_ending_at_lparen |> unwrap_and_tag(:image_url))
+    |> ignore(string("("))
+    |> concat(
+      times(utf8_char(not: ?), not: ?!), min: 1)
+      |> reduce({List, :to_string, []})
+      |> unwrap_and_tag(:image_title)
+      |> ignore(string(")!"))
+    )
+
+  # !url! - image URL without a title.
+  image_url_without_title =
+    ignore(string("!"))
+    |> concat(uri_ending_at_bang |> unwrap_and_tag(:image_url))
+    |> ignore(string("!"))
+
+  image_url =
+    choice([
+      image_url_with_title,
+      image_url_without_title
+    ])
+
+  # An image may be followed by ":url", which is tagged :image_link_url.
+  unbracketed_image =
+    image_url
+    |> optional(
+      ignore(string(":"))
+      |> concat(uri_ending_at_space)
+      |> unwrap_and_tag(:image_link_url)
+    )
+
+  bracketed_image =
+    ignore(string("["))
+    |> concat(image_url)
+    |> optional(
+      ignore(string(":"))
+      |> concat(uri_ending_at_bracket)
+      |> unwrap_and_tag(:image_link_url)
+    )
+    |> ignore(string("]"))
+
+  image =
+    choice([
+      bracketed_image,
+      unbracketed_image
+    ])
+
+  # ==raw text== - like bracketed_literal, but without the brackets.
+  literal =
+    ignore(string("=="))
+    |> repeat(lookahead_not(string("==")) |> utf8_char([]))
+    |> reduce({List, :to_string, []})
+    |> unwrap_and_tag(:literal)
+    |> ignore(string("=="))
+
+  # Author of a [bq="..."] tag: plain characters and literals, up to the
+  # closing `"]`. Each literal is unwrapped on its own so that the token list
+  # is a mix of strings and codepoints which l_bq_author can join; unwrapping
+  # the whole list in one step would only match an author made of exactly one
+  # literal and raise FunctionClauseError otherwise (e.g. for [bq="name"]).
+  blockquote_author =
+    repeat(
+      lookahead_not(string("\"]"))
+      |> choice([
+        bracketed_literal |> reduce(:unwrap),
+        literal |> reduce(:unwrap),
+        utf8_char([])
+      ])
+    )
+
+  l_bq_author =
+    ignore(string("[bq=\""))
+    |> concat(blockquote_author)
+    |> ignore(string("\"]"))
+    |> reduce({List, :to_string, []})
+    |> unwrap_and_tag(:l_bq_author)
+
+  l_bq = string("[bq]") |> unwrap_and_tag(:l_bq)
+  r_bq = string("[/bq]") |> unwrap_and_tag(:r_bq)
+
+  l_spoiler = string("[spoiler]") |> unwrap_and_tag(:l_spoiler)
+  r_spoiler = string("[/spoiler]") |> unwrap_and_tag(:r_spoiler)
+
+  # Constructs that interrupt a run of plain text.
+  stop_words =
+    choice([
+      bracketed_literal,
+      bracketed_link,
+      bracketed_image,
+      link,
+      image,
+      l_bq_author,
+      l_bq,
+      r_bq,
+      l_spoiler,
+      r_spoiler
+    ])
+
+  # Either one markup token, or the longest run of characters at which no
+  # markup token starts, emitted as a single :text token.
+  defcombinatorp :top_level,
+    choice([
+      stop_words,
+      times(lookahead_not(stop_words) |> utf8_char([]), min: 1)
+      |> reduce({List, :to_string, []})
+      |> unwrap_and_tag(:text)
+    ])
+
+  # The whole input must be consumed.
+  textile =
+    repeat(parsec(:top_level))
+    |> eos()
+
+  defparsec :lex, textile
+end
\ No newline at end of file
diff --git a/lib/textile/parser.ex b/lib/textile/parser.ex
new file mode 100644
index 00000000..a157b2bd
--- /dev/null
+++ b/lib/textile/parser.ex
@@ -0,0 +1,3 @@
+defmodule Textile.Parser do
+  alias Textile.Lexer
+end
\ No newline at end of file