From 8a7981c27a99a25328e46b1b066b4a8779d3814a Mon Sep 17 00:00:00 2001 From: Felisp Date: Tue, 30 Jul 2024 02:55:15 +0200 Subject: [PATCH] Implement multiboard and lazyloading as result, added todos And shitton of them --- project.clj | 2 +- src/rss_thread_watch/core.clj | 39 ++++++------ src/rss_thread_watch/feed_generator.clj | 46 ++++++++------ src/rss_thread_watch/watcher.clj | 81 +++++++++++++++++++------ 4 files changed, 111 insertions(+), 57 deletions(-) diff --git a/project.clj b/project.clj index 7a7f6b9..6438fc2 100644 --- a/project.clj +++ b/project.clj @@ -1,4 +1,4 @@ -(defproject rss-thread-watch "0.3.0-SNAPSHOT" +(defproject rss-thread-watch "0.3.5-SNAPSHOT" :description "RSS based thread watcher" :url "http://example.com/FIXME" :license {:name "AGPL-3.0-only" diff --git a/src/rss_thread_watch/core.clj b/src/rss_thread_watch/core.clj index 1543206..bd47155 100644 --- a/src/rss_thread_watch/core.clj +++ b/src/rss_thread_watch/core.clj @@ -61,12 +61,13 @@ "Fills every enabled board with default config values" [config] (let [defaults (:boards-defaults config)] - (update-in config - '(:boards-enabled) - (fn [mp] - (u/fmap (fn [k v] - (u/map-apply-defaults v defaults)) - mp))))) + (dissoc (update-in config + '(:boards-enabled) + (fn [mp] + (u/fmap (fn [k v] + (u/map-apply-defaults v defaults)) + mp))) + :boards-defaults))) (defn get-some-config "Attempts to get config somehow, @@ -88,18 +89,20 @@ "Entry point, starts webserver" [& args] ;; Todo: Think of a way to start repeated download for every catalog efficiently - (let [config (get-some-config args) - expanded-config - - ] - (println args) - (System/exit 0) - (set-interval (fn [] - (println "Starting cache update") - (watcher/update-thread-cache! (:target config) (:starting-page config))) - (* 1000 (:refresh-delay config))) - (jetty/run-jetty (rp/wrap-params feed/http-handler) {:port (:port CONFIG-DEFAULT) - :join? true}))) + (let [config (get-some-config args)] + ;; Init the few globals we have + (reset! watcher/GLOBAL-CONFIG config) + (reset! feed/boards-enabled-cache (set (keys (get config :boards-enabled)))) + (reset! watcher/chod-threads-cache (watcher/generate-chod-cache-structure config)) + (println args) + (clojure.pprint/pprint config) + ;; Needs to be redone and probably removed from here + ;; (set-interval (fn [] + ;; (println "Starting cache update") + ;; (watcher/update-board-cache! (:target config) (:starting-page config))) + ;; (* 1000 (:refresh-delay config))) + (jetty/run-jetty (rp/wrap-params feed/http-handler) {:port (:port CONFIG-DEFAULT) + :join? true}))) ;; Docs: https://github.com/ring-clojure/ring/wiki/Getting-Started (defn repl-main diff --git a/src/rss_thread_watch/feed_generator.clj b/src/rss_thread_watch/feed_generator.clj index 965524a..e619965 100644 --- a/src/rss_thread_watch/feed_generator.clj +++ b/src/rss_thread_watch/feed_generator.clj @@ -22,8 +22,8 @@ [rss-thread-watch.utils :as ut]) (:gen-class)) - - +(def boards-enabled-cache + (atom nil)) (defn new-guid-always "Generates always unique GUID for Feed item. @@ -48,9 +48,11 @@ (defn filter-chod-posts "Return list of all threads with equal or higher ChoD than requested - READS FROM GLOBALS: watcher.time-of-cache" ;Todo: best thing would be to add timestamp to cache - [query-vec chod-treshold repeat? cache] - (let [time-of-generation @watcher/time-of-cache + READS FROM GLOBALS: watcher.time-of-cache" + [query-vec chod-treshold repeat? board-cache] + + (let [{time-of-generation :time + cache :data} board-cache guid-fn (if repeat? (fn [x] (new-guid-always x time-of-generation)) update-only-guid) cache-start-index (first (ut/indices (fn [x] (>= (:chod x) chod-treshold)) @@ -73,9 +75,9 @@ (defn thread-to-rss-item "If I wasnt retarded I could have made the cached version look like rss item already but what can you do. I'll refactor I promise, I just need this done ASAP" ;Todo: do what the docstring says - [t] + [t] ;TODO: oh Luna the hardcodes ;;RESUME (let [link-url (str "https://boards.4chan.org/mlp/thread/" (:no t))] ; jesus, well I said only /mlp/ is supported now so fuck it - {:title (format "%.2f%% - %s" (:chod t) (:title t)) + {:title (format "%.2f%% - %s" (:chod t) (:title t)) ;TODO: Generate link from the target somehow, or just include it from API response ;; :url link-url <- this is supposed to be for images according to: https://cyber.harvard.edu/rss/rss.html :description (format "The thread: '%s' has %.2f%% chance of dying" (:title t) (:chod t)) :link link-url @@ -97,9 +99,11 @@ READS FROM GLOBALS: rss-thread-watch.watcher.chod-threads-cache - rss-thread-watch.core.CONFIG" + rss-thread-watch.watcher.GLOBAL-CONFIG" ;TODO: Update if it really reads from there anymore [rqst] - (try (let [{{chod "chod" :or {chod "94"} + (try (let [{{chod "chod" + board "board" :or {chod "94" + board (get @watcher/GLOBAL-CONFIG :default-board)} :as prms} :params uri :uri} rqst qrs (prms "q") @@ -110,19 +114,23 @@ chod)] (try ;If we can't parse number from chod, use default 94 (if (or (vector? chod) - (<= (Integer/parseInt chod) 60)) ; Never accept chod lower that 60 TODO: don't hardcode this + (<= (Integer/parseInt chod) 60)) ; Never accept chod lower than 60 TODO: don't hardcode this 60 (Integer/parseInt chod)) (catch Exception e 94))) cache @watcher/chod-threads-cache] - ;; (println "RCVD: " rqst) - (println rqst) + (println "\n\nRCVD: " rqst) + ;; (println rqst) ;; ====== Errors ===== ;; Something other than feed.xml requested (when-not (s/ends-with? uri "feed.xml") (throw (ex-info "404" {:status 404 :header {"Content-Type" "text/plain"} :body "404 This server has nothing but /feed.xml"}))) + (when-not (contains? @boards-enabled-cache board) + (throw (ex-info "403" {:status 403 + :header {"Content-Type" "text/plain"} + :body (get @watcher/GLOBAL-CONFIG :board-disabled-message)}))) ;; No url params -> we redirect to documentation about params (when (empty? prms) (throw (ex-info "302" @@ -146,13 +154,15 @@ ;; There shouldn't be any problems with this mime type but if there are ;; replace with "text/xml", or even better, get RSS reader that is not utter shit :header {"Content-Type" "application/rss+xml"} - :body (generate-feed queries real-chod repeat? cache)}) + :body (generate-feed queries real-chod repeat? (watcher/get-thread-data board @watcher/GLOBAL-CONFIG))}) (catch Exception e ;; Ex-info has been crafted to match HTTP response body so we can send it (if-let [caught (ex-data e)] caught ;We have custom crafted error - {:status 500 ;Something else fucked up, we print what happened - :header {"Content-Type" "text/plain"} - :body (str "500 - Something fucked up while generating feed, If you decide to report it, please include url adress you used:\n" - (ex-cause e) "\n" - e)})))) + (do + (print "WTF??: " e) + {:status 500 ;Something else fucked up, we print what happened + :header {"Content-Type" "text/plain"} + :body (str "500 - Something fucked up while generating feed, If you decide to report it, please include url adress you used:\n" + (ex-cause e) "\n" + e)}))))) diff --git a/src/rss_thread_watch/watcher.clj b/src/rss_thread_watch/watcher.clj index 1a00299..6b03f27 100644 --- a/src/rss_thread_watch/watcher.clj +++ b/src/rss_thread_watch/watcher.clj @@ -18,17 +18,23 @@ [clojure.data.json :as js]) (:gen-class)) +(def GLOBAL-CONFIG + "Global config with defaults for missing entires" + ;; I know globals are ew in Clojure but I don't know any + ;; better way of doing this + (atom nil)) + (def chod-threads-cache "Cached map of threads that have CHanceOfDeath > configured" - nil) + (atom {})) -(def time-of-cache nil) - -(defn init-global-cache - "Initializes global cache of catalogs" +(defn generate-chod-cache-structure + "Generates initial structure for global cache + Structure is returned, you have to set it yourself" [config] - (keys (:boards-enabled config)) - ) + (let [ks (keys (:boards-enabled config))] + (zipmap ks + (repeatedly (count ks) #(atom nil))))) (defn process-page "Procesess every thread in page, leaving only relevant information @@ -50,27 +56,62 @@ (defn build-cache "Build cache of near-death threads so the values don't have to be recalculated on each request." [pages-to-index pages-total threads-per-page threads-total] - (vec (flatten (map (fn [single-page] - ;; We have to (dec page-number) bcs otherwise we would get the total number of threads - ;; including the whole page of threads - (let [page-number (dec (:page single-page))] ; inc to get to the actuall page - (process-page (:threads single-page) threads-total (inc (* page-number threads-per-page))))) - pages-to-index)))) + {:time (System/currentTimeMillis) + :data (vec (flatten (map (fn [single-page] + ;; We have to (dec page-number) bcs otherwise we would get the total number of threads + ;; including the whole page of threads + (let [page-number (dec (:page single-page))] ; inc to get to the actuall page + (process-page (:threads single-page) threads-total (inc (* page-number threads-per-page))))) + pages-to-index)))}) -(defn update-thread-cache! +(defn update-board-cache! "Updates cache of near-death threads. Writes to chod-threads-cache as side effect. [url] - Url to download data from - [starting-page] - From which page consider threads to be fit for near-death cache" - [url starting-page] - ;; Todo: surround with try so we can timeout and other stuff + [board] - Board to assign cached data to, it's existence is NOT checked here + [starting-page] - From which page consider threads to be fit for near-death cache + THIS FUNCTION WRITES TO chod-threads-cache + Returns :data part of [board] cache" + [url board starting-page] + ;; Todo: surround with try so we can timeout, 40x and other stuff (let [catalog (with-open [readr (io/reader url)] (js/read readr :key-fn keyword)) pages-total (count catalog) ;; universal calculation for total number of threads: - ;; (pages-total-1) * threadsPerPage + threadsOnLastpage ;;accounts for boards which have stickied threads making them have 11pages + ;; (pages-total -1) * threadsPerPage + threadsOnLastpage ;;accounts for boards which have stickied threads making them have 11pages threads-per-page (count (:threads (first catalog))) threads-total (+ (* threads-per-page (dec pages-total)) (count (:threads (last catalog)))) ;; Todo: Yeah, maybe this calculation could be refactored into let to-index (filter (fn [item] (<= starting-page (:page item))) catalog)] - (reset! chod-threads-cache (build-cache to-index pages-total threads-per-page threads-total)) - (reset! time-of-cache (System/currentTimeMillis)))) + ;; TODO: there absolutely must be try catch for missing - not enabled boards, + ;; This will return nill and that fuck everything up + (reset! (get @chod-threads-cache board) + (build-cache to-index pages-total threads-per-page threads-total)))) + +(defn board-enabled? + "Checks whether board is enabled in config" + [board config] + (contains? board (keys (get config :boards-enabled)))) + +(defn get-board-url + "Gets board url from :target if " + [board config] + ;; TODO: jesus, this needs sanitization and should be probably crafted by some URL class + (str (get-in config [:boards-enabled board :target]) board "catalog.json")) + +(defn get-thread-data + "Gets thread cache for given board. + If board is lazy loaded, downloads new one if needed. + + MAY CAUSE WRITE TO chod-thread-cache IF NECCESARRY" + [board config] + (let [refresh-rate (* 1000 (get-in config `(:boards-enabled ~board :refresh-rate))) + {data :data + time-downloaded :time + :or {time-downloaded 0} + :as board-atom } @(get @chod-threads-cache board) + ;; TODO: This also makes it implictly lazy-load -> if disabled make the check here + time-to-update? (or (nil? board-atom) + (> (System/currentTimeMillis) (+ refresh-rate time-downloaded)))] + (if time-to-update? + (update-board-cache! (get-board-url "/mlp/" config) board (get-in config [:boards-enabled board :starting-page])) + @(get @chod-threads-cache board))))