Implement multiboard and, as a result, lazy loading; added TODOs

And a shitton of them
This commit is contained in:
Felisp 2024-07-30 02:55:15 +02:00
parent 8c1cfbed33
commit 8a7981c27a
4 changed files with 111 additions and 57 deletions

View file

@@ -1,4 +1,4 @@
(defproject rss-thread-watch "0.3.0-SNAPSHOT"
(defproject rss-thread-watch "0.3.5-SNAPSHOT"
:description "RSS based thread watcher"
:url "http://example.com/FIXME"
:license {:name "AGPL-3.0-only"

View file

@@ -61,12 +61,13 @@
"Fills every enabled board with default config values"
[config]
(let [defaults (:boards-defaults config)]
(update-in config
(dissoc (update-in config
'(:boards-enabled)
(fn [mp]
(u/fmap (fn [k v]
(u/map-apply-defaults v defaults))
mp)))))
mp)))
:boards-defaults)))
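For a rough before/after picture, here is a minimal sketch of the same transformation using plain merge, assuming u/map-apply-defaults fills in missing keys the way merge does; the board names and option keys are made up for illustration only:

(defn fill-defaults-sketch
  "Sketch only: defaults merged into every enabled board, :boards-defaults dropped."
  [config]
  (let [defaults (:boards-defaults config)]
    (-> config
        (update :boards-enabled
                (fn [boards]
                  (into {} (map (fn [[board opts]] [board (merge defaults opts)]) boards))))
        (dissoc :boards-defaults))))

(fill-defaults-sketch
  {:boards-defaults {:refresh-rate 300 :starting-page 8}   ; hypothetical defaults
   :boards-enabled  {"/mlp/" {:refresh-rate 60}            ; hypothetical boards
                     "/g/"   {}}})
;; => {:boards-enabled {"/mlp/" {:refresh-rate 60 :starting-page 8}
;;                      "/g/"   {:refresh-rate 300 :starting-page 8}}}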
(defn get-some-config
"Attempts to get config somehow,
@@ -88,16 +89,18 @@
"Entry point, starts webserver"
[& args]
;; Todo: Think of a way to start repeated download for every catalog efficiently
(let [config (get-some-config args)
expanded-config
]
(let [config (get-some-config args)]
;; Init the few globals we have
(reset! watcher/GLOBAL-CONFIG config)
(reset! feed/boards-enabled-cache (set (keys (get config :boards-enabled))))
(reset! watcher/chod-threads-cache (watcher/generate-chod-cache-structure config))
(println args)
(System/exit 0)
(set-interval (fn []
(println "Starting cache update")
(watcher/update-thread-cache! (:target config) (:starting-page config)))
(* 1000 (:refresh-delay config)))
(clojure.pprint/pprint config)
;; Needs to be redone and probably removed from here
;; (set-interval (fn []
;; (println "Starting cache update")
;; (watcher/update-board-cache! (:target config) (:starting-page config)))
;; (* 1000 (:refresh-delay config)))
(jetty/run-jetty (rp/wrap-params feed/http-handler) {:port (:port CONFIG-DEFAULT)
:join? true})))
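For orientation, the config keys that -main and the handlers read suggest a shape roughly like the sketch below; every concrete value (board names, URLs, numbers, messages) is a placeholder, not the project's real configuration:

;; Hypothetical config shape, assembled from the keys referenced in the code.
(def example-config
  {:default-board          "/mlp/"                              ; used when ?board= is absent
   :board-disabled-message "This board is not enabled here"     ; body of the 403 response
   :boards-defaults        {:refresh-rate 300 :starting-page 8} ; merged into every board
   :boards-enabled         {"/mlp/" {:target        "https://a.4cdn.org" ; catalog base url (placeholder)
                                     :refresh-rate  300                  ; seconds between refreshes
                                     :starting-page 8}}})                ; first page considered near-death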

View file

@@ -22,8 +22,8 @@
[rss-thread-watch.utils :as ut])
(:gen-class))
(def boards-enabled-cache
(atom nil))
(defn new-guid-always
"Generates always unique GUID for Feed item.
@@ -48,9 +48,11 @@
(defn filter-chod-posts
"Return list of all threads with equal or higher ChoD than requested
READS FROM GLOBALS: watcher.time-of-cache" ;Todo: best thing would be to add timestamp to cache
[query-vec chod-treshold repeat? cache]
(let [time-of-generation @watcher/time-of-cache
READS FROM GLOBALS: watcher.time-of-cache"
[query-vec chod-treshold repeat? board-cache]
(let [{time-of-generation :time
cache :data} board-cache
guid-fn (if repeat? (fn [x] (new-guid-always x time-of-generation))
update-only-guid)
cache-start-index (first (ut/indices (fn [x] (>= (:chod x) chod-treshold))
@@ -73,9 +75,9 @@
(defn thread-to-rss-item
"If I wasnt retarded I could have made the cached version look like
rss item already but what can you do. I'll refactor I promise, I just need this done ASAP" ;Todo: do what the docstring says
[t]
[t] ;TODO: oh Luna the hardcodes ;;RESUME
(let [link-url (str "https://boards.4chan.org/mlp/thread/" (:no t))] ; jesus, well I said only /mlp/ is supported now so fuck it
{:title (format "%.2f%% - %s" (:chod t) (:title t))
{:title (format "%.2f%% - %s" (:chod t) (:title t)) ;TODO: Generate link from the target somehow, or just include it from API response
;; :url link-url <- this is supposed to be for images according to: https://cyber.harvard.edu/rss/rss.html
:description (format "The thread: '%s' has %.2f%% chance of dying" (:title t) (:chod t))
:link link-url
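To make the mapping concrete, a small example of a cached thread going in and the RSS item coming out; the thread values are invented, only the field names come from the code:

(def example-thread {:no 41234567 :title "Example thread" :chod 96.5}) ; hypothetical cached thread

;; (thread-to-rss-item example-thread) would produce an item map along the lines of:
;; {:title       "96.50% - Example thread"
;;  :description "The thread: 'Example thread' has 96.50% chance of dying"
;;  :link        "https://boards.4chan.org/mlp/thread/41234567"
;;  ...}                                       ; remaining keys are outside this hunk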
@@ -97,9 +99,11 @@
READS FROM GLOBALS:
rss-thread-watch.watcher.chod-threads-cache
rss-thread-watch.core.CONFIG"
rss-thread-watch.watcher.GLOBAL-CONFIG" ;TODO: Update if it really reads from there anymore
[rqst]
(try (let [{{chod "chod" :or {chod "94"}
(try (let [{{chod "chod"
board "board" :or {chod "94"
board (get @watcher/GLOBAL-CONFIG :default-board)}
:as prms} :params
uri :uri} rqst
qrs (prms "q")
@@ -110,19 +114,23 @@
chod)]
(try ;If we can't parse number from chod, use default 94
(if (or (vector? chod)
(<= (Integer/parseInt chod) 60)) ; Never accept chod lower that 60 TODO: don't hardcode this
(<= (Integer/parseInt chod) 60)) ; Never accept chod lower than 60 TODO: don't hardcode this
60 (Integer/parseInt chod))
(catch Exception e
94)))
cache @watcher/chod-threads-cache]
;; (println "RCVD: " rqst)
(println rqst)
(println "\n\nRCVD: " rqst)
;; (println rqst)
;; ====== Errors =====
;; Something other than feed.xml requested
(when-not (s/ends-with? uri "feed.xml")
(throw (ex-info "404" {:status 404
:header {"Content-Type" "text/plain"}
:body "404 This server has nothing but /feed.xml"})))
(when-not (contains? @boards-enabled-cache board)
(throw (ex-info "403" {:status 403
:header {"Content-Type" "text/plain"}
:body (get @watcher/GLOBAL-CONFIG :board-disabled-message)})))
;; No url params -> we redirect to documentation about params
(when (empty? prms)
(throw (ex-info "302"
@@ -146,13 +154,15 @@
;; There shouldn't be any problems with this mime type but if there are
;; replace with "text/xml", or even better, get RSS reader that is not utter shit
:header {"Content-Type" "application/rss+xml"}
:body (generate-feed queries real-chod repeat? cache)})
:body (generate-feed queries real-chod repeat? (watcher/get-thread-data board @watcher/GLOBAL-CONFIG))})
(catch Exception e
;; Ex-info has been crafted to match HTTP response body so we can send it
(if-let [caught (ex-data e)]
caught ;We have custom crafted error
(do
(print "WTF??: " e)
{:status 500 ;Something else fucked up, we print what happened
:header {"Content-Type" "text/plain"}
:body (str "500 - Something fucked up while generating feed, If you decide to report it, please include url adress you used:\n"
(ex-cause e) "\n"
e)}))))
e)})))))
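All the error paths above lean on one pattern: the map given to ex-info is already shaped like the HTTP response, so the catch block can hand back (ex-data e) unchanged. A stripped-down sketch of just that pattern (the handler, URI check, and messages are illustrative, not the project's actual ones):

(require '[clojure.string :as s])

(defn sketch-handler [rqst]
  (try
    ;; The ex-info payload doubles as the response map.
    (when-not (s/ends-with? (:uri rqst) "feed.xml")
      (throw (ex-info "404" {:status 404
                             :header {"Content-Type" "text/plain"}
                             :body   "404 This server has nothing but /feed.xml"})))
    {:status 200
     :header {"Content-Type" "application/rss+xml"}
     :body   "<rss/>"}
    (catch Exception e
      ;; Crafted errors carry their response in ex-data; anything else becomes a 500.
      (or (ex-data e)
          {:status 500
           :header {"Content-Type" "text/plain"}
           :body   (str "500 - " e)}))))

(sketch-handler {:uri "/other"}) ;=> the 404 map from the ex-info payload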

View file

@@ -18,17 +18,23 @@
[clojure.data.json :as js])
(:gen-class))
(def GLOBAL-CONFIG
"Global config with defaults for missing entires"
;; I know globals are ew in Clojure but I don't know any
;; better way of doing this
(atom nil))
(def chod-threads-cache
"Cached map of threads that have CHanceOfDeath > configured"
nil)
(atom {}))
(def time-of-cache nil)
(defn init-global-cache
"Initializes global cache of catalogs"
(defn generate-chod-cache-structure
"Generates initial structure for global cache
Structure is returned, you have to set it yourself"
[config]
(keys (:boards-enabled config))
)
(let [ks (keys (:boards-enabled config))]
(zipmap ks
(repeatedly (count ks) #(atom nil)))))
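Concretely, the returned skeleton is just one independent atom per enabled board, keyed the same way as :boards-enabled; for example (board keys invented):

(let [config {:boards-enabled {"/mlp/" {} "/g/" {}}} ; made-up boards
      cache  (generate-chod-cache-structure config)]
  [(keys cache)                 ;=> ("/mlp/" "/g/")
   (map deref (vals cache))])   ;=> (nil nil), each board gets its own atom to reset! later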
(defn process-page
"Procesess every thread in page, leaving only relevant information
@@ -50,27 +56,62 @@
(defn build-cache
"Build cache of near-death threads so the values don't have to be recalculated on each request."
[pages-to-index pages-total threads-per-page threads-total]
(vec (flatten (map (fn [single-page]
{:time (System/currentTimeMillis)
:data (vec (flatten (map (fn [single-page]
;; We have to (dec page-number) because otherwise we would get the total number of threads
;; including the whole page of threads
(let [page-number (dec (:page single-page))] ; inc to get to the actual page
(process-page (:threads single-page) threads-total (inc (* page-number threads-per-page)))))
pages-to-index))))
pages-to-index)))})
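So each board's cache entry is now a small map that carries its own timestamp next to the flattened thread data, which is exactly what filter-chod-posts destructures on the feed side. An illustration with invented values:

;; Hypothetical shape of one board's cache entry after build-cache:
(def example-board-cache
  {:time 1722297600000                        ; System/currentTimeMillis at build time
   :data [{:no 1001 :title "A" :chod 97.3}    ; flattened, page-ordered thread summaries
          {:no 1002 :title "B" :chod 95.1}]}) ; real entries may carry more fields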
(defn update-thread-cache!
(defn update-board-cache!
"Updates cache of near-death threads. Writes to chod-threads-cache as side effect.
[url] - Url to download data from
[starting-page] - From which page consider threads to be fit for near-death cache"
[url starting-page]
;; Todo: surround with try so we can timeout and other stuff
[board] - Board to assign cached data to, its existence is NOT checked here
[starting-page] - From which page onward threads are considered fit for the near-death cache
THIS FUNCTION WRITES TO chod-threads-cache
Returns the freshly built {:time :data} cache entry for [board]"
[url board starting-page]
;; Todo: surround with try so we can timeout, 40x and other stuff
(let [catalog (with-open [readr (io/reader url)]
(js/read readr :key-fn keyword))
pages-total (count catalog)
;; universal calculation for total number of threads:
;; (pages-total-1) * threadsPerPage + threadsOnLastpage ;;accounts for boards which have stickied threads making them have 11pages
;; (pages-total - 1) * threadsPerPage + threadsOnLastpage ;; accounts for boards which have stickied threads, making them have 11 pages
threads-per-page (count (:threads (first catalog)))
threads-total (+ (* threads-per-page (dec pages-total)) (count (:threads (last catalog)))) ;; Todo: Yeah, maybe this calculation could be refactored into let
to-index (filter (fn [item]
(<= starting-page (:page item))) catalog)]
(reset! chod-threads-cache (build-cache to-index pages-total threads-per-page threads-total))
(reset! time-of-cache (System/currentTimeMillis))))
;; TODO: there absolutely must be a try/catch for missing or not-enabled boards,
;; this will return nil and that fucks everything up
(reset! (get @chod-threads-cache board)
(build-cache to-index pages-total threads-per-page threads-total))))
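As a worked example of the thread-count formula in the comment above, with made-up numbers:

;; An 11-page catalog, 10 threads per full page, 5 threads on the last page:
(let [pages-total      11
      threads-per-page 10
      threads-on-last  5]
  (+ (* threads-per-page (dec pages-total)) threads-on-last))
;; => 105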
(defn board-enabled?
"Checks whether board is enabled in config"
[board config]
(contains? (get config :boards-enabled) board))
(defn get-board-url
"Gets board url from :target if "
[board config]
;; TODO: jesus, this needs sanitization and should probably be crafted by some URL class
(str (get-in config [:boards-enabled board :target]) board "catalog.json"))
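Assuming a board's :target holds the API base and the board string carries its own slashes (both are assumptions, the real values live in the config), the result looks like:

;; Placeholder :target value, only for illustration:
(get-board-url "/mlp/" {:boards-enabled {"/mlp/" {:target "https://a.4cdn.org"}}})
;; => "https://a.4cdn.org/mlp/catalog.json"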
(defn get-thread-data
"Gets thread cache for given board.
If board is lazy loaded, downloads new one if needed.
MAY CAUSE A WRITE TO chod-threads-cache IF NECESSARY"
[board config]
(let [refresh-rate (* 1000 (get-in config [:boards-enabled board :refresh-rate]))
{data :data
time-downloaded :time
:or {time-downloaded 0}
:as board-atom} @(get @chod-threads-cache board)
;; TODO: This also makes it implicitly lazy-load -> if disabled, make the check here
time-to-update? (or (nil? board-atom)
(> (System/currentTimeMillis) (+ refresh-rate time-downloaded)))]
(if time-to-update?
(update-board-cache! (get-board-url board config) board (get-in config [:boards-enabled board :starting-page]))
@(get @chod-threads-cache board))))
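For clarity, the staleness check above reduces to comparing the stored :time plus the board's refresh interval (converted to milliseconds) with the current time; a sketch with invented numbers:

;; Refresh decision in isolation, values made up:
(let [refresh-rate    (* 1000 300)        ; a :refresh-rate of 300 seconds, in ms
      time-downloaded 1722297600000       ; :time written by build-cache
      now             (System/currentTimeMillis)]
  ;; true  -> update-board-cache! is called and fresh data returned
  ;; false -> the cached entry for the board is reused
  (> now (+ refresh-rate time-downloaded)))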