From 2dd3856389f9080e122f3a76ceda9c64146eb262 Mon Sep 17 00:00:00 2001 From: Eugen Wissner Date: Sat, 22 Feb 2025 21:30:29 +0100 Subject: [PATCH] tea-cleaner: Configure word lists --- config/tea-cleaner.toml.dist | 11 +++++++ tea-cleaner/Main.hs | 40 ++++++++++++++----------- tea-cleaner/TeaCleaner/Configuration.hs | 27 +++++++++++------ tea-cleaner/TeaCleaner/Filter.hs | 36 +++++++++++++++------- 4 files changed, 78 insertions(+), 36 deletions(-) diff --git a/config/tea-cleaner.toml.dist b/config/tea-cleaner.toml.dist index b09ed3b..5a77089 100644 --- a/config/tea-cleaner.toml.dist +++ b/config/tea-cleaner.toml.dist @@ -2,3 +2,14 @@ token = "" # The gitea instance URL beginning wth "https://". server = "https://" +# List of words prohibited in the description and the website field. +spam_words = [ + "casino", + "gambling" +] +# Unusual email address exteions. +mail_domains = [ + ".online", + ".shop", + ".website" +] diff --git a/tea-cleaner/Main.hs b/tea-cleaner/Main.hs index 949d62a..e9fead1 100644 --- a/tea-cleaner/Main.hs +++ b/tea-cleaner/Main.hs @@ -25,19 +25,32 @@ import TeaCleaner.Configuration , execParser ) import Control.Monad (when) +import Data.IORef (modifyIORef, readIORef) -handleResults :: Settings -> Bool -> Vector User -> IO () -handleResults settings liveRun users = printCount - >> Vector.forM_ users (handleResult settings liveRun) +printStatistics :: Settings -> IO () +printStatistics Settings{ statistics } = + readIORef statistics >>= printCount where - printCount = - let count = Text.Builder.decimal $ Vector.length users + printCount count = + let count' = Text.Builder.decimal count in Text.Lazy.IO.putStrLn $ Text.Builder.toLazyText - $ "Count: " <> count + $ "Count: " <> count' -handleResult :: Settings -> Bool -> User -> IO () -handleResult settings liveRun user = Text.IO.putStrLn buildValue +handleResults :: Settings -> Bool -> Vector FilterResult -> IO (Vector User) +handleResults settings liveRun = + Vector.foldM' handleResult Vector.empty + where + handleResult accumulator (FilterResult user FailedFilter) + = handleFailedFilter settings liveRun user + >> pure accumulator + handleResult accumulator (FilterResult _ PassFilter) = pure accumulator + handleResult accumulator (FilterResult user SuspiciousFilter) = pure + $ Vector.snoc accumulator user + +handleFailedFilter :: Settings -> Bool -> User -> IO () +handleFailedFilter settings liveRun user = Text.IO.putStrLn buildValue + >> modifyIORef (getField @"statistics" settings) (+ 1) >> when liveRun (purgeUser settings user) where buildValue = getField @"username" user <> "\n" @@ -58,14 +71,7 @@ main = execParser commandLineInfo >>= withArguments withSettings :: Bool -> Settings -> IO () withSettings liveRun settings = getUsers settings - >>= Vector.foldM' foldFilterResult Vector.empty . fmap filterByUserProperties + >>= handleResults settings liveRun . fmap (filterByUserProperties settings) >>= traverse (filterByActivities settings) - >>= Vector.foldM' foldFilterResult Vector.empty >>= handleResults settings liveRun - where - foldFilterResult accumulator (FilterResult user SuspiciousFilter) - = pure $ Vector.snoc accumulator user - foldFilterResult accumulator (FilterResult _ PassFilter) = pure accumulator - foldFilterResult accumulator (FilterResult user FailedFilter) - = handleResult settings liveRun user - >> pure accumulator + >> printStatistics settings diff --git a/tea-cleaner/TeaCleaner/Configuration.hs b/tea-cleaner/TeaCleaner/Configuration.hs index a7342d3..1d2db32 100644 --- a/tea-cleaner/TeaCleaner/Configuration.hs +++ b/tea-cleaner/TeaCleaner/Configuration.hs @@ -26,35 +26,44 @@ import Options.Applicative import Text.URI (URI) import qualified Text.URI as URI import Data.Time (UTCTime(..), getCurrentTime) +import Data.IORef (IORef, newIORef) data ConfigFile = ConfigFile { token :: StrictText , server :: StrictText + , spamWords :: [StrictText] + , mailDomains :: [StrictText] } deriving (Eq, Show) configFileCodec :: Toml.TomlCodec ConfigFile configFileCodec = ConfigFile <$> Toml.text "token" .= getField @"token" <*> Toml.text "server" .= getField @"server" + <*> Toml.arrayOf Toml._Text "spam_words" .= getField @"spamWords" + <*> Toml.arrayOf Toml._Text "mail_domains" .= getField @"mailDomains" data Settings = Settings { token :: StrictText , server :: URI , now :: UTCTime - } deriving (Eq, Show) + , spamWords :: [StrictText] + , mailDomains :: [StrictText] + , statistics :: IORef Int + } deriving Eq decodeSettingsFile :: FilePath -> IO Settings -decodeSettingsFile configPath = Toml.decodeFile configFileCodec configPath - >>= withConfiguration - where - withConfiguration configFile@ConfigFile{ server } = URI.mkURI server - >>= withServer configFile - withServer configFile parsedServer = getCurrentTime - >>= withTime configFile parsedServer - withTime ConfigFile{..} parsedServer now = pure $ Settings +decodeSettingsFile configPath = do + ConfigFile{..} <- Toml.decodeFile configFileCodec configPath + parsedServer <- URI.mkURI server + now <- getCurrentTime + ioRef <- newIORef 0 + pure $ Settings { token = token , server = parsedServer , now = now + , spamWords = spamWords + , mailDomains = mailDomains + , statistics = ioRef } newtype ProgramOptions = ProgramOptions diff --git a/tea-cleaner/TeaCleaner/Filter.hs b/tea-cleaner/TeaCleaner/Filter.hs index 8448ff5..658606d 100644 --- a/tea-cleaner/TeaCleaner/Filter.hs +++ b/tea-cleaner/TeaCleaner/Filter.hs @@ -6,11 +6,11 @@ module TeaCleaner.Filter ) where import qualified Data.Text as Text -import Data.Time (LocalTime(..), ZonedTime(..)) -import Data.Time.Calendar.OrdinalDate (fromOrdinalDate) +import Data.Time (LocalTime(..), ZonedTime(..), UTCTime(..), addUTCTime) import qualified Data.Vector as Vector import TeaCleaner.Client (Activity(..), User(..), getActivities) import TeaCleaner.Configuration (Settings(..)) +import GHC.Records (HasField(..)) data UserFilter = PassFilter @@ -21,15 +21,31 @@ data UserFilter data FilterResult = FilterResult User UserFilter deriving (Show) -filterByUserProperties :: User -> FilterResult -filterByUserProperties user@User{ created, lastLogin, description, website } - | zonedDay created == zonedDay lastLogin - , zonedDay created > fromOrdinalDate 2024 1 - , zonedDay created < fromOrdinalDate 2025 17 - , not (Text.null description) - , not (Text.null website) = FilterResult user SuspiciousFilter +filterByUserProperties :: Settings -> User -> FilterResult +filterByUserProperties settings user@User{ created, lastLogin } + | noLoginSinceRegistration = FilterResult user FailedFilter + | containsSpamWords = FilterResult user FailedFilter + | percentEncodedWebsite = FilterResult user FailedFilter + | hasFullDescription = FilterResult user SuspiciousFilter + | unusualMailDomains = FilterResult user SuspiciousFilter | otherwise = FilterResult user PassFilter where + percentEncodedWebsite = Text.elem '%' $ getField @"website" user + unusualMailDomains = + let predicate = (`Text.isSuffixOf` getField @"email" user) + in any predicate (getField @"mailDomains" settings) + containsSpamWords = + let lowerCaseDescription = Text.toLower $ getField @"description" user + lowerCaseWebsite = Text.toLower $ getField @"website" user + predicate word = Text.isInfixOf word lowerCaseWebsite + || Text.isInfixOf word lowerCaseDescription + in any predicate (getField @"spamWords" settings) + hasFullDescription + = not (Text.null $ getField @"description" user) + && not (Text.null $ getField @"website" user) + noLoginSinceRegistration = + let monthAgo = utctDay $ addUTCTime (-2592000) $ getField @"now" settings + in zonedDay created < monthAgo && zonedDay created == zonedDay lastLogin zonedDay = localDay . zonedTimeToLocalTime filterByActivities :: Settings -> User -> IO FilterResult @@ -39,5 +55,5 @@ filterByActivities settings user = getActivities settings user evalActivities activities | Just (Activity{ opType }, rest) <- Vector.uncons activities , Vector.null rest - , opType == "create_repo" = pure $ FilterResult user SuspiciousFilter + , opType == "create_repo" = pure $ FilterResult user FailedFilter evalActivities _ = pure $ FilterResult user PassFilter