aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEugen Wissner <belka@caraus.de>2025-02-22 21:30:29 +0100
committerEugen Wissner <belka@caraus.de>2025-02-22 21:30:29 +0100
commit2dd3856389f9080e122f3a76ceda9c64146eb262 (patch)
tree79f10519b37575d623de2197165e62a5a4438389
parenta4c56fb432e8fd75df930647427bcb582e989344 (diff)
downloadkazbek-2dd3856389f9080e122f3a76ceda9c64146eb262.tar.gz
tea-cleaner: Configure word lists
-rw-r--r--config/tea-cleaner.toml.dist11
-rw-r--r--tea-cleaner/Main.hs40
-rw-r--r--tea-cleaner/TeaCleaner/Configuration.hs27
-rw-r--r--tea-cleaner/TeaCleaner/Filter.hs36
4 files changed, 78 insertions, 36 deletions
diff --git a/config/tea-cleaner.toml.dist b/config/tea-cleaner.toml.dist
index b09ed3b..5a77089 100644
--- a/config/tea-cleaner.toml.dist
+++ b/config/tea-cleaner.toml.dist
@@ -2,3 +2,14 @@
token = ""
# The gitea instance URL beginning wth "https://".
server = "https://"
+# List of words prohibited in the description and the website field.
+spam_words = [
+ "casino",
+ "gambling"
+]
+# Unusual email address exteions.
+mail_domains = [
+ ".online",
+ ".shop",
+ ".website"
+]
diff --git a/tea-cleaner/Main.hs b/tea-cleaner/Main.hs
index 949d62a..e9fead1 100644
--- a/tea-cleaner/Main.hs
+++ b/tea-cleaner/Main.hs
@@ -25,19 +25,32 @@ import TeaCleaner.Configuration
, execParser
)
import Control.Monad (when)
+import Data.IORef (modifyIORef, readIORef)
-handleResults :: Settings -> Bool -> Vector User -> IO ()
-handleResults settings liveRun users = printCount
- >> Vector.forM_ users (handleResult settings liveRun)
+printStatistics :: Settings -> IO ()
+printStatistics Settings{ statistics } =
+ readIORef statistics >>= printCount
where
- printCount =
- let count = Text.Builder.decimal $ Vector.length users
+ printCount count =
+ let count' = Text.Builder.decimal count
in Text.Lazy.IO.putStrLn
$ Text.Builder.toLazyText
- $ "Count: " <> count
+ $ "Count: " <> count'
-handleResult :: Settings -> Bool -> User -> IO ()
-handleResult settings liveRun user = Text.IO.putStrLn buildValue
+handleResults :: Settings -> Bool -> Vector FilterResult -> IO (Vector User)
+handleResults settings liveRun =
+ Vector.foldM' handleResult Vector.empty
+ where
+ handleResult accumulator (FilterResult user FailedFilter)
+ = handleFailedFilter settings liveRun user
+ >> pure accumulator
+ handleResult accumulator (FilterResult _ PassFilter) = pure accumulator
+ handleResult accumulator (FilterResult user SuspiciousFilter) = pure
+ $ Vector.snoc accumulator user
+
+handleFailedFilter :: Settings -> Bool -> User -> IO ()
+handleFailedFilter settings liveRun user = Text.IO.putStrLn buildValue
+ >> modifyIORef (getField @"statistics" settings) (+ 1)
>> when liveRun (purgeUser settings user)
where
buildValue = getField @"username" user <> "\n"
@@ -58,14 +71,7 @@ main = execParser commandLineInfo >>= withArguments
withSettings :: Bool -> Settings -> IO ()
withSettings liveRun settings = getUsers settings
- >>= Vector.foldM' foldFilterResult Vector.empty . fmap filterByUserProperties
+ >>= handleResults settings liveRun . fmap (filterByUserProperties settings)
>>= traverse (filterByActivities settings)
- >>= Vector.foldM' foldFilterResult Vector.empty
>>= handleResults settings liveRun
- where
- foldFilterResult accumulator (FilterResult user SuspiciousFilter)
- = pure $ Vector.snoc accumulator user
- foldFilterResult accumulator (FilterResult _ PassFilter) = pure accumulator
- foldFilterResult accumulator (FilterResult user FailedFilter)
- = handleResult settings liveRun user
- >> pure accumulator
+ >> printStatistics settings
diff --git a/tea-cleaner/TeaCleaner/Configuration.hs b/tea-cleaner/TeaCleaner/Configuration.hs
index a7342d3..1d2db32 100644
--- a/tea-cleaner/TeaCleaner/Configuration.hs
+++ b/tea-cleaner/TeaCleaner/Configuration.hs
@@ -26,35 +26,44 @@ import Options.Applicative
import Text.URI (URI)
import qualified Text.URI as URI
import Data.Time (UTCTime(..), getCurrentTime)
+import Data.IORef (IORef, newIORef)
data ConfigFile = ConfigFile
{ token :: StrictText
, server :: StrictText
+ , spamWords :: [StrictText]
+ , mailDomains :: [StrictText]
} deriving (Eq, Show)
configFileCodec :: Toml.TomlCodec ConfigFile
configFileCodec = ConfigFile
<$> Toml.text "token" .= getField @"token"
<*> Toml.text "server" .= getField @"server"
+ <*> Toml.arrayOf Toml._Text "spam_words" .= getField @"spamWords"
+ <*> Toml.arrayOf Toml._Text "mail_domains" .= getField @"mailDomains"
data Settings = Settings
{ token :: StrictText
, server :: URI
, now :: UTCTime
- } deriving (Eq, Show)
+ , spamWords :: [StrictText]
+ , mailDomains :: [StrictText]
+ , statistics :: IORef Int
+ } deriving Eq
decodeSettingsFile :: FilePath -> IO Settings
-decodeSettingsFile configPath = Toml.decodeFile configFileCodec configPath
- >>= withConfiguration
- where
- withConfiguration configFile@ConfigFile{ server } = URI.mkURI server
- >>= withServer configFile
- withServer configFile parsedServer = getCurrentTime
- >>= withTime configFile parsedServer
- withTime ConfigFile{..} parsedServer now = pure $ Settings
+decodeSettingsFile configPath = do
+ ConfigFile{..} <- Toml.decodeFile configFileCodec configPath
+ parsedServer <- URI.mkURI server
+ now <- getCurrentTime
+ ioRef <- newIORef 0
+ pure $ Settings
{ token = token
, server = parsedServer
, now = now
+ , spamWords = spamWords
+ , mailDomains = mailDomains
+ , statistics = ioRef
}
newtype ProgramOptions = ProgramOptions
diff --git a/tea-cleaner/TeaCleaner/Filter.hs b/tea-cleaner/TeaCleaner/Filter.hs
index 8448ff5..658606d 100644
--- a/tea-cleaner/TeaCleaner/Filter.hs
+++ b/tea-cleaner/TeaCleaner/Filter.hs
@@ -6,11 +6,11 @@ module TeaCleaner.Filter
) where
import qualified Data.Text as Text
-import Data.Time (LocalTime(..), ZonedTime(..))
-import Data.Time.Calendar.OrdinalDate (fromOrdinalDate)
+import Data.Time (LocalTime(..), ZonedTime(..), UTCTime(..), addUTCTime)
import qualified Data.Vector as Vector
import TeaCleaner.Client (Activity(..), User(..), getActivities)
import TeaCleaner.Configuration (Settings(..))
+import GHC.Records (HasField(..))
data UserFilter
= PassFilter
@@ -21,15 +21,31 @@ data UserFilter
data FilterResult = FilterResult User UserFilter
deriving (Show)
-filterByUserProperties :: User -> FilterResult
-filterByUserProperties user@User{ created, lastLogin, description, website }
- | zonedDay created == zonedDay lastLogin
- , zonedDay created > fromOrdinalDate 2024 1
- , zonedDay created < fromOrdinalDate 2025 17
- , not (Text.null description)
- , not (Text.null website) = FilterResult user SuspiciousFilter
+filterByUserProperties :: Settings -> User -> FilterResult
+filterByUserProperties settings user@User{ created, lastLogin }
+ | noLoginSinceRegistration = FilterResult user FailedFilter
+ | containsSpamWords = FilterResult user FailedFilter
+ | percentEncodedWebsite = FilterResult user FailedFilter
+ | hasFullDescription = FilterResult user SuspiciousFilter
+ | unusualMailDomains = FilterResult user SuspiciousFilter
| otherwise = FilterResult user PassFilter
where
+ percentEncodedWebsite = Text.elem '%' $ getField @"website" user
+ unusualMailDomains =
+ let predicate = (`Text.isSuffixOf` getField @"email" user)
+ in any predicate (getField @"mailDomains" settings)
+ containsSpamWords =
+ let lowerCaseDescription = Text.toLower $ getField @"description" user
+ lowerCaseWebsite = Text.toLower $ getField @"website" user
+ predicate word = Text.isInfixOf word lowerCaseWebsite
+ || Text.isInfixOf word lowerCaseDescription
+ in any predicate (getField @"spamWords" settings)
+ hasFullDescription
+ = not (Text.null $ getField @"description" user)
+ && not (Text.null $ getField @"website" user)
+ noLoginSinceRegistration =
+ let monthAgo = utctDay $ addUTCTime (-2592000) $ getField @"now" settings
+ in zonedDay created < monthAgo && zonedDay created == zonedDay lastLogin
zonedDay = localDay . zonedTimeToLocalTime
filterByActivities :: Settings -> User -> IO FilterResult
@@ -39,5 +55,5 @@ filterByActivities settings user = getActivities settings user
evalActivities activities
| Just (Activity{ opType }, rest) <- Vector.uncons activities
, Vector.null rest
- , opType == "create_repo" = pure $ FilterResult user SuspiciousFilter
+ , opType == "create_repo" = pure $ FilterResult user FailedFilter
evalActivities _ = pure $ FilterResult user PassFilter