tea-cleaner: Configure word lists

This commit is contained in:
Eugen Wissner 2025-02-22 21:30:29 +01:00
parent a4c56fb432
commit 2dd3856389
Signed by: belka
GPG Key ID: A27FDC1E8EE902C0
4 changed files with 78 additions and 36 deletions

View File

@ -2,3 +2,14 @@
token = "" token = ""
# The gitea instance URL beginning wth "https://". # The gitea instance URL beginning wth "https://".
server = "https://" server = "https://"
# List of words prohibited in the description and the website field.
spam_words = [
"casino",
"gambling"
]
# Unusual email address exteions.
mail_domains = [
".online",
".shop",
".website"
]

View File

@ -25,19 +25,32 @@ import TeaCleaner.Configuration
, execParser , execParser
) )
import Control.Monad (when) import Control.Monad (when)
import Data.IORef (modifyIORef, readIORef)
handleResults :: Settings -> Bool -> Vector User -> IO () printStatistics :: Settings -> IO ()
handleResults settings liveRun users = printCount printStatistics Settings{ statistics } =
>> Vector.forM_ users (handleResult settings liveRun) readIORef statistics >>= printCount
where where
printCount = printCount count =
let count = Text.Builder.decimal $ Vector.length users let count' = Text.Builder.decimal count
in Text.Lazy.IO.putStrLn in Text.Lazy.IO.putStrLn
$ Text.Builder.toLazyText $ Text.Builder.toLazyText
$ "Count: " <> count $ "Count: " <> count'
handleResult :: Settings -> Bool -> User -> IO () handleResults :: Settings -> Bool -> Vector FilterResult -> IO (Vector User)
handleResult settings liveRun user = Text.IO.putStrLn buildValue handleResults settings liveRun =
Vector.foldM' handleResult Vector.empty
where
handleResult accumulator (FilterResult user FailedFilter)
= handleFailedFilter settings liveRun user
>> pure accumulator
handleResult accumulator (FilterResult _ PassFilter) = pure accumulator
handleResult accumulator (FilterResult user SuspiciousFilter) = pure
$ Vector.snoc accumulator user
handleFailedFilter :: Settings -> Bool -> User -> IO ()
handleFailedFilter settings liveRun user = Text.IO.putStrLn buildValue
>> modifyIORef (getField @"statistics" settings) (+ 1)
>> when liveRun (purgeUser settings user) >> when liveRun (purgeUser settings user)
where where
buildValue = getField @"username" user <> "\n" buildValue = getField @"username" user <> "\n"
@ -58,14 +71,7 @@ main = execParser commandLineInfo >>= withArguments
withSettings :: Bool -> Settings -> IO () withSettings :: Bool -> Settings -> IO ()
withSettings liveRun settings = getUsers settings withSettings liveRun settings = getUsers settings
>>= Vector.foldM' foldFilterResult Vector.empty . fmap filterByUserProperties >>= handleResults settings liveRun . fmap (filterByUserProperties settings)
>>= traverse (filterByActivities settings) >>= traverse (filterByActivities settings)
>>= Vector.foldM' foldFilterResult Vector.empty
>>= handleResults settings liveRun >>= handleResults settings liveRun
where >> printStatistics settings
foldFilterResult accumulator (FilterResult user SuspiciousFilter)
= pure $ Vector.snoc accumulator user
foldFilterResult accumulator (FilterResult _ PassFilter) = pure accumulator
foldFilterResult accumulator (FilterResult user FailedFilter)
= handleResult settings liveRun user
>> pure accumulator

View File

@ -26,35 +26,44 @@ import Options.Applicative
import Text.URI (URI) import Text.URI (URI)
import qualified Text.URI as URI import qualified Text.URI as URI
import Data.Time (UTCTime(..), getCurrentTime) import Data.Time (UTCTime(..), getCurrentTime)
import Data.IORef (IORef, newIORef)
data ConfigFile = ConfigFile data ConfigFile = ConfigFile
{ token :: StrictText { token :: StrictText
, server :: StrictText , server :: StrictText
, spamWords :: [StrictText]
, mailDomains :: [StrictText]
} deriving (Eq, Show) } deriving (Eq, Show)
configFileCodec :: Toml.TomlCodec ConfigFile configFileCodec :: Toml.TomlCodec ConfigFile
configFileCodec = ConfigFile configFileCodec = ConfigFile
<$> Toml.text "token" .= getField @"token" <$> Toml.text "token" .= getField @"token"
<*> Toml.text "server" .= getField @"server" <*> Toml.text "server" .= getField @"server"
<*> Toml.arrayOf Toml._Text "spam_words" .= getField @"spamWords"
<*> Toml.arrayOf Toml._Text "mail_domains" .= getField @"mailDomains"
data Settings = Settings data Settings = Settings
{ token :: StrictText { token :: StrictText
, server :: URI , server :: URI
, now :: UTCTime , now :: UTCTime
} deriving (Eq, Show) , spamWords :: [StrictText]
, mailDomains :: [StrictText]
, statistics :: IORef Int
} deriving Eq
decodeSettingsFile :: FilePath -> IO Settings decodeSettingsFile :: FilePath -> IO Settings
decodeSettingsFile configPath = Toml.decodeFile configFileCodec configPath decodeSettingsFile configPath = do
>>= withConfiguration ConfigFile{..} <- Toml.decodeFile configFileCodec configPath
where parsedServer <- URI.mkURI server
withConfiguration configFile@ConfigFile{ server } = URI.mkURI server now <- getCurrentTime
>>= withServer configFile ioRef <- newIORef 0
withServer configFile parsedServer = getCurrentTime pure $ Settings
>>= withTime configFile parsedServer
withTime ConfigFile{..} parsedServer now = pure $ Settings
{ token = token { token = token
, server = parsedServer , server = parsedServer
, now = now , now = now
, spamWords = spamWords
, mailDomains = mailDomains
, statistics = ioRef
} }
newtype ProgramOptions = ProgramOptions newtype ProgramOptions = ProgramOptions

View File

@ -6,11 +6,11 @@ module TeaCleaner.Filter
) where ) where
import qualified Data.Text as Text import qualified Data.Text as Text
import Data.Time (LocalTime(..), ZonedTime(..)) import Data.Time (LocalTime(..), ZonedTime(..), UTCTime(..), addUTCTime)
import Data.Time.Calendar.OrdinalDate (fromOrdinalDate)
import qualified Data.Vector as Vector import qualified Data.Vector as Vector
import TeaCleaner.Client (Activity(..), User(..), getActivities) import TeaCleaner.Client (Activity(..), User(..), getActivities)
import TeaCleaner.Configuration (Settings(..)) import TeaCleaner.Configuration (Settings(..))
import GHC.Records (HasField(..))
data UserFilter data UserFilter
= PassFilter = PassFilter
@ -21,15 +21,31 @@ data UserFilter
data FilterResult = FilterResult User UserFilter data FilterResult = FilterResult User UserFilter
deriving (Show) deriving (Show)
filterByUserProperties :: User -> FilterResult filterByUserProperties :: Settings -> User -> FilterResult
filterByUserProperties user@User{ created, lastLogin, description, website } filterByUserProperties settings user@User{ created, lastLogin }
| zonedDay created == zonedDay lastLogin | noLoginSinceRegistration = FilterResult user FailedFilter
, zonedDay created > fromOrdinalDate 2024 1 | containsSpamWords = FilterResult user FailedFilter
, zonedDay created < fromOrdinalDate 2025 17 | percentEncodedWebsite = FilterResult user FailedFilter
, not (Text.null description) | hasFullDescription = FilterResult user SuspiciousFilter
, not (Text.null website) = FilterResult user SuspiciousFilter | unusualMailDomains = FilterResult user SuspiciousFilter
| otherwise = FilterResult user PassFilter | otherwise = FilterResult user PassFilter
where where
percentEncodedWebsite = Text.elem '%' $ getField @"website" user
unusualMailDomains =
let predicate = (`Text.isSuffixOf` getField @"email" user)
in any predicate (getField @"mailDomains" settings)
containsSpamWords =
let lowerCaseDescription = Text.toLower $ getField @"description" user
lowerCaseWebsite = Text.toLower $ getField @"website" user
predicate word = Text.isInfixOf word lowerCaseWebsite
|| Text.isInfixOf word lowerCaseDescription
in any predicate (getField @"spamWords" settings)
hasFullDescription
= not (Text.null $ getField @"description" user)
&& not (Text.null $ getField @"website" user)
noLoginSinceRegistration =
let monthAgo = utctDay $ addUTCTime (-2592000) $ getField @"now" settings
in zonedDay created < monthAgo && zonedDay created == zonedDay lastLogin
zonedDay = localDay . zonedTimeToLocalTime zonedDay = localDay . zonedTimeToLocalTime
filterByActivities :: Settings -> User -> IO FilterResult filterByActivities :: Settings -> User -> IO FilterResult
@ -39,5 +55,5 @@ filterByActivities settings user = getActivities settings user
evalActivities activities evalActivities activities
| Just (Activity{ opType }, rest) <- Vector.uncons activities | Just (Activity{ opType }, rest) <- Vector.uncons activities
, Vector.null rest , Vector.null rest
, opType == "create_repo" = pure $ FilterResult user SuspiciousFilter , opType == "create_repo" = pure $ FilterResult user FailedFilter
evalActivities _ = pure $ FilterResult user PassFilter evalActivities _ = pure $ FilterResult user PassFilter