tea-cleaner: Configure word lists

This commit is contained in:
Eugen Wissner 2025-02-22 21:30:29 +01:00
parent a4c56fb432
commit 2dd3856389
Signed by: belka
GPG Key ID: A27FDC1E8EE902C0
4 changed files with 78 additions and 36 deletions

View File

@ -2,3 +2,14 @@
token = ""
# The gitea instance URL beginning wth "https://".
server = "https://"
# List of words prohibited in the description and the website field.
spam_words = [
"casino",
"gambling"
]
# Unusual email address exteions.
mail_domains = [
".online",
".shop",
".website"
]

View File

@ -25,19 +25,32 @@ import TeaCleaner.Configuration
, execParser
)
import Control.Monad (when)
import Data.IORef (modifyIORef, readIORef)
handleResults :: Settings -> Bool -> Vector User -> IO ()
handleResults settings liveRun users = printCount
>> Vector.forM_ users (handleResult settings liveRun)
printStatistics :: Settings -> IO ()
printStatistics Settings{ statistics } =
readIORef statistics >>= printCount
where
printCount =
let count = Text.Builder.decimal $ Vector.length users
printCount count =
let count' = Text.Builder.decimal count
in Text.Lazy.IO.putStrLn
$ Text.Builder.toLazyText
$ "Count: " <> count
$ "Count: " <> count'
handleResult :: Settings -> Bool -> User -> IO ()
handleResult settings liveRun user = Text.IO.putStrLn buildValue
handleResults :: Settings -> Bool -> Vector FilterResult -> IO (Vector User)
handleResults settings liveRun =
Vector.foldM' handleResult Vector.empty
where
handleResult accumulator (FilterResult user FailedFilter)
= handleFailedFilter settings liveRun user
>> pure accumulator
handleResult accumulator (FilterResult _ PassFilter) = pure accumulator
handleResult accumulator (FilterResult user SuspiciousFilter) = pure
$ Vector.snoc accumulator user
handleFailedFilter :: Settings -> Bool -> User -> IO ()
handleFailedFilter settings liveRun user = Text.IO.putStrLn buildValue
>> modifyIORef (getField @"statistics" settings) (+ 1)
>> when liveRun (purgeUser settings user)
where
buildValue = getField @"username" user <> "\n"
@ -58,14 +71,7 @@ main = execParser commandLineInfo >>= withArguments
withSettings :: Bool -> Settings -> IO ()
withSettings liveRun settings = getUsers settings
>>= Vector.foldM' foldFilterResult Vector.empty . fmap filterByUserProperties
>>= handleResults settings liveRun . fmap (filterByUserProperties settings)
>>= traverse (filterByActivities settings)
>>= Vector.foldM' foldFilterResult Vector.empty
>>= handleResults settings liveRun
where
foldFilterResult accumulator (FilterResult user SuspiciousFilter)
= pure $ Vector.snoc accumulator user
foldFilterResult accumulator (FilterResult _ PassFilter) = pure accumulator
foldFilterResult accumulator (FilterResult user FailedFilter)
= handleResult settings liveRun user
>> pure accumulator
>> printStatistics settings

View File

@ -26,35 +26,44 @@ import Options.Applicative
import Text.URI (URI)
import qualified Text.URI as URI
import Data.Time (UTCTime(..), getCurrentTime)
import Data.IORef (IORef, newIORef)
data ConfigFile = ConfigFile
{ token :: StrictText
, server :: StrictText
, spamWords :: [StrictText]
, mailDomains :: [StrictText]
} deriving (Eq, Show)
configFileCodec :: Toml.TomlCodec ConfigFile
configFileCodec = ConfigFile
<$> Toml.text "token" .= getField @"token"
<*> Toml.text "server" .= getField @"server"
<*> Toml.arrayOf Toml._Text "spam_words" .= getField @"spamWords"
<*> Toml.arrayOf Toml._Text "mail_domains" .= getField @"mailDomains"
data Settings = Settings
{ token :: StrictText
, server :: URI
, now :: UTCTime
} deriving (Eq, Show)
, spamWords :: [StrictText]
, mailDomains :: [StrictText]
, statistics :: IORef Int
} deriving Eq
decodeSettingsFile :: FilePath -> IO Settings
decodeSettingsFile configPath = Toml.decodeFile configFileCodec configPath
>>= withConfiguration
where
withConfiguration configFile@ConfigFile{ server } = URI.mkURI server
>>= withServer configFile
withServer configFile parsedServer = getCurrentTime
>>= withTime configFile parsedServer
withTime ConfigFile{..} parsedServer now = pure $ Settings
decodeSettingsFile configPath = do
ConfigFile{..} <- Toml.decodeFile configFileCodec configPath
parsedServer <- URI.mkURI server
now <- getCurrentTime
ioRef <- newIORef 0
pure $ Settings
{ token = token
, server = parsedServer
, now = now
, spamWords = spamWords
, mailDomains = mailDomains
, statistics = ioRef
}
newtype ProgramOptions = ProgramOptions

View File

@ -6,11 +6,11 @@ module TeaCleaner.Filter
) where
import qualified Data.Text as Text
import Data.Time (LocalTime(..), ZonedTime(..))
import Data.Time.Calendar.OrdinalDate (fromOrdinalDate)
import Data.Time (LocalTime(..), ZonedTime(..), UTCTime(..), addUTCTime)
import qualified Data.Vector as Vector
import TeaCleaner.Client (Activity(..), User(..), getActivities)
import TeaCleaner.Configuration (Settings(..))
import GHC.Records (HasField(..))
data UserFilter
= PassFilter
@ -21,15 +21,31 @@ data UserFilter
data FilterResult = FilterResult User UserFilter
deriving (Show)
filterByUserProperties :: User -> FilterResult
filterByUserProperties user@User{ created, lastLogin, description, website }
| zonedDay created == zonedDay lastLogin
, zonedDay created > fromOrdinalDate 2024 1
, zonedDay created < fromOrdinalDate 2025 17
, not (Text.null description)
, not (Text.null website) = FilterResult user SuspiciousFilter
filterByUserProperties :: Settings -> User -> FilterResult
filterByUserProperties settings user@User{ created, lastLogin }
| noLoginSinceRegistration = FilterResult user FailedFilter
| containsSpamWords = FilterResult user FailedFilter
| percentEncodedWebsite = FilterResult user FailedFilter
| hasFullDescription = FilterResult user SuspiciousFilter
| unusualMailDomains = FilterResult user SuspiciousFilter
| otherwise = FilterResult user PassFilter
where
percentEncodedWebsite = Text.elem '%' $ getField @"website" user
unusualMailDomains =
let predicate = (`Text.isSuffixOf` getField @"email" user)
in any predicate (getField @"mailDomains" settings)
containsSpamWords =
let lowerCaseDescription = Text.toLower $ getField @"description" user
lowerCaseWebsite = Text.toLower $ getField @"website" user
predicate word = Text.isInfixOf word lowerCaseWebsite
|| Text.isInfixOf word lowerCaseDescription
in any predicate (getField @"spamWords" settings)
hasFullDescription
= not (Text.null $ getField @"description" user)
&& not (Text.null $ getField @"website" user)
noLoginSinceRegistration =
let monthAgo = utctDay $ addUTCTime (-2592000) $ getField @"now" settings
in zonedDay created < monthAgo && zonedDay created == zonedDay lastLogin
zonedDay = localDay . zonedTimeToLocalTime
filterByActivities :: Settings -> User -> IO FilterResult
@ -39,5 +55,5 @@ filterByActivities settings user = getActivities settings user
evalActivities activities
| Just (Activity{ opType }, rest) <- Vector.uncons activities
, Vector.null rest
, opType == "create_repo" = pure $ FilterResult user SuspiciousFilter
, opType == "create_repo" = pure $ FilterResult user FailedFilter
evalActivities _ = pure $ FilterResult user PassFilter