diff --git a/README.md b/README.md
index da31fe2f89c64c1664f5603f4921060a22ea3173..c045f23b94c3bf94f90819d81d244b4f2d214759 100644
--- a/README.md
+++ b/README.md
@@ -152,3 +152,75 @@ occ email-recovery:recovery-warning-notification 2>> /var/log/nextcloud/recovery
 # Log with timestamps
 occ email-recovery:recovery-warning-notification 2>&1 | while IFS= read -r line; do echo "$(date '+%Y-%m-%d %H:%M:%S') $line"; done >> /var/log/nextcloud/recovery-timestamped.log
 ```
+
+
+### Spam Account Filter Command
+
+This section explains how to use the `FilterLegitimateDomainsFromSpamReport` command to filter out legitimate domains from spam account reports.
+
+#### Overview
+
+The command processes a spam account report file and removes entries that belong to popular (legitimate) email domains, printing only the true spam entries to standard output.
+
+#### Usage
+
+```bash
+occ email-recovery:filter-legitimate-domains <input-file> [options]
+```
+
+##### Arguments
+
+- `input-file`: Path to the spam account report file (e.g., `spam-account-report-2025-07-23.txt`)
+- Output: the command takes no output-file argument; filtered entries are written to standard output, so redirect it to save them (e.g., `> filtered-spam-report.txt`)
+
+##### Options
+
+- `--include-recovery-email`: Include recovery email addresses in the output (format: `userid,recovery-email`)
+- `-q`, `--quiet`: Suppress progress messages and only output filtered data
+
+#### Examples
+
+##### Basic Usage
+```bash
+occ email-recovery:filter-legitimate-domains spam-account-report-2025-07-23.txt > filtered-spam-report.txt
+```
+
+##### Include Recovery Email Addresses
+```bash
+occ email-recovery:filter-legitimate-domains spam-account-report-2025-07-23.txt --include-recovery-email > filtered-spam-report.txt
+```
+
+#### How It Works
+
+1. **Reads the input file**: Parses each line as a user ID from the spam report
+2. **Gets recovery email**: Retrieves the recovery email address for each user
+3. **Validates recovery email**: Uses the same validation logic as `getAllSpamEmails()` in `RecoveryEmailService`
+4. **Filters entries**: Keeps only entries that fail validation (true spam) and filters out legitimate domains
+5. **Outputs the results**: Prints the filtered entries to standard output, one entry per line
+
+The filtering uses the exact same logic as the spam detection system, ensuring consistency and accuracy.
+
+#### Output Format
+
+##### Without --include-recovery-email
+```
+user1
+user2
+user3
+```
+
+##### With --include-recovery-email
+```
+user1,spam@example.com
+user2,fake@disposable.com
+user3,temp@throwaway.net
+```
+
+#### Summary Information
+
+The command logs a summary (at info level) showing:
+- Total entries processed
+- Legitimate domains (filtered out)
+- Spam entries (kept)
+- Invalid entries (skipped)
+
diff --git a/appinfo/info.xml b/appinfo/info.xml
index cda7e90aada4536354f19d19f4ef2b05a282eb23..1ded70896909a418e1c4a3907ba95fbe5d5d0f32 100644
--- a/appinfo/info.xml
+++ b/appinfo/info.xml
@@ -24,5 +24,6 @@
 OCA\EmailRecovery\Command\ResetDisposableDomainsList
 OCA\EmailRecovery\Command\AdminBlacklistedDomains
 OCA\EmailRecovery\Command\RecoveryWarningNotificationCommand
+ OCA\EmailRecovery\Command\FilterLegitimateDomainsFromSpamReport
diff --git a/lib/Command/FilterLegitimateDomainsFromSpamReport.php b/lib/Command/FilterLegitimateDomainsFromSpamReport.php
new file mode 100644
index 0000000000000000000000000000000000000000..cedf1af0984f1c4a0f5ca36c641a19c9dea09987
--- /dev/null
+++ b/lib/Command/FilterLegitimateDomainsFromSpamReport.php
@@ -0,0 +1,112 @@
+spamFilterService = $spamFilterService;
+		$this->logger = $logger;
+	}
+
+	/**
+	 * Registers the command name, its required input-file argument and its two options.
+	 */
+	protected function configure() {
+		$this
+			->setName(Application::APP_ID . ':filter-legitimate-domains')
+			->setDescription('Filter out legitimate domains from spam account report and output to console')
+			->addArgument('input-file', InputArgument::REQUIRED, 'Path to the spam account report file')
+			->addOption('include-recovery-email', null, InputOption::VALUE_NONE, 'Include recovery email addresses in output')
+			->addOption('quiet', 'q', InputOption::VALUE_NONE, 'Suppress progress messages and only output filtered data');
+	}
+
+	/**
+	 * Reads the report, filters it through the spam filter service, logs a summary and prints the kept entries.
+	 *
+	 * @return int Command::SUCCESS, or Command::FAILURE if the report cannot be read or any error is thrown
+	 */
+	protected function execute(InputInterface $input, OutputInterface $output): int {
+		try {
+			$inputFile = $input->getArgument('input-file');
+			$includeRecoveryEmail = $input->getOption('include-recovery-email');
+			$quiet = $input->getOption('quiet');
+
+			// Validate and read input file
+			$spamUserIds = $this->spamFilterService->readSpamReportFile($inputFile, $output);
+			if ($spamUserIds === null) {
+				return Command::FAILURE;
+			}
+
+			// Process the spam users and filter legitimate domains
+			$results = $this->spamFilterService->processSpamUsers($spamUserIds, $includeRecoveryEmail, $output);
+
+			// Log summary
+			$this->logSummary($results);
+
+			// Write filtered results to console (data only)
+			$this->outputFilteredResults($results['filteredEntries'], $output, $quiet);
+
+			return Command::SUCCESS;
+		} catch (\Throwable $e) {
+			$this->logger->error('Error while filtering spam report: ' . $e->getMessage());
+			$output->writeln('Error: ' . $e->getMessage() . '');
+			return Command::FAILURE;
+		}
+	}
+
+	/**
+	 * Logs the filtering summary
+	 */
+	private function logSummary(array $results): void {
+		$totalEntries = $results['totalEntries'];
+		$legitimateCount = $results['legitimateCount'];
+		$spamCount = $results['spamCount'];
+		$invalidCount = $totalEntries - $legitimateCount - $spamCount;
+
+		$this->logger->info('=== Filtering Summary ===');
+		$this->logger->info('Total entries processed: ' . $totalEntries);
+		$this->logger->info('Legitimate domains (filtered out): ' . $legitimateCount);
+		$this->logger->info('Spam entries (kept): ' . $spamCount);
+		$this->logger->info('Invalid entries (skipped): ' . $invalidCount);
+	}
+
+	/**
+	 * Outputs the filtered results to console (stdout) and logs status
+	 *
+	 * Entries are written with echo rather than through $output; $output and $quiet are accepted but not used here.
+	 */
+	private function outputFilteredResults(array $filteredEntries, OutputInterface $output, bool $quiet = false): void {
+		// no entries to write
+		if (empty($filteredEntries)) {
+			$this->logger->info('No spam entries to output (all were legitimate domains)');
+			return;
+		}
+
+		$this->logger->info('Outputting ' . count($filteredEntries) . ' filtered entries to console');
+
+		// Write each entry to stdout (pure data only)
+		foreach ($filteredEntries as $entry) {
+			echo $entry . "\n";
+		}
+
+		$this->logger->info('Successfully output ' . count($filteredEntries) . ' entries to console');
+	}
+}
diff --git a/lib/Service/RecoveryEmailService.php b/lib/Service/RecoveryEmailService.php
index ca8e619f4958f1bb82aaaeaa28a42cedc06bc00e..4266d56baf78908689d2f902fcf3e69b2e3c0c3f 100644
--- a/lib/Service/RecoveryEmailService.php
+++ b/lib/Service/RecoveryEmailService.php
@@ -682,6 +682,47 @@ class RecoveryEmailService {
 		return false;
 	}
 
+	/**
+	 * Validates a user's recovery email and determines if it's spam.
+	 *
+	 * This is the core validation logic used by both getAllSpamEmails and
+	 * filterLegitimateDomainsFromSpamUsers methods.
+	 *
+	 * @param string $userId The user ID to validate
+	 * @param string $recoveryEmail The recovery email to validate
+	 * @param string $userEmail The user's main email address
+	 * @return array Returns an array with 'isSpam' boolean and 'reason' string
+	 */
+	private function validateUserRecoveryEmail(string $userId, string $recoveryEmail, string $userEmail): array {
+		// Check if user has active subscription (skip if they do)
+		try {
+			if ($this->hasActiveSubscription($userEmail)) {
+				$this->logger->info("User $userId has an active subscription. Skipping spam flag for <$recoveryEmail>.");
+				return ['isSpam' => false, 'reason' => 'active_subscription'];
+			}
+		} catch (\Throwable $e) {
+			$this->logger->error("Error checking subscription for $userId <$userEmail>: " . $e->getMessage());
+			return ['isSpam' => false, 'reason' => 'subscription_check_error'];
+		}
+
+		// Validate the recovery email
+		try {
+			if ($this->validateRecoveryEmail($recoveryEmail, $userId)) {
+				// Validation passed - this is legitimate (not spam)
+				return ['isSpam' => false, 'reason' => 'validation_passed'];
+			} else {
+				// Validation failed - this is spam
+				return ['isSpam' => true, 'reason' => 'validation_failed'];
+			}
+		} catch (BlacklistedEmailException | InvalidRecoveryEmailException $e) {
+			$this->logger->info("Validation failed (spam) for $userId <$recoveryEmail>: " . $e->getMessage());
+			return ['isSpam' => true, 'reason' => 'validation_failed'];
+		} catch (\Throwable $e) {
+			$this->logger->info("Error while checking $userId <$recoveryEmail>: " . $e->getMessage());
+			return ['isSpam' => false, 'reason' => 'validation_error'];
+		}
+	}
+
 	/**
 	 * Scans all verified recovery email addresses and returns a list of spam accounts.
 	 *
@@ -721,28 +762,77 @@ class RecoveryEmailService {
 				continue;
 			}
 
-			try {
-				if ($this->hasActiveSubscription($email)) {
-					$this->logger->info("User $userId has an active subscription. Skipping spam flag for <$recoveryEmail>.");
-					continue;
+			$validation = $this->validateUserRecoveryEmail($userId, $recoveryEmail, $email);
+			if ($validation['isSpam']) {
+				$onSpamDetected($userId, $recoveryEmail);
+			}
+		}
+	}
+
+	/**
+	 * Filters legitimate domains from a list of user IDs that were flagged as spam.
+	 *
+	 * This method takes a list of user IDs from a spam report and filters out those
+	 * that belong to legitimate (popular) domains, returning only true spam entries.
+	 *
+	 * @param array $spamUserIds Array of user IDs from spam report
+	 * @param callable $onSpamDetected Callback function with signature fn(string $userId, string $recoveryEmail): void
+	 * @param callable|null $onLegitimateDetected Optional callback function with signature fn(string $userId, string $recoveryEmail): void
+	 * @param callable|null $onInvalidDetected Optional callback function with signature fn(string $userId, string $reason): void
+	 * @return void
+	 */
+	public function filterLegitimateDomainsFromSpamUsers(array $spamUserIds, callable $onSpamDetected, ?callable $onLegitimateDetected = null, ?callable $onInvalidDetected = null): void {
+		foreach ($spamUserIds as $userId) {
+			$userId = strtolower(trim($userId));
+
+			if ($userId === '') {
+				if ($onInvalidDetected) {
+					$onInvalidDetected($userId, 'empty_user_id');
 				}
-			} catch (\Throwable $e) {
-				$this->logger->error("Error checking subscription for $userId <$email>: " . $e->getMessage());
 				continue;
 			}
 
-			try {
-				if (!$this->validateRecoveryEmail($recoveryEmail, $userId)) {
-					$onSpamDetected($userId, $recoveryEmail);
+			$user = $this->userManager->get($userId);
+			if ($user === null) {
+				$this->logger->info("User not found: $userId");
+				if ($onInvalidDetected) {
+					$onInvalidDetected($userId, 'user_not_found');
 				}
-			} catch (BlacklistedEmailException | InvalidRecoveryEmailException $e) {
-				$this->logger->info("Validation failed (spam) for $userId <$recoveryEmail>: " . $e->getMessage());
+				continue;
+			}
+
+			$email = $user->getEMailAddress();
+			if (empty($email)) {
+				$this->logger->info("No email address found for user: $userId");
+				if ($onInvalidDetected) {
+					$onInvalidDetected($userId, 'no_email_address');
+				}
+				continue;
+			}
+
+			// Get recovery email for this user
+			$recoveryEmail = $this->getRecoveryEmail($userId);
+			if (empty($recoveryEmail)) {
+				$this->logger->info("No recovery email found for user: $userId");
+				if ($onInvalidDetected) {
+					$onInvalidDetected($userId, 'no_recovery_email');
+				}
+				continue;
+			}
+
+			$validation = $this->validateUserRecoveryEmail($userId, $recoveryEmail, $email);
+
+			if ($validation['isSpam']) {
 				$onSpamDetected($userId, $recoveryEmail);
-			} catch (\Throwable $e) {
-				$this->logger->info("Error while checking $userId <$recoveryEmail>: " . $e->getMessage());
+			} else {
+				// This is legitimate (validation passed or other non-spam reason)
+				if ($onLegitimateDetected) {
+					$onLegitimateDetected($userId, $recoveryEmail);
+				}
 			}
 		}
 	}
+
 	/** Recovery email reminder start date **/
 	public function getRecoveryEmailReminderStartDate(string $uid): ?string {
 		return $this->config->getUserValue($uid, $this->appName, self::RECOVERY_EMAIL_REMINDER_START_DATE, null);
diff --git a/lib/Service/SpamFilterService.php b/lib/Service/SpamFilterService.php
new file mode 100644
index 0000000000000000000000000000000000000000..06d353817bca4c1165090491586db560561cad81
--- /dev/null
+++ b/lib/Service/SpamFilterService.php
@@ -0,0 +1,115 @@
+recoveryEmailService = $recoveryEmailService;
+		$this->logger = $logger;
+	}
+
+	/**
+	 * Validates and reads the spam report input file
+	 *
+	 * @return array|null Non-blank lines of the file with the header line removed, or null if the file is missing or unreadable
+	 */
+	public function readSpamReportFile(string $inputFile, OutputInterface $output): ?array {
+		// Check if input file exists
+		if (!file_exists($inputFile)) {
+			$output->writeln('Input file does not exist: ' . $inputFile . '');
+			return null;
+		}
+
+		$this->logger->info('Reading spam account report from: ' . $inputFile);
+
+		// Read the input file
+		$lines = file($inputFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
+		if ($lines === false) {
+			$output->writeln('Failed to read input file');
+			return null;
+		}
+
+		// Filter out header lines and empty lines
+		$spamUserIds = array_filter($lines, function ($line) {
+			$line = trim($line);
+			// Compare against '' explicitly: empty() would also discard a user ID of "0"
+			return $line !== '' && $line !== 'Spam user list:';
+		});
+
+		return array_values($spamUserIds); // Re-index array
+	}
+
+	/**
+	 * Processes spam users and filters out legitimate domains
+	 *
+	 * @return array Array with the keys 'totalEntries', 'legitimateCount', 'spamCount' and 'filteredEntries'
+	 */
+	public function processSpamUsers(array $spamUserIds, bool $includeRecoveryEmail, OutputInterface $output): array {
+		$totalEntries = count($spamUserIds);
+		$legitimateCount = 0;
+		$spamCount = 0;
+		$filteredEntries = [];
+
+		$this->logger->info('Processing ' . $totalEntries . ' entries...');
+
+		// Set the context for the callback methods
+		$this->processingContext = [
+			'spamCount' => &$spamCount,
+			'legitimateCount' => &$legitimateCount,
+			'filteredEntries' => &$filteredEntries,
+			'includeRecoveryEmail' => $includeRecoveryEmail,
+			'output' => $output
+		];
+
+		// Use the service method to filter legitimate domains
+		$this->recoveryEmailService->filterLegitimateDomainsFromSpamUsers(
+			$spamUserIds,
+			// Callback for spam entries (keep them)
+			[$this, 'onSpamDetected'],
+			// Callback for legitimate entries (filter them out)
+			[$this, 'onLegitimateDetected']
+		);
+
+		return [
+			'totalEntries' => $totalEntries,
+			'legitimateCount' => $legitimateCount,
+			'spamCount' => $spamCount,
+			'filteredEntries' => $filteredEntries
+		];
+	}
+
+	/**
+	 * Callback method for spam entries (keep them)
+	 */
+	public function onSpamDetected(string $userId, string $recoveryEmail): void {
+		$this->processingContext['spamCount']++;
+
+		if ($this->processingContext['includeRecoveryEmail']) {
+			$this->processingContext['filteredEntries'][] = "$userId,$recoveryEmail";
+		} else {
+			$this->processingContext['filteredEntries'][] = $userId;
+		}
+
+		$this->logger->info("KEEPING SPAM: $userId -> $recoveryEmail");
+	}
+
+	/**
+	 * Callback method for legitimate entries (filter them out)
+	 */
+	public function onLegitimateDetected(string $userId, string $recoveryEmail): void {
+		$this->processingContext['legitimateCount']++;
+		$this->logger->info("REMOVING LEGITIMATE: $userId -> $recoveryEmail");
+	}
+}