From 3d1df64a19b07d85b5b6789fcbf3273f555954b4 Mon Sep 17 00:00:00 2001 From: Zaahid Bateson Date: Mon, 30 Jul 2018 09:09:49 -0500 Subject: [PATCH] Use AddressHeader instead of EmailAddressParser --- src/Email/EmailAddressParser.php | 129 ------------------------------- src/EmailSynchronizer.php | 37 +++++++-- 2 files changed, 31 insertions(+), 135 deletions(-) delete mode 100644 src/Email/EmailAddressParser.php diff --git a/src/Email/EmailAddressParser.php b/src/Email/EmailAddressParser.php deleted file mode 100644 index 1288c5e..0000000 --- a/src/Email/EmailAddressParser.php +++ /dev/null @@ -1,129 +0,0 @@ -string = $string; - } - - /** - * @return EmailAddress[] - */ - public function parse() - { - // Workaround some anti-bot measures - $emails = str_replace('(original)', '', $this->string); - $emails = preg_replace('/[<>\(\)]/', '', $emails); - $emails = str_replace(' . ', '.', $emails); - $emails = preg_replace('/([. #]at[.# ])/', '@', $emails); - $emails = preg_replace('/([. #]dot[.# ])/', '.', $emails); - - $names = $this->extractEmails($emails); - $this->extractNames($names); - - // Cleanup dead results - $this->emails = array_values(array_filter($this->emails)); - $this->names = array_values(array_filter($this->names)); - - // Combine informations - $identities = []; - $count = count($this->emails) ?: count($this->names); - - for ($i = 0; $i < $count; ++$i) { - $email = $this->emails[$i] ?? null; - if (!$email || !filter_var($email, FILTER_VALIDATE_EMAIL)) { - $email = null; - } - - $identities[] = new EmailAddress($email, $this->names[$i] ?? null); - } - - return $identities; - } - - /** - * @param string $emails - * - * @return string - */ - private function extractEmails($emails) - { - // Try to split off emails - $names = $emails; - $emails = preg_split('/[\s,\/]+/', $emails); - - foreach ($emails as $key => $email) { - // Check if email is valid, if not - // throw it away - $email = $this->trimCharacters($email); - $email = filter_var($email, FILTER_VALIDATE_EMAIL) ? $email : null; - $names = str_replace($email, '', $names); - $this->emails[$key] = $email; - } - - return $names; - } - - /** - * @return string - */ - private function trimCharacters(string $string) - { - return trim($string, ' /"='); - } - - /** - * @param string $names - */ - private function extractNames($names) - { - $names = preg_split('/(,| |\n)/', $names); - $names = array_filter($names); - - foreach ($names as $key => $name) { - $name = $this->trimCharacters($name); - - // Special cases for that one guy who - // put his whole resume as name and other - // marvelous joys - if ( - mb_strpos($name, 'Watson Research') || - mb_strlen($name) <= 3 || - mb_strpos($name, '?') !== false || - mb_strpos($name, 'http') !== false - ) { - continue; - } - - $this->names[$key] = $name; - } - } -} diff --git a/src/EmailSynchronizer.php b/src/EmailSynchronizer.php index c679d8c..c1d7cd3 100644 --- a/src/EmailSynchronizer.php +++ b/src/EmailSynchronizer.php @@ -7,7 +7,6 @@ use Doctrine\DBAL\Exception\UniqueConstraintViolationException; use Externals\Email\Email; use Externals\Email\EmailAddress; -use Externals\Email\EmailAddressParser; use Externals\Email\EmailContentParser; use Externals\Email\EmailRepository; use Externals\Email\EmailSubjectParser; @@ -17,6 +16,7 @@ use Rvdv\Nntp\Command\ArticleCommand; use Rvdv\Nntp\Connection\Connection; use Rvdv\Nntp\Exception\UnknownHandlerException; +use ZBateson\MailMimeParser\Header\AddressHeader; use ZBateson\MailMimeParser\Header\DateHeader; use ZBateson\MailMimeParser\MailMimeParser; use ZBateson\MailMimeParser\Message; @@ -143,17 +143,13 @@ public function synchronizeEmail(int $number, string $source) $subject = $this->subjectParser->sanitize($parsedDocument->getHeaderValue('subject')); $content = $this->contentParser->parse((string) $parsedDocument->getTextContent()); - // We don't use the special AddressHeader class because it doesn't seem to parse the - // person's name at all $fromHeader = $parsedDocument->getHeader('from'); if (!$fromHeader) { $this->logger->warning("Cannot synchronize message $number because it contains no 'from' header"); return; } - $emailAddressParser = new EmailAddressParser($fromHeader->getRawValue()); - $fromArray = $emailAddressParser->parse(); /** @var EmailAddress $from */ - $from = reset($fromArray); + $from = $this->getFirstFilteredAddress($fromHeader); $emailId = $parsedDocument->getHeaderValue('message-id'); @@ -226,4 +222,33 @@ private function parseDateTime(Message $parsedDocument) return $date; } + + /** + * @return EmailAddress + */ + private function getFirstFilteredAddress(AddressHeader $header) + { + $addresses = $header->getAddresses(); + foreach ($addresses as $address) { + $email = $address->getEmail(); + $email = preg_replace('/([. #]at[.# ])/', '@', $email); + $email = preg_replace('/([. #]dot[.# ])/', '.', $email); + if (filter_var($email, FILTER_VALIDATE_EMAIL)) { + // Special cases for that one guy who + // put his whole resume as name and other + // marvelous joys + $name = $address->getName(); + if ( + mb_strpos($name, 'Watson Research') || + mb_strlen($name) <= 3 || + mb_strpos($name, '?') !== false || + mb_strpos($name, 'http') !== false + ) { + $name = ''; + } + return new EmailAddress($name, $email); + } + } + return null; + } }