diff --git a/CHANGELOG.md b/CHANGELOG.md index 434c5bb..bfa31f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,11 +20,8 @@ CHANGELOG * Duplicate `.com`s are now removed from email domain names when `hashEmail` is used. For example, `example.com.com` will become `example.com`. -* Extraneous characters after `.com` are now removed from email domain - names when `hashEmail` is used. For example, `example.comfoo` will become - `example.com`. -* Certain `.com` typos are now normalized to `.com` when `hashEmail` is - used. For example, `example.cam` will become `example.com`. +* Certain TLD typos are now normalized when `hashEmail` is used. For + example, `example.comcom` will become `example.com`. * Additional `gmail.com` domain names with leading digits are now normalized when `hashEmail` is used. For example, `100gmail.com` will become `gmail.com`. diff --git a/src/MinFraud/Util.php b/src/MinFraud/Util.php index 567d16d..e95c796 100644 --- a/src/MinFraud/Util.php +++ b/src/MinFraud/Util.php @@ -31,6 +31,57 @@ class Util 'putlook.com' => 'outlook.com', ]; + /** + * @var array + */ + private static $typoTLDs = [ + 'comm' => 'com', + 'commm' => 'com', + 'commmm' => 'com', + 'comn' => 'com', + + 'cbm' => 'com', + 'ccm' => 'com', + 'cdm' => 'com', + 'cem' => 'com', + 'cfm' => 'com', + 'cgm' => 'com', + 'chm' => 'com', + 'cim' => 'com', + 'cjm' => 'com', + 'ckm' => 'com', + 'clm' => 'com', + 'cmm' => 'com', + 'cnm' => 'com', + 'cpm' => 'com', + 'cqm' => 'com', + 'crm' => 'com', + 'csm' => 'com', + 'ctm' => 'com', + 'cum' => 'com', + 'cvm' => 'com', + 'cwm' => 'com', + 'cxm' => 'com', + 'cym' => 'com', + 'czm' => 'com', + + 'col' => 'com', + 'con' => 'com', + + 'dom' => 'com', + 'don' => 'com', + 'som' => 'com', + 'son' => 'com', + 'vom' => 'com', + 'von' => 'com', + 'xom' => 'com', + 'xon' => 'com', + + 'clam' => 'com', + 'colm' => 'com', + 'comcom' => 'com', + ]; + /** * @var array */ @@ -271,10 +322,16 @@ private static function cleanDomain(string $domain): string } $domain = preg_replace('/(?:\.com){2,}$/', '.com', $domain); - $domain = preg_replace('/\.com[^.]+$/', '.com', $domain); - $domain = preg_replace('/(?:\.(?:com|c[a-z]{1,2}m|co[ln]|[dsvx]o[mn]|))$/', '.com', $domain); $domain = preg_replace('/^\d+(?:gmail?\.com)$/', 'gmail.com', $domain); + $idx = strrpos($domain, '.'); + if ($idx !== false) { + $tld = substr($domain, $idx + 1); + if (isset(self::$typoTLDs[$tld])) { + $domain = substr($domain, 0, $idx) . '.' . self::$typoTLDs[$tld]; + } + } + if (isset(self::$typoDomains[$domain])) { $domain = self::$typoDomains[$domain]; } diff --git a/tests/MaxMind/Test/MinFraud/UtilTest.php b/tests/MaxMind/Test/MinFraud/UtilTest.php index 0d039a0..3f4428d 100644 --- a/tests/MaxMind/Test/MinFraud/UtilTest.php +++ b/tests/MaxMind/Test/MinFraud/UtilTest.php @@ -223,18 +223,18 @@ public function testMaybeHashEmail(): void 'input' => ['email' => ['address' => 'foo@example.comfoo']], 'expected' => [ 'email' => [ - 'address' => md5('foo@example.com'), - 'domain' => 'example.com', + 'address' => md5('foo@example.comfoo'), + 'domain' => 'example.comfoo', ], ], ], [ - 'name' => '.com typo', + 'name' => '.cam', 'input' => ['email' => ['address' => 'foo@example.cam']], 'expected' => [ 'email' => [ - 'address' => md5('foo@example.com'), - 'domain' => 'example.com', + 'address' => md5('foo@example.cam'), + 'domain' => 'example.cam', ], ], ], @@ -258,6 +258,16 @@ public function testMaybeHashEmail(): void ], ], ], + [ + 'name' => 'TLD typo', + 'input' => ['email' => ['address' => 'foo@example.comcom']], + 'expected' => [ + 'email' => [ + 'address' => md5('foo@example.com'), + 'domain' => 'example.com', + ], + ], + ], ]; if (\function_exists('idn_to_ascii')