Skip to content

Commit

Permalink
Replace fewer TLDs when normalizing
Browse files Browse the repository at this point in the history
  • Loading branch information
horgh committed Mar 15, 2024
1 parent 26e43ea commit 5b0d822
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 12 deletions.
7 changes: 2 additions & 5 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,8 @@ CHANGELOG
* Duplicate `.com`s are now removed from email domain names when
`hashEmail` is used. For example, `example.com.com` will become
`example.com`.
* Extraneous characters after `.com` are now removed from email domain
names when `hashEmail` is used. For example, `example.comfoo` will become
`example.com`.
* Certain `.com` typos are now normalized to `.com` when `hashEmail` is
used. For example, `example.cam` will become `example.com`.
* Certain TLD typos are now normalized when `hashEmail` is used. For
example, `example.comcom` will become `example.com`.
* Additional `gmail.com` domain names with leading digits are now
normalized when `hashEmail` is used. For example, `100gmail.com` will
become `gmail.com`.
Expand Down
61 changes: 59 additions & 2 deletions src/MinFraud/Util.php
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,57 @@ class Util
'putlook.com' => 'outlook.com',
];

/**
* Map of mistyped top-level domains to the TLD they are normalized to.
*
* Keys are the bare TLD text with no leading dot. cleanDomain() takes
* whatever follows the final '.' of the domain, looks it up here with an
* exact-match isset(), and on a hit replaces only that last label with the
* mapped value. TLDs that are not listed are left unchanged.
*
* Every entry currently maps to 'com'.
*
* @var array<string, string>
*/
private static $typoTLDs = [
// 'com' followed by extra trailing letters.
'comm' => 'com',
'commm' => 'com',
'commmm' => 'com',
'comn' => 'com',

// 'c?m': a wrong middle letter. Covers b-z except 'o' (the correct
// spelling); 'a' is not listed either, so 'cam' is not rewritten.
'cbm' => 'com',
'ccm' => 'com',
'cdm' => 'com',
'cem' => 'com',
'cfm' => 'com',
'cgm' => 'com',
'chm' => 'com',
'cim' => 'com',
'cjm' => 'com',
'ckm' => 'com',
'clm' => 'com',
'cmm' => 'com',
'cnm' => 'com',
'cpm' => 'com',
'cqm' => 'com',
'crm' => 'com',
'csm' => 'com',
'ctm' => 'com',
'cum' => 'com',
'cvm' => 'com',
'cwm' => 'com',
'cxm' => 'com',
'cym' => 'com',
'czm' => 'com',

// 'co?': a wrong final letter.
'col' => 'com',
'con' => 'com',

// '?om' / '?on': a wrong first letter (d, s, v or x), with either the
// correct final 'm' or a mistyped 'n'.
'dom' => 'com',
'don' => 'com',
'som' => 'com',
'son' => 'com',
'vom' => 'com',
'von' => 'com',
'xom' => 'com',
'xon' => 'com',

// Other misspellings, including 'com' typed twice without a dot.
'clam' => 'com',
'colm' => 'com',
'comcom' => 'com',
];

/**
* @var array<string, string>
*/
Expand Down Expand Up @@ -271,10 +322,16 @@ private static function cleanDomain(string $domain): string
}

$domain = preg_replace('/(?:\.com){2,}$/', '.com', $domain);
$domain = preg_replace('/\.com[^.]+$/', '.com', $domain);
$domain = preg_replace('/(?:\.(?:com|c[a-z]{1,2}m|co[ln]|[dsvx]o[mn]|))$/', '.com', $domain);
$domain = preg_replace('/^\d+(?:gmail?\.com)$/', 'gmail.com', $domain);

$idx = strrpos($domain, '.');
if ($idx !== false) {
$tld = substr($domain, $idx + 1);
if (isset(self::$typoTLDs[$tld])) {
$domain = substr($domain, 0, $idx) . '.' . self::$typoTLDs[$tld];
}
}

if (isset(self::$typoDomains[$domain])) {
$domain = self::$typoDomains[$domain];
}
Expand Down
20 changes: 15 additions & 5 deletions tests/MaxMind/Test/MinFraud/UtilTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -223,18 +223,18 @@ public function testMaybeHashEmail(): void
'input' => ['email' => ['address' => 'foo@example.comfoo']],
'expected' => [
'email' => [
'address' => md5('foo@example.com'),
'domain' => 'example.com',
'address' => md5('foo@example.comfoo'),
'domain' => 'example.comfoo',
],
],
],
[
'name' => '.com typo',
'name' => '.cam',
'input' => ['email' => ['address' => 'foo@example.cam']],
'expected' => [
'email' => [
'address' => md5('foo@example.com'),
'domain' => 'example.com',
'address' => md5('foo@example.cam'),
'domain' => 'example.cam',
],
],
],
Expand All @@ -258,6 +258,16 @@ public function testMaybeHashEmail(): void
],
],
],
[
'name' => 'TLD typo',
'input' => ['email' => ['address' => 'foo@example.comcom']],
'expected' => [
'email' => [
'address' => md5('foo@example.com'),
'domain' => 'example.com',
],
],
],
];

if (\function_exists('idn_to_ascii')
Expand Down

0 comments on commit 5b0d822

Please sign in to comment.