83
50

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

More than 3 years have passed since last update.

フリーメールドメイン一覧

Last updated at Posted at 2020-01-10

背景

会社で**「フリーメールからの登録をブロックしたい!」**という要望があり,自前でフリーメールドメイン一覧を集めることにした。本当は電話番号認証とかでやりたいんだけど,ひとまずの妥協案として。

「free mail domain list」とかでググると,フリーメールドメインをまとめた Gist(ID: 5992856)がヒットする。が,記事執筆時点で3782件と意外と少ない。ところがフォークはたくさんされている。…であれば,フォークを再帰的に辿ってマージすればいいのでは?

と考えて, Laravel でコマンドを作ってみた。

成果物

実装

blacklisted_free_mail_domains というテーブル,およびそれに対応するモデルを作っている。

<?php

namespace App\Console\Commands;

use App\BlacklistedFreeMailDomain;
use Carbon\Carbon;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
use function GuzzleHttp\Promise\all;
use function GuzzleHttp\Promise\coroutine;
use GuzzleHttp\Promise\PromiseInterface;
use GuzzleHttp\Psr7\Response;
use Illuminate\Console\Command;
use Illuminate\Support\Str;
use stdClass;
use Throwable;

/**
 * Artisan command that crawls a root GitHub Gist and, recursively, all of its
 * forks, collects the domain names listed in a well-known file of each Gist,
 * and stores them through the BlacklistedFreeMailDomain model.
 *
 * All HTTP work is expressed as Guzzle promise coroutines; inside a coroutine
 * the value of the final `yield` is what the returned promise resolves with
 * (getJsonAsync() relies on this when it consumes getBodyAsync()).
 */
class BlacklistedFreeMailDomainCrawlCommand extends Command
{
    public const ROOT_GIST_URL = 'https://api.github.com/gists/5992856';
    public const GIST_FILENAME = 'free_email_provider_domains.txt';

    /**
     * Page size requested when listing the forks of a Gist.
     *
     * The GitHub REST API paginates list endpoints (30 items per page by
     * default, 100 at most), so the largest page size is requested to keep
     * the number of round trips low.
     */
    public const FORKS_PER_PAGE = 100;

    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'blacklisted-free-mail-domain:crawl {--token= : アクセス制限緩和のためのトークン}
                                                               {--bulk : バルクインサート有効}
                                                               {--remove-expired : 期限切れレコードの削除}';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'フリーメールブラックリストをクローリング';

    /**
     * HTTP client shared by every request of one run.
     *
     * @var Client
     */
    protected $client;

    /**
     * Collected domains, used as a set: the domain name is the KEY
     * (values are meaningless leftovers of array_flip()).
     *
     * @var array
     */
    protected $domains = [];

    /**
     * Entry point: crawl, sort, persist and optionally prune stale records.
     */
    public function handle(): void
    {
        $token = (string)$this->option('token');

        // An Authorization header is only sent when a token was supplied.
        $this->client = new Client([
            'headers' => $token ? ['Authorization' => "token $token"] : [],
        ]);

        // Blocks until the whole recursive crawl has settled.
        $this->crawlGistAsync(static::ROOT_GIST_URL)->wait();

        ksort($this->domains);

        $this->option('bulk')
            ? $this->bulkInsert()
            : $this->iterativeInsert();

        if ($this->option('remove-expired')) {
            $this->deleteExpiredRecords();
        }
    }

    /**
     * Insert one domain at a time (does not needlessly consume
     * auto-increment values).
     */
    protected function iterativeInsert(): void
    {
        $now = Carbon::now();
        $memo = "{$now->format('Y-m-d')} 自動登録";

        $count = 0;
        foreach ($this->domains as $domain => $_) {
            // Look the record up by domain ONLY. The memo carries today's
            // date, so using it as part of the lookup key would miss rows
            // registered on an earlier day and insert the domain again.
            $record = BlacklistedFreeMailDomain::firstOrCreate(
                compact('domain'),
                compact('memo')
            );
            if ($record->wasRecentlyCreated) {
                ++$count;
                $this->info("Inserted: $record->domain");
            }
        }

        $this->info(sprintf(
            '%d %s inserted.',
            $count,
            Str::plural('domain', $count)
        ));
    }

    /**
     * Insert with bulk "insert or ignore" statements (fast).
     */
    protected function bulkInsert(): void
    {
        $created_at = $updated_at = Carbon::now();
        $memo = "{$created_at->format('Y-m-d')} 自動登録";

        // Keys are preserved because the domain names live in the keys.
        foreach (array_chunk($this->domains, 1000, true) as $domains) {
            $values = [];
            foreach ($domains as $domain => $_) {
                $values[] = compact('domain', 'memo', 'created_at', 'updated_at');
            }
            $affected = BlacklistedFreeMailDomain::insertOrIgnore($values);

            $this->info(sprintf(
                '%d %s inserted. (new:%d, duplicated:%d)',
                count($values),
                Str::plural('record', count($values)),
                $affected,
                count($values) - $affected
            ));
        }
    }

    /**
     * Delete records whose domain does not resolve via DNS.
     *
     * NOTE(review): gethostbynamel() only looks for IPv4 addresses of the
     * name itself, so a mail domain that publishes MX records but no A
     * record, or a transient resolver failure, also leads to deletion.
     * Confirm that this is acceptable before relying on --remove-expired.
     */
    protected function deleteExpiredRecords(): void
    {
        BlacklistedFreeMailDomain::eachById(function (BlacklistedFreeMailDomain $record) {
            $e = null;
            try {
                if (gethostbynamel($record->domain)) {
                    // Resolvable: keep the record.
                    return;
                }
            } catch (Throwable $e) {
                // Deliberately fall through: a lookup that throws is treated
                // like a failed lookup and reported below.
            }
            $this->warn(sprintf(
                'The domain "%s" is invalid: %s',
                $record->domain,
                $e ? $e->getMessage() : 'FALSE returned'
            ));

            $record->delete();
            $this->info("Deleted: $record->domain");
        });
    }

    /**
     * Crawl a single Gist, given its API URL: its own content and,
     * recursively, all of its forks.
     *
     * HTTP failures and unusable response bodies are reported and swallowed
     * so that one broken Gist does not abort the rest of the crawl.
     *
     * @param string $gistUrl
     * @return PromiseInterface
     */
    protected function crawlGistAsync(string $gistUrl): PromiseInterface
    {
        return coroutine(function () use ($gistUrl) {
            try {
                $this->comment("Crawling $gistUrl...");
                $gist = yield $this->getJsonAsync($gistUrl);

                // json_decode() yields null (or a non-object) for a body that
                // is not a JSON object; passing that on would raise a
                // TypeError that the catch below does not handle.
                if (!$gist instanceof stdClass || !isset($gist->url, $gist->forks_url)) {
                    $this->error(sprintf(
                        '%s("%s") failed: %s',
                        __FUNCTION__,
                        addcslashes($gistUrl, '"'),
                        'unexpected response body'
                    ));
                    return;
                }

                yield all([
                    $this->crawlRawContentAsync($gist),
                    $this->crawlForksAsync($gist),
                ]);
            } catch (GuzzleException $e) {
                $this->error(sprintf(
                    '%s("%s") failed: %s',
                    __FUNCTION__,
                    addcslashes($gistUrl, '"'),
                    $e->getMessage()
                ));
            }
        });
    }

    /**
     * Download the raw content of the Gist's domain list file (if the Gist
     * has a file named GIST_FILENAME) and collect the domains found in it.
     *
     * @param stdClass $gist
     * @return PromiseInterface
     */
    protected function crawlRawContentAsync(stdClass $gist): PromiseInterface
    {
        return coroutine(function () use ($gist) {
            try {
                // Gists without the expected file are silently skipped.
                if ($contentUrl = $gist->files->{static::GIST_FILENAME}->raw_url ?? null) {
                    $this->comment("Crawling raw content of $gist->url...");
                    $this->collectDomains(yield $this->getBodyAsync($contentUrl));
                }
            } catch (GuzzleException $e) {
                $this->error(sprintf(
                    '%s({"url":"%s"}) failed: %s',
                    __FUNCTION__,
                    addcslashes($gist->url, '"'),
                    $e->getMessage()
                ));
            }
        });
    }

    /**
     * Recursively crawl every fork of the given Gist.
     *
     * The fork listing is fetched page by page until a short or empty page
     * is returned, so Gists with more forks than fit on one page are fully
     * covered. If a page request fails, the failure is reported and the
     * forks gathered from earlier pages are still crawled.
     *
     * @param stdClass $gist
     * @return PromiseInterface
     */
    protected function crawlForksAsync(stdClass $gist): PromiseInterface
    {
        return coroutine(function () use ($gist) {
            $this->comment("Crawling forks of $gist->url...");

            $forks = [];
            try {
                for ($page = 1; ; ++$page) {
                    $query = http_build_query([
                        'per_page' => static::FORKS_PER_PAGE,
                        'page' => $page,
                    ]);
                    $batch = yield $this->getJsonAsync("$gist->forks_url?$query");

                    if (!is_array($batch) || !$batch) {
                        break;
                    }
                    foreach ($batch as $fork) {
                        $forks[] = $fork;
                    }
                    // A page that is not full is the last one.
                    if (count($batch) < static::FORKS_PER_PAGE) {
                        break;
                    }
                }
            } catch (GuzzleException $e) {
                $this->error(sprintf(
                    '%s({"url":"%s"}) failed: %s',
                    __FUNCTION__,
                    addcslashes($gist->url, '"'),
                    $e->getMessage()
                ));
            }

            if (!$forks) {
                yield;
                return;
            }

            $this->info(sprintf(
                '%d %s found.',
                count($forks),
                Str::plural('fork', count($forks))
            ));

            $requests = [];
            foreach ($forks as $fork) {
                // Skip entries that do not look like a Gist summary.
                if (isset($fork->url)) {
                    $requests[] = $this->crawlGistAsync($fork->url);
                }
            }
            yield all($requests);
        });
    }

    /**
     * GET the URL and resolve with the response body as a string.
     *
     * @param string $url
     * @return PromiseInterface
     */
    protected function getBodyAsync(string $url): PromiseInterface
    {
        return coroutine(function () use ($url) {
            /** @var Response $response */
            $response = yield $this->client->getAsync($url);
            yield (string)$response->getBody();
        });
    }

    /**
     * GET the URL and resolve with the JSON-decoded body
     * (objects are decoded as stdClass; null when the body is not valid JSON).
     *
     * @param string $url
     * @return PromiseInterface
     */
    protected function getJsonAsync(string $url): PromiseInterface
    {
        return coroutine(function () use ($url) {
            yield json_decode((string)yield $this->getBodyAsync($url));
        });
    }

    /**
     * Split the content into lines, trim them, drop empty ones and add the
     * ones not seen before to the collected domain set.
     *
     * @param string $content
     */
    protected function collectDomains(string $content): void
    {
        $lines = array_filter(array_map('trim', explode("\n", $content)));

        // Flip so that the domain becomes the key, then keep only new keys.
        $domains = array_diff_key(array_flip($lines), $this->domains);

        if (!$domains) {
            return;
        }

        $this->info(sprintf(
            '%d new %s found.',
            count($domains),
            Str::plural('domain', count($domains))
        ));

        $this->domains += $domains;
    }
}

実行の様子

83
50
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
83
50

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?