Help us understand the problem. What is going on with this article?

フリーメールドメイン一覧

背景

会社で「フリーメールからの登録をブロックしたい!」という要望があり,自前でフリーメールドメイン一覧を集めることにした。本当は電話番号認証とかでやりたいんだけど,ひとまずの妥協案として。

「free mail domain list」とかでググると, ID 5992856 の Gist(free_email_provider_domains.txt)がヒットする。が,記事執筆時点で3782件と意外と少ない。ところがフォークはたくさんされている。…であれば,フォークを再帰的に辿ってマージすればいいのでは?

と考えて, Laravel でコマンドを作ってみた。

成果物

https://gist.github.com/mpyw/6b59ffbe517da9cccbf40db9aa30d09b

実装

blacklisted_free_mail_domains というテーブル,およびそれに対応するモデルを作っている。

<?php

namespace App\Console\Commands;

use App\BlacklistedFreeMailDomain;
use Carbon\Carbon;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
use function GuzzleHttp\Promise\all;
use function GuzzleHttp\Promise\coroutine;
use GuzzleHttp\Promise\PromiseInterface;
use GuzzleHttp\Psr7\Response;
use Illuminate\Console\Command;
use Illuminate\Support\Str;
use stdClass;
use Throwable;

/**
 * Console command that crawls a root Gist and, recursively, all of its forks,
 * merges every domain listed in their GIST_FILENAME file, and stores the
 * result through the BlacklistedFreeMailDomain model.
 */
class BlacklistedFreeMailDomainCrawlCommand extends Command
{
    public const ROOT_GIST_URL = 'https://api.github.com/gists/5992856';
    public const GIST_FILENAME = 'free_email_provider_domains.txt';

    /**
     * Page size requested from the forks endpoint. The endpoint is paginated,
     * so every page has to be fetched to see all forks.
     */
    public const FORKS_PER_PAGE = 100;

    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'blacklisted-free-mail-domain:crawl {--token= : アクセス制限緩和のためのトークン}
                                                               {--bulk : バルクインサート有効}
                                                               {--remove-expired : 期限切れレコードの削除}';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'フリーメールブラックリストをクローリング';

    /**
     * HTTP client shared by every request of one run; created in handle().
     *
     * @var Client
     */
    protected $client;

    /**
     * Collected domains, stored as array keys so that duplicates collapse.
     * The values carry no meaning.
     *
     * @var array
     */
    protected $domains = [];

    /**
     * Crawl the root Gist and its forks, then persist the collected domains
     * and optionally prune records whose domain no longer resolves.
     */
    public function handle(): void
    {
        $token = (string)$this->option('token');

        $this->client = new Client([
            'headers' => $token !== '' ? ['Authorization' => "token $token"] : [],
        ]);

        // Block until the whole recursive crawl has settled.
        $this->crawlGistAsync(static::ROOT_GIST_URL)->wait();

        ksort($this->domains);

        if ($this->option('bulk')) {
            $this->bulkInsert();
        } else {
            $this->iterativeInsert();
        }

        if ($this->option('remove-expired')) {
            $this->deleteExpiredRecords();
        }
    }

    /**
     * Insert the domains one by one (does not waste auto-increment values).
     */
    protected function iterativeInsert(): void
    {
        $now = Carbon::now();
        $memo = "{$now->format('Y-m-d')} 自動登録";

        $count = 0;
        foreach (array_keys($this->domains) as $domain) {
            // Look the record up by domain only. The memo contains today's
            // date, so using it as a lookup attribute would miss rows created
            // on an earlier day and try to insert the same domain again.
            $record = BlacklistedFreeMailDomain::firstOrCreate(
                ['domain' => (string)$domain],
                ['memo' => $memo]
            );
            if ($record->wasRecentlyCreated) {
                ++$count;
                $this->info("Inserted: $record->domain");
            }
        }

        $this->info(sprintf(
            '%d %s inserted.',
            $count,
            Str::plural('domain', $count)
        ));
    }

    /**
     * Insert the domains with bulk INSERT-or-ignore statements (fast).
     */
    protected function bulkInsert(): void
    {
        $created_at = $updated_at = Carbon::now();
        $memo = "{$created_at->format('Y-m-d')} 自動登録";

        // Keys are the domains, so they must be preserved while chunking.
        foreach (array_chunk($this->domains, 1000, true) as $domains) {
            $values = [];
            foreach (array_keys($domains) as $domain) {
                $domain = (string)$domain;
                $values[] = compact('domain', 'memo', 'created_at', 'updated_at');
            }
            $affected = BlacklistedFreeMailDomain::insertOrIgnore($values);

            $this->info(sprintf(
                '%d %s inserted. (new:%d, duplicated:%d)',
                count($values),
                Str::plural('record', count($values)),
                $affected,
                count($values) - $affected
            ));
        }
    }

    /**
     * Delete records whose domain no longer has a usable DNS answer.
     *
     * A domain is kept when it resolves to an address or publishes an MX
     * record; a mail domain may legitimately have MX records only.
     */
    protected function deleteExpiredRecords(): void
    {
        BlacklistedFreeMailDomain::eachById(function (BlacklistedFreeMailDomain $record) {
            $e = null;
            try {
                if (gethostbynamel($record->domain) || checkdnsrr($record->domain, 'MX')) {
                    return;
                }
            } catch (Throwable $e) {
                // Deliberately not rethrown: a failing lookup is reported
                // below and treated like a negative answer.
            }
            $this->warn(sprintf(
                'The domain "%s" is invalid: %s',
                $record->domain,
                $e ? $e->getMessage() : 'FALSE returned'
            ));

            $record->delete();
            $this->info("Deleted: $record->domain");
        });
    }

    /**
     * Crawl a single Gist, given its API URL: its raw content and its forks.
     *
     * Request failures are logged and do not reject the returned promise.
     *
     * @param string $gistUrl
     * @return PromiseInterface
     */
    protected function crawlGistAsync(string $gistUrl): PromiseInterface
    {
        // __FUNCTION__ evaluates to "{closure}" inside the closure below,
        // so the method name is captured here for the log messages.
        $method = __FUNCTION__;

        return coroutine(function () use ($gistUrl, $method) {
            try {
                $this->comment("Crawling $gistUrl...");
                $gist = yield $this->getJsonAsync($gistUrl);

                // An undecodable or unexpectedly shaped body would otherwise
                // raise an uncaught TypeError in the stdClass-typed methods.
                if (!$gist instanceof stdClass || !isset($gist->url, $gist->forks_url)) {
                    $this->error(sprintf(
                        '%s("%s") failed: %s',
                        $method,
                        addcslashes($gistUrl, '"'),
                        'unexpected response body'
                    ));
                    return;
                }

                yield all([
                    $this->crawlRawContentAsync($gist),
                    $this->crawlForksAsync($gist),
                ]);
            } catch (GuzzleException $e) {
                $this->error(sprintf(
                    '%s("%s") failed: %s',
                    $method,
                    addcslashes($gistUrl, '"'),
                    $e->getMessage()
                ));
            }
        });
    }

    /**
     * Download the GIST_FILENAME file of a Gist, if present, and collect the
     * domains it lists.
     *
     * @param stdClass $gist Decoded Gist API response.
     * @return PromiseInterface
     */
    protected function crawlRawContentAsync(stdClass $gist): PromiseInterface
    {
        $method = __FUNCTION__;

        return coroutine(function () use ($gist, $method) {
            try {
                $contentUrl = $gist->files->{static::GIST_FILENAME}->raw_url ?? null;
                if ($contentUrl) {
                    $this->comment("Crawling raw content of $gist->url...");
                    $this->collectDomains(yield $this->getBodyAsync($contentUrl));
                }
            } catch (GuzzleException $e) {
                $this->error(sprintf(
                    '%s({"url":"%s"}) failed: %s',
                    $method,
                    addcslashes($gist->url, '"'),
                    $e->getMessage()
                ));
            }
        });
    }

    /**
     * Recursively crawl every fork of a Gist.
     *
     * All pages of the forks listing are fetched before the forks themselves
     * are crawled concurrently.
     *
     * @param stdClass $gist Decoded Gist API response.
     * @return PromiseInterface
     */
    protected function crawlForksAsync(stdClass $gist): PromiseInterface
    {
        $method = __FUNCTION__;

        return coroutine(function () use ($gist, $method) {
            try {
                $this->comment("Crawling forks of $gist->url...");

                $forks = [];
                for ($page = 1; ; ++$page) {
                    $batch = yield $this->getJsonAsync(sprintf(
                        '%s?per_page=%d&page=%d',
                        $gist->forks_url,
                        static::FORKS_PER_PAGE,
                        $page
                    ));
                    if (!is_array($batch) || !$batch) {
                        break;
                    }
                    foreach ($batch as $fork) {
                        $forks[] = $fork;
                    }
                    // A short page is the last one.
                    if (count($batch) < static::FORKS_PER_PAGE) {
                        break;
                    }
                }

                if (!$forks) {
                    return;
                }

                $this->info(sprintf(
                    '%d %s found.',
                    count($forks),
                    Str::plural('fork', count($forks))
                ));

                $requests = [];
                foreach ($forks as $fork) {
                    if (isset($fork->url)) {
                        $requests[] = $this->crawlGistAsync($fork->url);
                    }
                }
                yield all($requests);
            } catch (GuzzleException $e) {
                $this->error(sprintf(
                    '%s({"url":"%s"}) failed: %s',
                    $method,
                    addcslashes($gist->url, '"'),
                    $e->getMessage()
                ));
            }
        });
    }

    /**
     * GET a URL and resolve with the response body as a string.
     *
     * @param string $url
     * @return PromiseInterface
     */
    protected function getBodyAsync(string $url): PromiseInterface
    {
        return coroutine(function () use ($url) {
            /** @var Response $response */
            $response = yield $this->client->getAsync($url);
            // The last yielded value becomes the coroutine's result.
            yield (string)$response->getBody();
        });
    }

    /**
     * GET a URL and resolve with its JSON-decoded body (null when the body
     * cannot be decoded).
     *
     * @param string $url
     * @return PromiseInterface
     */
    protected function getJsonAsync(string $url): PromiseInterface
    {
        return coroutine(function () use ($url) {
            yield json_decode((string)yield $this->getBodyAsync($url));
        });
    }

    /**
     * Merge the domains listed in $content (one per line) into $this->domains.
     *
     * Lines are trimmed and lower-cased, because domain names are
     * case-insensitive; blank lines are skipped.
     *
     * @param string $content
     */
    protected function collectDomains(string $content): void
    {
        $lines = array_map(static function (string $line): string {
            return strtolower(trim($line));
        }, explode("\n", $content));

        // Flip so the domains become keys, then keep only the unseen ones.
        $domains = array_diff_key(array_flip(array_filter($lines)), $this->domains);

        if (!$domains) {
            return;
        }

        $this->info(sprintf(
            '%d new %s found.',
            count($domains),
            Str::plural('domain', count($domains))
        ));

        $this->domains += $domains;
    }
}

実行の様子

Why not register and get more from Qiita?
  1. We will deliver articles that match you
    By following users and tags, you can catch up information on technical fields that you are interested in as a whole
  2. you can read useful information later efficiently
    By "stocking" the articles you like, you can search right away
Comments
Sign up for free and join this conversation.
If you already have a Qiita account
Why don't you register as a user and use Qiita more conveniently?
You need to log in to use this function. Qiita can be used more conveniently after logging in.
You seem to be reading articles frequently this month. Qiita can be used more conveniently after logging in.
  1. We will deliver articles that match you
    By following users and tags, you can catch up information on technical fields that you are interested in as a whole
  2. you can read useful information later efficiently
    By "stocking" the articles you like, you can search right away