More than 5 years have passed since last update.

Linux kernel v2.6.20 network stackを読んでみる - IP受信編

Last updated at 2018-08-19 / Posted at 2018-08-19

はじめに

2018年8月、LinuxのTCP処理においてDoSに対する複数の脆弱性があると公開されました。

JVNVU#91510483 複数の TCP 実装にサービス運用妨害 (DoS) の脆弱性
 JVNVU#93630542 Linux の IP 実装におけるサービス運用妨害 (DoS) の脆弱性

これら脆弱性を理解するためにはLinux network stackを読まなきゃなー。ということで読んでみます。いきなりTCPはつらそうなので、IPから行きます。

(注)

ここで扱うLinux kernelのバージョンは2.6.20です。基本を押さえるために低いバージョンから読み始めます。(バージョンのチョイスは適当です)
https://elixir.bootlin.com/linux/v2.6.20/source
表示しているソースはIPの受信処理を理解しやすいよう必要最低限にしぼり、だいぶ省略しています。
自ホスト宛にIPパケットが届いた場合を想定しています。マルチキャストやIP forwardなどは考えません。

NICからIP処理の入り口まで

NICがパケットを受信し、IP層の処理にたどり着くまでを説明します。

処理の流れが図示してあります。下から上へ処理が流れます。
NICがパケットを受信し、CPUに割込み(Interrupt)がかかり、kernelに処理が移ります。
kernelはデバイスドライバを通してパケットを取得し、queueに積み(netif_rx)、HW Interruptのコンテキストは終了します。

次にソフト割込み(Soft-IRQ)がかかり(do_softirq) → net_rx_action とcallされます。

(この辺の話は、「はてなにおけるLinuxネットワークスタックパフォーマンス改善」も参考にしてください。わかりやすいです。)

・net_rx_action()
net_rx_action call graph

__skb_dequeue()でqueueからパケットを取り出し(と同時にqueueから削除)、
netif_receive_skb()でパケットの内容が解析され、タイプごとに処理を振り分けます(deliver_skb()→ packet_type->func())。
packet_type->funcにはIPパケットの場合は ip_rcv() が設定されています。
packet_type->func() を呼ぶ事により上位層(ここではIP層)の処理に移ります。

packet_type#func の型は以下です。
https://elixir.bootlin.com/linux/v2.6.20/source/include/linux/netdevice.h#L553

include/linux/netdevice.h#packet_type

/*
 * Receive handler stored in struct packet_type; called to hand a packet
 * to the upper layer. For IP packets this points to ip_rcv(), which has
 * exactly this signature.
 */
int (*func)(struct sk_buff *,
            struct net_device *,
            struct packet_type *,
            struct net_device *);

IP層

上述の通り、ip_rcv() がIP層のentry pointになっています。

IPのフォーマットです。IPヘッダは
struct iphdr
https://elixir.bootlin.com/linux/v2.6.20/source/include/linux/ip.h#L85

include/linux/ip.h

struct iphdr {
    __u8    version:4, // Version
            ihl:4;     // IHL (ヘッダ長)
    __u8.   tos;       // Codepoint/Type of service
    __be16  tot_len;   // Total length (パケット長)
    __be16  id;        // Fragment Identification
    __be16  frag_off;  // Flags(4bit), Fragment Offset(12bit)
    __u8    ttl;       // TTL
    __u8    protocol;  // Protocol
    __sum16 check;     // Header Checksum
    __be32  saddr;     // Source address
    __be32  daddr;     // Destination address
    /*Options(24bit), Padding(8bit) はここから */
};

と定義されています。

IPヘッダはパケットの情報が格納された sk_buff (socket_buffer) から以下のように取得できます。

struct iphdr *iph;

iph = skb->nh.iph;
// skb is declared as: struct sk_buff *skb;

IP層のcall graph

左側は受信処理、右側は送信処理です。
真ん中にあるRoutingは後で説明します。

ip_rcv()

net/ipv4/ip_input.c

/*
 * 	Main IP Receive routine.
 */ 
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	struct iphdr *iph;
	u32 len;

	/* When the interface is in promisc. mode, drop all the crap
	 * that it receives, do not try to analyse it.
	 */
	if (skb->pkt_type == PACKET_OTHERHOST)
		goto drop;

	iph = skb->nh.iph;

	/*
	 *	RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum.
	 *
	 *	Is the datagram acceptable?
	 *
	 *	1.	Length at least the size of an ip header
	 *	2.	Version of 4
	 *	3.	Checksums correctly. [Speed optimisation for later, skip loopback checksums]
	 *	4.	Doesn't have a bogus length
	 */
	// IPヘッダ長とバージョンのチェック
	// IHLはIPヘッダの長さ(実際のサイズは4を掛けてByteにする)を表し、最低20Byte(=5*4)となる。
	if (iph->ihl < 5 || iph->version != 4)
		goto inhdr_error;

	// 実際のパケットのデータでヘッダー長に矛盾がないか調べる
	if (!pskb_may_pull(skb, iph->ihl*4))
		goto inhdr_error;

	iph = skb->nh.iph;

	// checksum
	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
		goto inhdr_error;

	// 実際のパケットのデータでデータ長に矛盾がないか調べる
	len = ntohs(iph->tot_len);
	if (skb->len < len || len < (iph->ihl*4))
		goto inhdr_error;

	// Netfilter。hookしてユーザがパケットを調査したり加工したりできる仕組み。登録してなければ何もしない。
	// hookしたあと、ip_rcv_finish()が呼ばれる
	return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
		       ip_rcv_finish);

IP headerのチェックをします。チェック項目は
・ヘッダ長の最低値、IPv4か、チェックサム、パケット長(tot_len)が実際の受信データ長やヘッダ長と矛盾していないか
です。チェックが終われば ip_rcv_finish() を呼び出します。

ip_rcv_finish()

static inline int ip_rcv_finish(struct sk_buff *skb)
{
	struct iphdr *iph = skb->nh.iph;
	/*
	 *	Initialise the virtual path cache for the packet. It describes
	 *	how the packet travels inside Linux networking.
	 */ 
	if (skb->dst == NULL) {
		// skb->dstは　struct  dst_entry　型です。
		// ip_route_input()でパケットが自分宛てか、他ホストにフォワードするのか、マルチキャストかなど判定し、適切なTransport層の処理を行う関数を選択します。
		// 自ホスト宛のTCP/UDPの場合は
		// skb->dst->inputに ip_local_deliver()が設定されます。
		ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,　skb->dev);
	}
	// IPにオプションがある場合処理。
	if (iph->ihl > 5 && ip_rcv_options(skb))
		goto drop;

	return dst_input(skb);
}

ip_route_input()を呼び出し、パケットの経路を選択します。
経路というのは、自ホスト宛、他ホスト宛にフォワード、マルチキャストと３つあるようです。
経路ごとに上位層の処理を関数ポインタ(struct dst_entryのinput)で変更しています。それぞれ、ip_local_deliver(), ip_forward(), ip_mr_input()が関数の実体です。

IP headerにOptionがついていたら処理し、最後に dst_input() を呼び出します。

dst_input()

include/net/dst.h

/* Input packet from network to transport.  */
/*
 * Calls the input function attached to the packet's route (skb->dst->input)
 * and returns its result. The call is repeated for as long as it returns
 * NET_XMIT_BYPASS.
 */
static inline int dst_input(struct sk_buff *skb)
{
	int err;

	for (;;) {
		/*
		 * Run the input handler that ip_route_input() selected.
		 * For local TCP/UDP traffic this is ip_local_deliver().
		 */
		err = skb->dst->input(skb);

		if (likely(err == 0))
			return err;
		/* Oh, Jamal... Seems, I will not forgive you this mess. :-) */
		if (unlikely(err != NET_XMIT_BYPASS))
			return err;
	}
}

ソース中にコメントを書きました。

ip_local_deliver()

Transport層に処理を移す前に、まだIP層でやらなければいけないことがあります。パケットのデフラグです。
基礎から学ぶWindowsネットワーク：第10回　IPパケットの構造とIPフラグメンテーション (3/3) を参考にしてください。

/*
 * 	Deliver IP Packets to the higher protocol layers.
 *
 *	Fragments are passed to ip_defrag() first; the packet then goes
 *	through the NF_IP_LOCAL_IN netfilter hook on to
 *	ip_local_deliver_finish().
 */ 
int ip_local_deliver(struct sk_buff *skb)
{
	/*
	 * IP flags and offset as stored in frag_off:
	 *   IP_CE      0x8000   Flag: "Congestion"
	 *   IP_DF      0x4000   Flag: "Don't Fragment"
	 *   IP_MF      0x2000   Flag: "More Fragments"
	 *   IP_OFFSET  0x1FFF   "Fragment Offset" part
	 *
	 * The packet is a fragment when "More Fragments" is set or the
	 * fragment offset is non-zero.
	 */
	const int is_fragment =
		(skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) != 0;

	if (is_fragment) {
		/* Reassemble IP fragments. */
		skb = ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER);

		/*
		 * Nothing to deliver for now.
		 * NOTE(review): presumably the fragment was queued or
		 * discarded by ip_defrag() - confirm.
		 */
		if (skb == NULL)
			return 0;
	}

	/* Netfilter hook; ip_local_deliver_finish() runs afterwards. */
	return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL,
		       ip_local_deliver_finish);
}

デフラグして、最終処理のip_local_deliver_finish()を呼び出します。

ip_local_deliver_finish()

static inline int ip_local_deliver_finish(struct sk_buff *skb)
{
    int protocol = skb->nh.iph->protocol;
    int hash;
    struct net_protocol *ipprot;

  resubmit:
    hash = protocol & (MAX_INET_PROTOS - 1);

    // パケットのIPヘッダのプロトコル種別でテーブルを引いている.
    ipprot = rcu_dereference(inet_protos[hash]));
    
    // ここがTransport層の入り口.
    // TCPなら tcp_v4_rcv()
    // UDPなら udp_rcv()
    int ret = ipprot->handler(skb);
    if (ret < 0) {
        protocol = -ret;
        goto resubmit;
    }
    return 0;
}