应用层发送一个数据包的时候,是如何到达网卡的(上)
发布日期:2021-05-09 16:03:12 浏览次数:20 分类:精选文章

本文共 12121 字,大约阅读时间需要 40 分钟。

应用层写入的数据首先由 TCP 层处理,对应的函数是 tcp_write:

/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 *
 *	sk       - socket being written to
 *	from     - user-space buffer holding the data
 *	len      - number of bytes the caller wants to write
 *	nonblock - when non-zero the routine returns instead of sleeping
 *	flags    - MSG_OOB and MSG_DONTROUTE are the bits consulted here
 *
 *	Returns the number of bytes copied so far if that is non-zero;
 *	otherwise a negative error (-EPIPE, -EAGAIN, -ERESTARTSYS, -sk->err,
 *	or the negative value handed back by a header builder).
 */
static int tcp_write(struct sock *sk, unsigned char *from,
	  int len, int nonblock, unsigned flags)
{
	int copied = 0;			/* bytes taken from the user buffer so far */
	int copy;			/* bytes to take in the current iteration */
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;	/* non-NULL when the new skb is a "small" packet */
	unsigned char *buff;
	struct proto *prot;
	struct device *dev = NULL;

	sk->inuse=1;
	prot = sk->prot;
	while(len > 0)
	{
		if (sk->err)
		{
			/* Stop on an error; the pending error is consumed only if nothing was copied */
			release_sock(sk);
			if (copied)
				return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		/*
		 *	First thing we do is make sure that we are established.
		 */
		/* The send side has been shut down: the socket can no longer be written. */
		if (sk->shutdown & SEND_SHUTDOWN)
		{
			release_sock(sk);
			sk->err = EPIPE;
			if (copied)
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

		/*
		 *	Wait for a connection to finish.
		 */
		/*
		 * Only ESTABLISHED and CLOSE_WAIT are treated as writable states.
		 * (CLOSE_WAIT: presumably the peer has closed its sending side,
		 * so we may still write but will not receive more data.)
		 */
		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
		{
			if (sk->err)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}
			/*
			 * Only SYN_SENT / SYN_RECV are worth waiting in (the handshake
			 * is still in progress); any other state is a failure.
			 */
			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				if (sk->err)
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}
				/* SIGPIPE is raised only when sk->keepopen is set */
				if (sk->keepopen)
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}
			/* Never sleep for a non-blocking caller or once some data has been copied */
			if (nonblock || copied)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}
			release_sock(sk);
			/*
			 * Re-check the state between cli() and sti() before sleeping
			 * (presumably so a state change cannot slip in between the
			 * test and the sleep -- confirm against cli/sti semantics).
			 */
			cli();
			if (sk->state != TCP_ESTABLISHED &&
				sk->state != TCP_CLOSE_WAIT && sk->err == 0)
			{
				interruptible_sleep_on(sk->sleep);
				/* Woken with an unblocked signal pending: give up */
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
		}

	/*
	 * The following code can result in copy <= 0 if sk->mss is ever
	 * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
	 * sk->mtu is constant once SYN processing is finished.  I.e. we
	 * had better not get here until we've seen his SYN and at least one
	 * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
	 * But ESTABLISHED should guarantee that.  sk->max_window is by definition
	 * non-decreasing.  Note that any ioctl to set user_mss must be done
	 * before the exchange of SYN's.  If the initial ack from the other
	 * end has a window of 0, max_window and thus mss will both be 0.
	 */

	/*
	 *	Now we need to check if we have a half built packet.
	 */
		/*
		 * If a partially filled skb was parked earlier, take it back; it is
		 * either sent below or put back on the partial queue.
		 */
		if ((skb = tcp_dequeue_partial(sk)) != NULL)
		{
			int hdrlen;

			/*
			 * IP header + TCP header: bytes from skb->data up to and
			 * including a fixed-size struct tcphdr.
			 */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
				 + sizeof(struct tcphdr);

			/* Add more stuff to the end of skb->len */
			/*
			 * Ordinary data is appended to the parked packet.  For MSG_OOB
			 * nothing is appended: the parked packet is sent as it is and
			 * the urgent data is handled on the next loop iteration.
			 */
			if (!(flags & MSG_OOB))
			{
				/* room left up to mss, capped by what the caller still has */
				copy = min(sk->mss - (skb->len - hdrlen), len);
				/* FIXME: this is really a bug. */
				if (copy <= 0)
				{
					printk("TCP: **bug**: \"copy\" <= 0!!\n");
					copy = 0;
				}
				/* append copy bytes of user data after the bytes already in the skb */
				memcpy_fromfs(skb->data + skb->len, from, copy);
				/* account for the new bytes in the skb */
				skb->len += copy;
				/* advance the user pointer */
				from += copy;
				/* total copied so far */
				copied += copy;
				/* bytes still to be written */
				len -= copy;
				/* advance the sequence number of the next byte to be written */
				sk->write_seq += copy;
			}
			/*
			 * Send now if the payload has reached mss, the call is MSG_OOB,
			 * or no packets are outstanding; otherwise park it again.
			 */
			if ((skb->len - hdrlen) >= sk->mss ||
				(flags & MSG_OOB) || !sk->packets_out)
				tcp_send_skb(sk, skb);
			else
				/* keep buffering until one of the conditions above holds */
				tcp_enqueue_partial(skb, sk);
			continue;
		}

	/*
	 * We also need to worry about the window.
	 * If window < 1/2 the maximum window we've seen from this
	 *   host, don't use it.  This is sender side
	 *   silly window prevention, as specified in RFC1122.
	 *   (Note that this is different than earlier versions of
	 *   SWS prevention, e.g. RFC813.).  What we actually do is
	 *   use the whole MSS.  Since this results in the right
	 *   edge of the packet being outside the window, it will
	 *   be queued for later rather than sent.
	 */
		/* window_seq - write_seq: sequence space still available for new data */
		copy = sk->window_seq - sk->write_seq;
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
			copy = sk->mss;
		/* never take more than the caller still has */
		if (copy > len)
			copy = len;

	/*
	 *	We should really check the window here also.
	 */

		send_tmp = NULL;
		/*
		 * A non-urgent segment smaller than mss is a candidate for the
		 * partial queue (remembered in send_tmp); anything else is sent
		 * directly.
		 */
		if (copy < sk->mss && !(flags & MSG_OOB))
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			/*
			 *	NB: following must be mtu, because mss can be increased.
			 *	mss is always <= mtu
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
			sk->inuse = 1;
			send_tmp = skb;
		}
		else
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
			sk->inuse = 1;
		}

		/*
		 *	If we didn't get any memory, we need to sleep.
		 */
		/* No write space was available */
		if (skb == NULL)
		{
			sk->socket->flags |= SO_NOSPACE;
			/* Non-blocking caller: report what was copied, or -EAGAIN */
			if (nonblock)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/*
			 *	FIXME: here is another race condition.
			 */
			tmp = sk->wmem_alloc;
			/*
			 * release_sock() is not shown here; presumably it processes
			 * queued incoming packets, so an ACK handled there could free
			 * write memory -- verify against release_sock().
			 */
			release_sock(sk);
			cli();
			/*
			 *	Again we will try to avoid it.
			 */
			/*
			 * Sleep only if wmem_alloc has not dropped since the snapshot,
			 * the socket is still in a writable state and no error is set.
			 */
			if (tmp <= sk->wmem_alloc &&
				  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				&& sk->err == 0)
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;
		}

		skb->len = 0;
		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

		buff = skb->data;

		/*
		 * FIXME: we need to optimize this.
		 * Perhaps some hints here would be good.
		 */
		/*
		 * Build the lower-layer headers (IP and, presumably, the link-layer
		 * header).  The return value is used below as the number of header
		 * bytes written; a negative value is an error.
		 */
		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0 )
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}
		/* account for the header bytes just written */
		skb->len += tmp;
		skb->dev = dev;
		/* step past those headers to where the TCP header goes */
		buff += tmp;
		/* remember where the TCP header lives inside the skb */
		skb->h.th =(struct tcphdr *) buff;
		/*
		 * Build the TCP header.  len-copy is the number of bytes that will
		 * remain after this segment (presumably used to decide the PSH
		 * flag -- confirm in tcp_build_header).
		 */
		tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
		if (tmp < 0)
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}

		/* Out-of-band (urgent) data */
		if (flags & MSG_OOB)
		{
			/*
			 * Set the URG flag; the urgent pointer is set to copy, the
			 * payload length of this segment.
			 * NOTE(review): ntohs() is used for a host-to-network
			 * conversion here; htons() would be the conventional spelling.
			 */
			((struct tcphdr *)buff)->urg = 1;
			((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
		}
		/* account for the TCP header */
		skb->len += tmp;
		/* copy the payload from user space right after the TCP header */
		memcpy_fromfs(buff+tmp, from, copy);

		/* advance the user pointer */
		from += copy;
		/* total copied so far */
		copied += copy;
		/* bytes still to be written */
		len -= copy;
		/* account for the payload in the skb */
		skb->len += copy;
		skb->free = 0;
		/* advance the sequence number of the next byte to be written */
		sk->write_seq += copy;

		/*
		 * A small, non-urgent segment is parked on the partial queue while
		 * packets are still outstanding (Nagle's rule).
		 */
		if (send_tmp != NULL && sk->packets_out)
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		/* Otherwise hand it to the send path right away */
		tcp_send_skb(sk, skb);
	}
	sk->err = 0;

/*
 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
 *	interactive fast network servers. It's meant to be on and
 *	it really improves the throughput though not the echo time
 *	on my slow slip link - Alan
 */

/*
 *	Avoid possible race on send_tmp - c/o Johannes Stille
 */

	/*
	 * Flush a parked partial packet if nothing is outstanding, or if
	 * sk->nonagle is set and write_seq is still before window_seq.
	 */
	if(sk->partial && ((!sk->packets_out)
     /* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}

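接下来是 IP 层的发送函数 ip_queue_xmit:它填写 IP 头的总长度、id 和校验和,必要时进行分片;对于需要保留的数据包(free 为 0),还会把它挂到 socket 的已发送未确认队列中缓存起来,最后调用 dev_queue_xmit 交给下一层发送。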

/*
 * Queues a packet to be sent, and starts the transmitter
 * if necessary.  if free = 1 then we free the block after
 * transmit, otherwise we don't. If free==2 we not only
 * free the block but also don't assign a new ip seq number.
 * This routine also needs to put in the total length,
 * and compute the checksum
 *
 * sk   - owning socket, may be NULL (the buffer is then always freed)
 * dev  - output device; must not be NULL
 * skb  - buffer whose data starts with the hardware header, followed
 *        by the IP header
 * free - 0, 1 or 2 as described above
 */
void ip_queue_xmit(struct sock *sk, struct device *dev,
	      struct sk_buff *skb, int free)
{
	struct iphdr *iph;
	unsigned char *ptr;

	/* Sanity check */
	if (dev == NULL)
	{
		printk("IP: ip_queue_xmit dev = NULL\n");
		return;
	}

	IS_SKB(skb);

	/*
	 *	Do some book-keeping in the packet for later
	 */

	skb->dev = dev;
	/* time stamp of this transmission */
	skb->when = jiffies;

	/*
	 *	Find the IP header and set the length. This is bad
	 *	but once we get the skb data handling code in the
	 *	hardware will push its header sensibly and we will
	 *	set skb->ip_hdr to avoid this mess and the fixed
	 *	header length problem
	 */

	ptr = skb->data;
	ptr += dev->hard_header_len;
	iph = (struct iphdr *)ptr;
	skb->ip_hdr = iph;
	/*
	 * Total length = IP header + payload (everything after the hardware
	 * header).
	 * NOTE(review): ntohs() is used for a host-to-network conversion;
	 * htons() would be the conventional spelling.
	 */
	iph->tot_len = ntohs(skb->len-dev->hard_header_len);

#ifdef CONFIG_IP_FIREWALL
	/*
	 * NOTE(review): this path returns without freeing or queueing skb --
	 * confirm who owns the buffer when the firewall rejects the packet.
	 */
	if(ip_fw_chk(iph, dev, ip_fw_blk_chain, ip_fw_blk_policy, 0) != 1)
		/* just don't send this packet */
		return;
#endif

	/*
	 *	No reassigning numbers to fragments...
	 */

	/* Assign a fresh IP id unless free==2, which is then treated as free=1 */
	if(free!=2)
		iph->id      = htons(ip_id_count++);
	else
		free=1;

	/* All buffers without an owner socket get freed */
	if (sk == NULL)
		free = 1;

	skb->free = free;

	/*
	 *	Do we need to fragment. Again this is inefficient.
	 *	We need to somehow lock the original buffer and use
	 *	bits of it.
	 */

	/* Packet is larger than the device MTU (plus hardware header): fragment it */
	if(skb->len > dev->mtu + dev->hard_header_len)
	{
		ip_fragment(sk,skb,dev,0);
		IS_SKB(skb);
		kfree_skb(skb,FREE_WRITE);
		return;
	}

	/*
	 *	Add an IP checksum
	 */

	ip_send_check(iph);

	/*
	 *	Print the frame when debugging
	 */

	/*
	 *	More debugging. You cannot queue a packet already on a list
	 *	Spot this and moan loudly.
	 */
	if (skb->next != NULL)
	{
		printk("ip_queue_xmit: next != NULL\n");
		skb_unlink(skb);
	}

	/*
	 *	If a sender wishes the packet to remain unfreed
	 *	we add it to his send queue. This arguably belongs
	 *	in the TCP level since nobody else uses it. BUT
	 *	remember IPng might change all the rules.
	 */

	/* free == 0: the packet must be kept after transmission */
	if (!free)
	{
		unsigned long flags;

		/* The socket now has more outstanding blocks */
		/* (count of packets sent but not yet acknowledged) */
		sk->packets_out++;

		/* Protect the list for a moment */
		save_flags(flags);
		cli();

		if (skb->link3 != NULL)
		{
			printk("ip.c: link3 != NULL\n");
			skb->link3 = NULL;
		}
		/*
		 * Append to the socket's send_head/send_tail list, chained through
		 * link3 (presumably the list walked for retransmission on timeout).
		 */
		if (sk->send_head == NULL)
		{
			sk->send_tail = skb;
			sk->send_head = skb;
		}
		else
		{
			sk->send_tail->link3 = skb;
			sk->send_tail = skb;
		}
		/* skb->link3 is NULL */

		/* Interrupt restore */
		restore_flags(flags);
	}
	else
		/* Remember who owns the buffer */
		skb->sk = sk;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */

	ip_statistics.IpOutRequests++;
#ifdef CONFIG_IP_ACCT
	ip_acct_cnt(iph,dev, ip_acct_chain);
#endif

#ifdef CONFIG_IP_MULTICAST

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (MULTICAST(iph->daddr) && !(dev->flags&IFF_LOOPBACK))
	{
		if(sk==NULL || sk->ip_mc_loop)
		{
			if(iph->daddr==IGMP_ALL_HOSTS)
				ip_loopback(dev,skb);
			else
			{
				/* Loop back only if the device has this group in its multicast list */
				struct ip_mc_list *imc=dev->ip_mc_list;
				while(imc!=NULL)
				{
					if(imc->multiaddr==iph->daddr)
					{
						ip_loopback(dev,skb);
						break;
					}
					imc=imc->next;
				}
			}
		}
		/* Multicasts with ttl 0 must not go beyond the host */

		if(skb->ip_hdr->ttl==0)
		{
			kfree_skb(skb, FREE_READ);
			return;
		}
	}
#endif
	/* Broadcasts to the device's own broadcast address are looped back as well */
	if((dev->flags&IFF_BROADCAST) && iph->daddr==dev->pa_brdaddr && !(dev->flags&IFF_LOOPBACK))
		ip_loopback(dev,skb);

	if (dev->flags & IFF_UP)
	{
		/*
		 *	If we have an owner use its priority setting,
		 *	otherwise use NORMAL
		 */

		if (sk != NULL)
		{
			/* hand the packet to the device layer for transmission */
			dev_queue_xmit(skb, dev, sk->priority);
		}
		else
		{
			dev_queue_xmit(skb, dev, SOPRI_NORMAL);
		}
	}
	else
	{
		/* Interface is down: count the discard and free the buffer if we own it */
		ip_statistics.IpOutDiscards++;
		if (free)
			kfree_skb(skb, FREE_WRITE);
	}
}
上一篇:RAW协议源码解析
下一篇:tcp四次挥手源码解析(服务器角度)

发表评论

最新留言

能坚持,总会有不一样的收获!
[***.219.124.196]2025年04月16日 15时56分25秒