
应用层发送一个数据包的时候,是如何到达网卡的(上)
发布日期:2021-05-09 16:03:12
浏览次数:20
分类:精选文章
本文共 12121 字,大约阅读时间需要 40 分钟。
数据包首先从tcp层进行处理,对应的函数是tcp_write
/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 *
 *	sk       - socket being written to
 *	from     - source buffer; it is read with memcpy_fromfs()
 *	len      - number of bytes the caller wants to send
 *	nonblock - non-zero: return instead of sleeping
 *	flags    - MSG_OOB and MSG_DONTROUTE are the bits examined here
 *
 *	Returns the number of bytes consumed so far when that is non-zero
 *	(the write may be short).  If nothing was consumed before a failure
 *	the result is negative: -sk->err, -EPIPE, -EAGAIN, -ERESTARTSYS, or
 *	the negative value handed back by a header-building call.
 */
static int tcp_write(struct sock *sk, unsigned char *from, int len, int nonblock, unsigned flags)
{
	int copied = 0;			/* bytes taken from the caller so far */
	int copy;			/* bytes to take in the current iteration */
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;	/* non-NULL when the new skb is a candidate for the partial queue */
	unsigned char *buff;
	struct proto *prot;
	struct device *dev = NULL;

	/*
	 *	Mark the socket as in use.  Every path below calls
	 *	release_sock() before it returns or sleeps, and sets
	 *	sk->inuse again before it carries on.
	 */
	sk->inuse=1;
	prot = sk->prot;
	while(len > 0)
	{
		if (sk->err)
		{	/* Stop on an error */
			release_sock(sk);
			if (copied)
				return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		/*
		 *	First thing we do is make sure that we are established.
		 */

		/*
		 *	The sending side has been shut down, so no more writes.
		 *	If data was already consumed we return that count and
		 *	sk->err stays EPIPE; otherwise it is cleared and -EPIPE
		 *	is returned directly.
		 */
		if (sk->shutdown & SEND_SHUTDOWN)
		{
			release_sock(sk);
			sk->err = EPIPE;
			if (copied)
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

		/*
		 *	Wait for a connection to finish.
		 */

		/*
		 *	Only ESTABLISHED and CLOSE_WAIT let us go on to send.
		 *	(Article author's note: in CLOSE_WAIT the peer has closed
		 *	its sending side, so we can still write but not read.)
		 */
		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
		{
			if (sk->err)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			/*
			 *	While the handshake is still in progress (SYN_SENT or
			 *	SYN_RECV) we fall through and wait below.  Any other
			 *	state is an error.
			 */
			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
			{
				release_sock(sk);
				if (copied)
					return(copied);

				if (sk->err)
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				/*
				 *	SIGPIPE is only raised when sk->keepopen is set.
				 *	NOTE(review): presumably the keepalive option
				 *	(the article calls it a "long-lived connection")
				 *	- confirm.
				 */
				if (sk->keepopen)
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}

			/* Never sleep for a non-blocking caller or after a partial write. */
			if (nonblock || copied)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/*
			 *	Release the socket, then re-check the state with
			 *	interrupts disabled before going to sleep.
			 */
			release_sock(sk);
			cli();

			if (sk->state != TCP_ESTABLISHED &&
				sk->state != TCP_CLOSE_WAIT && sk->err == 0)
			{
				interruptible_sleep_on(sk->sleep);
				/* A pending, unblocked signal ends the wait. */
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
		}

		/*
		 * The following code can result in copy <= 0 if sk->mss is ever
		 * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
		 * sk->mtu is constant once SYN processing is finished.  I.e. we
		 * had better not get here until we've seen his SYN and at least one
		 * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
		 * But ESTABLISHED should guarantee that.  sk->max_window is by definition
		 * non-decreasing.  Note that any ioctl to set user_mss must be done
		 * before the exchange of SYN's.  If the initial ack from the other
		 * end has a window of 0, max_window and thus mss will both be 0.
		 */

		/*
		 *	Now we need to check if we have a half built packet.
		 */

		/*
		 *	If a partially filled packet is queued, take it off the
		 *	partial queue first.  It is either sent now or put back
		 *	on the queue below.
		 */
		if ((skb = tcp_dequeue_partial(sk)) != NULL)
		{
			int hdrlen;

			 /* IP header + TCP header */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
				 + sizeof(struct tcphdr);

			/* Add more stuff to the end of skb->len */

			/*
			 *	Ordinary data is appended to the partial packet.
			 *	For MSG_OOB nothing is appended: the partial packet
			 *	is sent as it is, and the urgent data is handled on
			 *	the next pass of the loop.
			 */
			if (!(flags & MSG_OOB))
			{
				/* Room left = mss minus the payload already in the packet. */
				copy = min(sk->mss - (skb->len - hdrlen), len);
				/* FIXME: this is really a bug. */
				if (copy <= 0)
				{
					printk("TCP: **bug**: \"copy\" <= 0!!\n");
					copy = 0;
				}

				/* Append copy bytes from the caller's buffer to the packet data. */
				memcpy_fromfs(skb->data + skb->len, from, copy);
				/* Bytes of skb->data now in use. */
				skb->len += copy;
				/* Where the next copy starts. */
				from += copy;
				/* Total consumed so far. */
				copied += copy;
				/* Bytes still to be consumed. */
				len -= copy;
				/* Sequence number of the next byte to be written. */
				sk->write_seq += copy;
			}

			/*
			 *	Send now if the payload has reached mss, if this is
			 *	out-of-band data, or if no packets are outstanding.
			 */
			if ((skb->len - hdrlen) >= sk->mss ||
				(flags & MSG_OOB) || !sk->packets_out)
				tcp_send_skb(sk, skb);
			else
				/* Otherwise keep it on the partial queue for later. */
				tcp_enqueue_partial(skb, sk);
			continue;
		}

		/*
		 * We also need to worry about the window.
		 * If window < 1/2 the maximum window we've seen from this
		 *   host, don't use it.  This is sender side
		 *   silly window prevention, as specified in RFC1122.
		 *   (Note that this is different than earlier versions of
		 *   SWS prevention, e.g. RFC813.).  What we actually do is
		 *   use the whole MSS.  Since the results in the right
		 *   edge of the packet being outside the window, it will
		 *   be queued for later rather than sent.
		 */

		/*
		 *	Bytes between the next sequence number to write and
		 *	sk->window_seq (presumably the right edge of the send
		 *	window).
		 */
		copy = sk->window_seq - sk->write_seq;
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
			copy = sk->mss;
		/* Never take more than the caller still has to send. */
		if (copy > len)
			copy = len;

		/*
		 *	We should really check the window here also.
		 */

		send_tmp = NULL;
		/*
		 *	Less than a full mss of ordinary data: allocate a buffer
		 *	large enough to be topped up later and remember it in
		 *	send_tmp.  Otherwise allocate exactly what this segment
		 *	needs.
		 */
		if (copy < sk->mss && !(flags & MSG_OOB))
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			/*
			 *	NB: following must be mtu, because mss can be increased.
			 *	mss is always <= mtu
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
			sk->inuse = 1;
			send_tmp = skb;
		}
		else
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
			sk->inuse = 1;
		}

		/*
		 *	If we didn't get any memory, we need to sleep.
		 */

		/* No write space is available. */
		if (skb == NULL)
		{
			sk->socket->flags |= SO_NOSPACE;
			/* Non-blocking: report what has been consumed so far. */
			if (nonblock)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/*
			 *	FIXME: here is another race condition.
			 */

			tmp = sk->wmem_alloc;
			/*
			 *	NOTE(review): per the article, release_sock()
			 *	processes received packets, so an incoming ACK may
			 *	free write space here - confirm against
			 *	release_sock().
			 */
			release_sock(sk);
			cli();
			/*
			 *	Again we will try to avoid it.
			 */

			/*
			 *	Sleep only if wmem_alloc has not dropped below the
			 *	value sampled above, the state still allows writing,
			 *	and there is no error.
			 */
			if (tmp <= sk->wmem_alloc &&
				  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				&& sk->err == 0)
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;
		}

		skb->len = 0;
		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

		buff = skb->data;

		/*
		 * FIXME: we need to optimize this.
		 * Perhaps some hints here would be good.
		 */

		/*
		 *	Build the lower-layer headers (per the article: the MAC
		 *	and IP headers).  A non-negative result is the number of
		 *	header bytes written; it is added to skb->len and buff
		 *	below.
		 */
		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0 )
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}
		/* Count the header bytes as used. */
		skb->len += tmp;
		skb->dev = dev;
		/* buff now points at the place where the TCP header goes. */
		buff += tmp;
		/* Record where the TCP header lives inside skb->data. */
		skb->h.th =(struct tcphdr *) buff;
		/*
		 *	Build the TCP header.  len-copy is the number of bytes
		 *	that will still be unsent after this segment.
		 *	NOTE(review): per the article it is used to decide the
		 *	PSH flag - confirm in tcp_build_header().
		 */
		tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
		if (tmp < 0)
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}

		/* Out-of-band data. */
		if (flags & MSG_OOB)
		{
			/*
			 *	Set the URG flag, and set the urgent pointer to the
			 *	length of the payload of this segment.
			 *	NOTE(review): this is a host-to-network conversion,
			 *	so htons() would state the intent; both are expected
			 *	to perform the same 16-bit swap - confirm before
			 *	changing.
			 */
			((struct tcphdr *)buff)->urg = 1;
			((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
		}

		/* Count the TCP header bytes as used. */
		skb->len += tmp;
		/* Copy the payload from the caller's buffer to just after the TCP header. */
		memcpy_fromfs(buff+tmp, from, copy);

		/* Advance the source pointer. */
		from += copy;
		/* Total consumed so far. */
		copied += copy;
		/* Bytes still to be consumed. */
		len -= copy;
		/* Count the payload bytes as used. */
		skb->len += copy;
		skb->free = 0;
		/* Sequence number of the next byte to be written. */
		sk->write_seq += copy;

		/*
		 *	A short, ordinary segment is held on the partial queue
		 *	while packets are outstanding (Nagle's rule).
		 */
		if (send_tmp != NULL && sk->packets_out)
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		/* Otherwise send it straight away. */
		tcp_send_skb(sk, skb);
	}
	sk->err = 0;

/*
 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
 *	interactive fast network servers. It's meant to be on and
 *	it really improves the throughput though not the echo time
 *	on my slow slip link - Alan
 */

/*
 *	Avoid possible race on send_tmp - c/o Johannes Stille
 */

	/*
	 *	Flush a queued partial packet if nothing is outstanding, or
	 *	if sk->nonagle is set and write_seq is before window_seq.
	 */
	if(sk->partial && ((!sk->packets_out)
     /* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
ip层的发送函数是ip_queue_xmit,它在把数据包交给下层发送的同时,会把需要等待确认的数据包缓存到已发送但未确认的队列中,用于超时重传
/*
 * Queues a packet to be sent, and starts the transmitter
 * if necessary.  if free = 1 then we free the block after
 * transmit, otherwise we don't. If free==2 we not only
 * free the block but also don't assign a new ip seq number.
 * This routine also needs to put in the total length,
 * and compute the checksum
 *
 *	sk   - owning socket; may be NULL, in which case free is forced to 1
 *	dev  - output device; if NULL a message is logged and nothing is sent
 *	skb  - packet whose data begins with dev->hard_header_len bytes of
 *	       link header followed by the IP header
 *	free - 0: keep the buffer on the socket's send queue
 *	       1: buffer is freed after transmit
 *	       2: as 1, and the IP id already in the header is kept
 */
void ip_queue_xmit(struct sock *sk, struct device *dev,
	      struct sk_buff *skb, int free)
{
	struct iphdr *iph;
	unsigned char *ptr;

	/* Sanity check */
	if (dev == NULL)
	{
		printk("IP: ip_queue_xmit dev = NULL\n");
		return;
	}

	IS_SKB(skb);

	/*
	 *	Do some book-keeping in the packet for later
	 */

	skb->dev = dev;
	/* Time at which the packet is sent. */
	skb->when = jiffies;

	/*
	 *	Find the IP header and set the length. This is bad
	 *	but once we get the skb data handling code in the
	 *	hardware will push its header sensibly and we will
	 *	set skb->ip_hdr to avoid this mess and the fixed
	 *	header length problem
	 */

	ptr = skb->data;
	ptr += dev->hard_header_len;
	iph = (struct iphdr *)ptr;
	skb->ip_hdr = iph;
	/*
	 *	Total length = IP header plus payload, i.e. everything
	 *	after the link header.
	 *	NOTE(review): this is a host-to-network conversion, so
	 *	htons() would state the intent; both are expected to
	 *	perform the same 16-bit swap - confirm before changing.
	 */
	iph->tot_len = ntohs(skb->len-dev->hard_header_len);

#ifdef CONFIG_IP_FIREWALL
	/*
	 *	NOTE(review): the skb is not freed on this path; who owns
	 *	it after a firewall rejection is not visible from here.
	 */
	if(ip_fw_chk(iph, dev, ip_fw_blk_chain, ip_fw_blk_policy, 0) != 1)
		/* just don't send this packet */
		return;
#endif

	/*
	 *	No reassigning numbers to fragments...
	 */

	/*
	 *	The id is what the article says is used to reassemble
	 *	fragments.  free==2 keeps the existing id, and from here on
	 *	behaves like free==1.
	 */
	if(free!=2)
		iph->id      = htons(ip_id_count++);
	else
		free=1;

	/* All buffers without an owner socket get freed */
	if (sk == NULL)
		free = 1;

	skb->free = free;

	/*
	 *	Do we need to fragment. Again this is inefficient.
	 *	We need to somehow lock the original buffer and use
	 *	bits of it.
	 */

	/*
	 *	The frame is longer than the device MTU plus the link
	 *	header: hand it to ip_fragment() and free the original.
	 */
	if(skb->len > dev->mtu + dev->hard_header_len)
	{
		ip_fragment(sk,skb,dev,0);
		IS_SKB(skb);
		kfree_skb(skb,FREE_WRITE);
		return;
	}

	/*
	 *	Add an IP checksum
	 */

	ip_send_check(iph);

	/*
	 *	Print the frame when debugging
	 */

	/*
	 *	More debugging. You cannot queue a packet already on a list
	 *	Spot this and moan loudly.
	 */
	if (skb->next != NULL)
	{
		printk("ip_queue_xmit: next != NULL\n");
		skb_unlink(skb);
	}

	/*
	 *	If a sender wishes the packet to remain unfreed
	 *	we add it to his send queue. This arguably belongs
	 *	in the TCP level since nobody else uses it. BUT
	 *	remember IPng might change all the rules.
	 */

	/*
	 *	free == 0 means the packet must be kept.  sk cannot be NULL
	 *	in this branch, because sk == NULL forced free to 1 above.
	 */
	if (!free)
	{
		unsigned long flags;
		/* The socket now has more outstanding blocks */

		/* One more packet sent but not yet acknowledged. */
		sk->packets_out++;

		/* Protect the list for a moment */
		save_flags(flags);
		cli();

		if (skb->link3 != NULL)
		{
			printk("ip.c: link3 != NULL\n");
			skb->link3 = NULL;
		}
		/*
		 *	Append to the socket's send_head/send_tail list,
		 *	which is linked through link3.  Per the article this
		 *	is the sent-but-unacknowledged queue used for timeout
		 *	retransmission.
		 */
		if (sk->send_head == NULL)
		{
			sk->send_tail = skb;
			sk->send_head = skb;
		}
		else
		{
			sk->send_tail->link3 = skb;
			sk->send_tail = skb;
		}
		/* skb->link3 is NULL */

		/* Interrupt restore */
		restore_flags(flags);
	}
	else
		/* Remember who owns the buffer */
		skb->sk = sk;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */

	ip_statistics.IpOutRequests++;
#ifdef CONFIG_IP_ACCT
	ip_acct_cnt(iph,dev, ip_acct_chain);
#endif

#ifdef CONFIG_IP_MULTICAST

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (MULTICAST(iph->daddr) && !(dev->flags&IFF_LOOPBACK))
	{
		if(sk==NULL || sk->ip_mc_loop)
		{
			if(iph->daddr==IGMP_ALL_HOSTS)
				ip_loopback(dev,skb);
			else
			{
				/*
				 *	Loop the packet back only if the destination
				 *	address is on the device's ip_mc_list.
				 */
				struct ip_mc_list *imc=dev->ip_mc_list;
				while(imc!=NULL)
				{
					if(imc->multiaddr==iph->daddr)
					{
						ip_loopback(dev,skb);
						break;
					}
					imc=imc->next;
				}
			}
		}
		/* Multicasts with ttl 0 must not go beyond the host */

		if(skb->ip_hdr->ttl==0)
		{
			kfree_skb(skb, FREE_READ);
			return;
		}
	}
#endif
	/*
	 *	A packet addressed to the device's own broadcast address
	 *	on a non-loopback broadcast device is also looped back.
	 */
	if((dev->flags&IFF_BROADCAST) && iph->daddr==dev->pa_brdaddr && !(dev->flags&IFF_LOOPBACK))
		ip_loopback(dev,skb);

	if (dev->flags & IFF_UP)
	{
		/*
		 *	If we have an owner use its priority setting,
		 *	otherwise use NORMAL
		 */

		if (sk != NULL)
		{
			/* Hand the packet to the device (MAC) layer for sending. */
			dev_queue_xmit(skb, dev, sk->priority);
		}
		else
		{
			dev_queue_xmit(skb, dev, SOPRI_NORMAL);
		}
	}
	else
	{
		/*
		 *	Interface is down: count the discard, and free the
		 *	buffer only if free is set.
		 */
		ip_statistics.IpOutDiscards++;
		if (free)
			kfree_skb(skb, FREE_WRITE);
	}
}
发表评论
最新留言
能坚持,总会有不一样的收获!
[***.219.124.196]2025年04月16日 15时56分25秒
关于作者

喝酒易醉,品茶养心,人生如梦,品茶悟道,何以解忧?唯有杜康!
-- 愿君每日到此一游!
推荐文章
nginx上配置phpmyadmin
2021-05-09
HustOJ二次开发之修改数据库连接池
2021-05-09
SpringBoot之整合Dubbo
2021-05-09
Drools之关键字及错误信息
2021-05-09
嘿!为你的应用创建滚动日志吧?
2021-05-09
一个JAVA应用启动缓慢问题排查 --来自jdk securerandom 的问候
2021-05-09
报警系统:php输出头信息以方便脚本抓取信息[排查篇]
2021-05-09
spring-boot-2.0.3之redis缓存实现,不是你想的那样哦!
2021-05-09
httprunner学习23-加解密
2021-05-09
jenkins学习6-进docker容器修改jenkins时间
2021-05-09
jenkins学习13-凭据管理(删除多余的凭据)
2021-05-09
有道云笔记 同步到我的博客园
2021-05-09
阿里云“网红”运维工程师白金:做一个平凡的圆梦人
2021-05-09
AnalyticDB for PostgreSQL 6.0 新特性介绍
2021-05-09
Alibaba Cloud Linux 2 LTS 正式发布,提供更高性能和更多保障!
2021-05-09
李笑来必读书籍整理
2021-05-09
vue书籍整理
2021-05-09