网络篇之sendfile

sendfile 通过减少一次数据拷贝和两次内核空间与用户空间之间的切换,提高了将文件内容发送到网络的效率。

大家好,欢迎来到IT知识分享网。

将文件中的内容,发送到网络中,一般的步骤是:1.调用read,将文件内容读取到用户缓冲区;2.调用send(或write)将用户缓冲区的内容发送出去。

涉及的数据拷贝有:硬盘 -> 内核文件缓存 -> 用户缓冲区 -> 内核网络缓存。

而使用sendfile时,数据拷贝变成:硬盘 -> 内核文件缓存 -> 内核网络缓存。

可见用sendfile,减少了一次数据拷贝,同时减少了两次内核空间与用户空间的切换,从而提高了数据发送的效率。

首先看下总体的调用流程: 

在splice_direct_to_actor函数中,有个循环。 在循环中,调用do_splice_to将数据拷贝到内核文件缓冲区;调用actor(direct_splice_actor)将数据拷贝到内核网络缓冲区。

一 从硬盘拷贝到内核文件缓冲区

数据从硬盘拷贝到内核文件缓存,涉及的两个数据结构如下: 

/*
 * Per-pipe state. @bufs points to an array of struct pipe_buffer
 * descriptors. Field notes below are the article author's annotations,
 * translated to English.
 */
struct pipe_inode_info {
	struct mutex mutex;
	wait_queue_head_t wait;			/* pipe/FIFO wait queue */
	unsigned int nrbufs, curbuf, buffers;
	unsigned int readers;			/* reader-process flag */
	unsigned int writers;			/* writer-process flag */
	unsigned int files;
	unsigned int waiting_writers;		/* number of writers sleeping in the wait queue */
	unsigned int r_counter;			/* like readers, but used when waiting for a process that reads the FIFO */
	unsigned int w_counter;			/* like writers, but used when waiting for a process that writes the FIFO */
	struct page *tmp_page;
	struct fasync_struct *fasync_readers;	/* asynchronous I/O notification via signals */
	struct fasync_struct *fasync_writers;	/* asynchronous I/O notification via signals */
	struct pipe_buffer *bufs;		/* pipe buffer descriptors */
	struct user_struct *user;
};

/*
 * Descriptor for one pipe buffer: a page plus the window of valid data
 * inside it. Field notes are the article author's annotations, translated
 * to English.
 */
struct pipe_buffer {
	struct page *page;	/* address of the descriptor of the buffer's page frame */
	/*
	 * offset - current position of the valid data inside the page frame
	 * len    - length of the valid data inside the page frame
	 */
	unsigned int offset, len;
	const struct pipe_buf_operations *ops;	/* address of the buffer's method table */
	unsigned int flags;
	unsigned long private;
};

 

 网络篇之sendfile

 struct pipe_inode_info结构中有个struct pipe_buffer *bufs成员,bufs指向的是struct pipe_buffer类型的数组。struct pipe_buffer中的page指向一个页框(struct page类型),页框保存的是从硬盘中读取的文件的内容。可以参考上面的图片,对数据存储结构有个大体的了解。

从硬盘读取数据到页框中的代码:

/*
 * Abridged excerpt of the buffered read path: walks the page cache of the
 * file behind @iocb, starting at iocb->ki_pos, and hands each page to
 * copy_page_to_iter() until @iter is full, the end of file is reached, or
 * an error occurs.
 *
 * @iocb:    kernel I/O control block; supplies the file and the position,
 *           and the position is updated on exit.
 * @iter:    destination iterator.
 * @written: number of bytes already transferred by the caller; bytes
 *           copied here are added to it.
 *
 * Returns the accumulated @written if it is non-zero, otherwise @error
 * (0, -EINTR or -EFAULT on the paths shown here).
 *
 * NOTE: the slow paths are not part of this excerpt. The labels
 * would_block and no_cached_page are jumped to but not shown.
 */
static ssize_t generic_file_buffered_read(struct kiocb *iocb,
		struct iov_iter *iter, ssize_t written)
{
	struct file *filp = iocb->ki_filp;
	struct address_space *mapping = filp->f_mapping;	/* the file's address_space object */
	struct inode *inode = mapping->host;	/* owner of the address space, i.e. the inode */
	struct file_ra_state *ra = &filp->f_ra;
	loff_t *ppos = &iocb->ki_pos;	/* current file position of this I/O */
	pgoff_t index;		/* logical page number holding the first requested byte */
	pgoff_t last_index;	/* page number bounding the end of the request (rounded up) */
	pgoff_t prev_index;
	unsigned long offset;	/* offset into pagecache page (of the first byte) */
	unsigned int prev_offset;
	int error = 0;

	/* The position is at or beyond the superblock's maximum file size. */
	if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
		return 0;
	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);

	index = *ppos >> PAGE_SHIFT;	/* current logical page */
	prev_index = ra->prev_pos >> PAGE_SHIFT;
	prev_offset = ra->prev_pos & (PAGE_SIZE-1);
	last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
	offset = *ppos & ~PAGE_MASK;	/* offset of the file position inside its page */

	for (;;) {
		struct page *page;
		pgoff_t end_index;
		loff_t isize;
		unsigned long nr, ret;

		/*
		 * Check the current task's TIF_NEED_RESCHED flag and call
		 * schedule() if it is set.
		 */
		cond_resched();
find_page:
		if (fatal_signal_pending(current)) {
			error = -EINTR;
			goto out;
		}

		/* Look up the page cache for the page holding the requested data. */
		page = find_get_page(mapping, index);
		if (!page) {
			if (iocb->ki_flags & IOCB_NOWAIT)
				goto would_block;
			page_cache_sync_readahead(mapping,
					ra, filp,
					index, last_index - index);
			page = find_get_page(mapping, index);
			if (unlikely(page == NULL))
				goto no_cached_page;
		}
page_ok:
		/*
		 * i_size must be checked after we know the page is Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */

		isize = i_size_read(inode);		/* file size */
		end_index = (isize - 1) >> PAGE_SHIFT;	/* index of the file's last page */
		/* Empty file, or index is past the pages the file contains. */
		if (unlikely(!isize || index > end_index)) {
			put_page(page);
			goto out;
		}

		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_SIZE;
		if (index == end_index) {	/* last page: only part of it is valid */
			nr = ((isize - 1) & ~PAGE_MASK) + 1;
			if (nr <= offset) {
				put_page(page);
				goto out;
			}
		}
		nr = nr - offset;

		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		/*
		 * When a sequential read accesses a page several times,
		 * only mark it as accessed the first time.
		 * (Author's note: this sets PG_referenced or PG_active so the
		 * page is treated as in use and should not be swapped out.)
		 */
		if (prev_index != index || offset != prev_offset)
			mark_page_accessed(page);
		prev_index = index;

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 */

		/* Copy the data into iter. */
		ret = copy_page_to_iter(page, offset, nr, iter);
		offset += ret;
		index += offset >> PAGE_SHIFT;
		offset &= ~PAGE_MASK;
		prev_offset = offset;

		/* Drop the reference taken on the page descriptor. */
		put_page(page);
		written += ret;
		if (!iov_iter_count(iter))
			goto out;
		if (ret < nr) {
			error = -EFAULT;
			goto out;
		}
		continue;

		/*
		 * ... slow paths omitted in this excerpt (the targets of
		 * "goto would_block" and "goto no_cached_page" live here) ...
		 */
	}

out:
	/* Update the read-ahead state ra. */
	ra->prev_pos = prev_index;
	ra->prev_pos <<= PAGE_SHIFT;
	ra->prev_pos |= prev_offset;

	/* Update the file position. */
	*ppos = ((loff_t)index << PAGE_SHIFT) + offset;
	/* Update the file access time. */
	file_accessed(filp);
	return written ? written : error;
}

将页框与pipe_buffer关联的部分代码:

/*
 * Attach @page to the pipe behind iterator @i instead of copying its
 * contents: a reference is taken on the page and a pipe_buffer is made to
 * point at the byte range [@offset, @offset + @bytes).
 *
 * @page:   page to attach.
 * @offset: start of the data inside @page.
 * @bytes:  number of bytes requested; clamped to i->count.
 * @i:      pipe-backed iterator (i->pipe, i->idx, i->iov_offset are used).
 *
 * Returns the number of bytes accounted for, or 0 when there is nothing to
 * add, when sanity(i) fails, or when no free pipe_buffer slot is left.
 */
static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	struct pipe_buffer *buf;
	size_t off;
	int idx;

	if (unlikely(bytes > i->count))
		bytes = i->count;

	if (unlikely(!bytes))
		return 0;

	if (!sanity(i))
		return 0;

	off = i->iov_offset;
	idx = i->idx;
	buf = &pipe->bufs[idx];
	if (off) {
		/* The current slot is in use; extend it if the data is contiguous. */
		if (offset == off && buf->page == page) {
			/* merge with the last one */
			buf->len += bytes;
			i->iov_offset += bytes;
			goto out;
		}
		idx = next_idx(idx, pipe);	/* advance idx to the next slot */
		buf = &pipe->bufs[idx];
	}
	if (idx == pipe->curbuf && pipe->nrbufs)	/* the buffer array is full */
		return 0;
	pipe->nrbufs++;
	buf->ops = &page_cache_pipe_buf_ops;
	get_page(buf->page = page);	/* make buf->page point at page and take a reference */
	buf->offset = offset;
	buf->len = bytes;
	i->iov_offset = offset + bytes;
	i->idx = idx;
out:
	i->count -= bytes;
	return bytes;
}

二 从内核文件缓冲区拷贝到内核网络缓冲区

首先贴下内核网络层几个数据的关联:

网络篇之sendfile

从文件中读取的部分数据,已经保存在pipe->bufs指向的数组中,splice_from_pipe_feed则遍历此数组。

/*
 * Feed the buffers currently queued in @pipe to @actor, one pipe_buffer at
 * a time, updating the bookkeeping in @sd as data is consumed.
 *
 * @pipe:  pipe whose bufs[] ring is drained, starting at pipe->curbuf.
 * @sd:    splice descriptor; len, num_spliced, pos, total_len and
 *         need_wakeup are updated here.
 * @actor: callback that consumes data from one buffer and returns the
 *         number of bytes it took (the article notes that on the sendfile
 *         path this is pipe_to_sendpage).
 *
 * Returns:
 *   1    - the pipe ran out of buffers while sd->total_len was still > 0;
 *   0    - sd->total_len reached zero, or pipe_buf_confirm() returned
 *          -ENODATA;
 *   else - the non-zero result of pipe_buf_confirm(), or the actor's
 *          return value when it is <= 0.
 */
static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
			  splice_actor *actor)
{
	int ret;

	while (pipe->nrbufs) {	/* number of pipe_buffers that hold valid data */
		struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;

		/* Never hand the actor more than the caller still wants. */
		sd->len = buf->len;
		if (sd->len > sd->total_len)
			sd->len = sd->total_len;

		ret = pipe_buf_confirm(pipe, buf);
		if (unlikely(ret)) {
			if (ret == -ENODATA)
				ret = 0;
			return ret;
		}

		ret = actor(pipe, buf, sd);	/* pipe_to_sendpage on the sendfile path */
		if (ret <= 0)
			return ret;

		buf->offset += ret;
		buf->len -= ret;

		sd->num_spliced += ret;
		sd->len -= ret;
		sd->pos += ret;
		sd->total_len -= ret;

		if (!buf->len) {
			/* Buffer fully consumed: release it and step to the next slot. */
			pipe_buf_release(pipe, buf);
			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);	/* index of the next buffer */
			pipe->nrbufs--;
			if (pipe->files)
				sd->need_wakeup = true;
		}

		if (!sd->total_len)
			return 0;
	}

	return 1;
}

在pipe_to_sendpage中获取套接字对应的file

struct file *file = sd->u.file;

return file->f_op->sendpage(file, buf->page, buf->offset,

  sd->len, &pos, more);

下面提到的页框page,即buf->page。

在sock_sendpage中获取套接字对应的socket

struct socket *sock;

int flags;

sock = file->private_data;

 在inet_sendpage中获取对应的sock

struct sock *sk = sock->sk;

在do_tcp_sendpages中:

申请skb

skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,

  tcp_rtx_and_write_queues_empty(sk));

将skb添加到sk->sk_write_queue链表上

skb_entail(sk, skb); 

获取空闲的frags的索引

i = skb_shinfo(skb)->nr_frags;

 将page与skb进行关联:

static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; /* * Propagate page pfmemalloc to the skb if we can. The problem is * that not all callers have unique ownership of the page but rely * on page_is_pfmemalloc doing the right thing(tm). */ frag->page.p = page; frag->page_offset = off; skb_frag_size_set(frag, size); page = compound_head(page); if (page_is_pfmemalloc(page)) skb->pfmemalloc = true; }

免责声明:本站所有文章内容,图片,视频等均是来源于用户投稿和互联网及文摘转载整编而成,不代表本站观点,不承担相关法律责任。其著作权各归其原作者或其出版社所有。如发现本站有涉嫌抄袭侵权/违法违规的内容,侵犯到您的权益,请在线联系站长,一经查实,本站将立刻删除。 本文来自网络,若有侵权,请联系删除,如若转载,请注明出处:https://haidsoft.com/133158.html

(0)
上一篇 2025-07-25 21:20
下一篇 2025-07-25 21:26

相关推荐

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注

关注微信