本文代码基于 Linux 5.15。
当用户层调用write 去做写入, linux 内核里面是如何处理的? 本文以exfat 为例, 讨论这个流程
入口函数
write 系统调用的定义如下:
fs/read_write.c
/*
 * fs/read_write.c — kernel-side entry for write(2).
 * Resolves the fd, snapshots the file offset into a local, delegates to
 * vfs_write(), and commits the advanced offset back to f_pos on success.
 */
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
	struct fd f = fdget_pos(fd);
	ssize_t ret = -EBADF;	/* default: fd did not resolve to a file */

	if (f.file) {
		loff_t pos, *ppos = file_ppos(f.file);	/* may be NULL per file_ppos() */
		if (ppos) {
			pos = *ppos;
			ppos = &pos;	/* operate on a local copy of the offset */
		}
		ret = vfs_write(f.file, buf, count, ppos);
		if (ret >= 0 && ppos)
			f.file->f_pos = pos;	/* commit new offset only on success */
		fdput_pos(f);
	}
	return ret;
}

/* The write(2) syscall itself is a thin wrapper around ksys_write(). */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	return ksys_write(fd, buf, count);
}
主要是调用了ksys_write
, 然后再调用vfs_write
fs/read_write.c
ksys_write->vfs_write
/*
 * fs/read_write.c — generic VFS write path.
 * Validates the file mode and the user buffer, caps the length at
 * MAX_RW_COUNT, then dispatches to ->write, or to ->write_iter via
 * new_sync_write().
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	if (unlikely(!access_ok(buf, count)))
		return -EFAULT;	/* user buffer fails the address-range check */

	ret = rw_verify_area(WRITE, file, pos, count);
	if (ret)
		return ret;
	if (count > MAX_RW_COUNT)
		count = MAX_RW_COUNT;	/* cap a single write request */
	file_start_write(file);
	if (file->f_op->write)
		ret = file->f_op->write(file, buf, count, pos);
	else if (file->f_op->write_iter)
		ret = new_sync_write(file, buf, count, pos);	/* iter-based path (exFAT) */
	else
		ret = -EINVAL;
	if (ret > 0) {
		fsnotify_modify(file);
		add_wchar(current, ret);	/* accounting: bytes written */
	}
	inc_syscw(current);
	file_end_write(file);
	return ret;
}
ksys_write->vfs_write->new_sync_write
/*
 * fs/read_write.c — bridge from the buffer-based write to ->write_iter:
 * wraps (buf, len) in a single-segment iov_iter plus a synchronous kiocb,
 * then invokes the file's write_iter through call_write_iter().
 */
static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = (ppos ? *ppos : 0);
	iov_iter_init(&iter, WRITE, &iov, 1, len);	/* one segment of length len */

	ret = call_write_iter(filp, &kiocb, &iter);
	BUG_ON(ret == -EIOCBQUEUED);	/* a sync kiocb must not be queued as AIO */
	if (ret > 0 && ppos)
		*ppos = kiocb.ki_pos;	/* propagate the advanced position to the caller */
	return ret;
}
include/linux/fs.h
ksys_write->vfs_write->new_sync_write->call_write_iter->write_iter
/* include/linux/fs.h — inline helper dispatching to f_op->write_iter. */
static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
				      struct iov_iter *iter)
{
	return file->f_op->write_iter(kio, iter);
}
可以看到, 最终会调用到f_op->write_iter
这个回调。
在exfat文件系统中, 这个回调被实现为generic_file_write_iter
fs/exfat/file.c
/*
 * fs/exfat/file.c — exFAT's file_operations: write_iter is implemented by
 * the generic page-cache path (generic_file_write_iter).
 */
const struct file_operations exfat_file_operations = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.mmap		= generic_file_mmap,
	.fsync		= exfat_file_fsync,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
};
generic_file_write_iter
最终会调用到generic_perform_write
执行真正的写入操作:
mm/filemap.c
ksys_write->vfs_write->new_sync_write->call_write_iter->generic_file_write_iter->__generic_file_write_iter->generic_perform_write
/*
 * mm/filemap.c — copy data from the iov_iter into the page cache one page
 * at a time, bracketing each copy with the address_space's write_begin /
 * write_end hooks.
 */
ssize_t generic_perform_write(struct file *file,
			      struct iov_iter *i, loff_t pos)
{
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	long status = 0;
	ssize_t written = 0;
	unsigned int flags = 0;

	do { /* 1 */
		struct page *page;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */
		void *fsdata;

		offset = (pos & (PAGE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_count(i));

again:
		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}

		if (fatal_signal_pending(current)) {
			status = -EINTR;
			break;
		}

		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
						&page, &fsdata); /* 2 */
		if (unlikely(status < 0))
			break;

		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); /* 3 */
		flush_dcache_page(page);

		status = a_ops->write_end(file, mapping, pos, bytes, copied,
						page, fsdata); /* 4 */
		if (unlikely(status < 0))
			break;
		copied = status;	/* write_end returns the committed byte count */

		cond_resched();

		iov_iter_advance(i, copied);
		if (unlikely(copied == 0)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fallback here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_single_seg_count(i));
			goto again;
		}
		pos += copied;
		written += copied;

		balance_dirty_pages_ratelimited(mapping);
	} while (iov_iter_count(i));

	return written ? written : status;	/* partial progress wins over error */
}
EXPORT_SYMBOL(generic_perform_write);
(1) 执行循环, 直到所有的内容写入完成
(2) 调用a_ops->write_begin
回调, 被具体文件系统用来执行相关的准备
(3) 把需要写的内容写到page cache
(4) 写入完成, 调用a_ops->write_end
处理后续的善后工作
可以看到, 主要是write_begin
和write_end
这两个调用, 下面分别来说明这两个函数
通知为写请求做准备
在exfat文件系统中, write_begin
的实现如下:
fs/exfat/inode.c
ksys_write->vfs_write->new_sync_write->call_write_iter->generic_file_write_iter->__generic_file_write_iter->generic_perform_write->exfat_write_begin
/*
 * fs/exfat/inode.c — exFAT's ->write_begin: a thin wrapper around
 * cont_write_begin() with exfat_get_block as the block mapper; on error
 * it calls exfat_write_failed() to clean up past pos+len.
 */
static int exfat_write_begin(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned int len, unsigned int flags,
		struct page **pagep, void **fsdata)
{
	int ret;

	*pagep = NULL;
	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
			       exfat_get_block,
			       &EXFAT_I(mapping->host)->i_size_ondisk);
	if (ret < 0)
		exfat_write_failed(mapping, pos+len);

	return ret;
}
主要是对cont_write_begin
的封装
fs/buffer.c
ksys_write->vfs_write->new_sync_write->call_write_iter->generic_file_write_iter->__generic_file_write_iter->generic_perform_write->exfat_write_begin->cont_write_begin
/*
 * fs/buffer.c — write_begin helper for "continuously allocated" filesystems:
 * first zero-fills any hole between the current size tracked in *bytes and
 * pos (cont_expand_zero), then falls through to block_write_begin().
 */
int cont_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata,
			get_block_t *get_block, loff_t *bytes)
{
	struct inode *inode = mapping->host;
	unsigned int blocksize = i_blocksize(inode);
	unsigned int zerofrom;
	int err;

	err = cont_expand_zero(file, mapping, pos, bytes); /* 1 */
	if (err)
		return err;

	zerofrom = *bytes & ~PAGE_MASK;	/* offset of *bytes within its page */
	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
		*bytes |= (blocksize-1);
		(*bytes)++;	/* round *bytes up to the next block boundary */
	}

	return block_write_begin(mapping, pos, len, flags, pagep, get_block); /* 2 */
}
(1) 如果存在文件空洞, 就需要把空洞的地方填0, 这一点后面详细解释
(2) 这里面主要是建立buffer_head的映射, 并且做必要的预读。
cont_expand_zero
考虑这种情况, 一个文件的大小是1M , 但用户写的时候, 先seek 到2M的offset, 再进行写入。 那么中间的1M部分即成为了所谓的“文件空洞”, 对于这种空洞, exFAT 会用0去填充对应的位置, 这样如果读取到对应的位置, 读到的就是0。
需要说明的是, 如果文件大小是1M , 用户如果从1M的地方开始写入, 这时候cont_expand_zero
会直接返回, 直接调用到block_write_begin
这个函数。
fs/buffer.c
ksys_write->vfs_write->new_sync_write->call_write_iter->generic_file_write_iter->__generic_file_write_iter->generic_perform_write->exfat_write_begin->cont_write_begin->cont_expand_zero
/*
 * fs/buffer.c — zero-fill the file-hole region between the on-disk size
 * (*bytes) and pos, page by page, using the pagecache write_begin/write_end
 * hooks; finally handle the partial page that contains the old boundary.
 */
static int cont_expand_zero(struct file *file, struct address_space *mapping,
			loff_t pos, loff_t *bytes)
{
	struct inode *inode = mapping->host;
	unsigned int blocksize = i_blocksize(inode);
	struct page *page;
	void *fsdata;
	pgoff_t index, curidx;
	loff_t curpos;
	unsigned zerofrom, offset, len;
	int err = 0;

	index = pos >> PAGE_SHIFT;	/* page that the write targets */
	offset = pos & ~PAGE_MASK;	/* offset of pos within that page */

	while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) { /* 1 */
		zerofrom = curpos & ~PAGE_MASK;
		if (zerofrom & (blocksize-1)) {
			*bytes |= (blocksize-1);
			(*bytes)++;	/* align *bytes to the block boundary */
		}
		len = PAGE_SIZE - zerofrom;	/* zero to the end of this page */

		err = pagecache_write_begin(file, mapping, curpos, len, 0,
					    &page, &fsdata); /* 2 */
		if (err)
			goto out;
		zero_user(page, zerofrom, len); /* 3 */
		err = pagecache_write_end(file, mapping, curpos, len, len,
						page, fsdata); /* 4 */
		if (err < 0)
			goto out;
		BUG_ON(err != len);	/* write_end must commit the full range */
		err = 0;

		balance_dirty_pages_ratelimited(mapping);

		if (fatal_signal_pending(current)) {
			err = -EINTR;
			goto out;
		}
	}

	/* page covers the boundary, find the boundary offset */
	if (index == curidx) { /* 5 */
		zerofrom = curpos & ~PAGE_MASK;
		/* if we will expand the thing last block will be filled */
		if (offset <= zerofrom) {
			goto out;	/* no in-page hole to zero */
		}
		if (zerofrom & (blocksize-1)) {
			*bytes |= (blocksize-1);
			(*bytes)++;
		}
		len = offset - zerofrom;	/* in-page hole: [zerofrom, offset) */

		err = pagecache_write_begin(file, mapping, curpos, len, 0,
					    &page, &fsdata);
		if (err)
			goto out;
		zero_user(page, zerofrom, len);
		err = pagecache_write_end(file, mapping, curpos, len, len,
						page, fsdata);
		if (err < 0)
			goto out;
		BUG_ON(err != len);
		err = 0;
	}
out:
	return err;
}
(1) 对于超出部分的页面, 全部填充0。 按照上面的例子, size=1m, offset=2m, 那么中间1m的部分要全部填充0.
(2) pagecache_write_begin
实际上是调用aops->write_begin
。 这里又会调用到cont_write_begin
, 只不过现在调用到这个函数的时候, size=1m, offset=1m, cont_expand_zero
直接返回, 会直接到block_write_begin
, 分配对应的空间, 并读取对应的内容到page中。
(3) 将读取到的page 清0。这样下次read的时候,如果读到了这个位置, 那么就会返回0.
(4) pagecache_write_end
实际上调用了aops->write_end
(5) 上面是处理不在一个page 中的情况,但一个page 大小是4k。 还有一种情况是空洞发生在page 内, 比如size=512, offset =1024, 这样中间512Byte就需要清零。 zerofrom
代表页内偏移, 需要把zerofrom
到offset
这部分内容清零。
block_write_begin
这个函数做的事情就是建立page与磁盘内容的映射关系, 如果文件长度不够, 还会进行块分配; 另外, 在必要时, 还会进行预读操作。(以下内容主要参考《存储技术原理分析》)
我们知道, 对于一个page而言, 它是通过buffer_head 去管理具体的内容。一般来说, 一个page(4k) 会有8个buffer_head(512B)。这样某个buffer_head (block_start->block_end) 与要写入的地址(from->to)存在如上六种情况。
前两种情况, 这个buffer_head 落在要写的数据之外; 接下来三种情况, 属于非覆盖写的情况, 这几种情况都需要进行预读, 将对应buffer_head的内容先读出来; 最后一种情况, 是覆盖写的情况, 不需要进行预读, 因为要写的内容完全覆盖了buffer_head的区域, 这时候直接写入就好了, 没必要先读出来。
理解了上面几种情况, 我们再来看看这个函数的具体实现。
fs/buffer.cksys_write->vfs_write->new_sync_write->call_write_iter->generic_file_write_iter->__generic_file_write_iter->generic_perform_write->exfat_write_begin->cont_write_begin->block_write_begin
/*
 * fs/buffer.c — grab (or create) the page-cache page covering pos, then
 * let __block_write_begin() map/prepare its buffer_heads. On failure the
 * page is unlocked and released, and *pagep is set to NULL.
 */
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
		unsigned flags, struct page **pagep, get_block_t *get_block)
{
	pgoff_t index = pos >> PAGE_SHIFT;
	struct page *page;
	int status;

	page = grab_cache_page_write_begin(mapping, index, flags); /* 1 */
	if (!page)
		return -ENOMEM;

	status = __block_write_begin(page, pos, len, get_block); /* 2 */
	if (unlikely(status)) {
		unlock_page(page);
		put_page(page);
		page = NULL;	/* caller sees *pagep == NULL on error */
	}

	*pagep = page;
	return status;
}
(1) 获取对应文件位置的page。
(2) 调用__block_write_begin
执行真正的操作
fs/buffer.cksys_write->vfs_write->new_sync_write->call_write_iter->generic_file_write_iter->__generic_file_write_iter->generic_perform_write->exfat_write_begin->cont_write_begin->block_write_begin->__block_write_begin->__block_write_begin_int
/* fs/buffer.c — wrapper: __block_write_begin_int() without an iomap. */
int __block_write_begin(struct page *page, loff_t pos, unsigned len,
		get_block_t *get_block)
{
	return __block_write_begin_int(page, pos, len, get_block, NULL);
}

/*
 * Walk every buffer_head of the page: map unmapped buffers via get_block
 * (or iomap), zero the parts of freshly allocated buffers that lie outside
 * [from, to), and issue reads for buffers that will only be partially
 * overwritten, waiting for those reads at the end.
 */
int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
		get_block_t *get_block, struct iomap *iomap)
{
	unsigned from = pos & (PAGE_SIZE - 1);	/* write range within the page */
	unsigned to = from + len;
	struct inode *inode = page->mapping->host;
	unsigned block_start, block_end;
	sector_t block;
	int err = 0;
	unsigned blocksize, bbits;
	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

	BUG_ON(!PageLocked(page));
	BUG_ON(from > PAGE_SIZE);
	BUG_ON(to > PAGE_SIZE);
	BUG_ON(from > to);

	head = create_page_buffers(page, inode, 0);
	blocksize = head->b_size;
	bbits = block_size_bits(blocksize);

	block = (sector_t)page->index << (PAGE_SHIFT - bbits);	/* first block number of this page */

	for(bh = head, block_start = 0; bh != head || !block_start;
	    block++, block_start=block_end, bh = bh->b_this_page) { /* 1 */
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) { /* 2 */
			/* buffer lies entirely outside the write range */
			if (PageUptodate(page)) {
				if (!buffer_uptodate(bh))
					set_buffer_uptodate(bh);
			}
			continue;
		}
		if (buffer_new(bh))
			clear_buffer_new(bh);
		if (!buffer_mapped(bh)) { /* 3 */
			WARN_ON(bh->b_size != blocksize);
			if (get_block) {
				err = get_block(inode, block, bh, 1); /* 4 */
				if (err)
					break;
			} else {
				iomap_to_bh(inode, block, bh, iomap);
			}

			if (buffer_new(bh)) {
				/* freshly allocated block: nothing valid on disk yet */
				clean_bdev_bh_alias(bh);
				if (PageUptodate(page)) {
					clear_buffer_new(bh);
					set_buffer_uptodate(bh);
					mark_buffer_dirty(bh);
					continue;
				}
				if (block_end > to || block_start < from) /* 5 */
					zero_user_segments(page,
						to, block_end,
						block_start, from);
				continue;
			}
		}
		if (PageUptodate(page)) {
			if (!buffer_uptodate(bh))
				set_buffer_uptodate(bh);
			continue; 
		}
		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
		    !buffer_unwritten(bh) &&
		    (block_start < from || block_end > to)) { /* 6 */
			/* partial overwrite: read the old contents first */
			ll_rw_block(REQ_OP_READ, 0, 1, &bh);
			*wait_bh++=bh;
		}
	}
	/*
	 * If we issued read requests - let them complete.
	 */
	while(wait_bh > wait) { /* 7 */
		wait_on_buffer(*--wait_bh);
		if (!buffer_uptodate(*wait_bh))
			err = -EIO;
	}
	if (unlikely(err))
		page_zero_new_buffers(page, from, to);
	return err;
}
(1) 循环处理每个buffer_head, 这个循环退出的条件是, bh == head, 即所有buffer_head遍历完。 这里有一个小疑问, 为什么不在block_start > to
的时候就退出循环?
(2) 这里对应上述前两种情况, 这时候直接跳过
(3) (4) 如果buffer_head 没有建立映射关系, 调用get_block
映射到具体的文件系统的sector
(5) 对于③④⑤种情况, 调用zero_user_segments
先将非覆盖写的部分清零
(6) 对于③④⑤种情况, 需要进行预读, 调用ll_rw_block
, 读取对应的部分
(7) 等待(6)的读取完成
通知数据已写入完成
a_ops->write_end
在数据从用户buf复制到page cache 之后, 被具体文件系统用来做相关的善后。
对于exfat, 这个函数被实例化为
fs/exfat/inode.c
/*
 * fs/exfat/inode.c — exFAT's ->write_end: commit via generic_write_end(),
 * sanity-check i_size against i_size_aligned, undo on short writes, and
 * set the ARCHIVE attribute plus timestamps after a successful write.
 */
static int exfat_write_end(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned int len, unsigned int copied,
		struct page *pagep, void *fsdata)
{
	struct inode *inode = mapping->host;
	struct exfat_inode_info *ei = EXFAT_I(inode);
	int err;

	err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);

	if (EXFAT_I(inode)->i_size_aligned < i_size_read(inode)) {
		/* aligned size must never lag behind the logical size */
		exfat_fs_error(inode->i_sb,
			"invalid size(size(%llu) > aligned(%llu)\n",
			i_size_read(inode), EXFAT_I(inode)->i_size_aligned);
		return -EIO;
	}

	if (err < len)
		exfat_write_failed(mapping, pos+len);	/* short write: clean up */

	if (!(err < 0) && !(ei->attr & ATTR_ARCHIVE)) {
		inode->i_mtime = inode->i_ctime = current_time(inode);
		ei->attr |= ATTR_ARCHIVE;
		mark_inode_dirty(inode);
	}

	return err;
}
主要调用了generic_write_end
处理具体的工作:
fs/buffer.cksys_write->vfs_write->new_sync_write->call_write_iter->generic_file_write_iter->__generic_file_write_iter->generic_perform_write->exfat_write_end->generic_write_end
/*
 * fs/buffer.c — generic ->write_end: commit the copied bytes via
 * block_write_end(), extend i_size under the page lock if the write grew
 * the file, release the page, then mark the inode dirty outside the lock.
 */
int generic_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	loff_t old_size = inode->i_size;
	bool i_size_changed = false;

	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

	/*
	 * No need to use i_size_read() here, the i_size cannot change under us
	 * because we hold i_rwsem.
	 *
	 * But it's important to update i_size while still holding page lock:
	 * page writeout could otherwise come in and zero beyond i_size.
	 */
	if (pos + copied > inode->i_size) {
		i_size_write(inode, pos + copied);
		i_size_changed = true;
	}

	unlock_page(page);
	put_page(page);

	if (old_size < pos)
		pagecache_isize_extended(inode, old_size, pos);
	/*
	 * Don't mark the inode dirty under page lock. First, it unnecessarily
	 * makes the holding time of page lock longer. Second, it forces lock
	 * ordering of page lock and transaction start for journaling
	 * filesystems.
	 */
	if (i_size_changed)
		mark_inode_dirty(inode);
	return copied;
}
EXPORT_SYMBOL(generic_write_end);
这里主要介绍一下block_write_end
这个函数:
fs/buffer.c
/*
 * fs/buffer.c — commit [pos, pos+copied) on the page: handle short writes
 * (treat a short write to a non-uptodate page as zero-length so the caller
 * retries), then mark the touched buffers uptodate/dirty via
 * __block_commit_write().
 */
int block_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	unsigned start;

	start = pos & (PAGE_SIZE - 1);	/* write start within the page */

	if (unlikely(copied < len)) { /* 1 */
		/*
		 * The buffers that were written will now be uptodate, so we
		 * don't have to worry about a readpage reading them and
		 * overwriting a partial write. However if we have encountered
		 * a short write and only partially written into a buffer, it
		 * will not be marked uptodate, so a readpage might come in and
		 * destroy our partial write.
		 *
		 * Do the simplest thing, and just treat any short write to a
		 * non uptodate page as a zero-length write, and force the
		 * caller to redo the whole thing.
		 */
		if (!PageUptodate(page))
			copied = 0;

		page_zero_new_buffers(page, start+copied, start+len);
	}
	flush_dcache_page(page);

	/* This could be a short (even 0-length) commit */
	__block_commit_write(inode, page, start, start+copied); /* 2 */
	return copied;
}
这里有一种特殊的情况需要处理, 即实际写入长度copied < len, 为此专门定义了一个术语: 短写(short write)。
(以下内容主要参考《存储技术原理分析》)
对于完全覆盖写的情况,是没有进行预读, 这样如果没有进行完全覆盖写, 就会出现这个buffer_head中一部分内容是不确定, 既没有从磁盘中读出, 又没有从用户空间更新, 如果将这部分数据贸然更新到磁盘, 势必覆盖原来的有效数据。
因此最简单的处理, 是认为这种场景下没有写入(copied = 0), 然后让调用者重新做整个事情。
下面介绍一下__block_commit_write
这个函数,
/*
 * fs/buffer.c — mark the buffer_heads inside [from, to) uptodate and dirty;
 * if afterwards every buffer on the page is uptodate, set the whole page
 * uptodate so a later read() needs no readpage().
 */
static int __block_commit_write(struct inode *inode, struct page *page,
		unsigned from, unsigned to)
{
	unsigned block_start, block_end;
	int partial = 0;	/* set if any buffer remains non-uptodate */
	unsigned blocksize;
	struct buffer_head *bh, *head;

	bh = head = page_buffers(page);
	blocksize = bh->b_size;

	block_start = 0;
	do { /* 1 */
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) { /* 2 */
			/* buffer outside the written range */
			if (!buffer_uptodate(bh))
				partial = 1;
		} else {
			set_buffer_uptodate(bh);
			mark_buffer_dirty(bh);
		}
		clear_buffer_new(bh);

		block_start = block_end;
		bh = bh->b_this_page;
	} while (bh != head);

	/*
	 * If this is a partial write which happened to make all buffers
	 * uptodate then we can optimize away a bogus readpage() for
	 * the next read(). Here we 'discover' whether the page went
	 * uptodate as a result of this (potentially partial) write.
	 */
	if (!partial) /* 3 */
		SetPageUptodate(page);
	return 0;
}
(1) 遍历page中所有的buffer_head
(2) 这里和block_write_begin中一样,对应①②种情况, 对于不在范围内的buffer_head,设置partial = 1
(3) 如果!partial, 即page中所有buffer_head都被处理, 设置SetPageUptodate。 这个是这个函数的关键操作。