- A+
从vfs_write 开始分析
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (unlikely(!access_ok(VERIFY_READ, buf, count))) return -EFAULT; ret = rw_verify_area(WRITE, file, pos, count); if (ret >= 0) { count = ret; file_start_write(file); ret = __vfs_write(file, buf, count, pos); if (ret > 0) { fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); file_end_write(file); } return ret; }
- access_ok
- __vfs_write
- fsnotify_modify
ssize_t __vfs_write(struct file *file, const char __user *p, size_t count, loff_t *pos) { if (file->f_op->write) return file->f_op->write(file, p, count, pos); else if (file->f_op->write_iter) return new_sync_write(file, p, count, pos); else return -EINVAL; }
这里检查f_op 的write接口,我们以ext4为例。
const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, .read_iter = generic_file_read_iter, .write_iter = ext4_file_write_iter, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif .mmap = ext4_file_mmap, .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, };
那么它会走write_iter接口,
static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len }; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; iov_iter_init(&iter, WRITE, &iov, 1, len); ret = filp->f_op->write_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ret > 0) *ppos = kiocb.ki_pos; return ret; }
这里只是转换数据结构。
static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(iocb->ki_filp); struct mutex *aio_mutex = NULL; struct blk_plug plug; int o_direct = iocb->ki_flags & IOCB_DIRECT; int overwrite = 0; ssize_t ret; if (o_direct) { size_t length = iov_iter_count(from); loff_t pos = iocb->ki_pos; blk_start_plug(&plug); if (ext4_should_dioread_nolock(inode) && !aio_mutex && !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; int err, len; map.m_lblk = pos >> blkbits; map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits) - map.m_lblk; len = map.m_len; err = ext4_map_blocks(NULL, inode, &map, 0); if (err == len && (map.m_flags & EXT4_MAP_MAPPED)) overwrite = 1; } } ret = __generic_file_write_iter(iocb, from); mutex_unlock(&inode->i_mutex); if (ret > 0) { ssize_t err; err = generic_write_sync(file, iocb->ki_pos - ret, ret); if (err < 0) ret = err; } if (o_direct) blk_finish_plug(&plug); if (aio_mutex) mutex_unlock(aio_mutex); return ret; out: mutex_unlock(&inode->i_mutex); if (aio_mutex) mutex_unlock(aio_mutex); return ret; }
这边检查o_direct,应该就是直接写入的标签,前面会做一些工作,我先假设没有这个东东。
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t written = 0; ssize_t err; ssize_t status; current->backing_dev_info = inode_to_bdi(inode); err = file_remove_privs(file); if (err) goto out; err = file_update_time(file); if (err) goto out; if (iocb->ki_flags & IOCB_DIRECT) { loff_t pos, endbyte; written = generic_file_direct_write(iocb, from, iocb->ki_pos); if (written < 0 || !iov_iter_count(from) || IS_DAX(inode)) goto out; status = generic_perform_write(file, from, pos = iocb->ki_pos); if (unlikely(status < 0)) { err = status; goto out; } endbyte = pos + status - 1; err = filemap_write_and_wait_range(mapping, pos, endbyte); if (err == 0) { iocb->ki_pos = endbyte + 1; written += status; invalidate_mapping_pages(mapping, pos >> PAGE_CACHE_SHIFT, endbyte >> PAGE_CACHE_SHIFT); } else { } } else { written = generic_perform_write(file, from, iocb->ki_pos); if (likely(written > 0)) iocb->ki_pos += written; } out: current->backing_dev_info = NULL; return written ? written : err; }
这里还是会检查IOCB_DIRECT,直接写流程,我们先不看了。
ssize_t generic_perform_write(struct file *file, struct iov_iter *i, loff_t pos) { struct address_space *mapping = file->f_mapping; const struct address_space_operations *a_ops = mapping->a_ops; long status = 0; ssize_t written = 0; unsigned int flags = 0; if (!iter_is_iovec(i)) flags |= AOP_FLAG_UNINTERRUPTIBLE; do { struct page *page; unsigned long offset; /* Offset into pagecache page */ unsigned long bytes; /* Bytes to write to page */ size_t copied; /* Bytes copied from user */ void *fsdata; offset = (pos & (PAGE_CACHE_SIZE - 1)); bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, iov_iter_count(i)); again: if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; break; } if (fatal_signal_pending(current)) { status = -EINTR; break; } status = a_ops->write_begin(file, mapping, pos, bytes, flags, &page, &fsdata); if (unlikely(status < 0)) break; if (mapping_writably_mapped(mapping)) flush_dcache_page(page); copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); flush_dcache_page(page); status = a_ops->write_end(file, mapping, pos, bytes, copied, page, fsdata); if (unlikely(status < 0)) break; copied = status; cond_resched(); iov_iter_advance(i, copied); if (unlikely(copied == 0)) { bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, iov_iter_single_seg_count(i)); goto again; } pos += copied; written += copied; balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(i)); return written ? written : status; }
这里就是循环一页一页的写。当然是写到缓存中,
- 1、a_ops->write_begin 和a_ops交互准备开始写
- 2、iov_iter_copy_from_user_atomic 拷贝用户数据到 page
- 3、a_ops->write_end 写入
- 4、balance_dirty_pages_ratelimited 做限制检查
那么准备cache的过程在write_begin中。做限制的后面再看。我们的目标就是这两个。
static const struct address_space_operations ext4_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, .direct_IO = ext4_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; int ret, needed_blocks; handle_t *handle; int retries = 0; struct page *page; pgoff_t index; unsigned from, to; trace_android_fs_datawrite_start(inode, pos, len, current->pid, current->comm); trace_ext4_write_begin(inode, pos, len, flags); needed_blocks = ext4_writepage_trans_blocks(inode) + 1; index = pos >> PAGE_CACHE_SHIFT; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, flags, pagep); if (ret < 0) return ret; if (ret == 1) return 0; } retry_grab: page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; unlock_page(page); retry_journal: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { page_cache_release(page); return PTR_ERR(handle); } lock_page(page); if (page->mapping != mapping) { /* The page got truncated from under us */ unlock_page(page); page_cache_release(page); ext4_journal_stop(handle); goto retry_grab; } wait_for_stable_page(page); #ifdef CONFIG_EXT4_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) ret = ext4_block_write_begin(page, pos, len, ext4_get_block_write); else ret = ext4_block_write_begin(page, pos, len, ext4_get_block); #else if (ext4_should_dioread_nolock(inode)) ret = __block_write_begin(page, pos, len, ext4_get_block_write); else ret = __block_write_begin(page, pos, len, ext4_get_block); #endif if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, page_buffers(page), from, to, NULL, do_journal_get_write_access); } if (ret) { unlock_page(page); if (pos + len > inode->i_size && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ext4_journal_stop(handle); if (pos + len > inode->i_size) { ext4_truncate_failed_write(inode); if (inode->i_nlink) ext4_orphan_del(NULL, inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; page_cache_release(page); return ret; } *pagep = page; mtk_btag_pidlog_write_begin(*pagep); return ret; }
- 前面做一些检查
- 后面的handle是和block buffers进行交互的
- 主要的过程是中间的grab_cache_page_write_begin
struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags) { struct page *page; int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT; if (flags & AOP_FLAG_NOFS) fgp_flags |= FGP_NOFS; page = pagecache_get_page(mapping, index, fgp_flags, mapping_gfp_mask(mapping)); if (page) wait_for_stable_page(page); return page; }
pagecache_get_page 从radix中获取一页。
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, int fgp_flags, gfp_t gfp_mask) { struct page *page; repeat: page = find_get_entry(mapping, offset); if (radix_tree_exceptional_entry(page)) page = NULL; if (!page) goto no_page; if (fgp_flags & FGP_LOCK) { if (fgp_flags & FGP_NOWAIT) { if (!trylock_page(page)) { page_cache_release(page); return NULL; } } else { lock_page(page); } /* Has the page been truncated? */ if (unlikely(page->mapping != mapping)) { unlock_page(page); page_cache_release(page); goto repeat; } VM_BUG_ON_PAGE(page->index != offset, page); } if (page && (fgp_flags & FGP_ACCESSED)) mark_page_accessed(page); no_page: if (!page && (fgp_flags & FGP_CREAT)) { int err; if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) gfp_mask |= __GFP_WRITE; if (fgp_flags & FGP_NOFS) gfp_mask &= ~__GFP_FS; page = __page_cache_alloc(gfp_mask); if (!page) return NULL; if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) fgp_flags |= FGP_LOCK; /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) __SetPageReferenced(page); err = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_RECLAIM_MASK); if (unlikely(err)) { page_cache_release(page); page = NULL; if (err == -EEXIST) goto repeat; } } return page; }
- find_get_entry 获取一页
- mark_page_accessed 标记页据说是给清除的时候用的
- no_page流程,如果没有获取到page就 __page_cache_alloc 一个 然后add_to_page_cache_lru
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { void **pagep; struct page *page; rcu_read_lock(); repeat: page = NULL; pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); if (pagep) { page = radix_tree_deref_slot(pagep); if (unlikely(!page)) goto out; if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) goto repeat; goto out; } if (!page_cache_get_speculative(page)) goto repeat; if (unlikely(page != *pagep)) { page_cache_release(page); goto repeat; } } out: rcu_read_unlock(); return page; }
find_get_entry 也很简单,就是在radix_tree_lookup_slot 找一个。
# Linux必备书籍推荐
《LINUX内核源代码情景分析(上册) 》毛德操,胡希明 >>>京东购买 >>>淘宝购买
《LINUX内核源代码情景分析(下册) 》毛德操,胡希明 >>>京东购买 >>>淘宝购买
《嵌入式Linux应用开发完全手册 》韦东山 著 >>>京东购买 >>>淘宝购买 (领券)
《深入理解Linux内核第3版》(美)博韦 >>>京东购买 >>>淘宝购买
《鸟哥的Linux私房菜:基础学习篇(第四版)》鸟哥 >>>京东购买 >>>淘宝购买

扫一扫关注微信公众号,上述5本电子书免费领取。
https://pan.baidu.com/s/1q5IjXAmybs8NBseR4R8Ksg
扫码关注微信公众号,回复“Linux” ,即可获取提取码
--- Linux文件系统学习系列笔记 ---
(原创笔记,转载请联系博主授权)
Linux文件系统学习:io的plug过程-request请求(9)
Linux文件系统学习:io的plug过程-blk_init_queue(10)
Linux文件系统学习:io的plug过程-blk_flush_plug_list的情况(11)
Linux文件系统学习:io的plug过程-queuelist的问题(12)
<欢迎关注微信公众号,第一时间查看最新内容>
您可以选择一种方式赞助本站
支付宝扫一扫赞助
微信钱包扫描赞助
赏