bdi_queue_work()提交了work给bdi_wq上,由对应的bdi处理函数进行处理,默认的函数为bdi_writeback_workfn,其代码如下:
void bdi_writeback_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), struct bdi_writeback, dwork); struct backing_dev_info *bdi = wb->bdi; long pages_written; set_worker_desc("flush-%s", dev_name(bdi->dev)); current->flags |= PF_SWAPWRITE; if (likely(!current_is_workqueue_rescuer() || !test_bit(BDI_registered, &bdi->state))) { /* * The normal path. Keep writing back @bdi until its * work_list is empty. Note that this path is also taken * if @bdi is shutting down even when we're running off the * rescuer as work_list needs to be drained. */ do { pages_written = wb_do_writeback(wb); trace_writeback_pages_written(pages_written); } while (!list_empty(&bdi->work_list)); } else { /* * bdi_wq can't get enough workers and we're running off * the emergency worker. Don't hog it. Hopefully, 1024 is * enough for efficient IO. */ pages_written = writeback_inodes_wb(&bdi->wb, 1024, WB_REASON_FORKER_THREAD); trace_writeback_pages_written(pages_written); } if (!list_empty(&bdi->work_list)) mod_delayed_work(bdi_wq, &wb->dwork, 0); else if (wb_has_dirty_io(wb) && dirty_writeback_interval) bdi_wakeup_thread_delayed(bdi); current->flags &= ~PF_SWAPWRITE; }首先判断当前workqueue能否获得足够的worker进行处理,如果能则将bdi上所有work全部提交,否则只提交一个work并限制写入1024个pages。
正常情况下通过调用wb_do_writeback函数处理回写。
该函数代码如下,遍历bdi上所有work,通过调用wb_writeback()进行数据写入。
static long wb_do_writeback(struct bdi_writeback *wb) { struct backing_dev_info *bdi = wb->bdi; struct wb_writeback_work *work; long wrote = 0; set_bit(BDI_writeback_running, &wb->bdi->state); while ((work = get_next_work_item(bdi)) != NULL) { trace_writeback_exec(bdi, work); wrote += wb_writeback(wb, work); /* * Notify the caller of completion if this is a synchronous * work item, otherwise just free it. */ if (work->done) complete(work->done); else kfree(work); } /* * Check for periodic writeback, kupdated() style */ wrote += wb_check_old_data_flush(wb); wrote += wb_check_background_flush(wb); clear_bit(BDI_writeback_running, &wb->bdi->state); return wrote; }wb_writeback()函数最终调用__writeback_single_inode()将某个inode上脏页刷回。
**__writeback_single_inode**__writeback_single_inode()的代码如下,最终通过调用do_writepages()函数写盘:
static int __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; long nr_to_write = wbc->nr_to_write; unsigned dirty; int ret; WARN_ON(!(inode->i_state & I_SYNC)); trace_writeback_single_inode_start(inode, wbc, nr_to_write); ret = do_writepages(mapping, wbc); /* * Make sure to wait on the data before writing out the metadata. * This is important for filesystems that modify metadata on data * I/O completion. We don't do it for sync(2) writeback because it has a * separate, external IO completion path and ->sync_fs for guaranteeing * inode metadata is written back correctly. */ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) { int err = filemap_fdatawait(mapping); if (ret == 0) ret = err; } /* * Some filesystems may redirty the inode during the writeback * due to delalloc, clear dirty metadata flags right before * write_inode() */ spin_lock(&inode->i_lock); /* Clear I_DIRTY_PAGES if we've written out all dirty pages */ if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) inode->i_state &= ~I_DIRTY_PAGES; dirty = inode->i_state & I_DIRTY; inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); spin_unlock(&inode->i_lock); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { int err = write_inode(inode, wbc); if (ret == 0) ret = err; } trace_writeback_single_inode(inode, wbc, nr_to_write); return ret; } do_writepages函数do_writepages()在上一篇已经介绍过了,它负责调用底层文件系统的a_ops->writepages将pages写入后端存储。