Commit 64769240 authored by Alex Tomas, committed by Theodore Ts'o
Browse files

ext4: Add delayed allocation support in data=writeback mode



Updated with fixes from Mingming Cao <cmm@us.ibm.com> to unlock and
release the page from page cache if the delalloc write_begin failed, and
properly handle preallocated blocks.  Also added a fix to clear
buffer_delay in block_write_full_page() after allocating a delayed
buffer.

Updated with fixes from Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
to update i_disksize properly and to add bmap support for delayed
allocation.

Updated with a fix from Valerie Clement <valerie.clement@bull.net> to
avoid filesystem corruption when the filesystem is mounted with the
delalloc option and blocksize < pagesize.

Signed-off-by: Alex Tomas <alex@clusterfs.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
parent 29a814d2
......@@ -536,6 +536,7 @@ do { \
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
......
......@@ -32,6 +32,7 @@
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/uio.h>
#include <linux/bio.h>
......@@ -46,6 +47,8 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
new_size);
}
static void ext4_invalidatepage(struct page *page, unsigned long offset);
/*
* Test whether an inode is a fast symlink.
*/
......@@ -1407,6 +1410,669 @@ static int ext4_journalled_write_end(struct file *file,
return ret ? ret : copied;
}
/*
 * Delayed allocation stuff
 */

/*
 * Context carried through one delayed-allocation writeback pass: the
 * run of pages currently being gathered and the run of blocks found
 * inside those pages.
 */
struct mpage_da_data {
	struct inode *inode;		/* inode being written back */
	struct buffer_head lbh;		/* extent of blocks */
	unsigned long first_page;	/* first page of the extent of pages */
	unsigned long next_page;	/* page after the last page of the extent */
	get_block_t *get_block;		/* the filesystem's block mapper function */
	struct writeback_control *wbc;	/* passed on to __mpage_writepage() */
};
/*
* mpage_da_submit_io - walks through extent of pages and try to write
* them with __mpage_writepage()
*
* @mpd->inode: inode
* @mpd->first_page: first page of the extent
* @mpd->next_page: page after the last page of the extent
* @mpd->get_block: the filesystem's block mapper function
*
* By the time mpage_da_submit_io() is called we expect all blocks
* to be allocated. this may be wrong if allocation failed.
*
* As pages are already locked by write_cache_pages(), we can't use it
*/
static int mpage_da_submit_io(struct mpage_da_data *mpd)
{
struct address_space *mapping = mpd->inode->i_mapping;
struct mpage_data mpd_pp = {
.bio = NULL,
.last_block_in_bio = 0,
.get_block = mpd->get_block,
.use_writepage = 1,
};
int ret = 0, err, nr_pages, i;
unsigned long index, end;
struct pagevec pvec;
BUG_ON(mpd->next_page <= mpd->first_page);
pagevec_init(&pvec, 0);
index = mpd->first_page;
end = mpd->next_page - 1;
while (index <= end) {
/* XXX: optimize tail */
nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
index = page->index;
if (index > end)
break;
index++;
err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
/*
* In error case, we have to continue because
* remaining pages are still locked
* XXX: unlock and re-dirty them?
*/
if (ret == 0)
ret = err;
}
pagevec_release(&pvec);
}
if (mpd_pp.bio)
mpage_bio_submit(WRITE, mpd_pp.bio);
return ret;
}
/*
 * mpage_put_bnr_to_bhs - hand out real disk block numbers to buffers
 *
 * @mpd->inode - inode to walk through
 * @logical - first logical block to start assignment with
 * @exbh->b_blocknr - first block on a disk
 * @exbh->b_size - amount of space in bytes
 *
 * Visits every buffer head covering the given range.  A buffer that is
 * still marked BH_Delay receives its disk block number and has the
 * flag dropped; a buffer that is already mapped must agree with the
 * expected block number.
 */
static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
				 struct buffer_head *exbh)
{
	struct inode *inode = mpd->inode;
	struct address_space *mapping = inode->i_mapping;
	/* log2 of the number of blocks per page */
	unsigned int shift = PAGE_CACHE_SHIFT - inode->i_blkbits;
	int blocks = exbh->b_size >> inode->i_blkbits;
	sector_t disk_blk = exbh->b_blocknr;
	sector_t limit = logical + blocks;	/* one past the last block */
	sector_t cur_logical;
	struct buffer_head *head, *bh;
	unsigned long pg, last_pg;
	struct pagevec pvec;
	int found, n;

	pg = logical >> shift;
	last_pg = (logical + blocks - 1) >> shift;
	/* first block of the first page; may lie before @logical */
	cur_logical = pg << shift;

	pagevec_init(&pvec, 0);

	while (pg <= last_pg) {
		/* XXX: optimize tail */
		found = pagevec_lookup(&pvec, mapping, pg, PAGEVEC_SIZE);
		if (found == 0)
			break;

		for (n = 0; n < found; n++) {
			struct page *page = pvec.pages[n];

			pg = page->index;
			if (pg > last_pg)
				break;
			pg++;

			BUG_ON(!PageLocked(page));
			BUG_ON(PageWriteback(page));
			BUG_ON(!page_has_buffers(page));

			head = page_buffers(page);
			bh = head;

			/* skip blocks out of the range */
			while (cur_logical < logical) {
				cur_logical++;
				bh = bh->b_this_page;
				if (bh == head)
					break;
			}

			/* assign block numbers up to the end of the range */
			while (cur_logical < limit) {
				if (buffer_delay(bh)) {
					bh->b_blocknr = disk_blk;
					clear_buffer_delay(bh);
				} else if (buffer_mapped(bh)) {
					BUG_ON(bh->b_blocknr != disk_blk);
				}

				cur_logical++;
				disk_blk++;
				bh = bh->b_this_page;
				if (bh == head)
					break;
			}
		}
		pagevec_release(&pvec);
	}
}
/*
* __unmap_underlying_blocks - just a helper function to unmap
* set of blocks described by @bh
*/
static inline void __unmap_underlying_blocks(struct inode *inode,
struct buffer_head *bh)
{
struct block_device *bdev = inode->i_sb->s_bdev;
int blocks, i;
blocks = bh->b_size >> inode->i_blkbits;
for (i = 0; i < blocks; i++)
unmap_underlying_metadata(bdev, bh->b_blocknr + i);
}
/*
* mpage_da_map_blocks - go through given space
*
* @mpd->lbh - bh describing space
* @mpd->get_block - the filesystem's block mapper function
*
* The function skips space we know is already mapped to disk blocks.
*
* The function ignores errors ->get_block() returns, thus real
* error handling is postponed to __mpage_writepage()
*/
static void mpage_da_map_blocks(struct mpage_da_data *mpd)
{
struct buffer_head *lbh = &mpd->lbh;
int err = 0, remain = lbh->b_size;
sector_t next = lbh->b_blocknr;
struct buffer_head new;
/*
* We consider only non-mapped and non-allocated blocks
*/
if (buffer_mapped(lbh) && !buffer_delay(lbh))
return;
while (remain) {
new.b_state = lbh->b_state;
new.b_blocknr = 0;
new.b_size = remain;
err = mpd->get_block(mpd->inode, next, &new, 1);
if (err) {
/*
* Rather than implement own error handling
* here, we just leave remaining blocks
* unallocated and try again with ->writepage()
*/
break;
}
BUG_ON(new.b_size == 0);
if (buffer_new(&new))
__unmap_underlying_blocks(mpd->inode, &new);
/*
* If blocks are delayed marked, we need to
* put actual blocknr and drop delayed bit
*/
if (buffer_delay(lbh))
mpage_put_bnr_to_bhs(mpd, next, &new);
/* go for the remaining blocks */
next += new.b_size >> mpd->inode->i_blkbits;
remain -= new.b_size;
}
}
/* buffer state bits that must match for blocks to share one extent */
#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))

/*
 * mpage_add_bh_to_extent - try to add one more block to extent of blocks
 *
 * @mpd->lbh - extent of blocks
 * @logical - logical number of the block in the file
 * @bh - bh of the block (used to access block's state)
 *
 * Collects contiguous blocks that are in the same state.  A block that
 * cannot be merged causes the current extent to be mapped via
 * mpage_da_map_blocks() and a new extent to be started at that block.
 */
static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
				   sector_t logical, struct buffer_head *bh)
{
	struct buffer_head *lbh = &mpd->lbh;

	if (lbh->b_size != 0) {
		/* logical block right behind the current extent */
		sector_t next = lbh->b_blocknr +
				(lbh->b_size >> mpd->inode->i_blkbits);

		/*
		 * Can we merge the block to our big extent?
		 */
		if (logical == next &&
		    (bh->b_state & BH_FLAGS) == lbh->b_state) {
			lbh->b_size += bh->b_size;
			return;
		}

		/*
		 * We couldn't merge the block to our extent, so we
		 * need to flush current extent and start new one
		 */
		mpage_da_map_blocks(mpd);
	}

	/*
	 * Either the very first block or the first block after a flush:
	 * (re)start the extent here
	 */
	lbh->b_blocknr = logical;
	lbh->b_size = bh->b_size;
	lbh->b_state = bh->b_state & BH_FLAGS;
}
/*
 * __mpage_da_writepage - finds extent of pages and blocks
 *
 * @page: page to consider
 * @wbc: not used, we just follow rules
 * @data: context (a struct mpage_da_data)
 *
 * Grows the current extent of pages by @page when it directly follows
 * the previous one; otherwise the gathered extent is mapped and
 * submitted first and a new one is started.  The page's dirty blocks
 * are then fed to mpage_add_bh_to_extent().  Always returns 0.
 */
static int __mpage_da_writepage(struct page *page,
				struct writeback_control *wbc, void *data)
{
	struct mpage_da_data *mpd = data;
	struct inode *inode = mpd->inode;
	struct buffer_head *bh, *head, fake;
	sector_t logical;

	/*
	 * Can we merge this page to current extent?
	 */
	if (mpd->next_page != page->index) {
		/*
		 * Nope, we can't. So, we map non-allocated blocks
		 * and start IO on them using __mpage_writepage()
		 */
		if (mpd->next_page != mpd->first_page) {
			mpage_da_map_blocks(mpd);
			mpage_da_submit_io(mpd);
		}

		/*
		 * Start next extent of blocks ...
		 */
		mpd->lbh.b_blocknr = 0;
		mpd->lbh.b_state = 0;
		mpd->lbh.b_size = 0;

		/*
		 * ... and pages
		 */
		mpd->first_page = page->index;
	}

	mpd->next_page = page->index + 1;
	logical = (sector_t) page->index <<
		  (PAGE_CACHE_SHIFT - inode->i_blkbits);

	if (!page_has_buffers(page)) {
		/*
		 * There are no buffer heads attached yet (mmap?);
		 * we treat the page as full of dirty blocks
		 */
		fake.b_size = PAGE_CACHE_SIZE;
		fake.b_state = 0;
		set_buffer_dirty(&fake);
		set_buffer_uptodate(&fake);
		mpage_add_bh_to_extent(mpd, logical, &fake);
		return 0;
	}

	/*
	 * Page with regular buffer heads, just add all dirty ones
	 */
	head = page_buffers(page);
	bh = head;
	do {
		BUG_ON(buffer_locked(bh));
		if (buffer_dirty(bh))
			mpage_add_bh_to_extent(mpd, logical, bh);
		logical++;
		bh = bh->b_this_page;
	} while (bh != head);

	return 0;
}
/*
 * mpage_da_writepages - walk the list of dirty pages of the given
 * address space, allocate non-allocated blocks, map newly-allocated
 * blocks to existing bhs and issue IO on them
 *
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @get_block: the filesystem's block mapper function.
 *
 * This is a library function, which implements the writepages()
 * address_space_operation.  Without a @get_block it simply defers to
 * generic_writepages().
 *
 * To avoid duplicating the logic that deals with partial pages,
 * multiple bios per page and so on, we only find the non-allocated
 * blocks, allocate them with as few ->get_block() calls as possible
 * and then re-use __mpage_writepage().
 *
 * It's important that __mpage_writepage() is called only once for each
 * involved page, otherwise we'd have to implement more complicated
 * logic to deal with pages w/o PG_lock or w/ PG_writeback and so on.
 *
 * See comments to mpage_writepages()
 *
 * Returns the result of write_cache_pages().
 */
static int mpage_da_writepages(struct address_space *mapping,
			       struct writeback_control *wbc,
			       get_block_t get_block)
{
	struct mpage_da_data mpd;
	int rc;

	if (!get_block)
		return generic_writepages(mapping, wbc);

	/* start with an empty extent of pages and of blocks */
	mpd.inode = mapping->host;
	mpd.wbc = wbc;
	mpd.get_block = get_block;
	mpd.first_page = 0;
	mpd.next_page = 0;
	mpd.lbh.b_blocknr = 0;
	mpd.lbh.b_state = 0;
	mpd.lbh.b_size = 0;

	rc = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);

	/*
	 * Handle last extent of pages
	 */
	if (mpd.next_page != mpd.first_page) {
		mpage_da_map_blocks(&mpd);
		mpage_da_submit_io(&mpd);
	}

	return rc;
}
/*
 * This is a special callback for ->write_begin() only.
 * Its intention is to return a mapped block or to reserve space.
 *
 * Returns 0 on success, or the negative value reported by
 * ext4_get_blocks_wrap().
 */
static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
				  struct buffer_head *bh_result, int create)
{
	int found;

	BUG_ON(create == 0);
	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);

	/*
	 * First, we need to know whether the block is allocated already.
	 * Preallocated blocks are unmapped but should be treated
	 * the same as allocated blocks.
	 */
	found = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0);
	if (found < 0)
		return found;

	if (found > 0) {
		/* already on disk: report how much was found */
		bh_result->b_size = (found << inode->i_blkbits);
		return 0;
	}

	/* the block isn't allocated yet, let's reserve space */
	/* XXX: call reservation here */
	/*
	 * XXX: __block_prepare_write() unmaps passed block,
	 * is it OK?
	 */
	map_bh(bh_result, inode->i_sb, 0);
	set_buffer_new(bh_result);
	set_buffer_delay(bh_result);
	return 0;
}
/*
 * Block mapper used at writeback time: maps (and, when @create is set,
 * allocates under a journal handle) up to bh_result->b_size worth of
 * blocks starting at @iblock, then pulls i_disksize forward to cover
 * them, capped at i_size.
 *
 * Returns 0 when blocks were mapped; otherwise the zero or negative
 * result of ext4_get_blocks_wrap(), or the error from starting the
 * journal handle.
 */
static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
				   struct buffer_head *bh_result, int create)
{
	int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
	struct ext4_inode_info *ei = EXT4_I(inode);
	handle_t *handle = NULL;
	loff_t new_disksize;

	if (create) {
		handle = ext4_journal_start(inode, needed_blocks);
		if (IS_ERR(handle))
			return PTR_ERR(handle);
	}

	ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
				   bh_result, create, 0);
	if (ret <= 0)
		goto stop;

	bh_result->b_size = (ret << inode->i_blkbits);

	/*
	 * Update on-disk size along with block allocation
	 * we don't use 'extend_disksize' as size may change
	 * within already allocated block -bzzz
	 */
	new_disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
	if (new_disksize > i_size_read(inode))
		new_disksize = i_size_read(inode);

	if (new_disksize > ei->i_disksize) {
		/*
		 * XXX: replace with spinlock if seen contended -bzzz
		 */
		down_write(&ei->i_data_sem);
		/* re-check now that the semaphore is held */
		if (new_disksize > ei->i_disksize)
			ei->i_disksize = new_disksize;
		up_write(&ei->i_data_sem);

		if (ei->i_disksize == new_disksize) {
			/* the !create path has no handle yet */
			if (handle == NULL)
				handle = ext4_journal_start(inode, 1);
			if (!IS_ERR(handle))
				ext4_mark_inode_dirty(handle, inode);
		}
	}
	ret = 0;

stop:
	if (handle && !IS_ERR(handle))
		ext4_journal_stop(handle);
	return ret;
}
/*
 * FIXME!! only support data=writeback mode
 *
 * Writes one page under its own journal handle.  If the caller is
 * already running a handle, or a new one cannot be started, the page
 * is redirtied and unlocked instead; the return value is then 0 or the
 * journal-start error respectively.
 */
static int ext4_da_writepage(struct page *page,
			     struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	handle_t *handle = NULL;
	int stop_err;
	int ret = 0;

	if (ext4_journal_current_handle())
		goto redirty;

	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto redirty;
	}

	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
		ret = nobh_writepage(page, ext4_get_block, wbc);
	else
		ret = block_write_full_page(page, ext4_get_block, wbc);

	/* the write went fine: bring i_disksize up to i_size */
	if (!ret && inode->i_size > EXT4_I(inode)->i_disksize) {
		EXT4_I(inode)->i_disksize = inode->i_size;
		ext4_mark_inode_dirty(handle, inode);
	}

	/* a write error takes precedence over a journal-stop error */
	stop_err = ext4_journal_stop(handle);
	if (!ret)
		ret = stop_err;
	return ret;

redirty:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return ret;
}
/*
 * Writepages entry point for the delayed-allocation path: hands the
 * mapping to mpage_da_writepages() with ext4_da_get_block_write() as
 * the block mapper and returns its result.
 * NOTE(review): presumably installed as ->writepages(); the operations
 * table is not visible here - confirm.
 */
static int ext4_da_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
return mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
}
/*
 * ext4_da_write_begin - prepare a page for a delayed-allocation write
 *
 * Starts a one-credit journal handle, grabs the page cache page that
 * covers @pos and lets block_write_begin() prepare it with
 * ext4_da_get_block_prep() as the block mapper.
 *
 * Returns the result of block_write_begin() on the normal path, the
 * journal-start error if no handle could be obtained, or -ENOMEM if
 * the page could not be grabbed.  On every failure path the handle is
 * stopped (and the page, if any, unlocked and released) before
 * returning.  On success the handle is left running and the page is
 * handed back through @pagep.
 * NOTE(review): presumably ext4_da_write_end() stops the handle - its
 * body is not visible here, confirm.
 */
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
				loff_t pos, unsigned len, unsigned flags,
				struct page **pagep, void **fsdata)
{
	struct inode *inode = mapping->host;
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
	struct page *page;
	handle_t *handle;
	int ret;

	/*
	 * With delayed allocation, we don't log the i_disksize update
	 * if there is delayed block allocation. But we still need
	 * to journal the i_disksize update if the write goes to the end
	 * of a file which has an already mapped buffer.
	 */
	handle = ext4_journal_start(inode, 1);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}

	page = __grab_cache_page(mapping, index);
	if (!page) {
		/* the handle started above must not be leaked */
		ext4_journal_stop(handle);
		ret = -ENOMEM;
		goto out;
	}
	*pagep = page;

	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
				ext4_da_get_block_prep);
	if (ret < 0) {
		unlock_page(page);
		ext4_journal_stop(handle);
		page_cache_release(page);
	}
out:
	return ret;
}
/*
 * Returns non-zero for a buffer that has no final block behind it yet:
 * it is either not mapped at all or still marked delayed.  @handle is
 * not used.
 */
static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
{
	/* i.e. anything but "mapped and not delayed" */
	return !(buffer_mapped(bh) && !buffer_delay(bh));
}
static int ext4_da_write_end(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;