Commit f19d5870 authored by Tao Ma, committed by Theodore Ts'o
Browse files

ext4: add normal write support for inline data



For a normal write case (not journalled write, not delayed
allocation), we write to the inline if the file is small and convert
it to an extent based file when the write is larger than the max
inline size.
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
parent 46c7f254
...@@ -2018,8 +2018,19 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ...@@ -2018,8 +2018,19 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *,
ext4_lblk_t, int, int *); ext4_lblk_t, int, int *);
struct buffer_head *ext4_bread(handle_t *, struct inode *, struct buffer_head *ext4_bread(handle_t *, struct inode *,
ext4_lblk_t, int, int *); ext4_lblk_t, int, int *);
int ext4_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock, int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create); struct buffer_head *bh_result, int create);
int ext4_walk_page_buffers(handle_t *handle,
struct buffer_head *head,
unsigned from,
unsigned to,
int *partial,
int (*fn)(handle_t *handle,
struct buffer_head *bh));
int do_journal_get_write_access(handle_t *handle,
struct buffer_head *bh);
extern struct inode *ext4_iget(struct super_block *, unsigned long); extern struct inode *ext4_iget(struct super_block *, unsigned long);
extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_write_inode(struct inode *, struct writeback_control *);
......
...@@ -42,6 +42,7 @@ ...@@ -42,6 +42,7 @@
#include <linux/fiemap.h> #include <linux/fiemap.h>
#include "ext4_jbd2.h" #include "ext4_jbd2.h"
#include "ext4_extents.h" #include "ext4_extents.h"
#include "xattr.h"
#include <trace/events/ext4.h> #include <trace/events/ext4.h>
...@@ -2310,7 +2311,13 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, ...@@ -2310,7 +2311,13 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
{ {
int index; int index;
int depth = ext_depth(inode); int depth;
/* If we are converting the inline data, only one is needed here. */
if (ext4_has_inline_data(inode))
return 1;
depth = ext_depth(inode);
if (chunk) if (chunk)
index = depth * 2; index = depth * 2;
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "ext4_jbd2.h" #include "ext4_jbd2.h"
#include "ext4.h" #include "ext4.h"
#include "xattr.h" #include "xattr.h"
#include "truncate.h"
#define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_XATTR_SYSTEM_DATA "data"
#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS))
...@@ -515,6 +516,238 @@ int ext4_readpage_inline(struct inode *inode, struct page *page) ...@@ -515,6 +516,238 @@ int ext4_readpage_inline(struct inode *inode, struct page *page)
return ret >= 0 ? 0 : ret; return ret >= 0 ? 0 : ret;
} }
/*
 * Convert an inline-data inode to a normal (block/extent mapped) file:
 * copy the inline data into page 0 of the page cache, destroy the
 * inline data in the inode, and allocate blocks for that page, so a
 * write larger than the max inline size can go through the regular
 * write path.
 *
 * Returns 0 on success (or when another task already converted the
 * inode) and a negative errno on failure.
 */
static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
					      struct inode *inode,
					      unsigned flags)
{
	int ret, needed_blocks;
	handle_t *handle = NULL;
	int retries = 0, sem_held = 0;
	struct page *page = NULL;
	unsigned from, to;
	struct ext4_iloc iloc;

	if (!ext4_has_inline_data(inode)) {
		/*
		 * Clear the flag so that no new write
		 * will trap here again.
		 */
		ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
		return 0;
	}

	needed_blocks = ext4_writepage_trans_blocks(inode);

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

retry:
	handle = ext4_journal_start(inode, needed_blocks);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		handle = NULL;
		goto out;
	}

	/* We cannot recurse into the filesystem as the transaction is already
	 * started */
	flags |= AOP_FLAG_NOFS;

	page = grab_cache_page_write_begin(mapping, 0, flags);
	if (!page) {
		ret = -ENOMEM;
		goto out;
	}

	/* Inline data lives in the inode's xattr space; serialize access. */
	down_write(&EXT4_I(inode)->xattr_sem);
	sem_held = 1;
	/* If someone has already done this for us, just exit. */
	if (!ext4_has_inline_data(inode)) {
		ret = 0;
		goto out;
	}

	from = 0;
	to = ext4_get_inline_size(inode);

	/* Populate the page with the inline data before destroying it. */
	if (!PageUptodate(page)) {
		ret = ext4_read_inline_page(inode, page);
		if (ret < 0)
			goto out;
	}

	ret = ext4_destroy_inline_data_nolock(handle, inode);
	if (ret)
		goto out;

	if (ext4_should_dioread_nolock(inode))
		ret = __block_write_begin(page, from, to, ext4_get_block_write);
	else
		ret = __block_write_begin(page, from, to, ext4_get_block);

	if (!ret && ext4_should_journal_data(inode)) {
		ret = ext4_walk_page_buffers(handle, page_buffers(page),
					     from, to, NULL,
					     do_journal_get_write_access);
	}

	if (ret) {
		unlock_page(page);
		page_cache_release(page);
		/*
		 * Clear the pointer so neither a failed retry nor the
		 * fall-through to block_commit_write()/out: touches the
		 * page we just released (fixes a use-after-free /
		 * double page_cache_release on this error path).
		 */
		page = NULL;
		ext4_orphan_add(handle, inode);
		up_write(&EXT4_I(inode)->xattr_sem);
		sem_held = 0;
		ext4_journal_stop(handle);
		handle = NULL;
		ext4_truncate_failed_write(inode);
		/*
		 * If truncate failed early the inode might
		 * still be on the orphan list; we need to
		 * make sure the inode is removed from the
		 * orphan list in that case.
		 */
		if (inode->i_nlink)
			ext4_orphan_del(NULL, inode);
	}

	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;

	if (page)
		block_commit_write(page, from, to);
out:
	if (page) {
		unlock_page(page);
		page_cache_release(page);
	}
	if (sem_held)
		up_write(&EXT4_I(inode)->xattr_sem);
	if (handle)
		ext4_journal_stop(handle);
	brelse(iloc.bh);
	return ret;
}
/*
 * Try to write data directly into the inode's inline area.
 *
 * If the inode holds inline data and the write (pos + len) still fits,
 * reserve journal credits, read the existing inline data into page 0,
 * set *pagep and return 1 so the caller skips the normal block-based
 * write_begin path; the started handle is deliberately left running
 * and is picked up later by the write_end path.
 * If the write no longer fits, convert the inode to an extent-based
 * file and return the conversion result (0 on success, negative errno
 * on error).  A return of 0 means: proceed with the normal write path.
 */
int ext4_try_to_write_inline_data(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
unsigned flags,
struct page **pagep)
{
int ret;
handle_t *handle;
struct page *page;
struct ext4_iloc iloc;
if (pos + len > ext4_get_max_inline_size(inode))
goto convert;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
return ret;
/*
 * The possible write could happen in the inode,
 * so try to reserve the space in inode first.
 */
handle = ext4_journal_start(inode, 1);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
handle = NULL;
goto out;
}
ret = ext4_prepare_inline_data(handle, inode, pos + len);
if (ret && ret != -ENOSPC)
goto out;
/* We don't have space in inline inode, so convert it to extent. */
if (ret == -ENOSPC) {
ext4_journal_stop(handle);
brelse(iloc.bh);
goto convert;
}
/* No recursing into the fs: the transaction is already running. */
flags |= AOP_FLAG_NOFS;
page = grab_cache_page_write_begin(mapping, 0, flags);
if (!page) {
ret = -ENOMEM;
goto out;
}
*pagep = page;
down_read(&EXT4_I(inode)->xattr_sem);
/* Inline data vanished under us (raced with a conversion); tell the
 * caller to fall back to the normal write path. */
if (!ext4_has_inline_data(inode)) {
ret = 0;
unlock_page(page);
page_cache_release(page);
goto out_up_read;
}
if (!PageUptodate(page)) {
ret = ext4_read_inline_page(inode, page);
if (ret < 0)
goto out_up_read;
}
/* Success: keep the handle open for the write_end path. */
ret = 1;
handle = NULL;
out_up_read:
up_read(&EXT4_I(inode)->xattr_sem);
out:
if (handle)
ext4_journal_stop(handle);
brelse(iloc.bh);
return ret;
convert:
return ext4_convert_inline_data_to_extent(mapping,
inode, flags);
}
/*
 * write_end counterpart for inline files: copy the data that the
 * generic write path placed in the page back into the inode's inline
 * area.
 *
 * Returns the number of bytes accounted as copied; returns 0 (making
 * the caller retry the write) on a short copy into a non-uptodate
 * page or when the inode location cannot be read.
 */
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
unsigned copied, struct page *page)
{
int ret;
void *kaddr;
struct ext4_iloc iloc;
/* A short copy into a page that was never uptodate may have left a
 * gap of garbage; report 0 copied so the write is retried. */
if (unlikely(copied < len)) {
if (!PageUptodate(page)) {
copied = 0;
goto out;
}
}
ret = ext4_get_inode_loc(inode, &iloc);
if (ret) {
ext4_std_error(inode->i_sb, ret);
copied = 0;
goto out;
}
down_write(&EXT4_I(inode)->xattr_sem);
BUG_ON(!ext4_has_inline_data(inode));
kaddr = kmap_atomic(page);
ext4_write_inline_data(inode, &iloc, kaddr, pos, len);
kunmap_atomic(kaddr);
SetPageUptodate(page);
/* clear page dirty so that writepages wouldn't work for us. */
ClearPageDirty(page);
up_write(&EXT4_I(inode)->xattr_sem);
brelse(iloc.bh);
out:
return copied;
}
int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
{ {
int ret; int ret;
......
...@@ -770,13 +770,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, ...@@ -770,13 +770,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
return NULL; return NULL;
} }
static int walk_page_buffers(handle_t *handle, int ext4_walk_page_buffers(handle_t *handle,
struct buffer_head *head, struct buffer_head *head,
unsigned from, unsigned from,
unsigned to, unsigned to,
int *partial, int *partial,
int (*fn)(handle_t *handle, int (*fn)(handle_t *handle,
struct buffer_head *bh)) struct buffer_head *bh))
{ {
struct buffer_head *bh; struct buffer_head *bh;
unsigned block_start, block_end; unsigned block_start, block_end;
...@@ -826,8 +826,8 @@ static int walk_page_buffers(handle_t *handle, ...@@ -826,8 +826,8 @@ static int walk_page_buffers(handle_t *handle,
* is elevated. We'll still have enough credits for the tiny quotafile * is elevated. We'll still have enough credits for the tiny quotafile
* write. * write.
*/ */
static int do_journal_get_write_access(handle_t *handle, int do_journal_get_write_access(handle_t *handle,
struct buffer_head *bh) struct buffer_head *bh)
{ {
int dirty = buffer_dirty(bh); int dirty = buffer_dirty(bh);
int ret; int ret;
...@@ -850,8 +850,6 @@ static int do_journal_get_write_access(handle_t *handle, ...@@ -850,8 +850,6 @@ static int do_journal_get_write_access(handle_t *handle,
return ret; return ret;
} }
static int ext4_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create); struct buffer_head *bh_result, int create);
static int ext4_write_begin(struct file *file, struct address_space *mapping, static int ext4_write_begin(struct file *file, struct address_space *mapping,
...@@ -876,6 +874,17 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, ...@@ -876,6 +874,17 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
from = pos & (PAGE_CACHE_SIZE - 1); from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len; to = from + len;
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
flags, pagep);
if (ret < 0)
goto out;
if (ret == 1) {
ret = 0;
goto out;
}
}
retry: retry:
handle = ext4_journal_start(inode, needed_blocks); handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) { if (IS_ERR(handle)) {
...@@ -893,6 +902,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, ...@@ -893,6 +902,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
ret = -ENOMEM; ret = -ENOMEM;
goto out; goto out;
} }
*pagep = page; *pagep = page;
if (ext4_should_dioread_nolock(inode)) if (ext4_should_dioread_nolock(inode))
...@@ -901,8 +911,9 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, ...@@ -901,8 +911,9 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
ret = __block_write_begin(page, pos, len, ext4_get_block); ret = __block_write_begin(page, pos, len, ext4_get_block);
if (!ret && ext4_should_journal_data(inode)) { if (!ret && ext4_should_journal_data(inode)) {
ret = walk_page_buffers(handle, page_buffers(page), ret = ext4_walk_page_buffers(handle, page_buffers(page),
from, to, NULL, do_journal_get_write_access); from, to, NULL,
do_journal_get_write_access);
} }
if (ret) { if (ret) {
...@@ -957,7 +968,12 @@ static int ext4_generic_write_end(struct file *file, ...@@ -957,7 +968,12 @@ static int ext4_generic_write_end(struct file *file,
struct inode *inode = mapping->host; struct inode *inode = mapping->host;
handle_t *handle = ext4_journal_current_handle(); handle_t *handle = ext4_journal_current_handle();
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); if (ext4_has_inline_data(inode))
copied = ext4_write_inline_data_end(inode, pos, len,
copied, page);
else
copied = block_write_end(file, mapping, pos,
len, copied, page, fsdata);
/* /*
* No need to use i_size_read() here, the i_size * No need to use i_size_read() here, the i_size
...@@ -1114,8 +1130,8 @@ static int ext4_journalled_write_end(struct file *file, ...@@ -1114,8 +1130,8 @@ static int ext4_journalled_write_end(struct file *file,
page_zero_new_buffers(page, from+copied, to); page_zero_new_buffers(page, from+copied, to);
} }
ret = walk_page_buffers(handle, page_buffers(page), from, ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
to, &partial, write_end_fn); to, &partial, write_end_fn);
if (!partial) if (!partial)
SetPageUptodate(page); SetPageUptodate(page);
new_i_size = pos + copied; new_i_size = pos + copied;
...@@ -1903,7 +1919,7 @@ static int __ext4_journalled_writepage(struct page *page, ...@@ -1903,7 +1919,7 @@ static int __ext4_journalled_writepage(struct page *page,
ClearPageChecked(page); ClearPageChecked(page);
page_bufs = page_buffers(page); page_bufs = page_buffers(page);
BUG_ON(!page_bufs); BUG_ON(!page_bufs);
walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
/* As soon as we unlock the page, it can go away, but we have /* As soon as we unlock the page, it can go away, but we have
* references to buffers so we are safe */ * references to buffers so we are safe */
unlock_page(page); unlock_page(page);
...@@ -1916,11 +1932,11 @@ static int __ext4_journalled_writepage(struct page *page, ...@@ -1916,11 +1932,11 @@ static int __ext4_journalled_writepage(struct page *page,
BUG_ON(!ext4_handle_valid(handle)); BUG_ON(!ext4_handle_valid(handle));
ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
do_journal_get_write_access); do_journal_get_write_access);
err = walk_page_buffers(handle, page_bufs, 0, len, NULL, err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
write_end_fn); write_end_fn);
if (ret == 0) if (ret == 0)
ret = err; ret = err;
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
...@@ -1928,7 +1944,7 @@ static int __ext4_journalled_writepage(struct page *page, ...@@ -1928,7 +1944,7 @@ static int __ext4_journalled_writepage(struct page *page,
if (!ret) if (!ret)
ret = err; ret = err;
walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
ext4_set_inode_state(inode, EXT4_STATE_JDATA); ext4_set_inode_state(inode, EXT4_STATE_JDATA);
out: out:
return ret; return ret;
...@@ -2007,8 +2023,8 @@ static int ext4_writepage(struct page *page, ...@@ -2007,8 +2023,8 @@ static int ext4_writepage(struct page *page,
commit_write = 1; commit_write = 1;
} }
page_bufs = page_buffers(page); page_bufs = page_buffers(page);
if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
ext4_bh_delay_or_unwritten)) { ext4_bh_delay_or_unwritten)) {
/* /*
* We don't want to do block allocation, so redirty * We don't want to do block allocation, so redirty
* the page and return. We may reach here when we do * the page and return. We may reach here when we do
...@@ -2831,7 +2847,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) ...@@ -2831,7 +2847,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
* We allocate an uinitialized extent if blocks haven't been allocated. * We allocate an uinitialized extent if blocks haven't been allocated.
* The extent will be converted to initialized after the IO is complete. * The extent will be converted to initialized after the IO is complete.
*/ */
static int ext4_get_block_write(struct inode *inode, sector_t iblock, int ext4_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create) struct buffer_head *bh_result, int create)
{ {
ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
...@@ -3738,7 +3754,8 @@ static inline void ext4_iget_extra_inode(struct inode *inode, ...@@ -3738,7 +3754,8 @@ static inline void ext4_iget_extra_inode(struct inode *inode,
if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
ext4_set_inode_state(inode, EXT4_STATE_XATTR); ext4_set_inode_state(inode, EXT4_STATE_XATTR);
ext4_find_inline_data_nolock(inode); ext4_find_inline_data_nolock(inode);
} } else
EXT4_I(inode)->i_inline_off = 0;
} }
struct inode *ext4_iget(struct super_block *sb, unsigned long ino) struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
...@@ -3907,17 +3924,19 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ...@@ -3907,17 +3924,19 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_file_acl); ei->i_file_acl);
ret = -EIO; ret = -EIO;
goto bad_inode; goto bad_inode;
} else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { } else if (!ext4_has_inline_data(inode)) {
if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
(S_ISLNK(inode->i_mode) && if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
!ext4_inode_is_fast_symlink(inode))) (S_ISLNK(inode->i_mode) &&
/* Validate extent which is part of inode */ !ext4_inode_is_fast_symlink(inode))))
ret = ext4_ext_check_inode(inode); /* Validate extent which is part of inode */
} else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || ret = ext4_ext_check_inode(inode);
(S_ISLNK(inode->i_mode) && } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
!ext4_inode_is_fast_symlink(inode))) { (S_ISLNK(inode->i_mode) &&
/* Validate block references which are part of inode */ !ext4_inode_is_fast_symlink(inode))) {
ret = ext4_ind_check_inode(inode); /* Validate block references which are part of inode */
ret = ext4_ind_check_inode(inode);
}
} }
if (ret) if (ret)
goto bad_inode; goto bad_inode;
...@@ -4104,9 +4123,10 @@ static int ext4_do_update_inode(handle_t *handle, ...@@ -4104,9 +4123,10 @@ static int ext4_do_update_inode(handle_t *handle,
cpu_to_le32(new_encode_dev(inode->i_rdev)); cpu_to_le32(new_encode_dev(inode->i_rdev));
raw_inode->i_block[2] = 0; raw_inode->i_block[2] = 0;
} }
} else } else if (!ext4_has_inline_data(inode)) {
for (block = 0; block < EXT4_N_BLOCKS; block++) for (block = 0; block < EXT4_N_BLOCKS; block++)
raw_inode->i_block[block] = ei->i_data[block]; raw_inode->i_block[block] = ei->i_data[block];
}
raw_inode->i_disk_version = cpu_to_le32(inode->i_version); raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
if (ei->i_extra_isize) { if (ei->i_extra_isize) {
...@@ -4793,8 +4813,9 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ...@@ -4793,8 +4813,9 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
* journal_start/journal_stop which can block and take a long time * journal_start/journal_stop which can block and take a long time
*/ */
if (page_has_buffers(page)) { if (page_has_buffers(page)) {
if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, if (!ext4_walk_page_buffers(NULL, page_buffers(page),
ext4_bh_unmapped)) { 0, len, NULL,
ext4_bh_unmapped)) {
/* Wait so that we don't change page under IO */ /* Wait so that we don't change page under IO */
wait_on_page_writeback(page); wait_on_page_writeback(page);
ret = VM_FAULT_LOCKED; ret = VM_FAULT_LOCKED;
...@@ -4815,7 +4836,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ...@@ -4815,7 +4836,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
} }
ret = __block_page_mkwrite(vma, vmf, get_block); ret = __block_page_mkwrite(vma, vmf, get_block);
if (!ret && ext4_should_journal_data(inode)) { if (!ret && ext4_should_journal_data(inode)) {
if (walk_page_buffers(handle, page_buffers(page), 0, if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
unlock_page(page); unlock_page(page);
ret = VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS;
......