fs: introduce write_begin, write_end, and perform_write aops

These are intended to replace prepare_write and commit_write with more
flexible alternatives that are also able to avoid the buffered write
deadlock problems efficiently (which prepare_write is unable to do).

[mark.fasheh@oracle.com: API design contributions, code review and fixes]
[akpm@linux-foundation.org: various fixes]
[dmonakhov@sw.ru: new aop block_write_begin fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Dmitriy Monakhov <dmonakhov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Nick Piggin 2007-10-16 01:25:01 -07:00 committed by Linus Torvalds
parent 637aff46f9
commit afddba49d1
11 changed files with 575 additions and 206 deletions

View file

@ -1770,6 +1770,48 @@ recover:
goto done;
}
/*
* If a page has any new buffers, zero them out here, and mark them uptodate
* and dirty so they'll be written out (in order to prevent uninitialised
* block data from leaking). And clear the new bit.
*/
void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
{
unsigned int block_start, block_end;
struct buffer_head *head, *bh;
BUG_ON(!PageLocked(page));
if (!page_has_buffers(page))
return;
bh = head = page_buffers(page);
block_start = 0;
do {
block_end = block_start + bh->b_size;
if (buffer_new(bh)) {
if (block_end > from && block_start < to) {
if (!PageUptodate(page)) {
unsigned start, size;
start = max(from, block_start);
size = min(to, block_end) - start;
zero_user_page(page, start, size, KM_USER0);
set_buffer_uptodate(bh);
}
clear_buffer_new(bh);
mark_buffer_dirty(bh);
}
}
block_start = block_end;
bh = bh->b_this_page;
} while (bh != head);
}
EXPORT_SYMBOL(page_zero_new_buffers);
static int __block_prepare_write(struct inode *inode, struct page *page,
unsigned from, unsigned to, get_block_t *get_block)
{
@ -1854,38 +1896,8 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
if (!buffer_uptodate(*wait_bh))
err = -EIO;
}
if (!err) {
bh = head;
do {
if (buffer_new(bh))
clear_buffer_new(bh);
} while ((bh = bh->b_this_page) != head);
return 0;
}
/* Error case: */
/*
* Zero out any newly allocated blocks to avoid exposing stale
* data. If BH_New is set, we know that the block was newly
* allocated in the above loop.
*/
bh = head;
block_start = 0;
do {
block_end = block_start+blocksize;
if (block_end <= from)
goto next_bh;
if (block_start >= to)
break;
if (buffer_new(bh)) {
clear_buffer_new(bh);
zero_user_page(page, block_start, bh->b_size, KM_USER0);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
}
next_bh:
block_start = block_end;
bh = bh->b_this_page;
} while (bh != head);
if (unlikely(err))
page_zero_new_buffers(page, from, to);
return err;
}
@ -1910,6 +1922,7 @@ static int __block_commit_write(struct inode *inode, struct page *page,
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
}
clear_buffer_new(bh);
}
/*
@ -1923,6 +1936,130 @@ static int __block_commit_write(struct inode *inode, struct page *page,
return 0;
}
/*
* block_write_begin takes care of the basic task of block allocation and
* bringing partial write blocks uptodate first.
*
* If *pagep is not NULL, then block_write_begin uses the locked page
* at *pagep rather than allocating its own. In this case, the page will
* not be unlocked or deallocated on failure.
*/
int block_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
get_block_t *get_block)
{
struct inode *inode = mapping->host;
int status = 0;
struct page *page;
pgoff_t index;
unsigned start, end;
int ownpage = 0;
index = pos >> PAGE_CACHE_SHIFT;
start = pos & (PAGE_CACHE_SIZE - 1);
end = start + len;
page = *pagep;
if (page == NULL) {
ownpage = 1;
page = __grab_cache_page(mapping, index);
if (!page) {
status = -ENOMEM;
goto out;
}
*pagep = page;
} else
BUG_ON(!PageLocked(page));
status = __block_prepare_write(inode, page, start, end, get_block);
if (unlikely(status)) {
ClearPageUptodate(page);
if (ownpage) {
unlock_page(page);
page_cache_release(page);
*pagep = NULL;
/*
* prepare_write() may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
* i_size_read because we hold i_mutex.
*/
if (pos + len > inode->i_size)
vmtruncate(inode, inode->i_size);
}
goto out;
}
out:
return status;
}
EXPORT_SYMBOL(block_write_begin);
int block_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
unsigned start;
start = pos & (PAGE_CACHE_SIZE - 1);
if (unlikely(copied < len)) {
/*
* The buffers that were written will now be uptodate, so we
* don't have to worry about a readpage reading them and
* overwriting a partial write. However if we have encountered
* a short write and only partially written into a buffer, it
* will not be marked uptodate, so a readpage might come in and
* destroy our partial write.
*
* Do the simplest thing, and just treat any short write to a
* non uptodate page as a zero-length write, and force the
* caller to redo the whole thing.
*/
if (!PageUptodate(page))
copied = 0;
page_zero_new_buffers(page, start+copied, start+len);
}
flush_dcache_page(page);
/* This could be a short (even 0-length) commit */
__block_commit_write(inode, page, start, start+copied);
return copied;
}
EXPORT_SYMBOL(block_write_end);
int generic_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
/*
* No need to use i_size_read() here, the i_size
* cannot change under us because we hold i_mutex.
*
* But it's important to update i_size while still holding page lock:
* page writeout could otherwise come in and zero beyond i_size.
*/
if (pos+copied > inode->i_size) {
i_size_write(inode, pos+copied);
mark_inode_dirty(inode);
}
unlock_page(page);
page_cache_release(page);
return copied;
}
EXPORT_SYMBOL(generic_write_end);
/*
* Generic "read page" function for block devices that have the normal
* get_block functionality. This is most of the block device filesystems.