[Ocfs2-commits] mfasheh commits r2557 -
branches/locking-changes/fs/ocfs2
svn-commits at oss.oracle.com
svn-commits at oss.oracle.com
Wed Aug 31 13:03:10 CDT 2005
Author: mfasheh
Date: 2005-08-31 13:03:07 -0500 (Wed, 31 Aug 2005)
New Revision: 2557
Added:
branches/locking-changes/fs/ocfs2/aops.h
Modified:
branches/locking-changes/fs/ocfs2/Makefile
branches/locking-changes/fs/ocfs2/aops.c
branches/locking-changes/fs/ocfs2/dir.c
branches/locking-changes/fs/ocfs2/file.c
branches/locking-changes/fs/ocfs2/file.h
branches/locking-changes/fs/ocfs2/inode.c
branches/locking-changes/fs/ocfs2/inode.h
branches/locking-changes/fs/ocfs2/journal.c
branches/locking-changes/fs/ocfs2/journal.h
branches/locking-changes/fs/ocfs2/namei.c
branches/locking-changes/fs/ocfs2/ocfs2.h
branches/locking-changes/fs/ocfs2/super.c
Log:
* separate out the extend api some more. all i_size changes happen
completely independently of allocation changes now. More could probably be
done here, but this should suffice as a 1st pass.
* start shoving our cluster locking into the commit_write path now. No
locking update has happened to the upper layers yet.
* as a result of the locking change, we must update i_size in the
ocfs2_commit_write path now. This however makes it trivial for us to
do ordered data writes on extends, via a small journal.[ch] update.
Thanks to ext3 for a small amount of the code used here.
* new strategy for zeroing holes, instead of cont_prepare_write which didn't
give us enough flexibility with locking and journalling. We now manually
zero them up front (but still in a similar fashion) which allows us to do
the right things wrt cluster locking and the journal.
* remove ip_mmu_private and teach ocfs2_get_block to mark blocks as "new"
on similar boundary conditions by sampling i_size.
Modified: branches/locking-changes/fs/ocfs2/Makefile
===================================================================
--- branches/locking-changes/fs/ocfs2/Makefile 2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/Makefile 2005-08-31 18:03:07 UTC (rev 2557)
@@ -67,6 +67,7 @@
ocfs2.h \
buffer_head_io.h \
alloc.h \
+ aops.h \
dcache.h \
dir.h \
dlmglue.h \
Modified: branches/locking-changes/fs/ocfs2/aops.c
===================================================================
--- branches/locking-changes/fs/ocfs2/aops.c 2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/aops.c 2005-08-31 18:03:07 UTC (rev 2557)
@@ -132,8 +132,8 @@
struct buffer_head *bh_result, int create)
{
int err = -EIO;
- u64 vbo = 0;
u64 p_blkno;
+ u64 eof_blkno;
mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
(unsigned long long)iblock, bh_result, create);
@@ -149,8 +149,6 @@
goto bail;
}
- vbo = (u64)iblock << inode->i_sb->s_blocksize_bits;
-
/* this can happen if another node truncs after our extend! */
spin_lock(&OCFS2_I(inode)->ip_lock);
if (iblock >=
@@ -183,22 +181,20 @@
p_blkno, OCFS2_I(inode)->ip_blkno);
}
- if (vbo < OCFS2_I(inode)->ip_mmu_private)
+ eof_blkno = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)) - 1;
+ mlog(0, "Inode %lu, eof_blkno = %"MLFu64"\n", inode->i_ino, eof_blkno);
+ /* We don't support holes, so I/O inside of i_size can't be
+ * marked 'new' */
+ if (iblock <= eof_blkno)
goto bail;
if (!create)
goto bail;
- if (vbo != OCFS2_I(inode)->ip_mmu_private) {
- mlog(ML_ERROR, "Uh-oh, vbo = %"MLFi64", i_size = %lld, "
- "mmu = %lld, inode = %"MLFu64"\n", vbo,
- i_size_read(inode), OCFS2_I(inode)->ip_mmu_private,
- OCFS2_I(inode)->ip_blkno);
- BUG();
- err = -EIO;
- goto bail;
- }
+ mlog_bug_on_msg(iblock != (eof_blkno + 1),
+ "Inode %"MLFu64": I/O past tail of file! (i_size = "
+ "%lld, iblock = %llu)\n", OCFS2_I(inode)->ip_blkno,
+ i_size_read(inode), (unsigned long long) iblock);
set_buffer_new(bh_result);
- OCFS2_I(inode)->ip_mmu_private += inode->i_sb->s_blocksize;
bail:
if (err < 0)
@@ -221,6 +217,10 @@
return ret;
}
+/* Note: Because we don't support holes, our allocation has
+ * already happened (allocation writes zeros to the file data)
+ * so we don't have to worry about ordered writes in
+ * ocfs2_writepage. */
static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
{
int ret;
@@ -234,32 +234,193 @@
return ret;
}
-static int ocfs2_prepare_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
+int ocfs2_prepare_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
{
int ret;
mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
- ret = cont_prepare_write(page, from, to, ocfs2_get_block,
- &(OCFS2_I(page->mapping->host)->ip_mmu_private));
+ ret = block_prepare_write(page, from, to, ocfs2_get_block);
mlog_exit(ret);
return ret;
}
+/* Taken from ext3. We don't necessarily need the full blown
+ * functionality yet, but IMHO it's better to cut and paste the whole
+ * thing so we can avoid introducing our own bugs (and easily pick up
+ * their fixes when they happen) --Mark */
+static int walk_page_buffers( handle_t *handle,
+ struct buffer_head *head,
+ unsigned from,
+ unsigned to,
+ int *partial,
+ int (*fn)( handle_t *handle,
+ struct buffer_head *bh))
+{
+ struct buffer_head *bh;
+ unsigned block_start, block_end;
+ unsigned blocksize = head->b_size;
+ int err, ret = 0;
+ struct buffer_head *next;
+
+ for ( bh = head, block_start = 0;
+ ret == 0 && (bh != head || !block_start);
+ block_start = block_end, bh = next)
+ {
+ next = bh->b_this_page;
+ block_end = block_start + blocksize;
+ if (block_end <= from || block_start >= to) {
+ if (partial && !buffer_uptodate(bh))
+ *partial = 1;
+ continue;
+ }
+ err = (*fn)(handle, bh);
+ if (!ret)
+ ret = err;
+ }
+ return ret;
+}
+
+/* Does the actual work of a commit_write. No cluster locks are
+ * taken. If extending is nonzero then it's assumed that di_bh is non
+ * null and that the caller has the proper cluster locks. */
+int ocfs2_commit_write_nolocks(struct file *file, struct page *page,
+ unsigned from, unsigned to,
+ struct buffer_head *di_bh,
+ unsigned extending)
+{
+ int ret;
+ u64 size;
+ ocfs2_dinode *di = NULL;
+ ocfs2_journal_handle *handle = NULL;
+ struct inode *inode = page->mapping->host;
+ ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (extending) {
+ handle = ocfs2_start_trans(osb, NULL,
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (!handle) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_set_inode_lock_trans(osb->journal, inode);
+
+ /* Only do this on extending writes - we don't
+ * hold the right cluster locks otherwise. */
+ if (ocfs2_should_order_data(inode)) {
+ ret = walk_page_buffers(handle->k_handle,
+ page_buffers(page),
+ from, to, NULL,
+ ocfs2_journal_dirty_data);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ di = (ocfs2_dinode *) di_bh->b_data;
+ /* Mark our buffer early. We'd rather catch this error
+ * up here as opposed to after a successful
+ * commit_write which would require us to set back
+ * inode->i_size. */
+ ret = ocfs2_journal_access(handle, inode, di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ret = generic_commit_write(file, page, from, to);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (extending) {
+ size = (u64) i_size_read(inode);
+ /* ocfs2_mark_inode_dirty is too heavy to use here. */
+ inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+ di->i_size = cpu_to_le64(size);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+
+ ret = ocfs2_journal_dirty(handle, di_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+out:
+ if (handle)
+ ocfs2_commit_trans(handle);
+ return ret;
+}
+
static int ocfs2_commit_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
- int ret;
+ int ret, extending = 0, locklevel = 0;
+ loff_t new_i_size;
+ struct buffer_head *di_bh;
+ struct inode *inode = page->mapping->host;
mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
- ret = generic_commit_write(file, page, from, to);
+ /* NOTE: ocfs2_file_aio_write has ensured that it's safe for
+ * us to sample inode->i_size here without the metadata lock:
+ *
+ * 1) We're currently holding the inode alloc lock, so no
+ * nodes can change it underneath us.
+ *
+ * 2) We've had to take the metadata lock at least once
+ * already to check for extending writes, hence ensuring
+ * that our current copy is also up to date.
+ */
+ new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+ if (new_i_size > i_size_read(inode)) {
+ extending = 1;
+ locklevel = 1;
+ }
+ ret = ocfs2_meta_lock(inode, NULL, &di_bh, locklevel);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_data_lock(inode, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_unlock_meta;
+ }
+
+ ret = ocfs2_commit_write_nolocks(file, page, from, to, di_bh,
+ extending);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_unlock_data;
+ }
+
+ BUG_ON(i_size_read(inode) != new_i_size);
+
+out_unlock_data:
+ ocfs2_data_unlock(inode, 1);
+out_unlock_meta:
+ ocfs2_meta_unlock(inode, locklevel);
+out:
+ if (di_bh)
+ brelse(di_bh);
+
mlog_exit(ret);
-
return ret;
}
Added: branches/locking-changes/fs/ocfs2/aops.h
===================================================================
--- branches/locking-changes/fs/ocfs2/aops.h 2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/aops.h 2005-08-31 18:03:07 UTC (rev 2557)
@@ -0,0 +1,33 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_AOPS_H
+#define OCFS2_AOPS_H
+
+int ocfs2_prepare_write(struct file *file, struct page *page,
+ unsigned from, unsigned to);
+
+int ocfs2_commit_write_nolocks(struct file *file, struct page *page,
+ unsigned from, unsigned to,
+ struct buffer_head *di_bh,
+ unsigned extending);
+
+#endif /* OCFS2_AOPS_H */
Modified: branches/locking-changes/fs/ocfs2/dir.c
===================================================================
--- branches/locking-changes/fs/ocfs2/dir.c 2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/dir.c 2005-08-31 18:03:07 UTC (rev 2557)
@@ -356,9 +356,9 @@
spin_unlock(&OCFS2_I(dir)->ip_lock);
if (extend) {
- status = ocfs2_extend_allocation(OCFS2_SB(sb), dir, 1,
- parent_fe_bh, handle,
- data_ac, meta_ac, NULL);
+ status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1,
+ parent_fe_bh, handle,
+ data_ac, meta_ac, NULL);
BUG_ON(status == -EAGAIN);
if (status < 0) {
mlog_errno(status);
Modified: branches/locking-changes/fs/ocfs2/file.c
===================================================================
--- branches/locking-changes/fs/ocfs2/file.c 2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/file.c 2005-08-31 18:03:07 UTC (rev 2557)
@@ -36,6 +36,7 @@
#include "ocfs2.h"
#include "alloc.h"
+#include "aops.h"
#include "dir.h"
#include "dlmglue.h"
#include "extent_map.h"
@@ -48,12 +49,6 @@
#include "buffer_head_io.h"
-static int ocfs2_zero_extend(struct inode *inode);
-static int ocfs2_orphan_for_truncate(ocfs2_super *osb,
- struct inode *inode,
- struct buffer_head *fe_bh,
- u64 new_i_size);
-
int ocfs2_sync_inode(struct inode *inode)
{
filemap_fdatawrite(inode->i_mapping);
@@ -146,367 +141,15 @@
return (err < 0) ? -EIO : 0;
}
-static void ocfs2_update_inode_size(struct inode *inode,
- u64 new_size)
-{
- i_size_write(inode, new_size);
- inode->i_blocks = ocfs2_align_bytes_to_sectors(new_size);
-}
-
-void ocfs2_file_finish_extension(struct inode *inode,
- loff_t newsize,
- unsigned direct_extend)
-{
- int ret;
-
- mlog(0, "inode %"MLFu64", newsize = %lld, direct_extend = %u\n",
- OCFS2_I(inode)->ip_blkno, (long long)newsize, direct_extend);
-
- ocfs2_update_inode_size(inode, newsize);
-
-#ifdef OCFS2_ORACORE_WORKAROUNDS
- if (direct_extend) {
- /*
- * This leaves dirty data in holes.
- * Caveat Emptor.
- */
- OCFS2_I(inode)->ip_mmu_private = newsize;
- return;
- }
-#endif
-
- /* caller won't overwrite return from g_f_w so we don't return */
- ret = ocfs2_zero_extend(inode);
- if (ret)
- mlog(ML_ERROR, "Unable to pre-zero extension of inode (%d)\n",
- ret);
-}
-
-static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
- const char __user *buf,
- size_t count,
- loff_t pos)
-{
- struct iovec local_iov = { .iov_base = (void __user *)buf,
- .iov_len = count };
- int ret, level;
- ocfs2_super *osb = NULL;
- struct file *filp = iocb->ki_filp;
- struct inode *inode = filp->f_dentry->d_inode;
- int do_direct = 0, extended = 0;
- loff_t newsize, saved_pos;
-
- mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
- (unsigned int)count,
- filp->f_dentry->d_name.len,
- filp->f_dentry->d_name.name);
-
- /* happy write of zero bytes */
- if (count == 0) {
- ret = 0;
- goto bail;
- }
-
- if (!inode) {
- mlog(0, "bad inode\n");
- ret = -EIO;
- goto bail;
- }
-
- osb = OCFS2_SB(inode->i_sb);
-
- down(&inode->i_sem);
-
- /* this ginormous block is in here because it has so many inputs
- * and outputs from this function.. */
- level = !!(filp->f_flags & O_APPEND);
- for(;;) {
- u64 bytes_added;
-
- ret = ocfs2_meta_lock(inode, NULL, NULL, level);
- if (ret < 0) {
- mlog_errno(ret);
- goto bail_i_sem;
- }
-
- /* work on a copy of ppos until we're sure that we won't have
- * to recalculate it due to relocking. */
- if (filp->f_flags & O_APPEND) {
- saved_pos = i_size_read(inode);
- mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
-#ifdef OCFS2_ORACORE_WORKAROUNDS
- if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
- /* ugh, work around some applications which
- * open everything O_DIRECT + O_APPEND and
- * really don't mean to use O_DIRECT. */
- filp->f_flags &= ~O_DIRECT;
- }
-#endif
- } else {
- saved_pos = iocb->ki_pos;
- }
- newsize = count + saved_pos;
-
- mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n",
- saved_pos, newsize, i_size_read(inode));
-
- if (newsize <= i_size_read(inode))
- break;
-
- if (level == 0) {
- ocfs2_meta_unlock(inode, level);
- level = 1;
- continue;
- }
-
- mlog(0, "Writing at EOF, will need more allocation: "
- "i_size=%lld, " "need=%"MLFu64"\n", i_size_read(inode),
- newsize);
-
- /* If we extend AT ALL here then we update our state
- * and continue the write call, regardless of error --
- * this is basically a short write. */
- ret = ocfs2_extend_file(osb, inode, newsize, &bytes_added);
- if (ret < 0 &&
- ret != -ERESTARTSYS && ret != -EINTR && ret != -ENOSPC) {
- mlog_errno(ret);
- mlog(ML_ERROR, "Failed to extend inode %"MLFu64
- " from %lld to %"MLFu64, OCFS2_I(inode)->ip_blkno,
- i_size_read(inode), newsize);
- }
- if (ret < 0 && (!bytes_added))
- goto bail_meta_unlock;
-
- extended = 1;
-
- /* We need to recalulate newsize and count according
- * to what extend could give us. If we got the whole
- * extend then this doesn't wind up changing the
- * values. */
- newsize = i_size_read(inode) + bytes_added;
- count = newsize - saved_pos;
- ret = 0;
- break;
- }
-
- /* we've got whatever cluster lock is appropriate now, so we
- * can stuff *ppos back. */
- iocb->ki_pos = saved_pos;
-
- if (filp->f_flags & O_DIRECT) {
-#ifdef OCFS2_ORACORE_WORKAROUNDS
- if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
- int sector_size = 1 << osb->s_sectsize_bits;
-
- if ((saved_pos & (sector_size - 1)) ||
- (count & (sector_size - 1)) ||
- ((unsigned long)buf & (sector_size - 1))) {
- do_direct = 0;
- filp->f_flags |= O_SYNC;
- } else {
- do_direct = 1;
- }
- } else
-#endif
- do_direct = 1;
-
- mlog(0, "O_DIRECT\n");
- }
-
- if (!do_direct) {
- ret = ocfs2_data_lock(inode, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto bail_extend;
- }
- }
-
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-#ifdef OCFS2_ORACORE_WORKAROUNDS
- if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
- unsigned int saved_flags = filp->f_flags;
-
- if (do_direct)
- filp->f_flags |= O_DIRECT;
- else
- filp->f_flags &= ~O_DIRECT;
-
- ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
- &iocb->ki_pos);
-
- filp->f_flags = saved_flags;
- } else
-#endif
- ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
- &iocb->ki_pos);
-
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
-
- if (!do_direct)
- ocfs2_data_unlock(inode, 1);
-
- /* we might have to finish up extentions that were performed before
- * an error was returned by, say, data locking */
-bail_extend:
- if (extended)
- ocfs2_file_finish_extension(inode, newsize, do_direct);
-bail_meta_unlock:
- ocfs2_meta_unlock(inode, level);
-bail_i_sem:
- up(&inode->i_sem);
-
-bail:
- mlog_exit(ret);
- return ret;
-}
-
-static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
- char __user *buf,
- size_t count,
- loff_t pos)
-{
- int ret = 0;
- ocfs2_super *osb = NULL;
- struct file *filp = iocb->ki_filp;
- struct inode *inode = filp->f_dentry->d_inode;
-
- mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
- (unsigned int)count,
- filp->f_dentry->d_name.len,
- filp->f_dentry->d_name.name);
-
- if (!inode) {
- ret = -EINVAL;
- mlog_errno(ret);
- goto bail;
- }
-
- osb = OCFS2_SB(inode->i_sb);
-
-#ifdef OCFS2_ORACORE_WORKAROUNDS
- if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
- if (filp->f_flags & O_DIRECT) {
- int sector_size = 1 << osb->s_sectsize_bits;
-
- if ((pos & (sector_size - 1)) ||
- (count & (sector_size - 1)) ||
- ((unsigned long)buf & (sector_size - 1)) ||
- (i_size_read(inode) & (sector_size -1))) {
- filp->f_flags &= ~O_DIRECT;
- }
- }
- }
-#endif
-
- ret = ocfs2_meta_lock(inode, NULL, NULL, 0);
- if (ret < 0) {
- mlog_errno(ret);
- goto bail;
- }
-
- if (!(filp->f_flags & O_DIRECT)) {
- ret = ocfs2_data_lock(inode, 0);
- if (ret < 0) {
- mlog_errno(ret);
- goto bail_unlock_meta;
- }
- }
-
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
- ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
-
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
-
- if (ret == -EINVAL)
- mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
-
- if (!(filp->f_flags & O_DIRECT))
- ocfs2_data_unlock(inode, 0);
-bail_unlock_meta:
- ocfs2_meta_unlock(inode, 0);
-
-bail:
- mlog_exit(ret);
-
- return ret;
-}
-
-static ssize_t ocfs2_file_sendfile(struct file *in_file,
- loff_t *ppos,
- size_t count,
- read_actor_t actor,
- void *target)
-{
- int ret;
- struct inode *inode = in_file->f_mapping->host;
-
- mlog_entry("inode %"MLFu64", ppos %lld, count = %u\n",
- OCFS2_I(inode)->ip_blkno, (long long) *ppos,
- (unsigned int) count);
-
- /* Obviously, there is no user buffer to worry about here --
- * this simplifies locking, so no need to walk vmas a la
- * read/write. We take a simple set of cluster locks against
- * the inode and call generic_file_sendfile. */
- ret = ocfs2_meta_lock(inode, NULL, NULL, 0);
- if (ret < 0) {
- mlog_errno(ret);
- goto bail;
- }
-
- ret = ocfs2_data_lock(inode, 0);
- if (ret < 0) {
- mlog_errno(ret);
- goto bail_unlock_meta;
- }
-
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
- ret = generic_file_sendfile(in_file, ppos, count, actor, target);
- if (ret < 0)
- mlog_errno(ret);
-
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
-
- ocfs2_data_unlock(inode, 0);
-bail_unlock_meta:
- ocfs2_meta_unlock(inode, 0);
-
-bail:
- mlog_exit(ret);
- return ret;
-}
-
-struct file_operations ocfs2_fops = {
- .read = do_sync_read,
- .write = do_sync_write,
- .sendfile = ocfs2_file_sendfile,
- .mmap = ocfs2_mmap,
- .fsync = ocfs2_sync_file,
- .release = ocfs2_file_release,
- .open = ocfs2_file_open,
- .aio_read = ocfs2_file_aio_read,
- .aio_write = ocfs2_file_aio_write,
-};
-
-struct file_operations ocfs2_dops = {
- .read = generic_read_dir,
- .readdir = ocfs2_readdir,
- .fsync = ocfs2_sync_file,
-};
-
int ocfs2_set_inode_size(ocfs2_journal_handle *handle,
struct inode *inode,
struct buffer_head *fe_bh,
u64 new_i_size)
{
- int status, grow;
+ int status;
mlog_entry_void();
- grow = new_i_size > inode->i_size;
i_size_write(inode, new_i_size);
inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -517,17 +160,41 @@
goto bail;
}
- /* FIXME: I think this should all be in the caller */
- spin_lock(&OCFS2_I(inode)->ip_lock);
- if (!grow)
- OCFS2_I(inode)->ip_mmu_private = i_size_read(inode);
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-
bail:
mlog_exit(status);
return status;
}
+static int ocfs2_simple_size_update(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 new_i_size)
+{
+ int ret;
+ ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ ocfs2_journal_handle *handle = NULL;
+
+ handle = ocfs2_start_trans(osb, NULL,
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (handle == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Since we got our cluster lock from caller and we
+ * don't add it to the handle: */
+ ocfs2_set_inode_lock_trans(osb->journal, inode);
+
+ ret = ocfs2_set_inode_size(handle, inode, di_bh,
+ new_i_size);
+ if (ret < 0)
+ mlog_errno(ret);
+
+ ocfs2_commit_trans(handle);
+out:
+ return ret;
+}
+
static int ocfs2_orphan_for_truncate(ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh,
@@ -562,14 +229,13 @@
return status;
}
-static int ocfs2_truncate_file(ocfs2_super *osb,
- u64 new_i_size,
- struct inode *inode)
+static int ocfs2_truncate_file(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 new_i_size)
{
int status = 0;
ocfs2_dinode *fe = NULL;
- struct buffer_head *fe_bh = NULL;
- ocfs2_journal_handle *handle = NULL;
+ ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_truncate_context *tc = NULL;
mlog_entry("(inode = %"MLFu64", new_i_size = %"MLFu64"\n",
@@ -577,14 +243,7 @@
truncate_inode_pages(inode->i_mapping, new_i_size);
- status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &fe_bh,
- OCFS2_BH_CACHED, inode);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- fe = (ocfs2_dinode *) fe_bh->b_data;
+ fe = (ocfs2_dinode *) di_bh->b_data;
OCFS2_BUG_ON_INVALID_DINODE(fe);
mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
"Inode %"MLFu64", inode i_size = %lld != di "
@@ -616,20 +275,7 @@
fe->i_clusters);
/* No allocation change is required, so lets fast path
* this truncate. */
- handle = ocfs2_start_trans(osb, NULL,
- OCFS2_INODE_UPDATE_CREDITS);
- if (handle == NULL) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
- }
-
- /* Since we got our cluster lock from caller and we
- * don't add it to the handle: */
- ocfs2_set_inode_lock_trans(osb->journal, inode);
-
- status = ocfs2_set_inode_size(handle, inode, fe_bh,
- new_i_size);
+ status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
if (status < 0)
mlog_errno(status);
goto bail;
@@ -647,19 +293,19 @@
* change. Orphan the inode so that recovery can complete the
* truncate if necessary. This does the task of marking
* i_size. */
- status = ocfs2_orphan_for_truncate(osb, inode, fe_bh, new_i_size);
+ status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
+ status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
+ status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -667,70 +313,29 @@
/* TODO: orphan dir cleanup here. */
bail:
- if (handle)
- ocfs2_commit_trans(handle);
- if (fe_bh)
- brelse(fe_bh);
-
mlog_exit(status);
return status;
}
-static int ocfs2_zero_extend(struct inode *inode)
-{
- struct address_space *mapping = inode->i_mapping;
- struct page *page;
- u64 size = i_size_read(inode) - 1;
- unsigned int offset;
- int res = 0;
-
- /* Start the zeroing of blocks */
- if (i_size_read(inode) > OCFS2_I(inode)->ip_mmu_private) {
- page = grab_cache_page(mapping,
- size >> PAGE_CACHE_SHIFT);
- if (!page) {
- res = -ENOMEM;
- mlog_errno(res);
- return res;
- }
- offset = (unsigned int)(size & (PAGE_CACHE_SIZE - 1)) + 1;
- res = mapping->a_ops->prepare_write(NULL, page, offset,
- offset);
- if (res < 0) {
- mlog_errno(res);
- goto bail_unlock;
- }
-
- res = mapping->a_ops->commit_write(NULL, page, offset, offset);
- if (res < 0)
- mlog_errno(res);
-
-bail_unlock:
- unlock_page(page);
- page_cache_release(page);
- mark_inode_dirty(inode);
- }
-
- return res;
-}
-
/*
* extend allocation only here.
* we'll update all the disk stuff, and oip->alloc_size
*
* expect stuff to be locked, a transaction started and enough data /
- * metadata reservations in the contexts. I'll return -EAGAIN, if we
- * run out of transaction credits, so the caller can restart us.
+ * metadata reservations in the contexts.
+ *
+ * Will return -EAGAIN, and a reason if a restart is needed.
+ * If passed in, *reason will always be set, even in error.
*/
-int ocfs2_extend_allocation(ocfs2_super *osb,
- struct inode *inode,
- u32 clusters_to_add,
- struct buffer_head *fe_bh,
- ocfs2_journal_handle *handle,
- ocfs2_alloc_context *data_ac,
- ocfs2_alloc_context *meta_ac,
- enum ocfs2_alloc_restarted *reason)
+int ocfs2_do_extend_allocation(ocfs2_super *osb,
+ struct inode *inode,
+ u32 clusters_to_add,
+ struct buffer_head *fe_bh,
+ ocfs2_journal_handle *handle,
+ ocfs2_alloc_context *data_ac,
+ ocfs2_alloc_context *meta_ac,
+ enum ocfs2_alloc_restarted *reason)
{
int status = 0;
int free_extents;
@@ -740,6 +345,10 @@
BUG_ON(!clusters_to_add);
+ /* We always want to set this, even if we error later. */
+ if (*reason)
+ *reason = RESTART_NONE;
+
free_extents = ocfs2_num_free_extents(osb, inode, fe);
if (free_extents < 0) {
status = free_extents;
@@ -822,48 +431,24 @@
return status;
}
-/*
- * Ok, this function is heavy on the goto's - we need to clean it up a
- * bit.
- *
- * *bytes_extended is a measure of how much was added to
- * dinode->i_size, NOT how much allocated was actually added to the
- * file. It will always be correct, even when we return an error.
- */
-int ocfs2_extend_file(ocfs2_super *osb,
- struct inode *inode,
- u64 new_i_size,
- u64 *bytes_extended)
+int ocfs2_extend_allocation(struct inode *inode,
+ u32 clusters_to_add)
{
int status = 0;
int restart_func = 0;
int drop_alloc_sem = 0;
int credits, num_free_extents;
- u32 clusters_to_add;
- u64 new_fe_size;
+ u32 prev_clusters;
struct buffer_head *bh = NULL;
- ocfs2_dinode *fe;
+ ocfs2_dinode *fe = NULL;
ocfs2_journal_handle *handle = NULL;
ocfs2_alloc_context *data_ac = NULL;
ocfs2_alloc_context *meta_ac = NULL;
enum ocfs2_alloc_restarted why;
+ ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- mlog_entry("(new_i_size=%"MLFu64")\n", new_i_size);
+ mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
- *bytes_extended = 0;
-
- /* setattr sometimes calls us like this. */
- if (new_i_size == 0)
- goto leave;
-
-restart_all:
- handle = ocfs2_alloc_handle(osb);
- if (handle == NULL) {
- status = -ENOMEM;
- mlog_errno(status);
- goto leave;
- }
-
status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
OCFS2_BH_CACHED, inode);
if (status < 0) {
@@ -873,23 +458,21 @@
fe = (ocfs2_dinode *) bh->b_data;
OCFS2_BUG_ON_INVALID_DINODE(fe);
- BUG_ON(i_size_read(inode) !=
- (le64_to_cpu(fe->i_size) - *bytes_extended));
- BUG_ON(new_i_size < i_size_read(inode));
- if (i_size_read(inode) == new_i_size)
- goto leave;
+restart_all:
+ BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
- clusters_to_add = ocfs2_clusters_for_bytes(osb->sb, new_i_size) -
- le32_to_cpu(fe->i_clusters);
-
- mlog(0, "extend inode %"MLFu64", new_i_size = %"MLFu64", "
- "i_size = %lld, fe->i_clusters = %u, clusters_to_add = %u\n",
- OCFS2_I(inode)->ip_blkno, new_i_size, i_size_read(inode),
+ mlog(0, "extend inode %"MLFu64", i_size = %lld, fe->i_clusters = %u, "
+ "clusters_to_add = %u\n",
+ OCFS2_I(inode)->ip_blkno, i_size_read(inode),
fe->i_clusters, clusters_to_add);
- if (!clusters_to_add)
- goto do_start_trans;
+ handle = ocfs2_alloc_handle(osb);
+ if (handle == NULL) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto leave;
+ }
num_free_extents = ocfs2_num_free_extents(osb,
inode,
@@ -928,7 +511,7 @@
* start_trans is important here -- always do it before! */
down_write(&OCFS2_I(inode)->ip_alloc_sem);
drop_alloc_sem = 1;
-do_start_trans:
+
credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
handle = ocfs2_start_trans(osb, handle, credits);
if (handle == NULL) {
@@ -951,53 +534,39 @@
goto leave;
}
- if (!clusters_to_add)
- goto no_alloc;
+ prev_clusters = OCFS2_I(inode)->ip_clusters;
- status = ocfs2_extend_allocation(osb,
- inode,
- clusters_to_add,
- bh,
- handle,
- data_ac,
- meta_ac,
- &why);
+ status = ocfs2_do_extend_allocation(osb,
+ inode,
+ clusters_to_add,
+ bh,
+ handle,
+ data_ac,
+ meta_ac,
+ &why);
if ((status < 0) && (status != -EAGAIN)) {
if (status != -ENOSPC)
mlog_errno(status);
goto leave;
}
- if (status == -EAGAIN && (new_i_size >
- ocfs2_clusters_to_bytes(osb->sb, le32_to_cpu(fe->i_clusters)))) {
+ status = ocfs2_journal_dirty(handle, bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+ spin_lock(&OCFS2_I(inode)->ip_lock);
+ clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
+ spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+ if (why != RESTART_NONE && clusters_to_add) {
if (why == RESTART_META) {
mlog(0, "restarting function.\n");
restart_func = 1;
} else {
BUG_ON(why != RESTART_TRANS);
- new_fe_size = ocfs2_clusters_to_bytes(osb->sb,
- le32_to_cpu(fe->i_clusters));
- *bytes_extended += new_fe_size -
- le64_to_cpu(fe->i_size);
- /* update i_size in case we crash after the
- * extend_trans */
- fe->i_size = cpu_to_le64(new_fe_size);
-
- fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
- fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
-
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
- clusters_to_add =
- ocfs2_clusters_for_bytes(osb->sb,
- new_i_size)
- - le32_to_cpu(fe->i_clusters);
mlog(0, "restarting transaction.\n");
/* TODO: This can be more intelligent. */
credits = ocfs2_calc_extend_credits(osb->sb,
@@ -1014,34 +583,12 @@
goto restarted_transaction;
}
}
- status = 0;
-no_alloc:
- /* this may not be the end of our allocation so only update
- * i_size to what's appropriate. */
- new_fe_size = ocfs2_clusters_to_bytes(osb->sb,
- le32_to_cpu(fe->i_clusters));
- if (new_i_size < new_fe_size)
- new_fe_size = new_i_size;
-
- *bytes_extended += new_fe_size - le64_to_cpu(fe->i_size);
- fe->i_size = cpu_to_le64(new_fe_size);
-
mlog(0, "fe: i_clusters = %u, i_size=%"MLFu64"\n",
fe->i_clusters, fe->i_size);
-
mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
OCFS2_I(inode)->ip_clusters, i_size_read(inode));
- fe->i_ctime = fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
- fe->i_ctime_nsec = fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
-
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
leave:
if (drop_alloc_sem) {
up_write(&OCFS2_I(inode)->ip_alloc_sem);
@@ -1059,24 +606,148 @@
ocfs2_free_alloc_context(meta_ac);
meta_ac = NULL;
}
- if (bh) {
- brelse(bh);
- bh = NULL;
- }
if ((!status) && restart_func) {
restart_func = 0;
goto restart_all;
}
+ if (bh) {
+ brelse(bh);
+ bh = NULL;
+ }
mlog_exit(status);
return status;
}
+/* Some parts of this taken from generic_cont_expand, which turned out
+ * to be too fragile to do exactly what we need without us having to
+ * worry about recursive locking in ->commit_write().
+ *
+ * Runs a zero-length (from == to) ocfs2_prepare_write() /
+ * ocfs2_commit_write_nolocks() pair on the page cache page that holds
+ * byte offset 'size'; 'di_bh' is passed through to the commit step.
+ * Returns 0 on success, -ENOMEM if the page cannot be grabbed, or the
+ * negative error from prepare/commit. */
+static int ocfs2_write_zero_page(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 size)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page;
+ unsigned long index, offset;
+ int ret;
+
+ offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
+ /* ugh. in prepare/commit_write, if from==to==start of block, we
+ ** skip the prepare. make sure we never send an offset for the start
+ ** of a block
+ */
+ if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
+ offset++;
+ }
+ index = size >> PAGE_CACHE_SHIFT; /* page cache index that holds 'size' */
+
+ page = grab_cache_page(mapping, index);
+ if (!page) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_prepare_write(NULL, page, offset, offset);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ ret = ocfs2_commit_write_nolocks(NULL, page, offset, offset, di_bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ ret = 0; /* any non-negative commit result is reported as plain success */
+out_unlock:
+ unlock_page(page);
+ page_cache_release(page);
+out:
+ return ret;
+}
+
+/*
+ * Zero the range between the current end of file and zero_to_size, one
+ * block at a time, via ocfs2_write_zero_page().
+ *
+ * The walk starts at the current i_size rounded up to a block
+ * boundary. Seeding it from the rounded-up target instead can never
+ * satisfy the loop condition, so nothing would ever be zeroed. When
+ * zero_to_size does not exceed i_size (e.g. an O_APPEND write) the loop
+ * does not run and this is a noop.
+ *
+ * Returns 0 on success or a negative error code. The inode buffer read
+ * here is released on every exit path.
+ */
+static int ocfs2_zero_extend(struct inode *inode,
+ u64 zero_to_size)
+{
+ int ret = 0;
+ u64 start_off;
+ struct buffer_head *di_bh = NULL;
+ struct super_block *sb = inode->i_sb;
+
+ ret = ocfs2_read_block(OCFS2_SB(sb), OCFS2_I(inode)->ip_blkno, &di_bh,
+ OCFS2_BH_CACHED, inode);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Sample i_size once; the zeroing below is expected to move it. */
+ start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
+ while (start_off < zero_to_size) {
+ ret = ocfs2_write_zero_page(inode, di_bh, start_off);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ start_off += sb->s_blocksize;
+ }
+
+out:
+ /* Drop the reference ocfs2_read_block() handed us, if any. */
+ if (di_bh) {
+ brelse(di_bh);
+ }
+ return ret;
+}
+/* Grow the file to new_i_size. A new_i_size of 0, or one equal to the
+ * current i_size, is a noop; a smaller one trips the BUG_ON. If the
+ * new size needs more clusters than ip_clusters records, they are
+ * allocated and ocfs2_zero_extend() is called; otherwise only
+ * ocfs2_simple_size_update() runs (the one place di_bh is used).
+ * Returns 0 or a negative error. */
+static int ocfs2_extend_file(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 new_i_size)
+{
+ int ret = 0;
+ u32 clusters_to_add;
+
+ /* setattr sometimes calls us like this. */
+ if (new_i_size == 0)
+ goto out;
+
+ if (i_size_read(inode) == new_i_size)
+ goto out;
+ BUG_ON(new_i_size < i_size_read(inode));
+
+ clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
+ OCFS2_I(inode)->ip_clusters;
+
+ if (clusters_to_add) {
+ ret = ocfs2_extend_allocation(inode, clusters_to_add);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* This will update i_size for us.
+ * NOTE(review): ocfs2_zero_extend() itself never touches
+ * i_size; confirm the update really happens further down
+ * in the commit_write path. */
+ ret = ocfs2_zero_extend(inode, new_i_size);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else {
+ /* No allocation required, we just use this helper to
+ * do a trivial update of i_size. */
+ ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+out:
+ return ret;
+}
+
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
{
int status = 0;
int unlock = 0;
- u64 newsize, bytes_added;
+ u64 newsize;
struct inode *inode = dentry->d_inode;
struct super_block *sb = inode->i_sb;
ocfs2_super *osb = OCFS2_SB(sb);
@@ -1121,48 +792,16 @@
if (S_ISREG(inode->i_mode) &&
attr->ia_valid & ATTR_SIZE &&
newsize != i_size_read(inode)) {
- bytes_added = 0;
-
if (i_size_read(inode) > newsize)
- status = ocfs2_truncate_file(osb, newsize, inode);
+ status = ocfs2_truncate_file(inode, bh, newsize);
else
- status = ocfs2_extend_file(osb, inode, newsize,
- &bytes_added);
- if (status < 0 && (!bytes_added)) {
+ status = ocfs2_extend_file(inode, bh, newsize);
+ if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
status = -ENOSPC;
goto bail;
}
-
- /* partial extend, we continue with what we've got. */
- if (status < 0
- && status != -ENOSPC
- && status != -EINTR
- && status != -ERESTARTSYS)
- mlog(ML_ERROR,
- "status return of %d extending inode "
- "%"MLFu64"\n", status,
- OCFS2_I(inode)->ip_blkno);
- status = 0;
-
- newsize = bytes_added + i_size_read(inode);
- if (bytes_added)
- ocfs2_update_inode_size(inode, newsize);
-
-#ifdef OCFS2_ORACORE_WORKAROUNDS
- spin_lock(&OCFS2_I(inode)->ip_lock);
- if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_OPEN_DIRECT) {
- /* This is a total broken hack for O_DIRECT crack */
- OCFS2_I(inode)->ip_mmu_private = i_size_read(inode);
- }
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-#endif
- status = ocfs2_zero_extend(inode);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
}
handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
@@ -1232,6 +871,302 @@
return err;
}
+static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
+ const char __user *buf,
+ size_t count,
+ loff_t pos)
+{
+ struct iovec local_iov = { .iov_base = (void __user *)buf,
+ .iov_len = count };
+ int ret, level;
+ u32 clusters;
+ ocfs2_super *osb = NULL;
+ struct file *filp = iocb->ki_filp;
+ struct inode *inode = filp->f_dentry->d_inode;
+ int do_direct = 0;
+ loff_t newsize, saved_pos;
+ /* Overview: under i_sem, take the metadata cluster lock (level 1
+ * for O_APPEND, or once the write is found to end past i_size),
+ * allocate clusters and zero the gap when more allocation is
+ * needed, then take the data lock for non-direct I/O and hand off
+ * to generic_file_aio_write_nolock(). The 'pos' argument is not
+ * read; the position comes from iocb->ki_pos (or i_size for
+ * O_APPEND). Returns the generic write's result or a negative
+ * error from locking/allocation. */
+ mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
+ (unsigned int)count,
+ filp->f_dentry->d_name.len,
+ filp->f_dentry->d_name.name);
+
+ /* happy write of zero bytes */
+ if (count == 0) {
+ ret = 0;
+ goto out;
+ }
+
+ if (!inode) {
+ mlog(0, "bad inode\n");
+ ret = -EIO;
+ goto out;
+ }
+
+ osb = OCFS2_SB(inode->i_sb);
+
+ down(&inode->i_sem);
+
+ /* this ginormous block is in here because it has so many inputs
+ * and outputs from this function.. */
+ level = !!(filp->f_flags & O_APPEND); /* O_APPEND starts out at level 1 */
+ for(;;) {
+ ret = ocfs2_meta_lock(inode, NULL, NULL, level);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_i_sem;
+ }
+
+ /* work on a copy of ppos until we're sure that we won't have
+ * to recalculate it due to relocking. */
+ if (filp->f_flags & O_APPEND) {
+ saved_pos = i_size_read(inode);
+ mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+ if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
+ /* ugh, work around some applications which
+ * open everything O_DIRECT + O_APPEND and
+ * really don't mean to use O_DIRECT. */
+ filp->f_flags &= ~O_DIRECT;
+ }
+#endif
+ } else {
+ saved_pos = iocb->ki_pos;
+ }
+ newsize = count + saved_pos; /* end offset of this write */
+
+ mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n",
+ saved_pos, newsize, i_size_read(inode));
+
+ /* No need for a higher level metadata lock if we're
+ * never going past i_size. */
+ if (newsize <= i_size_read(inode))
+ break;
+
+ if (level == 0) { /* drop the lock and retry the loop at level 1 */
+ ocfs2_meta_unlock(inode, level);
+ level = 1;
+ continue;
+ }
+
+ spin_lock(&OCFS2_I(inode)->ip_lock);
+ clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
+ OCFS2_I(inode)->ip_clusters;
+ spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+ mlog(0, "Writing at EOF, may need more allocation: "
+ "i_size = %lld, newsize = %"MLFu64", need %u clusters\n",
+ i_size_read(inode), newsize, clusters);
+
+ /* We only want to continue the rest of this loop if
+ * our extend will actually require more
+ * allocation. */
+ if (!clusters)
+ break;
+
+ ret = ocfs2_extend_allocation(inode, clusters);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out_meta_unlock;
+ }
+
+ /* Fill any holes which would've been created by this
+ * write. If we're O_APPEND, this will wind up
+ * (correctly) being a noop. */
+ ret = ocfs2_zero_extend(inode, (u64) newsize - count);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_meta_unlock;
+ }
+ break;
+ }
+
+ /* we've got whatever cluster lock is appropriate now, so we
+ * can stuff *ppos back. */
+ iocb->ki_pos = saved_pos;
+
+ if (filp->f_flags & O_DIRECT) {
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+ if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
+ int sector_size = 1 << osb->s_sectsize_bits;
+
+ if ((saved_pos & (sector_size - 1)) ||
+ (count & (sector_size - 1)) ||
+ ((unsigned long)buf & (sector_size - 1))) {
+ do_direct = 0;
+ filp->f_flags |= O_SYNC;
+ } else {
+ do_direct = 1;
+ }
+ } else
+#endif
+ do_direct = 1;
+
+ mlog(0, "O_DIRECT\n");
+ }
+
+ if (!do_direct) { /* non-direct writes also take the data lock, at level 1 */
+ ret = ocfs2_data_lock(inode, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_meta_unlock;
+ }
+ }
+
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+ if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
+ unsigned int saved_flags = filp->f_flags;
+
+ if (do_direct)
+ filp->f_flags |= O_DIRECT;
+ else
+ filp->f_flags &= ~O_DIRECT;
+
+ ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+ &iocb->ki_pos);
+
+ filp->f_flags = saved_flags;
+ } else
+#endif
+ ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+ &iocb->ki_pos);
+
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ if (!do_direct)
+ ocfs2_data_unlock(inode, 1);
+
+out_meta_unlock: /* 'level' is whatever the loop above last locked at */
+ ocfs2_meta_unlock(inode, level);
+out_i_sem:
+ up(&inode->i_sem);
+
+out:
+ mlog_exit(ret);
+ return ret;
+}
+/* Read path: takes the metadata cluster lock at level 0, plus the data
+ * lock at level 0 unless the file is O_DIRECT, and calls
+ * generic_file_aio_read() with ip_alloc_sem held for read. Returns
+ * that call's result, -EINVAL if the dentry has no inode, or the error
+ * from a failed lock. */
+static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
+ char __user *buf,
+ size_t count,
+ loff_t pos)
+{
+ int ret = 0;
+ ocfs2_super *osb = NULL;
+ struct file *filp = iocb->ki_filp;
+ struct inode *inode = filp->f_dentry->d_inode;
+
+ mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
+ (unsigned int)count,
+ filp->f_dentry->d_name.len,
+ filp->f_dentry->d_name.name);
+
+ if (!inode) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ osb = OCFS2_SB(inode->i_sb);
+
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+ if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
+ if (filp->f_flags & O_DIRECT) {
+ int sector_size = 1 << osb->s_sectsize_bits;
+
+ if ((pos & (sector_size - 1)) ||
+ (count & (sector_size - 1)) ||
+ ((unsigned long)buf & (sector_size - 1)) ||
+ (i_size_read(inode) & (sector_size -1))) {
+ filp->f_flags &= ~O_DIRECT; /* not restored afterwards */
+ }
+ }
+ }
+#endif
+
+ ret = ocfs2_meta_lock(inode, NULL, NULL, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ if (!(filp->f_flags & O_DIRECT)) {
+ ret = ocfs2_data_lock(inode, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto bail_unlock_meta;
+ }
+ }
+
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
+
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ if (ret == -EINVAL)
+ mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
+
+ if (!(filp->f_flags & O_DIRECT)) /* same test that guarded the data lock above */
+ ocfs2_data_unlock(inode, 0);
+bail_unlock_meta:
+ ocfs2_meta_unlock(inode, 0);
+
+bail:
+ mlog_exit(ret);
+
+ return ret;
+}
+/* sendfile entry point: wraps generic_file_sendfile() with the metadata
+ * and data cluster locks (both at level 0) and ip_alloc_sem held for
+ * read. Returns generic_file_sendfile()'s result, or the error from a
+ * failed lock. */
+static ssize_t ocfs2_file_sendfile(struct file *in_file,
+ loff_t *ppos,
+ size_t count,
+ read_actor_t actor,
+ void *target)
+{
+ int ret;
+ struct inode *inode = in_file->f_mapping->host;
+
+ mlog_entry("inode %"MLFu64", ppos %lld, count = %u\n",
+ OCFS2_I(inode)->ip_blkno, (long long) *ppos,
+ (unsigned int) count);
+
+ /* Obviously, there is no user buffer to worry about here --
+ * this simplifies locking, so no need to walk vmas a la
+ * read/write. We take a simple set of cluster locks against
+ * the inode and call generic_file_sendfile. */
+ ret = ocfs2_meta_lock(inode, NULL, NULL, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ ret = ocfs2_data_lock(inode, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto bail_unlock_meta;
+ }
+
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ ret = generic_file_sendfile(in_file, ppos, count, actor, target);
+ if (ret < 0)
+ mlog_errno(ret);
+
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ ocfs2_data_unlock(inode, 0);
+bail_unlock_meta: /* locks are dropped in the reverse of the order taken */
+ ocfs2_meta_unlock(inode, 0);
+
+bail:
+ mlog_exit(ret);
+ return ret;
+}
+
struct inode_operations ocfs2_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
@@ -1241,3 +1176,21 @@
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
};
+/* File operations table; inode.c points i_fop at this. sendfile,
+ * aio_read and aio_write use the ocfs2 wrappers defined in this file;
+ * plain read/write are do_sync_read/do_sync_write. */
+struct file_operations ocfs2_fops = {
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .sendfile = ocfs2_file_sendfile,
+ .mmap = ocfs2_mmap,
+ .fsync = ocfs2_sync_file,
+ .release = ocfs2_file_release,
+ .open = ocfs2_file_open,
+ .aio_read = ocfs2_file_aio_read,
+ .aio_write = ocfs2_file_aio_write,
+};
+/* Second operations table, presumably installed on directory inodes
+ * (TODO confirm against inode.c): generic_read_dir for read,
+ * ocfs2_readdir for readdir, ocfs2_sync_file for fsync. */
+struct file_operations ocfs2_dops = {
+ .read = generic_read_dir,
+ .readdir = ocfs2_readdir,
+ .fsync = ocfs2_sync_file,
+};
Modified: branches/locking-changes/fs/ocfs2/file.h
===================================================================
--- branches/locking-changes/fs/ocfs2/file.h 2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/file.h 2005-08-31 18:03:07 UTC (rev 2557)
@@ -33,32 +33,26 @@
struct _ocfs2_alloc_context;
enum ocfs2_alloc_restarted {
- RESTART_TRANS = 0,
+ RESTART_NONE = 0,
+ RESTART_TRANS,
RESTART_META
};
-int ocfs2_extend_allocation(ocfs2_super *osb,
- struct inode *inode,
- u32 clusters_to_add,
- struct buffer_head *fe_bh,
- ocfs2_journal_handle *handle,
- struct _ocfs2_alloc_context *data_ac,
- struct _ocfs2_alloc_context *meta_ac,
- enum ocfs2_alloc_restarted *reason);
+int ocfs2_do_extend_allocation(ocfs2_super *osb,
+ struct inode *inode,
+ u32 clusters_to_add,
+ struct buffer_head *fe_bh,
+ ocfs2_journal_handle *handle,
+ struct _ocfs2_alloc_context *data_ac,
+ struct _ocfs2_alloc_context *meta_ac,
+ enum ocfs2_alloc_restarted *reason);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
int ocfs2_sync_inode(struct inode *inode);
-int ocfs2_extend_file(ocfs2_super *osb,
- struct inode *inode,
- u64 new_i_size,
- u64 *bytes_extended);
int ocfs2_set_inode_size(ocfs2_journal_handle *handle,
struct inode *inode,
struct buffer_head *fe_bh,
u64 new_i_size);
-void ocfs2_file_finish_extension(struct inode *inode, loff_t newsize,
- unsigned direct_extend);
-
#endif /* OCFS2_FILE_H */
Modified: branches/locking-changes/fs/ocfs2/inode.c
===================================================================
--- branches/locking-changes/fs/ocfs2/inode.c 2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/inode.c 2005-08-31 18:03:07 UTC (rev 2557)
@@ -297,7 +297,6 @@
inode->i_fop = &ocfs2_fops;
inode->i_op = &ocfs2_file_iops;
i_size_write(inode, le64_to_cpu(fe->i_size));
- OCFS2_I(inode)->ip_mmu_private = inode->i_size;
break;
case S_IFDIR:
inode->i_op = &ocfs2_dir_iops;
@@ -1121,9 +1120,6 @@
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
i_size_write(inode, le64_to_cpu(fe->i_size));
- if (S_ISREG(inode->i_mode)) {
- OCFS2_I(inode)->ip_mmu_private = i_size_read(inode);
- }
inode->i_nlink = le16_to_cpu(fe->i_links_count);
inode->i_uid = le32_to_cpu(fe->i_uid);
inode->i_gid = le32_to_cpu(fe->i_gid);
Modified: branches/locking-changes/fs/ocfs2/inode.h
===================================================================
--- branches/locking-changes/fs/ocfs2/inode.h 2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/inode.h 2005-08-31 18:03:07 UTC (rev 2557)
@@ -41,7 +41,6 @@
spinlock_t ip_lock;
u32 ip_open_count;
u32 ip_clusters;
- loff_t ip_mmu_private;
struct ocfs2_extent_map ip_map;
struct list_head ip_io_markers;
int ip_orphaned_slot;
Modified: branches/locking-changes/fs/ocfs2/journal.c
===================================================================
--- branches/locking-changes/fs/ocfs2/journal.c 2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/journal.c 2005-08-31 18:03:07 UTC (rev 2557)
@@ -428,6 +428,18 @@
return status;
}
+int ocfs2_journal_dirty_data(handle_t *handle,
+ struct buffer_head *bh)
+{
+ int err = journal_dirty_data(handle, bh); /* thin wrapper: forward to journal_dirty_data() */
+ if (err)
+ mlog_errno(err);
+ /* TODO: When we can handle it, abort the handle and go RO on
+ * error here. Until then a failure is only logged and handed
+ * back to the caller unchanged. */
+
+ return err;
+}
+
/* We always assume you're adding a metadata lock at level 'ex' */
int ocfs2_handle_add_lock(ocfs2_journal_handle *handle,
struct inode *inode)
Modified: branches/locking-changes/fs/ocfs2/journal.h
===================================================================
--- branches/locking-changes/fs/ocfs2/journal.h 2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/journal.h 2005-08-31 18:03:07 UTC (rev 2557)
@@ -249,6 +249,8 @@
* buffer. Will have to call ocfs2_journal_dirty once
* we've actually dirtied it. Type is one of . or .
* ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
+ * ocfs2_journal_dirty_data - Indicate that a data buffer should go out before
+ * the current handle commits.
* ocfs2_handle_add_lock - Sometimes we need to delay lock release
* until after a transaction has been completed. Use
* ocfs2_handle_add_lock to indicate that a lock needs
@@ -308,6 +310,8 @@
*/
int ocfs2_journal_dirty(ocfs2_journal_handle *handle,
struct buffer_head *bh);
+int ocfs2_journal_dirty_data(handle_t *handle,
+ struct buffer_head *bh);
int ocfs2_handle_add_lock(ocfs2_journal_handle *handle,
struct inode *inode);
/*
Modified: branches/locking-changes/fs/ocfs2/namei.c
===================================================================
--- branches/locking-changes/fs/ocfs2/namei.c 2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/namei.c 2005-08-31 18:03:07 UTC (rev 2557)
@@ -1677,8 +1677,9 @@
newsize = l - 1;
if (l > ocfs2_fast_symlink_chars(sb)) {
inode->i_op = &ocfs2_symlink_inode_operations;
- status = ocfs2_extend_allocation(osb, inode, 1, new_fe_bh,
- handle, data_ac, NULL, NULL);
+ status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh,
+ handle, data_ac, NULL,
+ NULL);
if (status < 0) {
if (status != -ENOSPC && status != -EINTR) {
mlog(ML_ERROR, "Failed to extend file to "
Modified: branches/locking-changes/fs/ocfs2/ocfs2.h
===================================================================
--- branches/locking-changes/fs/ocfs2/ocfs2.h 2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/ocfs2.h 2005-08-31 18:03:07 UTC (rev 2557)
@@ -271,6 +271,14 @@
struct work_struct osb_truncate_log_wq;
} ocfs2_super;
+static inline int ocfs2_should_order_data(struct inode *inode)
+{
+ if (!S_ISREG(inode->i_mode)) /* anything but a regular file: answer 0 */
+ return 0;
+ /* TODO: this should be a mount option which we check here.
+ * Until then every regular file answers 1. */
+ return 1;
+}
+
#define OCFS2_SB(sb) ((ocfs2_super *)(sb)->s_fs_info)
#define OCFS2_MAX_OSB_ID 65536
@@ -345,6 +353,13 @@
return clusters;
}
+static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
+ u64 bytes)
+{
+ bytes += sb->s_blocksize - 1; /* bias so the shift below rounds up rather than down */
+ return bytes >> sb->s_blocksize_bits; /* block count; assumes s_blocksize == 1 << s_blocksize_bits -- TODO confirm */
+}
+
static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
u32 clusters)
{
@@ -361,11 +376,13 @@
return (u64)clusters << cl_bits;
}
-static inline unsigned long ocfs2_align_bytes_to_blocks(struct super_block *sb,
- u64 bytes)
+static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb,
+ u64 bytes)
{
- bytes += sb->s_blocksize - 1;
- return (unsigned long)(bytes >> sb->s_blocksize_bits);
+ u64 blocks;
+
+ blocks = ocfs2_blocks_for_bytes(sb, bytes);
+ return blocks << sb->s_blocksize_bits;
}
static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
Modified: branches/locking-changes/fs/ocfs2/super.c
===================================================================
--- branches/locking-changes/fs/ocfs2/super.c 2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/super.c 2005-08-31 18:03:07 UTC (rev 2557)
@@ -690,7 +690,6 @@
oi->ip_blkno = 0ULL;
oi->ip_clusters = 0;
- oi->ip_mmu_private = 0LL;
ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
ocfs2_lock_res_init_once(&oi->ip_data_lockres);
More information about the Ocfs2-commits
mailing list