[Ocfs2-commits] mfasheh commits r2911 - branches/ocfs2-1.2/fs/ocfs2
svn-commits@oss.oracle.com
svn-commits at oss.oracle.com
Wed Apr 26 17:22:30 CDT 2006
Author: mfasheh
Signed-off-by: zab
Date: 2006-04-26 17:22:29 -0500 (Wed, 26 Apr 2006)
New Revision: 2911
Modified:
branches/ocfs2-1.2/fs/ocfs2/aio.c
branches/ocfs2-1.2/fs/ocfs2/aops.c
branches/ocfs2-1.2/fs/ocfs2/dlmglue.c
branches/ocfs2-1.2/fs/ocfs2/file.c
branches/ocfs2-1.2/fs/ocfs2/mmap.c
branches/ocfs2-1.2/fs/ocfs2/mmap.h
Log:
We need to take a data lock around extends to protect the pages that
ocfs2_zero_extend is going to be pulling into the page cache.
Signed-off-by: zab
Modified: branches/ocfs2-1.2/fs/ocfs2/aio.c
===================================================================
--- branches/ocfs2-1.2/fs/ocfs2/aio.c 2006-04-26 21:19:41 UTC (rev 2910)
+++ branches/ocfs2-1.2/fs/ocfs2/aio.c 2006-04-26 22:22:29 UTC (rev 2911)
@@ -258,7 +258,7 @@
okp->kp_ctxt.b_cb = ocfs2_aio_kick;
okp->kp_ctxt.b_cb_data = (unsigned long)iocb;
- target_binode->ba_lock_data = filp->f_flags & O_DIRECT ? 0 : 1;
+ target_binode->ba_lock_data_level = 0;
}
/* this might return EIOCBRETRY and we'll come back again to
@@ -334,6 +334,11 @@
&iocb->ki_pos,
&okp->kp_info,
&okp->kp_ctxt);
+ /*
+ * XXX this looks totally broken.. what if _maybe_extend
+ * returns EIOCBRETRY? it'll never be called again and the
+ * op will simply proceed without locking?
+ */
okp->kp_have_write_locks = 1;
if (okp->kp_info.wl_extended) {
/*
Modified: branches/ocfs2-1.2/fs/ocfs2/aops.c
===================================================================
--- branches/ocfs2-1.2/fs/ocfs2/aops.c 2006-04-26 21:19:41 UTC (rev 2910)
+++ branches/ocfs2-1.2/fs/ocfs2/aops.c 2006-04-26 22:22:29 UTC (rev 2911)
@@ -237,13 +237,19 @@
static int ocfs2_prepare_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
+ struct inode *inode = page->mapping->host;
int ret;
mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
+ /* we can't bring in pages without holding the data lock */
+ mlog_bug_on_msg(OCFS2_I(inode)->ip_data_lockres.l_ex_holders == 0,
+ "inode %llu holders %u\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ OCFS2_I(inode)->ip_data_lockres.l_ex_holders);
+
ret = cont_prepare_write(page, from, to, ocfs2_get_block,
- &(OCFS2_I(page->mapping->host)->ip_mmu_private));
-
+ &(OCFS2_I(inode)->ip_mmu_private));
mlog_exit(ret);
return ret;
Modified: branches/ocfs2-1.2/fs/ocfs2/dlmglue.c
===================================================================
--- branches/ocfs2-1.2/fs/ocfs2/dlmglue.c 2006-04-26 21:19:41 UTC (rev 2910)
+++ branches/ocfs2-1.2/fs/ocfs2/dlmglue.c 2006-04-26 22:22:29 UTC (rev 2911)
@@ -467,8 +467,19 @@
* information is already up to data. Convert from NL to
* *anything* however should mark ourselves as needing an
* update */
- if (lockres->l_level == LKM_NLMODE)
+ if (lockres->l_level == LKM_NLMODE) {
lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+ if (lockres->l_type == OCFS2_LOCK_TYPE_DATA) {
+ /* we're just now getting the data lock so there
+ * had better not be any pages in the page cache */
+ struct inode *inode = ocfs2_lock_res_inode(lockres);
+ struct address_space *mapping = inode->i_mapping;
+ mlog_bug_on_msg(mapping->nrpages,
+ "inode %llu nrpages %lu\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ mapping->nrpages);
+ }
+ }
lockres->l_level = lockres->l_requested;
lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
Modified: branches/ocfs2-1.2/fs/ocfs2/file.c
===================================================================
--- branches/ocfs2-1.2/fs/ocfs2/file.c 2006-04-26 21:19:41 UTC (rev 2910)
+++ branches/ocfs2-1.2/fs/ocfs2/file.c 2006-04-26 22:22:29 UTC (rev 2911)
@@ -246,8 +246,11 @@
up_read(&OCFS2_I(inode)->ip_alloc_sem);
bail:
- /* we might have to finish up extentions that were performed before
- * an error was returned by, say, data locking */
+ /*
+ * if this write created a hole then write zeros into it.. wl_extended
+ * is only set if we got the data lock so the buffered zero writing
+ * will have lock coverage. This must be done before unlocking.
+ */
if (info.wl_extended)
ocfs2_file_finish_extension(inode, info.wl_newsize,
info.wl_do_direct_io);
@@ -307,7 +310,7 @@
goto bail;
}
- target_binode->ba_lock_data = (filp->f_flags & O_DIRECT) ? 0 : 1;
+ target_binode->ba_lock_data_level = 0;
ret = ocfs2_lock_buffer_inodes(&ctxt, NULL);
if (ret < 0) {
@@ -985,10 +988,82 @@
return status;
}
+static int ocfs2_setattr_newsize(struct ocfs2_super *osb, struct inode *inode,
+ u64 newsize)
+{
+ u64 bytes_added = 0;
+ int status;
+
+ if (i_size_read(inode) > newsize) {
+ status = ocfs2_truncate_file(osb, newsize, inode);
+ /* should this be translating errnos to enospc? it
+ * used to.. */
+ if (status < 0 && status != -ENOSPC)
+ mlog_errno(status);
+ /* _truncate_file calls _set_file_size which updates
+ * mmu_private and i_size, we're done.. */
+ goto out;
+ }
+
+ /*
+ * protect the pages that ocfs2_zero_extend is going to
+ * be pulling into the page cache.. we do this before the
+ * metadata extend so that we don't get into the situation
+ * where we've extended the metadata but can't get the data
+ * lock to zero.
+ */
+ status = ocfs2_data_lock(inode, 1);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ status = ocfs2_extend_file(osb, inode, newsize, &bytes_added);
+ if (status < 0 && (!bytes_added)) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ status = -ENOSPC;
+ goto out_unlock;
+ }
+
+ /* partial extend, we continue with what we've got. */
+ if (status < 0
+ && status != -ENOSPC
+ && status != -EINTR
+ && status != -ERESTARTSYS)
+ mlog(ML_ERROR,
+ "status return of %d extending inode "
+ "%"MLFu64"\n", status,
+ OCFS2_I(inode)->ip_blkno);
+ status = 0;
+
+ newsize = bytes_added + i_size_read(inode);
+ if (bytes_added)
+ ocfs2_update_inode_size(inode, newsize);
+
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+ spin_lock(&OCFS2_I(inode)->ip_lock);
+ if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_OPEN_DIRECT) {
+ /* This is a total broken hack for O_DIRECT crack */
+ OCFS2_I(inode)->ip_mmu_private = i_size_read(inode);
+ }
+ spin_unlock(&OCFS2_I(inode)->ip_lock);
+#endif
+
+ status = ocfs2_zero_extend(inode);
+ if (status < 0)
+ mlog_errno(status);
+
+out_unlock:
+ ocfs2_data_unlock(inode, 1);
+out:
+ return status;
+}
+
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
{
int status = 0;
- u64 newsize, bytes_added;
+ u64 newsize;
struct inode *inode = dentry->d_inode;
struct super_block *sb = inode->i_sb;
struct ocfs2_super *osb = OCFS2_SB(sb);
@@ -1033,48 +1108,9 @@
if (S_ISREG(inode->i_mode) &&
attr->ia_valid & ATTR_SIZE &&
newsize != i_size_read(inode)) {
- bytes_added = 0;
-
- if (i_size_read(inode) > newsize)
- status = ocfs2_truncate_file(osb, newsize, inode);
- else
- status = ocfs2_extend_file(osb, inode, newsize,
- &bytes_added);
- if (status < 0 && (!bytes_added)) {
- if (status != -ENOSPC)
- mlog_errno(status);
- status = -ENOSPC;
+ status = ocfs2_setattr_newsize(osb, inode, newsize);
+ if (status < 0)
goto bail_unlock;
- }
-
- /* partial extend, we continue with what we've got. */
- if (status < 0
- && status != -ENOSPC
- && status != -EINTR
- && status != -ERESTARTSYS)
- mlog(ML_ERROR,
- "status return of %d extending inode "
- "%"MLFu64"\n", status,
- OCFS2_I(inode)->ip_blkno);
- status = 0;
-
- newsize = bytes_added + i_size_read(inode);
- if (bytes_added)
- ocfs2_update_inode_size(inode, newsize);
-
-#ifdef OCFS2_ORACORE_WORKAROUNDS
- spin_lock(&OCFS2_I(inode)->ip_lock);
- if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_OPEN_DIRECT) {
- /* This is a total broken hack for O_DIRECT crack */
- OCFS2_I(inode)->ip_mmu_private = i_size_read(inode);
- }
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-#endif
- status = ocfs2_zero_extend(inode);
- if (status < 0) {
- mlog_errno(status);
- goto bail_unlock;
- }
}
handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
Modified: branches/ocfs2-1.2/fs/ocfs2/mmap.c
===================================================================
--- branches/ocfs2-1.2/fs/ocfs2/mmap.c 2006-04-26 21:19:41 UTC (rev 2910)
+++ branches/ocfs2-1.2/fs/ocfs2/mmap.c 2006-04-26 22:22:29 UTC (rev 2911)
@@ -337,7 +337,7 @@
* to one we haven't then we mark this node in the ctxt so that
* we'll return to it in a future after, say, hitting last_inode
* or EIOCBRETRY in lock_buffer_inodes */
- if (pos && pos->ba_locked && binode)
+ if (pos && pos->ba_meta_locked && pos->ba_data_locked && binode)
ctxt->b_next_unlocked = binode;
return binode;
@@ -353,14 +353,14 @@
int ocfs2_lock_buffer_inodes(struct ocfs2_buffer_lock_ctxt *ctxt,
struct inode *last_inode)
{
- int status, data_level;
+ int status;
struct ocfs2_backing_inode *binode = NULL;
struct inode *inode;
while((binode = ocfs2_next_unlocked(ctxt, last_inode, binode))) {
/* the tricksy caller might have locked inodes themselves
* between calls. */
- if (binode->ba_locked)
+ if (binode->ba_meta_locked && binode->ba_data_locked)
continue;
inode = binode->ba_inode;
@@ -379,10 +379,9 @@
binode->ba_meta_locked = 1;
}
- /* ba_lock_data isn't set for direct io */
- if (binode->ba_lock_data) {
- data_level = binode->ba_lock_data_level;
- status = ocfs2_data_lock(inode, data_level);
+ if (!binode->ba_data_locked) {
+ status = ocfs2_data_lock(inode,
+ binode->ba_lock_data_level);
if (status < 0) {
if (status == -EIOCBRETRY)
goto bail;
@@ -397,9 +396,9 @@
mlog_errno(status);
goto bail;
}
+ binode->ba_data_locked = 1;
}
ocfs2_add_io_marker(inode, &binode->ba_task);
- binode->ba_locked = 1;
}
status = 0;
@@ -421,11 +420,11 @@
ocfs2_del_io_marker(binode->ba_inode, &binode->ba_task);
- if (binode->ba_locked && binode->ba_lock_data)
+ if (binode->ba_data_locked)
ocfs2_data_unlock(binode->ba_inode,
binode->ba_lock_data_level);
- if (binode->ba_locked || binode->ba_meta_locked)
+ if (binode->ba_meta_locked)
ocfs2_meta_unlock(binode->ba_inode,
binode->ba_lock_meta_level);
@@ -522,19 +521,22 @@
struct ocfs2_buffer_lock_ctxt *ctxt)
{
int ret = 0;
- struct ocfs2_super *osb = NULL;
struct dentry *dentry = filp->f_dentry;
struct inode *inode = dentry->d_inode;
+ struct ocfs2_super *osb = osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_backing_inode *ba;
int status;
- int level = filp->f_flags & O_APPEND;
loff_t saved_ppos;
u64 bytes_added = 0;
- osb = OCFS2_SB(inode->i_sb);
+ /*
+ * the target inode is different from the other inodes. in o_direct it
+ * gets a PR data lock (see below) and when appending it gets an EX
+ * meta lock. It's locked manually here though the backing_inode
+ * fields are maintained while doing so so that unlock does the
+ * right thing.
+ */
- /* the target inode is different from the other inodes. in o_direct it
- * doesn't get a data lock and when appending it gets a level 1 meta
- * lock. we use target_binode to set its flags accordingly */
if (info->wl_target_binode == NULL) {
ret = ocfs2_setup_io_locks(inode->i_sb, inode,
(char __user *) buf,
@@ -547,6 +549,8 @@
}
}
+ ba = info->wl_target_binode;
+
/* This will lock everyone in the context who's order puts
* them before us. */
if (!info->wl_have_before) {
@@ -558,8 +562,6 @@
goto bail;
}
info->wl_have_before = 1;
- /* we're writing so get an ex data cluster lock */
- info->wl_target_binode->ba_lock_data_level = 1;
}
if (!info->wl_have_i_mutex) {
@@ -567,21 +569,21 @@
info->wl_have_i_mutex = 1;
}
-lock:
- if (!info->wl_have_target_meta) {
- status = ocfs2_meta_lock(inode, NULL, NULL, level);
+ ba->ba_lock_data_level = 1;
+ if (filp->f_flags & O_APPEND)
+ ba->ba_lock_meta_level = 1;
+
+retry_meta_lock:
+ if (!ba->ba_meta_locked) {
+ status = ocfs2_meta_lock(inode, NULL, NULL,
+ ba->ba_lock_meta_level);
if (status < 0) {
mlog_errno(status);
ret = status;
goto bail;
}
- info->wl_have_target_meta = 1;
+ ba->ba_meta_locked = 1;
}
- /* to handle extending writes, we do a bit of our own locking
- * here, but we setup the ctxt do unlock for us (as well as
- * handle locking everything else. */
- if (level)
- info->wl_target_binode->ba_lock_meta_level = 1;
/* Clear suid / sgid if necessary. We do this here instead of
* later in the write path because remove_suid() calls
@@ -592,22 +594,20 @@
* can be lost via setattr during extending writes (we set
* inode->i_size at the end of a write. */
if (ocfs2_write_should_remove_suid(inode)) {
- if (!level) {
+ if (ba->ba_lock_meta_level == 0) {
mlog(0, "inode %"MLFu64", had a PR, looping back for "
"EX so we can remove SUID\n",
OCFS2_I(inode)->ip_blkno);
- ocfs2_meta_unlock(inode, level);
- info->wl_have_target_meta = 0;
- level = 1;
- goto lock;
+ ocfs2_meta_unlock(inode, ba->ba_lock_meta_level);
+ ba->ba_meta_locked = 0;
+ ba->ba_lock_meta_level = 1;
+ goto retry_meta_lock;
}
status = ocfs2_write_remove_suid(inode);
if (status < 0) {
mlog_errno(status);
ret = status;
- ocfs2_meta_unlock(inode, level);
- info->wl_have_target_meta = 0;
goto bail;
}
}
@@ -650,27 +650,58 @@
mlog(0, "O_DIRECT\n");
}
- info->wl_target_binode->ba_lock_data = info->wl_do_direct_io ? 0 : 1;
+ /*
+ * We get PR data locks even for O_DIRECT. This allows concurrent
+ * O_DIRECT writes but doesn't let O_DIRECT writes race with
+ * extending and buffered zeroing writes race. If they did race
+ * then the buffered zeroing could be written back after the O_DIRECT
+ * write and overwrite it. It's one thing to tell people not to
+ * mix buffered and O_DIRECT writes, but expecting them to understand
+ * that file extension is also an implicit buffered write is
+ * too much. By getting the PR we force writeback of the buffered
+ * zeroing before proceeding.
+ */
+ if (info->wl_do_direct_io && !(filp->f_flags & O_APPEND))
+ ba->ba_lock_data_level = 0;
info->wl_newsize = count + saved_ppos;
if (filp->f_flags & O_APPEND)
info->wl_newsize = count + i_size_read(inode);
+ /* get the locking straight for the extending case */
+ if (info->wl_newsize > i_size_read(inode)) {
+ if (ba->ba_lock_meta_level == 0) {
+ mlog(0, "inode %"MLFu64", had a PR meta, looping back "
+ "for EX\n", OCFS2_I(inode)->ip_blkno);
+ ocfs2_meta_unlock(inode, ba->ba_lock_meta_level);
+ ba->ba_meta_locked = 0;
+ ba->ba_lock_meta_level = 1;
+ goto retry_meta_lock;
+ }
+ ba->ba_lock_data_level = 1;
+ }
+
+ /*
+ * get the data lock before extending so that we can be sure
+ * that we'll be able to zero under lock coverage. This does
+ * get an EX data lock for O_DIRECT but as long as zeroing is
+ * buffered we really must hold the lock while manipulating the
+ * page cache.
+ */
+ if (!ba->ba_data_locked) {
+ status = ocfs2_data_lock(inode, ba->ba_lock_data_level);
+ if (status < 0) {
+ mlog_errno(status);
+ ret = status;
+ goto bail;
+ }
+ ba->ba_data_locked = 1;
+ }
+
mlog(0, "ppos=%lld newsize=%"MLFu64" cursize=%lld\n", saved_ppos,
info->wl_newsize, i_size_read(inode));
if (info->wl_newsize > i_size_read(inode)) {
- if (!level) {
- /* we want an extend, but need a higher
- * level cluster lock. */
- mlog(0, "inode %"MLFu64", had a PR, looping back "
- "for EX\n", OCFS2_I(inode)->ip_blkno);
- ocfs2_meta_unlock(inode, level);
- info->wl_have_target_meta = 0;
- level = 1;
- goto lock;
- }
-
mlog(0, "Writing at EOF, will need more allocation: "
"i_size=%lld, need=%"MLFu64"\n", i_size_read(inode),
info->wl_newsize);
@@ -691,9 +722,6 @@
*ppos, info->wl_newsize);
}
ret = status;
-
- info->wl_have_target_meta = 0;
- ocfs2_meta_unlock(inode, level);
goto bail;
}
@@ -720,24 +748,6 @@
* can stuff *ppos back. */
*ppos = saved_ppos;
- if (!info->wl_do_direct_io && !info->wl_have_data_lock) {
- status = ocfs2_data_lock(inode, 1);
- if (status < 0) {
- mlog_errno(status);
- ret = status;
-
- info->wl_have_target_meta = 0;
- ocfs2_meta_unlock(inode, level);
- goto bail;
- }
- info->wl_have_data_lock = 1;
- }
-
- /* Alright, fool the io locking stuff into thinking it's
- * handled our inode for us. We can now count on it to do the
- * unlock for us. */
- info->wl_target_binode->ba_locked = 1;
-
/* This will lock everyone who's order puts them *after* our inode. */
ret = ocfs2_lock_buffer_inodes(ctxt, NULL);
if (ret < 0) {
@@ -745,7 +755,6 @@
mlog_errno(ret);
goto bail;
}
-
bail:
mlog_exit(ret);
return ret;
Modified: branches/ocfs2-1.2/fs/ocfs2/mmap.h
===================================================================
--- branches/ocfs2-1.2/fs/ocfs2/mmap.h 2006-04-26 21:19:41 UTC (rev 2910)
+++ branches/ocfs2-1.2/fs/ocfs2/mmap.h 2006-04-26 22:22:29 UTC (rev 2911)
@@ -69,9 +69,8 @@
struct ocfs2_backing_inode {
struct rb_node ba_node;
struct inode *ba_inode;
- unsigned ba_meta_locked:1, /* meta is locked */
- ba_locked:1, /* both are locked */
- ba_lock_data:1, /* should lock data */
+ unsigned ba_meta_locked:1,
+ ba_data_locked:1,
ba_lock_meta_level:1,
ba_lock_data_level:1;
struct ocfs2_io_marker ba_task;
@@ -115,9 +114,7 @@
wl_do_direct_io:1,
wl_have_i_mutex:1,
wl_unlock_ctxt:1,
- wl_have_before:1,
- wl_have_target_meta:1,
- wl_have_data_lock:1;
+ wl_have_before:1;
struct ocfs2_backing_inode *wl_target_binode;
};
More information about the Ocfs2-commits
mailing list