[Ocfs2-commits] mfasheh commits r2911 - branches/ocfs2-1.2/fs/ocfs2

svn-commits at oss.oracle.com
Wed Apr 26 17:22:30 CDT 2006


Author: mfasheh
Date: 2006-04-26 17:22:29 -0500 (Wed, 26 Apr 2006)
New Revision: 2911

Modified:
   branches/ocfs2-1.2/fs/ocfs2/aio.c
   branches/ocfs2-1.2/fs/ocfs2/aops.c
   branches/ocfs2-1.2/fs/ocfs2/dlmglue.c
   branches/ocfs2-1.2/fs/ocfs2/file.c
   branches/ocfs2-1.2/fs/ocfs2/mmap.c
   branches/ocfs2-1.2/fs/ocfs2/mmap.h
Log:
We need to take a data lock around extends to protect the pages that
ocfs2_zero_extend is going to be pulling into the page cache.

Signed-off-by: zab

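The core of the change, as a minimal standalone sketch (this is not OCFS2 code: the pthread rwlock stands in for the cluster data lock, the char array for the page cache, and every name below is invented for illustration). The point is only the ordering the patch enforces: take the EX data lock, extend, zero the new region while still holding the lock, then drop it.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_rwlock_t data_lock = PTHREAD_RWLOCK_INITIALIZER;
static char page_cache[64];	/* the pages ocfs2_zero_extend would touch */
static size_t file_size;	/* i_size / ip_mmu_private stand-in */

static void extend_to(size_t newsize)
{
	size_t old;

	/* EX data lock first, so nothing else can fault in or write the
	 * new region while it still holds stale data. */
	pthread_rwlock_wrlock(&data_lock);

	old = file_size;
	file_size = newsize;				/* "metadata" extend */
	memset(page_cache + old, 0, newsize - old);	/* buffered zeroing */

	pthread_rwlock_unlock(&data_lock);
}

int main(void)
{
	extend_to(16);
	printf("extended to %zu bytes, zeroed under the data lock\n", file_size);
	return 0;
}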


Modified: branches/ocfs2-1.2/fs/ocfs2/aio.c
===================================================================
--- branches/ocfs2-1.2/fs/ocfs2/aio.c	2006-04-26 21:19:41 UTC (rev 2910)
+++ branches/ocfs2-1.2/fs/ocfs2/aio.c	2006-04-26 22:22:29 UTC (rev 2911)
@@ -258,7 +258,7 @@
 
 		okp->kp_ctxt.b_cb = ocfs2_aio_kick;
 		okp->kp_ctxt.b_cb_data = (unsigned long)iocb;
-		target_binode->ba_lock_data = filp->f_flags & O_DIRECT ? 0 : 1;
+		target_binode->ba_lock_data_level = 0;
 	}
 
 	/* this might return EIOCBRETRY and we'll come back again to
@@ -334,6 +334,11 @@
 						    &iocb->ki_pos,
 						    &okp->kp_info,
 						    &okp->kp_ctxt);
+		/*
+		 * XXX this looks totally broken.. what if _maybe_extend
+		 * returns EIOCBRETRY?  it'll never be called again and the
+		 * op will simply proceed without locking?
+		 */
 		okp->kp_have_write_locks = 1;
 		if (okp->kp_info.wl_extended) {
 			/*

Modified: branches/ocfs2-1.2/fs/ocfs2/aops.c
===================================================================
--- branches/ocfs2-1.2/fs/ocfs2/aops.c	2006-04-26 21:19:41 UTC (rev 2910)
+++ branches/ocfs2-1.2/fs/ocfs2/aops.c	2006-04-26 22:22:29 UTC (rev 2911)
@@ -237,13 +237,19 @@
 static int ocfs2_prepare_write(struct file *file, struct page *page,
 		unsigned from, unsigned to)
 {
+	struct inode *inode = page->mapping->host;
 	int ret;
 
 	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
 
+	/* we can't bring in pages without holding the data lock */
+	mlog_bug_on_msg(OCFS2_I(inode)->ip_data_lockres.l_ex_holders == 0,
+			"inode %llu holders %u\n",
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			OCFS2_I(inode)->ip_data_lockres.l_ex_holders);
+
 	ret = cont_prepare_write(page, from, to, ocfs2_get_block,
-		&(OCFS2_I(page->mapping->host)->ip_mmu_private));
-
+				 &(OCFS2_I(inode)->ip_mmu_private));
 	mlog_exit(ret);
 
 	return ret;

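A sketch of the shape of the new check in ocfs2_prepare_write(), assuming a simplified stand-in for the lock resource (ex_holders mirrors ip_data_lockres.l_ex_holders; everything else below is invented for the example):

#include <assert.h>
#include <stdio.h>

struct data_lockres {
	unsigned int ex_holders;
};

static void prepare_write(struct data_lockres *lockres)
{
	/* pulling pages into the page cache without the EX data lock would
	 * break the cluster-wide invariant, so treat it as a bug */
	assert(lockres->ex_holders != 0);	/* mlog_bug_on_msg() in the patch */
	/* ... the cont_prepare_write() equivalent would run here ... */
}

int main(void)
{
	struct data_lockres lockres = { .ex_holders = 1 };
	prepare_write(&lockres);
	puts("prepare_write ran with the data lock held");
	return 0;
}
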
Modified: branches/ocfs2-1.2/fs/ocfs2/dlmglue.c
===================================================================
--- branches/ocfs2-1.2/fs/ocfs2/dlmglue.c	2006-04-26 21:19:41 UTC (rev 2910)
+++ branches/ocfs2-1.2/fs/ocfs2/dlmglue.c	2006-04-26 22:22:29 UTC (rev 2911)
@@ -467,8 +467,19 @@
 	 * information is already up to date. Convert from NL to
 	 * *anything* however should mark ourselves as needing an
 	 * update */
-	if (lockres->l_level == LKM_NLMODE)
+	if (lockres->l_level == LKM_NLMODE) {
 		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+		if (lockres->l_type == OCFS2_LOCK_TYPE_DATA) {
+			/* we're just now getting the data lock so there
+			 * had better not be any pages in the page cache */
+			struct inode *inode = ocfs2_lock_res_inode(lockres);
+			struct address_space *mapping = inode->i_mapping;
+			mlog_bug_on_msg(mapping->nrpages,
+					"inode %llu nrpages %lu\n",
+				(unsigned long long)OCFS2_I(inode)->ip_blkno,
+					mapping->nrpages);
+		}
+	}
 
 	lockres->l_level = lockres->l_requested;
 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);

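The dlmglue assertion can be modelled the same way: a node whose data lock sat at NL must have nothing cached, so nrpages has to be zero when the lock is converted back up. A hedged stand-alone sketch with invented names (nrpages mirrors mapping->nrpages):

#include <assert.h>
#include <stdio.h>

enum lock_level { LKM_NL, LKM_PR, LKM_EX };

struct cached_inode {
	enum lock_level data_level;
	unsigned long   nrpages;
};

static void upconvert_data_lock(struct cached_inode *ci, enum lock_level to)
{
	/* we're just now getting the data lock, so nothing may be cached */
	if (ci->data_level == LKM_NL)
		assert(ci->nrpages == 0);	/* mlog_bug_on_msg() in the patch */
	ci->data_level = to;
}

int main(void)
{
	struct cached_inode ci = { .data_level = LKM_NL, .nrpages = 0 };
	upconvert_data_lock(&ci, LKM_EX);
	puts("upconverted from NL with an empty page cache");
	return 0;
}
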
Modified: branches/ocfs2-1.2/fs/ocfs2/file.c
===================================================================
--- branches/ocfs2-1.2/fs/ocfs2/file.c	2006-04-26 21:19:41 UTC (rev 2910)
+++ branches/ocfs2-1.2/fs/ocfs2/file.c	2006-04-26 22:22:29 UTC (rev 2911)
@@ -246,8 +246,11 @@
 	up_read(&OCFS2_I(inode)->ip_alloc_sem);
 
 bail:
-	/* we might have to finish up extentions that were performed before
-	 * an error was returned by, say, data locking */
+	/* 
+	 * if this write created a hole then write zeros into it.. wl_extended
+	 * is only set if we got the data lock so the buffered zero writing
+	 * will have lock coverage.  This must be done before unlocking.
+	 */
 	if (info.wl_extended)
 		ocfs2_file_finish_extension(inode, info.wl_newsize,
 					    info.wl_do_direct_io);
@@ -307,7 +310,7 @@
 		goto bail;
 	}
 
-	target_binode->ba_lock_data = (filp->f_flags & O_DIRECT) ? 0 : 1;
+	target_binode->ba_lock_data_level = 0;
 
 	ret = ocfs2_lock_buffer_inodes(&ctxt, NULL);
 	if (ret < 0) {
@@ -985,10 +988,82 @@
 	return status;
 }
 
+static int ocfs2_setattr_newsize(struct ocfs2_super *osb, struct inode *inode,
+				 u64 newsize)
+{
+	u64 bytes_added = 0;
+	int status;
+
+	if (i_size_read(inode) > newsize) {
+		status = ocfs2_truncate_file(osb, newsize, inode);
+		/* should this be translating errnos to enospc?  it
+		 * used to.. */
+		if (status < 0 && status != -ENOSPC)
+			mlog_errno(status);
+		/* _truncate_file calls _set_file_size which updates
+		 * mmu_private and i_size, we're done.. */
+		goto out;
+	}
+
+	/* 
+	 * protect the pages that ocfs2_zero_extend is going to
+	 * be pulling into the page cache.. we do this before the
+	 * metadata extend so that we don't get into the situation
+	 * where we've extended the metadata but can't get the data
+	 * lock to zero.
+	 */
+	status = ocfs2_data_lock(inode, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_extend_file(osb, inode, newsize, &bytes_added);
+	if (status < 0 && (!bytes_added)) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		status = -ENOSPC;
+		goto out_unlock;
+	}
+
+	/* partial extend, we continue with what we've got. */
+	if (status < 0
+	    && status != -ENOSPC
+	    && status != -EINTR
+	    && status != -ERESTARTSYS)
+		mlog(ML_ERROR,
+		     "status return of %d extending inode "
+		     "%"MLFu64"\n", status,
+		     OCFS2_I(inode)->ip_blkno);
+	status = 0;
+
+	newsize = bytes_added + i_size_read(inode);
+	if (bytes_added)
+		ocfs2_update_inode_size(inode, newsize);
+
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_OPEN_DIRECT) {
+		/* This is a total broken hack for O_DIRECT crack */
+		OCFS2_I(inode)->ip_mmu_private = i_size_read(inode);
+	}
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+#endif
+
+	status = ocfs2_zero_extend(inode);
+	if (status < 0)
+		mlog_errno(status);
+
+out_unlock:
+	ocfs2_data_unlock(inode, 1);
+out:
+	return status;
+}
+
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	int status = 0;
-	u64 newsize, bytes_added;
+	u64 newsize;
 	struct inode *inode = dentry->d_inode;
 	struct super_block *sb = inode->i_sb;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
@@ -1033,48 +1108,9 @@
 	if (S_ISREG(inode->i_mode) &&
 	    attr->ia_valid & ATTR_SIZE &&
 	    newsize != i_size_read(inode)) {
-		bytes_added = 0;
-
-		if (i_size_read(inode) > newsize)
-			status = ocfs2_truncate_file(osb, newsize, inode);
-		else
-			status = ocfs2_extend_file(osb, inode, newsize,
-						   &bytes_added);
-		if (status < 0 && (!bytes_added)) {
-			if (status != -ENOSPC)
-				mlog_errno(status);
-			status = -ENOSPC;
+		status = ocfs2_setattr_newsize(osb, inode, newsize);
+		if (status < 0)
 			goto bail_unlock;
-		}
-
-		/* partial extend, we continue with what we've got. */
-		if (status < 0
-		    && status != -ENOSPC
-		    && status != -EINTR
-		    && status != -ERESTARTSYS)
-			mlog(ML_ERROR,
-			     "status return of %d extending inode "
-			     "%"MLFu64"\n", status,
-			     OCFS2_I(inode)->ip_blkno);
-		status = 0;
-
-		newsize = bytes_added + i_size_read(inode);
-		if (bytes_added)
-			ocfs2_update_inode_size(inode, newsize);
-
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-		spin_lock(&OCFS2_I(inode)->ip_lock);
-		if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_OPEN_DIRECT) {
-			/* This is a total broken hack for O_DIRECT crack */
-			OCFS2_I(inode)->ip_mmu_private = i_size_read(inode);
-		}
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
-#endif
-		status = ocfs2_zero_extend(inode);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail_unlock;
-		}
 	}
 
 	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);

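One detail ocfs2_setattr_newsize() preserves from the old inline code is the partial-extend policy: if the allocator managed to add some space before failing, carry on with what was added rather than failing the whole setattr. A small userspace model of just that policy (extend() and its behaviour are invented for the sketch):

#include <errno.h>
#include <stdio.h>

static int extend(unsigned long long want, unsigned long long *added)
{
	*added = want / 2;	/* pretend we ran out of space halfway */
	return -ENOSPC;
}

int main(void)
{
	unsigned long long added = 0;
	int status = extend(4096, &added);

	if (status < 0 && !added)
		return 1;	/* nothing was added: report the failure */

	/* partial extend: note the error, then carry on and zero the
	 * region we did get, as the patched code does. */
	status = 0;
	printf("continuing with %llu bytes added\n", added);
	return status;
}
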
Modified: branches/ocfs2-1.2/fs/ocfs2/mmap.c
===================================================================
--- branches/ocfs2-1.2/fs/ocfs2/mmap.c	2006-04-26 21:19:41 UTC (rev 2910)
+++ branches/ocfs2-1.2/fs/ocfs2/mmap.c	2006-04-26 22:22:29 UTC (rev 2911)
@@ -337,7 +337,7 @@
 	 * to one we haven't then we mark this node in the ctxt so that
 	 * we'll return to it in a future after, say, hitting last_inode
 	 * or EIOCBRETRY in lock_buffer_inodes */
-	if (pos && pos->ba_locked && binode)
+	if (pos && pos->ba_meta_locked && pos->ba_data_locked && binode)
 		ctxt->b_next_unlocked = binode;
 
 	return binode;
@@ -353,14 +353,14 @@
 int ocfs2_lock_buffer_inodes(struct ocfs2_buffer_lock_ctxt *ctxt,
 			     struct inode *last_inode)
 {
-	int status, data_level;
+	int status;
 	struct ocfs2_backing_inode *binode = NULL;
 	struct inode *inode;
 
 	while((binode = ocfs2_next_unlocked(ctxt, last_inode, binode))) {
 		/* the tricksy caller might have locked inodes themselves
 		 * between calls. */
-		if (binode->ba_locked)
+		if (binode->ba_meta_locked && binode->ba_data_locked)
 			continue;
 		inode = binode->ba_inode;
 
@@ -379,10 +379,9 @@
 			binode->ba_meta_locked = 1;
 		}
 
-		/* ba_lock_data isn't set for direct io */
-		if (binode->ba_lock_data) {
-			data_level = binode->ba_lock_data_level;
-			status = ocfs2_data_lock(inode, data_level);
+		if (!binode->ba_data_locked) {
+			status = ocfs2_data_lock(inode,
+						 binode->ba_lock_data_level);
 			if (status < 0) {
 				if (status == -EIOCBRETRY)
 					goto bail;
@@ -397,9 +396,9 @@
 				mlog_errno(status);
 				goto bail;
 			}
+			binode->ba_data_locked = 1;
 		}
 		ocfs2_add_io_marker(inode, &binode->ba_task);
-		binode->ba_locked = 1;
 	}
 
 	status = 0;
@@ -421,11 +420,11 @@
 
 		ocfs2_del_io_marker(binode->ba_inode, &binode->ba_task);
 
-		if (binode->ba_locked && binode->ba_lock_data)
+		if (binode->ba_data_locked)
 			ocfs2_data_unlock(binode->ba_inode,
 					  binode->ba_lock_data_level);
 
-		if (binode->ba_locked || binode->ba_meta_locked)
+		if (binode->ba_meta_locked)
 			ocfs2_meta_unlock(binode->ba_inode,
 					  binode->ba_lock_meta_level);
 
@@ -522,19 +521,22 @@
 				      struct ocfs2_buffer_lock_ctxt *ctxt)
 {
 	int ret = 0;
-	struct ocfs2_super *osb = NULL;
 	struct dentry *dentry = filp->f_dentry;
 	struct inode *inode = dentry->d_inode;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_backing_inode *ba;
 	int status;
-	int level = filp->f_flags & O_APPEND;
 	loff_t saved_ppos;
 	u64 bytes_added = 0;
 
-	osb = OCFS2_SB(inode->i_sb);
+	/* 
+	 * the target inode is different from the other inodes.  in o_direct it
+	 * gets a PR data lock (see below) and when appending it gets an EX
+	 * meta lock.  It's locked manually here, though the backing_inode
+	 * fields are maintained as we go so that unlock does the
+	 * right thing.
+	 */
 
-	/* the target inode is different from the other inodes.  in o_direct it
-	 * doesn't get a data lock and when appending it gets a level 1 meta
-	 * lock.  we use target_binode to set its flags accordingly */
 	if (info->wl_target_binode == NULL) {
 		ret = ocfs2_setup_io_locks(inode->i_sb, inode,
 					   (char __user *) buf,
@@ -547,6 +549,8 @@
 		}
 	}
 
+	ba = info->wl_target_binode;
+
 	/* This will lock everyone in the context who's order puts
 	 * them before us. */
 	if (!info->wl_have_before) {
@@ -558,8 +562,6 @@
 			goto bail;
 		}
 		info->wl_have_before = 1;
-		/* we're writing so get an ex data cluster lock */
-		info->wl_target_binode->ba_lock_data_level = 1;
 	}
 
 	if (!info->wl_have_i_mutex) {
@@ -567,21 +569,21 @@
 		info->wl_have_i_mutex = 1;
 	}
 
-lock:
-	if (!info->wl_have_target_meta) {
-		status = ocfs2_meta_lock(inode, NULL, NULL, level);
+	ba->ba_lock_data_level = 1;
+	if (filp->f_flags & O_APPEND)
+		ba->ba_lock_meta_level = 1;
+
+retry_meta_lock:
+	if (!ba->ba_meta_locked) {
+		status = ocfs2_meta_lock(inode, NULL, NULL,
+					 ba->ba_lock_meta_level);
 		if (status < 0) {
 			mlog_errno(status);
 			ret = status;
 			goto bail;
 		}
-		info->wl_have_target_meta = 1;
+		ba->ba_meta_locked = 1;
 	}
-	/* to handle extending writes, we do a bit of our own locking
-	 * here, but we setup the ctxt do unlock for us (as well as
-	 * handle locking everything else. */
-	if (level)
-		info->wl_target_binode->ba_lock_meta_level = 1;
 
 	/* Clear suid / sgid if necessary. We do this here instead of
 	 * later in the write path because remove_suid() calls
@@ -592,22 +594,20 @@
 	 * can be lost via setattr during extending writes (we set
 	 * inode->i_size at the end of a write. */
 	if (ocfs2_write_should_remove_suid(inode)) {
-		if (!level) {
+		if (ba->ba_lock_meta_level == 0) {
 			mlog(0, "inode %"MLFu64", had a PR, looping back for "
 			     "EX so we can remove SUID\n",
 			     OCFS2_I(inode)->ip_blkno);
-			ocfs2_meta_unlock(inode, level);
-			info->wl_have_target_meta = 0;
-			level = 1;
-			goto lock;
+			ocfs2_meta_unlock(inode, ba->ba_lock_meta_level);
+			ba->ba_meta_locked = 0;
+			ba->ba_lock_meta_level = 1;
+			goto retry_meta_lock;
 		}
 
 		status = ocfs2_write_remove_suid(inode);
 		if (status < 0) {
 			mlog_errno(status);
 			ret = status;
-			ocfs2_meta_unlock(inode, level);
-			info->wl_have_target_meta = 0;
 			goto bail;
 		}
 	}
@@ -650,27 +650,58 @@
 		mlog(0, "O_DIRECT\n");
 	}
 
-	info->wl_target_binode->ba_lock_data = info->wl_do_direct_io ? 0 : 1;
+	/*
+	 * We get PR data locks even for O_DIRECT.  This allows concurrent
+	 * O_DIRECT writes but doesn't let them race with the buffered
+	 * zeroing done by an extending write.  If they did race
+	 * then the buffered zeroing could be written back after the O_DIRECT 
+	 * write and overwrite it.  It's one thing to tell people not to 
+	 * mix buffered and O_DIRECT writes, but expecting them to understand
+	 * that file extension is also an implicit buffered write is 
+	 * too much.  By getting the PR we force writeback of the buffered
+	 * zeroing before proceeding.
+	 */
+	if (info->wl_do_direct_io && !(filp->f_flags & O_APPEND))
+		ba->ba_lock_data_level = 0;
 
 	info->wl_newsize = count + saved_ppos;
 	if (filp->f_flags & O_APPEND)
 		info->wl_newsize = count + i_size_read(inode);
 
+	/* get the locking straight for the extending case */
+	if (info->wl_newsize > i_size_read(inode)) {
+		if (ba->ba_lock_meta_level == 0) {
+			mlog(0, "inode %"MLFu64", had a PR meta, looping back "
+			     "for EX\n", OCFS2_I(inode)->ip_blkno);
+			ocfs2_meta_unlock(inode, ba->ba_lock_meta_level);
+			ba->ba_meta_locked = 0;
+			ba->ba_lock_meta_level = 1;
+			goto retry_meta_lock;
+		}
+		ba->ba_lock_data_level = 1;
+	}
+
+	/*
+	 * get the data lock before extending so that we can be sure
+	 * that we'll be able to zero under lock coverage.  This does
+	 * get an EX data lock for O_DIRECT but as long as zeroing is
+	 * buffered we really must hold the lock while manipulating the
+	 * page cache.
+	 */
+	if (!ba->ba_data_locked) {
+		status = ocfs2_data_lock(inode, ba->ba_lock_data_level);
+		if (status < 0) {
+			mlog_errno(status);
+			ret = status;
+			goto bail;
+		}
+		ba->ba_data_locked = 1;
+	}
+
 	mlog(0, "ppos=%lld newsize=%"MLFu64" cursize=%lld\n", saved_ppos,
 	     info->wl_newsize, i_size_read(inode));
 
 	if (info->wl_newsize > i_size_read(inode)) {
-		if (!level) {
-			/* we want an extend, but need a higher
-			 * level cluster lock. */
-			mlog(0, "inode %"MLFu64", had a PR, looping back "
-			     "for EX\n", OCFS2_I(inode)->ip_blkno);
-			ocfs2_meta_unlock(inode, level);
-			info->wl_have_target_meta = 0;
-			level = 1;
-			goto lock;
-		}
-
 		mlog(0, "Writing at EOF, will need more allocation: "
 		     "i_size=%lld, need=%"MLFu64"\n", i_size_read(inode),
 		     info->wl_newsize);
@@ -691,9 +722,6 @@
 				     *ppos, info->wl_newsize);
 			}
 			ret = status;
-
-			info->wl_have_target_meta = 0;
-			ocfs2_meta_unlock(inode, level);
 			goto bail;
 		}
 
@@ -720,24 +748,6 @@
 	 * can stuff *ppos back. */
 	*ppos = saved_ppos;
 
-	if (!info->wl_do_direct_io && !info->wl_have_data_lock) {
-		status = ocfs2_data_lock(inode, 1);
-		if (status < 0) {
-			mlog_errno(status);
-			ret = status;
-
-			info->wl_have_target_meta = 0;
-			ocfs2_meta_unlock(inode, level);
-			goto bail;
-		}
-		info->wl_have_data_lock = 1;
-	}
-
-	/* Alright, fool the io locking stuff into thinking it's
-	 * handled our inode for us. We can now count on it to do the
-	 * unlock for us. */
-	info->wl_target_binode->ba_locked = 1;
-
 	/* This will lock everyone who's order puts them *after* our inode. */
 	ret = ocfs2_lock_buffer_inodes(ctxt, NULL);
 	if (ret < 0) {
@@ -745,7 +755,6 @@
 			mlog_errno(ret);
 		goto bail;
 	}
-
 bail:
 	mlog_exit(ret);
 	return ret;

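The retry_meta_lock pattern above (take the meta lock at the recorded level, discover mid-way that EX is needed for suid removal or an extending write, drop it, raise ba_lock_meta_level and loop) reduces to the following control-flow model. Only the flow is modelled; the lock calls are shown as comments and the names are invented for the sketch:

#include <stdio.h>

struct backing {
	int meta_locked;	/* ba_meta_locked */
	int meta_level;		/* ba_lock_meta_level: 0 = PR, 1 = EX */
};

static void write_lock(struct backing *ba, int extending)
{
retry_meta_lock:
	if (!ba->meta_locked) {
		/* ocfs2_meta_lock(inode, NULL, NULL, ba->meta_level) */
		ba->meta_locked = 1;
		printf("meta locked at level %d\n", ba->meta_level);
	}

	if (extending && ba->meta_level == 0) {
		/* we want an extend but only hold a PR meta lock */
		/* ocfs2_meta_unlock(inode, ba->meta_level) */
		ba->meta_locked = 0;
		ba->meta_level = 1;
		goto retry_meta_lock;
	}
}

int main(void)
{
	struct backing ba = { 0, 0 };
	write_lock(&ba, 1);	/* extending write: ends up holding EX */
	return 0;
}
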
Modified: branches/ocfs2-1.2/fs/ocfs2/mmap.h
===================================================================
--- branches/ocfs2-1.2/fs/ocfs2/mmap.h	2006-04-26 21:19:41 UTC (rev 2910)
+++ branches/ocfs2-1.2/fs/ocfs2/mmap.h	2006-04-26 22:22:29 UTC (rev 2911)
@@ -69,9 +69,8 @@
 struct ocfs2_backing_inode {
 	struct rb_node           ba_node;
 	struct inode            *ba_inode;
-	unsigned		 ba_meta_locked:1, 	/* meta is locked */
-				 ba_locked:1,		/* both are locked */
-				 ba_lock_data:1,	/* should lock data */
+	unsigned		 ba_meta_locked:1,
+				 ba_data_locked:1,
 				 ba_lock_meta_level:1,
 				 ba_lock_data_level:1;
 	struct ocfs2_io_marker   ba_task;
@@ -115,9 +114,7 @@
 					wl_do_direct_io:1,
 					wl_have_i_mutex:1,
 					wl_unlock_ctxt:1,
-					wl_have_before:1,
-					wl_have_target_meta:1,
-					wl_have_data_lock:1;
+					wl_have_before:1;
 	struct ocfs2_backing_inode	*wl_target_binode;
 };
 

