[Ocfs2-devel] [PATCH 1/1] OCFS2: anti stale inode for nfs (V1).

Thu Dec 4 12:05:34 PST 2008

On Thu, Dec 04, 2008 at 06:58:08PM +0800, wengang wang wrote:
> this patch is to fix nfs getting stale inode bug.
> 
>         Right after we validate ih_blkno at the top we will use
> ilookup5() to see if we have the inode in memory.  If we have the inode
> in memory, we know we have the open lock, and the inode must be valid.
> We can check the generation number and look up the alias as we normally
> do.  
>         If the inode is not found via ilookup5() -- that is, it is not
> in memory -- we do something different.  First, we do a dirty read of
> the inode block.  We use that inode block to find it's suballocator 
> (i_suballoc_slot).  Then we take the cluster lock on that suballocator
> system file.  By locking that file, we will have ensured the other nodes
> have written out any changes.  We then check that suballocator to see if
> the inode bit is actually set (i_suballoc_bit).  If the bit is set, we know
> an inode is still valid on disk, and we then call ocfs2_iget().  If the bit
> is not set, we know the inode is not valid, and we return -ESTALE.  At the
> bottom of ocfs2_get_dentry() we can release the lock on the suballocator. 
> We either have the inode (via iget) or don't (-ESTALE).
> 
>         second change is in ocfs2_inode_lock_update().  we don't panic
> kernel when generation mismatches or the on disk inode is deleted. Instead,
> we return -ESTALE.
> 	the last change is that we change the lock order against inode its
> self and the suballcator. at above paragraph, we take locks in order of
> 	suballotor(in ocfs2_get_dentry())
> 	inode(in ocfs2_iget())
> while, the inode deleting procedure locks in reverse order. to avoid deadlock,
> we change the order of inode deleting procedure.
> 
> the patch is against 1.4 git.
> #please ignore the svn info in patch -- I used a local svn for version
> #management.

	This is definitely on the right track.

> +#define mlog_warnno(st) do {						\
> +	int _st = (st);							\
> +	mlog(ML_NOTICE, "status = %lld\n", (long long)_st);		\
> +} while (0)
> +

	I don't think we want mlog_warnno().  Really, it should just be
an mlog(0, ...) if anything.  We don't want to print for every stale
inode.

> @@ -48,23 +50,125 @@ struct ocfs2_inode_handle
>  static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp)
>  {
>  	struct ocfs2_inode_handle *handle = vobjp;
> -	struct inode *inode;
> +	struct ocfs2_super *osb = OCFS2_SB(sb);
>  	struct dentry *result;
> -
> +	struct inode *inode, *inode_alloc_inode;
> +	struct buffer_head *inode_bh = NULL, *alloc_bh = NULL;
> +	struct buffer_head *group_bh = NULL;
> +	struct ocfs2_dinode *inode_fe, *alloc_fe;
> +	struct ocfs2_group_desc *group;
> +	u64 blkno = handle->ih_blkno, bg_blkno;
> +	unsigned short suballoc_bit, suballoc_slot;
> +	int status;
> +	
>  	mlog_entry("(0x%p, 0x%p)\n", sb, handle);
>  
> -	if (handle->ih_blkno == 0) {
> -		mlog_errno(-ESTALE);
> +	if (blkno == 0) {
> +		mlog_warnno(-ESTALE);
>  		return ERR_PTR(-ESTALE);
>  	}
>  
> +	inode = ocfs2_ilookup(sb, ino_from_blkno(sb, blkno));
> +	/* found in-memory inode, goes to check generation */
> +	if (inode)
> +		goto check_gen;
> +
> +	/* dirty reads(hit disk) the inode to get suballoc_slot and
> +	 * suballoc_bit 
> +	 * */
> +	status = ocfs2_read_block(osb, blkno, &inode_bh, 0, NULL);
> +	if (status < 0) {
> +		mlog_errno(status);
> +		return ERR_PTR(status);
> +	}
> +
> +	inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
> +	if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
> +		mlog(0, "Invalid dinode #%llu: signature = %.*s\n",
> +		     (unsigned long long)blkno, 7, inode_fe->i_signature);
> +		status = -EINVAL;
> +		goto rls_bh;
> +	}
> +
> +	suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
> +	suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
> +
> +	/* checks suballoc_bit in bitmap is still SET */
> +	inode_alloc_inode =
> +		ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
> +					    suballoc_slot);
> +	if (!inode_alloc_inode) {
> +		status = -EEXIST;
> +		goto rls_bh;
> +	}
> +
> +	mutex_lock(&inode_alloc_inode->i_mutex);
> +	status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
> +	if (status < 0) 
> +		goto unlock_mutex;
> +
> +	alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
> +	BUG_ON((suballoc_bit + 1) >
> +			ocfs2_bits_per_group(&alloc_fe->id2.i_chain));

	I'd rather leave ocfs2_bits_per_group() static to suballoc.c.

> +
> +	bg_blkno = ocfs2_which_suballoc_group(blkno, suballoc_bit);
> +	status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED, 
> +				  inode_alloc_inode);
> +	if (status < 0) 
> +		goto inode_unlock;
> +
> +	group = (struct ocfs2_group_desc *) group_bh->b_data;
> +	status = ocfs2_check_group_descriptor(sb, alloc_fe, group);
> +	if (status < 0) 
> +		goto rls_group_bh;
> +
> +	/* if the bit is clear, it's a stale inode */
> +	if (!ocfs2_test_bit(suballoc_bit, (unsigned long *)group->bg_bitmap)) {
> +		status = -ESTALE;
> +		goto rls_group_bh;
> +	}
> +

	In fact, why not make this entire logic a new function in
suballoc.c called ocfs2_test_suballoc_bit().  You pass it a locked
suballocator inode, the blkno, and the suballoc_bit.  It returns the
status of the bit.

>  	inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0);
>  
> -	if (IS_ERR(inode))
> +	/* if suballoc slot changed since last ocfs2_read_block(), we may have
> +	 * the lock on a wrong suballoc inode. but if so, since the inode was 
> +	 * changed, the inode we want must be stale then. 
> +	 * while, we don't check that here(since ocfs2_iget() doesn't bring
> +	 * ocfs2_dinode to us, if we do check, we needs other code). instead,
> +	 * we check on generation later. that is because if the suballoc slot
> +	 * changed, the generation must changed(since they are in the same disk
> +	 * sector). 
> +	 * */ 

	inode's don't change suballoc inodes.  In fact, your call to
ocfs2_check_group_descriptor() will crash the filesystem if it
determines that the group descriptor and the suballoc inode don't match.
But that can't happen in a correctly functioning filesystem.

> +check_gen:
>  	if (handle->ih_generation != inode->i_generation) {
>  		iput(inode);
> +		mlog_warnno(-ESTALE);
>  		return ERR_PTR(-ESTALE);
>  	}

	This is what I mean.  We don't want to log every stale inode.
Stale inodes are a normal part of NFS operation.

> Index: fs/ocfs2/inode.c
> ===================================================================
> --- fs/ocfs2/inode.c	(revision 38)
> +++ fs/ocfs2/inode.c	(working copy)
> @@ -111,6 +111,17 @@ void ocfs2_get_inode_flags(struct ocfs2_
>  		oi->ip_attr |= OCFS2_DIRSYNC_FL;
>  }
>  
> +struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
> +{
> +	struct ocfs2_find_inode_args args;
> +
> +	args.fi_blkno = blkno;
> +	args.fi_flags = 0;
> +	args.fi_ino = ino_from_blkno(sb, blkno);
> +	args.fi_sysfile_type = 0;
> +
> +	return ilookup5(sb, blkno, ocfs2_find_actor, &args);
> +}

	This is good.

>  struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
>  			 int sysfile_type)
>  {
> @@ -571,6 +582,9 @@ out:
>  	return status;
>  }
>  
> +/* callers must have cluster lock inode_alloc_inode taken before calling
> + * ocfs2_remove_inode
> + * */
>  static int ocfs2_remove_inode(struct inode *inode,
>  			      struct buffer_head *di_bh,
>  			      struct inode *orphan_dir_inode,
> @@ -592,11 +606,12 @@ static int ocfs2_remove_inode(struct ino
>  		goto bail;
>  	}
>  
> -	mutex_lock(&inode_alloc_inode->i_mutex);
> -	status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1);
> +	status = ocfs2_read_block(OCFS2_SB(inode_alloc_inode->i_sb),
> +				  OCFS2_I(inode_alloc_inode)->ip_blkno,
> +				  &inode_alloc_bh,
> +				  OCFS2_BH_CACHED,
> +				  inode_alloc_inode);

	If you require callers to lock the alloc inode, have them pass
in the alloc inode and bh to the function.  Then you don't have to call
get_system_file_inode or read_block.

> Index: fs/ocfs2/dlmglue.c
> ===================================================================
> --- fs/ocfs2/dlmglue.c	(revision 38)
> +++ fs/ocfs2/dlmglue.c	(working copy)
> @@ -1940,19 +1940,25 @@ static int ocfs2_inode_lock_update(struc
>  			status = -EIO;
>  			goto bail_refresh;
>  		}
> -		mlog_bug_on_msg(inode->i_generation !=
> -				le32_to_cpu(fe->i_generation),
> -				"Invalid dinode %llu disk generation: %u "
> -				"inode->i_generation: %u\n",
> -				(unsigned long long)oi->ip_blkno,
> -				le32_to_cpu(fe->i_generation),
> -				inode->i_generation);
> -		mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
> -				!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
> -				"Stale dinode %llu dtime: %llu flags: 0x%x\n",
> -				(unsigned long long)oi->ip_blkno,
> -				(unsigned long long)le64_to_cpu(fe->i_dtime),
> -				le32_to_cpu(fe->i_flags));
> +		if (inode->i_generation != le32_to_cpu(fe->i_generation)) {
> +			mlog(ML_NOTICE, "Invalid dinode %llu "
> +			     "disk generation: %u inode->i_generation: %u\n",
> +			     (unsigned long long)oi->ip_blkno,
> +			     le32_to_cpu(fe->i_generation),
> +			     inode->i_generation);
> +			status = -ESTALE;
> +			goto bail_refresh;
> +		}
> +		if (le64_to_cpu(fe->i_dtime) ||
> +		    !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
> +			mlog(ML_NOTICE,
> +			     "Stale dinode %llu dtime: %llu flags: 0x%x\n",
> +			     (unsigned long long)oi->ip_blkno,
> +			     (unsigned long long)le64_to_cpu(fe->i_dtime),
> +			     le32_to_cpu(fe->i_flags));
> +			status = -ESTALE;
> +			goto bail_refresh;

	Once again, we don't want to ML_NOTICE these -ESTALE conditions.
-ESTALE is a normal thing.  Make that mlog(0, ...).

Joel

-- 

 There are morethings in heaven and earth, Horatio,
 Than are dreamt of in your philosophy.

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127