[Ocfs2-devel] [PATCH 1/1] OCFS2: anti stale inode for nfs (V1).
wengang wang
wen.gang.wang at oracle.com
Thu Dec 4 18:35:03 PST 2008
Joel Becker wrote:
> [snip]
>> +#define mlog_warnno(st) do { \
>> + int _st = (st); \
>> + mlog(ML_NOTICE, "status = %lld\n", (long long)_st); \
>> +} while (0)
>> +
>>
>
> I don't think we want mlog_warnno(). Really, it should just be
> an mlog(0, ...) if anything. We don't want to print for every stale
> inode.
>
>
yes, I'm also considering about that. STALE is normal to nfs.
will change it.
>> @@ -48,23 +50,125 @@ struct ocfs2_inode_handle
>> static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp)
>> {
>> struct ocfs2_inode_handle *handle = vobjp;
>> - struct inode *inode;
>> + struct ocfs2_super *osb = OCFS2_SB(sb);
>> struct dentry *result;
>> -
>> + struct inode *inode, *inode_alloc_inode;
>> + struct buffer_head *inode_bh = NULL, *alloc_bh = NULL;
>> + struct buffer_head *group_bh = NULL;
>> + struct ocfs2_dinode *inode_fe, *alloc_fe;
>> + struct ocfs2_group_desc *group;
>> + u64 blkno = handle->ih_blkno, bg_blkno;
>> + unsigned short suballoc_bit, suballoc_slot;
>> + int status;
>> +
>> mlog_entry("(0x%p, 0x%p)\n", sb, handle);
>>
>> - if (handle->ih_blkno == 0) {
>> - mlog_errno(-ESTALE);
>> + if (blkno == 0) {
>> + mlog_warnno(-ESTALE);
>> return ERR_PTR(-ESTALE);
>> }
>>
>> + inode = ocfs2_ilookup(sb, ino_from_blkno(sb, blkno));
>> + /* found in-memory inode, goes to check generation */
>> + if (inode)
>> + goto check_gen;
>> +
>> + /* dirty reads(hit disk) the inode to get suballoc_slot and
>> + * suballoc_bit
>> + * */
>> + status = ocfs2_read_block(osb, blkno, &inode_bh, 0, NULL);
>> + if (status < 0) {
>> + mlog_errno(status);
>> + return ERR_PTR(status);
>> + }
>> +
>> + inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
>> + if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
>> + mlog(0, "Invalid dinode #%llu: signature = %.*s\n",
>> + (unsigned long long)blkno, 7, inode_fe->i_signature);
>> + status = -EINVAL;
>> + goto rls_bh;
>> + }
>> +
>> + suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
>> + suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
>> +
>> + /* checks suballoc_bit in bitmap is still SET */
>> + inode_alloc_inode =
>> + ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
>> + suballoc_slot);
>> + if (!inode_alloc_inode) {
>> + status = -EEXIST;
>> + goto rls_bh;
>> + }
>> +
>> + mutex_lock(&inode_alloc_inode->i_mutex);
>> + status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
>> + if (status < 0)
>> + goto unlock_mutex;
>> +
>> + alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
>> + BUG_ON((suballoc_bit + 1) >
>> + ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
>>
>
> I'd rather leave ocfs2_bits_per_group() static to suballoc.c.
>
>
I will try to do that better...
>> +
>> + bg_blkno = ocfs2_which_suballoc_group(blkno, suballoc_bit);
>> + status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
>> + inode_alloc_inode);
>> + if (status < 0)
>> + goto inode_unlock;
>> +
>> + group = (struct ocfs2_group_desc *) group_bh->b_data;
>> + status = ocfs2_check_group_descriptor(sb, alloc_fe, group);
>> + if (status < 0)
>> + goto rls_group_bh;
>> +
>> + /* if the bit is clear, it's a stale inode */
>> + if (!ocfs2_test_bit(suballoc_bit, (unsigned long *)group->bg_bitmap)) {
>> + status = -ESTALE;
>> + goto rls_group_bh;
>> + }
>> +
>>
>
> In fact, why not make this entire logic a new function in
> suballoc.c called ocfs2_test_suballoc_bit(). You pass it a locked
> suballocator inode, the blkno, and the suballoc_bit. It returns the
> status of the bit.
>
>
hm, yes, good idea.
>> inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0);
>>
>> - if (IS_ERR(inode))
>> + /* if suballoc slot changed since last ocfs2_read_block(), we may have
>> + * the lock on a wrong suballoc inode. but if so, since the inode was
>> + * changed, the inode we want must be stale then.
>> + * while, we don't check that here(since ocfs2_iget() doesn't bring
>> + * ocfs2_dinode to us, if we do check, we needs other code). instead,
>> + * we check on generation later. that is because if the suballoc slot
>> + * changed, the generation must changed(since they are in the same disk
>> + * sector).
>> + * */
>>
>
> inode's don't change suballoc inodes. In fact, your call to
> ocfs2_check_group_descriptor() will crash the filesystem if it
> determines that the group descriptor and the suballoc inode don't match.
> But that can't happen in a correctly functioning filesystem.
>
>
Ok, I see. I will remove the comments.
>> +check_gen:
>> if (handle->ih_generation != inode->i_generation) {
>> iput(inode);
>> + mlog_warnno(-ESTALE);
>> return ERR_PTR(-ESTALE);
>> }
>>
>
> This is what I mean. We don't want to log every stale inode.
> Stale inodes are a normal part of NFS operation.
>
yes, will change that.
>
>
>> Index: fs/ocfs2/inode.c
>> ===================================================================
>> --- fs/ocfs2/inode.c (revision 38)
>> +++ fs/ocfs2/inode.c (working copy)
>> @@ -111,6 +111,17 @@ void ocfs2_get_inode_flags(struct ocfs2_
>> oi->ip_attr |= OCFS2_DIRSYNC_FL;
>> }
>>
>> +struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
>> +{
>> + struct ocfs2_find_inode_args args;
>> +
>> + args.fi_blkno = blkno;
>> + args.fi_flags = 0;
>> + args.fi_ino = ino_from_blkno(sb, blkno);
>> + args.fi_sysfile_type = 0;
>> +
>> + return ilookup5(sb, blkno, ocfs2_find_actor, &args);
>> +}
>>
>
> This is good.
>
>
>> struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
>> int sysfile_type)
>> {
>> @@ -571,6 +582,9 @@ out:
>> return status;
>> }
>>
>> +/* callers must have cluster lock inode_alloc_inode taken before calling
>> + * ocfs2_remove_inode
>> + * */
>> static int ocfs2_remove_inode(struct inode *inode,
>> struct buffer_head *di_bh,
>> struct inode *orphan_dir_inode,
>> @@ -592,11 +606,12 @@ static int ocfs2_remove_inode(struct ino
>> goto bail;
>> }
>>
>> - mutex_lock(&inode_alloc_inode->i_mutex);
>> - status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1);
>> + status = ocfs2_read_block(OCFS2_SB(inode_alloc_inode->i_sb),
>> + OCFS2_I(inode_alloc_inode)->ip_blkno,
>> + &inode_alloc_bh,
>> + OCFS2_BH_CACHED,
>> + inode_alloc_inode);
>>
>
> If you require callers to lock the alloc inode, have them pass
> in the alloc inode and bh to the function. Then you don't have to call
> get_system_file_inode or read_block.
>
>
Yes, I had thought so.
while, the grandparent knows the alloc inode and direct parent doesn't use
it at. so gave up.
I will consider it further..
>> Index: fs/ocfs2/dlmglue.c
>> ===================================================================
>> --- fs/ocfs2/dlmglue.c (revision 38)
>> +++ fs/ocfs2/dlmglue.c (working copy)
>> @@ -1940,19 +1940,25 @@ static int ocfs2_inode_lock_update(struc
>> status = -EIO;
>> goto bail_refresh;
>> }
>> - mlog_bug_on_msg(inode->i_generation !=
>> - le32_to_cpu(fe->i_generation),
>> - "Invalid dinode %llu disk generation: %u "
>> - "inode->i_generation: %u\n",
>> - (unsigned long long)oi->ip_blkno,
>> - le32_to_cpu(fe->i_generation),
>> - inode->i_generation);
>> - mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
>> - !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
>> - "Stale dinode %llu dtime: %llu flags: 0x%x\n",
>> - (unsigned long long)oi->ip_blkno,
>> - (unsigned long long)le64_to_cpu(fe->i_dtime),
>> - le32_to_cpu(fe->i_flags));
>> + if (inode->i_generation != le32_to_cpu(fe->i_generation)) {
>> + mlog(ML_NOTICE, "Invalid dinode %llu "
>> + "disk generation: %u inode->i_generation: %u\n",
>> + (unsigned long long)oi->ip_blkno,
>> + le32_to_cpu(fe->i_generation),
>> + inode->i_generation);
>> + status = -ESTALE;
>> + goto bail_refresh;
>> + }
>> + if (le64_to_cpu(fe->i_dtime) ||
>> + !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
>> + mlog(ML_NOTICE,
>> + "Stale dinode %llu dtime: %llu flags: 0x%x\n",
>> + (unsigned long long)oi->ip_blkno,
>> + (unsigned long long)le64_to_cpu(fe->i_dtime),
>> + le32_to_cpu(fe->i_flags));
>> + status = -ESTALE;
>> + goto bail_refresh;
>>
>
> Once again, we don't want to ML_NOTICE these -ESTALE conditions.
> -ESTALE is a normal thing. Make that mlog(0, ...).
>
>
ok.
thanks,
wengang.
More information about the Ocfs2-devel
mailing list