[Ocfs2-devel] [PATCH RESEND] jbd2: fix ocfs2 corrupt when updating journal superblock fails

Mon Jun 15 06:08:44 PDT 2015

On Mon 15-06-15 14:27:09, Joseph Qi wrote:
> If updating journal superblock fails after journal data has been flushed,
> the error is omitted and this will mislead the caller as a normal case.
> In ocfs2, the checkpoint will be treated successfully and the other node
> can get the lock to update. Since the sb_start is still pointing to the
> old log block, it will rewrite the journal data during journal recovery
> by the other node. Thus the new updates will be overwritten and ocfs2
> corrupts.
> So in above case we have to return the error, and ocfs2_commit_cache will
> take care of the error and prevent the other node to do update first.
> And only after recovering journal it can do the new updates.
> 
> The issue discussion mail can be found at:
> https://oss.oracle.com/pipermail/ocfs2-devel/2015-June/010856.html
> http://comments.gmane.org/gmane.comp.file-systems.ext4/48841
> 
> Reported-by: Yiwen Jiang <jiangyiwen at huawei.com>
> Signed-off-by: Joseph Qi <joseph.qi at huawei.com>
> Tested-by: Yiwen Jiang <jiangyiwen at huawei.com>
> Cc: Junxiao Bi <junxiao.bi at oracle.com>
> Cc: <stable at vger.kernel.org>
  The patch looks good but it seems to be against relatively old kernel
version. Can you please rebase your patch against current kernel? Thanks!

								Honza

> ---
>  fs/jbd2/checkpoint.c |  5 ++---
>  fs/jbd2/journal.c    | 37 ++++++++++++++++++++++++++++++-------
>  include/linux/jbd2.h |  4 ++--
>  3 files changed, 34 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
> index 988b32e..82e5b7d 100644
> --- a/fs/jbd2/checkpoint.c
> +++ b/fs/jbd2/checkpoint.c
> @@ -390,7 +390,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
>  	unsigned long	blocknr;
> 
>  	if (is_journal_aborted(journal))
> -		return 1;
> +		return -EIO;
> 
>  	if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr))
>  		return 1;
> @@ -407,8 +407,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
>  	if (journal->j_flags & JBD2_BARRIER)
>  		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
> 
> -	__jbd2_update_log_tail(journal, first_tid, blocknr);
> -	return 0;
> +	return __jbd2_update_log_tail(journal, first_tid, blocknr);
>  }
> 
> 
> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> index b96bd80..6b33a42 100644
> --- a/fs/jbd2/journal.c
> +++ b/fs/jbd2/journal.c
> @@ -885,9 +885,10 @@ int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
>   *
>   * Requires j_checkpoint_mutex
>   */
> -void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
> +int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
>  {
>  	unsigned long freed;
> +	int ret;
> 
>  	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
> 
> @@ -897,7 +898,10 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
>  	 * space and if we lose sb update during power failure we'd replay
>  	 * old transaction with possibly newly overwritten data.
>  	 */
> -	jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
> +	ret = jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
> +	if (ret)
> +		goto out;
> +
>  	write_lock(&journal->j_state_lock);
>  	freed = block - journal->j_tail;
>  	if (block < journal->j_tail)
> @@ -913,6 +917,9 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
>  	journal->j_tail_sequence = tid;
>  	journal->j_tail = block;
>  	write_unlock(&journal->j_state_lock);
> +
> +out:
> +	return ret;
>  }
> 
>  /*
> @@ -1331,7 +1338,7 @@ static int journal_reset(journal_t *journal)
>  	return jbd2_journal_start_thread(journal);
>  }
> 
> -static void jbd2_write_superblock(journal_t *journal, int write_op)
> +static int jbd2_write_superblock(journal_t *journal, int write_op)
>  {
>  	struct buffer_head *bh = journal->j_sb_buffer;
>  	journal_superblock_t *sb = journal->j_superblock;
> @@ -1370,7 +1377,10 @@ static void jbd2_write_superblock(journal_t *journal, int write_op)
>  		printk(KERN_ERR "JBD2: Error %d detected when updating "
>  		       "journal superblock for %s.\n", ret,
>  		       journal->j_devname);
> +		jbd2_journal_abort(journal, ret);
>  	}
> +
> +	return ret;
>  }
> 
>  /**
> @@ -1383,10 +1393,11 @@ static void jbd2_write_superblock(journal_t *journal, int write_op)
>   * Update a journal's superblock information about log tail and write it to
>   * disk, waiting for the IO to complete.
>   */
> -void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
> +int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
>  				     unsigned long tail_block, int write_op)
>  {
>  	journal_superblock_t *sb = journal->j_superblock;
> +	int ret;
> 
>  	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
>  	jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
> @@ -1395,13 +1406,18 @@ void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
>  	sb->s_sequence = cpu_to_be32(tail_tid);
>  	sb->s_start    = cpu_to_be32(tail_block);
> 
> -	jbd2_write_superblock(journal, write_op);
> +	ret = jbd2_write_superblock(journal, write_op);
> +	if (ret)
> +		goto out;
> 
>  	/* Log is no longer empty */
>  	write_lock(&journal->j_state_lock);
>  	WARN_ON(!sb->s_sequence);
>  	journal->j_flags &= ~JBD2_FLUSHED;
>  	write_unlock(&journal->j_state_lock);
> +
> +out:
> +	return ret;
>  }
> 
>  /**
> @@ -1950,7 +1966,13 @@ int jbd2_journal_flush(journal_t *journal)
>  		return -EIO;
> 
>  	mutex_lock(&journal->j_checkpoint_mutex);
> -	jbd2_cleanup_journal_tail(journal);
> +	if (!err) {
> +		err = jbd2_cleanup_journal_tail(journal);
> +		if (err < 0) {
> +			mutex_unlock(&journal->j_checkpoint_mutex);
> +			goto out;
> +		}
> +	}
> 
>  	/* Finally, mark the journal as really needing no recovery.
>  	 * This sets s_start==0 in the underlying superblock, which is
> @@ -1966,7 +1988,8 @@ int jbd2_journal_flush(journal_t *journal)
>  	J_ASSERT(journal->j_head == journal->j_tail);
>  	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
>  	write_unlock(&journal->j_state_lock);
> -	return 0;
> +out:
> +	return err;
>  }
> 
>  /**
> diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> index 20e7f78..edb640a 100644
> --- a/include/linux/jbd2.h
> +++ b/include/linux/jbd2.h
> @@ -1035,7 +1035,7 @@ struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal);
>  int jbd2_journal_next_log_block(journal_t *, unsigned long long *);
>  int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
>  			      unsigned long *block);
> -void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
> +int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
>  void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
> 
>  /* Commit management */
> @@ -1157,7 +1157,7 @@ extern int	   jbd2_journal_recover    (journal_t *journal);
>  extern int	   jbd2_journal_wipe       (journal_t *, int);
>  extern int	   jbd2_journal_skip_recovery	(journal_t *);
>  extern void	   jbd2_journal_update_sb_errno(journal_t *);
> -extern void	   jbd2_journal_update_sb_log_tail	(journal_t *, tid_t,
> +extern int	   jbd2_journal_update_sb_log_tail	(journal_t *, tid_t,
>  				unsigned long, int);
>  extern void	   __jbd2_journal_abort_hard	(journal_t *);
>  extern void	   jbd2_journal_abort      (journal_t *, int);
> -- 
> 1.8.4.3
> 
> 
-- 
Jan Kara <jack at suse.cz>
SUSE Labs, CR