[Ocfs2-devel] [PATCH] ocfs2: prints peer node number when sending tcp msg failed

Wed Mar 10 10:13:34 PST 2010

Yes, knowing the node number will be very useful.

I'm wondering why we don't just put the mlog in o2net_send_message_vec() itself.
That would need fewer changes. Do you see any downside?

Wengang Wang wrote:
> This patch adds prints of the number of peer node to which sending tcp message
> failed. It helps debugging.
>
> Signed-off-by: Wengang Wang <wen.gang.wang at oracle.com>
> ---
>  fs/ocfs2/cluster/masklog.h |    9 +++++++++
>  fs/ocfs2/cluster/tcp.c     |    9 +++++++--
>  fs/ocfs2/dlm/dlmast.c      |    2 +-
>  fs/ocfs2/dlm/dlmconvert.c  |    2 +-
>  fs/ocfs2/dlm/dlmunlock.c   |    2 +-
>  5 files changed, 19 insertions(+), 5 deletions(-)
>
> diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
> index 3dfddbe..2af7e93 100644
> --- a/fs/ocfs2/cluster/masklog.h
> +++ b/fs/ocfs2/cluster/masklog.h
> @@ -219,6 +219,15 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
>  		mlog(ML_ERROR, "status = %lld\n", (long long)_st);	\
>  } while (0)
>  
> +/* "node" is number of the node to which sending tcp msg failed */
> +#define mlog_network_errno(st, node) do {				\
> +	int _st = (st);							\
> +	if (_st != -ERESTARTSYS && _st != -EINTR &&			\
> +	    _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC)		\
> +		mlog(ML_ERROR, "failed to send msg to %u. "		\
> +		     "errno: %lld\n", (u32)(node), (long long)_st);	\
> +} while (0)
> +
>  #if defined(CONFIG_OCFS2_DEBUG_MASKLOG)
>  #define mlog_entry(fmt, args...) do {					\
>  	mlog(ML_ENTRY, "ENTRY:" fmt , ##args);				\
> diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
> index d8d0c65..bdc4e9a 100644
> --- a/fs/ocfs2/cluster/tcp.c
> +++ b/fs/ocfs2/cluster/tcp.c
> @@ -1092,12 +1092,17 @@ EXPORT_SYMBOL_GPL(o2net_send_message_vec);
>  int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
>  		       u8 target_node, int *status)
>  {
> +	int ret;
>  	struct kvec vec = {
>  		.iov_base = data,
>  		.iov_len = len,
>  	};
> -	return o2net_send_message_vec(msg_type, key, &vec, 1,
> -				      target_node, status);
> +
> +	ret = o2net_send_message_vec(msg_type, key, &vec, 1,
> +				     target_node, status);
> +	if (ret < 0)
> +		mlog_network_errno(ret, target_node);
> +	return ret;
>  }
>  EXPORT_SYMBOL_GPL(o2net_send_message);
>  
> diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
> index dccc439..bc50076 100644
> --- a/fs/ocfs2/dlm/dlmast.c
> +++ b/fs/ocfs2/dlm/dlmast.c
> @@ -453,7 +453,7 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
>  	ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
>  				     lock->ml.node, &status);
>  	if (ret < 0)
> -		mlog_errno(ret);
> +		mlog_network_errno(ret, lock->ml.node);
>  	else {
>  		if (status == DLM_RECOVERING) {
>  			mlog(ML_ERROR, "sent AST to node %u, it thinks this "
> diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
> index f283bce..ba30234 100644
> --- a/fs/ocfs2/dlm/dlmconvert.c
> +++ b/fs/ocfs2/dlm/dlmconvert.c
> @@ -391,7 +391,7 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
>  		} else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
>  			dlm_error(ret);
>  	} else {
> -		mlog_errno(tmpret);
> +		mlog_network_errno(tmpret, res->owner);
>  		if (dlm_is_host_down(tmpret)) {
>  			/* instead of logging the same network error over
>  			 * and over, sleep here and wait for the heartbeat
> diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
> index 49e29ec..acdc7fc 100644
> --- a/fs/ocfs2/dlm/dlmunlock.c
> +++ b/fs/ocfs2/dlm/dlmunlock.c
> @@ -355,7 +355,7 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
>  			mlog(0, "master was in-progress.  retry\n");
>  		ret = status;
>  	} else {
> -		mlog_errno(tmpret);
> +		mlog_network_errno(tmpret, owner);
>  		if (dlm_is_host_down(tmpret)) {
>  			/* NOTE: this seems strange, but it is what we want.
>  			 * when the master goes down during a cancel or
>