[Ocfs2-commits] manish commits r1692 - in branches/dlm-glue: . cluster src

svn-commits at oss.oracle.com
Fri Dec 3 20:54:02 CST 2004


Author: manish
Date: 2004-12-03 20:54:01 -0600 (Fri, 03 Dec 2004)
New Revision: 1692

Added:
   branches/dlm-glue/cluster/
   branches/dlm-glue/cluster/Makefile
   branches/dlm-glue/cluster/compat_libfs.c
   branches/dlm-glue/cluster/compat_libfs.h
   branches/dlm-glue/cluster/dlm_compat.h
   branches/dlm-glue/cluster/dlmcommon.h
   branches/dlm-glue/cluster/dlmmaster.c
   branches/dlm-glue/cluster/dlmmod.c
   branches/dlm-glue/cluster/dlmmod.h
   branches/dlm-glue/cluster/dlmrecovery.c
   branches/dlm-glue/cluster/dlmthread.c
   branches/dlm-glue/cluster/heartbeat.c
   branches/dlm-glue/cluster/heartbeat.h
   branches/dlm-glue/cluster/nodemanager.c
   branches/dlm-glue/cluster/nodemanager.h
   branches/dlm-glue/cluster/tcp.c
   branches/dlm-glue/cluster/tcp.h
   branches/dlm-glue/cluster/test.c
   branches/dlm-glue/cluster/util.c
   branches/dlm-glue/cluster/util.h
   branches/dlm-glue/cluster/warning_hack.h
Modified:
   branches/dlm-glue/Config.make.in
   branches/dlm-glue/Makefile
   branches/dlm-glue/configure.in
   branches/dlm-glue/src/Makefile
   branches/dlm-glue/src/dlmglue.c
   branches/dlm-glue/src/heartbeat.c
   branches/dlm-glue/src/ocfs.h
   branches/dlm-glue/src/super.c
   branches/dlm-glue/src/vote.c
Log:
Landed cluster support


Modified: branches/dlm-glue/Config.make.in
===================================================================
--- branches/dlm-glue/Config.make.in	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/Config.make.in	2004-12-04 02:54:01 UTC (rev 1692)
@@ -54,8 +54,6 @@
 GCCINC = @GCCINC@
 endif
 
-CLUSTERINC = @CLUSTERINC@
-
 HAVE_NPTL = @HAVE_NPTL@
 
 COMPAT_SAFE_WRITE = @COMPAT_SAFE_WRITE@

Modified: branches/dlm-glue/Makefile
===================================================================
--- branches/dlm-glue/Makefile	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/Makefile	2004-12-04 02:54:01 UTC (rev 1692)
@@ -2,7 +2,7 @@
 
 include $(TOPDIR)/Preamble.make
 
-SUBDIRS = src docs patches vendor
+SUBDIRS = cluster src docs patches vendor
 
 DIST_FILES = \
 	COPYING		\

Added: branches/dlm-glue/cluster/Makefile
===================================================================
--- branches/dlm-glue/cluster/Makefile	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/Makefile	2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,226 @@
+# See if we are being included by the 2.6 kernel build system.
+ifeq ($(KERNELRELEASE),)
+# Normal build that is being called locally
+TOPDIR = ..
+
+include $(TOPDIR)/Preamble.make
+
+else # ifeq ($(KERNELRELEASE),)
+# We are being included by the 2.6.x kernel build system
+
+# Global parameter so we know where our stuff is
+CLUSTER_SRC_DIR	:= $(M)
+
+include $(CLUSTER_SRC_DIR)/../Config.make
+endif
+
+#-*******************************************************
+# Now do stuff which is global for 2.4.x and 2.6.x builds
+
+#ifdef OCFS_DEBUG
+OPTS += -g
+#endif
+
+#ifdef OCFS_DEBUG
+GLOBAL_DEFINES += -DDEBUG
+#endif
+
+ifdef OCFS_TRACE
+GLOBAL_DEFINES += -DTRACE
+endif
+
+ifdef HAVE_NPTL
+GLOBAL_DEFINES += -DHAVE_NPTL
+endif
+
+CFILES = \
+	compat_libfs.c	\
+	dlmmaster.c	\
+	dlmmod.c	\
+	dlmrecovery.c	\
+	dlmthread.c	\
+	heartbeat.c	\
+	nodemanager.c	\
+	tcp.c		\
+	util.c		\
+	test.c			
+
+HFILES = \
+	compat_libfs.h	\
+	dlm_compat.h	\
+	dlmcommon.h	\
+	dlmmod.h	\
+	heartbeat.h	\
+	nodemanager.h	\
+	tcp.h		\
+	util.h		\
+	warning_hack.h
+
+CLEAN_RULES = clean-cluster
+
+OBJS = $(subst .c,.o,$(CFILES))
+
+# End of stuff which is global for 2.4.x and 2.6.x kernels
+#-********************************************************
+
+# See if we are being included by the 2.6 kernel build system.
+ifeq ($(KERNELRELEASE),)
+# Normal build that is being called locally
+# Preliminary 2.6.x kernel support.  See if we are building for the 2.6.x
+# kernel
+ifndef KERNEL_26
+# Building for a 2.4.x kernel
+
+WARNINGS = -Wall -Wstrict-prototypes
+
+ifneq ($(OCFS_PROCESSOR),x86_64)
+WARNINGS += -Wmissing-prototypes -Wmissing-declarations
+endif
+
+ifeq ($(KVER),vmware)
+  KERNELINC = /usr/src/linux-2.4/include
+endif
+
+ifeq ($(KVER),suse)
+  GLOBAL_DEFINES += -DSUSE
+endif
+ifeq ($(KVER),hugemem)
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_HUGEMEM=1
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_ENTERPRISE=0
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_SMP=0
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_UP=0
+endif
+ifeq ($(KVER),smp)
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_HUGEMEM=0
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_ENTERPRISE=0 
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_SMP=1 
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_UP=0 
+endif
+ifeq ($(KVER),ent)
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_HUGEMEM=0
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_ENTERPRISE=1 
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_SMP=0 
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_UP=0 
+endif
+ifeq ($(KVER),up)
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_HUGEMEM=0
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_ENTERPRISE=0 
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_SMP=0 
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_UP=1 
+endif
+
+ifeq ($(OCFS_PROCESSOR),ppc64)
+  MACH_CFLAGS += -m64 -fsigned-char -fno-builtin -msoft-float -mminimal-toc
+  LDADD += -m elf64ppc
+endif
+ifeq ($(OCFS_PROCESSOR),x86_64)
+  MACH_CFLAGS += -m64 -mcmodel=kernel
+endif
+
+BASE_DEFINES = -DMODULE -DLINUX -D__KERNEL__ 
+DEFINES += $(BASE_DEFINES) $(GLOBAL_DEFINES)
+
+INCLUDES = -I. -I$(KERNELINC) -I$(GCCINC)
+
+CFLAGS = $(OPTS) $(MACH_CFLAGS) -pipe -nostdinc -fno-strict-aliasing \
+	-fno-common -fomit-frame-pointer $(MODVERSIONS) $(WARNINGS)
+LDADD = -nostdlib
+
+OPTIMIZE = -O2
+
+CFLAGS += $(OPTIMIZE)
+
+MODULES = ocfs2_dlm.o ocfs2_heartbeat.o ocfs2_nodemanager.o ocfs2_tcp.o
+TEST_MODULES = ocfs2_cluster_test.o
+
+INSTALL_MODULES = $(MODULES)
+
+# Make dependencies work
+$(CFILES): $(HFILES)
+$(OBJS): $(HFILES)
+
+build-cluster: $(MODULES)
+
+ocfs2_cluster_test.o: test.o util.o compat_libfs.o
+	$(CC) $(OPTS) -Wl,-r -o $@ $^ $(LDADD)
+
+ocfs2_dlm.o: dlmmod.o dlmthread.o dlmrecovery.o util.o compat_libfs.o dlmmaster.o
+	$(CC) $(OPTS) -Wl,-r -o $@ $^ $(LDADD)
+
+ocfs2_nodemanager.o: nodemanager.o util.o compat_libfs.o
+	$(CC) $(OPTS) -Wl,-r -o $@ $^ $(LDADD)
+
+ocfs2_heartbeat.o: heartbeat.o util.o compat_libfs.o
+	$(CC) $(OPTS) -Wl,-r -o $@ $^ $(LDADD)
+
+ocfs2_tcp.o: tcp.o util.o compat_libfs.o
+	$(CC) $(OPTS) -Wl,-r -o $@ $^ $(LDADD)
+
+clean-cluster:
+	rm -f *.o *.p *.s
+
+else # ifndef KERNEL_26
+# The 2.6.x kernel makefile
+
+# This Makefile has two ways through it.  They are:
+#   1.	We are being included by the local Makefile to do a 2.6 kernel build.
+#	In this method we will call the kernel make system to build our module.
+#	This will cause the kernel make system to call back into our makefile
+#	(2nd way).
+#
+#   2.	We are being included by the kernel make system.  So in this method we
+#	just setup the variables that the make system wants and then the kernel
+#	make system will take care of the build.
+
+INSTALL_MODULE = ocfs2.ko
+
+#ALL_RULES = stamp-md5 build-ocfs
+ALL_RULES = build-cluster
+
+build-ocfs:
+	$(MAKE) -C $(KERNELDIR) M=$(CURDIR) modules
+
+clean-ocfs:
+	$(MAKE) -C $(KERNELDIR) M=$(CURDIR) clean
+
+endif # ifndef KERNEL_26
+
+INSTALL_RULES = install-cluster
+
+install-cluster: $(INSTALL_MODULES)
+	$(TOPDIR)/mkinstalldirs $(DESTDIR)$(MODULEDIR)/ocfs2
+	@for file in $(INSTALL_MODULES); do \
+	  $(INSTALL_DATA) $$file $(DESTDIR)$(MODULEDIR)/ocfs2/$$file; \
+	done
+
+include $(TOPDIR)/Postamble.make
+
+else # ifeq ($(KERNELRELEASE),)
+# We are being included by the 2.6 kernel build system.  So we will include the
+# 2.6.x Makefile and skip everything else.
+# The 2.6.x kernel makefile
+
+# This Makefile has two ways through it.  They are:
+#   1.	We are being included by the local Makefile to do a 2.6 kernel build.
+#	In this method we will call the kernel make system to build our module.
+#	This will cause the kernel make system to call back into our makefile
+#	(2nd way).
+#
+#   2.	We are being included by the kernel make system.  So in this method we
+#	just setup the variables that the make system wants and then the kernel
+#	make system will take care of the build.
+
+# 2nd method.  The kernel make system is including us.  We need to setup the
+# various parameters for the kernel make system and then it will take care of
+# building us.
+
+STAMP_DIR = $(OCFS_SRC_DIR)
+include $(OCFS_SRC_DIR)/../Versioning.make
+
+EXTRA_CFLAGS += $(GLOBAL_DEFINES)
+
+CFLAGS_$(VERSION_OBJ) += $(VERDEFS)
+
+# Kernel Module file to produce
+obj-m += ocfs2.o
+
+# list of object files that are used to create our module
+ocfs2-objs := $(OBJS)
+
+endif # ifeq ($(KERNELRELEASE),)

Added: branches/dlm-glue/cluster/compat_libfs.c
===================================================================
--- branches/dlm-glue/cluster/compat_libfs.c	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/compat_libfs.c	2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,705 @@
+/* -----------------------------------------------------------------*/
+
+
+/*
+ *	compat_libfs.c
+ *	Library for filesystems writers.
+ *	PLUS... transaction file stuff stolen from nfsd
+ */
+
+#include "warning_hack.h"
+
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <asm/uaccess.h>
+#include <linux/slab.h>
+
+#include "compat_libfs.h"
+
+#define kstatfs statfs
+#define __user
+
+
+int simple_statfs(struct super_block *sb, struct statfs *buf);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd);
+#else
+struct dentry *simple_lookup(struct inode *dir,struct dentry *dentry);
+#endif
+
+int simple_sync_file(struct file *file, struct dentry *dentry, int datasync);
+int dcache_dir_open(struct inode *inode, struct file *file);
+int dcache_dir_close(struct inode *inode, struct file *file);
+loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin);
+ssize_t generic_read_dir(struct file *filp, char *buf, size_t siz, loff_t *ppos);
+int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry);
+int simple_empty(struct dentry *dentry);
+int simple_unlink(struct inode *dir, struct dentry *dentry);
+int simple_rmdir(struct inode *dir, struct dentry *dentry);
+int simple_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry);
+int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files);
+
+
+
+#if 0
+int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		   struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	generic_fillattr(inode, stat);
+	stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9);
+	return 0;
+}
+#endif
+
+int simple_statfs(struct super_block *sb, struct kstatfs *buf)
+{
+	buf->f_type = sb->s_magic;
+	buf->f_bsize = PAGE_CACHE_SIZE;
+	buf->f_namelen = NAME_MAX;
+	return 0;
+}
+
+/*
+ * Lookup the data. This is trivial - if the dentry didn't already
+ * exist, we know it is negative.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+	d_add(dentry, NULL);
+	return NULL;
+}
+#else
+struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry)
+{
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+	d_add(dentry, NULL);
+	return NULL;
+}
+#endif
+
+
+struct dentry * simple_find_child(struct dentry *dentry, struct qstr *name)
+{
+	struct list_head *iter;
+	struct dentry *child = NULL;
+
+	spin_lock(&dcache_lock);
+	list_for_each(iter, &dentry->d_subdirs) {
+		child = list_entry(iter, struct dentry, d_child);
+		if (child->d_name.len == name->len &&
+		    memcmp(child->d_name.name, name->name, name->len)==0)
+			break;
+		child = NULL;
+	}
+	if (child)
+		dget_locked(child);
+	spin_unlock(&dcache_lock);
+	return child;
+}
+
+
+
+int simple_sync_file(struct file * file, struct dentry *dentry, int datasync)
+{
+	return 0;
+}
+ 
+int dcache_dir_open(struct inode *inode, struct file *file)
+{
+	static struct qstr cursor_name = {.len = 1, .name = "."};
+
+	file->private_data = d_alloc(file->f_dentry, &cursor_name);
+
+	return file->private_data ? 0 : -ENOMEM;
+}
+
+int dcache_dir_close(struct inode *inode, struct file *file)
+{
+	dput(file->private_data);
+	return 0;
+}
+
+loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
+{
+	down(&file->f_dentry->d_inode->i_sem);
+	switch (origin) {
+		case 1:
+			offset += file->f_pos;
+		case 0:
+			if (offset >= 0)
+				break;
+		default:
+			up(&file->f_dentry->d_inode->i_sem);
+			return -EINVAL;
+	}
+	if (offset != file->f_pos) {
+		file->f_pos = offset;
+		if (file->f_pos >= 2) {
+			struct list_head *p;
+			struct dentry *cursor = file->private_data;
+			loff_t n = file->f_pos - 2;
+
+			spin_lock(&dcache_lock);
+			list_del(&cursor->d_child);
+			p = file->f_dentry->d_subdirs.next;
+			while (n && p != &file->f_dentry->d_subdirs) {
+				struct dentry *next;
+				next = list_entry(p, struct dentry, d_child);
+				if (!d_unhashed(next) && next->d_inode)
+					n--;
+				p = p->next;
+			}
+			list_add_tail(&cursor->d_child, p);
+			spin_unlock(&dcache_lock);
+		}
+	}
+	up(&file->f_dentry->d_inode->i_sem);
+	return offset;
+}
+
+/* Relationship between i_mode and the DT_xxx types */
+static inline unsigned char dt_type(struct inode *inode)
+{
+	return (inode->i_mode >> 12) & 15;
+}
+
+/*
+ * Directory is locked and all positive dentries in it are safe, since
+ * for ramfs-type trees they can't go away without unlink() or rmdir(),
+ * both impossible due to the lock on directory.
+ */
+
+int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_dentry;
+	struct dentry *cursor = filp->private_data;
+	struct list_head *p, *q = &cursor->d_child;
+	ino_t ino;
+	int i = filp->f_pos;
+
+	switch (i) {
+		case 0:
+			ino = dentry->d_inode->i_ino;
+			if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
+				break;
+			filp->f_pos++;
+			i++;
+			/* fallthrough */
+		case 1:
+			ino = dentry->d_parent->d_inode->i_ino;
+			if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
+				break;
+			filp->f_pos++;
+			i++;
+			/* fallthrough */
+		default:
+			spin_lock(&dcache_lock);
+			if (filp->f_pos == 2) {
+				list_del(q);
+				list_add(q, &dentry->d_subdirs);
+			}
+			for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
+				struct dentry *next;
+				next = list_entry(p, struct dentry, d_child);
+				if (d_unhashed(next) || !next->d_inode)
+					continue;
+
+				spin_unlock(&dcache_lock);
+				if (filldir(dirent, next->d_name.name, next->d_name.len, filp->f_pos, next->d_inode->i_ino, dt_type(next->d_inode)) < 0)
+					return 0;
+				spin_lock(&dcache_lock);
+				/* next is still alive */
+				list_del(q);
+				list_add(q, p);
+				p = q;
+				filp->f_pos++;
+			}
+			spin_unlock(&dcache_lock);
+	}
+	return 0;
+}
+
+ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos)
+{
+	return -EISDIR;
+}
+
+struct file_operations simple_dir_operations = {
+	.open		= dcache_dir_open,
+	.release	= dcache_dir_close,
+	.llseek		= dcache_dir_lseek,
+	.read		= generic_read_dir,
+	.readdir	= dcache_readdir,
+};
+
+struct inode_operations simple_dir_inode_operations = {
+	.lookup		= simple_lookup,
+};
+
+#if 0
+/*
+ * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
+ * will never be mountable)
+ */
+struct super_block *
+get_sb_pseudo(struct file_system_type *fs_type, char *name,
+	struct super_operations *ops, unsigned long magic)
+{
+	struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
+	static struct super_operations default_ops = {.statfs = simple_statfs};
+	struct dentry *dentry;
+	struct inode *root;
+	struct qstr d_name = {.name = name, .len = strlen(name)};
+
+	if (IS_ERR(s))
+		return s;
+
+	s->s_flags = MS_NOUSER;
+	s->s_maxbytes = ~0ULL;
+	s->s_blocksize = 1024;
+	s->s_blocksize_bits = 10;
+	s->s_magic = magic;
+	s->s_op = ops ? ops : &default_ops;
+	root = new_inode(s);
+	if (!root)
+		goto Enomem;
+	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
+	root->i_uid = root->i_gid = 0;
+	root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
+	dentry = d_alloc(NULL, &d_name);
+	if (!dentry) {
+		iput(root);
+		goto Enomem;
+	}
+	dentry->d_sb = s;
+	dentry->d_parent = dentry;
+	d_instantiate(dentry, root);
+	s->s_root = dentry;
+	s->s_flags |= MS_ACTIVE;
+	return s;
+
+Enomem:
+	up_write(&s->s_umount);
+	deactivate_super(s);
+	return ERR_PTR(-ENOMEM);
+}
+#endif
+
+int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = old_dentry->d_inode;
+
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	inode->i_nlink++;
+	atomic_inc(&inode->i_count);
+	dget(dentry);
+	d_instantiate(dentry, inode);
+	return 0;
+}
+
+static inline int simple_positive(struct dentry *dentry)
+{
+	return dentry->d_inode && !d_unhashed(dentry);
+}
+
+int simple_empty(struct dentry *dentry)
+{
+	struct dentry *child;
+	int ret = 0;
+
+	spin_lock(&dcache_lock);
+	list_for_each_entry(child, &dentry->d_subdirs, d_child)
+		if (simple_positive(child))
+			goto out;
+	ret = 1;
+out:
+	spin_unlock(&dcache_lock);
+	return ret;
+}
+
+int simple_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	inode->i_nlink--;
+	dput(dentry);
+	return 0;
+}
+
+int simple_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	if (!simple_empty(dentry))
+		return -ENOTEMPTY;
+
+	dentry->d_inode->i_nlink--;
+	simple_unlink(dir, dentry);
+	dir->i_nlink--;
+	return 0;
+}
+
+int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
+		struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode *inode = old_dentry->d_inode;
+	int they_are_dirs = S_ISDIR(old_dentry->d_inode->i_mode);
+
+	if (!simple_empty(new_dentry))
+		return -ENOTEMPTY;
+
+	if (new_dentry->d_inode) {
+		simple_unlink(new_dir, new_dentry);
+		if (they_are_dirs)
+			old_dir->i_nlink--;
+	} else if (they_are_dirs) {
+		old_dir->i_nlink--;
+		new_dir->i_nlink++;
+	}
+
+	old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime =
+		new_dir->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	return 0;
+}
+
+#if 0
+int simple_readpage(struct file *file, struct page *page)
+{
+	void *kaddr;
+
+	if (PageUptodate(page))
+		goto out;
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	memset(kaddr, 0, PAGE_CACHE_SIZE);
+	kunmap_atomic(kaddr, KM_USER0);
+	flush_dcache_page(page);
+	SetPageUptodate(page);
+out:
+	unlock_page(page);
+	return 0;
+}
+
+int simple_prepare_write(struct file *file, struct page *page,
+			unsigned from, unsigned to)
+{
+	if (!PageUptodate(page)) {
+		if (to - from != PAGE_CACHE_SIZE) {
+			void *kaddr = kmap_atomic(page, KM_USER0);
+			memset(kaddr, 0, from);
+			memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
+			flush_dcache_page(page);
+			kunmap_atomic(kaddr, KM_USER0);
+		}
+		SetPageUptodate(page);
+	}
+	return 0;
+}
+
+int simple_commit_write(struct file *file, struct page *page,
+			unsigned offset, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold the i_sem.
+	 */
+	if (pos > inode->i_size)
+		i_size_write(inode, pos);
+	set_page_dirty(page);
+	return 0;
+}
+#endif
+
+void d_genocide(struct dentry *root);
+
+void d_genocide(struct dentry *root)
+{
+	struct dentry *this_parent = root;
+	struct list_head *next;
+	spin_lock(&dcache_lock);
+repeat:
+	next = this_parent->d_subdirs.next;
+resume:
+	while (next != &this_parent->d_subdirs) {
+		struct list_head *tmp = next;
+		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
+		next = tmp->next;
+		if (d_unhashed(dentry)||!dentry->d_inode)
+			continue;
+		if (!list_empty(&dentry->d_subdirs)) {
+			this_parent = dentry;
+			goto repeat;
+		}
+		atomic_dec(&dentry->d_count);
+	}
+	if (this_parent != root) {
+		next = this_parent->d_child.next;
+		atomic_dec(&this_parent->d_count);
+		this_parent = this_parent->d_parent;
+		goto resume;
+	}
+	spin_unlock(&dcache_lock);
+}
+
+static void simple_read_inode(struct inode * inode)
+{
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+}
+
+
+int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files)
+{
+	static struct super_operations s_ops = {
+		.statfs = simple_statfs,
+		.read_inode = simple_read_inode
+	};
+	struct inode *inode;
+	struct dentry *root;
+	struct dentry *dentry;
+	int i;
+
+	s->s_blocksize = PAGE_CACHE_SIZE;
+	s->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	s->s_magic = magic;
+	s->s_op = &s_ops;
+
+	inode = new_inode(s);
+	if (!inode)
+		return -ENOMEM;
+	inode->i_mode = S_IFDIR | 0755;
+	inode->i_uid = inode->i_gid = 0;
+	inode->i_blksize = PAGE_CACHE_SIZE;
+	inode->i_blocks = 0;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_op = &simple_dir_inode_operations;
+	inode->i_fop = &simple_dir_operations;
+	root = d_alloc_root(inode);
+	if (!root) {
+		iput(inode);
+		return -ENOMEM;
+	}
+	for (i = 0; !files->name || files->name[0]; i++, files++) {
+		struct qstr name;
+		if (!files->name)
+			continue;
+		name.name = files->name;
+		name.len = strlen(name.name);
+		printk("adding file %*s\n", name.len, name.name);
+		name.hash = full_name_hash(name.name, name.len);
+		dentry = d_alloc(root, &name);
+		if (!dentry)
+			goto out;
+		inode = new_inode(s);
+		if (!inode)
+			goto out;
+		inode->i_mode = S_IFREG | files->mode;
+		inode->i_uid = inode->i_gid = 0;
+		inode->i_blksize = PAGE_CACHE_SIZE;
+		inode->i_blocks = 0;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		inode->i_fop = files->ops;
+		inode->i_ino = i;
+		d_add(dentry, inode);
+	}
+	s->s_root = root;
+	return 0;
+out:
+	d_genocide(root);
+	dput(root);
+	return -ENOMEM;
+}
+
+#if 0
+static spinlock_t pin_fs_lock = SPIN_LOCK_UNLOCKED;
+
+int simple_pin_fs(char *name, struct vfsmount **mount, int *count)
+{
+	struct vfsmount *mnt = NULL;
+	spin_lock(&pin_fs_lock);
+	if (unlikely(!*mount)) {
+		spin_unlock(&pin_fs_lock);
+		mnt = do_kern_mount(name, 0, name, NULL);
+		if (IS_ERR(mnt))
+			return PTR_ERR(mnt);
+		spin_lock(&pin_fs_lock);
+		if (!*mount)
+			*mount = mnt;
+	}
+	mntget(*mount);
+	++*count;
+	spin_unlock(&pin_fs_lock);
+	mntput(mnt);
+	return 0;
+}
+
+void simple_release_fs(struct vfsmount **mount, int *count)
+{
+	struct vfsmount *mnt;
+	spin_lock(&pin_fs_lock);
+	mnt = *mount;
+	if (!--*count)
+		*mount = NULL;
+	spin_unlock(&pin_fs_lock);
+	mntput(mnt);
+}
+
+ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
+				const void *from, size_t available)
+{
+	loff_t pos = *ppos;
+	if (pos < 0)
+		return -EINVAL;
+	if (pos >= available)
+		return 0;
+	if (count > available - pos)
+		count = available - pos;
+	if (copy_to_user(to, from + pos, count))
+		return -EFAULT;
+	*ppos = pos + count;
+	return count;
+}
+
+EXPORT_SYMBOL(dcache_dir_close);
+EXPORT_SYMBOL(dcache_dir_lseek);
+EXPORT_SYMBOL(dcache_dir_open);
+EXPORT_SYMBOL(dcache_readdir);
+EXPORT_SYMBOL(generic_read_dir);
+EXPORT_SYMBOL(simple_commit_write);
+EXPORT_SYMBOL(simple_empty);
+EXPORT_SYMBOL(simple_fill_super);
+EXPORT_SYMBOL(simple_getattr);
+EXPORT_SYMBOL(simple_link);
+EXPORT_SYMBOL(simple_lookup);
+EXPORT_SYMBOL(simple_pin_fs);
+EXPORT_SYMBOL(simple_prepare_write);
+EXPORT_SYMBOL(simple_readpage);
+EXPORT_SYMBOL(simple_release_fs);
+EXPORT_SYMBOL(simple_rename);
+EXPORT_SYMBOL(simple_rmdir);
+EXPORT_SYMBOL(simple_statfs);
+EXPORT_SYMBOL(simple_sync_file);
+EXPORT_SYMBOL(simple_unlink);
+EXPORT_SYMBOL(simple_read_from_buffer);
+EXPORT_SYMBOL(get_sb_pseudo);
+#endif
+
+/* -----------------------------------------------------------------*/
+
+
+
+/* transaction file support */
+
+/*
+ * transaction based IO methods.
+ * The file expects a single write which triggers the transaction, and then
+ * possibly a read which collects the result - which is stored in a 
+ * file-local buffer.
+ */
+static ssize_t TA_write(struct file *file, const char __user *buf, size_t size, loff_t *pos)
+{
+	ino_t ino =  file->f_dentry->d_inode->i_ino;
+	struct argresp *ar;
+	ssize_t rv = 0;
+	struct super_block *sb = file->f_dentry->d_inode->i_sb;
+	TA_write_ops *ops = TA_GENERIC_SB_MEMBER(sb);
+	TA_write_op *write_op;
+
+	printk("welcome to TA_write: num_ops=%d, op[%d]=%p, private=%p, size=%u\n", 
+	       ops->num_ops, (int)ino, ops->write_op[ino], file->private_data, size);
+	if (ino >= ops->num_ops || ops->write_op[ino] == NULL)
+		return -EINVAL;
+	write_op = ops->write_op[ino];
+	if (file->private_data) 
+		return -EINVAL; /* only one write allowed per open */
+	if (size > PAGE_SIZE - sizeof(struct argresp))
+		return -EFBIG;
+
+	ar = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!ar)
+		return -ENOMEM;
+	ar->size = 0;
+	down(&file->f_dentry->d_inode->i_sem);
+	if (file->private_data)
+		rv = -EINVAL;
+	else
+		file->private_data = ar;
+	up(&file->f_dentry->d_inode->i_sem);
+	if (rv) {
+		kfree(ar);
+		return rv;
+	}
+	if (copy_from_user(ar->data, buf, size))
+		return -EFAULT;
+
+	printk("now calling write_op...\n");	
+	rv = write_op(file, ar->data, size);
+	printk("write_op returned %d\n", rv);
+	if (rv>0) {
+		ar->size = rv;
+		rv = size;
+	}
+	return rv;
+}
+
+
+static ssize_t TA_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
+{
+	struct argresp *ar;
+	ssize_t rv = 0;
+	
+	if (file->private_data == NULL)
+		rv = TA_write(file, buf, 0, pos);
+	if (rv < 0)
+		return rv;
+
+	ar = file->private_data;
+	if (!ar)
+		return 0;
+	if (*pos >= ar->size)
+		return 0;
+	if (*pos + size > ar->size)
+		size = ar->size - *pos;
+	if (copy_to_user(buf, ar->data + *pos, size))
+		return -EFAULT;
+	*pos += size;
+	return size;
+}
+
+static int TA_open(struct inode *inode, struct file *file)
+{
+	file->private_data = NULL;
+	return 0;
+}
+
+static int TA_release(struct inode *inode, struct file *file)
+{
+	void *p = file->private_data;
+	file->private_data = NULL;
+	kfree(p);
+	return 0;
+}
+
+
+
+
+
+
+
+
+struct file_operations transaction_ops = {
+	.write		= TA_write,
+	.read		= TA_read,
+	.open		= TA_open,
+	.release	= TA_release,
+};
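
To make the transaction-file convention concrete: TA_write() looks up a
TA_write_op by the file's inode number, hands it the request bytes in
ar->data, and treats a positive return value as the length of the reply
that a later read() on the same fd collects.  A minimal consumer sketch
follows; cluster_query_op, cluster_write_ops, cluster_files and
cluster_fill_super are hypothetical names used only for illustration --
just argresp, TA_write_ops, tree_descr, transaction_ops and
simple_fill_super come from the code above.

	#include <linux/fs.h>
	#include <linux/pagemap.h>
	#include "compat_libfs.h"

	/* a write op: 'buf' is ar->data, holding the request on entry;
	 * overwrite it with the reply and return the reply length
	 * (the reply is assumed to fit in the page-sized buffer) */
	static ssize_t cluster_query_op(struct file *file, char *buf, size_t size)
	{
		return snprintf(buf, PAGE_SIZE - sizeof(struct argresp),
				"echo: %.*s", (int)size, buf);
	}

	/* op table indexed by i_ino; simple_fill_super() assigns
	 * i_ino == array index, so slot 0 (the root) stays NULL */
	static struct {
		TA_write_ops ops;
		TA_write_op *op_array[2];
	} cluster_write_ops = {
		.ops.num_ops	= 2,
		.op_array	= { NULL, cluster_query_op },
	};

	static struct tree_descr cluster_files[] = {
		[1] = { .name = "query", .ops = &transaction_ops,
			.mode = S_IRUSR|S_IWUSR },
		{ .name = "" }	/* empty name ends simple_fill_super()'s loop */
	};

	static int cluster_fill_super(struct super_block *sb)
	{
		int ret = simple_fill_super(sb, 0x636c7374, cluster_files);
		if (ret == 0)	/* TA_write() finds the table through this member */
			TA_GENERIC_SB_MEMBER(sb) = &cluster_write_ops.ops;
		return ret;
	}

From userspace the protocol is then a single write(2) of the request
followed by a read(2) of the reply on the same open file descriptor.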

Added: branches/dlm-glue/cluster/compat_libfs.h
===================================================================
--- branches/dlm-glue/cluster/compat_libfs.h	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/compat_libfs.h	2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,36 @@
+#ifndef CLUSTER_COMPAT_LIBFS_H
+#define CLUSTER_COMPAT_LIBFS_H
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#define TA_GENERIC_SB_MEMBER(sb)  ((sb)->s_fs_info)
+#else
+#define TA_GENERIC_SB_MEMBER(sb)  ((sb)->u.generic_sbp)
+#endif
+
+
+/* an argresp is stored in an allocated page and holds the 
+ * size of the argument or response, along with its content
+ */
+struct argresp {
+	ssize_t size;
+	char data[0];
+};
+
+typedef ssize_t (TA_write_op)(struct file *, char *, size_t);
+typedef struct _TA_write_ops
+{
+	int num_ops;
+	TA_write_op *write_op[0];
+} TA_write_ops;
+
+struct tree_descr 
+{ 
+	char *name; 
+	struct file_operations *ops; 
+	int mode; 
+};
+
+int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files);
+struct dentry * simple_find_child(struct dentry *dentry, struct qstr *name);
+
+#endif  /* CLUSTER_COMPAT_LIBFS_H */

Added: branches/dlm-glue/cluster/dlm_compat.h
===================================================================
--- branches/dlm-glue/cluster/dlm_compat.h	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/dlm_compat.h	2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,119 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlm_compat.h
+ *
+ * Compatibility stuff for 2.4
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version
+ * 2 of the License.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_DLM_COMPAT_H
+#define CLUSTER_DLM_COMPAT_H
+
+#include <linux/version.h>
+#include <linux/types.h>
+#include <linux/kdev_t.h>
+#include <linux/sched.h>
+#include <linux/compiler.h>
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+# include <linux/locks.h>
+#else
+# include <linux/buffer_head.h>
+#endif
+
+
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+
+#ifdef __ia64__
+extern inline void prefetch(const void *x);
+extern inline void prefetchw(const void *x);
+#else
+static inline void prefetch(const void *x);
+static inline void prefetchw(const void *x);
+#endif
+extern inline int generic_fls(int x);
+extern inline int get_bitmask_order(unsigned int count);
+/* XXX Hack to avoid warning */
+struct mem_dqinfo;
+extern inline void mark_info_dirty(struct mem_dqinfo *info);
+
+
+
+
+#define flush_scheduled_work	flush_scheduled_tasks
+#define work_struct		tq_struct
+#define INIT_WORK(w, f, d)	INIT_TQUEUE(w, f, d)
+#define schedule_work(w)	schedule_task(w)
+
+#ifdef HAVE_NPTL
+static inline void dequeue_signal_lock(struct task_struct *task,
+				       sigset_t *blocked, siginfo_t *info)
+{
+	spin_lock_irq(&task->sighand->siglock);
+	dequeue_signal(blocked, info);
+	spin_unlock_irq(&task->sighand->siglock);
+}
+#else
+static inline void dequeue_signal_lock(struct task_struct *task,
+				       sigset_t *blocked, siginfo_t *info)
+{
+	spin_lock_irq(&task->sigmask_lock);
+	dequeue_signal(blocked, info);
+	spin_unlock_irq(&task->sigmask_lock);
+}
+#endif
+#define kstatfs statfs
+
+
+
+/*
+ * Copied right out of the 2.6.2 kernel's buffer_head.h:
+ * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
+ * and buffer_foo() functions.
+ */
+#define BUFFER_FNS(bit, name)						\
+static inline void set_buffer_##name(struct buffer_head *bh)		\
+{									\
+	set_bit(BH_##bit, &(bh)->b_state);				\
+}									\
+static inline void clear_buffer_##name(struct buffer_head *bh)		\
+{									\
+	clear_bit(BH_##bit, &(bh)->b_state);				\
+}									\
+static inline int buffer_##name(struct buffer_head *bh)			\
+{									\
+	return test_bit(BH_##bit, &(bh)->b_state);			\
+}
+
+#undef buffer_uptodate
+#undef buffer_dirty
+BUFFER_FNS(Uptodate, uptodate)
+BUFFER_FNS(Dirty, dirty)
+
+#define clear_buffer_dirty  mark_buffer_clean
+
+#endif  /* LINUX_VERSION_CODE < 2.6 */
+
+
+#endif  /* CLUSTER_DLM_COMPAT_H */
+

Added: branches/dlm-glue/cluster/dlmcommon.h
===================================================================
--- branches/dlm-glue/cluster/dlmcommon.h	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/dlmcommon.h	2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,52 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmcommon.h
+ *
+ * Common stuff
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_DLMCOMMON_H
+#define CLUSTER_DLMCOMMON_H
+
+#define DLM_ASSERT(x)       ({  if (!(x)) { printk("assert failed! %s:%d\n", __FILE__, __LINE__); BUG(); } })
+
+typedef struct _nm_ctxt nm_ctxt;
+typedef struct _dlm_ctxt dlm_ctxt;
+typedef struct _heartbeat_ctxt heartbeat_ctxt;
+
+#define CLUSTER_DISK_UUID_LEN      32      // 16 byte binary == 32 char hex string
+
+typedef struct _cluster_disk
+{
+	// uuid of disk
+	char uuid[CLUSTER_DISK_UUID_LEN+1];
+	// all the rest are for heartbeat
+	kdev_t dev;
+	u32 blocksize_bits;
+	u32 num_blocks;
+	u64 start_block;
+	util_rarray slots;
+} cluster_disk;
+
+
+#endif /* CLUSTER_DLMCOMMON_H */
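
DLM_ASSERT() above is a BUG() wrapper that reports the failing file and
line first; a typical (hypothetical) call site looks like:

	/* halt immediately if the mastering invariant is violated */
	DLM_ASSERT(res->owner == dlm->group_index);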

Added: branches/dlm-glue/cluster/dlmmaster.c
===================================================================
--- branches/dlm-glue/cluster/dlmmaster.c	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/dlmmaster.c	2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,967 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmmaster.c
+ *
+ * standalone DLM module
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+
+
+#include "heartbeat.h"
+#include "nodemanager.h"
+#include "tcp.h"
+#include "dlmmod.h"
+
+
+
+spinlock_t dlm_master_lock = SPIN_LOCK_UNLOCKED;
+LIST_HEAD(dlm_master_list);
+
+
+static int dlm_init_mle(dlm_master_list_entry *mle, int type, dlm_ctxt *dlm, 
+			 dlm_lock_resource *res, struct qstr *name, int locked);
+
+static int dlm_init_mle(dlm_master_list_entry *mle, int type, dlm_ctxt *dlm, 
+			 dlm_lock_resource *res, struct qstr *name, int locked)
+{
+	int ret = 0;
+	
+	mle->dlm = dlm;
+	mle->type = type;
+	INIT_LIST_HEAD(&mle->list);
+	memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
+	spin_lock_init(&mle->spinlock);
+	init_waitqueue_head(&mle->wq);
+	atomic_set(&mle->woken, 0);
+	atomic_set(&mle->refcnt, 1);
+	memset(mle->response_map, 0, sizeof(mle->response_map));
+	mle->master = NM_MAX_NODES;
+	mle->error = 0;
+
+	if (mle->type == DLM_MLE_MASTER) 
+		mle->u.res = res;
+	else 
+		strncpy(mle->u.name.name, name->name, name->len);
+		
+	if (!locked)
+		spin_lock(&dlm->spinlock);
+
+	/* copy off the node_map and register hb callbacks on our copy */
+	memcpy(mle->node_map, dlm->node_map, sizeof(mle->node_map));
+	memcpy(mle->vote_map, dlm->node_map, sizeof(mle->vote_map));
+	clear_bit(dlm->group_index, mle->vote_map);
+	clear_bit(dlm->group_index, mle->node_map);
+
+#warning cannot do this here cuz this kmallocs and we are under a spinlock dammit
+	if (hb_register_callback(HB_NODE_DOWN_CB, dlm_mle_node_down, mle, DLM_HB_NODE_DOWN_PRI+1) ||
+	    hb_register_callback(HB_NODE_UP_CB, dlm_mle_node_up, mle, DLM_HB_NODE_UP_PRI+1)) {
+		ret = -EINVAL;
+	}
+
+	if (!locked)
+		spin_unlock(&dlm->spinlock);
+
+	return ret;
+}
+
+
+
+
+/////////////////////////////////////////////////
+//
+// TODO: change these comments to reflect reality
+// 
+//    master_request(target=me)
+//    wait for all responses
+//    if maybe_map is 0 there are no others in progress
+//        assert_master(me)
+//    else (maybe_map has some nodes in it)
+//        (nodes in maybe_map had better be < my node num)
+//        wait for assert_master
+//    endif     
+//
+//    
+//    receive:
+//        master_request(target):
+//            if i own it, return YES
+//            if i dont know anything about it, return NO
+//            if i have it in progress
+//                if my node number is lower
+//                    return MAYBE
+//                else
+//                    if target < lowest_so_far, lowest_so_far=target
+//                    return NO
+//
+//        assert_master(master):
+//            if i own it, BUG()!!!
+//            if i have it, but owner!=master, BUG()!!!
+//            if i dont know anything about it, ignore
+//            if i have it in progress
+//                if lowest_so_far != master
+//                    BUG()!!!
+//                else
+//                    set the owner, DONE
+//
+/////////////////////////////////////////////////
+
+
+/* remove from list and free */
+void dlm_put_mle(dlm_master_list_entry *mle)
+{
+	if (atomic_dec_and_lock(&mle->refcnt, &dlm_master_lock)) {
+		list_del(&mle->list);
+		spin_unlock(&dlm_master_lock);
+		hb_unregister_callback(HB_NODE_DOWN_CB, dlm_mle_node_down, mle);
+		hb_unregister_callback(HB_NODE_UP_CB, dlm_mle_node_up, mle);
+		kfree(mle);
+	}
+}
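
dlm_put_mle() pairs with dlm_get_mle(), which this diff calls but does
not show (it lives elsewhere in the branch).  The atomic_dec_and_lock()
idiom takes dlm_master_lock only on the final put, so gets and
non-final puts stay lock-free.  A plausible counterpart, assuming the
refcount is the only state involved:

	static inline void dlm_get_mle(dlm_master_list_entry *mle)
	{
		/* caller already holds a reference or dlm_master_lock,
		 * so the count cannot reach zero underneath us */
		atomic_inc(&mle->refcnt);
	}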
+
+
+
+
+/*
+ * lookup a lock resource by name.
+ * may already exist in the hashtable.
+ * 
+ * if not, allocate enough for the lockres and for
+ * the temporary structure used in doing the mastering.
+ * 
+ * also, do a lookup in the dlm_master_list to see
+ * if another node has begun mastering the same lock.
+ * if so, there should be a block entry in there
+ * for this name, and we should *not* attempt to master
+ * the lock here.   need to wait around for that node
+ * to assert_master (or die).
+ *
+ */
+dlm_lock_resource * dlm_get_lock_resource(dlm_ctxt *dlm, struct qstr *lockname, int flags)
+{
+	dlm_lock_resource *tmpres=NULL, *res=NULL;
+	struct list_head *bucket;
+	dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
+	struct list_head *iter;
+	int blocked = 0;
+	int map_changed = 0, restart = 0, assert = 0;
+	int ret, start, bit;
+	
+	bucket = &(dlm->resources[lockname->hash & DLM_HASH_MASK]);
+
+	/* pre-allocate a dlm_lock_resource and master stuff */
+	mle = kmalloc(sizeof(dlm_master_list_entry), GFP_KERNEL);
+	res = kmalloc(sizeof(dlm_lock_resource), GFP_KERNEL);
+	if (!mle || !res) {
+		printk("could not allocate memory for new lock resource!\n");
+		if (mle)
+			kfree(mle);
+		if (res)
+			kfree(res);
+		return NULL;
+	}
+
+	/* check for pre-existing lock */
+	spin_lock(&dlm->spinlock);
+	tmpres = __dlm_lookup_lock(dlm, lockname);
+	if (tmpres) {
+		spin_unlock(&dlm->spinlock);
+		/* TODO: return error, or return the lockres ?!? */
+		kfree(res);
+		kfree(mle);
+		/* waits for any outstanding work to finish 
+		 * will hold tmpres->spinlock on exit */
+		dlm_wait_on_lockres(tmpres);
+		return tmpres;
+	}
+
+	dlm_init_lockres(res, lockname);
+
+	if (flags & LKM_LOCAL) {
+		/* caller knows it's safe to assume it's not mastered elsewhere
+		 * DONE!  return right away */
+		list_add_tail(&res->list, bucket);
+		res->owner = dlm->group_index;
+		res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	
+		/* return with res->spinlock held */
+
+		/* lock ordering note: this lockres will not be 
+		 * visible until i release dlm->spinlock, so it 
+		 * is ok to release dlm->spinlock out of order here */
+		spin_lock(&res->spinlock);
+		
+		spin_unlock(&dlm->spinlock);
+		return res;
+	}
+		
+	/* look in master list to see if another node has started mastering this */
+	spin_lock(&dlm_master_lock);
+	list_for_each(iter, &dlm_master_list) {
+		tmpmle = list_entry(iter, dlm_master_list_entry, list);
+		if (!dlm_mle_equal(dlm, tmpmle, lockname))
+			continue;
+
+		if (tmpmle->type == DLM_MLE_MASTER) {
+			printk("impossible!  master entry for nonexistent lock!\n");
+			BUG();
+		}
+		dlm_get_mle(tmpmle);
+		blocked = 1;
+		// found a block!  must wait for lock to be mastered by another node
+		break;
+	}
+
+	if (!blocked) {
+		/* go ahead and try to master lock on this node */
+		if (dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 1)) {
+			printk("bug! failed to register hb callbacks\n");
+			BUG();
+		}
+		list_add(&mle->list, &dlm_master_list);
+	}
+	spin_unlock(&dlm_master_lock);
+
+	/* at this point there is either a DLM_MLE_BLOCK or a DLM_MLE_MASTER 
+	 * on the master list, so it's safe to add the lockres to the hashtable.
+	 * anyone who finds the lock will still have to wait on the IN_PROGRESS. 
+	 * also, any new nodes that try to join at this point will have to wait
+	 * until my dlm_master_lock list is empty, so they cannot possibly 
+	 * do any master requests yet... TODO
+	 * ?? should i have a special type of mle just for joining nodes ?? 
+	 * ?? could allow them to come in and put their mle on the list and sleep ?? */
+
+	/* finally add the lockres to its hash bucket */
+	list_add_tail(&res->list, bucket);
+	spin_unlock(&dlm->spinlock);
+
+	if (blocked) {
+		/* must wait for lock to be mastered elsewhere */
+		kfree(mle);
+		mle = tmpmle;
+		goto wait;
+	}
+
+	ret = -EINVAL;
+	start = 0;
+	while (1) {
+		bit = find_next_bit (mle->vote_map, NM_MAX_NODES, start);
+		if (bit >= NM_MAX_NODES) {
+			printk("no more nodes\n");
+			break;
+		}
+		
+		ret = dlm_do_master_request(mle, bit);
+		if (ret < 0) {
+			// TODO
+			//printk("dlm_do_master_request returned %d!\n", ret);
+		}
+		if (mle->master != NM_MAX_NODES) {
+			// found a master!
+			break;
+		}
+		start = bit+1;
+	}
+
+wait:
+	while (1) {
+		spin_lock(&res->spinlock);
+		if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			// another node has become the owner
+			spin_unlock(&res->spinlock);
+			break;
+		}
+		spin_unlock(&res->spinlock);
+
+		spin_lock(&mle->spinlock);
+		if (mle->master != NM_MAX_NODES) {
+			u16 m = mle->master;
+			// printk("node %u is the master!\n", m);
+			spin_unlock(&mle->spinlock);
+
+			spin_lock(&res->spinlock);
+			res->owner = m;
+			spin_unlock(&res->spinlock);
+			break;
+		}
+		restart = 0;
+		map_changed = (memcmp(mle->vote_map, mle->node_map, sizeof(mle->vote_map)) != 0);
+		if (memcmp(mle->vote_map, mle->response_map, sizeof(mle->vote_map)) == 0) {
+			// printk("every node has responded...\n");
+			if (map_changed) {
+				printk("eek! got all original nodes, but nodemap changed while collecting responses\n");
+				restart = 1;
+			}
+
+			if (mle->error) {
+				printk("ugh.  some node hit an error (-ENOMEM).  try the whole thing again\n"); 
+				mle->error = 0;
+				/* TODO: treat this just like the dead node case below,
+				 * cleanup and start over, but keep the error node around */
+				restart = 1;
+			}
+
+			if ((bit = find_next_bit (mle->maybe_map, NM_MAX_NODES, 0)) >= NM_MAX_NODES) {
+				/* no other nodes are in-progress */
+				/* those nodes should all be locking out this lockid until I assert */
+				/* they should have put a dummy entry on dlm_master_list */
+				/* need to assert myself as the master */
+				
+				// printk("I am the only node in-progress!  asserting myself as master\n");
+				assert = 1;
+			} else {
+				/* other nodes are in-progress */
+				if (map_changed && !test_bit(bit, mle->node_map)) {
+					/* TODO: need to copy the node_map into the vote_map, zero 
+					 * everything out and start over */
+					printk("need to handle this case!  winning node %u just died!\n", bit);
+					restart = 1;
+				}
+
+				if (bit > dlm->group_index) {
+					// printk("next in-progress node (%u) is higher than me (%u)\n",
+					//        bit, dlm->group_index);
+
+					/* nodes not in-progress should be locking out this lockid until I assert */
+					/* in-progress nodes should match me up with their lowest maybe_map bit */
+					/* need to assert myself as the master */
+
+					// printk("I am the lowest node!  asserting myself as master\n");
+					assert = 1;
+				} else {
+					/* need to sit around and wait for assert */
+					/* my lowest maybe_map bit should be the one to assert */
+					/* just fall through and sleep. should be woken by the handler */
+
+					// printk("sleeping while waiting for %u to assert himself as master\n", bit);
+				}
+			}
+		} else {
+			if (map_changed) {
+				/* TODO: need to handle this */
+				printk("eek! nodemap changed while collecting responses\n");
+				restart = 1;
+			}
+			// printk("still waiting for all nodes to respond...\n");
+		}
+
+		if (restart && assert)
+			assert = 0;
+
+		/* make sure to tell any other nodes that i am mastering this */
+		if (assert)
+			mle->master = dlm->group_index;
+
+		spin_unlock(&mle->spinlock);
+		
+		if (assert) {
+			ret = dlm_do_assert_master(mle);
+			// printk("assert returned %d!\n", ret);
+			if (ret == 0) {
+				spin_lock(&res->spinlock);
+				res->owner = dlm->group_index;
+				spin_unlock(&res->spinlock);
+				// printk("wooo!  i am the owner.  phew!\n");
+				break;
+			} else 
+				restart = 1;
+		}
+		if (restart) {
+			printk("something happened such that the master process needs to be restarted!\n");
+			/* TODO: clear it all out and start over */
+		}
+
+		atomic_set(&mle->woken, 0);
+		ret = util_wait_atomic_eq(&mle->wq, &mle->woken, 1, 5000);
+	}
+	dlm_put_mle(mle);
+
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	wake_up(&res->wq);
+
+	/* exits holding res->spinlock */
+	return res;
+}
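
Note the locking contract: every path out of dlm_get_lock_resource()
(the LKM_LOCAL fast path, the pre-existing-lockres path via
dlm_wait_on_lockres(), and the mastering loop above) returns with
res->spinlock held, so the unlock is the caller's job.  A minimal
caller sketch, with hypothetical surrounding code:

	dlm_lock_resource *res;

	res = dlm_get_lock_resource(dlm, &lockname, flags);
	if (!res)
		return -ENOMEM;
	/* ... queue or grant the lock while res->spinlock is held ... */
	spin_unlock(&res->spinlock);	/* caller, not callee, drops it */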
+	
+
+
+
+/*
+ * locks that can be taken here:
+ * dlm->spinlock
+ * res->spinlock
+ * mle->spinlock
+ * dlm_master_list
+ *
+ * if possible, TRIM THIS DOWN!!!
+ */
+int dlm_master_request_handler(net_msg *msg, u32 len, void *data)
+{
+	u8 response = DLM_MASTER_RESP_MAYBE;
+	dlm_ctxt *dlm = data;
+	dlm_lock_resource *res;
+	dlm_master_request *request = (dlm_master_request *) msg->buf;
+	dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
+	struct qstr lockname = { .name=request->name, .len=request->namelen };
+	int found;
+	struct list_head *iter;
+
+	lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+way_up_top:	
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lock(dlm, &lockname);
+	if (res) {
+		spin_unlock(&dlm->spinlock);
+
+		/* take care of the easy cases up front */
+		spin_lock(&res->spinlock);
+		if (res->owner == dlm->group_index) {
+			spin_unlock(&res->spinlock);
+			// printk("this node is the master\n");
+			response = DLM_MASTER_RESP_YES;
+			if (mle)
+				kfree(mle);
+			goto send_response;
+		} else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			spin_unlock(&res->spinlock);
+			// printk("node %u is the master\n", res->owner);
+			response = DLM_MASTER_RESP_NO;
+			if (mle)
+				kfree(mle);
+			goto send_response;
+		}
+
+		/* ok, there is no owner.  either this node is 
+		 * being blocked, or it is actively trying to
+		 * master this lock. */
+		if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
+			printk("bug! lock with no owner should be in-progress!\n");
+			BUG();
+		}
+
+		// printk("lockres is in progress...\n");
+		found = 0;
+		spin_lock(&dlm_master_lock);
+		list_for_each(iter, &dlm_master_list) {
+			tmpmle = list_entry(iter, dlm_master_list_entry, list);
+			if (!dlm_mle_equal(dlm, tmpmle, &lockname))
+				continue;
+
+			dlm_get_mle(tmpmle);
+			spin_lock(&tmpmle->spinlock);
+			if (tmpmle->type == DLM_MLE_BLOCK) {
+				// printk("this node is waiting for lockres to be mastered\n");
+				response = DLM_MASTER_RESP_NO;
+			} else {
+				// printk("this node is attempting to master lockres\n");
+				response = DLM_MASTER_RESP_MAYBE;
+			}
+			set_bit(request->node_idx, tmpmle->maybe_map);
+			spin_unlock(&tmpmle->spinlock);
+
+			spin_unlock(&dlm_master_lock);
+			spin_unlock(&res->spinlock);
+
+			dlm_put_mle(tmpmle);
+			if (mle)
+				kfree(mle);
+			goto send_response;
+		}
+		spin_unlock(&dlm_master_lock);
+		spin_unlock(&res->spinlock);
+		printk("bug bug bug!!!  no mle found for this lock!\n");
+		BUG();
+	}
+	
+	/* 
+	 * lockres doesn't exist on this node 
+	 * if there is an MLE_BLOCK, return NO 
+	 * if there is an MLE_MASTER, return MAYBE
+	 * otherwise, add an MLE_BLOCK, return NO 
+	 */
+	found = 0;
+	spin_lock(&dlm_master_lock);
+	list_for_each(iter, &dlm_master_list) {
+		tmpmle = list_entry(iter, dlm_master_list_entry, list);
+		if (!dlm_mle_equal(dlm, tmpmle, &lockname))
+			continue;
+		dlm_get_mle(tmpmle);
+		found = 1;
+		break;
+	}
+
+	if (!found) {
+		/* this lockid has never been seen on this node yet */
+		// printk("no mle found\n");
+		if (!mle) {
+			spin_unlock(&dlm_master_lock);
+			spin_unlock(&dlm->spinlock);
+	
+			mle = kmalloc(sizeof(dlm_master_list_entry) + lockname.len, GFP_KERNEL);
+			if (!mle) {
+				// bad bad bad... this sucks.
+				response = DLM_MASTER_RESP_ERROR;
+				goto send_response;
+			}
+			if (dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, &lockname, 0)) {
+				printk("eeek!\n");
+				response = DLM_MASTER_RESP_ERROR;
+				dlm_put_mle(mle);
+				goto send_response;
+			}
+			goto way_up_top;
+		}
+
+		// printk("this is second time thru, already allocated, add the block.\n");
+		set_bit(request->node_idx, mle->maybe_map);
+		list_add(&mle->list, &dlm_master_list);
+		response = DLM_MASTER_RESP_NO;
+	} else {
+		// printk("mle was found\n");
+		spin_lock(&tmpmle->spinlock);
+		if (tmpmle->type == DLM_MLE_BLOCK)
+			response = DLM_MASTER_RESP_NO;
+		else
+			response = DLM_MASTER_RESP_MAYBE;
+		set_bit(request->node_idx, tmpmle->maybe_map);
+		spin_unlock(&tmpmle->spinlock);
+		dlm_put_mle(tmpmle);
+	}
+	spin_unlock(&dlm_master_lock);
+	spin_unlock(&dlm->spinlock);
+
+send_response:
+	//ret = dlm_do_master_request_resp(dlm, &lockname, response, request->node_idx);
+	//printk("response returned %d\n", ret);
+	
+	// printk("sending response %d to other node\n", response);
+	return response;
+}
+
+/* NOTE: when doing node recovery, run the dlm_master_list looking for the dead node in 
+ * any maybe_map... clear that bit, and if now empty, clear the whole thing */
+
+/*
+ * locks that can be taken here:
+ * mle->spinlock
+ * dlm_master_list
+ *
+ */
+int dlm_master_request_resp_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	dlm_master_list_entry *mle = NULL;
+	dlm_master_request_resp *resp = (dlm_master_request_resp *) msg->buf;
+	int found = 0, wake = 0;
+	struct list_head *iter;
+	struct qstr lockname = { .name=resp->name, .len=resp->namelen };
+	
+
+	lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+	spin_lock(&dlm_master_lock);
+	list_for_each(iter, &dlm_master_list) {
+		mle = list_entry(iter, dlm_master_list_entry, list);
+		if (!dlm_mle_equal(dlm, mle, &lockname)) {
+			mle = NULL;
+			continue;
+		}
+
+		dlm_get_mle(mle);
+		if (mle->type == DLM_MLE_BLOCK) {
+			printk("eek! cannot get a response for a block!\n");
+			break;
+		}
+		found = 1;
+		wake = 0;
+		spin_lock(&mle->spinlock);
+		switch (resp->response) {
+			case DLM_MASTER_RESP_YES:
+				set_bit(resp->node_idx, mle->response_map);
+				// printk("woot!  node %u is the master!\n", resp->node_idx);
+				mle->master = resp->node_idx;
+				wake = 1;
+				break;
+			case DLM_MASTER_RESP_NO:
+				// printk("node %u is not the master, not in-progress\n", resp->node_idx);
+				set_bit(resp->node_idx, mle->response_map);
+				if (memcmp(mle->response_map, mle->vote_map, sizeof(mle->vote_map))==0)
+					wake = 1;
+				break;
+			case DLM_MASTER_RESP_MAYBE:
+				// printk("node %u is not the master, but IS in-progress\n", resp->node_idx);
+				set_bit(resp->node_idx, mle->response_map);
+				set_bit(resp->node_idx, mle->maybe_map);
+				if (memcmp(mle->response_map, mle->vote_map, sizeof(mle->vote_map))==0)
+					wake = 1;
+				break;
+			case DLM_MASTER_RESP_ERROR:
+				printk("node %u hit an -ENOMEM!  try this whole thing again\n", resp->node_idx);
+				mle->error = 1;
+				wake = 1;
+				break;
+			default:
+				printk("bad response! %u\n", resp->response);
+				break;
+		}
+		if (wake) {		
+			atomic_set(&mle->woken, 1);
+			wake_up(&mle->wq);
+		}
+		spin_unlock(&mle->spinlock);
+		break;
+	}
+	spin_unlock(&dlm_master_lock);
+
+	if (found)
+		dlm_put_mle(mle);
+	else
+		printk("hrrm... got a master resp but found no matching request\n");
+	return 0;
+}
+
+/*
+ * locks that can be taken here:
+ * dlm->spinlock
+ * res->spinlock
+ * mle->spinlock
+ * dlm_master_list
+ *
+ * if possible, TRIM THIS DOWN!!!
+ */
+int dlm_assert_master_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	dlm_master_list_entry *mle = NULL;
+	dlm_assert_master *assert = (dlm_assert_master *)msg->buf;
+	dlm_lock_resource *res;
+	int bit;
+	struct list_head *iter;
+	struct qstr lockname = { .name=assert->name, .len=assert->namelen };
+
+	lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+	spin_lock(&dlm->spinlock);
+
+	/* find the MLE */
+	spin_lock(&dlm_master_lock);
+	list_for_each(iter, &dlm_master_list) {
+		mle = list_entry(iter, dlm_master_list_entry, list);
+		if (dlm_mle_equal(dlm, mle, &lockname)) {
+			dlm_get_mle(mle);
+			break;
+		}
+		mle = NULL;
+	}
+	if (!mle) {
+		printk("EEEEEEK!  just got an assert_master from %u, but no MLE for it!\n",
+		       assert->node_idx);
+		spin_unlock(&dlm_master_lock);
+		goto check_lockres;
+	}
+	if ((bit = find_next_bit (mle->maybe_map, NM_MAX_NODES, 0)) >= NM_MAX_NODES) {
+		printk("EEK! no bits set in the maybe_map, but %u is asserting!\n",
+		       assert->node_idx);
+		BUG();
+	} else if (bit != assert->node_idx) {
+		/* TODO: is this ok?  */
+		printk("EEK! expected %u to be the master, but %u is asserting!\n", 
+		       bit, assert->node_idx);
+		BUG();
+	}
+	spin_unlock(&dlm_master_lock);
+
+	/* ok everything checks out with the MLE
+	 * now check to see if there is a lockres */
+check_lockres:
+	res = __dlm_lookup_lock(dlm, &lockname);
+	if (res) {
+		spin_lock(&res->spinlock);
+		if (!mle) {
+			if (res->owner != assert->node_idx) {
+				printk("EEEEeeEEeeEEEK!  assert_master from %u, but current owner is %u!\n",
+				       assert->node_idx, res->owner);
+				BUG();
+			}
+		} else {
+			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+				printk("EEEEEEEEEEEEEEEEEK!!! got assert_master from node %u, but %u is the owner!\n",
+			       		assert->node_idx, res->owner);
+				printk("goodnite!\n");
+				BUG();
+			}
+			if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
+				printk("bug! got assert from %u, but lock with no owner should be in-progress!\n",
+			       		assert->node_idx);
+				BUG();
+			}
+		}
+		spin_unlock(&res->spinlock);
+	}
+	spin_unlock(&dlm->spinlock);
+
+	// printk("woo!  got an assert_master from node %u!\n", assert->node_idx);
+	if (mle) {
+		spin_lock(&mle->spinlock);
+		mle->master = assert->node_idx;
+		atomic_set(&mle->woken, 1);
+		wake_up(&mle->wq);
+		spin_unlock(&mle->spinlock);
+	
+		/* if this is the last put, it will be removed from the list */
+		dlm_put_mle(mle);
+	}
+	return 0;
+}
+
+
+int dlm_do_master_request(dlm_master_list_entry *mle, int to)
+{
+	struct inode *inode = NULL;
+	dlm_ctxt *dlm = mle->dlm;
+	dlm_master_request request;
+	int ret, response=0;
+
+	memset(&request, 0, sizeof(request));
+	request.node_idx = dlm->group_index;
+	if (mle->type == DLM_MLE_BLOCK) {
+		request.namelen = mle->u.name.len;
+		strncpy(request.name, mle->u.name.name, request.namelen);
+	} else {
+		request.namelen = mle->u.res->lockname.len;
+		strncpy(request.name, mle->u.res->lockname.name, request.namelen);
+	}
+
+	ret = -EINVAL;
+	inode = nm_get_group_node_by_index(dlm->group, to);
+	if (inode) {
+		ret = net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, sizeof(request), inode, &response);
+		iput(inode);
+		if (ret >= 0) {
+			spin_lock(&mle->spinlock);
+			switch (response) {
+				case DLM_MASTER_RESP_YES:
+					set_bit(to, mle->response_map);
+					// printk("woot!  node %u is the master!\n", to);
+					mle->master = to;
+					break;
+				case DLM_MASTER_RESP_NO:
+					// printk("node %u is not the master, not in-progress\n", to);
+					set_bit(to, mle->response_map);
+					break;
+				case DLM_MASTER_RESP_MAYBE:
+					// printk("node %u is not the master, but IS in-progress\n", to);
+					set_bit(to, mle->response_map);
+					set_bit(to, mle->maybe_map);
+					break;
+				case DLM_MASTER_RESP_ERROR:
+					printk("node %u hit an -ENOMEM!  try this whole thing again\n", to);
+					mle->error = 1;
+					break;
+				default:
+					printk("bad response! %u\n", response);
+					ret = -EINVAL;
+					break;
+			}
+			spin_unlock(&mle->spinlock);
+		} else {
+			printk("net_send_message returned %d!\n", ret);
+		}
+	} else {
+		printk("nm_get_group_node_by_index failed to find inode for node %d!\n", to);
+	}	
+	return ret;
+}
+
+int dlm_do_master_request_resp(dlm_ctxt *dlm, struct qstr *name, int response, int to)
+{
+	struct inode *inode = NULL;
+	dlm_master_request_resp resp;
+	int ret;
+
+	memset(&resp, 0, sizeof(resp));
+	resp.node_idx = dlm->group_index;
+	resp.response = response;
+	resp.namelen = name->len;
+	strncpy(resp.name, name->name, name->len);
+
+	inode = nm_get_group_node_by_index(dlm->group, to);
+	if (!inode)
+		return -EINVAL;
+	ret = net_send_message(DLM_MASTER_REQUEST_RESP_MSG, dlm->key, &resp, sizeof(resp), inode, NULL);
+	iput(inode);
+	return ret;
+}
+
+/*
+ * NOTE: this can be used for debugging
+ * can periodically run all locks owned by this node
+ * and re-assert across the cluster...
+ */
+int dlm_do_assert_master(dlm_master_list_entry *mle)
+{
+	struct inode *inode = NULL;
+	dlm_ctxt *dlm = mle->dlm;
+	dlm_assert_master assert;
+	int to, start = 0, ret = 0, tmpret;
+
+	while (1) {
+		to = find_next_bit (mle->vote_map, NM_MAX_NODES, start);
+		if (to >= NM_MAX_NODES) {
+			// printk("no more nodes\n");
+			break;
+		}
+		// printk("sending assert master to %d\n", to);
+
+		memset(&assert, 0, sizeof(assert));
+		assert.node_idx = dlm->group_index;
+		if (mle->type == DLM_MLE_BLOCK) {
+			assert.namelen = mle->u.name.len;
+			strncpy(assert.name, mle->u.name.name, assert.namelen);
+		} else {
+			assert.namelen = mle->u.res->lockname.len;
+			strncpy(assert.name, mle->u.res->lockname.name, assert.namelen);
+		}
+
+		inode = nm_get_group_node_by_index(dlm->group, to);
+		if (!inode) {
+			tmpret = -EINVAL;
+			printk("could not get nm info for node %d!  need to retry this whole thing\n", to);
+			ret = tmpret;
+			break;
+		}
+		/* this is an assert, not a master request: send it with the
+		 * message type the assert handler is registered under */
+		tmpret = net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, &assert, sizeof(assert), inode, NULL);
+		iput(inode);
+
+		if (tmpret < 0) {
+			// TODO
+			// printk("assert_master returned %d!\n", tmpret);
+			ret = tmpret;
+			break;
+		}
+		start = to+1;
+	}
+
+	return ret;
+}
+
+
+void dlm_mle_node_down(struct inode *group, struct inode *node, int idx, void *data)
+{
+	dlm_master_list_entry *mle;
+	dlm_ctxt *dlm;
+
+	mle = data;
+	if (!mle) {
+		printk("eek! NULL mle!\n");
+		return;
+	}
+	if (!mle->dlm) {
+		printk("eek! NULL dlm\n");
+		return;
+	}
+	dlm = mle->dlm;
+	if (dlm->group != group)
+		return;
+
+	spin_lock(&mle->spinlock);
+
+	if (!test_bit(idx, mle->node_map))
+		printk("node %u already removed from nodemap!\n", idx);
+	else
+		clear_bit(idx, mle->node_map);
+
+#if 0	
+	if (test_bit(idx, mle->recovery_map))
+		printk("node %u already added to recovery map!\n", idx);
+	else
+		set_bit(idx, mle->recovery_map);
+#endif
+	spin_unlock(&mle->spinlock);
+}
+
+void dlm_mle_node_up(struct inode *group, struct inode *node, int idx, void *data)
+{
+	dlm_master_list_entry *mle;
+	dlm_ctxt *dlm;
+
+	mle = data;
+	if (!mle) {
+		printk("eek! NULL mle!\n");
+		return;
+	}
+	if (!mle->dlm) {
+		printk("eek! NULL dlm\n");
+		return;
+	}
+	dlm = mle->dlm;
+	if (dlm->group != group)
+		return;
+
+	spin_lock(&mle->spinlock);
+
+#if 0	
+	if (test_bit(idx, mle->recovery_map))
+		printk("BUG!!! node up message on node in recovery (%u)!!!\n", idx);
+	else 
+#endif
+	{
+		if (test_bit(idx, mle->node_map))
+			printk("node %u already in node map!!!\n", idx);
+		else 
+			set_bit(idx, mle->node_map);
+	}
+
+	spin_unlock(&mle->spinlock);
+}

Added: branches/dlm-glue/cluster/dlmmod.c
===================================================================
--- branches/dlm-glue/cluster/dlmmod.c	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/dlmmod.c	2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,1652 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmmod.c
+ *
+ * standalone DLM module
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+
+
+#include "heartbeat.h"
+#include "nodemanager.h"
+#include "tcp.h"
+#include "dlmmod.h"
+
+MODULE_LICENSE ("GPL");
+MODULE_AUTHOR("Oracle Corporation");
+//MODULE_DESCRIPTION("Oracle DLM");
+
+
+/*
+ *
+ * spinlock lock ordering: if multiple locks are needed, always obey this ordering:
+ *    dlm_domain_lock -> dlm_ctxt -> dlm_lock_resource -> dlm_lock
+ *
+ */
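+
+/*
+ * Illustrative sketch only (nothing calls this; the function name is
+ * made up): what honoring that ordering looks like when a ctxt and one
+ * of its lock resources must both be held.
+ */
+#if 0
+static void example_ordered_locking(dlm_ctxt *dlm, dlm_lock_resource *res)
+{
+	spin_lock(&dlm_domain_lock);	/* outermost lock first */
+	spin_lock(&dlm->spinlock);	/* then the dlm_ctxt */
+	spin_lock(&res->spinlock);	/* then the dlm_lock_resource */
+	/* ... do work ... */
+	spin_unlock(&res->spinlock);	/* release in reverse order */
+	spin_unlock(&dlm->spinlock);
+	spin_unlock(&dlm_domain_lock);
+}
+#endif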
+
+
+static int __init dlm_driver_entry (void);
+static int dlm_read_params(void);
+static void __exit dlm_driver_exit (void);
+
+
+
+LIST_HEAD(dlm_domains);
+spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED;
+u16 dlm_global_index = NM_MAX_NODES;
+static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED;
+static u64 dlm_next_cookie = 1;
+
+dlm_status dlm_send_remote_convert_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type);
+dlm_status dlm_send_remote_lock_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags);
+int dlm_lock_owner_broadcast(dlm_ctxt *dlm, dlm_lock_resource *res);
+static dlm_ctxt * __dlm_lookup_domain(char *domain);
+int dlm_send_proxy_ast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int type, int blocked_type);
+
+void dlm_wait_on_lockres(dlm_lock_resource *res);
+void __dlm_wait_on_lockres(dlm_lock_resource *res);
+
+
+/* ----------------------------------------------------------------- */
+
+extern spinlock_t dlm_master_lock;
+extern struct list_head dlm_master_list;
+
+typedef struct _dlm_create_lock
+{
+	u16 node_idx;
+	s8 requested_type;
+	u8 namelen;
+	u8 name[NM_MAX_NAME_LEN];
+	u64 cookie;
+} dlm_create_lock;
+
+typedef struct _dlm_convert_lock
+{
+	u16 node_idx;
+	s8 requested_type;
+	u8 namelen;
+	u8 name[NM_MAX_NAME_LEN];
+	u64 cookie;
+} dlm_convert_lock;
+
+typedef struct _dlm_unlock_lock
+{
+	u32 flags;
+	u16 node_idx;
+	u8 namelen;
+	u8 name[NM_MAX_NAME_LEN];
+	u64 cookie;
+} dlm_unlock_lock;
+
+typedef struct _dlm_proxy_ast
+{
+	u16 node_idx;
+	u8 type;
+	u8 blocked_type;
+	u8 namelen;
+	u8 name[NM_MAX_NAME_LEN];
+	u64 cookie;
+} dlm_proxy_ast;
+
+int dlm_create_lock_handler(net_msg *msg, u32 len, void *data);
+int dlm_convert_lock_handler(net_msg *msg, u32 len, void *data);
+int dlm_proxy_ast_handler(net_msg *msg, u32 len, void *data);
+
+int dlm_unlock_lock_handler(net_msg *msg, u32 len, void *data);
+dlm_status dlm_send_remote_unlock_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, dlm_lockstatus *lksb, int flags);
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * dlm_driver_entry()
+ *
+ * Driver entry point. Called on insmod.
+ */
+static int __init dlm_driver_entry (void)
+{
+	int status;
+
+
+	printk("Loaded dlm Driver module\n");
+	status = dlm_read_params();
+	if (status < 0)
+		return -1;
+
+	dlm_global_index = nm_this_node(NULL);
+	if (dlm_global_index == NM_MAX_NODES)
+		return -1;
+
+	return 0;
+}				/* dlm_driver_entry */
+
+/*
+ * dlm_read_params()
+ *
+ * Read insmod params
+ */
+static int dlm_read_params(void)
+{
+	int status = 0;
+	return status;
+}				/* dlm_read_params */
+
+
+/*
+ * dlm_driver_exit()
+ *
+ * Called on rmmod
+ */
+static void __exit dlm_driver_exit (void)
+{
+	printk("Unloaded dlm Driver module\n");
+	return;
+}				/* dlm_driver_exit */
+
+
+dlm_status dlmlock(dlm_ctxt *dlm, int mode, dlm_lockstatus *lksb, int flags, char *name, 
+		   dlm_astlockfunc_t *ast, void *data, dlm_bastlockfunc_t *bast)
+{
+	dlm_status status;
+	dlm_lock_resource *res;
+	dlm_lock *lock = NULL;
+	char *buf = NULL;
+	int convert = 0, recovery = 0;
+	struct qstr q;
+
+	if (!lksb)
+		return DLM_BADARGS;
+
+	status = DLM_BADPARAM;
+	if (mode != LKM_EXMODE && mode != LKM_PRMODE && mode != LKM_NLMODE)
+		goto error_status;
+
+	if (flags & ~LKM_VALID_FLAGS)
+		goto error_status;
+
+	convert = (flags & LKM_CONVERT);
+	recovery = (flags & LKM_RECOVERY);
+
+	/* LKM_RECOVERY is only valid on the $RECOVERY lock itself, and
+	 * never on a convert */
+	if (recovery &&
+	    (convert || !dlm_is_recovery_lock(name, strlen(name)))) {
+		goto error_status;
+	}
+
+
+	if (convert) {
+		/* if converting, must pass in a valid dlm_lock */
+		if (!lksb->lockid || !lksb->lockid->lockres)
+			goto error_status;
+		lock = lksb->lockid;
+	
+		/* XXX: for ocfs2 purposes, the ast/bast/astdata/lksb are essentially
+	 	 * static after the original lock call.  convert requests will check
+	 	 * to ensure that everything is the same and pass DLM_BADARGS if not.
+	 	 * this means that DLM_DENIED_NOASTS will never be returned.
+	 	 */
+#warning differs from spec here!
+
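+		/* e.g. (illustrative, the my_* names are made up): a convert
+		 * must repeat the original callbacks and data exactly:
+		 *
+		 *   dlmlock(dlm, LKM_EXMODE, lksb, LKM_CONVERT, name,
+		 *           my_ast, my_data, my_bast);
+		 */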
+		if (lock->lksb != lksb || lock->ast != ast ||
+		    lock->bast != bast || lock->astdata != data) {
+			status = DLM_BADARGS;
+			printk("ERROR new args:  lksb=%p, ast=%p, bast=%p, astdata=%p\n", 
+			       lksb, ast, bast, data);
+			printk("      orig args: lksb=%p, ast=%p, bast=%p, astdata=%p\n", 
+			       lock->lksb, lock->ast, lock->bast, lock->astdata);
+			goto error_status;
+		}
+		res = lock->lockres;
+
+		down_read(&dlm->recovery_sem);
+		spin_lock(&res->spinlock);
+		if (flags & LKM_LOCAL) {
+			printk("strange LOCAL convert request!\n");
+			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+				spin_unlock(&res->spinlock);
+				status = DLM_BADPARAM;
+				goto up_error;
+			}
+			res->owner = dlm->group_index;
+			printk("set owner to this node.  you SURE thats what you want!?\n");
+		}
+		status = do_dlmconvert(dlm, res, lock, flags, mode);
+	} else {
+		status = DLM_BADARGS;
+		if (!name)
+			goto error;
+		
+		status = DLM_IVBUFLEN;
+		q.len = strlen(name);
+		if (q.len > DLM_LOCKID_NAME_MAX)
+			goto error;
+
+		status = DLM_SYSERR;
+		buf = kmalloc(q.len+1, GFP_KERNEL);
+		if (!buf)
+			goto error;
+
+		memcpy(buf, name, q.len);
+		buf[q.len] = 0;
+		q.name = buf;
+		q.hash = full_name_hash(q.name, q.len);
+
+		if (!recovery)
+			down_read(&dlm->recovery_sem);
+		{
+			/* debug instrumentation: time the lookup */
+			union {
+				u64 q;
+				u32 hilo[2];
+			} u1, u2;
+
+			rdtsc(u1.hilo[0], u1.hilo[1]);
+			res = dlm_get_lock_resource(dlm, &q, flags);
+			rdtsc(u2.hilo[0], u2.hilo[1]);
+			printk("dlm_get_lock_resource took %llu cycles\n",
+			       u2.q - u1.q);
+		}
+		if (!res) {
+			status = DLM_IVLOCKID;
+			goto up_error;
+		}
+		status = do_dlmlock(dlm, res, lksb, flags, mode, ast, bast, data);
+		if (status != DLM_NORMAL)
+			goto up_error;
+	}
+
+	/* TODO: lvb */
+	if (!recovery)
+		up_read(&dlm->recovery_sem);
+	return status;
+
+up_error:
+	if (!recovery)
+		up_read(&dlm->recovery_sem);
+error:
+	if (buf)
+		kfree(buf);
+	lksb->lockid = NULL;
+
+error_status:
+	// this is kind of unnecessary
+	lksb->status = status;
+	return status;
+}
+
+dlm_status do_dlmlock(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lockstatus *lksb, int flags,
+		      int type, dlm_astlockfunc_t *ast, dlm_bastlockfunc_t *bast, void *data)
+{
+	dlm_lock *tmplock;
+	dlm_status status;
+	u8 *c;
+
+	dlmprintk("type=%d\n", type);
+
+	status = DLM_SYSERR;
+	tmplock = kmalloc(sizeof(dlm_lock), GFP_KERNEL);
+	if (!tmplock)
+		goto error;
+
+	memset(tmplock, 0, sizeof(dlm_lock));
+	INIT_LIST_HEAD(&tmplock->list);
+	INIT_LIST_HEAD(&tmplock->ast_list);
+	spin_lock_init(&tmplock->spinlock);
+	tmplock->lockres = res;
+	tmplock->type = type;
+	tmplock->convert_type = LKM_IVMODE;
+	tmplock->highest_blocked = LKM_IVMODE;
+	tmplock->node = dlm->group_index;
+	tmplock->ast = ast;
+	tmplock->bast = bast;
+	tmplock->astdata = data;
+	tmplock->lksb = lksb;
+
+	lksb->lockid = tmplock;
+
+	c = (u8 *)(&tmplock->cookie);
+
+	spin_lock(&dlm_cookie_lock);
+	tmplock->cookie = dlm_next_cookie;
+	dlm_next_cookie++;
+	if (dlm_next_cookie & 0xff00000000000000ull) {
+		printk("eek! this node's cookie will now wrap!\n");
+		dlm_next_cookie = 1;
+	}
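+	/* stamp the low byte of the node number into c[7]; on the
+	 * little-endian x86 targets this code already assumes (see the
+	 * rdtsc calls), that is the most significant byte of the u64
+	 * cookie, which the wrap check above keeps clear */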
+	c[7] = (u8)(tmplock->node & 0x00ff);
+	spin_unlock(&dlm_cookie_lock);
+
+	if (res->owner == dlm->group_index)
+		status = dlmlock_local(dlm, res, tmplock, flags);
+	else 
+		status = dlmlock_remote(dlm, res, tmplock, flags);
+error:
+	if (status != DLM_NORMAL) {
+		if (tmplock)
+			kfree(tmplock);
+		lksb->lockid = NULL;
+	}
+	return status;
+}
+
+/* must be already holding lockres->spinlock */
+dlm_status dlmlock_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags)
+{
+	struct list_head *iter;
+	dlm_lock *tmplock;
+	int got_it = 0;
+
+	BUG_ON(!lock);
+	BUG_ON(!res);
+	BUG_ON(!dlm);
+
+	if (lock->node == dlm->group_index) {
+		BUG_ON(!lock->lksb);
+	}
+
+	dlmprintk("type=%d\n", lock->type);
+
+	list_for_each(iter, &res->granted) {
+		tmplock = list_entry(iter, dlm_lock, list);
+		if (!dlm_lock_compatible(tmplock->type, lock->type)) {
+			list_add_tail(&lock->list, &res->blocked);
+			goto done;
+		}
+	}
+
+	list_for_each(iter, &res->converting) {
+		tmplock = list_entry(iter, dlm_lock, list);
+		if (!dlm_lock_compatible(tmplock->type, lock->type)) {
+			list_add_tail(&lock->list, &res->blocked);
+			goto done;
+		}
+	}
+
+	/* got it right away */
+
+	/* if it is a remote request, proxy 
+	 * handler will set the lksb status */
+	if (lock->node == dlm->group_index)
+		lock->lksb->status = DLM_NORMAL;
+
+	list_add_tail(&lock->list, &res->granted);
+
+	if (dlm_do_ast(dlm, res, lock) < 0)
+		printk("eek\n");
+	got_it = 1;
+
+done:
+	if (!got_it && (flags & LKM_NOQUEUE)) {
+		/* a NOQUEUE request must not linger on the blocked queue;
+		 * the caller frees the lock once it sees DLM_NOTQUEUED */
+		list_del(&lock->list);
+		spin_unlock(&res->spinlock);
+		return DLM_NOTQUEUED;
+	}
+	spin_unlock(&res->spinlock);
+	dlm_kick_thread(dlm, res);
+	return DLM_NORMAL;
+}
+
+/* must be already holding lockres->spinlock */
+dlm_status dlmlock_remote(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags)
+{
+	dlm_status status = DLM_DENIED;
+	
+	dlmprintk("type=%d\n", lock->type);
+
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		status = DLM_RECOVERING;
+		goto bail;
+	}
+
+	/* will exit this call with spinlock held */
+	__dlm_wait_on_lockres(res);
+	res->state |= DLM_LOCK_RES_IN_PROGRESS;
+	/* add lock to local (secondary) queue */
+	list_add_tail(&lock->list, &res->blocked);
+	spin_unlock(&res->spinlock);
+
+	/* spec seems to say that you will get DLM_NORMAL when the lock 
+	 * has been queued, meaning we need to wait for a reply here. */
+	status = dlm_send_remote_lock_request(dlm, res, lock, flags);
+	
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	if (status != DLM_NORMAL) {
+		/* remove from local queue if it failed */
+		list_del(&lock->list);
+	}
+bail:
+	spin_unlock(&res->spinlock);
+	return status;
+}
+
+
+/* must be already holding lockres->spinlock */
+dlm_status do_dlmconvert(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type)
+{
+	dlm_status status;
+
+	{
+		/* debug instrumentation: time the convert */
+		union {
+			u64 q;
+			u32 hilo[2];
+		} u1, u2;
+
+		rdtsc(u1.hilo[0], u1.hilo[1]);
+		if (res->owner == dlm->group_index)
+			status = dlmconvert_local(dlm, res, lock, flags, type);
+		else
+			status = dlmconvert_remote(dlm, res, lock, flags, type);
+		rdtsc(u2.hilo[0], u2.hilo[1]);
+		printk("dlmconvert took %llu cycles\n", u2.q - u1.q);
+	}
+	return status;
+}
+
+/* must be already holding lockres->spinlock */
+dlm_status dlmconvert_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type)
+{
+	dlm_status status = DLM_NORMAL;
+	struct list_head *iter;
+	dlm_lock *tmplock=NULL;
+	int remote_in_place = 0;
+
+	dlmprintk("type=%d, convert_type=%d, new convert_type=%d\n", lock->type, lock->convert_type, type);
+
+	spin_lock(&lock->spinlock);
+
+	/* already converting? */
+	if (lock->convert_type != LKM_IVMODE) {
+		printk("attempted to convert a lock with a lock conversion pending\n");
+		spin_unlock(&lock->spinlock);
+		spin_unlock(&res->spinlock);
+		return DLM_DENIED;
+	}
+
+	/* must be on grant queue to convert */
+	if (!dlm_lock_on_list(&res->granted, lock)) {
+		printk("attempted to convert a lock not on grant queue\n");
+		spin_unlock(&lock->spinlock);
+		spin_unlock(&res->spinlock);
+		return DLM_DENIED;
+	}
+	
+	
+	/* in-place downconvert? */
+	if (type <= lock->type)
+		goto grant;
+
+	/* upconvert from here on */
+	status = DLM_NORMAL;
+	list_for_each(iter, &res->granted) {
+		tmplock = list_entry(iter, dlm_lock, list);
+		if (tmplock == lock)
+			continue;
+		if (!dlm_lock_compatible(tmplock->type, type))
+			goto switch_queues;
+	}
+
+	list_for_each(iter, &res->converting) {
+		tmplock = list_entry(iter, dlm_lock, list);
+		if (!dlm_lock_compatible(tmplock->type, type))
+			goto switch_queues;
+		/* existing conversion requests take precedence */
+		if (!dlm_lock_compatible(tmplock->convert_type, type))
+			goto switch_queues;
+	}
+
+	/* fall thru to grant */
+
+grant:
+	if (lock->node != dlm->group_index) {
+		dlmprintk0("no in-place convert for nonlocal locks :(  see if this helps...\n");
+		remote_in_place = 1;
+		goto switch_queues;
+	}
+
+	/* immediately grant the new lock type */
+	//printk("doing in-place %sconvert from %d to %d\n", 
+	//       type > lock->type ? "up" : "down", lock->type, type);
+	lock->type = type;
+	status = DLM_NORMAL;
+
+	/* if it is a remote request, proxy 
+	 * handler will set the lksb status */
+	if (lock->node == dlm->group_index)
+		lock->lksb->status = DLM_NORMAL;
+
+	if (dlm_do_ast(dlm, res, lock) < 0)
+		printk("eek\n");
+
+	spin_unlock(&lock->spinlock);
+	spin_unlock(&res->spinlock);
+
+	/* if successful, kick the queue runner */
+	if (status == DLM_NORMAL) {
+		dlm_kick_thread(dlm, res);
+	}
+
+	return status;
+
+switch_queues:
+	if (flags & LKM_NOQUEUE) {
+		spin_unlock(&lock->spinlock);
+		spin_unlock(&res->spinlock);
+		return DLM_NOTQUEUED;
+	}
+
+	lock->convert_type = type;
+	list_del(&lock->list);
+	/* make sure the remote in-place convert gets handled right away */
+	if (remote_in_place)
+		list_add(&lock->list, &res->converting);
+	else
+		list_add_tail(&lock->list, &res->converting);
+	
+	spin_unlock(&lock->spinlock);
+	spin_unlock(&res->spinlock);
+	
+	dlm_kick_thread(dlm, res);
+	return status;
+}
+
+/* must be already holding lockres->spinlock */
+dlm_status dlmconvert_remote(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type)
+{
+	dlm_status status = DLM_DENIED;
+	
+	dlmprintk("type=%d, convert_type=%d\n", lock->type, lock->convert_type);
+	
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		status = DLM_RECOVERING;
+		goto bail;
+	}
+	/* will exit this call with spinlock held */
+	__dlm_wait_on_lockres(res);
+
+	res->state |= DLM_LOCK_RES_IN_PROGRESS;
+
+	/* move lock to local convert queue */
+	list_del(&lock->list);
+	list_add_tail(&lock->list, &res->converting);
+	if (lock->convert_type != LKM_IVMODE) {
+		printk("error! converting a remote lock that is already converting!\n");
+		/* TODO: return correct error */
+		BUG();
+	}
+	lock->convert_type = type;
+	spin_unlock(&res->spinlock);
+
+	/* spec seems to say that you will get DLM_NORMAL when the lock 
+	 * has been queued, meaning we need to wait for a reply here. */
+	status = dlm_send_remote_convert_request(dlm, res, lock, flags, type);
+	
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+
+	/* if it failed, move it back to granted queue */
+	if (status != DLM_NORMAL) {
+		list_del(&lock->list);
+		list_add_tail(&lock->list, &res->granted);
+		lock->convert_type = LKM_IVMODE;
+	}
+bail:
+	spin_unlock(&res->spinlock);
+	return status;
+}
+
+
+
+/* there seems to be no point in doing this async
+ * since (even for the remote case) there is really
+ * no work to queue up... so just do it and fire the
+ * unlockast by hand when done... */
+dlm_status dlmunlock(dlm_ctxt *dlm, dlm_lockstatus *lksb, int flags, dlm_astunlockfunc_t *unlockast, void *data)
+{
+	dlm_status status;
+	dlm_lock_resource *res;
+	dlm_lock *lock = NULL;
+	int call_ast = 0;
+
+	if (!lksb)
+		return DLM_BADARGS;
+
+	if (flags & ~(LKM_CANCEL | LKM_VALBLK | LKM_INVVALBLK))
+		return DLM_BADPARAM;
+
+	if ((flags & (LKM_VALBLK | LKM_CANCEL)) == (LKM_VALBLK | LKM_CANCEL)) {
+		printk("VALBLK given with CANCEL: ignoring VALBLK\n");
+		flags &= ~LKM_VALBLK;
+	}
+
+	if (!lksb->lockid || !lksb->lockid->lockres)
+		return DLM_BADPARAM;
+
+	lock = lksb->lockid;
+	res = lock->lockres;
+
+	status = dlmunlock_local(dlm, res, lock, lksb, flags, &call_ast);
+	if (call_ast)
+		(*unlockast)(data, lksb->status);
+	return status;
+}
+
+
+dlm_status dlmunlock_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, dlm_lockstatus *lksb, int flags, int *call_ast)
+{
+	dlm_status status;
+	int free_lock = 0, remote_ready = 0;
+	int local = 0, remove = 0, regrant = 0;
+
+	/* according to spec and opendlm code
+	 *  flags & LKM_CANCEL != 0: must be converting or blocked
+	 *  flags & LKM_CANCEL == 0: must be granted
+	 * iow, to unlock a converting lock, you must first LKM_CANCEL
+	 * the convert, then call the unlock again with no LKM_CANCEL
+	 */
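+	/* e.g. (illustrative): tearing down a lock that is mid-convert
+	 * therefore takes two calls:
+	 *
+	 *   dlmunlock(dlm, lksb, LKM_CANCEL, my_unlockast, my_data);
+	 *   dlmunlock(dlm, lksb, 0, my_unlockast, my_data);
+	 */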
+	*call_ast = 0;
+
+recheck:
+	spin_lock(&res->spinlock);
+	spin_lock(&lock->spinlock);
+
+	local = (res->owner == dlm->group_index);
+
+	if (flags & LKM_CANCEL) {
+		/* cancel request */
+		if (dlm_lock_on_list(&res->blocked, lock)) {
+			/* cancel this outright */
+			lksb->status = DLM_NORMAL;
+			status = DLM_NORMAL;
+			free_lock = 1;
+			*call_ast = 1;
+			remove = 1;
+			regrant = 0;
+		} else if (dlm_lock_on_list(&res->converting, lock)) {
+			/* cancel the request, put back on granted */
+			lksb->status = DLM_NORMAL;
+			status = DLM_NORMAL;
+			free_lock = 0;
+			*call_ast = 1;
+			remove = 1;
+			regrant = 1;
+		} else if (dlm_lock_on_list(&res->granted, lock)) {
+			/* too late, already granted.  DLM_CANCELGRANT */
+			lksb->status = DLM_CANCELGRANT;
+			status = DLM_NORMAL;
+			free_lock = 0;
+			*call_ast = 1;
+			remove = 0;
+			regrant = 0;
+		} else {
+			/* err. um. eek! */
+			printk("lock to cancel is not on any list!  bug!\n");
+			lksb->status = DLM_IVLOCKID;
+			status = DLM_IVLOCKID;
+			free_lock = 0;
+			*call_ast = 0;
+			remove = 0;
+			regrant = 0;
+		}
+	} else {
+		/* unlock request */
+		if (!dlm_lock_on_list(&res->granted, lock)) {
+			lksb->status = DLM_DENIED;
+			status = DLM_DENIED;
+			free_lock = 0;
+			*call_ast = 0;
+			remove = 0;
+			regrant = 0;
+		} else {
+			/* unlock granted lock */
+			lksb->status = DLM_NORMAL;
+			status = DLM_NORMAL;
+			free_lock = 1;
+			*call_ast = 1;
+			remove = 1;
+			regrant = 0;
+		}
+	}
+
+	if (!local) {
+		/* safe since nothing can change on this 
+		 * secondary queue without lockres lock */
+		spin_unlock(&lock->spinlock);
+
+		/* if there was an outstanding change on the
+		 * lockres, conditions could have changed */
+		if (!remote_ready &&
+		    res->state & DLM_LOCK_RES_IN_PROGRESS) {
+			__dlm_wait_on_lockres(res);
+			res->state |= DLM_LOCK_RES_IN_PROGRESS;
+			remote_ready = 1;
+			spin_unlock(&res->spinlock);
+			goto recheck;
+		}
+
+		if (res->state & DLM_LOCK_RES_RECOVERING) {
+			/* !!!!! */
+			spin_unlock(&res->spinlock);
+			return DLM_RECOVERING;
+		} else {
+			spin_unlock(&res->spinlock);
+			status = dlm_send_remote_unlock_request(dlm, res, lock, lksb, flags);
+			spin_lock(&res->spinlock);
+			res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+		}
+		spin_lock(&lock->spinlock);
+	}
+
+	if (remove)
+		list_del(&lock->list);
+	if (regrant)
+		list_add_tail(&lock->list, &res->granted);
+
+	spin_unlock(&lock->spinlock);
+	spin_unlock(&res->spinlock);
+
+	if (free_lock) {
+#warning this must change to proper refcounting
+		/* TODO: refcounting... tho for now this will work because 
+		 * the middle layer is keeping track of everything */
+		kfree(lock);
+		lksb->lockid = NULL;
+	}
+	return status;
+}
+	
+
+dlm_status dlm_send_remote_unlock_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, dlm_lockstatus *lksb, int flags)
+{
+	struct inode *inode = NULL;
+	dlm_unlock_lock unlock;
+	int tmpret;
+	dlm_status ret;
+	int status = 0;
+
+	dlmprintk0("\n");
+
+	memset(&unlock, 0, sizeof(unlock));
+	unlock.node_idx = dlm->group_index;
+	unlock.flags = flags;
+	unlock.cookie = lock->cookie;
+	unlock.namelen = res->lockname.len;
+	strncpy(unlock.name, res->lockname.name, unlock.namelen);
+
+	ret = DLM_NOLOCKMGR;
+	lksb->status = DLM_NOLOCKMGR;
+	inode = nm_get_group_node_by_index(dlm->group, res->owner);
+	if (inode) {
+		tmpret = net_send_message(DLM_UNLOCK_LOCK_MSG, dlm->key, &unlock, sizeof(unlock), inode, &status);
+		if (tmpret >= 0) {
+			// successfully sent and received
+			if (status == DLM_CANCELGRANT)
+				ret = DLM_NORMAL;
+			else
+				ret = status;
+			lksb->status = status;
+		} else {
+			printk("error occurred in net_send_message: %d\n", tmpret);
+			ret = dlm_err_to_dlm_status(tmpret);
+			lksb->status = ret;
+		}
+		iput(inode);
+	}
+
+	return ret;
+}
+
+int dlm_unlock_lock_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	dlm_unlock_lock *unlock = (dlm_unlock_lock *)msg->buf;
+	dlm_lock_resource *res;
+	struct list_head *iter, *queue;
+	dlm_lock *lock;
+	dlm_status status = DLM_NORMAL;
+	int found = 0;
+	dlm_lockstatus lksb;
+	int ignore;
+	struct qstr lockname = { .name=unlock->name, .len=unlock->namelen };
+
+	dlmprintk0("\n");
+
+	lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+	status = DLM_IVLOCKID;
+	res = dlm_lookup_lock(dlm, &lockname);
+	if (res) {
+		spin_lock(&res->spinlock);
+		queue = &res->granted;
+again:
+		list_for_each(iter, queue) {
+			lock = list_entry(iter, dlm_lock, list);
+			if (lock->cookie == unlock->cookie &&
+			    lock->node == unlock->node_idx) {
+				found = 1;
+				break;
+			}
+		}
+		if (!found && queue == &res->granted) {
+			queue = &res->converting;
+			goto again;
+		} else if (!found && queue == &res->converting) {
+			queue = &res->blocked;
+			goto again;
+		}
+		/* dlmunlock_local takes res->spinlock itself, so drop it
+		 * here first; calling in with it held would self-deadlock */
+		spin_unlock(&res->spinlock);
+		if (found)
+			/* unlockast only called on originating node */
+			status = dlmunlock_local(dlm, res, lock, &lksb, unlock->flags, &ignore);
+	}
+	if (!found)
+		printk("failed to find lock to unlock!  cookie=%llu\n", unlock->cookie);
+	else
+		status = lksb.status;
+
+	return status;
+}
+
+static dlm_ctxt * __dlm_lookup_domain(char *domain)
+{
+	dlm_ctxt *tmp = NULL;
+	struct list_head *iter;
+
+	list_for_each(iter, &dlm_domains) {
+		tmp = list_entry (iter, dlm_ctxt, list);
+		if (strncmp(tmp->name, domain, NM_MAX_NAME_LEN)==0)
+			break;
+		tmp = NULL;
+	}
+
+	return tmp;
+}
+
+dlm_ctxt * dlm_lookup_domain(char *domain)
+{
+	dlm_ctxt *tmp = NULL;
+	spin_lock(&dlm_domain_lock);
+	tmp = __dlm_lookup_domain(domain);
+	spin_unlock(&dlm_domain_lock);
+	return tmp;
+}
+
+dlm_lock_resource * __dlm_lookup_lock(dlm_ctxt *dlm, struct qstr *lockname)
+{
+	struct list_head *iter;
+	dlm_lock_resource *tmpres=NULL;
+	struct list_head *bucket;
+	
+	bucket = &(dlm->resources[lockname->hash & DLM_HASH_MASK]);
+
+	/* check for pre-existing lock */
+	list_for_each(iter, bucket) {
+		tmpres = list_entry(iter, dlm_lock_resource, list);
+		if (tmpres->lockname.len == lockname->len &&
+		    strncmp(tmpres->lockname.name, lockname->name, lockname->len) == 0)
+			break;
+		tmpres = NULL;
+	}
+	return tmpres;
+}
+
+dlm_lock_resource * dlm_lookup_lock(dlm_ctxt *dlm, struct qstr *lockname)
+{
+	dlm_lock_resource *res;
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lock(dlm, lockname);
+	spin_unlock(&dlm->spinlock);
+	return res;
+}
+
+
+
+/*
+ * dlm_register_domain: one-time setup per "domain"
+ */
+dlm_ctxt * dlm_register_domain(char *domain, char *group_name, u32 key)
+{
+	dlm_ctxt *tmp = NULL, *dlm = NULL;
+	struct inode *group = NULL;
+	int tmpret, i;
+	char *netbuf;
+
+	if (strlen(domain) > NM_MAX_NAME_LEN) {
+		printk("domain name length too long\n");
+		goto leave;
+	}
+
+	group = nm_get_group_by_name(group_name);
+	if (!group) {
+		printk("no nm group %s for domain %s!\n", group_name, domain);
+		goto leave;
+	}
+
+	/* 
+	 * TODO: should i do some type of dlm-group-join business here?
+	 * I need to have new nodes communicate with other dlm nodes to 
+	 * wait until their master lists are empty before allowing me to
+	 * join.  does this belong here?  or in hb?
+	 * seems like stuff that heartbeat shouldn't care about, cuz we
+	 * would actually be preventing a node that is "UP" from being 
+	 * part of the dlm group.
+	 */ 
+	dlm = dlm_lookup_domain(domain);
+	if (dlm) {
+		/* found a pre-existing domain; drop the extra group
+		 * reference taken above before handing it back */
+		iput(group);
+		group = NULL;
+		goto leave;
+	}
+
+	dlm = kmalloc(sizeof(dlm_ctxt), GFP_KERNEL);
+	if (dlm == NULL) {
+		printk("could not allocate dlm_ctxt\n");
+		goto leave;
+	}
+	memset(dlm, 0, sizeof(dlm_ctxt));
+	dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL);
+	if (dlm->name == NULL) {
+		kfree(dlm);
+		dlm = NULL;
+		printk("could not allocate dlm domain name\n");
+		goto leave;
+	}
+	dlm->net_buf = (char *) __get_free_page(GFP_KERNEL);
+	if (!dlm->net_buf) {
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+		printk("could not allocate dlm network temporary buffer\n");
+		goto leave;
+	}
+	dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL);
+	if (!dlm->resources) {
+		/* read dlm's pointers before dlm itself is freed */
+		free_page((unsigned long)dlm->net_buf);
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+		printk("could not allocate dlm hash\n");
+		goto leave;
+	}
+	memset(dlm->resources, 0, PAGE_SIZE);
+	
+	for (i=0; i<DLM_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&dlm->resources[i]);
+
+	strcpy(dlm->name, domain);
+	spin_lock_init(&dlm->spinlock);
+	INIT_LIST_HEAD(&dlm->list);
+	INIT_LIST_HEAD(&dlm->dirty_list);
+	INIT_LIST_HEAD(&dlm->reco.resources);
+	INIT_LIST_HEAD(&dlm->reco.received);
+	util_thread_info_init(&dlm->thread);
+	util_thread_info_init(&dlm->reco.thread);
+	init_rwsem(&dlm->recovery_sem);
+	dlm->group = group;
+	dlm->group_index = nm_this_node(group);
+	dlm->key = key;
+	dlm->reco.new_master = NM_INVALID_SLOT_NUM;
+	dlm->reco.dead_node = NM_INVALID_SLOT_NUM;
+	dlm->reco.sending_node = NM_INVALID_SLOT_NUM;
+	dlm->reco.next_seq = 0;
+
+	spin_lock(&dlm_domain_lock);
+	tmp = __dlm_lookup_domain(domain);
+	if (tmp) {
+		spin_unlock(&dlm_domain_lock);
+		/* found a pre-existing domain; free everything allocated
+		 * above, not just the name and the ctxt */
+		free_page((unsigned long)dlm->net_buf);
+		free_page((unsigned long)dlm->resources);
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+		goto leave;
+	}
+
+	/* add the new domain */
+	list_add_tail(&dlm->list, &dlm_domains);
+	spin_unlock(&dlm_domain_lock);
+
+	tmpret = hb_register_callback(HB_NODE_DOWN_CB, dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
+	if (tmpret)
+		goto error;
+	tmpret = hb_register_callback(HB_NODE_UP_CB, dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
+	if (tmpret)
+		goto error;
+
+	/* TODO: need to use hb_fill_node_map to fill a temporary votemap
+	 * then communicate with each of these nodes that I want to come up
+	 * FOR THIS DLM.  there may be many nodes in this group heartbeating
+	 * but they may not care about this particular dlm instance.  once
+	 * everyone has come back with a response that i have been added or 
+	 * that they are not a member I can put together the REAL node map
+	 * for this dlm in dlm->node_map */
+	/* TODO: I guess we can fill this here as a superset of possible nodes
+	 * so that the hb_callbacks above have something to work on in the meantime
+	 * then trim out the nodes that are not part of this dlm once we know */
+	/* TODO: I may need to register a special net handler on insmod of dlm.o
+	 * with a key of 0 so that I can respond to requests even if I am not
+	 * part of a dlm group.  this would still leave a gap in time between the
+	 * start of heartbeating and the insmod dlm.o, unless I change the module 
+	 * loading stuff in clusterbo to include dlm.o (which would work fine) */
+#warning WRONG WRONG WRONG
+	tmpret = hb_fill_node_map(group, dlm->node_map, NM_MAX_NODES);
+	if (tmpret)
+		goto error;
+
+
+#if 0
+	tmpret = net_register_handler("reco-request", 
+		      DLM_NET_RECOVERY_REQUEST_MSG_TYPE, 
+		      key, sizeof(dlm_reco_request),
+		      dlm_recovery_request_handler, dlm);
+	if (tmpret)
+		goto error;
+	tmpret = net_register_handler("reco-lock-arr-req", 
+		      DLM_NET_RECOVERY_LOCK_ARR_REQ_MSG_TYPE, 
+		      key, sizeof(dlm_reco_lock_arr_req),
+		      dlm_recovery_lock_arr_req_handler, dlm);
+	if (tmpret)
+		goto error;
+	tmpret = net_register_handler("reco-response", 
+		      DLM_NET_RECOVERY_RESPONSE_MSG_TYPE, 
+		      key, sizeof(dlm_reco_response),
+		      dlm_recovery_response_handler, dlm);
+	if (tmpret)
+		goto error;
+#endif
+
+	netbuf = dlm->net_buf;
+	tmpret = net_register_handler(DLM_MASTER_REQUEST_RESP_MSG, key, 0, 
+				      sizeof(dlm_master_request_resp), 
+				      dlm_master_request_resp_handler,
+				      dlm, netbuf);
+	if (tmpret)
+		goto error;
+
+	netbuf += L1_CACHE_ALIGN(sizeof(dlm_master_request_resp));
+
+	tmpret = net_register_handler(DLM_MASTER_REQUEST_MSG, key, 0, 
+				      sizeof(dlm_master_request), 
+				      dlm_master_request_handler,
+				      dlm, netbuf);
+
+	if (tmpret)
+		goto error;
+	netbuf += L1_CACHE_ALIGN(sizeof(dlm_master_request));
+
+	tmpret = net_register_handler(DLM_ASSERT_MASTER_MSG, key, 0, 
+				      sizeof(dlm_assert_master), 
+				      dlm_assert_master_handler,
+				      dlm, netbuf);
+	if (tmpret)
+		goto error;
+	netbuf += L1_CACHE_ALIGN(sizeof(dlm_assert_master));
+	tmpret = net_register_handler(DLM_CREATE_LOCK_MSG, key, 0, 
+				      sizeof(dlm_create_lock), 
+				      dlm_create_lock_handler,
+				      dlm, netbuf);
+	if (tmpret)
+		goto error;
+	netbuf += L1_CACHE_ALIGN(sizeof(dlm_create_lock));
+	tmpret = net_register_handler(DLM_CONVERT_LOCK_MSG, key, 0, 
+				      sizeof(dlm_convert_lock), 
+				      dlm_convert_lock_handler,
+				      dlm, netbuf);
+	if (tmpret)
+		goto error;
+	netbuf += L1_CACHE_ALIGN(sizeof(dlm_convert_lock));
+
+	tmpret = net_register_handler(DLM_UNLOCK_LOCK_MSG, key, 0,
+				      sizeof(dlm_unlock_lock),
+				      dlm_unlock_lock_handler,
+				      dlm, netbuf);
+	if (tmpret)
+		goto error;
+	netbuf += L1_CACHE_ALIGN(sizeof(dlm_unlock_lock));
+				
+	tmpret = net_register_handler(DLM_PROXY_AST_MSG, key, 0, 
+				      sizeof(dlm_proxy_ast), 
+				      dlm_proxy_ast_handler,
+				      dlm, netbuf);
+	if (tmpret)
+		goto error;
+	netbuf += L1_CACHE_ALIGN(sizeof(dlm_proxy_ast));
+// printk("netbuf=%p net_buf=%p diff=%d\n", netbuf, dlm->net_buf, ((char *)netbuf - (char *)dlm->net_buf));   // currently 768
+	
+	tmpret = dlm_launch_thread(dlm);
+	if (tmpret == 0)
+		goto leave;
+
+error:	
+	hb_unregister_callback(HB_NODE_UP_CB, dlm_hb_node_up_cb, dlm);
+	hb_unregister_callback(HB_NODE_DOWN_CB, dlm_hb_node_down_cb, dlm);
+	spin_lock(&dlm_domain_lock);
+	list_del(&dlm->list);
+	spin_unlock(&dlm_domain_lock);
+	free_page((unsigned long)dlm->net_buf);
+	free_page((unsigned long)dlm->resources);
+	kfree(dlm->name);
+	kfree(dlm);
+	dlm = NULL;
+
+leave:
+	if (!dlm && group)
+	       	iput(group);
+	return dlm;
+}
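+
+/*
+ * Illustrative usage sketch only; the domain name, group name and key
+ * below are made up.
+ */
+#if 0
+	dlm_ctxt *dlm;
+
+	dlm = dlm_register_domain("mydomain", "mygroup", 0x11223344);
+	if (!dlm)
+		return -ENOMEM;	/* NULL covers alloc failure, bad name, etc. */
+#endif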
+
+void dlm_unregister_domain(dlm_ctxt *dlm)
+{
+	// fill me in please
+}
+
+void dlm_init_lockres(dlm_lock_resource *res, struct qstr *lockname)
+{
+	memset(res, 0, sizeof(dlm_lock_resource));
+	res->lockname.name = lockname->name;
+	res->lockname.len = lockname->len;
+	res->lockname.hash = lockname->hash;
+	init_waitqueue_head(&res->wq);
+	spin_lock_init(&res->spinlock);
+	INIT_LIST_HEAD(&res->list);
+	INIT_LIST_HEAD(&res->granted);
+	INIT_LIST_HEAD(&res->converting);
+	INIT_LIST_HEAD(&res->blocked);
+	INIT_LIST_HEAD(&res->dirty);
+	INIT_LIST_HEAD(&res->recovering);
+
+	res->owner = DLM_LOCK_RES_OWNER_UNKNOWN;
+	res->state |= DLM_LOCK_RES_IN_PROGRESS;
+}
+
+/* takes res->spinlock itself and exits holding it; may drop and re-take it while waiting */
+void dlm_wait_on_lockres(dlm_lock_resource *res)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue(&res->wq, &wait);
+repeat:
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
+		spin_unlock(&res->spinlock);
+		schedule();
+		goto repeat;
+	}
+	remove_wait_queue(&res->wq, &wait);
+	current->state = TASK_RUNNING;
+}
+
+/* caller must hold res->spinlock on entry; exits holding it, but may drop it while waiting */
+void __dlm_wait_on_lockres(dlm_lock_resource *res)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue(&res->wq, &wait);
+repeat:
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
+		spin_unlock(&res->spinlock);
+		schedule();
+		spin_lock(&res->spinlock);
+		goto repeat;
+	}
+	remove_wait_queue(&res->wq, &wait);
+	current->state = TASK_RUNNING;
+}
+
+int dlm_do_ast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock)
+{
+	dlm_astlockfunc_t *fn = lock->ast;
+
+	dlmprintk0("\n");
+
+	if (lock->node != dlm->group_index) {
+		return dlm_send_proxy_ast(dlm, res, lock, DLM_AST, 0);
+	}
+	if (!fn) {
+		printk("eek! lock has no ast %*s!  cookie=%llu\n", 
+		       res->lockname.len, res->lockname.name, lock->cookie);
+		return -EINVAL;
+	}
+	(*fn)(lock->astdata);
+	return 0;
+}
+
+
+int dlm_do_bast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int blocked_type)
+{
+	dlm_bastlockfunc_t *fn = lock->bast;
+	
+	dlmprintk0("\n");
+
+	if (lock->node != dlm->group_index) {
+		return dlm_send_proxy_ast(dlm, res, lock, DLM_BAST, blocked_type);
+	}
+
+	if (!fn) {
+		printk("eek! lock has no bast %*s!  cookie=%llu\n", 
+		       res->lockname.len, res->lockname.name, lock->cookie);
+		return -EINVAL;
+	}
+	(*fn)(lock->astdata, blocked_type);
+	return 0;
+}
+
+int dlm_send_proxy_ast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int type, int blocked_type)
+{
+	int ret = 0;
+	dlm_proxy_ast past;
+	struct inode *inode = NULL;
+	
+	dlmprintk("to=%u, type=%d, blocked_type=%d\n", lock->node, type, blocked_type);
+
+	past.node_idx = dlm->group_index;
+	past.type = type;
+	past.blocked_type = blocked_type;
+	past.namelen = res->lockname.len;
+	strncpy(past.name, res->lockname.name, past.namelen);
+	past.cookie = lock->cookie;
+
+	ret = -EINVAL;
+	inode = nm_get_group_node_by_index(dlm->group, lock->node);
+	if (inode) {
+		ret = net_send_message(DLM_PROXY_AST_MSG, dlm->key, &past, sizeof(past), inode, NULL);
+		iput(inode);
+	}
+	if (ret < 0) {
+		printk("(%d) dlm_send_proxy_ast: returning %d\n", current->pid, ret);
+	}
+	return ret;
+}
+
+int dlm_proxy_ast_handler(net_msg *msg, u32 len, void *data)
+{
+	int status;
+	dlm_ctxt *dlm = data;
+	dlm_lock_resource *res;
+	dlm_lock *lock = NULL;
+	dlm_proxy_ast *past = (dlm_proxy_ast *) msg->buf;
+	struct qstr lockname = { .name=past->name, .len=past->namelen };
+	struct list_head *iter, *head=NULL;
+	u64 cookie = past->cookie;
+
+	lockname.hash = full_name_hash(lockname.name, lockname.len);
+	
+	dlmprintk("type=%d, blocked_type=%d\n", past->type, past->blocked_type);
+
+	if (past->type != DLM_AST && 
+	    past->type != DLM_BAST) {
+		printk("Eeeek unknown ast type! %d, cookie=%llu, name=%*s\n", 
+		       past->type, cookie, lockname.len, lockname.name);
+		return 0;
+	}
+
+	res = dlm_lookup_lock(dlm, &lockname);
+	if (!res) {
+		printk("eek! got %sast for unknown lockres!  cookie=%llu, name=%*s, namelen=%d\n", 
+		       past->type == DLM_AST ? "" : "b", cookie, lockname.len, lockname.name, lockname.len);
+		return 0;
+	}
+
+	if (!dlm_is_recovery_lock(past->name, past->namelen))
+		down_read(&dlm->recovery_sem);
+	spin_lock(&res->spinlock);
+
+	/* try convert queue for both ast/bast */
+	head = &res->converting;
+	lock = NULL;
+	list_for_each(iter, head) {
+		lock = list_entry (iter, dlm_lock, list);
+		if (lock->cookie == cookie)
+			goto do_ast;
+	}
+
+	/* if not on convert, try blocked for ast, granted for bast */
+	if (past->type == DLM_AST)
+		head = &res->blocked;
+	else 
+		head = &res->granted;
+
+	list_for_each(iter, head) {
+		lock = list_entry (iter, dlm_lock, list);
+		if (lock->cookie == cookie)
+			goto do_ast;
+	}
+
+	printk("eek! got %sast for unknown lock!  cookie=%llu, name=%*s, namelen=%d\n", 
+	       past->type == DLM_AST ? "" : "b", cookie, lockname.len, lockname.name, lockname.len);
+	spin_unlock(&res->spinlock);
+	if (!dlm_is_recovery_lock(past->name, past->namelen))
+		up_read(&dlm->recovery_sem);
+	return 0;
+		
+do_ast:
+	if (past->type == DLM_AST) {
+		list_del(&lock->list);
+		list_add_tail(&lock->list, &res->granted);
+		dlmprintk("ast: adding to granted list... type=%d, convert_type=%d\n",
+			  lock->type, lock->convert_type);
+		if (lock->convert_type != LKM_IVMODE) {
+			lock->type = lock->convert_type;
+			lock->convert_type = LKM_IVMODE;
+		} else {
+			// should already be there....
+		}
+		
+		lock->lksb->status = DLM_NORMAL;
+
+		status = dlm_do_ast(dlm, res, lock);
+		dlmprintk("ast done: now... type=%d, convert_type=%d\n",
+			  lock->type, lock->convert_type);
+	} else {
+		dlmprintk("bast: before... type=%d, convert_type=%d\n",
+			  lock->type, lock->convert_type);
+		status = dlm_do_bast(dlm, res, lock, past->blocked_type);
+		dlmprintk("bast: after... type=%d, convert_type=%d\n",
+			  lock->type, lock->convert_type);
+	}
+
+	if (status < 0)
+		printk("eeek: ast/bast returned %d\n", status);
+
+	spin_unlock(&res->spinlock);
+	if (!dlm_is_recovery_lock(past->name, past->namelen))
+		up_read(&dlm->recovery_sem);
+	return 0;
+}
+
+/*
+ * message handlers should just return status.
+ * this will get sent back to the calling node if it
+ * requested a status return.
+ */
+
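+/* e.g. (illustrative) the minimal shape of a handler as registered in
+ * dlm_register_domain():
+ *
+ *	int my_handler(net_msg *msg, u32 len, void *data)
+ *	{
+ *		dlm_ctxt *dlm = data;
+ *		return DLM_NORMAL;	// relayed back as the status
+ *	}
+ */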
+
+/* remote lock creation */
+dlm_status dlm_send_remote_lock_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags)
+{
+	struct inode *inode = NULL;
+	dlm_create_lock create;
+	int tmpret, status = 0;
+	dlm_status ret;
+
+	dlmprintk0("\n");
+
+	memset(&create, 0, sizeof(create));
+	create.node_idx = dlm->group_index;
+	create.requested_type = lock->type;
+	create.cookie = lock->cookie;
+	create.namelen = res->lockname.len;
+	strncpy(create.name, res->lockname.name, create.namelen);
+
+	ret = DLM_NOLOCKMGR;
+	inode = nm_get_group_node_by_index(dlm->group, res->owner);
+	if (inode) {
+		tmpret = net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create, sizeof(create), inode, &status);
+		if (tmpret >= 0) {
+			// successfully sent and received
+			ret = status;  // this is already a dlm_status
+		} else {
+			printk("error occurred in net_send_message: %d\n", tmpret);
+			ret = dlm_err_to_dlm_status(tmpret);
+		}
+		iput(inode);
+	}
+
+	return ret;
+}
+
+int dlm_create_lock_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	dlm_create_lock *create = (dlm_create_lock *)msg->buf;
+	dlm_lock_resource *res;
+	dlm_lock *newlock;
+	dlm_status status = DLM_NORMAL;
+	struct qstr lockname = { .name=create->name, .len=create->namelen };
+	
+	dlmprintk0("\n");
+
+	lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+	newlock = kmalloc(sizeof(dlm_lock), GFP_KERNEL);
+	if (!newlock)
+		return DLM_SYSERR;
+	
+	memset(newlock, 0, sizeof(dlm_lock));
+	INIT_LIST_HEAD(&newlock->list);
+	INIT_LIST_HEAD(&newlock->ast_list);
+	spin_lock_init(&newlock->spinlock);
+	newlock->type = create->requested_type;
+	newlock->convert_type = LKM_IVMODE;
+	newlock->highest_blocked = LKM_IVMODE;
+	newlock->node = create->node_idx;
+	newlock->ast = NULL;
+	newlock->bast = NULL;
+	newlock->astdata = NULL;
+	newlock->cookie = create->cookie;
+
+	status = DLM_IVLOCKID;
+	res = dlm_lookup_lock(dlm, &lockname);
+	if (res) {
+		spin_lock(&res->spinlock);
+		newlock->lockres = res;
+		/* dlmlock_local is entered holding res->spinlock and
+		 * drops it itself, so no unlock here */
+		status = dlmlock_local(dlm, res, newlock, 0);
+	} else {
+		/* never queued anywhere -- free it to avoid a leak */
+		kfree(newlock);
+	}
+
+	return status;
+}
+
+/* remote lock conversion */
+dlm_status dlm_send_remote_convert_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type)
+{
+	struct inode *inode = NULL;
+	dlm_convert_lock convert;
+	int tmpret;
+	dlm_status ret;
+	int status = 0;
+
+	dlmprintk0("\n");
+
+	memset(&convert, 0, sizeof(convert));
+	convert.node_idx = dlm->group_index;
+	convert.requested_type = type;
+	convert.cookie = lock->cookie;
+	convert.namelen = res->lockname.len;
+	strncpy(convert.name, res->lockname.name, convert.namelen);
+
+	ret = DLM_NOLOCKMGR;
+	inode = nm_get_group_node_by_index(dlm->group, res->owner);
+	if (inode) {
+		tmpret = net_send_message(DLM_CONVERT_LOCK_MSG, dlm->key, &convert, sizeof(convert), inode, &status);
+		if (tmpret >= 0) {
+			// successfully sent and received
+			ret = status;  // this is already a dlm_status
+		} else {
+			printk("error occurred in net_send_message: %d\n", tmpret);
+			ret = dlm_err_to_dlm_status(tmpret);
+		}
+		iput(inode);
+	}
+
+	return ret;
+}
+
+int dlm_convert_lock_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	dlm_convert_lock *convert = (dlm_convert_lock *)msg->buf;
+	dlm_lock_resource *res;
+	struct list_head *iter;
+	dlm_lock *lock;
+	dlm_status status = DLM_NORMAL;
+	int found = 0;
+	struct qstr lockname = { .name=convert->name, .len=convert->namelen };
+	union {
+		u64 q;
+		u32 hilo[2];
+	} u1, u2, u3, u4, u5, u6, u7;
+
+
+	dlmprintk0("\n");
+	rdtsc(u1.hilo[0], u1.hilo[1]);
+
+	lockname.hash = full_name_hash(lockname.name, lockname.len);
+	rdtsc(u2.hilo[0], u2.hilo[1]);
+
+	status = DLM_IVLOCKID;
+	res = dlm_lookup_lock(dlm, &lockname);
+	rdtsc(u3.hilo[0], u3.hilo[1]);
+	if (res) {
+		spin_lock(&res->spinlock);
+		rdtsc(u4.hilo[0], u4.hilo[1]);
+		list_for_each(iter, &res->granted) {
+			lock = list_entry(iter, dlm_lock, list);
+			if (lock->cookie == convert->cookie &&
+			    lock->node == convert->node_idx) {
+				found = 1;
+				rdtsc(u5.hilo[0], u5.hilo[1]);
+				status = dlmconvert_local(dlm, res, lock, 0, convert->requested_type);
+				rdtsc(u6.hilo[0], u6.hilo[1]);
+				break;
+			}
+		}
+		/* dlmconvert_local drops res->spinlock on every path, so
+		 * only unlock here if the lock was never handed to it */
+		if (!found)
+			spin_unlock(&res->spinlock);
+	}
+	if (!found)
+		printk("failed to find lock to convert on grant queue!  cookie=%llu\n", convert->cookie);
+
+	rdtsc(u7.hilo[0], u7.hilo[1]);
+	dlmprintk("1-2:%llu 2-3:%llu 3-4:%llu 4-5:%llu 5-6:%llu 6-7:%llu\n",
+		  u2.q-u1.q, u3.q-u2.q, u4.q-u3.q, u5.q-u4.q, u6.q-u5.q, u7.q-u6.q);
+	return status;
+}
+
+void dlm_dump_everything(void)
+{
+	dlm_ctxt *dlm;
+	struct list_head *iter;
+
+	printk("dumping ALL dlm state for node %s\n", system_utsname.nodename);
+	spin_lock(&dlm_domain_lock);
+	list_for_each(iter, &dlm_domains) {
+		dlm = list_entry (iter, dlm_ctxt, list);
+		dlm_dump_dlm(dlm);
+	}
+	spin_unlock(&dlm_domain_lock);
+}
+
+void dlm_dump_dlm(dlm_ctxt *dlm)
+{
+	dlm_lock_resource *res;
+	dlm_lock *lock;
+	struct list_head *iter, *iter2;
+	struct list_head *bucket;
+	int i;
+
+	printk("dlm_ctxt: %s, group=%u, key=%u\n", dlm->name, dlm->group_index, dlm->key);
+	printk("some bug here... should not have to check for this...\n");
+	if (!dlm || !dlm->name) {
+		printk("wtf... dlm=%p\n", dlm);
+		return;
+	}
+		
+	spin_lock(&dlm->spinlock);
+	for (i=0; i<DLM_HASH_SIZE; i++) {
+		bucket = &(dlm->resources[i]);
+		list_for_each(iter, bucket) {
+			res = list_entry(iter, dlm_lock_resource, list);
+			printk("lockres: %*s, owner=%u, state=%u\n", res->lockname.len, res->lockname.name,
+			       res->owner, res->state);
+			spin_lock(&res->spinlock);
+			printk("  granted queue: \n");
+			list_for_each(iter2, &res->granted) {
+				lock = list_entry(iter2, dlm_lock, list);
+				spin_lock(&lock->spinlock);
+				printk("    type=%d, conv=%d, node=%u, cookie=%llu\n", 
+				       lock->type, lock->convert_type, lock->node, lock->cookie);
+				spin_unlock(&lock->spinlock);
+			}
+			printk("  converting queue: \n");
+			list_for_each(iter2, &res->converting) {
+				lock = list_entry(iter2, dlm_lock, list);
+				spin_lock(&lock->spinlock);
+				printk("    type=%d, conv=%d, node=%u, cookie=%llu\n", 
+				       lock->type, lock->convert_type, lock->node, lock->cookie);
+				spin_unlock(&lock->spinlock);
+			}
+			printk("  blocked queue: \n");
+			list_for_each(iter2, &res->blocked) {
+				lock = list_entry(iter2, dlm_lock, list);
+				spin_lock(&lock->spinlock);
+				printk("    type=%d, conv=%d, node=%u, cookie=%llu\n", 
+				       lock->type, lock->convert_type, lock->node, lock->cookie);
+				spin_unlock(&lock->spinlock);
+			}
+			spin_unlock(&res->spinlock);
+		}
+	}
+	spin_unlock(&dlm->spinlock);
+}
+
+module_init (dlm_driver_entry);
+module_exit (dlm_driver_exit);

Added: branches/dlm-glue/cluster/dlmmod.h
===================================================================
--- branches/dlm-glue/cluster/dlmmod.h	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/dlmmod.h	2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,467 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmmod.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_DLMMOD_H
+#define CLUSTER_DLMMOD_H
+
+
+
+#if 0
+#define dlmprintk(x, arg...)
+#define dlmprintk0(x)
+#else
+#define dlmprintk(x, arg...)    printk("(%d)(%s:%d) " x, current->pid, __FUNCTION__, __LINE__, ##arg)
+#define dlmprintk0(x)           printk("(%d)(%s:%d) " x, current->pid, __FUNCTION__, __LINE__)
+#endif
+
+
+
+
+#define DLM_HB_NODE_DOWN_PRI     (0xf000000)
+#define DLM_HB_NODE_UP_PRI       (0x8000000)  
+
+#define DLM_LVB_LEN  64
+#define DLM_LOCKID_NAME_MAX    32
+
+#define DLM_DOMAIN_NAME_MAX_LEN    255
+#define DLM_LOCK_RES_OWNER_UNKNOWN     NM_MAX_NODES
+#define DLM_THREAD_SHUFFLE_INTERVAL    5     // flush everything every 5 passes
+#define DLM_THREAD_MS                  200   // flush at least every 200 ms
+
+#define DLM_HASH_BITS     7
+#define DLM_HASH_SIZE     (1 << DLM_HASH_BITS)
+#define DLM_HASH_MASK     (DLM_HASH_SIZE - 1)
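+
+/* e.g. with DLM_HASH_BITS == 7 there are 128 buckets, and a name whose
+ * full_name_hash() is 0x12345678 lands in bucket
+ * 0x12345678 & DLM_HASH_MASK == 0x78 */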
+
+typedef enum _dlm_ast_type {
+	DLM_AST = 0,
+	DLM_BAST,
+	DLM_ASTUNLOCK
+} dlm_ast_type;
+
+
+#define LKM_IVMODE      (-1)            /* invalid mode */
+#define LKM_NLMODE      0               /* null lock */
+#define LKM_CRMODE      1               /* concurrent read */    /* unsupported */
+#define LKM_CWMODE      2               /* concurrent write */    /* unsupported */
+#define LKM_PRMODE      3               /* protected read */
+#define LKM_PWMODE      4               /* protected write */    /* unsupported */
+#define LKM_EXMODE      5               /* exclusive */
+#define LKM_MAXMODE     5
+#define LKM_MODEMASK    0xff
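+
+/* e.g. two LKM_PRMODE holders can share a resource, while LKM_EXMODE
+ * is compatible only with LKM_NLMODE holders (see the
+ * dlm_lock_compatible() checks in dlmmod.c) */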
+
+
+/* TODO: Flags which OCFS2 will require: 
+ *       - LKM_LOCAL
+ *       - LKM_VALBLK
+ *       - LKM_NOQUEUE
+ *       - LKM_CONVERT
+ *       - LKM_CANCEL   */
+#define LKM_ORPHAN      0x10            /* this lock is orphanable */    /* unsupported */
+#define LKM_PARENTABLE  0x20            /* this lock was orphaned */    /* unsupported */
+#define LKM_BLOCK       0x40            /* blocking lock request */    /* unsupported */
+#define LKM_LOCAL       0x80            /* local lock request */    
+#define LKM_VALBLK      0x100           /* lock value block request */
+#define LKM_NOQUEUE     0x200           /* non blocking request */
+#define LKM_CONVERT     0x400           /* conversion request */
+#define LKM_NODLCKWT    0x800           /* this lock wont deadlock */    /* unsupported */
+#define LKM_UNLOCK      0x1000          /* deallocate this lock */
+#define LKM_CANCEL      0x2000          /* cancel conversion request */
+#define LKM_DEQALL      0x4000          /* remove all locks held by proc */    /* unsupported */
+#define LKM_INVVALBLK   0x8000          /* invalidate lock value block */
+#define LKM_SYNCSTS     0x10000         /* return synchronous status if poss */    /* unsupported */
+#define LKM_TIMEOUT     0x20000         /* lock request contains timeout */    /* unsupported */
+#define LKM_SNGLDLCK    0x40000         /* request can self-deadlock */    /* unsupported */
+#define LKM_FINDLOCAL   0x80000         /* find local lock request */    /* unsupported */
+#define LKM_PROC_OWNED  0x100000        /* owned by process, not group */    /* unsupported */
+#define LKM_XID         0x200000        /* use transaction id for deadlock */    /* unsupported */
+#define LKM_XID_CONFLICT 0x400000       /* do not allow lock inheritance */    /* unsupported */
+#define LKM_FORCE       0x800000        /* force unlock flag */
+#define LKM_REVVALBLK   0x1000000       /* temporary solution: re-validate lock value block */    /* unsupported */
+
+#define LKM_RECOVERY    0x80000000      /* extension: flag for recovery lock, used to avoid recovery rwsem */
+
+#define LKM_VALID_FLAGS (LKM_VALBLK | LKM_CONVERT | LKM_UNLOCK | \
+			 LKM_CANCEL | LKM_INVVALBLK | LKM_FORCE | \
+			 LKM_RECOVERY | LKM_LOCAL | LKM_NOQUEUE)
+
+#define DLM_RECOVERY_LOCK_NAME       "$RECOVERY"
+#define DLM_RECOVERY_LOCK_NAME_LEN   9
+
+static inline int dlm_is_recovery_lock(char *lock_name, int name_len)
+{
+	if (name_len == DLM_RECOVERY_LOCK_NAME_LEN &&
+	    strncmp(lock_name, DLM_RECOVERY_LOCK_NAME, DLM_RECOVERY_LOCK_NAME_LEN)==0)
+		return 1;
+	return 0;
+}
+
+typedef enum _dlm_status {
+	DLM_NORMAL,               /* request in progress */
+	DLM_GRANTED,              /* request granted */
+	DLM_DENIED,               /* request denied */
+	DLM_DENIED_NOLOCKS,       /* request denied, out of system resources */
+	DLM_WORKING,              /* async request in progress */
+	DLM_BLOCKED,              /* lock request blocked */
+	DLM_BLOCKED_ORPHAN,       /* lock request blocked by an orphan lock */
+	DLM_DENIED_GRACE_PERIOD,  /* topological change in progress */
+	DLM_SYSERR,               /* system error */
+	DLM_NOSUPPORT,            /* unsupported */
+	DLM_CANCELGRANT,          /* can't cancel convert: already granted */
+	DLM_IVLOCKID,             /* bad lockid */
+	DLM_SYNC,                 /* synchronous request granted */
+	DLM_BADTYPE,              /* bad resource type */
+	DLM_BADRESOURCE,          /* bad resource handle */
+	DLM_MAXHANDLES,           /* no more resource handles */
+	DLM_NOCLINFO,             /* can't contact cluster manager */
+	DLM_NOLOCKMGR,            /* can't contact lock manager */
+	DLM_NOPURGED,             /* can't contact purge daemon */
+	DLM_BADARGS,              /* bad api args */
+	DLM_VOID,                 /* no status */
+	DLM_NOTQUEUED,            /* NOQUEUE was specified and request failed */
+	DLM_IVBUFLEN,             /* invalid resource name length */
+	DLM_CVTUNGRANT,           /* attempted to convert ungranted lock */
+	DLM_BADPARAM,             /* invalid lock mode specified */
+	DLM_VALNOTVALID,          /* value block has been invalidated */
+	DLM_REJECTED,             /* request rejected, unrecognized client */
+	DLM_ABORT,                /* blocked lock request cancelled */
+	DLM_CANCEL,               /* conversion request cancelled */
+	DLM_IVRESHANDLE,          /* invalid resource handle */
+	DLM_DEADLOCK,             /* deadlock recovery refused this request */
+	DLM_DENIED_NOASTS,        /* failed to allocate AST */
+	DLM_FORWARD,              /* request must wait for primary's response */
+	DLM_TIMEOUT,              /* timeout value for lock has expired */
+	DLM_IVGROUPID,            /* invalid group specification */
+	DLM_VERS_CONFLICT,        /* version conflicts prevent request handling */
+	DLM_BAD_DEVICE_PATH,      /* Locks device does not exist or path wrong */
+	DLM_NO_DEVICE_PERMISSION, /* Client has insufficient perms for device */
+	DLM_NO_CONTROL_DEVICE,    /* Cannot set options on opened device */
+	DLM_MAXSTATS,             /* upper limit for return code validation */
+	
+	DLM_RECOVERING            /* our lame addition to allow caller to fail a lock 
+				     request if it is being recovered */
+} dlm_status;
+
+
+
+typedef struct _dlm_recovery_ctxt
+{
+	struct list_head resources;
+	struct list_head received;   // list of dlm_reco_lock_infos received from other nodes during recovery
+	u16 new_master;
+	u16 dead_node;
+	u16 sending_node;
+	u32 next_seq;
+	util_thread_info thread;
+} dlm_recovery_ctxt;
+
+
+struct _dlm_ctxt
+{
+	struct list_head list;
+	struct list_head *resources;
+	struct list_head dirty_list;
+	spinlock_t spinlock;
+	struct rw_semaphore recovery_sem;
+	char *name;
+	char *net_buf;
+	util_thread_info thread;
+	struct inode *group;
+	u32 key;
+	u16 group_index;
+	u32 node_map[8];
+	u32 recovery_map[8];
+	dlm_recovery_ctxt reco;
+};
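+
+/* node_map and recovery_map above are bitmaps: 8 x u32 = 256 bits, one
+ * per node slot (NM_MAX_NODES is 255), manipulated with set_bit(),
+ * test_bit() and clear_bit() as in the recovery code below. */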
+
+#define DLM_LOCK_RES_UNINITED             0x00000001
+#define DLM_LOCK_RES_RECOVERING           0x00000002
+#define DLM_LOCK_RES_READY                0x00000004
+#define DLM_LOCK_RES_DIRTY                0x00000008
+#define DLM_LOCK_RES_IN_PROGRESS          0x00000010 
+
+typedef struct _dlm_lock_resource
+{
+	struct list_head list;
+	struct list_head granted;
+	struct list_head converting; 
+	struct list_head blocked;
+	struct list_head dirty;
+	struct list_head recovering; // dlm_recovery_ctxt.resources list
+	spinlock_t spinlock;
+	wait_queue_head_t wq;
+	u16 owner;              // node which owns the lock resource, or unknown
+	u16 state;
+	struct qstr lockname;
+	char lvb[DLM_LVB_LEN];
+} dlm_lock_resource;
+
+typedef void (dlm_astlockfunc_t)(void *);
+typedef void (dlm_bastlockfunc_t)(void *, int);
+typedef void (dlm_astunlockfunc_t)(void *, dlm_status);
+
+typedef struct _dlm_lockstatus dlm_lockstatus;
+
+typedef struct _dlm_lock
+{
+	struct list_head list;
+	struct list_head ast_list;
+	dlm_lock_resource *lockres;
+	spinlock_t spinlock;
+
+	s8 type;
+	s8 convert_type;
+	s8 highest_blocked;
+	s8 reserved1;
+	u16 node;
+	u16 reserved2;
+
+	dlm_astlockfunc_t *ast;     // ast and bast must be callable while holding a spinlock!
+	dlm_bastlockfunc_t *bast;
+	void *astdata;
+	u64 cookie;
+	dlm_lockstatus *lksb;
+} dlm_lock;
+
+
+struct _dlm_lockstatus {
+	dlm_status status;
+	dlm_lock *lockid;
+	char lvb[DLM_LVB_LEN];
+};
+
+enum {
+	DLM_MLE_BLOCK,
+	DLM_MLE_MASTER
+};
+
+typedef struct _dlm_lock_name
+{
+	u8 len;
+	u8 name[0];   // [DLM_LOCKID_NAME_MAX]
+} dlm_lock_name;
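+
+/* Allocation sketch: the zero-length name[] member means a lock name is
+ * allocated in a single shot, e.g.:
+ *
+ *	dlm_lock_name *n = kmalloc(sizeof(dlm_lock_name) + namelen, GFP_KERNEL);
+ *	n->len = namelen;
+ *	memcpy(n->name, name, namelen);
+ */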
+
+/* good god this needs to be trimmed down */
+typedef struct _dlm_master_list_entry
+{
+	struct list_head list;
+	dlm_ctxt *dlm;
+	spinlock_t spinlock;
+	wait_queue_head_t wq;
+	atomic_t woken;
+	atomic_t refcnt;
+	u32 maybe_map[8];
+	u32 vote_map[8];
+	u32 response_map[8];
+	u32 node_map[8];
+	u16 master;
+	u8 error;
+	u8 type;    // BLOCK or MASTER
+	union {
+		dlm_lock_resource *res;
+		dlm_lock_name name;
+	} u;
+} dlm_master_list_entry;
+
+void dlm_put_mle(dlm_master_list_entry *mle);
+static inline void dlm_get_mle(dlm_master_list_entry *mle)
+{
+	atomic_inc(&mle->refcnt);
+}
+
+
+#define DLM_MASTER_REQUEST_MSG  	500
+#define DLM_MASTER_REQUEST_RESP_MSG	501
+#define DLM_ASSERT_MASTER_MSG		502
+#define DLM_CREATE_LOCK_MSG		503
+#define DLM_CONVERT_LOCK_MSG		504
+#define DLM_PROXY_AST_MSG		505
+#define DLM_UNLOCK_LOCK_MSG		506
+
+
+enum {
+	DLM_MASTER_RESP_NO,
+	DLM_MASTER_RESP_YES,
+	DLM_MASTER_RESP_MAYBE,
+	DLM_MASTER_RESP_ERROR
+};
+
+typedef struct _dlm_master_request
+{
+	u16 node_idx;
+	u8 namelen;
+	u8 name[NM_MAX_NAME_LEN];
+} dlm_master_request;
+
+typedef struct _dlm_master_request_resp
+{
+	u16 node_idx;
+	u8 response;
+	u8 namelen;
+	u8 name[NM_MAX_NAME_LEN];
+} dlm_master_request_resp;
+
+typedef struct _dlm_assert_master
+{
+	u16 node_idx;
+	u8 namelen;
+	u8 name[NM_MAX_NAME_LEN];
+} dlm_assert_master;
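+
+/* Mastery handshake sketch (assumed flow, pieced together from the
+ * message types and handlers declared here): to master a resource, a
+ * node sends DLM_MASTER_REQUEST_MSG to each other node, collects the
+ * DLM_MASTER_RESP_* answers carried by DLM_MASTER_REQUEST_RESP_MSG,
+ * and once it has won, broadcasts DLM_ASSERT_MASTER_MSG, roughly:
+ *
+ *	for (i = 0; i < num_nodes; i++)
+ *		if (i != this_node && test_bit(i, mle->node_map))
+ *			dlm_do_master_request(mle, i);
+ *	// wait on mle->wq until response_map fills in, then...
+ *	dlm_do_assert_master(mle);
+ */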
+
+
+
+
+
+void dlm_shuffle_lists(dlm_ctxt *dlm, dlm_lock_resource *res);
+void dlm_thread_run_lock_resources(dlm_ctxt *dlm);
+int dlm_thread(void *data);
+int dlm_launch_thread(dlm_ctxt *dlm);
+void dlm_complete_thread(dlm_ctxt *dlm);
+
+dlm_status dlmlock(dlm_ctxt *dlm, int mode, dlm_lockstatus *lksb, int flags, char *name, 
+		   dlm_astlockfunc_t *ast, void *data, dlm_bastlockfunc_t *bast);
+		   
+
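+/* Caller sketch (hypothetical lock name, callbacks and data; the dlm
+ * comes from dlm_register_domain below): take a new EX lock without
+ * queueing behind incompatible holders:
+ *
+ *	dlm_lockstatus lksb;
+ *	dlm_status ret;
+ *
+ *	memset(&lksb, 0, sizeof(lksb));
+ *	ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE, "mylock",
+ *		      my_ast, my_data, my_bast);
+ *	// DLM_NOTQUEUED means another node holds an incompatible mode;
+ *	// otherwise my_ast(my_data) fires on grant - check lksb.status there.
+ */
+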
+dlm_status do_dlmlock(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lockstatus *lksb,
+		      int flags, int type, dlm_astlockfunc_t *ast, 
+		      dlm_bastlockfunc_t *bast, void *data);
+dlm_status dlmlock_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags);
+dlm_status dlmlock_remote(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags);
+
+dlm_status do_dlmconvert(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type);
+dlm_status dlmconvert_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type);
+dlm_status dlmconvert_remote(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type);
+
+dlm_status dlmunlock(dlm_ctxt *dlm, dlm_lockstatus *lksb, int flags, dlm_astunlockfunc_t *unlockast, void *data);
+dlm_status dlmunlock_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, dlm_lockstatus *lksb, int flags, int *call_ast);
+
+dlm_ctxt * dlm_register_domain(char *domain, char *group_name, u32 key);
+void dlm_unregister_domain(dlm_ctxt *dlm);
+dlm_lock_resource * dlm_get_lock_resource(dlm_ctxt *dlm, struct qstr *lockname, int flags);
+int dlm_lock_owner_broadcast(dlm_ctxt *dlm, dlm_lock_resource *res);
+int dlm_refresh_lock_resource(dlm_ctxt *dlm, dlm_lock_resource *res);
+int dlm_do_ast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock);
+int dlm_do_bast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int blocked_type);
+u16 dlm_nm_this_node(dlm_ctxt *dlm);
+void dlm_kick_thread(dlm_ctxt *dlm, dlm_lock_resource *res);
+
+int dlm_nm_init(dlm_ctxt *dlm);
+int dlm_heartbeat_init(dlm_ctxt *dlm);
+
+dlm_lock_resource * dlm_lookup_lock(dlm_ctxt *dlm, struct qstr *lockname);
+dlm_ctxt * dlm_lookup_domain(char *domain);
+
+void dlm_hb_node_down_cb(struct inode *group, struct inode *node, int idx, void *data);
+void dlm_hb_node_up_cb(struct inode *group, struct inode *node, int idx, void *data);
+int dlm_hb_node_dead(dlm_ctxt *dlm, int node);
+int dlm_hb_node_up(dlm_ctxt *dlm, int node);
+int __dlm_hb_node_dead(dlm_ctxt *dlm, int node);
+int __dlm_hb_node_up(dlm_ctxt *dlm, int node);
+
+int dlm_master_request_handler(net_msg *msg, u32 len, void *data);
+int dlm_master_request_resp_handler(net_msg *msg, u32 len, void *data);
+int dlm_assert_master_handler(net_msg *msg, u32 len, void *data);
+int dlm_do_master_request(dlm_master_list_entry *mle, int to);
+int dlm_do_master_request_resp(dlm_ctxt *dlm, struct qstr *name, int response, int to);
+int dlm_do_assert_master(dlm_master_list_entry *mle);
+void dlm_mle_node_down(struct inode *group, struct inode *node, int idx, void *data);
+void dlm_mle_node_up(struct inode *group, struct inode *node, int idx, void *data);
+dlm_lock_resource * __dlm_lookup_lock(dlm_ctxt *dlm, struct qstr *lockname);
+void dlm_init_lockres(dlm_lock_resource *res, struct qstr *lockname);
+void dlm_wait_on_lockres(dlm_lock_resource *res);
+void dlm_dump_everything(void);
+void dlm_dump_dlm(dlm_ctxt *dlm);
+
+static inline int dlm_lock_compatible(int existing, int request)
+{
+	/* NO_LOCK compatible with all */
+	if (request == LKM_NLMODE ||
+	    existing == LKM_NLMODE)
+		return 1;
+
+	/* EX incompatible with all non-NO_LOCK */
+	if (request == LKM_EXMODE)
+		return 0;
+	
+	/* request must be PR, which is compatible with PR */
+	if (existing == LKM_PRMODE)
+		return 1;
+
+	return 0;
+}
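+
+/* The compatibility matrix encoded above (1 = compatible):
+ *
+ *                existing
+ *               NL  PR  EX
+ *   request NL   1   1   1
+ *           PR   1   1   0
+ *           EX   1   0   0
+ */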
+
+static inline int dlm_lock_on_list(struct list_head *head, dlm_lock *lock)
+{
+	struct list_head *iter;
+	dlm_lock *tmplock;
+
+	list_for_each(iter, head) {
+		tmplock = list_entry(iter, dlm_lock, list);
+		if (tmplock == lock)
+			return 1;
+	}
+	return 0;
+}
+
+static inline int dlm_mle_equal(dlm_ctxt *dlm, dlm_master_list_entry *mle, struct qstr *lockname)
+{
+	dlm_lock_resource *res;
+
+	if (dlm != mle->dlm)
+		return 0;
+
+	if (mle->type == DLM_MLE_BLOCK) {
+		if (lockname->len != mle->u.name.len ||
+    	    	    strncmp(lockname->name, mle->u.name.name, lockname->len)!=0)
+			return 0;
+	} else {
+		res = mle->u.res;
+		if (res->lockname.hash != lockname->hash ||
+       	    	    res->lockname.len != lockname->len ||
+       	    	    strncmp(res->lockname.name, lockname->name, lockname->len)!=0)
+			return 0;
+	}
+	return 1;
+}
+
+static inline dlm_status dlm_err_to_dlm_status(int err)
+{
+	dlm_status ret;
+	if (err == -ENOMEM)
+		ret = DLM_SYSERR;
+	else if (err == -ETIMEDOUT || net_link_down(err, NULL)) 
+		ret = DLM_NOLOCKMGR;
+	else if (err == -EINVAL)
+		ret = DLM_BADPARAM;
+	else if (err == -ENAMETOOLONG)
+		ret = DLM_IVBUFLEN;
+	else
+		ret = DLM_BADARGS;
+	return ret;
+}
+
+#endif /* CLUSTER_DLMMOD_H */

Added: branches/dlm-glue/cluster/dlmrecovery.c
===================================================================
--- branches/dlm-glue/cluster/dlmrecovery.c	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/dlmrecovery.c	2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,705 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmrecovery.c
+ *
+ * recovery stuff
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+
+
+#include "heartbeat.h"
+#include "nodemanager.h"
+#include "tcp.h"
+#include "dlmmod.h"
+
+static void dlm_do_local_recovery_cleanup(dlm_ctxt *dlm, u16 dead_node, int locked);
+
+int dlm_recovery_thread(void *data);
+void dlm_complete_recovery_thread(dlm_ctxt *dlm);
+int dlm_launch_recovery_thread(dlm_ctxt *dlm);
+void dlm_kick_recovery_thread(dlm_ctxt *dlm);
+			
+u16 dlm_pick_recovery_master(dlm_ctxt *dlm, u16 *new_dead_node);
+static int dlm_remaster_locks_local(dlm_ctxt *dlm);
+int dlm_init_recovery_area(dlm_ctxt *dlm, u16 dead_node, u16 num_nodes);
+int dlm_request_all_locks(dlm_ctxt *dlm, u16 request_from, u16 dead_node);
+void dlm_destroy_recovery_area(dlm_ctxt *dlm, u16 dead_node);
+
+#define DLM_RECOVERY_THREAD_MS  2000
+
+#if 0
+/*
+ * RECOVERY THREAD
+ */
+
+void dlm_kick_recovery_thread(dlm_ctxt *dlm)
+{
+	/* wake the recovery thread */
+	atomic_set(&dlm->reco.thread.woken, 1);
+	wake_up(&dlm->reco.thread.thread_wq);
+}
+
+/* Launch the recovery thread */
+int dlm_launch_recovery_thread(dlm_ctxt *dlm)
+{
+	printk("starting recovery thread...\n");
+	dlm->reco.thread.pid = kernel_thread (dlm_recovery_thread, dlm, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+	if (dlm->reco.thread.pid < 0) {
+		printk("unable to launch recovery thread, error=%d\n", dlm->reco.thread.pid);
+		return -EINVAL;
+	}
+	printk("recovery thread running...\n");
+	return 0;
+}
+
+void dlm_complete_recovery_thread(dlm_ctxt *dlm)
+{
+	printk ("waiting for recovery thread to exit....");
+	send_sig (SIGINT, dlm->reco.thread.task, 0);
+	wait_for_completion (&dlm->reco.thread.complete);
+	printk ("recovery thread exited\n");
+	dlm->reco.thread.task = NULL;
+}
+
+	/* 
+	 * this is lame, but here's how recovery works...
+	 * 1) all recovery threads cluster wide will work on recovering
+	 *    ONE node at a time
+	 * 2) negotiate who will take over all the locks for the dead node.
+	 *    that's right... ALL the locks.
+	 * 3) once a new master is chosen, everyone scans all locks
+	 *    and moves aside those mastered by the dead guy
+	 * 4) each of these locks should be locked until recovery is done
+	 * 5) the new master collects up all of the secondary lock queue info
+	 *    one lock at a time, forcing each node to communicate back
+	 *    before continuing
+	 * 6) each secondary lock queue responds with the full known lock info
+	 * 7) once the new master has run all its locks, it sends an ALLDONE!
+	 *    message to everyone
+	 * 8) upon receiving this message, the secondary queue node unlocks
+	 *    and responds to the ALLDONE
+	 * 9) once the new master gets responses from everyone, he unlocks 
+	 *    everything and recovery for this dead node is done
+	 *10) go back to 2) while there are still dead nodes
+	 *
+	 */
+
+
+
+int dlm_recovery_thread(void *data)
+{
+	int status;
+	int dlm_num;
+	char name[12];
+	dlm_ctxt *dlm = data;
+
+
+	dlm_num = nm_get_group_global_index(dlm->group);
+	sprintf(name, "dlmreco-%03u", dlm_num);
+	util_daemonize (name, strlen(name), 1);
+	dlm->reco.thread.task = current;
+
+	while (1) {
+		spin_lock(&dlm->spinlock);
+
+		/* check to see if the new master has died */
+		if (dlm->reco.new_master != NM_INVALID_SLOT_NUM &&
+		    test_bit(dlm->reco.new_master, dlm->recovery_map)) {
+			printk("new master %u died while recovering %u!\n",
+			       dlm->reco.new_master, dlm->reco.dead_node);
+			// unset the new_master, leave dead_node
+			dlm->reco.new_master = NM_INVALID_SLOT_NUM;
+		}
+
+		/* select a target to recover */
+		if (dlm->reco.dead_node == NM_INVALID_SLOT_NUM) {
+			dlm->reco.dead_node = find_next_bit (dlm->recovery_map, NM_MAX_NODES, 0);
+			if (dlm->reco.dead_node >= NM_MAX_NODES)
+				dlm->reco.dead_node = NM_INVALID_SLOT_NUM;
+		} else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
+			// BUG?
+			printk("dead_node %u no longer in recovery map!\n",
+			       dlm->reco.dead_node);
+			dlm->reco.dead_node = NM_INVALID_SLOT_NUM;
+		}
+
+		spin_unlock(&dlm->spinlock);
+
+		if (dlm->reco.dead_node == NM_INVALID_SLOT_NUM) {
+			printk("nothing to recover!  sleeping now!\n");
+			goto sleep;
+		}
+
+		/* take write barrier */
+		/* (stops the list reshuffling thread, proxy ast handling) */
+		down_write(&dlm->recovery_sem);
+
+		/* choose a new master */
+		if (dlm->reco.new_master == NM_INVALID_SLOT_NUM) {
+			u16 new_dead_node = dlm->reco.dead_node;
+			dlm->reco.new_master = dlm_pick_recovery_master(dlm, &new_dead_node);
+			if (new_dead_node != dlm->reco.dead_node) {
+				// master wants to recover a different node
+				dlm->reco.dead_node = new_dead_node;
+				
+				// do local cleanup if heartbeat has not added the
+				// node to the recovery map yet
+				spin_lock(&dlm->spinlock);
+				if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
+					dlm_do_local_recovery_cleanup(dlm, dlm->reco.dead_node, 1);
+					set_bit(dlm->reco.dead_node, dlm->recovery_map);
+					clear_bit(dlm->reco.dead_node, dlm->node_map);
+				}
+				spin_unlock(&dlm->spinlock);
+			}
+		}
+		
+
+		if (dlm->reco.new_master == dlm->group_index) {
+			status = dlm_remaster_locks_local(dlm);
+			if (status < 0) {
+				printk("error remastering locks for node %u!!!!  retrying!\n",
+				       dlm->reco.dead_node);
+			} else {
+				// success!  see if any other nodes need recovery
+				spin_lock(&dlm->spinlock);
+				clear_bit(dlm->reco.dead_node, dlm->recovery_map);
+				spin_unlock(&dlm->spinlock);
+				dlm->reco.dead_node = NM_INVALID_SLOT_NUM;
+				dlm->reco.new_master = NM_INVALID_SLOT_NUM;
+				dlm->reco.sending_node = NM_INVALID_SLOT_NUM;
+				dlm->reco.next_seq = 0;
+			}
+			up_write(&dlm->recovery_sem);
+			// pick another dead node
+			continue;
+		} else {
+			// sit around until new_master is dead or done
+			// we will get signalled by the waitqueue either way
+			printk("new_master %u is recovering dead_node %u... waiting...\n",
+			       dlm->reco.new_master, dlm->reco.dead_node);
+		}
+
+		up_write(&dlm->recovery_sem);
+
+sleep:
+		atomic_set(&dlm->reco.thread.woken, 0);
+		status = util_wait_atomic_eq(&dlm->reco.thread.thread_wq, 
+					     &dlm->reco.thread.woken, 
+					     1, DLM_RECOVERY_THREAD_MS);
+		if (status == 0 || status == -ETIMEDOUT) {
+			if (atomic_read(&dlm->reco.thread.woken))
+				printk("aha!!! recovery thread woken!\n");
+			else 
+				printk("timed out waiting, running again\n");
+			continue;
+		}
+		printk("recovery thread got %d while waiting\n", status);
+		break;
+	}
+
+	flush_scheduled_work();
+	complete (&dlm->reco.thread.complete);
+	printk("quitting recovery thread!!!!!!\n");
+	return 0;
+}
+
+/* +- if this node is NOT the new master... */
+/* +--- if master's dead_node is not the one we chose, do local cleanup again with proper dead_node */
+/* +---	wait for poll messages from new master: register net message handler, it will do the work */
+/* +--- check for death of new master */
+/* +--- if dead, unregister the handler, unset new_master, keep dead_node and goto "select a target" */
+/* |- on request, send header with number of packets, get response, then start blasting packets */
+/* |- retransmit any missed packets on request */
+/* |- once ALL DONE is received, run all locks again */
+/* +--- unset the RECOVERING flag */
+/* +--- set the new owner as new_master */
+/* +--- remove dead_node from recovery map */
+/* +--- unset new_master and dead_node and start all over */
+
+
+static int dlm_remaster_locks_local(dlm_ctxt *dlm)
+{
+	int num_nodes = 255, i, status = 0;
+	u32 node_map[8];
+
+
+/* +- if this node is the new master, init the temp recovery area */
+/* |- poll each live node for lock state */
+/* |- collect the data from each node until node says it's done, or dead */
+/* +--- if node died, throw away temp recovery area, keep new_master and dead_node, goto "select a target" */
+/* |- apply all temp area changes to real lock */
+/* +- send ALL DONE message to each node */
+
+
+	status = dlm_init_recovery_area(dlm, dlm->reco.dead_node, num_nodes);
+	if (status < 0)
+		return status;
+
+	spin_lock(&dlm->spinlock);
+	num_nodes = nm_get_group_max_slots(dlm->group);
+	memcpy(node_map, dlm->node_map, sizeof(node_map));
+	spin_unlock(&dlm->spinlock);
+
+	for (i=0; i<num_nodes; i++) {
+		if (test_bit(i, node_map)) {
+			spin_lock(&dlm->spinlock);
+			dlm->reco.sending_node = i;
+			dlm->reco.next_seq = 0;
+			spin_unlock(&dlm->spinlock);
+			status = dlm_request_all_locks(dlm, i, dlm->reco.dead_node);
+			if (status < 0) {
+				spin_lock(&dlm->spinlock);
+				dlm->reco.sending_node = NM_INVALID_SLOT_NUM;
+				dlm->reco.next_seq = 0;
+				spin_unlock(&dlm->spinlock);
+				dlm_destroy_recovery_area(dlm, dlm->reco.dead_node);
+				return status;
+			}
+		}
+	}
+	return status;
+}
+
+int dlm_request_all_locks(dlm_ctxt *dlm, u16 request_from, u16 dead_node)
+{
+	printk("dlm_request_all_locks: dead node is %u, sending request to %u\n",
+	       dead_node, request_from);
+	// send message
+	// sleep until all received or error
+	return 0;
+}
+
+#endif
+
+#if 0
+
+int dlm_recovery_request_handler(net_msg *msg, u32 len, void *data);
+int dlm_recovery_response_handler(net_msg *msg, u32 len, void *data);
+int dlm_recovery_lock_arr_req_handler(net_msg *msg, u32 len, void *data);
+
+typedef struct _dlm_reco_lock_info
+{
+	u16 node;
+	u16 unused1;
+	u64 cookie;
+	s8 type;
+	s8 convert_type;
+	u8 list;
+	u8 lockname_len;
+	u8 lockname[DLM_LOCKID_NAME_MAX];
+} dlm_reco_lock_info;
+
+enum {
+	DLM_RECO_MASTER_REQUEST, 
+	DLM_RECO_XMIT_LOCKS_REQUEST,
+	DLM_RECO_XMIT_LOCK_HDR_REQUEST,
+	DLM_RECO_XMIT_LOCK_ARR_REQUEST,
+	DLM_RECO_XMIT_COMPLETE_REQUEST,
+	DLM_RECO_ALL_DONE_REQUEST
+};
+
+enum {
+	DLM_RECO_NO_RESPONSE,
+	DLM_RECO_YES_RESPONSE
+};
+
+#define DLM_LOCKS_PER_PACKET   40
+
+typedef struct _dlm_reco_lock_arr_req
+{
+	u8 request_type;
+	u8 num_locks;
+	u16 dead_node;
+	u32 seqnum;
+	dlm_reco_lock_info lock[DLM_LOCKS_PER_PACKET];
+} dlm_reco_lock_arr_req;
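+
+/* Sizing example: a node with N locks to send for the dead node needs
+ * (N + DLM_LOCKS_PER_PACKET - 1) / DLM_LOCKS_PER_PACKET lock-array
+ * packets, e.g. 100 locks -> 3 packets (40 + 40 + 20), each stamped
+ * with an increasing seqnum so the receiver can spot dropped or
+ * reordered UDP packets and ask for retransmission. */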
+
+typedef struct _dlm_reco_request
+{
+	u8 request_type;
+	u8 unused1;
+	u16 dead_node;
+	u32 num;
+} dlm_reco_request;
+
+typedef struct _dlm_reco_response
+{
+	u8 response_type;
+	u8 unused1[7];
+} dlm_reco_response;
+
+static inline int dlm_reco_lock_info_valid(dlm_reco_lock_info *info)
+{
+	if (info->type != LKM_NLMODE &&
+	    info->type != LKM_PRMODE &&
+	    info->type != LKM_EXMODE)
+		return 0;
+	if (info->convert_type != LKM_NLMODE &&
+	    info->convert_type != LKM_PRMODE &&
+	    info->convert_type != LKM_EXMODE)
+		return 0;
+	if (info->list > 2)
+		return 0;
+	return 1;
+}
+
+static inline int dlm_check_reco_lock_arr_msg(net_msg *msg, dlm_ctxt *dlm, int *out_of_order);
+
+static inline int dlm_check_reco_lock_arr_msg(net_msg *msg, dlm_ctxt *dlm, int *out_of_order)
+{
+	int ret = -EINVAL;
+	dlm_reco_lock_arr_req *req = (dlm_reco_lock_arr_req *)msg->buf;
+	
+	/* check a bunch of ugly conditions */
+	*out_of_order = 0;
+	if (req->num_locks > DLM_LOCKS_PER_PACKET) {
+		printk("num_locks too large! %u\n", req->num_locks);
+	} else if (req->seqnum != dlm->reco.next_seq) {
+		printk("expected seq %u from node %u, got %u\n",
+		       dlm->reco.next_seq, msg->src_node,
+		       req->seqnum);
+		*out_of_order = 1;
+	} else if (dlm->reco.dead_node != req->dead_node) {
+		printk("bad lock array: dead node=%u, sent=%u\n",
+		       dlm->reco.dead_node, req->dead_node);
+	} else if (dlm->reco.new_master != dlm->group_index) {
+		printk("this node is not the recovery master!\n");
+	} else if (dlm->reco.sending_node != msg->src_node ||
+		 dlm->group_index == msg->dest_node) {
+		printk("eek. sending_node=%u, actual=%u, dest=%u, me=%u\n",
+		       dlm->reco.sending_node, msg->src_node, 
+		       msg->dest_node, dlm->group_index);
+	} else
+		ret = 0;
+	return ret;
+}
+
+
+/* 
+ * gawd i hate udp
+ */
+int dlm_recovery_lock_arr_req_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	dlm_reco_lock_arr_req *req = (dlm_reco_lock_arr_req *)msg->buf;
+	dlm_lock_resource *res = NULL;
+	dlm_reco_lock_info *info;
+	dlm_lock **newlocks = NULL;
+	dlm_lock *lock = NULL;
+	int ret, i, out_of_order = 0;
+	
+	// TODO: ntoh(req)
+
+	ret = 0;
+	if (req->num_locks == 0)
+		goto send_response;
+
+	/* check to see if it's worth kmallocing */
+	spin_lock(&dlm->spinlock);
+	ret = dlm_check_reco_lock_arr_msg(msg, dlm, &out_of_order);
+	spin_unlock(&dlm->spinlock);
+	if (ret < 0)
+		goto send_response;
+
+	newlocks = kmalloc(req->num_locks * sizeof(dlm_lock *), GFP_KERNEL);
+	if (!newlocks) {
+		printk("failed to alloc temp lock array!\n");
+		ret = -ENOMEM;
+		goto send_response;
+	}
+	memset(newlocks, 0, req->num_locks * sizeof(dlm_lock *));
+	for (i=0; i<req->num_locks; i++) {
+		info = &(req->lock[i]);
+		if (!dlm_reco_lock_info_valid(info)) {
+			ret = -EINVAL;
+			goto send_response;
+		}
+		lock = newlocks[i] = kmem_cache_alloc(dlm_lock_cache, GFP_KERNEL);
+		if (!newlocks[i]) {
+			ret = -ENOMEM;
+			goto send_response;
+		}
+		memset(lock, 0, sizeof(dlm_lock));
+		INIT_LIST_HEAD(&lock->list);
+		INIT_LIST_HEAD(&lock->ast_list);
+		spin_lock_init(&lock->spinlock);
+		lock->type = info->type;
+		lock->convert_type = info->convert_type;
+		lock->node = dlm->group_index;
+		//atomic_set(&lock->ast_lock, 0);
+		//atomic_set(&lock->bast_lock, 0);
+		lock->ast = NULL;
+		lock->bast = NULL;
+		lock->astdata = (void *)info->list;   // cheating here...
+		lock->cookie = info->cookie;	
+	}
+
+	spin_lock(&dlm->spinlock);
+	/* ok now that everything is allocated and the lock has
+	 * been taken again, recheck all those stupid conditions */
+	ret = dlm_check_reco_lock_arr_msg(msg, dlm, &out_of_order);
+	if (ret < 0) {
+		spin_unlock(&dlm->spinlock);
+		goto send_response;
+	}
+	for (i=0; i<req->num_locks; i++) {
+		info = &(req->lock[i]);
+		lock = newlocks[i];
+		list_add_tail(&lock->list, &dlm->reco.received);
+	}
+	spin_unlock(&dlm->spinlock);
+
+send_response:
+	if (newlocks) {
+		if (ret < 0) {
+			for (i=0; i<req->num_locks; i++)
+				if (newlocks[i])
+					kmem_cache_free(dlm_lock_cache, newlocks[i]);
+		}
+		kfree(newlocks);
+	}
+
+	return ret;
+}
+int dlm_recovery_request_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	/* TODO: not yet implemented */
+	return 0;
+}
+int dlm_recovery_response_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	/* TODO: not yet implemented */
+	return 0;
+}
+
+
+
+
+
+static int dlm_send_reco_request(dlm_ctxt *dlm, dlm_reco_request *buf, u16 to, struct inode *node)
+{
+	int ret;
+	net_msg *msg = net_package_message(DLM_NET_RECOVERY_REQUEST_MSG_TYPE, 
+				  dlm->key, buf, sizeof(*buf), 
+				  dlm->group_index, to);
+	if (!msg)
+		return -ENOMEM;
+	ret = net_send_udp_msg (node, msg, sizeof(*buf));
+	kfree(msg);
+	return ret;
+}
+
+static int dlm_recover_domain(dlm_ctxt *dlm)
+{
+
+	
+	return 0;
+}
+
+
+#endif
+
+#warning may need to change kfree to put_lock and refcounting here
+static void dlm_do_local_recovery_cleanup(dlm_ctxt *dlm, u16 dead_node, int locked)
+{
+	struct list_head *iter, *iter2, *tmpiter;
+	dlm_lock_resource *res;
+	dlm_lock *lock;
+	int i;
+	struct list_head *bucket;
+	
+	if (!locked)	
+		spin_lock(&dlm->spinlock);
+
+	for (i=0; i<DLM_HASH_SIZE; i++) {
+		bucket = &(dlm->resources[i]);
+		list_for_each(iter, bucket) {
+			res = list_entry (iter, dlm_lock_resource, list);
+			spin_lock(&res->spinlock);
+			if (res->owner == dead_node) {
+				res->state |= DLM_LOCK_RES_RECOVERING;
+				list_del(&res->recovering);
+				list_add_tail(&res->recovering, &dlm->reco.resources);
+			} else if (res->owner == dlm->group_index) {
+				list_for_each_safe(iter2, tmpiter, &res->granted) {
+					lock = list_entry (iter2, dlm_lock, list);
+					if (lock->node == dead_node) {
+						list_del(&lock->list);
+						kfree(lock);
+					}
+				}
+				list_for_each_safe(iter2, tmpiter, &res->converting) {
+					lock = list_entry (iter2, dlm_lock, list);
+					if (lock->node == dead_node) {
+						list_del(&lock->list);
+						kfree(lock);
+					}
+				}
+				list_for_each_safe(iter2, tmpiter, &res->blocked) {
+					lock = list_entry (iter2, dlm_lock, list);
+					if (lock->node == dead_node) {
+						list_del(&lock->list);
+						kfree(lock);
+					}
+				}
+			}
+			spin_unlock(&res->spinlock);
+		}
+	}
+
+	if (!locked)
+		spin_unlock(&dlm->spinlock);
+}
+
+
+void dlm_hb_node_down_cb(struct inode *group, struct inode *node, int idx, void *data)
+{
+	//int ret;
+	//struct inode *group = ptr1;
+	//struct inode *node = ptr2;
+	dlm_ctxt *dlm = data;
+	
+	spin_lock(&dlm->spinlock);
+
+	if (!test_bit(idx, dlm->node_map))
+		printk("node %u already removed from nodemap!\n", idx);
+	else
+		clear_bit(idx, dlm->node_map);
+	
+	if (test_bit(idx, dlm->recovery_map))
+		printk("node %u already added to recovery map!\n", idx);
+	else {
+		set_bit(idx, dlm->recovery_map);
+		dlm_do_local_recovery_cleanup(dlm, idx, 1);
+	}
+	spin_unlock(&dlm->spinlock);
+}
+
+void dlm_hb_node_up_cb(struct inode *group, struct inode *node, int idx, void *data)
+{
+	//struct inode *group = ptr1;
+	//struct inode *node = ptr2;
+	dlm_ctxt *dlm = data;
+
+	spin_lock(&dlm->spinlock);
+
+	if (test_bit(idx, dlm->recovery_map)) {
+		printk("BUG!!! node up message on node in recovery (%u)!!!\n", idx);
+	} else {
+		if (test_bit(idx, dlm->node_map))
+			printk("node %u already in node map!!!\n", idx);
+		else 
+			set_bit(idx, dlm->node_map);
+	}
+
+	spin_unlock(&dlm->spinlock);
+}
+
+int __dlm_hb_node_dead(dlm_ctxt *dlm, int node)
+{
+	if (test_bit(node, dlm->recovery_map))
+		return 1;
+	return 0;
+}
+
+int __dlm_hb_node_up(dlm_ctxt *dlm, int node)
+{
+	if (test_bit(node, dlm->node_map))
+		return 1;
+	return 0;
+}
+
+int dlm_hb_node_dead(dlm_ctxt *dlm, int node)
+{
+	int ret;
+	spin_lock(&dlm->spinlock);
+	ret = __dlm_hb_node_dead(dlm, node);
+	spin_unlock(&dlm->spinlock);
+	return ret;
+}
+
+int dlm_hb_node_up(dlm_ctxt *dlm, int node)
+{
+	int ret;
+	spin_lock(&dlm->spinlock);
+	ret = __dlm_hb_node_up(dlm, node);
+	spin_unlock(&dlm->spinlock);
+	return ret;
+}
+
+u16 dlm_pick_recovery_master(dlm_ctxt *dlm, u16 *new_dead_node)
+{
+	u16 master = 0;
+#if 0
+	dlm_status ret;
+	dlm_lockstatus lksb;
+
+	ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, 
+		      DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast);
+
+	if (ret == DLM_NORMAL) {
+		// I am master
+		// send message to all nodes saying that I am beginning a recovery session for node XX,
+		//   then call dlmunlock???
+
+	} else if (ret == DLM_NOTQUEUED) {
+		// another node is master
+		// wait on reco.new_master != NM_INVALID_SLOT_NUM
+	} 
+
+	// at this point, every node in this domain should have reco.new_master and .dead_node set, even
+	//   if they have not discovered the dead node on their own
+	//
+	//
+	// atomic_set(&dlm->reco.thread.woken, 0);
+	// status = util_wait_atomic_eq(&dlm->reco.thread.thread_wq,
+	//                              &dlm->reco.thread.woken,
+	//                              1, DLM_RECOVERY_THREAD_MS);
+	//
+#endif
+	return master;
+}

Added: branches/dlm-glue/cluster/dlmthread.c
===================================================================
--- branches/dlm-glue/cluster/dlmthread.c	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/dlmthread.c	2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,329 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmthread.c
+ *
+ * standalone DLM module
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+
+
+#include "heartbeat.h"
+#include "nodemanager.h"
+#include "tcp.h"
+#include "dlmmod.h"
+
+extern spinlock_t dlm_domain_lock;
+extern struct list_head dlm_domains;
+extern u16 dlm_global_index;
+
+#define dlm_lock_is_remote(dlm, lock)     ((lock)->node != (dlm)->group_index)
+
+/*
+ * DLM THREAD
+ */
+
+void dlm_shuffle_lists(dlm_ctxt *dlm, dlm_lock_resource *res)
+{
+	dlm_lock *lock, *target;
+	struct list_head *iter, *tmpiter;
+	LIST_HEAD(bast_list);
+	struct list_head *head;
+	s8 hi;
+
+	spin_lock(&res->spinlock);
+
+#if 0
+	{
+		int g=0, c=0, b=0;
+		list_for_each(iter, &res->granted) {
+			g++;
+		}
+		list_for_each(iter, &res->converting) {
+			c++;
+		}
+		list_for_each(iter, &res->blocked) {
+			b++;
+		}
+		printk("(%d) granted: %d, converting: %d, blocked: %d\n", current->pid, g, c, b);
+	}
+#endif
+
+converting:
+	if (list_empty(&res->converting))
+		goto blocked;
+	target = list_entry(res->converting.next, dlm_lock, list);
+	if (target->convert_type == LKM_IVMODE) {
+		printk("eeek!!! converting a lock with no convert_type!!!!\n");
+		BUG();
+	}
+	head = &res->granted;
+	list_for_each(iter, head) {
+		lock = list_entry(iter, dlm_lock, list);
+		if (lock==target)
+			continue;
+		if (!dlm_lock_compatible(lock->type, target->convert_type)) {
+			if (lock->highest_blocked == LKM_IVMODE)
+				list_add(&lock->ast_list, &bast_list);
+			/* track the highest mode being blocked */
+			if (lock->highest_blocked < target->convert_type)
+				lock->highest_blocked = target->convert_type;
+		}
+	}
+	head = &res->converting;
+	list_for_each(iter, head) {
+		lock = list_entry(iter, dlm_lock, list);
+		if (lock==target)
+			continue;
+		if (!dlm_lock_compatible(lock->type, target->convert_type)) {
+			if (lock->highest_blocked == LKM_IVMODE)
+				list_add(&lock->ast_list, &bast_list);
+			if (lock->highest_blocked < target->convert_type)
+				lock->highest_blocked = target->convert_type;
+		}
+	}
+	
+	/* we can convert the lock */
+	if (list_empty(&bast_list)) {
+		spin_lock(&target->spinlock);
+		DLM_ASSERT(target->highest_blocked == LKM_IVMODE);	
+		
+		dlmprintk("calling ast for converting lock: %*s, have: %d, granting: %d, node: %u\n", 
+			  res->lockname.len, res->lockname.name, target->type, target->convert_type, target->node);
+
+		target->type = target->convert_type;
+		target->convert_type = LKM_IVMODE;
+		list_del(&target->list);
+		list_add_tail(&target->list, &res->granted);
+
+		if (target->node == dlm->group_index) {
+			DLM_ASSERT(target->lksb);
+			DLM_ASSERT(target->lksb->status);
+
+			target->lksb->status = DLM_NORMAL;
+		} else {
+			dlmprintk0("nonlocal lock, not setting DLM_NORMAL in lksb\n");
+		}
+
+		spin_unlock(&target->spinlock);
+
+		if (dlm_do_ast(dlm, res, target) < 0)
+			printk("eek\n");
+		/* go back and check for more */
+		goto converting;
+	}
+
+blocked:
+	if (list_empty(&res->blocked)) {
+		goto basts;
+	}
+	target = list_entry(res->blocked.next, dlm_lock, list);
+
+	head = &res->granted;
+	list_for_each(iter, head) {
+		lock = list_entry(iter, dlm_lock, list);
+		if (lock==target)
+			continue;
+		if (!dlm_lock_compatible(lock->type, target->type)) {
+			if (lock->highest_blocked == LKM_IVMODE)
+				list_add(&lock->ast_list, &bast_list);
+			if (lock->highest_blocked < target->type)
+				lock->highest_blocked = target->type;
+		}
+	}
+
+	head = &res->converting;
+	list_for_each(iter, head) {
+		lock = list_entry(iter, dlm_lock, list);
+		if (lock==target)
+			continue;
+		if (!dlm_lock_compatible(lock->type, target->type)) {
+			if (lock->highest_blocked == LKM_IVMODE)
+				list_add(&lock->ast_list, &bast_list);
+			if (lock->highest_blocked < target->type)
+				lock->highest_blocked = target->type;
+		}
+	}
+	
+	/* we can grant the blocked lock (only 
+	 * possible if converting list empty) */
+	if (list_empty(&bast_list)) {
+		spin_lock(&target->spinlock);
+		DLM_ASSERT(target->highest_blocked == LKM_IVMODE);
+		
+		dlmprintk("calling ast for blocked lock: %*s, granting: %d, node: %u\n", 
+			  res->lockname.len, res->lockname.name, target->type, target->node);
+
+		// target->type is already correct
+		list_del(&target->list);
+		list_add_tail(&target->list, &res->granted);
+
+		if (target->node == dlm->group_index) {
+			DLM_ASSERT(target->lksb);
+			DLM_ASSERT(target->lksb->status);
+		
+			target->lksb->status = DLM_NORMAL;
+		} else {
+			dlmprintk0("nonlocal lock, not setting DLM_NORMAL in lksb\n");
+		}
+		
+		spin_unlock(&target->spinlock);
+
+		if (dlm_do_ast(dlm, res, target) < 0)
+			printk("eek\n");
+		/* go back and check for more */
+		goto converting;
+	}
+
+basts:
+	list_for_each_safe(iter, tmpiter, &bast_list) {
+		lock = list_entry(iter, dlm_lock, ast_list);
+		spin_lock(&lock->spinlock);
+		DLM_ASSERT(lock->highest_blocked > LKM_IVMODE);
+		hi = lock->highest_blocked;
+		lock->highest_blocked = LKM_IVMODE;
+		list_del(&lock->ast_list);
+		spin_unlock(&lock->spinlock);
+
+		if (dlm_do_bast(dlm, res, lock, hi) < 0)
+			printk("eeek\n");
+	}
+	spin_unlock(&res->spinlock);
+}
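+
+/* Worked example: with granted = { A:PR } and B at the head of the
+ * converting queue asking PR -> EX, A's PR is incompatible with EX, so
+ * A lands on the bast_list (blocked type EX) and the convert waits.
+ * Once A drops its lock and the thread reshuffles, no blocker remains,
+ * B moves to the granted queue as EX, and B's ast is called. */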
+
+
+/* must have NO locks when calling this */
+void dlm_kick_thread(dlm_ctxt *dlm, dlm_lock_resource *res)
+{
+	if (res) {
+		spin_lock(&dlm->spinlock);
+		spin_lock(&res->spinlock);
+		if (!(res->state & DLM_LOCK_RES_DIRTY)) {
+			list_add_tail(&res->dirty, &dlm->dirty_list);
+			res->state |= DLM_LOCK_RES_DIRTY;
+		}
+		spin_unlock(&res->spinlock);
+		spin_unlock(&dlm->spinlock);
+	}
+
+	/* wake the dlm thread */
+	atomic_set(&dlm->thread.woken, 1);
+	wake_up(&dlm->thread.thread_wq);
+}
+
+/* Launch the NM thread for the mounted volume */
+int dlm_launch_thread(dlm_ctxt *dlm)
+{
+	printk("starting dlm thread...\n");
+	dlm->thread.pid = kernel_thread (dlm_thread, dlm, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+	if (dlm->thread.pid < 0) {
+		printk("unable to launch dlm thread, error=%d\n", dlm->thread.pid);
+		return -EINVAL;
+	}
+	printk("dlm thread running for %s...\n", dlm->name);
+	return 0;
+}
+
+void dlm_complete_thread(dlm_ctxt *dlm)
+{
+	printk ("waiting for dlm thread to exit....");
+	send_sig (SIGINT, dlm->thread.task, 0);
+	wait_for_completion (&dlm->thread.complete);
+	printk ("dlm thread exited\n");
+	dlm->thread.task = NULL;
+}
+
+
+
+
+int dlm_thread(void *data)
+{
+	int status;
+	struct list_head *iter, *tmpiter;
+	dlm_lock_resource *res;
+	dlm_ctxt *dlm = data;
+
+	util_daemonize ("dlm_thread", strlen("dlm_thread"), 1);
+	dlm->thread.task = current;
+
+	while (1) {
+		down_read(&dlm->recovery_sem);
+		spin_lock(&dlm->spinlock);
+		list_for_each_safe(iter, tmpiter, &dlm->dirty_list) {
+			res = list_entry(iter, dlm_lock_resource, dirty);
+			/* don't shuffle secondary queues */
+			if (res->owner != dlm->group_index)
+				continue;
+			dlm_shuffle_lists(dlm, res);
+			spin_lock(&res->spinlock);
+			list_del(&res->dirty);
+			res->state &= ~DLM_LOCK_RES_DIRTY;
+			spin_unlock(&res->spinlock);
+		}
+		spin_unlock(&dlm->spinlock);
+		up_read(&dlm->recovery_sem);
+			
+		atomic_set(&dlm->thread.woken, 0);
+		status = util_wait_atomic_eq(&dlm->thread.thread_wq, 
+					     &dlm->thread.woken, 
+					     1, DLM_THREAD_MS);
+
+		if (status == 0 || status == -ETIMEDOUT) {
+#if 0
+			if (atomic_read(&dlm->thread.woken))
+				printk("aha!!! dlm thread woken!\n");
+			else 
+				printk("timed out waiting, running again\n");
+#endif
+			continue;
+		}
+	
+		printk("DLM thread got %d while waiting\n", status);
+		break;
+	}
+
+	flush_scheduled_work();
+	complete (&dlm->thread.complete);
+	printk("quitting DLM thread!!!!!!\n");
+	return 0;
+}

Added: branches/dlm-glue/cluster/heartbeat.c
===================================================================
--- branches/dlm-glue/cluster/heartbeat.c	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/heartbeat.c	2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,869 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.c
+ *
+ * Keeps track of alive nodes in the cluster.
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+
+#include "warning_hack.h"
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/module.h>
+
+#include <linux/linkage.h>
+#include <linux/time.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/net.h>
+#include <linux/unistd.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+
+#include <asm/uaccess.h>
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+
+#include "compat_libfs.h"
+
+#ifndef __user
+#define __user
+#endif
+
+
+static void hb_teardown(void);
+static void hb_nm_group_node_add_cb(void *ptr1, void *ptr2, u16 idx);
+static void hb_nm_group_node_del_cb(void *ptr1, void *ptr2, u16 idx);
+static void hb_nm_node_add_cb(void *ptr1, void *ptr2, u16 idx);
+static void hb_nm_group_add_cb(void *ptr1, void *ptr2, u16 idx);
+static int hb_init_disk_hb_group(struct inode *group, kdev_t dev, u32 bits, u32 blocks, u64 start);
+static ssize_t write_disk(struct file *file, char *buf, size_t size);
+static void hb_do_callbacks(int type, void *ptr1, void *ptr2, int idx);
+static void hb_end_buffer_io_sync(struct buffer_head *bh, int uptodate);
+static int hb_do_node_down(struct inode *group, struct inode *node, int idx);
+static int hb_do_node_up(struct inode *group, struct inode *node, int idx);
+static int hb_do_disk_heartbeat(void *page);
+static int hb_thread(void *data);
+static void hb_complete_thread(void);
+static void hb_kick_thread(void);
+static int hb_launch_thread(void);
+static inline int hb_wait_on_callback_state(int type);
+
+
+
+/* globals */
+extern char *nm_nodename;
+static spinlock_t hb_lock = SPIN_LOCK_UNLOCKED;
+static LIST_HEAD(hb_net_groups);
+static LIST_HEAD(hb_disk_groups);
+static int hb_callback_state[HB_NUM_CB];
+struct list_head hb_callbacks[HB_NUM_CB];
+static spinlock_t hb_cb_lock = SPIN_LOCK_UNLOCKED;
+static struct task_struct *hb_task = NULL;
+static atomic_t hb_thread_woken = ATOMIC_INIT(0);
+static DECLARE_WAIT_QUEUE_HEAD(hb_thread_wait_queue);
+static struct completion hb_complete;
+static int hb_pid = -1;
+
+static wait_queue_head_t hb_cb_wq;
+static atomic_t hb_cb_ready = ATOMIC_INIT(0);
+
+
+static void hb_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+{
+	if (uptodate)
+		set_buffer_uptodate(bh);
+	else {
+		printk("eek!  EIO!\n");
+		clear_buffer_uptodate(bh);
+	}
+	unlock_buffer(bh);
+}
+
+
+
+static int hb_do_node_down(struct inode *group, struct inode *node, int idx)
+{
+	int ret;
+	printk("hb_do_node_down: group=%lu, node=%lu\n", group->i_ino, node->i_ino);
+	printk("NOT removing node from group\n");
+	//ret = nm_remove_node_from_group(group, node);
+	hb_do_callbacks(HB_NODE_DOWN_CB, group, node, 0);
+	return 0;
+}
+
+static int hb_do_node_up(struct inode *group, struct inode *node, int idx)
+{
+	printk("hb_do_node_up: group=%lu, node=%lu\n", group->i_ino, node->i_ino);
+	hb_do_callbacks(HB_NODE_UP_CB, group, node, 0);
+	return 0;
+}
+
+static inline void hb_submit_bh(int rw, struct buffer_head *bh)
+{
+	printk("submit_bh: rw=%s, blocknr=%lu, mapped=%s\n",
+	       rw==WRITE?"write":"read", bh->b_blocknr, 
+	       buffer_mapped(bh) ? "yes" : "no");
+	submit_bh(rw, bh);
+}
+
+
+static int hb_do_disk_heartbeat(void *page)
+{
+	nm_group_inode_private *priv;
+	struct inode *group, *node;
+	struct list_head *iter;
+	struct buffer_head *bh;
+	hb_disk_slot *slot;
+	hb_disk_heartbeat_block *hb_block;
+	int ino, idx, ret, i;
+	struct inode **dead_nodes, **live_nodes;
+	LIST_HEAD(tmplist);
+	u64 blkno;
+	cluster_disk *disk;
+
+	// NM_MAX_NODES is 255; carve two 256-entry inode pointer arrays
+	// out of the scratch page
+	dead_nodes = page;
+	live_nodes = page + (sizeof(struct inode *) * 256);
+	
+	spin_lock(&hb_lock);
+	list_splice_init(&hb_disk_groups, &tmplist);
+	spin_unlock(&hb_lock);
+
+	list_for_each(iter, &tmplist) {
+		priv = list_entry(iter, nm_group_inode_private, disk_list);
+		group = priv->inode;
+		disk = &priv->disk;
+
+		memset(page, 0, PAGE_SIZE);
+		down(&group->i_sem);
+
+		idx = 0;
+		while ((slot = nm_iterate_group_disk_slots(group, &idx))) {
+retry_submit:
+			bh = slot->bh;
+			node = slot->inode;
+
+			ino = nm_get_node_global_index(node);
+
+			if (ino == nm_this_node(group)) {
+				lock_buffer(bh);
+				if (!buffer_mapped(bh)) {
+					blkno = (unsigned long long) bh->b_blocknr;
+					unlock_buffer(bh);
+					brelse(bh);
+					slot->bh = getblk(disk->dev,
+							  blkno,
+							  (1 << disk->blocksize_bits));
+					goto retry_submit;
+				}
+				memset(bh->b_data, 0, bh->b_size);
+				hb_block = (hb_disk_heartbeat_block *)bh->b_data;
+				hb_block->time = CURRENT_TIME;
+				if (!hb_block->time)
+					hb_block->time = 1;
+				set_buffer_uptodate(bh);
+				clear_buffer_dirty(bh);
+				bh->b_end_io = hb_end_buffer_io_sync;
+				hb_submit_bh(WRITE, bh);
+			} else {
+				lock_buffer(bh);
+				if (!buffer_mapped(bh)) {
+					blkno = (unsigned long long) bh->b_blocknr;
+					unlock_buffer(bh);
+					brelse(bh);
+					slot->bh = getblk(disk->dev,
+							  blkno,
+							  (1 << disk->blocksize_bits));
+					goto retry_submit;
+				}
+				clear_buffer_uptodate(bh);
+				bh->b_end_io = hb_end_buffer_io_sync;
+				hb_submit_bh(READ, bh);
+			}
+			idx++;
+		}
+	
+		idx = 0;
+		while ((slot = nm_iterate_group_disk_slots(group, &idx))) {
+			bh = slot->bh;
+			node = slot->inode;
+
+			ino = nm_get_node_global_index(node);
+
+			wait_on_buffer(bh);
+			hb_block = (hb_disk_heartbeat_block *)bh->b_data;
+			if (hb_block->time != slot->last_time) {
+				if (slot->state == HB_NODE_STATE_INIT) {
+					printk("first time for this node!\n");
+					live_nodes[ino] = node;
+					slot->state = HB_NODE_STATE_UP;
+				}
+				node->i_atime = hb_block->time;
+				slot->last_time = hb_block->time;
+				slot->margin = HB_DISK_MARGIN;
+				hb_do_callbacks(HB_NODE_RESPONDED_CB, group, node, HB_TYPE_DISK);
+			} else {
+				slot->margin--;
+				printk("node %d missed.  margin=%d\n", ino, slot->margin);
+			}
+
+			if (ino != nm_this_node(group) && slot->margin <= 0) {
+				printk("node %d JUST DIED!!!!\n", ino);
+				dead_nodes[ino] = node;
+				slot->state = HB_NODE_STATE_DOWN;
+			}
+			idx++;
+		}
+
+		up(&group->i_sem);
+
+		/* Don't hold group i_sem while doing node-up/down.
+		 * Changes may need to be made to the group, so
+		 * i_sem will be needed by the callbacks... */
+		for (i=0; i<NM_MAX_NODES; i++) {
+			if (live_nodes[i])
+				ret = hb_do_node_up(group, live_nodes[i], i);
+			else if (dead_nodes[i])
+				ret = hb_do_node_down(group, dead_nodes[i], i);
+		}
+	}
+	
+	spin_lock(&hb_lock);
+	list_splice(&tmplist, &hb_disk_groups);
+	spin_unlock(&hb_lock);
+	return 0;
+}
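+
+/* Timing sketch: a remote node is declared dead after its margin runs
+ * out, i.e. after roughly HB_DISK_MARGIN consecutive passes (one every
+ * HB_THREAD_MS ms) in which its on-disk timestamp did not change. */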
+
+
+static int hb_thread(void *data)
+{
+	int status;
+	void *page;
+	
+	page = (void *) __get_free_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	util_daemonize ("hb_thread", strlen("hb_thread"), 1);
+	hb_task = current;
+
+	while (1) {
+		status = hb_do_disk_heartbeat(page);
+
+		atomic_set(&hb_thread_woken, 0);
+		status = util_wait_atomic_eq(&hb_thread_wait_queue, 
+					     &hb_thread_woken, 
+					     1, HB_THREAD_MS);
+
+		if (status == 0 || status == -ETIMEDOUT) {
+#if 0
+			if (atomic_read(&hb_thread_woken))
+				printk("aha!!! hb thread woken!\n");
+			else 
+				printk("hb thread timed out waiting, running again\n");
+#endif
+			continue;
+		}
+		printk("hb thread got %d while waiting\n", status);
+		break;
+	}
+
+	flush_scheduled_work();
+	complete (&hb_complete);
+	printk("quitting hb thread!!!!!!\n");
+	return 0;
+}
+
+
+static void hb_kick_thread(void)
+{
+	atomic_set(&hb_thread_woken, 1);
+	wake_up(&hb_thread_wait_queue);
+}
+
+/* Launch the hb thread for the mounted volume */
+static int hb_launch_thread(void)
+{
+	hb_pid = -1;
+	hb_task = NULL;
+	init_completion (&hb_complete);
+
+	printk("starting hb thread...\n");
+	hb_pid = kernel_thread (hb_thread, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+	if (hb_pid < 0) {
+		printk("unable to launch hb thread, error=%d\n", hb_pid);
+		return -EINVAL;
+	}
+	printk("hb thread running...\n");
+	return 0;
+}
+
+static void hb_complete_thread(void)
+{
+	printk ("waiting for hb thread to exit....");
+	send_sig (SIGINT, hb_task, 0);
+	wait_for_completion (&hb_complete);
+	printk ("hb thread exited\n");
+	hb_task = NULL;
+}
+
+
+
+
+
+
+
+static int hb_init_disk_hb_group(struct inode *group, kdev_t dev, u32 bits, u32 blocks, u64 start)
+{
+	int ret = -EINVAL;
+	cluster_disk *disk;
+	nm_group_inode_private *priv;
+
+	priv = group->u.generic_ip;
+	if (!priv)
+		goto leave;
+
+	if (priv->state == NM_GROUP_READY)
+		return 0;
+
+	/* hold an extra ref as long as hb keeps track of the group */
+	igrab(group);
+
+	disk = &priv->disk;
+	if (blocks > NM_MAX_NODES)
+	       blocks = NM_MAX_NODES;
+	disk->dev = dev;
+	disk->blocksize_bits = bits;
+	disk->num_blocks = blocks;
+	disk->start_block = start;
+	util_init_rarray(&disk->slots, sizeof(hb_disk_slot));
+
+	/* start allowing group additions */
+	ret = nm_make_group_ready(group);
+
+leave:
+	if (ret < 0)
+		iput(group);
+
+	return ret;
+}
+	
+
+static ssize_t write_disk(struct file *file, char *buf, size_t size)
+{
+	hb_op *data;
+	struct inode *group = NULL;
+	struct file *filp = NULL;
+	kdev_t dev;
+	int ret, tmpret;
+	nm_group_inode_private *priv;
+	u32 tmpmap[8];
+	
+	printk("write_disk\n");
+
+	if (size < sizeof(*data))
+		return -EINVAL;
+	data = (hb_op *) buf;
+	if (data->magic != HB_OP_MAGIC)
+		return -EINVAL;
+
+	switch (data->opcode)
+	{
+		case HB_OP_START_DISK_HEARTBEAT:
+			if (data->bits < 9 || data->bits > 12) {
+				ret = sprintf(buf, "%d: bad blocksize bits! %u", -EINVAL, data->bits);
+				break;
+			}
+			group = nm_get_group_by_num(data->group_num);
+			if (!group || !group->u.generic_ip) {
+				ret = sprintf(buf, "%d: bad group number! %u", -EINVAL, data->group_num);
+				break;
+			}
+			priv = group->u.generic_ip;
+			if (strncmp(priv->disk.uuid, data->disk_uuid, CLUSTER_DISK_UUID_LEN) != 0) {
+				ret = sprintf(buf, "%d: bad disk uuid!", -EINVAL);
+				break;
+			}
+			filp = fget(data->fd);
+			if (!filp) {
+				ret = sprintf(buf, "%d: bad fd!", -EINVAL);
+				break;
+			}
+			dev = filp->f_dentry->d_inode->i_rdev;
+			tmpret = hb_init_disk_hb_group(group, dev, data->bits, data->blocks, data->start);
+			if (tmpret < 0) {
+				fput(filp);
+				ret = sprintf(buf, "%d: failed to init disk heartbeat for group %u!", 
+					      -EINVAL, data->group_num);
+			} else {
+				ret = sprintf(buf, "0: disk heartbeat started for group %u!", 
+					      data->group_num);
+			}
+			break;
+
+		case HB_OP_GET_NODE_MAP:
+			group = nm_get_group_by_num(data->group_num);
+			if (!group || !group->u.generic_ip) {
+				ret = sprintf(buf, "%d: bad group number! %u", -EINVAL, data->group_num);
+				break;
+			}
+			
+			if ((ret = hb_fill_node_map(group, tmpmap, sizeof(tmpmap))) == 0) {
+				ret = sprintf(buf, "0: ");
+				buf += ret;
+				memcpy(buf, tmpmap, sizeof(tmpmap));
+				ret += sizeof(tmpmap);
+			} else {
+				ret = sprintf(buf, "%d: error occurred in hb_fill_node_map", ret);
+			}
+			break;
+
+		default:
+			ret = sprintf(buf, "%d: bad opcode! %u", -EINVAL, data->opcode);
+			break;
+	}
+
+	if (group)
+		iput(group);
+	
+	return ret;
+}
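+
+/* Usage sketch (hb_op layout inferred from the handler above, device
+ * path hypothetical): userspace starts disk heartbeat by writing an
+ * hb_op to the ".disk" file of the mounted hb filesystem, roughly:
+ *
+ *	hb_op op = {0};
+ *	op.magic     = HB_OP_MAGIC;
+ *	op.opcode    = HB_OP_START_DISK_HEARTBEAT;
+ *	op.group_num = group_num;
+ *	op.bits      = 9;		// 512-byte heartbeat blocks
+ *	op.blocks    = 255;		// one block per node slot
+ *	op.start     = hb_start_block;
+ *	op.fd        = open("/dev/sdX", O_RDWR);
+ *	memcpy(op.disk_uuid, uuid, CLUSTER_DISK_UUID_LEN);
+ *	write(hb_disk_fd, &op, sizeof(op));
+ */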
+
+
+extern struct file_operations transaction_ops;
+
+/*----------------------------------------------------------------------------*/
+/*
+ *	populating the filesystem.
+ */
+static int hb_fill_super(struct super_block * sb, void * data, int silent)
+{
+	int ret;
+	TA_write_ops *ops;
+	static struct tree_descr hb_files[] = {
+		[HB_Disk] = {".disk", &transaction_ops, S_IWUSR},
+		/* last one */ {""}
+	};
+	
+	ops = kmalloc(sizeof(TA_write_ops) + (1 * sizeof(TA_write_op *)), GFP_KERNEL);
+	if (!ops)
+		return -ENOMEM;
+
+	memset(ops, 0, sizeof(TA_write_ops) + (1 * sizeof(TA_write_op *)));
+	ops->num_ops = HB_WriteOpArraySize;
+	ops->write_op[HB_Disk] = write_disk;
+
+	printk("calling simple_fill_super...\n");
+	ret = simple_fill_super(sb, 0x5551212f, hb_files);
+	if (ret >= 0)
+		TA_GENERIC_SB_MEMBER(sb) = ops;
+	else 
+		kfree(ops);
+	return ret;
+}
+
+static struct super_block *hb_read_super (struct super_block *sb, void *data, int silent)
+{
+	printk("welcome to hb_read_super!!!\n");
+	return (hb_fill_super(sb, data, silent) < 0) ? NULL : sb;
+}
+
+
+static DECLARE_FSTYPE (hb_fs_type, "hb", hb_read_super, FS_SINGLE|FS_LITTER);
+
+
+/* TODO: make callbacks all return int */
+static void hb_nm_group_node_add_cb(void *ptr1, void *ptr2, u16 idx)
+{
+	hb_disk_slot *slot;
+	struct inode *group = ptr1;
+	struct inode *node = ptr2;
+	cluster_disk *disk;
+	nm_group_inode_private *priv;
+	int ino, ret = 0;
+	u64 block;
+
+	printk("hb_nm_group_node_add_cb: group=%lu, node=%lu, idx=%u\n",
+	       group->i_ino, node->i_ino, idx);
+
+	down(&group->i_sem);	
+	priv = group->u.generic_ip;
+	if (!priv) {
+		printk("eek! bad group inode!\n");
+		goto leave;
+	}
+	disk = &priv->disk;
+	if (disk->uuid[0]) {
+		ret = util_resize_rarray(&disk->slots, idx+1);
+		if (ret < 0) {
+			printk("eeeeeeek!!!! failed to resize disk state data\n");
+			goto leave;
+		}
+	
+		ino = nm_get_node_global_index(node);
+		if (ino >= disk->num_blocks) {
+			printk("disk heartbeat area does not have enough blocks!\n");
+			goto leave;
+		}
+		block = ino + disk->start_block;
+	
+		slot = util_rarray_idx_to_slot(&disk->slots, idx);
+		if (!slot) {
+			printk("eeeeeeek!!!! failed to get disk state data pointer: %d\n", idx);
+			goto leave;
+		}
+		slot->inode = igrab(node);
+		slot->last_time = 0;
+		slot->margin = HB_INITIAL_DISK_MARGIN;
+#warning needs to change for 2.6
+		slot->bh = getblk(disk->dev, (int)block, (1 << disk->blocksize_bits));
+		slot->state = HB_NODE_STATE_INIT;
+	} else {
+		printk("doing nothing for group add for non-disk heartbeat group\n");
+	}
+	
+leave:
+	up(&group->i_sem);
+	return;	
+}
+
+static void hb_nm_group_node_del_cb(void *ptr1, void *ptr2, u16 idx)
+{
+	hb_disk_slot *slot;
+	struct inode *group = ptr1;
+	struct inode *node = ptr2;
+	cluster_disk *disk;
+	nm_group_inode_private *priv;
+	int ret = -EINVAL;
+
+	printk("hb_nm_group_node_del_cb: group=%lu, node=%lu, idx=%u\n",
+	       group->i_ino, node->i_ino, idx);
+
+	down(&group->i_sem);
+	priv = group->u.generic_ip;
+	if (!priv) {
+		printk("eek! bad group inode!\n");
+		goto leave;
+	}
+	disk = &priv->disk;
+	slot = util_rarray_idx_to_slot(&disk->slots, idx);
+	if (!slot) {
+		printk("eeeeeeek!!!! failed to get disk state data pointer: %d\n", idx);
+		goto leave;
+	}
+	if (slot->inode!=node) {
+		printk("eeeeeeek!!!! node inode changed!\n");
+		goto leave;
+	}
+	iput(node);
+	if (slot->bh) {
+		wait_on_buffer(slot->bh);
+		brelse(slot->bh);
+	}
+	memset(slot, 0, sizeof(hb_disk_slot));
+	ret = 0;
+leave:
+
+	up(&group->i_sem);
+	printk("hb_nm_group_node_del_cb done: %d\n", ret);
+	return;
+}
+
+static void hb_nm_node_add_cb(void *ptr1, void *ptr2, u16 idx)
+{
+	//struct inode *node = ptr1;
+}
+
+static void hb_nm_group_add_cb(void *ptr1, void *ptr2, u16 idx)
+{
+	struct inode *group = ptr1;
+	nm_group_inode_private *priv;
+
+	printk("hb_nm_group_add_cb: group=%lu, idx=%u\n",
+	       group->i_ino, idx);
+	
+	priv = group->u.generic_ip;
+	if (!priv) {
+		printk("eek! bad group inode!\n");
+		return;
+	}
+
+	spin_lock(&hb_lock);
+	list_add_tail(&priv->net_list, &hb_net_groups);
+	if (priv->disk.uuid[0]) {
+		printk("adding priv=%p inode=%p to disk group list\n", priv, group);
+		list_add_tail(&priv->disk_list, &hb_disk_groups);
+	}
+	spin_unlock(&hb_lock);
+}
+
+enum {
+	HB_CB_STATE_FROZEN = 0,
+	HB_CB_STATE_READY
+};
+
+static int __init init_hb(void)
+{
+	int retval=-1, i;
+	printk("loading heartbeat module: nodename is %s\n", nm_nodename);
+
+	if (proc_mkdir("cluster/heartbeat", 0)) {
+		// ???
+	}
+
+	//hb_net_timestamps = __get_free_page(GFP_KERNEL);
+	//if (!hb_net_timestamps)
+	//	goto done;
+
+	for (i=HB_NODE_DOWN_CB; i<HB_NUM_CB; i++)
+		INIT_LIST_HEAD(&hb_callbacks[i]);
+	init_waitqueue_head(&hb_cb_wq);
+	for (i=HB_NODE_DOWN_CB; i<HB_NUM_CB; i++)
+		hb_callback_state[i] = HB_CB_STATE_READY;
+
+	if (nm_register_callback(NM_GROUP_NODE_DEL_CB, hb_nm_group_node_del_cb))
+		goto done;
+	if (nm_register_callback(NM_GROUP_NODE_ADD_CB, hb_nm_group_node_add_cb))
+		goto done;
+	if (nm_register_callback(NM_NODE_ADD_CB, hb_nm_node_add_cb))
+		goto done;
+	if (nm_register_callback(NM_GROUP_ADD_CB, hb_nm_group_add_cb))
+		goto done;
+ 
+	if (hb_launch_thread() < 0)
+		goto done;
+ 
+	retval = register_filesystem(&hb_fs_type);
+done:
+	if (retval)
+		hb_teardown();
+	return retval;
+}
+
+static void __exit exit_hb(void)
+{
+	int i;
+	spin_lock(&hb_cb_lock);
+	for (i=HB_NODE_DOWN_CB; i<HB_NUM_CB; i++) {
+		hb_wait_on_callback_state(i);
+		hb_callback_state[i] = HB_CB_STATE_FROZEN;
+	}
+	spin_unlock(&hb_cb_lock);
+
+	hb_complete_thread();
+	hb_teardown();
+	unregister_filesystem(&hb_fs_type);
+	printk("unloading heartbeat module\n");
+}
+
+static void hb_teardown(void)
+{
+	nm_unregister_callback(NM_GROUP_NODE_DEL_CB, hb_nm_group_node_del_cb);
+	nm_unregister_callback(NM_GROUP_NODE_ADD_CB, hb_nm_group_node_add_cb);
+	nm_unregister_callback(NM_NODE_ADD_CB, hb_nm_node_add_cb);
+	nm_unregister_callback(NM_GROUP_ADD_CB, hb_nm_group_add_cb);
+	remove_proc_entry("cluster/heartbeat", NULL);
+	//if (hb_net_timestamps)
+	//	kfree(hb_net_timestamps);
+}
+
+module_init(init_hb)
+module_exit(exit_hb)
+
+
+int hb_fill_node_map(struct inode *group, void *map, int size)
+{
+	hb_disk_slot *slot;
+	int idx = 0;
+	nm_group_inode_private *priv;
+	
+	priv = group->u.generic_ip;
+	if (!priv)
+		return -EINVAL;
+
+	memset(map, 0, size);
+	down(&group->i_sem);
+
+	if (priv->disk.uuid[0]) {
+		while ((slot = nm_iterate_group_disk_slots(group, &idx))) {
+			if (idx >= size * 8) {	/* size is in bytes */
+				printk("map size (%d) too small for idx (%d)\n",
+				       size, idx);
+				up(&group->i_sem);
+				return -EINVAL;
+			}
+			if (slot->state == HB_NODE_STATE_UP)
+				set_bit(idx, map);
+			idx++;
+		}
+	} else {
+		printk("filling straight from slot bitmap for non-disk heartbeat group\n");
+		memcpy(map, priv->slot_bitmap, size);
+	}
+
+	up(&group->i_sem);
+
+	return 0;
+}
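+
+/*
+ * Usage sketch (illustrative only; nothing in this commit calls it this
+ * way): snapshotting which nodes in a group are up.  The u32[8] sizing
+ * matches the slot bitmaps used throughout nodemanager.
+ *
+ *	u32 map[8];
+ *	int i;
+ *
+ *	if (hb_fill_node_map(group, map, sizeof(map)) == 0)
+ *		for (i = 0; i < NM_MAX_NODES; i++)
+ *			if (test_bit(i, (void *)map))
+ *				printk("slot %d is up\n", i);
+ */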
+		
+
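+/*
+ * Callback-state protocol: a path that needs the hb_callbacks[type]
+ * list stable (unregister, dispatch) marks the type FROZEN, drops
+ * hb_cb_lock while it works, then marks it READY and wakes hb_cb_wq.
+ * This helper must be entered with hb_cb_lock held; it returns with
+ * the lock held on both the 0 and -EINTR paths.
+ */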
+static inline int hb_wait_on_callback_state(int type)
+{
+	while (hb_callback_state[type] == HB_CB_STATE_FROZEN) {
+		spin_unlock(&hb_cb_lock);
+		atomic_set(&hb_cb_ready, 0);
+		if (util_wait_atomic_eq(&hb_cb_wq, &hb_cb_ready, 1, 0) == -EINTR) {
+			/* retake the lock so callers can unlock
+			 * unconditionally on error, as on success */
+			spin_lock(&hb_cb_lock);
+			return -EINTR;
+		}
+		spin_lock(&hb_cb_lock);
+	}
+	return 0;
+}
+
+int hb_register_callback(int type, hb_cb_func *func, void *data, int priority)
+{
+	hb_callback_func *f, *tmp;
+	struct list_head *iter;
+	int ret;
+
+	if (type < HB_NODE_DOWN_CB || type >= HB_NUM_CB)
+		return -EINVAL;
+	f = kmalloc(sizeof(hb_callback_func), GFP_KERNEL);
+	if (f == NULL)
+		return -ENOMEM;
+	memset(f, 0, sizeof(hb_callback_func));
+	f->func = func;
+	f->data = data;
+	f->priority = priority;
+
+	spin_lock(&hb_cb_lock);
+	ret = hb_wait_on_callback_state(type);
+	if (ret < 0) {
+		spin_unlock(&hb_cb_lock);
+		kfree(f);
+		return ret;
+	}
+	
+	list_for_each(iter, &hb_callbacks[type]) {
+		tmp = list_entry (iter, hb_callback_func, list);
+		if (priority < tmp->priority) {
+			list_add_tail(&f->list, iter);
+			spin_unlock(&hb_cb_lock);
+			return 0;
+		}
+	}
+	list_add_tail(&f->list, &hb_callbacks[type]);
+	spin_unlock(&hb_cb_lock);
+	return 0;
+}
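+
+/*
+ * Registration sketch (names illustrative): callbacks with lower
+ * priority values run first, and data is handed back to the function
+ * verbatim on every invocation.
+ *
+ *	static void my_node_down(struct inode *group, struct inode *node,
+ *				 int idx, void *data)
+ *	{
+ *		printk("slot %d down in group %lu\n", idx, group->i_ino);
+ *	}
+ *
+ *	hb_register_callback(HB_NODE_DOWN_CB, my_node_down, NULL, 10);
+ */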
+
+int hb_unregister_callback(int type, hb_cb_func *func, void *data)
+{
+	struct list_head *iter, *tmpiter;
+	int ret = -EINVAL;
+	hb_callback_func *f;
+
+	if (type < HB_NODE_DOWN_CB || type >= HB_NUM_CB)
+		return -EINVAL;
+
+	spin_lock(&hb_cb_lock);
+	ret = hb_wait_on_callback_state(type);
+	if (ret < 0) {
+		spin_unlock(&hb_cb_lock);
+		return ret;
+	}
+	hb_callback_state[type] = HB_CB_STATE_FROZEN;
+	spin_unlock(&hb_cb_lock);
+
+	list_for_each_safe(iter, tmpiter, &hb_callbacks[type]) {
+		f = list_entry (iter, hb_callback_func, list);
+		if (f->func == func && f->data == data) {
+			list_del(&f->list);
+			kfree(f);
+			ret = 0;
+			break;
+		}
+	}
+
+	spin_lock(&hb_cb_lock);
+	hb_callback_state[type] = HB_CB_STATE_READY;
+	atomic_set(&hb_cb_ready, 1);
+	wake_up(&hb_cb_wq);
+	spin_unlock(&hb_cb_lock);
+	return ret;
+}
+
+
+
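+/* Callbacks run with the type FROZEN rather than under hb_cb_lock, so
+ * a callback is free to block; the flip side is that a callback must
+ * not register or unregister a callback of the same type, since that
+ * would wait forever on the FROZEN state it is running under. */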
+static void hb_do_callbacks(int type, void *ptr1, void *ptr2, int idx)
+{
+	struct list_head *iter;
+	hb_callback_func *f;
+	int ret;
+	
+	spin_lock(&hb_cb_lock);
+	ret = hb_wait_on_callback_state(type);
+	if (ret < 0) {
+		spin_unlock(&hb_cb_lock);
+		printk("missed hb callback(%d) due to EINTR!\n", type);
+		return;
+	}
+	hb_callback_state[type] = HB_CB_STATE_FROZEN;
+	spin_unlock(&hb_cb_lock);
+
+	list_for_each(iter, &hb_callbacks[type]) {
+		f = list_entry (iter, hb_callback_func, list);
+		(f->func) (ptr1, ptr2, idx, f->data);
+	}
+
+	spin_lock(&hb_cb_lock);
+	hb_callback_state[type] = HB_CB_STATE_READY;
+	atomic_set(&hb_cb_ready, 1);
+	wake_up(&hb_cb_wq);
+	spin_unlock(&hb_cb_lock);
+}

Added: branches/dlm-glue/cluster/heartbeat.h
===================================================================
--- branches/dlm-glue/cluster/heartbeat.h	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/heartbeat.h	2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,129 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_HEARTBEAT_H
+#define CLUSTER_HEARTBEAT_H
+
+
+enum {
+	HB_NODE_STATE_INIT = 0,
+	HB_NODE_STATE_DOWN,
+	HB_NODE_STATE_UP
+};
+
+struct _heartbeat_ctxt
+{
+	int dummy;
+};
+
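+/* One slot per node being disk-heartbeated: bh maps the node's block in
+ * the heartbeat area, last_time is the last timestamp seen there, and
+ * margin is how many more missed beats are tolerated before the node is
+ * declared down (see the *_MARGIN defines below). */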
+typedef struct _hb_disk_slot
+{
+	struct inode *inode;
+	struct buffer_head *bh;
+	struct list_head list;
+	unsigned long last_time;
+	u16 margin;
+	u16 state;
+} hb_disk_slot;
+
+
+
+#define HB_THREAD_MS                  2000   // every 2 seconds
+
+
+#define HB_OP_MAGIC      0xf00d
+enum {
+	HB_OP_START_DISK_HEARTBEAT=371,
+	HB_OP_GET_NODE_MAP
+};
+
+typedef struct _hb_op
+{
+	u16 magic;
+	u16 opcode;
+	unsigned int fd;
+	char disk_uuid[CLUSTER_DISK_UUID_LEN+1];
+	u16 group_num;
+	u32 bits;
+	u32 blocks;
+	u64 start;
+} hb_op;
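+
+/* Sketch of a start-disk-heartbeat request as userspace might fill it
+ * in before writing it to the heartbeat transaction file (field
+ * meanings inferred from the names; values illustrative):
+ *
+ *	hb_op op;
+ *	memset(&op, 0, sizeof(op));
+ *	op.magic  = HB_OP_MAGIC;
+ *	op.opcode = HB_OP_START_DISK_HEARTBEAT;
+ *	op.fd     = disk_fd;		// open fd on the shared disk
+ *	strcpy(op.disk_uuid, uuid);	// must match the group's uuid
+ *	op.group_num = grp;
+ *	op.bits   = 9;			// block size, log2
+ *	op.blocks = 255;		// one block per possible slot
+ *	op.start  = start_block;	// first block of the hb area
+ */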
+
+enum {
+	HB_TYPE_DISK = 0,
+	HB_TYPE_NET
+};
+
+
+/* callback stuff */
+
+enum {
+	HB_NODE_DOWN_CB = 0,
+	HB_NODE_UP_CB,
+	HB_NODE_RESPONDED_CB,    // this one is very chatty
+	HB_NUM_CB
+};
+
+typedef void (hb_cb_func)(struct inode *, struct inode *, int, void *);
+
+typedef struct _hb_callback_func
+{
+	struct list_head list;
+	hb_cb_func *func;
+	void *data;
+	int priority;
+} hb_callback_func;
+
+
+enum {
+	HB_Root = 1,
+	HB_Disk,
+	HB_WriteOpArraySize
+};
+
+typedef struct _hb_disk_heartbeat_block
+{
+	u64 time;
+} hb_disk_heartbeat_block;
+
+
+// number of initial allowed misses 
+#define HB_INITIAL_DISK_MARGIN     60
+#define HB_INITIAL_NET_MARGIN      60
+
+// number of allowed misses in steady state
+#define HB_DISK_MARGIN             30
+#define HB_NET_MARGIN              30
+
+
+int hb_unregister_callback(int type, hb_cb_func *func, void *data);
+int hb_register_callback(int type, hb_cb_func *func, void *data, int priority);
+int hb_fill_node_map(struct inode *group, void *map, int size);
+
+
+
+#endif /* CLUSTER_HEARTBEAT_H */

Added: branches/dlm-glue/cluster/nodemanager.c
===================================================================
--- branches/dlm-glue/cluster/nodemanager.c	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/nodemanager.c	2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,1330 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * nodemanager.c
+ *
+ * totally lame static node management placeholder
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/module.h>
+
+#include <linux/linkage.h>
+#include <linux/time.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/net.h>
+#include <linux/unistd.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/pagemap.h>
+#include <linux/hash.h>
+
+#include <asm/uaccess.h>
+
+#include "tcp.h"
+#include "dlmmod.h"
+#include "nodemanager.h"
+#include "heartbeat.h"
+
+#include "compat_libfs.h"
+
+#ifndef __user
+#define __user
+#endif
+
+
+/*
+ * This nm module is similar to nfsd/nfsctl.c in that it uses
+ * transaction files (in /proc/cluster/nm) to communicate with
+ * the kernel module instead of ioctls or other means.
+ *
+ * Files involved:
+ *  /proc/cluster/nm/cluster - used to create/destroy the cluster, add 
+ *                             nodes/groups to the cluster, and query info
+ *                             about the cluster
+ *  /proc/cluster/nm/group   - used to add/remove nodes in a group and to
+ *                             query info about a group
+ *  /proc/cluster/nm/node    - used to change info for a node and to query
+ *                             info about a node
+ *
+ * This nm implementation basically allows this node to live in exactly one 
+ * cluster.  All "clustered" nodes that are known to this node should be
+ * added to the cluster, and all nodes should see the same list of nodes in
+ * the same order at all times.  The "slot" number given to a node in this 
+ * global cluster list is fixed and never changes.  Groups can be dynamically
+ * created within a cluster (TODO: currently static only) and be made up of 
+ * one or more nodes (listed at most once) in the global list.  A node may exist
+ * in many groups.  Also, a group may have an optional disk UUID which is simply
+ * stored for later use by the heartbeat service.  (The heartbeat service will
+ * do disk heartbeating only for those groups with valid UUIDs.)  
+ *
+ * USAGE:
+ * For our purposes, the nm service can be autoloaded by an fstab entry or manually
+ * through mount (mount -t nm none /proc/cluster/nm).  Once that is done, an init
+ * script (or single executable on an initrd) should be run to create the static
+ * cluster info, possibly from a file like /etc/nm.conf or similar.  We should 
+ * probably create a "dlm" or "everyone" group (with NO disk heartbeating) so that 
+ * the dlm service can be used with the network only.  This group should contain 
+ * all known nodes.  After this is done, the net, hb and dlm modules can come up.
+ * The nm service is now ready for use, since groups don't need to be created till 
+ * later.
+ * 
+ * A group services daemon can be written (by someone!? ;-) to run at this point.
+ * Since the "dlm" group has everything it needs for full dlmming (it uses 
+ * only the network), the dlm itself can be used to arbitrate for group creation 
+ * and for additions/deletions from groups.  Callbacks should be registered with
+ * nm by other services that care about each of these events.  For instance,
+ * heartbeat should register callbacks with nm for group creation and for
+ * addition to and deletion from a group, so that it can make any necessary
+ * changes to its heartbeating (primarily to begin/end disk heartbeat for any
+ * group/node that needs it).
+ *   
+ * NOTE NOTE NOTE !!!!:
+ * This is intended to be a quickie implementation.  (translation: lame)  I do not
+ * want to step on anyone's toes who may have implemented something wayyy better.
+ * If something out there "wins", we will plug into that instead.  If nothing really
+ * takes off, we at least have a (lame) reference to work off of.  However, since this 
+ * implementation exists solely to make ocfs2 work, and one of the major advantages
+ * of ocfs version 1 was ease of setup, we don't want to move to something 
+ * substantially more complicated than this (one conf file).
+ *
+ */ 
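+
+/*
+ * Transaction sketch (illustrative; follows the nfsctl style noted
+ * above): userspace writes a binary nm_op into a transaction file and
+ * reads the textual "<status>: <message>" reply back on the same fd.
+ *
+ *	int fd = open("/proc/cluster/nm/.cluster", O_RDWR);
+ *	nm_op op;
+ *	char reply[256];
+ *
+ *	memset(&op, 0, sizeof(op));
+ *	op.magic = NM_OP_MAGIC;
+ *	op.opcode = NM_OP_CREATE_CLUSTER;
+ *	write(fd, &op, sizeof(op));
+ *	read(fd, reply, sizeof(reply));	// e.g. "0: cluster state: UP"
+ */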
+
+
+
+/* globals */
+nm_cluster cluster;
+struct super_block *single_sb;
+char *nm_nodename;
+static spinlock_t nm_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t nm_cb_lock = SPIN_LOCK_UNLOCKED;
+struct list_head nm_callbacks[NM_NUM_CB];
+
+
+static void nm_teardown(void);
+static int nm_create_cluster(char *buf);
+static void nm_init_cluster(nm_cluster *cluster);
+int nm_create_node(char *buf, nm_op *data);
+int nm_name_cluster(char *buf, nm_op *data);
+int nm_destroy_cluster(char *buf);
+int nm_get_cluster_num_nodes(char *buf);
+int nm_get_cluster_num_groups(char *buf);
+int nm_get_node_info(char *buf, nm_op *data);
+int nm_get_group_info(char *buf, nm_op *data);
+nm_cluster *nm_get_cluster(void);
+struct inode *nm_get_group_by_name(char *node_name);
+struct inode *nm_get_node_by_name(char *node_name);
+int nm_init(dlm_ctxt *dlm);
+static void nm_do_callbacks(int type, void *ptr1, void *ptr2, u16 idx);
+
+/* support for adding files, dirs, hardlinks in /proc/cluster/nm/... */
+extern struct file_operations simple_dir_operations;
+extern struct inode_operations simple_dir_inode_operations;
+extern struct file_operations transaction_ops;
+
+static inline int nm_find_next_slot(void *bitmap, int max, int request);
+static struct dentry * nm_add_file(struct super_block *s, struct dentry *parent, struct tree_descr *file, int ino);
+static struct dentry * nm_add_link(struct super_block *s, struct dentry *parent, struct tree_descr *file, int ino);
+
+static ssize_t write_node(struct file *file, char *buf, size_t size);
+static ssize_t write_group(struct file *file, char *buf, size_t size);
+static ssize_t write_cluster(struct file *file, char *buf, size_t size);
+
+static struct inode * __nm_get_group_by_num(u16 group_num);
+static struct inode * __nm_get_node_by_num(u16 node_num);
+
+
+static u16 nm_get_group_index(struct inode *group, struct inode *inode, struct dentry **child);
+
+#define NM_HASH_BITS     7
+#define NM_HASH_SIZE     (1 << NM_HASH_BITS)
+#define NM_HASH_MASK     (NM_HASH_SIZE - 1)
+
+static struct list_head *nm_ip_hash = NULL;
+static spinlock_t nm_ip_hash_lock;
+
+static int nm_init_ip_hash(void);
+static void nm_destroy_ip_hash(void);
+
+
+static void nm_destroy_ip_hash(void)
+{
+	int i;
+	if (!nm_ip_hash)
+		return;
+	for (i=0; i<NM_HASH_SIZE; i++) {
+		/* TODO: cleanup */
+	}
+	free_page((unsigned long)nm_ip_hash);
+}
+
+static int nm_init_ip_hash(void)
+{
+	int i;
+	
+	if ((PAGE_SIZE / sizeof(struct list_head)) < NM_HASH_SIZE) {
+		printk("eek!  hash size too big for this arch!\n");
+		BUG();
+	}
+
+	nm_ip_hash = (struct list_head *) __get_free_page(GFP_KERNEL);
+	if (!nm_ip_hash)
+		return -ENOMEM;
+	for (i=0; i<NM_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&nm_ip_hash[i]);
+	spin_lock_init(&nm_ip_hash_lock);
+	return 0;
+}
+
+
+
+
+
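+/* With request == NM_INVALID_SLOT_NUM the first free slot is handed
+ * out; otherwise the call succeeds only if exactly the requested slot
+ * is free.  Returns the claimed slot (bit now set in bitmap) or -1. */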
+static inline int nm_find_next_slot(void *bitmap, int max, int request)
+{
+	int start = 0, slot_num;
+	if (request != NM_INVALID_SLOT_NUM)
+		start = request;
+	slot_num = find_next_zero_bit (bitmap, max, start);
+	if (slot_num >= max)
+		return -1;
+	if (request != NM_INVALID_SLOT_NUM && slot_num != request)
+		return -1;
+	set_bit(slot_num, bitmap);
+	return slot_num;
+}
+
+
+
+
+static struct dentry * nm_add_file(struct super_block *s, struct dentry *parent, struct tree_descr *file, int ino)
+{
+	struct qstr name;
+	struct dentry *dentry = ERR_PTR(-EINVAL);
+	struct inode *inode;
+
+	if (!file->name)
+		goto out;
+	name.name = file->name;
+	name.len = strlen(name.name);
+	printk("adding file %*s\n", name.len, name.name);
+	name.hash = full_name_hash(name.name, name.len);
+	dentry = d_alloc(parent, &name);
+	if (!dentry) {
+		dentry = ERR_PTR(-EINVAL);
+		goto out;
+	}
+	inode = new_inode(s);
+	if (!inode) {
+		dput(dentry);
+		dentry = ERR_PTR(-EINVAL);
+		goto out;
+	}
+	inode->i_mode = file->mode;
+	inode->i_uid = inode->i_gid = 0;
+	inode->i_blksize = PAGE_CACHE_SIZE;
+	inode->i_blocks = 0;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	if (file->mode & S_IFDIR) {
+		inode->i_op = &simple_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+	} else {
+		inode->i_fop = file->ops;
+	}		    
+	inode->i_ino = ino;
+	insert_inode_hash(inode);
+	d_add(dentry, inode);
+
+out:
+	return dentry;
+}
+
+
+static struct dentry * nm_add_link(struct super_block *s, struct dentry *parent, struct tree_descr *file, int ino)
+{
+	struct qstr name;
+	struct dentry *dentry = ERR_PTR(-EINVAL);
+	struct inode *inode;
+
+	if (!file->name)
+		goto out;
+	name.name = file->name;
+	name.len = strlen(name.name);
+	printk("adding link %*s\n", name.len, name.name);
+	name.hash = full_name_hash(name.name, name.len);
+	dentry = d_alloc(parent, &name);
+	if (!dentry) {
+		printk("failed to d_alloc\n");
+		dentry = ERR_PTR(-EINVAL);
+		goto out;
+	}
+	inode = iget(s, ino);
+	if (!inode) {
+		printk("failed to iget\n");
+		dput(dentry);
+		dentry = ERR_PTR(-EINVAL);
+		goto out;
+	}
+	if (!inode->u.generic_ip) {
+		printk("bad inode: %d\n", ino);
+		iput(inode);
+		dput(dentry);
+		dentry = ERR_PTR(-EINVAL);
+		goto out;
+	}
+	inode->i_nlink++;
+	d_add(dentry, inode);
+
+out:
+	return dentry;
+}
+
+
+
+
+
+/* cluster, node and group transaction files. 
+ * here's where the actual work of nm takes place. */
+
+static int nm_create_cluster(char *buf)
+{
+	int ret = -EINVAL;
+	
+	printk("create cluster...\n");
+	
+	spin_lock(&nm_lock);
+	if (cluster.state == NM_CLUSTER_UP) {
+		ret = sprintf(buf, "%d: cluster already up\n", -EINVAL);
+	} else {
+		cluster.state = NM_CLUSTER_UP;
+		ret = sprintf(buf, "0: cluster state: UP");
+	}
+	spin_unlock(&nm_lock);
+	return ret;
+}
+
+
+
+int nm_create_group(char *buf, nm_op *data)
+{
+	struct tree_descr desc;
+	struct dentry *dentry = NULL;
+	struct inode *inode = NULL;
+	int ino, group_num;
+	int ret = -EINVAL;
+	nm_group_inode_private *g = NULL;
+
+	printk("create group...\n");
+
+	data->arg_u.gc.name[NM_MAX_NAME_LEN] = '\0';
+	inode = nm_get_group_by_name(data->arg_u.gc.name);
+	if (inode) {
+		ret = sprintf(buf, "%d: group %u (%s) already exists", -EEXIST, 
+			      nm_get_group_global_index(inode), data->arg_u.gc.name);
+		iput(inode);
+		return ret;
+	}
+
+	group_num = data->arg_u.gc.group_num;
+	if (group_num > NM_INVALID_SLOT_NUM)
+		goto leave;
+
+	spin_lock(&cluster.bitmap_lock);
+	group_num = nm_find_next_slot(&(cluster.group_bitmap[0]), 255, group_num);
+	spin_unlock(&cluster.bitmap_lock);
+
+	if (group_num < 0) {
+		printk("out of group slots!\n");
+		goto leave;
+	}
+
+	ino = group_num + NM_GROUP_INODE_START;
+
+	desc.name = data->arg_u.gc.name;
+	desc.ops = NULL;
+	desc.mode = S_IFDIR | 0755;
+	dentry = nm_add_file(single_sb, single_sb->s_root, &desc, ino);
+	if (IS_ERR(dentry))
+		goto leave;
+	inode = igrab(dentry->d_inode);
+	if (!inode) {
+		printk("igrab failed!\n");
+		goto leave;
+	}
+		
+	g = kmalloc(sizeof(nm_group_inode_private), GFP_KERNEL);
+	if (!g) 
+		goto leave;
+
+	memset(g, 0, sizeof(nm_group_inode_private));
+	memcpy(g->disk.uuid, data->arg_u.gc.disk_uuid, CLUSTER_DISK_UUID_LEN);
+	spin_lock_init(&g->bitmap_lock);
+	if (g->disk.uuid[0])
+		g->state = NM_GROUP_NOT_READY;
+	else
+		g->state = NM_GROUP_READY;
+	g->inode = inode;
+	inode->u.generic_ip = g;
+
+	ret = sprintf(buf, "0: group %u (%s) added, uuid: %s", group_num,
+		      data->arg_u.gc.name, g->disk.uuid);
+	nm_do_callbacks(NM_GROUP_ADD_CB, inode, NULL, group_num);
+
+leave:
+	if (ret < 0) {
+		if (inode) {
+			if (inode->u.generic_ip)
+				kfree(inode->u.generic_ip);
+			iput(inode);
+		}
+		if (dentry)
+			dput(dentry);
+	}
+	return ret;
+}
+
+
+int nm_create_node(char *buf, nm_op *data)
+{
+	struct tree_descr desc;
+	struct dentry *dentry = NULL;
+	struct inode *inode = NULL;
+	int ino, node_num, bucket;
+	int ret = -EINVAL;
+	nm_node_inode_private *n = NULL;
+
+	printk("add cluster node ...\n");
+
+	data->arg_u.node.node_name[NM_MAX_NAME_LEN] = '\0';
+	inode = nm_get_node_by_name(data->arg_u.node.node_name);
+	if (inode) {
+		ret = sprintf(buf, "%d: node %u (%s) already exists", -EEXIST, 
+			      nm_get_node_global_index(inode), 
+			      data->arg_u.node.node_name);
+		iput(inode);
+		return ret;
+	}
+
+	node_num = data->arg_u.node.node_num;
+	if (node_num > NM_INVALID_SLOT_NUM) {
+		printk("bad node_num: %d\n", node_num);
+		goto leave;
+	}
+
+	spin_lock(&cluster.bitmap_lock);
+	node_num = nm_find_next_slot(&(cluster.node_bitmap[0]), 255, node_num);
+	spin_unlock(&cluster.bitmap_lock);
+
+	if (node_num < 0) {
+		printk("out of node slots!\n");
+		goto leave;
+	}
+
+	ino = node_num + NM_NODE_INODE_START;
+
+	desc.name = data->arg_u.node.node_name;
+	desc.ops = NULL;
+	desc.mode = S_IFREG | S_IWUSR;
+	dentry = nm_add_file(single_sb, single_sb->s_root, &desc, ino);
+	if (IS_ERR(dentry)) {
+		printk("bad dentry\n");
+		goto leave;
+	}
+	inode = igrab(dentry->d_inode);
+	if (!inode) {
+		printk("igrab failed!\n");
+		goto leave;
+	}
+		
+	n = kmalloc(sizeof(nm_node_inode_private), GFP_KERNEL);
+	if (!n) {
+		printk("could not kmalloc\n");
+		goto leave;
+	}
+	memcpy(&n->node, &data->arg_u.node, sizeof(nm_node_info));
+	INIT_LIST_HEAD(&n->ip_hash);
+	n->net.sock = NULL;
+	INIT_LIST_HEAD(&n->net.list);
+	spin_lock_init(&n->net.sock_lock);
+	n->net.flags = 0;
+
+	/* hash on first ip address */
+	spin_lock(&nm_ip_hash_lock);
+	bucket = hash_long(n->node.ifaces[0].addr_u.ip_addr4, NM_HASH_BITS);
+	list_add_tail(&n->ip_hash, &nm_ip_hash[bucket]);
+	spin_unlock(&nm_ip_hash_lock);
+	printk("hashed ip %d.%d.%d.%d to bucket %d\n", NIPQUAD(n->node.ifaces[0].addr_u.ip_addr4), bucket);
+	n->inode = inode;
+	inode->u.generic_ip = n;
+
+	ret = sprintf(buf, "0: node %u (%s) added", node_num, n->node.node_name);
+	nm_do_callbacks(NM_NODE_ADD_CB, inode, NULL, node_num);
+
+leave:
+	if (ret < 0) {
+		if (inode) {
+			if (inode->u.generic_ip)
+				kfree(inode->u.generic_ip);
+			iput(inode);
+		}
+		if (dentry)
+			dput(dentry);
+	}
+	return ret;
+}
+
+int nm_make_group_ready(struct inode *group)
+{
+	nm_group_inode_private *g = group->u.generic_ip;
+	if (!g)
+		return -EINVAL;
+	g->state = NM_GROUP_READY;
+	return 0;
+}
+
+int nm_add_node_to_group(char *buf, nm_op *data)
+{
+	struct tree_descr desc;
+	struct inode *inode = NULL;
+	struct dentry *dentry = NULL, *child = NULL;
+	nm_group_inode_private *g = NULL;
+	int group_num, slot_num;
+	int ret = -EINVAL;
+	u16 ino;
+	char tmpname[6];
+
+	printk("add node to group...\n");
+
+	group_num = data->arg_u.gc.group_num;
+	ino = data->arg_u.gc.node_num;
+	slot_num = data->arg_u.gc.slot_num;
+
+	/* request a certain slot, or NM_INVALID_SLOT_NUM for any slot */
+	if (slot_num > NM_INVALID_SLOT_NUM)
+		goto leave;
+	
+	if (ino >= NM_INVALID_SLOT_NUM || group_num >= NM_INVALID_SLOT_NUM)
+		goto leave;
+
+	inode = __nm_get_group_by_num(group_num);
+	if (!inode)
+		goto leave;
+	if (list_empty(&inode->i_dentry))
+		goto leave;
+	dentry = dget(list_entry(inode->i_dentry.next, struct dentry, d_alias));
+	if (!dentry)
+		goto leave;
+	g = inode->u.generic_ip;
+	if (!g)
+		goto leave;
+
+	if (g->state == NM_GROUP_NOT_READY) {
+		ret = sprintf(buf, "%d: group disk has not been discovered.  cannot add nodes.", -EROFS);
+		goto leave;
+	}
+
+	spin_lock(&g->bitmap_lock);
+	slot_num = nm_find_next_slot(&(g->slot_bitmap[0]), 255, slot_num);
+	spin_unlock(&g->bitmap_lock);
+	if (slot_num < 0)
+		goto leave;
+
+	/* create hardlink to ino with name "slot_num" */
+	sprintf(tmpname, "%03u", slot_num);
+	desc.name = &(tmpname[0]);
+	desc.ops = NULL;
+	desc.mode = 0;
+	child = nm_add_link(single_sb, dentry, &desc, 
+			    NM_NODE_INODE_START+ino);
+	if (IS_ERR(child)) {
+		printk("error adding link for %s\n", tmpname);
+		child = NULL;
+		goto leave;
+	}
+
+	ret = sprintf(buf, "0: node %u added to group: %*s", 
+		      ino, dentry->d_name.len, dentry->d_name.name);
+
+	if (!igrab(child->d_inode))
+		goto leave;
+	nm_do_callbacks(NM_GROUP_NODE_ADD_CB, inode, child->d_inode, slot_num);
+	iput(child->d_inode);
+
+leave:
+	if (dentry)
+		dput(dentry);
+	if (child)
+		dput(child);
+	if (inode)
+		iput(inode);
+	return ret;
+}
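+
+/*
+ * The resulting membership lives entirely in the dentry tree.  With a
+ * group "webfarm" and a node "node7" in slot 3 (names illustrative),
+ * the namespace looks like:
+ *
+ *	/proc/cluster/nm/node7        the node's inode
+ *	/proc/cluster/nm/webfarm/003  hardlink to that same inode
+ *
+ * so listing a group is a readdir, and looking up a slot is just a
+ * lookup of its "%03u" name (see nm_get_group_node_by_index below).
+ */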
+
+
+int nm_remove_node_from_group(struct inode *group, struct inode *node)
+{
+	struct dentry *child = NULL;
+	nm_group_inode_private *g = NULL;
+	int slot_num;
+	int ret = -EINVAL;
+
+	printk("remove node from group...\n");
+
+	slot_num = nm_get_group_index(group, node, &child);
+
+	if (slot_num == NM_MAX_NODES || !child)
+		goto leave;
+
+	g = group->u.generic_ip;
+	if (!g)
+		goto leave;
+		
+	printk("killing the dentry now!!\n");
+	down(&group->i_zombie);
+	node->i_nlink--;
+	d_delete(child);
+	up(&group->i_zombie);
+	printk("done killing the dentry!!\n");
+
+
+	if (!igrab(node))
+		goto leave;
+	nm_do_callbacks(NM_GROUP_NODE_DEL_CB, group, node, slot_num);
+	iput(node);
+	
+	spin_lock(&g->bitmap_lock);
+	clear_bit(slot_num, (void *)(&g->slot_bitmap[0]));
+	spin_unlock(&g->bitmap_lock);
+
+	ret = 0;
+
+leave:
+	if (child)
+		dput(child);
+	return ret;
+}
+
+
+
+int nm_name_cluster(char *buf, nm_op *data)
+{
+	int ret = -EINVAL;
+
+	printk("name cluster...\n");
+	spin_lock(&nm_lock);
+	if (cluster.state == NM_CLUSTER_UP) {
+		ret = sprintf(buf, "%d: cluster name could not be set.  cluster already up.", -EINVAL);
+		goto leave;
+	}
+	memset(cluster.name, 0, NM_MAX_NAME_LEN+1);
+	memcpy(cluster.name, data->arg_u.name, NM_MAX_NAME_LEN);
+	ret = sprintf(buf, "0: cluster name set: %s", cluster.name);
+leave:
+	spin_unlock(&nm_lock);
+	return ret;
+}
+
+int nm_destroy_cluster(char *buf)
+{
+	int ret;
+	printk("destroy cluster...\n");
+
+	/* TODO */
+	spin_lock(&nm_lock);
+	nm_init_cluster(&cluster);
+	ret = sprintf(buf, "0: rudely destroyed cluster!!!");
+	spin_unlock(&nm_lock);
+	return ret;
+}
+
+int nm_get_cluster_num_nodes(char *buf)
+{
+	int num_nodes=0, i;
+	
+	printk("get cluster num nodes...\n");
+
+	spin_lock(&cluster.bitmap_lock);
+	for (i=0; i<8; i++)
+		num_nodes += hweight32(cluster.node_bitmap[i]);
+	spin_unlock(&cluster.bitmap_lock);
+
+	return sprintf(buf, "0: %d", num_nodes);
+}
+
+int nm_get_cluster_num_groups(char *buf)
+{
+	int num_groups=0, i;
+	
+	printk("get cluster num groups...\n");
+
+	spin_lock(&cluster.bitmap_lock);
+	for (i=0; i<8; i++)
+		num_groups += hweight32(cluster.group_bitmap[i]);
+	spin_unlock(&cluster.bitmap_lock);
+
+	return sprintf(buf, "0: %d", num_groups);
+}
+
+int nm_get_group_num_nodes(struct inode *group)
+{
+	int num_nodes=0, i;
+	nm_group_inode_private *g;
+	
+	printk("get group num nodes...\n");
+	
+	g = group->u.generic_ip;
+	if (!g)
+		return -EINVAL;
+
+	spin_lock(&g->bitmap_lock);
+	for (i=0; i<8; i++)
+		num_nodes += hweight32(g->slot_bitmap[i]);
+	spin_unlock(&g->bitmap_lock);
+
+	return num_nodes;
+}
+
+int nm_get_group_max_slots(struct inode *group)
+{
+	int last=0, i;
+	nm_group_inode_private *g;
+	
+	printk("get group num nodes...\n");
+	
+	g = group->u.generic_ip;
+	if (!g)
+		return -EINVAL;
+
+#warning need to change this for 64 bit 
+	spin_lock(&g->bitmap_lock);
+	for (i=7; i>=0; i--) {
+		if (g->slot_bitmap[i]) {
+			last = fls(g->slot_bitmap[i]);
+			/* each earlier word holds 8*sizeof() == 32 slots;
+			 * sizeof() alone would count bytes, not bits */
+			last += (i * sizeof(g->slot_bitmap[i]) * 8);
+			break;
+		}
+	}
+	spin_unlock(&g->bitmap_lock);
+
+	return last;
+}
+
+void * nm_iterate_group_disk_slots(struct inode *group, int *idx)
+{
+	nm_group_inode_private *priv;
+	int next;
+
+	if (*idx >= 255)
+		return NULL;
+	priv = group->u.generic_ip;
+	if (!priv)
+		return NULL;
+	next = find_next_bit(priv->slot_bitmap, 255, *idx);
+	if (next >= 255)
+		return NULL;
+	*idx = next;
+	return util_rarray_idx_to_slot(&priv->disk.slots, next);
+}
+
+int nm_get_node_info(char *buf, nm_op *data)
+{
+	int ret, tmpret, i;
+	nm_node_inode_private *priv;
+	nm_network_iface *n;
+	struct inode *inode = NULL;
+	struct dentry *dentry;
+	u16 node_num;
+	u16 vers;
+
+	ret = -EINVAL;
+	node_num = data->arg_u.index;
+	inode = __nm_get_node_by_num(node_num);
+	if (inode) {
+		dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
+		priv = inode->u.generic_ip;
+		ret = sprintf(buf, "0: global_index=%u\n"
+			           "name=%*s\n",
+				priv->node.node_num, dentry->d_name.len, 
+				dentry->d_name.name);
+		buf += ret;
+		for (i=0; i<NM_MAX_IFACES; i++) {
+			n = &priv->node.ifaces[i];
+			vers = ntohs(n->ip_version);
+			printk("ip_version=%u, vers=%u\n", n->ip_version, vers);
+			if (vers!=4 && vers!=6)
+				continue;
+			/* TODO: how to print ipv6? */
+			tmpret = sprintf(buf, "iface%d.port=%u\n"
+				            "iface%d.version=%d\n"
+					    "iface%d.addr=%d.%d.%d.%d\n",
+				      i, ntohs(n->ip_port), i, vers, i,
+				      NIPQUAD(n->addr_u.ip_addr4));
+			buf += tmpret;
+			ret += tmpret;
+		}
+		iput(inode);
+	}
+	return ret;
+}
+
+int nm_get_group_info(char *buf, nm_op *data)
+{
+	int ret, tmpret;
+	nm_group_inode_private *g = NULL;
+	struct inode *inode = NULL;
+	u16 group_num;
+	struct dentry *dentry, *child;
+
+	ret = -EINVAL;
+	group_num = data->arg_u.index;
+	inode = __nm_get_group_by_num(group_num);
+	if (inode) {
+		g = inode->u.generic_ip;
+		dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
+		ret = sprintf(buf, "0: group_num=%u\n"
+		        	   "name=%*s\n"
+				   "disk_uuid=%s\n",
+			      group_num, dentry->d_name.len, 
+			      dentry->d_name.name, g->disk.uuid);
+		buf += ret;
+
+		spin_lock(&dcache_lock);
+		list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+			tmpret = sprintf(buf, "%*s\n", child->d_name.len, 
+					 child->d_name.name);
+			buf += tmpret;
+			ret += tmpret;
+		}
+		spin_unlock(&dcache_lock);
+		iput(inode);
+	}
+	return ret;
+}
+
+	
+
+static ssize_t write_cluster(struct file *file, char *buf, size_t size)
+{
+	nm_op *data;
+	int ret;
+	u16 me;
+	
+	printk("write_cluster\n");
+
+	if (size < sizeof(*data))
+		return -EINVAL;
+	data = (nm_op *) buf;
+	if (data->magic != NM_OP_MAGIC)
+		return -EINVAL;
+
+	switch (data->opcode) {
+		case NM_OP_CREATE_CLUSTER:
+			ret = nm_create_cluster(buf);
+			break;
+		case NM_OP_CREATE_GROUP:
+			ret = nm_create_group(buf, data);
+			break;
+		case NM_OP_NAME_CLUSTER:
+			ret = nm_name_cluster(buf, data);
+			break;
+		case NM_OP_DESTROY_CLUSTER:
+			ret = nm_destroy_cluster(buf);
+			break;
+		case NM_OP_ADD_CLUSTER_NODE:
+			ret = nm_create_node(buf, data);
+			break;
+		case NM_OP_GET_CLUSTER_NUM_NODES:
+			ret = nm_get_cluster_num_nodes(buf);
+			break;
+		case NM_OP_GET_GLOBAL_NODE_NUM:
+			ret = 0;
+			me = nm_this_node(NULL);
+			if (me >= NM_MAX_NODES)
+				ret = -EINVAL;
+			ret = sprintf(buf, "%d: %u", ret, me);
+			break;
+		default:
+			ret = sprintf(buf, "%d: bad opcode: %u", -EINVAL, data->opcode);
+			break;
+	}
+	printk("leaving!\n");
+	return ret;
+}
+
+static ssize_t write_node(struct file *file, char *buf, size_t size)
+{
+	nm_op *data;
+	int ret;
+	
+	printk("write_node\n");
+
+	if (size < sizeof(*data))
+		return -EINVAL;
+	data = (nm_op *) buf;
+	if (data->magic != NM_OP_MAGIC)
+		return -EINVAL;
+
+	switch (data->opcode) {
+		case NM_OP_GET_NODE_INFO:
+			ret = nm_get_node_info(buf, data);
+			break;
+		default:
+			ret = sprintf(buf, "%d: bad opcode: %u", -EINVAL, data->opcode);
+			break;
+	}
+	printk("leaving!\n");
+	return ret;
+}
+
+static ssize_t write_group(struct file *file, char *buf, size_t size)
+{
+	nm_op *data;
+	int ret;
+	
+	printk("write_group\n");
+
+	if (size < sizeof(*data))
+		return -EINVAL;
+	data = (nm_op *) buf;
+	if (data->magic != NM_OP_MAGIC)
+		return -EINVAL;
+
+	printk("opcode is %u, add_group is %u\n", data->opcode, NM_OP_ADD_GROUP_NODE);
+	switch (data->opcode) {
+		case NM_OP_GET_GROUP_INFO:
+			ret = nm_get_group_info(buf, data);
+			break;
+
+		case NM_OP_ADD_GROUP_NODE:
+			ret = nm_add_node_to_group(buf, data);
+			break;
+
+		default:
+			ret = sprintf(buf, "%d: bad opcode: %u", -EINVAL, data->opcode);
+			break;
+	}
+	printk("leaving!\n");
+	return ret;
+}
+
+
+
+static struct inode * __nm_get_group_by_num(u16 group_num)
+{
+	struct inode *inode = iget(single_sb, group_num + NM_GROUP_INODE_START);
+	if (!inode)
+		return NULL;
+	if (!inode->u.generic_ip) {
+		iput(inode);
+		return NULL;
+	}
+	return inode;
+}	
+
+static struct inode * __nm_get_node_by_num(u16 node_num)
+{
+	struct inode *inode = iget(single_sb, node_num + NM_NODE_INODE_START);
+	if (!inode)
+		return NULL;
+	if (!inode->u.generic_ip) {
+		iput(inode);
+		return NULL;
+	}
+	return inode;
+}
+
+/* ipv4 only for now... */
+struct inode * nm_get_node_by_ip(u32 addr)
+{
+	int bucket;
+	struct list_head *iter;
+	nm_node_inode_private *priv;
+	struct inode *ret = NULL;
+	
+	bucket = hash_long(addr, NM_HASH_BITS);
+
+	spin_lock(&nm_ip_hash_lock);
+	list_for_each(iter, &nm_ip_hash[bucket]) {
+		priv = list_entry(iter, nm_node_inode_private, ip_hash);
+		if (priv->node.ifaces[0].addr_u.ip_addr4 == addr) {
+			ret = igrab(priv->inode);
+			break;
+		}
+		    
+	}
+	spin_unlock(&nm_ip_hash_lock);
+	return ret;
+}
+
+
+struct inode * nm_get_group_by_num(u16 group_num)
+{
+	struct inode *inode;
+	spin_lock(&nm_lock);
+	inode = __nm_get_group_by_num(group_num);
+	spin_unlock(&nm_lock);
+	return inode;
+}
+
+nm_cluster * nm_get_cluster(void)
+{
+	return &cluster;
+}
+
+struct inode * nm_get_node_by_num(u16 node_num)
+{
+	struct inode *inode;
+	spin_lock(&nm_lock);
+	inode = __nm_get_node_by_num(node_num);
+	spin_unlock(&nm_lock);
+	return inode;
+}
+
+struct inode * nm_get_group_node_by_index(struct inode *group, u16 index)
+{
+	struct dentry *dentry = NULL, *parent;
+	struct inode *inode = NULL;
+	char tmpname[6];
+
+	if (list_empty(&group->i_dentry))
+		return NULL;
+	parent = dget(list_entry(group->i_dentry.next, struct dentry, d_alias));
+	if (!parent)
+		return NULL;
+	
+	sprintf(tmpname, "%03u", index);
+	dentry = lookup_one_len(tmpname, parent, strlen(tmpname));
+	if (!IS_ERR(dentry)) {
+		inode = dentry->d_inode;
+		if (inode) {
+			/* igrab returns NULL on a dying inode */
+			inode = igrab(inode);
+			if (inode &&
+			    (!inode->u.generic_ip || !S_ISREG (inode->i_mode))) {
+				printk("bad inode!\n");
+				iput(inode);
+				inode = NULL;
+			}
+		}
+		if (!inode)
+			dput(dentry);
+	}
+	dput(parent);
+	return inode;
+}
+
+
+struct inode * __nm_get_node_by_name(char *node_name, int dir)
+{
+	struct dentry *dentry = NULL;
+	struct inode *inode = NULL;
+	
+	dentry = lookup_one_len(node_name, single_sb->s_root, strlen(node_name));
+	if (!IS_ERR(dentry)) {
+		inode = dentry->d_inode;
+		if (inode) {
+			/* igrab returns NULL on a dying inode */
+			inode = igrab(inode);
+			if (inode &&
+			    (!inode->u.generic_ip ||
+			     (dir && !S_ISDIR (inode->i_mode)) ||
+			     (!dir && !S_ISREG (inode->i_mode)))) {
+				printk("bad inode!\n");
+				iput(inode);
+				inode = NULL;
+			}
+		}
+	}
+	return inode;
+}
+
+
+/* 
+ * if group is NULL: return the global index for this node
+ * if group is non NULL: return the index within the group of this node
+ *
+ * NOTE: currently getting the group index is slow
+ *       will need to change this somehow
+ */
+u16 nm_this_node(struct inode *group)
+{
+	struct inode *inode = NULL;
+	struct dentry *child = NULL;
+	u16 node_num = NM_MAX_NODES;
+
+	inode = nm_get_node_by_name(nm_nodename);
+	if (inode && inode->u.generic_ip) {
+		if (group)
+			node_num = nm_get_group_index(group, inode, &child);
+		else 
+			node_num = nm_get_node_global_index(inode);
+
+	}
+	iput(inode);
+	dput(child);
+	//printk("for group=%p, this node is %u\n", group, node_num);
+	return node_num;
+}
+
+/* slow */
+static u16 nm_get_group_index(struct inode *group, struct inode *inode, struct dentry **child)
+{
+	struct dentry *tmp = NULL, *parent = NULL;
+	u16 slot_num = NM_MAX_NODES;
+	struct list_head *iter;
+	char tmpname[6];
+	char *err;
+
+	*child = NULL;
+	parent = NULL;
+	if (list_empty(&group->i_dentry))
+		goto leave;
+	parent = dget(list_entry(group->i_dentry.next, struct dentry, d_alias));
+	if (!parent)
+		goto leave;
+		
+	spin_lock(&dcache_lock);
+	list_for_each(iter, &parent->d_subdirs) {
+		tmp = list_entry(iter, struct dentry, d_child);
+		if (tmp->d_inode == inode)
+			break;
+		tmp = NULL;
+	}
+	if (tmp)
+		dget_locked(tmp);
+	spin_unlock(&dcache_lock);
+
+	if (!tmp || tmp->d_name.len > 3)
+		goto leave;
+	strncpy(tmpname, tmp->d_name.name, tmp->d_name.len);
+	tmpname[tmp->d_name.len] = '\0';
+	err=NULL;
+	slot_num = simple_strtoul(tmpname, &err, 10);
+	
+	if (*err != '\0')
+		slot_num = NM_MAX_NODES;  // error
+	else
+		*child = dget(tmp);  // done, get extra ref for child
+		
+leave:
+	dput(parent);
+	dput(tmp);
+
+	return slot_num;
+}
+
+int nm_init(dlm_ctxt *dlm)
+{
+	return 0;
+}
+
+int nm_register_callback(int type, void (*func)(void *, void *, u16))
+{
+	nm_callback_func *f;
+
+	if (type < NM_NODE_ADD_CB || type > NM_GROUP_NODE_DEL_CB)
+		return -EINVAL;
+	f = kmalloc(sizeof(nm_callback_func), GFP_KERNEL);
+	if (f == NULL)
+		return -ENOMEM;
+	memset(f, 0, sizeof(nm_callback_func));
+	f->func = func;
+	spin_lock(&nm_cb_lock);
+	list_add_tail(&f->list, &nm_callbacks[type]);
+	spin_unlock(&nm_cb_lock);
+	return 0;
+}
+
+#warning need to change nm callbacks to be like hb callbacks... no locks when calling.
+int nm_unregister_callback(int type, void (*func)(void *, void *, u16))
+{
+	struct list_head *iter, *tmpiter;
+	int ret = -EINVAL;
+	nm_callback_func *f;
+
+	if (type < NM_NODE_ADD_CB || type > NM_GROUP_NODE_DEL_CB)
+		return ret;
+
+	spin_lock(&nm_cb_lock);
+	list_for_each_safe(iter, tmpiter, &nm_callbacks[type]) {
+		f = list_entry (iter, nm_callback_func, list);
+		if (f->func == func) {
+			list_del(&f->list);
+			kfree(f);
+			ret = 0;
+			break;
+		}
+	}
+	spin_unlock(&nm_cb_lock);
+	return ret;
+}
+
+
+
+static void nm_do_callbacks(int type, void *ptr1, void *ptr2, u16 idx)
+{
+	struct list_head *iter;
+	nm_callback_func *f;
+	
+	spin_lock(&nm_cb_lock);
+	list_for_each(iter, &nm_callbacks[type]) {
+		f = list_entry (iter, nm_callback_func, list);
+		(f->func) (ptr1, ptr2, idx);
+	}
+	spin_unlock(&nm_cb_lock);
+}
+
+
+static void nm_teardown(void)
+{
+	remove_proc_entry("cluster/nm", NULL);
+	remove_proc_entry("cluster", NULL);
+}
+
+static void nm_init_cluster(nm_cluster *cluster)
+{
+	int i;
+	memset(cluster, 0, sizeof(nm_cluster));
+	cluster->state = NM_CLUSTER_DOWN;
+	spin_lock_init(&cluster->bitmap_lock);
+	
+	for (i=NM_NODE_ADD_CB; i<=NM_GROUP_NODE_DEL_CB; i++)
+		INIT_LIST_HEAD(&nm_callbacks[i]);
+}
+
+
+
+
+
+/*----------------------------------------------------------------------------*/
+/*
+ *	populating the filesystem.
+ */
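+/* The index of each nm_files entry below is also its inode number in
+ * this little fs (matching the NM_Root..NM_Group enum in
+ * nodemanager.h), and the write_op table is indexed the same way when
+ * a transaction file is written. */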
+static int nm_fill_super(struct super_block * sb, void * data, int silent)
+{
+	int ret, sz;
+	TA_write_ops *ops;
+	static struct tree_descr nm_files[] = {
+		[NM_Cluster] = {".cluster", &transaction_ops, S_IWUSR},
+		[NM_Node] = {".node", &transaction_ops, S_IWUSR},
+		[NM_Group] = {".group", &transaction_ops, S_IWUSR},
+		/* last one */ {""}
+	};
+	
+	sz = sizeof(nm_files) / sizeof(struct tree_descr);
+	ops = kmalloc(sizeof(TA_write_ops) + (sz * sizeof(TA_write_op *)), GFP_KERNEL);
+	if (!ops)
+		return -ENOMEM;
+
+	memset(ops, 0, sizeof(TA_write_ops) + (sz * sizeof(TA_write_op *)));
+	ops->num_ops = sz;
+	ops->write_op[NM_Cluster] = write_cluster;
+	ops->write_op[NM_Node] = write_node;
+	ops->write_op[NM_Group] = write_group;
+
+	single_sb = NULL;
+	printk("calling simple_fill_super...\n");
+	ret = simple_fill_super(sb, 0x98675309, nm_files);
+	if (ret >= 0) {
+		TA_GENERIC_SB_MEMBER(sb) = ops;
+		single_sb = sb;
+	} else {
+		kfree(ops);
+	}
+	return ret;
+}
+
+static struct super_block *nm_read_super (struct super_block *sb, void *data, int silent)
+{
+	printk("welcome to nm_read_super!!!\n");
+	return (nm_fill_super(sb, data, silent) < 0) ? NULL : sb;
+}
+
+
+static DECLARE_FSTYPE (nm_fs_type, "nm", nm_read_super, FS_SINGLE|FS_LITTER);
+
+static int __init init_nm(void)
+{
+	int retval;
+	nm_nodename = kmalloc(strlen(system_utsname.nodename) + 1, GFP_KERNEL);
+	if (nm_nodename==NULL) {
+		printk("could not allocate a few bytes for nodename!\n");
+		return -ENOMEM;
+	}
+	strcpy(nm_nodename, system_utsname.nodename);
+	printk("loading nm module: nodename is %s\n", nm_nodename);
+
+	if (nm_init_ip_hash() < 0) {
+		printk("failed to allocate node IP hash\n");
+		kfree(nm_nodename);
+		return -ENOMEM;
+	}
+
+	nm_init_cluster(&cluster);
+
+	if (proc_mkdir("cluster", NULL)) {
+		if (!proc_mkdir("cluster/nm", NULL))
+			printk("failed to create /proc/cluster/nm\n");
+	} else {
+		printk("failed to create /proc/cluster\n");
+	}
+	printk("calling register_filesystem\n");
+	retval = register_filesystem(&nm_fs_type);
+	printk("done calling register_filesystem: ret=%d\n", retval);
+	if (retval)
+		nm_teardown();
+	return retval;
+}
+
+static void __exit exit_nm(void)
+{
+	nm_teardown();
+	unregister_filesystem(&nm_fs_type);
+	nm_destroy_ip_hash();
+	kfree(nm_nodename);
+	printk("unloading nm module\n");
+}
+
+
+
+
+MODULE_LICENSE("GPL");
+module_init(init_nm)
+module_exit(exit_nm)

Added: branches/dlm-glue/cluster/nodemanager.h
===================================================================
--- branches/dlm-glue/cluster/nodemanager.h	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/nodemanager.h	2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,252 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * nodemanager.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_NODEMANAGER_H
+#define CLUSTER_NODEMANAGER_H
+
+
+
+struct _nm_ctxt
+{
+	int dummy;
+};
+
+#define NM_MAX_IFACES            2
+#define NM_MAX_NODES             255
+#define NM_INVALID_SLOT_NUM      255
+
+/* host name, group name, cluster name all 64 bytes */
+#define NM_MAX_NAME_LEN          64    // __NEW_UTS_LEN
+
+
+#define NM_GROUP_INODE_START    200000
+#define NM_NODE_INODE_START     100000
+
+enum {
+	NM_CLUSTER_DOWN=0,
+	NM_CLUSTER_UP
+};
+
+enum {
+	NM_GROUP_NOT_READY=0,
+	NM_GROUP_READY
+};
+
+enum {
+	NM_Root = 1,
+	NM_Cluster,
+	NM_Node,
+	NM_Group,
+};
+
+
+
+
+typedef struct _nm_network_iface
+{
+	u16 ip_port;			/* for simplicity, just define exactly one port for this if */
+	u16 ip_version;
+	union {
+		u32 ip_addr4;		/* IPv4 address in NBO */
+		u32 ip_addr6[4];	/* IPv6 address in NBO */
+	} addr_u;
+} nm_network_iface;
+
+typedef struct _nm_node_info 
+{
+	u16 node_num;
+	char node_name[NM_MAX_NAME_LEN+1];
+	nm_network_iface ifaces[NM_MAX_IFACES];
+} nm_node_info;
+
+
+typedef struct _nm_cluster
+{
+	char name[NM_MAX_NAME_LEN+1];
+	int state;
+	spinlock_t bitmap_lock;
+	u32 group_bitmap[8];
+	u32 node_bitmap[8];
+} nm_cluster;
+
+
+typedef struct _nm_group_inode_private
+{
+	struct inode *inode;
+	struct list_head net_list;
+	struct list_head disk_list;
+	cluster_disk disk;
+	int state;
+	spinlock_t bitmap_lock;
+	u32 slot_bitmap[8];
+} nm_group_inode_private;
+
+#ifdef __KERNEL__
+/* TODO: move this */
+#define NET_FLAG_CREATING_SOCKET   0x00000001
+typedef struct _net_inode_private
+{
+	struct socket *sock;
+	wait_queue_t sleep;
+	spinlock_t sock_lock;
+	struct list_head handlers;
+	struct list_head list;
+	int flags;
+} net_inode_private;
+
+typedef struct _nm_node_inode_private
+{
+	struct inode *inode;
+	nm_node_info node;
+	struct list_head ip_hash;
+	net_inode_private net;
+} nm_node_inode_private;
+#endif
+
+/* transaction file nm_op stuff */
+
+#define NM_OP_MAGIC      0xbeaf
+enum {
+	NM_OP_CREATE_CLUSTER=123,
+	NM_OP_DESTROY_CLUSTER,
+	NM_OP_NAME_CLUSTER,
+	NM_OP_ADD_CLUSTER_NODE,
+	NM_OP_GET_CLUSTER_NUM_NODES,
+	NM_OP_GET_NODE_INFO,
+	NM_OP_CREATE_GROUP,
+	NM_OP_GET_GROUP_INFO,
+	NM_OP_ADD_GROUP_NODE,
+	NM_OP_GET_GLOBAL_NODE_NUM
+};
+
+typedef struct _nm_group_change
+{
+	u16 group_num;
+	u16 node_num;
+	u16 slot_num;
+	char disk_uuid[CLUSTER_DISK_UUID_LEN+1];
+	char name[NM_MAX_NAME_LEN+1];
+} nm_group_change;
+
+typedef struct _nm_op
+{
+	u16 magic;
+	u16 opcode;
+	union {
+		u16 index;
+		char name[NM_MAX_NAME_LEN+1];
+		nm_node_info node;
+		nm_group_change gc;
+	} arg_u;
+} nm_op;
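+
+/* Example (illustrative): the request a tool would write to the .group
+ * transaction file to put global node 7 into group 2, any free slot:
+ *
+ *	nm_op op;
+ *	memset(&op, 0, sizeof(op));
+ *	op.magic = NM_OP_MAGIC;
+ *	op.opcode = NM_OP_ADD_GROUP_NODE;
+ *	op.arg_u.gc.group_num = 2;
+ *	op.arg_u.gc.node_num = 7;
+ *	op.arg_u.gc.slot_num = NM_INVALID_SLOT_NUM;	// any free slot
+ */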
+
+
+/* callback stuff */
+
+enum {
+	NM_NODE_ADD_CB = 0,
+	NM_NODE_DEL_CB,
+	NM_GROUP_ADD_CB,
+	NM_GROUP_DEL_CB,
+	NM_GROUP_NODE_ADD_CB,
+	NM_GROUP_NODE_DEL_CB,
+	NM_NUM_CB
+};
+
+typedef void (nm_cb_func)(void *, void *, u16);
+
+typedef struct _nm_callback_func
+{
+	struct list_head list;
+	nm_cb_func *func;
+	//void (*func)(void *, void *, u16);
+} nm_callback_func;
+
+
+
+
+u16 nm_this_node(struct inode *group);
+int nm_init(struct _dlm_ctxt *dlm);
+nm_cluster * nm_get_cluster(void);
+int nm_register_callback(int type, void (*func)(void *, void *, u16));
+int nm_unregister_callback(int type, void (*func)(void *, void *, u16));
+int nm_get_group_num_nodes(struct inode *group);
+int nm_get_group_max_slots(struct inode *group);
+int nm_make_group_ready(struct inode *group);
+void * nm_iterate_group_disk_slots(struct inode *group, int *idx);
+int nm_remove_node_from_group(struct inode *group, struct inode *node);
+int nm_create_group(char *buf, nm_op *data);
+int nm_add_node_to_group(char *buf, nm_op *data);
+
+#ifdef __KERNEL__
+
+
+struct inode * nm_get_group_by_num(u16 group_num);
+struct inode * nm_get_node_by_num(u16 node_num);
+struct inode * __nm_get_node_by_name(char *node_name, int dir);
+struct inode * nm_get_node_by_ip(u32 addr);
+struct inode * nm_get_group_node_by_index(struct inode *group, u16 index);
+
+static inline struct inode * nm_get_node_by_name(char *node_name)
+{
+	return __nm_get_node_by_name(node_name, 0);
+}
+static inline struct inode * nm_get_group_by_name(char *group_name)
+{
+	return __nm_get_node_by_name(group_name, 1);
+}
+
+
+static inline int nm_get_node_global_index(struct inode *node)
+{
+	return (node->i_ino - NM_NODE_INODE_START);
+}
+static inline int nm_get_group_global_index(struct inode *group)
+{
+	return (group->i_ino - NM_GROUP_INODE_START);
+}
+#endif
+
+static inline int nm_valid_ino(int ino)
+{
+#if 0
+	// these should never be referred to in kernel
+	if (ino >= NM_Cluster && ino <= NM_Group)
+		return 1;
+#endif
+	if (ino >= NM_NODE_INODE_START &&
+	    ino < NM_NODE_INODE_START + NM_MAX_NODES)
+		return 1;
+	if (ino >= NM_GROUP_INODE_START &&
+	    ino < NM_GROUP_INODE_START + NM_MAX_NODES)
+		return 1;
+	return 0;
+}
+
+
+	
+#endif /* CLUSTER_NODEMANAGER_H */

Added: branches/dlm-glue/cluster/tcp.c
===================================================================
--- branches/dlm-glue/cluster/tcp.c	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/tcp.c	2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,1614 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * tcp.c
+ *
+ * tcp network stuff
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+#include "warning_hack.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/proc_fs.h>
+#include <linux/file.h>
+
+#include <asm/uaccess.h>
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+
+#if 1	/* flip to 0 to silence net debug chatter */
+#define netprintk(x, arg...)    printk("(%d) " x, current->pid, ##arg)
+#define netprintk0(x)           printk("(%d) " x, current->pid)
+#else
+#define netprintk(x, arg...)
+#define netprintk0(x)
+#endif
+
+struct socket *recv_sock = NULL;
+static u16 ip_version, ip_port;
+static void *net_junk_buf = NULL;
+static struct inode *net_inode = NULL;
+static u16 net_node_num;
+
+char *gsd_buf = NULL;
+char *gsd_handler_buf = NULL;
+
+
+static spinlock_t net_handler_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t net_list_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t net_status_lock = SPIN_LOCK_UNLOCKED;
+static LIST_HEAD(net_handlers);
+static LIST_HEAD(net_recv_list);
+static LIST_HEAD(net_dispatch_list);
+static LIST_HEAD(net_status_list);
+
+static DECLARE_WAIT_QUEUE_HEAD(net_disp_thread_wait_queue);
+static DECLARE_WAIT_QUEUE_HEAD(net_recv_thread_wait_queue);
+static int net_recv_pid = -1;
+static struct task_struct *net_recv_task = NULL;
+static struct completion net_recv_complete;
+
+
+
+/////////////////////
+static void net_shutdown(void);
+static int net_startup(void);
+static int __init net_driver_entry (void);
+static int net_init_driver(void);
+static void __exit net_driver_exit (void);
+static void net_remove_handlers(void);
+static int net_check_message_valid(net_msg *msg, u32 len);
+static void net_dump_and_close_sock(struct socket *sock, struct inode *inode);
+static void net_dump_msg(struct socket *sock, struct inode *inode);
+static int net_recv_message_header(net_msg *hdr, struct socket *sock);
+static int net_init_tcp_recv_sock(void);
+static int net_receive_thread(void *data);
+static int net_receive(void);
+static int net_accept_tcp_connections(void);
+static void net_release_tcp_sock(void);
+static int net_dispatch_message(struct inode *inode, struct socket *sock, net_msg *hdr, net_msg_handler *hnd);
+static int net_ioctl (struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg);
+
+int gsd_message_action(gsd_message *g);
+int gsd_message_handler(net_msg *msg, u32 len, void *data);
+void gsd_teardown(void);
+int gsd_setup(void);
+
+
+//////////////////////
+
+
+
+
+/* use if already holding net_handler_lock */
+static inline void __net_get_handler(net_msg_handler *nmh)
+{
+	atomic_inc(&nmh->refcnt);
+}
+
+static inline void net_get_handler(net_msg_handler *nmh)
+{
+	spin_lock(&net_handler_lock);
+	__net_get_handler(nmh);
+	spin_unlock(&net_handler_lock);
+}
+
+
+/* use if already holding net_handler_lock */
+static inline void __net_put_handler(net_msg_handler *nmh)
+{
+	atomic_dec(&nmh->refcnt);
+	if (!atomic_read(&nmh->refcnt)) {
+		if (net_handler_in_use(nmh))
+			netprintk0("EEEEK! killing inuse handler! bugbug!\n");
+		kfree(nmh);
+	}
+}
+
+static inline void net_put_handler(net_msg_handler *nmh)
+{
+	if (atomic_dec_and_lock(&nmh->refcnt, &net_handler_lock)) {
+		if (net_handler_in_use(nmh))
+			netprintk0("EEEEK! killing inuse handler! bugbug!\n");
+		kfree(nmh);
+		spin_unlock(&net_handler_lock);
+	}
+}
+
+
+
+DECLARE_MUTEX(net_state_lock);
+u32 net_driver_state = NET_DRIVER_UNINITED;
+u32 net_num_dispatched = 0;
+
+
+/*
+ * net_driver_entry()
+ *
+ * Driver entry point. Called on insmod.
+ */
+static int __init net_driver_entry (void)
+{
+	struct proc_dir_entry *de;
+	de = proc_mkdir("cluster/net", 0);
+	if (!de)
+		return -1;
+	de->proc_fops->ioctl = net_ioctl;
+	
+	netprintk0("Loaded net Driver module\n");
+	return 0;
+}				/* net_driver_entry */
+
+static int net_ioctl (struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	net_ioc data;
+	gsd_ioc gsd_data;
+	int ret = 0;
+	gsd_message g;
+	int response = 0;
+	struct inode *to = NULL;
+	struct file *file = NULL;
+
+	if (_IOC_TYPE (cmd) != NET_IOC_MAGIC) {
+		ret = -ENOTTY;
+		goto exit_ioctl;
+	}
+
+	switch (cmd) {
+	    case NET_IOC_ACTIVATE:
+		    memset(&data, 0, sizeof(net_ioc));
+		    down(&net_state_lock);
+		    data.status = net_driver_state;
+		    if (net_driver_state == NET_DRIVER_UNINITED) {
+			    ret = net_init_driver();
+			    if (ret < 0) {
+				    netprintk("error trying to activate net driver: %d\n", ret);
+				    data.status = NET_DRIVER_UNINITED;
+			    } else {
+				    netprintk0("activated net driver!\n");
+				    net_driver_state = data.status = NET_DRIVER_READY;
+			    }
+		    }
+		    up(&net_state_lock);
+
+		    /* copy_to_user returns bytes left, not an errno */
+		    ret = copy_to_user ((net_ioc *) arg, &data,
+					sizeof (net_ioc)) ? -EFAULT : 0;
+		    break;
+	    case NET_IOC_GETSTATE:
+		    memset(&data, 0, sizeof(net_ioc));
+		    down(&net_state_lock);
+		    data.status = net_driver_state;
+		    up(&net_state_lock);
+		    ret = copy_to_user ((net_ioc *) arg, &data,
+					sizeof (net_ioc)) ? -EFAULT : 0;
+		    break;
+		    
+	    case GSD_IOC_CREATE_GROUP:
+		    memset(&gsd_data, 0, sizeof(gsd_ioc));
+		    if (copy_from_user(&gsd_data, (gsd_ioc *)arg, sizeof(gsd_ioc))) {
+			    ret = -EFAULT;
+			    break;
+		    }
+		    if (gsd_data.namelen > sizeof(g.name)) {
+			    ret = -EINVAL;
+			    break;
+		    }
+
+		    file = fget(gsd_data.fd);
+		    if (!file || !file->f_dentry || !file->f_dentry->d_inode) {
+			    ret = -EINVAL;
+			    break;
+		    }
+		    to = file->f_dentry->d_inode;
+
+		    g.action = GSD_ACTION_ADD_GROUP;
+		    g.from = net_node_num;
+		    g.namelen = gsd_data.namelen;
+		    memcpy(g.name, gsd_data.name, gsd_data.namelen);
+
+		    if (to == net_inode) { 
+			    /* create the group locally */
+			    ret = gsd_message_action(&g);
+		    } else { 
+			    /* create the group on remote node */
+			    ret = net_send_message(GSD_MESSAGE, 0, &g, sizeof(g), to, &response); 
+			    if (ret == 0) 
+				    ret = response;
+		    }
+
+		    memset(&gsd_data, 0, sizeof(gsd_ioc));
+		    gsd_data.status = ret;
+		    ret = copy_to_user((gsd_ioc *)arg, &gsd_data,
+				       sizeof(gsd_ioc)) ? -EFAULT : 0;
+		    break;
+
+	    case GSD_IOC_ADD_GROUP_NODE:
+		    memset(&gsd_data, 0, sizeof(gsd_ioc));
+		    if (copy_from_user(&gsd_data, (gsd_ioc *)arg, sizeof(gsd_ioc))) {
+			    ret = -EFAULT;
+			    break;
+		    }
+		    if (gsd_data.namelen > sizeof(g.name)) {
+			    ret = -EINVAL;
+			    break;
+		    }
+
+		    file = fget(gsd_data.fd);
+		    if (!file || !file->f_dentry || !file->f_dentry->d_inode) {
+			    ret = -EINVAL;
+			    break;
+		    }
+		    to = file->f_dentry->d_inode;
+
+		    g.action = GSD_ACTION_ADD_GROUP_NODE;
+		    g.from = net_node_num;
+		    g.namelen = gsd_data.namelen;
+		    memcpy(g.name, gsd_data.name, gsd_data.namelen);
+
+		    if (to == net_inode) {
+			    /* create the group locally */
+			    ret = gsd_message_action(&g);
+		    } else { 
+			    /* create the group on remote node */
+			    ret = net_send_message(GSD_MESSAGE, 0, &g, sizeof(g), to, &response); 
+			    if (ret == 0) 
+				    ret = response;
+		    }
+		    memset(&gsd_data, 0, sizeof(gsd_ioc));
+		    gsd_data.status = ret;
+		    ret = copy_to_user((gsd_ioc *)arg, &gsd_data,
+				       sizeof(gsd_ioc)) ? -EFAULT : 0;
+		    break;
+	    default:
+		    ret = -ENOTTY;
+		    break;
+	}
+
+exit_ioctl:
+
+	if (file)
+		fput(file);
+
+	return ret;
+}				/* net_ioctl */
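+
+/*
+ * ioctl sketch (illustrative): the fd inside gsd_ioc names the node
+ * that should perform the action; if it resolves to this node's own
+ * nm inode the group is created locally, otherwise the request is
+ * forwarded as a GSD_MESSAGE over tcp.  net_fd here is assumed to be
+ * an fd on /proc/cluster/net, where net_driver_entry hooks the ioctl.
+ *
+ *	gsd_ioc ioc;
+ *	memset(&ioc, 0, sizeof(ioc));
+ *	ioc.fd = open("/proc/cluster/nm/node7", O_RDWR);
+ *	ioc.namelen = strlen("webfarm");
+ *	memcpy(ioc.name, "webfarm", ioc.namelen);
+ *	ioctl(net_fd, GSD_IOC_CREATE_GROUP, &ioc);
+ */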
+
+static int net_init_driver(void)
+{
+	nm_node_info *info;
+	nm_node_inode_private *priv;
+
+	/* get the global node number for this node */
+	net_node_num = nm_this_node(NULL);
+	if (net_node_num >= NM_MAX_NODES) {
+		netprintk0("local nm node number not initialized!\n");
+		return -1;
+	}
+	net_inode = nm_get_node_by_num(net_node_num);
+	if (!net_inode) {
+		netprintk0("local nm node inode not initialized!\n");
+		return -1;
+	}
+	priv = (nm_node_inode_private *)net_inode->u.generic_ip;
+	if (!priv) {
+		iput(net_inode);
+		net_inode = NULL;
+		netprintk0("local nm node info not initialized!\n");
+		return -1;
+	}
+	info = &priv->node;
+	ip_version = info->ifaces[0].ip_version;
+	ip_port = info->ifaces[0].ip_port;
+
+	if (net_startup() < 0)
+		goto error;
+
+	if (gsd_setup() < 0)
+		goto error;
+
+	return 0;
+
+error:
+	/* drop the node inode reference taken above */
+	iput(net_inode);
+	net_inode = NULL;
+	return -1;
+}				/* net_init_driver */
+
+
+/*
+ * net_driver_exit()
+ *
+ * Called on rmmod
+ */
+static void __exit net_driver_exit (void)
+{
+	down(&net_state_lock);
+	if (net_driver_state == NET_DRIVER_READY) {
+		netprintk0("shutting down network\n");
+		net_shutdown();
+		netprintk0("removing all net driver handlers\n");
+		net_remove_handlers();
+		gsd_teardown();
+		if (net_inode)
+			iput(net_inode);
+		net_driver_state = NET_DRIVER_UNINITED;
+	}
+	up(&net_state_lock);
+	remove_proc_entry("cluster/net", NULL);
+	netprintk0("Unloading net driver module\n");
+	return;
+}				/* net_driver_exit */
+
+
+static int net_startup(void)
+{
+	net_recv_pid = -1;
+	net_recv_task = NULL;
+	init_completion (&net_recv_complete);
+
+	net_junk_buf = (void *) __get_free_page(GFP_KERNEL);
+	if (!net_junk_buf)
+		return -ENOMEM;
+
+	netprintk0("starting net receive thread...\n");
+	net_recv_pid = kernel_thread (net_receive_thread, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+	if (net_recv_pid < 0) {
+		netprintk("unable to launch net receive thread, error=%d\n", net_recv_pid);
+		/* net_shutdown() cannot be used here: net_recv_task is still NULL */
+		free_page((unsigned long)net_junk_buf);
+		return -EINVAL;
+	}
+
+	netprintk0("net thread running...\n");
+	return 0;
+}
+
+static void net_shutdown(void)
+{
+	netprintk ("waiting for net thread to exit....");
+	send_sig (SIGINT, net_recv_task, 0);
+	wait_for_completion (&net_recv_complete);
+	free_page((unsigned long)net_junk_buf);
+	netprintk ("net thread exited\n");
+}
+
+
+static int net_receive_thread(void *data)
+{
+	int status = 0;
+	DECLARE_WAITQUEUE(main_wait, current);
+
+	util_daemonize ("netrecv", strlen("netrecv"), 1);
+	net_recv_task = current;
+
+	status = net_init_tcp_recv_sock();
+       	if (status >= 0 && recv_sock) {
+		add_wait_queue_exclusive(recv_sock->sk->sleep, &main_wait);
+		while (1) {
+			status = 0;
+			if (recv_sock->sk->tp_pinfo.af_tcp.accept_queue)
+				status = net_accept_tcp_connections();
+			if (!list_empty(&net_recv_list))
+				status = net_receive();
+
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(20*HZ);
+			current->state = TASK_RUNNING;
+
+			if (signal_pending(current)) {
+				netprintk0("net recv thread got signal!\n");
+				break;
+			}
+		}
+		remove_wait_queue(recv_sock->sk->sleep, &main_wait);
+	} else {
+		netprintk0("failed to initialize net_thread!\n");
+	}
+
+	/* Flush all scheduled tasks */
+	flush_scheduled_work();
+	net_release_tcp_sock();
+	net_recv_task = NULL;
+	complete (&net_recv_complete);
+	return 0;
+}
+
+typedef union _my_timing_t
+{
+	__u64 q;
+	__u32 lohi[2];
+} my_timing_t;
+
+
+/* TODO: stub -- currently accepts every message unconditionally */
+static int net_check_message_valid(net_msg *msg, u32 len)
+{
+	return 1;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+/* for lack of a better place to do this */
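+/* "GSD" (group services) glue: the GSD_IOC_* ioctls package a gsd_message
+ * and either act on it locally via gsd_message_action() or forward it to
+ * a remote node with net_send_message(), waiting on the remote handler's
+ * status return. */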
+
+int gsd_setup()
+{
+	int ret;
+	gsd_buf = (char *) __get_free_page(GFP_KERNEL);
+	if (!gsd_buf)
+		return -ENOMEM;
+	/* need this stupidity until I can divorce the actual nm actions
+	 * from the output they send to their user buffer */
+	gsd_handler_buf = (char *) __get_free_page(GFP_KERNEL);
+	if (!gsd_handler_buf) {
+		free_page((unsigned long)gsd_buf);
+		return -ENOMEM;
+	}
+
+	ret = net_register_handler(GSD_MESSAGE, 0, 0, sizeof(gsd_message),
+				   gsd_message_handler, NULL, gsd_buf);
+
+	return ret;
+}
+
+void gsd_teardown()
+{
+	free_page((unsigned long)gsd_buf);
+	free_page((unsigned long)gsd_handler_buf);
+}
+
+int gsd_message_handler(net_msg *msg, u32 len, void *data)
+{
+	return gsd_message_action((gsd_message *)msg->buf);
+}
+
+int gsd_message_action(gsd_message *g)
+{
+	int ret;
+	nm_op op;
+	int namelen = g->namelen;
+	struct inode *node=NULL, *group=NULL;
+	char name[NM_MAX_NAME_LEN+1];
+	
+	if (namelen > NM_MAX_NAME_LEN)
+		return -EINVAL;
+	strncpy(name, g->name, namelen);
+	name[namelen] = '\0';
+	
+	memset(&op, 0, sizeof(op));
+	switch (g->action) {
+		case GSD_ACTION_ADD_GROUP:
+			group = nm_get_group_by_name(name);
+			if (group) {
+				ret = 0;
+				break;
+			}
+			op.arg_u.gc.group_num = NM_INVALID_SLOT_NUM;
+			memcpy(op.arg_u.gc.name, name, namelen);
+			memcpy(op.arg_u.gc.disk_uuid, name, namelen);
+
+			ret = nm_create_group(gsd_handler_buf, &op);
+			if (ret >= 0)
+				ret = 0;
+			break;
+
+		case GSD_ACTION_ADD_GROUP_NODE:
+			group = nm_get_group_by_name(name);
+			if (!group) {
+				ret = -EINVAL;
+				break;
+			}
+			node = nm_get_group_node_by_index(group, g->from);
+			if (node) {
+				ret = 0;
+				if (nm_get_node_global_index(node) != g->from)
+					ret = -EINVAL;
+				break;
+			}
+			op.arg_u.gc.group_num = nm_get_group_global_index(group);
+			op.arg_u.gc.node_num = g->from;
+			op.arg_u.gc.slot_num = g->from;
+			ret = nm_add_node_to_group(gsd_handler_buf, &op);
+			if (ret >= 0)
+				ret = 0;
+			break;
+		default:
+			ret = -EINVAL;
+			break;
+	}
+
+	if (node)
+		iput(node);
+	if (group)
+		iput(group);
+	return ret;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+
+int net_register_handler(u32 msg_type, u32 key, int flags, u32 max_len, 
+			 net_msg_handler_func *func, void *data, void *buf)
+{
+	net_msg_handler *nmh, *found=NULL;
+	u32 packet_len = sizeof(net_msg) + max_len;
+
+	if (packet_len < NET_MIN_MSG_LEN || packet_len > NET_MAX_MSG_LEN) {
+		netprintk("max_len for message handler out of range: %u\n", 
+			max_len);
+		return -EINVAL;
+	}
+
+	/* if expecting any message payload, must pass a prealloced buffer */
+	if (!buf && max_len) {
+		netprintk("max_len > 0 (%u), but no buffer supplied!\n",
+		       max_len);
+		return -EINVAL;
+	}
+
+	if (!msg_type) {
+		netprintk("no message type provided: %u, %p\n", msg_type, func);
+		return -EINVAL;
+	}
+	if (!func) {
+		netprintk("no message handler provided: %u, %p\n",
+		       msg_type, func);
+		return -EINVAL;
+	}
+
+       	nmh = kmalloc(sizeof(net_msg_handler), GFP_KERNEL);
+	if (!nmh) {
+		return -ENOMEM;
+	}
+	memset(nmh, 0, sizeof(net_msg_handler));
+	nmh->func = func;
+	nmh->data = data;
+	nmh->msg_type = msg_type;
+	nmh->max_len = max_len;
+	nmh->key = key;
+	spin_lock_init(&nmh->lock);
+	atomic_set(&nmh->refcnt, 0);
+	if (max_len == 0) {
+		nmh->buf = &nmh->hdr;
+	} else {
+		nmh->buf = buf;
+	}
+	nmh->flags = flags;
+	INIT_LIST_HEAD(&nmh->list);
+	net_get_handler(nmh);
+
+	
+	/* add the new handler, checking for pre-existing */
+	spin_lock(&net_handler_lock);
+	found = net_lookup_handler(msg_type, key);
+	if (!found) {
+		list_add_tail(&nmh->list, &net_handlers);
+	} else {
+		spin_unlock(&net_handler_lock);
+		net_put_handler(found);
+		netprintk("message handler for type %u, key %u already exists!!!\n",
+		       msg_type, key);
+		/* this should destroy it */
+		net_put_handler(nmh);
+		return -EEXIST;
+	}
+	spin_unlock(&net_handler_lock);
+	return 0;
+}
+
+
+
+/* net_handler_lock should be held */
+net_msg_handler * net_lookup_handler(u32 msg_type, u32 key)
+{
+	net_msg_handler *ret;
+	struct list_head *iter;
+
+	list_for_each(iter, &net_handlers) {
+		ret = list_entry(iter, net_msg_handler, list);
+		if (ret->msg_type == msg_type && ret->key == key) {
+			__net_get_handler(ret);
+			return ret;
+		}
+	}
+	return NULL;
+}
+
+
+
+net_msg * net_package_message(u32 msg_type, u32 key, void *data, u32 len)
+{
+	net_msg *ret = NULL;
+	net_msg_handler *handler = NULL;
+	u32 packet_len;
+
+	spin_lock(&net_handler_lock);
+	handler = net_lookup_handler(msg_type, key);
+	spin_unlock(&net_handler_lock);
+	
+	if (!handler) {
+		netprintk("no such message type: %u/%u\n", msg_type, key);
+		return NULL;
+	}
+	if (net_handler_msg_len_ok(handler, len)) {
+		netprintk("len for message type %u incorrect: %u, should be %u\n", 
+		       msg_type, len, handler->max_len);
+		goto done;
+	}
+	packet_len = len + sizeof(net_msg);
+	ret = kmalloc(packet_len, GFP_KERNEL);
+	if (!ret) {
+		netprintk("failed to allocate %u bytes for message!\n", packet_len);
+		goto done;
+	}
+	memset(ret, 0, packet_len);
+	ret->magic = NET_MSG_MAGIC;
+	ret->data_len = len;
+	ret->msg_type = msg_type;
+	ret->key = key;
+	if (len > 0)
+		memcpy(&(ret->buf[0]), data, len);
+
+done:
+	if (handler)
+		net_put_handler(handler);
+	return ret;
+}
+
+/* TODO: implement; called at module unload */
+static void net_remove_handlers(void)
+{
+	/* TODO: make an iterator in nm for running over each global inode
+	 * (do I have this already?), then call destroy on each.  The last
+	 * put will do the work.  It doesn't matter if this is slow; it only
+	 * runs on shutdown... */
+}
+
+
+
+
+/*
+ * net_recv_tcp_msg()
+ *
+ */
+int net_recv_tcp_msg (struct inode *inode, struct socket *sock, void *data, u32 *packet_len)
+{
+	nm_node_inode_private *priv;
+	nm_node_info *node;
+	int status = -EINVAL, error;
+	mm_segment_t oldfs;
+	struct sockaddr_in sin;
+	struct iovec iov = { 
+		.iov_len = *packet_len, 
+		.iov_base = data 
+	};
+	struct msghdr msg = { 
+		.msg_control = NULL, 
+		.msg_controllen = 0, 
+		.msg_iovlen = 1, 
+		.msg_iov = &iov, 
+		.msg_name = (struct sockaddr *) &sin, 
+		.msg_namelen = sizeof (sin),
+       		.msg_flags = 0 
+	};
+
+
+	priv = (nm_node_inode_private *)inode->u.generic_ip;
+	node = &priv->node;
+	if (!sock) {
+		spin_lock(&priv->net.sock_lock); 
+		/* TODO: sock refcounting... i think we can get/put the sk */
+		sock = priv->net.sock;
+		if (!sock) {
+			/* don't return with the lock still held */
+			spin_unlock(&priv->net.sock_lock);
+			return -EINVAL;
+		}
+		spin_unlock(&priv->net.sock_lock); 
+	}
+	
+	memset (&sin, 0, sizeof (sin));
+	oldfs = get_fs ();
+	set_fs (get_ds ());
+	error = sock_recvmsg (sock, &msg, *packet_len, msg.msg_flags);
+	set_fs (oldfs);
+
+	status = 0;
+	if (error < 0) {
+		if (error == -ERESTARTSYS) {
+			status = -EBADF;
+			netprintk ("Shutting down\n");
+		} else {
+			status = -EINVAL;
+			netprintk ("unable to recvmsg, error=%d\n", error);
+		}
+		goto bail;
+	} else {
+		/* sock_recvmsg() advances the iovec; the byte count is in 'error' */
+		*packet_len = error;
+		status = 0;
+		netprintk("woot.  received len=%d\n", *packet_len);
+		if (!net_check_message_valid(data, *packet_len)) {
+			netprintk0("eeeek bad net message!\n");
+			status = -EINVAL;
+		}
+	}
+
+	//netprintk ("Received packet from: %d.%d.%d.%d\n",
+	//		NIPQUAD (sin.sin_addr.s_addr));
+
+bail:
+	return status;
+}				/* net_recv_tcp_msg */
+
+
+/*
+ * net_send_tcp_msg()
+ *
+ */
+int net_send_tcp_msg (struct inode *inode, struct socket *sock, void *data, u32 packet_len)
+{
+	int status = 0, error;
+	struct sockaddr_in sin;
+	mm_segment_t oldfs;
+	nm_node_inode_private *priv;
+	nm_node_info *node;
+
+	priv = (nm_node_inode_private *)inode->u.generic_ip;
+	node = &priv->node;
+	if (!sock) {
+		spin_lock(&priv->net.sock_lock);
+		/* TODO: sock refcounting... i think we can get/put the sk */
+		sock = priv->net.sock;
+		spin_unlock(&priv->net.sock_lock);
+	}
+
+	oldfs = get_fs ();
+	netprintk("Sending msg to node=%u, name=%s\n", node->node_num, node->node_name);
+	memset (&sin, 0, sizeof (sin));
+	sin.sin_family = net_ip_version_to_family(node->ifaces[0].ip_version);
+	sin.sin_addr.s_addr = node->ifaces[0].addr_u.ip_addr4;
+	sin.sin_port = node->ifaces[0].ip_port;
+	
+
+	status = -EINVAL;
+	if (sock) {
+		struct iovec iov = {
+			.iov_base = data,
+			.iov_len = packet_len
+		};
+		struct msghdr msg = {
+			.msg_iov = &iov,
+			.msg_iovlen = 1,
+			.msg_control = NULL,
+			.msg_controllen = 0,
+			.msg_name = (struct sockaddr *) &sin,
+			.msg_namelen = sizeof (sin),
+			.msg_flags = 0
+		};
+		
+		status = 0;	
+		set_fs (get_ds ());
+		error = sock_sendmsg (sock, &msg, packet_len);
+		set_fs (oldfs);
+	
+		if (error < 0) {
+			netprintk ("unable to sendmsg, error=%d\n", error);
+			status = -EINVAL;
+		} 
+	}
+	if (status < 0)
+		netprintk ("bad status: %d\n", status);
+
+	/* propagate failures instead of silently returning success */
+	return status;
+}				/* net_send_tcp_msg */
+
+static spinlock_t net_msg_num_lock = SPIN_LOCK_UNLOCKED;
+static u64 net_msg_num = 1;
+
+/*
+ * net_send_message
+ *
+ *   - this is probably the function you are looking for
+ *   - it will package up the message for you, verifying that
+ *       the message handler is there and the length is ok,
+ *       connect to the other node if there is not already a
+ *       socket for it, and optionally wait on a status return
+ *       from the other node 
+ *   - all you need prior to this call is to have inited the
+ *       net stuff, to have a valid inode for the node to contact 
+ *       in nm, and to have registered the message handler
+ */
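+/*
+ * A minimal usage sketch (the MY_ and my_ prefixed names are hypothetical,
+ * not part of this API; see cluster/test.c for a real caller):
+ *
+ *	int ret, status;
+ *	ret = net_register_handler(MY_MSG_TYPE, MY_KEY, 0, 0,
+ *				   my_handler_func, NULL, NULL);
+ *	...
+ *	ret = net_send_message(MY_MSG_TYPE, MY_KEY, NULL, 0,
+ *			       node_inode, &status);
+ *	if (ret == 0)
+ *		ret = status;
+ */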
+int net_send_message(u32 msg_type, u32 key, void *data, u32 len, struct inode *inode, int *status)
+{
+	int ret = 0, tmpret;
+	net_msg *msg = NULL;
+	net_msg_handler *handler = NULL;
+	u32 packet_len;
+	net_status_ctxt nsc;
+	wait_queue_t sleep;
+	nm_node_inode_private *priv = NULL;
+	net_inode_private *net = NULL;
+
+	if (!inode || !inode->u.generic_ip) {
+		netprintk0("bad inode, cannot send message\n");
+		return -EINVAL;
+	}
+	priv = (nm_node_inode_private *)inode->u.generic_ip;
+	net = &priv->net;
+	spin_lock(&net->sock_lock); 
+	if (!net->sock) {
+		spin_unlock(&net->sock_lock);
+		ret = net_init_tcp_sock(inode);
+		if (!(ret == 0 || ret == -EEXIST)) {
+			netprintk0("failed to create socket!\n");
+			return -EINVAL;
+		}
+	} else {
+		/* socket already exists; just drop the lock (once) */
+		spin_unlock(&net->sock_lock);
+	}
+	
+
+	spin_lock(&net_handler_lock);
+	handler = net_lookup_handler(msg_type, key);
+	spin_unlock(&net_handler_lock);
+	
+	if (!handler) {
+		netprintk("no such message type: %u/%u\n", msg_type, key);
+		return -EINVAL;
+	}
+
+	if (net_handler_msg_len_ok(handler, len)) {
+		netprintk("len for message type %u incorrect: %u, should be %u\n", 
+		       msg_type, len, handler->max_len);
+		ret = -EINVAL;
+		goto done;
+	}
+	packet_len = len + sizeof(net_msg);
+	msg = kmalloc(packet_len, GFP_KERNEL);
+	if (!msg) {
+		netprintk("failed to allocate %u bytes for message!\n", packet_len);
+		ret = -ENOMEM;
+		goto done;
+	}
+	memset(msg, 0, packet_len);
+	msg->magic = NET_MSG_MAGIC;
+	msg->data_len = len;
+	msg->msg_type = msg_type;
+	msg->key = key;
+	spin_lock(&net_msg_num_lock);
+	msg->msg_num = net_msg_num;
+	/* post-increment so each message gets a distinct number */
+	if (net_msg_num++ == NET_MSG_NUM_MAX) {
+		printk("eek!  net_msg_num wrapping to 1 now...\n");
+		net_msg_num = 1;
+	}
+	spin_unlock(&net_msg_num_lock);
+	if (len > 0)
+		memcpy(&(msg->buf[0]), data, len);
+
+	/* does the caller want to wait for a simple status? */
+	if (status) {
+		msg->status = 1;
+
+		INIT_LIST_HEAD(&nsc.list);
+		init_waitqueue_head(&nsc.wq);
+		atomic_set(&nsc.woken, 0);
+		nsc.msg_num = msg->msg_num;
+		nsc.status = 0;
+		spin_lock(&net_status_lock);
+		list_add(&nsc.list, &net_status_list);
+		spin_unlock(&net_status_lock);
+
+		init_waitqueue_entry(&sleep, current);
+		spin_lock(&net->sock_lock);
+		if (!net->sock) {
+			spin_unlock(&net->sock_lock);
+			netprintk0("caller wanted status return but socket went away!\n");
+			/* undo the status-list add and the handler ref before bailing */
+			spin_lock(&net_status_lock);
+			list_del_init(&nsc.list);
+			spin_unlock(&net_status_lock);
+			net_put_handler(handler);
+			kfree(msg);
+			return -EINVAL;
+		}
+		add_wait_queue(net->sock->sk->sleep, &sleep);
+		spin_unlock(&net->sock_lock); 
+	}
+{
+	union {
+		u64 q;
+		u32 hilo[2];
+	} u1, u2;
+	rdtsc(u1.hilo[0], u1.hilo[1]);
+
+
+	ret = net_send_tcp_msg(inode, NULL, msg, packet_len);
+
+	rdtsc(u2.hilo[0], u2.hilo[1]);
+	netprintk("net_send_tcp_msg took %llu cycles\n", u2.q-u1.q);
+	if (status) {
+		if (ret >= 0) {
+			/* wait on other node's handler */
+			rdtsc(u1.hilo[0], u1.hilo[1]);
+			tmpret = util_wait_atomic_eq(&nsc.wq, &nsc.woken, 1, 0);
+			rdtsc(u2.hilo[0], u2.hilo[1]);
+			netprintk("waiting on status took %llu cycles\n", u2.q-u1.q);
+			*status = nsc.status;
+			netprintk("status return requested, status is %d\n", *status);
+		} else {
+			netprintk("status return requested, and error returned from net_send_tcp_msg=%d\n", ret);
+			/* return bad status right away */
+			*status = ret;
+			/* no reply will come; pull the context back off the list */
+			spin_lock(&net_status_lock);
+			list_del_init(&nsc.list);
+			spin_unlock(&net_status_lock);
+		}
+		/* remove from the same queue add_wait_queue() used above,
+		 * not from recv_sock */
+		spin_lock(&net->sock_lock);
+		if (net->sock)
+			remove_wait_queue(net->sock->sk->sleep, &sleep);
+		spin_unlock(&net->sock_lock);
+	} else if (ret < 0) {
+		netprintk("no status return requested, but error returned from net_send_tcp_msg=%d\n", ret);
+	}
+}
+	
+done:
+	if (handler)
+		net_put_handler(handler);
+	if (msg)
+		kfree(msg);
+	return ret;
+}
+
+
+
+
+
+/*
+ * net_receive: receive from and dispatch all sockets with data pending
+ */
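+/* NB: net_list_lock is dropped (and the entry pulled off net_recv_list)
+ * while a socket is being serviced, so the walk restarts from the head of
+ * the list (goto start_over) after every dispatch. */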
+static int net_receive(void)
+{
+	struct inode *inode;
+	struct list_head *iter, *tmpiter;
+	nm_node_inode_private *priv;
+	net_inode_private *net;
+	struct socket *sock;
+	struct sock *sk;
+	net_msg hdr;
+	net_msg_handler *hnd = NULL;
+	int err = 0;
+	int tmperr;
+	union {
+		u64 q;
+		u32 hilo[2];
+	} u1, u2, u3, u4;
+
+
+start_over:	
+	spin_lock(&net_list_lock);
+	list_for_each_safe(iter, tmpiter, &net_recv_list) {
+		net = list_entry(iter, net_inode_private, list);
+		priv = container_of(net, nm_node_inode_private, net);
+	       	inode = priv->inode;
+		sock = net->sock;
+		
+		if (!sock) {
+			//netprintk0("no socket yet....\n");
+			continue;
+		}
+
+		if (sock->sk->state != TCP_ESTABLISHED &&
+		    sock->sk->state != TCP_CLOSE_WAIT) {
+			netprintk0("kill it and continue\n");
+			net_dump_and_close_sock(sock, inode);
+			continue;
+		}
+	
+		sk = sock->sk;
+		if (skb_queue_empty(&sk->receive_queue)) {
+			//netprintk("queue empty for %lu\n", inode->i_ino);
+			continue;
+		}
+
+		list_del(&net->list);
+		spin_unlock(&net_list_lock);
+	
+		memset(&hdr, 0, sizeof(net_msg));
+		err = net_recv_message_header(&hdr, sock);
+		if (err < 0) {
+			netprintk0("failed to receive message!\n");
+			goto error;
+		}
+		netprintk("received message header... magic=%u type=%u key=%u\n", 
+			  hdr.magic, hdr.msg_type, hdr.key);
+
+		if (hdr.magic == NET_MSG_STATUS_MAGIC) {
+			/* special type for returning message status */
+			rdtsc(u1.hilo[0], u1.hilo[1]);
+			net_dump_msg(sock, inode);
+			rdtsc(u2.hilo[0], u2.hilo[1]);
+			net_do_status_return(hdr.msg_num, hdr.status);
+			rdtsc(u3.hilo[0], u3.hilo[1]);
+			printk("status return: net_dump_msg took %llu, net_do_status_return took %llu\n", u2.q-u1.q, u3.q-u2.q);
+			err = 0;
+			goto error;
+		} else if (hdr.magic != NET_MSG_MAGIC) {
+			netprintk("bad magic: %u\n", hdr.magic);
+			goto error;
+		}
+		
+		if (net_is_valid_error_type(hdr.msg_type)) {
+			/* do error handling */
+			netprintk("this is a standard error message: type=%d\n", hdr.msg_type);
+			if (hdr.msg_type == NET_ALREADY_CONNECTED) {
+				netprintk0("error: there is already a socket for this connection\n");
+			} else if (hdr.msg_type == NET_UNKNOWN_HOST) {
+				netprintk0("error: unknown host\n");
+			}
+			net_dump_msg(sock, inode);
+			err = 0;
+			goto error;
+		}
+
+		/* find a handler for it */
+		spin_lock(&net_handler_lock);
+		hnd = net_lookup_handler(hdr.msg_type, hdr.key);
+		spin_unlock(&net_handler_lock);
+		
+		if (!hnd) {
+			err = -EINVAL;
+			netprintk0("no handler for message.\n");
+			goto error;
+		}
+		rdtsc(u1.hilo[0], u1.hilo[1]);
+		err = net_dispatch_message(inode, sock, &hdr, hnd);
+		rdtsc(u2.hilo[0], u2.hilo[1]);
+		printk("net_dispatch_message took %llu\n", u2.q-u1.q);
+
+		/* if node has requested status return, do it now */
+		if (hdr.status) {
+#ifdef BIG_NET_MSG
+			u16 n = hdr.src_node;
+			hdr.src_node = hdr.dst_node;
+			hdr.dst_node = n;
+#endif
+			hdr.status = err;
+			hdr.magic = NET_MSG_STATUS_MAGIC;  // twiddle the magic
+			rdtsc(u3.hilo[0], u3.hilo[1]);
+			tmperr = net_send_tcp_msg(inode, sock, &hdr, sizeof(net_msg));
+			rdtsc(u4.hilo[0], u4.hilo[1]);
+			printk("status return (net_send_tcp_msg) took %llu\n", u4.q-u3.q);
+		} else if (err < 0) {
+			netprintk("dispatch (%u/%u) returned %d\n",
+				  hdr.msg_type, hdr.key, err);
+		}
+
+
+		net_put_handler(hnd);
+
+		// re-add this socket
+		spin_lock(&net_list_lock);
+		list_add_tail(&net->list, &net_recv_list);
+		spin_unlock(&net_list_lock);
+		goto start_over;
+
+error:
+		if (err < 0) {
+			if (net_link_down(err, sock)) {
+				// do NOT re-add this socket
+				netprintk("link down! err=%d\n", err);
+				net_dump_and_close_sock(sock, inode);
+			} else {
+				netprintk("bad message... node=%lu.\n", inode->i_ino);
+				net_dump_msg(sock, inode);
+				// re-add this socket
+				spin_lock(&net_list_lock);
+				list_add_tail(&net->list, &net_recv_list);
+				spin_unlock(&net_list_lock);
+			}
+		} else {
+			// re-add this socket
+			spin_lock(&net_list_lock);
+			list_add_tail(&net->list, &net_recv_list);
+			spin_unlock(&net_list_lock);
+		}
+		goto start_over;
+	}
+	spin_unlock(&net_list_lock);
+
+	return 0;
+}
+
+
+void net_do_status_return(u64 msg_num, s32 status)
+{
+	net_status_ctxt *nsc;
+	struct list_head *iter;
+
+	spin_lock(&net_status_lock);
+	list_for_each(iter, &net_status_list) {
+		nsc = list_entry(iter, net_status_ctxt, list);
+		if (nsc->msg_num == msg_num) {
+			nsc->status = status;
+			atomic_set(&nsc->woken, 1);
+			list_del(&nsc->list);
+			spin_unlock(&net_status_lock);
+			wake_up(&nsc->wq);
+			return;
+		}
+	}
+	spin_unlock(&net_status_lock);
+}
+
+static int net_dispatch_message(struct inode *inode, struct socket *sock, net_msg *hdr, net_msg_handler *hnd)
+{
+	int ret = -EINVAL;
+	u32 len, packet_len;
+
+	len = hdr->data_len;
+	packet_len = len + sizeof(net_msg);
+
+	spin_lock(&hnd->lock);
+	if (net_handler_in_use(hnd)) {
+		netprintk0("EEEEEK!  handler in use! bugbug\n");
+		spin_unlock(&hnd->lock);
+		return -EINVAL;
+	}
+	if (len > hnd->max_len) {
+		netprintk("eek! advertised message data len is too large %u (max: %u)\n",
+		       len, hnd->max_len);
+		spin_unlock(&hnd->lock);
+		return -EINVAL;
+	}
+	hnd->flags |= (1 << NET_HND_IN_USE);
+	spin_unlock(&hnd->lock);
+
+	memset(hnd->buf, 0, packet_len);
+	ret = net_recv_tcp_msg(inode, sock, hnd->buf, &packet_len);
+	if (ret < 0) {
+		netprintk("net_recv_tcp_msg returned: %d\n", ret);
+	} else {
+		net_num_dispatched++;
+		ret = (hnd->func)((net_msg *)hnd->buf, packet_len, hnd->data);
+	}
+	
+	spin_lock(&hnd->lock);
+	hnd->flags &= ~(1 << NET_HND_IN_USE);
+	spin_unlock(&hnd->lock);
+
+	return ret;
+}
+
+
+
+/*
+ * net_accept_tcp_connections()
+ *
+ */
+static int net_accept_tcp_connections(void)
+{
+	int error, slen;
+	struct sockaddr_in sin;
+	struct socket *sock;
+	struct inode *inode;
+
+	if (!recv_sock) {
+		netprintk0("no socket!\n");
+		return 0;
+	}
+	
+	if (!recv_sock->sk->tp_pinfo.af_tcp.accept_queue) {
+		//netprintk0("no connections on the queue\n");
+		return 0;
+	}
+	error = 0;
+	while (error >= 0) {
+		sock = sock_alloc();
+		if (!sock)
+			break;
+
+		sock->type = recv_sock->type;
+		sock->ops = recv_sock->ops;
+		error = recv_sock->ops->accept(recv_sock, sock, O_NONBLOCK);
+		if (error < 0) {
+			sock_release(sock);
+			break;
+		}
+		if (sock->sk->state == TCP_CLOSE) {
+			sock_release(sock);
+			continue;
+		}
+			
+		slen = sizeof(sin);
+		error = sock->ops->getname(sock, (struct sockaddr *) &sin, &slen, 1);
+		if (error < 0) {
+			sock_release(sock);
+			break;
+		}
+		
+		netprintk("attempt to connect from %u.%u.%u.%u:%04x\n", 
+			NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
+
+		inode = nm_get_node_by_ip(sin.sin_addr.s_addr);
+		if (inode) {
+			int exists = 1;
+			nm_node_inode_private *priv = inode->u.generic_ip;
+			net_inode_private *net = NULL;
+
+			if (priv) {
+				net = &priv->net;
+				netprintk("connect from known host: %s\n",
+				      priv->node.node_name);
+				if (ntohs(sin.sin_port) >= 1024)
+					netprintk("warning: connect from unprivileged port: %u.%u.%u.%u:%d\n",
+						NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
+	 			spin_lock(&priv->net.sock_lock); 
+				if (!priv->net.sock) {
+					netprintk("new sock, doesnt exist\n");
+					exists = 0;
+					priv->net.sock = sock;
+					if (current != net_recv_task) {
+						netprintk("net_recv_task=%p... maybe i should add THAT instead\n", net_recv_task);
+						if (net_recv_task == NULL) 
+							BUG();
+						init_waitqueue_entry(&priv->net.sleep, net_recv_task);
+					} else {
+						netprintk("process %p added to waitqueue\n", current);
+						init_waitqueue_entry(&priv->net.sleep, current);
+					}
+					add_wait_queue(sock->sk->sleep, &(priv->net.sleep));
+				}
+	 			spin_unlock(&priv->net.sock_lock); 
+
+				if (exists) {
+					netprintk0("already a socket for this connection!\n");
+					net_send_error(sock, NET_ALREADY_CONNECTED);
+					net_dump_and_close_sock(sock, inode);
+				} else {
+					spin_lock(&net_list_lock);
+					netprintk("added inode %lu to net_recv_list\n", inode->i_ino);
+					if (list_empty(&net->list))
+						list_add_tail(&net->list, &net_recv_list);
+					spin_unlock(&net_list_lock);
+				}
+			}
+
+			iput(inode);
+		} else {
+			netprintk0("connect from unknown host...\n");
+			net_send_error(sock, NET_UNKNOWN_HOST);
+			net_dump_and_close_sock(sock, inode);
+		}
+	}
+	return error;
+}
+
+
+int net_send_error(struct socket *sock, u32 err_type)
+{
+        struct msghdr   msg;
+        mm_segment_t    oldfs;
+        struct iovec    iov;
+        int             len;
+	static net_msg err;
+
+	if (!net_is_valid_error_type(err_type)) {
+		netprintk("bug! bad error type! %u\n", err_type);
+		return -EINVAL;
+	}
+	memset(&err, 0, sizeof(net_msg));	
+	err.magic        = NET_MSG_MAGIC;
+	err.msg_type     = err_type;
+
+        msg.msg_name     = 0;
+        msg.msg_namelen  = 0;
+        msg.msg_iov      = &iov;
+        msg.msg_iovlen   = 1;
+        msg.msg_control  = NULL;
+        msg.msg_controllen = 0;
+        msg.msg_flags    = MSG_NOSIGNAL;
+        msg.msg_iov->iov_len = (__kernel_size_t)sizeof(net_msg);
+        msg.msg_iov->iov_base = (char*) &err;
+
+        oldfs = get_fs(); set_fs(KERNEL_DS);
+        len = sock_sendmsg(sock, &msg, (size_t)(sizeof(net_msg)));
+        set_fs(oldfs);
+
+        return len;
+}
+
+
+static int net_recv_message_header(net_msg *hdr, struct socket *sock)
+{
+	int status;
+	mm_segment_t oldfs;
+	struct iovec iov = {
+		.iov_base = hdr,
+		.iov_len = sizeof(net_msg)
+	};
+	struct msghdr msg = {
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+		.msg_control = NULL,
+		.msg_controllen = 0,
+		.msg_name = NULL,
+		.msg_namelen = 0,
+		.msg_flags = 0
+	};
+
+	oldfs = get_fs(); set_fs(KERNEL_DS);
+	status = sock_recvmsg(sock, &msg, sizeof(net_msg), MSG_PEEK);
+	set_fs(oldfs);
+
+	if (status < 0) {
+		if (status == -ERESTARTSYS) {
+			status = -EBADF;
+			netprintk ("Shutting down\n");
+		} else {
+			/* log the real error before mapping it to -EINVAL */
+			netprintk ("unable to recvmsg, error=%d\n", status);
+			status = -EINVAL;
+		}
+	}
+	// error or bytes received
+	return status;
+}
+
+static void net_dump_and_close_sock(struct socket *sock, struct inode *inode)
+{
+	nm_node_inode_private *priv = NULL;
+
+	net_dump_msg(sock, inode);
+
+	if (sock->sk) {
+		if (inode) {
+	       		priv = inode->u.generic_ip;
+			if (priv) {
+	 			spin_lock(&priv->net.sock_lock); 
+				remove_wait_queue(sock->sk->sleep, &(priv->net.sleep));
+				priv->net.sock = NULL;
+	 			spin_unlock(&priv->net.sock_lock); 
+			}
+		}
+	}
+	sock_release(sock);
+}
+
+static void net_dump_msg(struct socket *sock, struct inode *inode)
+{
+	struct msghdr           msg;
+	struct iovec            iov;
+	int                     len;
+	mm_segment_t            oldfs;
+
+	if (sock->sk) {
+		len = 1;
+		while (len>0)
+		{
+			msg.msg_name     = 0;
+			msg.msg_namelen  = 0;
+			msg.msg_iov      = &iov;
+			msg.msg_iovlen   = 1;
+			msg.msg_control  = NULL;
+			msg.msg_controllen = 0;
+			msg.msg_flags    = MSG_DONTWAIT;
+			msg.msg_iov->iov_base = net_junk_buf;
+			msg.msg_iov->iov_len  = (__kernel_size_t)PAGE_SIZE;
+			len = 0;
+			oldfs = get_fs(); set_fs(KERNEL_DS);
+			len = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT);
+			set_fs(oldfs);
+		}
+	}
+}
+
+
+int net_init_tcp_sock(struct inode *inode)
+{
+	nm_node_inode_private *priv;
+	nm_node_info *node;
+	net_inode_private *net = NULL;
+	struct sockaddr_in myaddr, remoteaddr;
+	int err = -EINVAL;
+	int i;
+	struct sock *sk;
+	struct socket *sock = NULL;
+
+	priv = inode->u.generic_ip;
+	if (!priv) {
+		netprintk0("bad inode\n");
+		return -EINVAL;
+	}
+	net = &priv->net;
+	node = &priv->node;
+	
+	if ((err = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
+		netprintk("can't create socket: err=%d\n", err);
+		return err;
+	}
+
+	spin_lock(&net->sock_lock); 
+	if (net->sock || net->flags & NET_FLAG_CREATING_SOCKET) {
+		netprintk("socket already created or creating for inode %lu\n", inode->i_ino);
+		spin_unlock(&net->sock_lock);
+		sock_release(sock);
+		return -EEXIST;
+	}
+	net->flags |= NET_FLAG_CREATING_SOCKET;
+	spin_unlock(&net->sock_lock);
+
+	memset(&myaddr, 0, sizeof(myaddr));
+	myaddr.sin_family = AF_INET;
+	myaddr.sin_port = htons(0);  // any port
+	err = sock->ops->bind(sock, (struct sockaddr *) &myaddr, sizeof(myaddr));
+	if (err < 0) {
+		netprintk("unable to bind socket, error=%d\n", err);
+		goto out;
+	}
+	
+	memset (&remoteaddr, 0, sizeof (remoteaddr));
+	remoteaddr.sin_family = net_ip_version_to_family(node->ifaces[0].ip_version);
+	remoteaddr.sin_addr.s_addr = node->ifaces[0].addr_u.ip_addr4;
+	remoteaddr.sin_port = node->ifaces[0].ip_port;
+
+	//netprintk("connecting new socket: ip %d.%d.%d.%d, port %d\n", NIPQUAD(remoteaddr.sin_addr.s_addr), remoteaddr.sin_port);
+	err = sock->ops->connect(sock, (struct sockaddr *) &remoteaddr, 
+					sizeof(remoteaddr), 0); /* TODO put this back!  O_NONBLOCK); */
+	//netprintk("connect status %d\n", err);
+	
+	if (err >= 0) {
+		spin_lock(&net->sock_lock);
+		net->sock = sock;
+		net->flags &= ~NET_FLAG_CREATING_SOCKET;
+
+		netprintk0("1) ok this node is actively trying to connect, add to waitqueue\n");
+		if (current != net_recv_task) {
+			netprintk("net_recv_task=%p... maybe i should add THAT instead\n", net_recv_task);
+			if (net_recv_task == NULL) 
+				BUG();
+			init_waitqueue_entry(&net->sleep, net_recv_task);
+		} else {
+			netprintk("process %p added to waitqueue\n", current);
+			init_waitqueue_entry(&net->sleep, current);
+		}
+		add_wait_queue(sock->sk->sleep, &net->sleep);
+
+		spin_unlock(&net->sock_lock);
+		goto out;
+	}
+
+	sk = sock->sk;
+	switch (err) {
+		case -EALREADY:
+		case -EINPROGRESS:
+					
+			/* TODO: awful awful awful */
+			for (i=0; i<100; i++) {
+				/* Protect against TCP socket state changes */
+				lock_sock(sk);
+				if (sk->state == TCP_ESTABLISHED) {
+					release_sock(sk);
+					netprintk0("woo!  connected...\n");
+					err = 0;
+					spin_lock(&net->sock_lock);
+					net->flags &= ~NET_FLAG_CREATING_SOCKET;
+					net->sock = sock;
+
+					netprintk0("2) ok this node is actively trying to connect, add to waitqueue\n");
+					if (current != net_recv_task) {
+						netprintk("net_recv_task=%p... maybe i should add THAT instead\n", net_recv_task);
+						if (net_recv_task == NULL) 
+							BUG();
+						init_waitqueue_entry(&net->sleep, net_recv_task);
+					} else {
+						netprintk("process %p added to waitqueue\n", current);
+						init_waitqueue_entry(&net->sleep, current);
+					}
+					add_wait_queue(sock->sk->sleep, &net->sleep);
+
+					spin_unlock(&net->sock_lock);
+					break;
+				} else {
+					netprintk("waiting for connection: pass %d, state %d\n", i, sk->state);
+					/* TODO */
+#if 0
+					task->tk_timeout = RPC_CONNECT_TIMEOUT;
+					/* if the socket is already closing, delay briefly */
+					if ((1<<sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV))
+						task->tk_timeout = RPC_REESTABLISH_TIMEOUT;
+					rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL);
+#endif
+					/* TODO: this is awful... change it later */
+				}
+				release_sock(sk);
+				util_sleep(100);
+			}
+			break;
+		case -ECONNREFUSED:
+		case -ECONNRESET:
+		case -ENOTCONN:
+			netprintk("conn refused, reset or not connected\n");
+			break;
+		default:
+			/* Report myriad other possible returns.  If this file
+			* system is soft mounted, just error out, like Solaris.  */
+			netprintk("error %d connecting to server\n", err);
+			/* TODO */
+#if 0
+			/* This will prevent anybody else from connecting */
+			rpc_delay(task, RPC_REESTABLISH_TIMEOUT);
+			task->tk_status = status;
+#endif
+			break;
+	}
+
+out:
+	if (err < 0) {
+		if (net) {
+			spin_lock(&net->sock_lock);
+			if (net->sock)
+				netprintk0("wha?! there's a socket there already!!!!\n");
+			net->flags &= ~NET_FLAG_CREATING_SOCKET;
+			spin_unlock(&net->sock_lock);
+		}
+	       	if (sock) 
+			sock_release(sock);
+	} else {
+		/* add this inode to the receive list, if not already */
+		spin_lock(&net_list_lock);
+		if (list_empty(&net->list))
+			list_add_tail(&net->list, &net_recv_list);
+		spin_unlock(&net_list_lock);
+	}
+
+	return err;
+}
+
+
+
+/*
+ * net_init_tcp_recv_sock()
+ *
+ */
+static int net_init_tcp_recv_sock(void)
+{
+	struct sockaddr_in sin;
+	int status = -EINVAL;
+
+	/* Create Receive Socket */
+	status = sock_create(net_ip_version_to_family(ip_version),
+			     SOCK_STREAM, IPPROTO_TCP,
+			     &recv_sock);
+	if (status < 0) {
+		netprintk ("unable to create socket, error=%d\n", status);
+		goto bail;
+	}
+
+
+	/* Bind Receive Socket */
+	memset(&sin, 0, sizeof(sin));
+	sin.sin_family = net_ip_version_to_family(ip_version);
+	sin.sin_addr.s_addr = htonl(INADDR_ANY);
+	sin.sin_port = ip_port;
+
+	status = recv_sock->ops->bind(recv_sock,
+					 (struct sockaddr *)&sin,
+					 sizeof(sin));
+	if (status < 0) {
+		netprintk ("unable to bind socket to port %d, error=%d\n", 
+			ntohs(ip_port), status);
+		goto bail;
+	}
+
+	/* !!! dunno about these... */
+	recv_sock->sk->reuse = 1;
+	status = recv_sock->ops->listen(recv_sock, 64);
+
+bail:
+	return status;
+}				/* net_init_tcp_recv_sock */
+
+
+static void net_release_tcp_sock(void)
+{
+	if (recv_sock) {
+		sock_release (recv_sock);
+		recv_sock = NULL;
+	}
+}
+
+
+module_init (net_driver_entry);
+module_exit (net_driver_exit);

Added: branches/dlm-glue/cluster/tcp.h
===================================================================
--- branches/dlm-glue/cluster/tcp.h	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/tcp.h	2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,236 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * tcp.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_TCP_H
+#define CLUSTER_TCP_H
+
+#include <linux/socket.h>
+#ifdef __KERNEL__
+#include <net/sock.h>
+#else
+#include <sys/socket.h>
+#endif
+#include <linux/inet.h>
+#include <linux/in.h>
+
+#include "nodemanager.h"
+
+
+#ifdef __KERNEL__
+
+#define NET_DISP_THREAD_MS   5000   /* TODO */
+#define NET_RECV_THREAD_MS   5000   /* TODO */
+
+#ifdef BIG_NET_MSG
+#define NET_MSG_MAGIC           ((u32)0xbc0ffa55)
+#define NET_MSG_STATUS_MAGIC    ((u32)0xbc0ffa56)
+#define NET_MSG_NUM_MAX         ((u64)0xffffffffffffffffULL)
+typedef struct _net_msg
+{
+	__u32 magic;
+	__u32 data_len;
+	__u16 src_node;
+	__u16 dst_node;
+	__u32 msg_type;
+	__u32 key;
+	__s32 status;
+	__u64 msg_num;
+	__u8  buf[0];
+} net_msg;
+#else
+
+#define NET_MSG_MAGIC           ((u16)0xfa55)
+#define NET_MSG_STATUS_MAGIC    ((u16)0xfa56)
+#define NET_MSG_NUM_MAX         ((u32)0xffffffffUL)
+typedef struct _net_msg
+{
+	__u16 magic;
+	__u16 data_len;
+	__u16 msg_type;
+	__s16 status;
+	__u32 key;
+	__u32 msg_num;
+	__u8  buf[0];
+} net_msg;
+
+#endif
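+/* In both layouts buf[0] is a zero-length array: a payload of data_len
+ * bytes follows the fixed header on the wire, so a full packet is
+ * sizeof(net_msg) + data_len bytes. */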
+
+typedef int (net_msg_handler_func)(net_msg *msg, u32 len, void *data);
+
+typedef struct _net_msg_handler
+{
+	struct list_head list;
+	u32 msg_type;
+	u32 key;
+	net_msg_handler_func *func;
+	void *data;
+	net_msg hdr;
+	u32 max_len;
+	void *buf;
+	spinlock_t lock;
+	atomic_t refcnt;
+	int flags;
+} net_msg_handler;
+
+typedef struct _net_status_ctxt
+{
+	struct list_head list;
+	s32 status;
+	u64 msg_num;
+	wait_queue_head_t wq;
+	atomic_t woken;
+} net_status_ctxt;
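+/* One net_status_ctxt lives on the sender's stack for each in-flight
+ * message that requested a status return; the receive path matches an
+ * incoming NET_MSG_STATUS_MAGIC header to it by msg_num in
+ * net_do_status_return() and wakes the sleeping sender. */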
+
+void net_do_status_return(u64 msg_num, s32 status);
+
+/* no clue for these yet... */
+#define NET_MIN_MSG_LEN  (0)
+#define NET_MAX_MSG_LEN  (8192)
+
+#define NET_ALREADY_CONNECTED   2
+#define NET_UNKNOWN_HOST        3
+
+
+static inline int net_is_valid_error_type(u32 err_type)
+{
+	if (err_type == NET_ALREADY_CONNECTED ||
+	    err_type == NET_UNKNOWN_HOST)
+		return 1;
+	return 0;
+}
+
+enum {
+	NET_HND_VAR_LEN = 0,
+	NET_HND_IN_USE,
+};
+
+#define net_handler_variable_len(h)   ((h)->flags & (1 << NET_HND_VAR_LEN))
+#define net_handler_in_use(h)         ((h)->flags & (1 << NET_HND_IN_USE))
+
+/* note the inverted sense: returns nonzero when LEN is *not* acceptable
+ * for this handler */
+static inline int net_handler_msg_len_ok(net_msg_handler *handler, u32 len)
+{
+	return (net_handler_variable_len(handler) ? 
+		len > handler->max_len : len != handler->max_len);
+}
+
+
+static inline int net_ip_version_to_family(u16 ip_version)
+{
+	/* TODO: only IPv4 is supported for now, so short-circuit here;
+	 * the switch below is dead code until IPv6 support is wired up */
+	return PF_INET;
+
+	switch (ntohs(ip_version)) {
+		case 4:
+			return PF_INET;
+		case 6:
+			return PF_INET6;
+		default:
+			BUG();
+	}
+
+	return PF_INET;
+}
+
+
+
+/* TODO: figure this out.... */
+static inline int net_link_down(int err, struct socket *sock)
+{
+	if (sock) {
+		if (sock->sk->state != TCP_ESTABLISHED &&
+	    	    sock->sk->state != TCP_CLOSE_WAIT)
+			return 1;
+	}
+
+	if (err >= 0)
+		return 0;
+	switch (err) {
+		/* ????????????????????????? */
+		case -ERESTARTSYS:
+		case -EBADF:
+		/* When the server has died, an ICMP port unreachable 
+		 * message prompts ECONNREFUSED. */
+		case -ECONNREFUSED:
+		case -ENOTCONN:
+		case -ECONNRESET:
+		case -EPIPE:
+			return 1;
+	}
+	return 0;
+}
+
+enum {
+	NET_DRIVER_UNINITED,
+	NET_DRIVER_READY,
+};
+
+
+int net_register_handler(u32 msg_type, u32 key, int flags, 
+			 u32 max_len, net_msg_handler_func *func, void *data, void *buf);
+net_msg * net_package_message(u32 msg_type, u32 key, void *data, u32 len);
+int net_recv_tcp_msg (struct inode *inode, struct socket *sock, void *data, u32 *packet_len);
+int net_send_tcp_msg (struct inode *inode, struct socket *sock, void *data, u32 packet_len);
+int net_send_error(struct socket *sock, u32 err_type);
+int net_init_tcp_sock(struct inode *inode);
+int net_send_message(u32 msg_type, u32 key, void *data, u32 len, struct inode *inode, int *status);
+int net_broadcast_message(u32 msg_type, u32 key, void *data, u32 len, struct inode *group);
+net_msg_handler * net_lookup_handler(u32 msg_type, u32 key);
+
+#endif /* __KERNEL__ */
+
+typedef struct _net_ioc
+{
+	u32 status;
+} net_ioc;
+
+typedef struct _gsd_ioc
+{
+	int fd;
+	int namelen;
+	char name[NM_MAX_NAME_LEN+1];
+	int status;
+} gsd_ioc;
+
+#define  NET_IOC_MAGIC          'O'
+#define  NET_IOC_ACTIVATE       _IOR(NET_IOC_MAGIC, 1, net_ioc)
+#define  NET_IOC_GETSTATE       _IOR(NET_IOC_MAGIC, 2, net_ioc)
+#define  GSD_IOC_CREATE_GROUP   _IOR(NET_IOC_MAGIC, 3, gsd_ioc)
+#define  GSD_IOC_ADD_GROUP_NODE _IOR(NET_IOC_MAGIC, 4, gsd_ioc)
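+/* Userspace drives these through ioctl(2) on the net proc file.  A hedged
+ * sketch (error handling omitted; assumes the "cluster/net" proc entry
+ * created at module load):
+ *
+ *	net_ioc n = { 0 };
+ *	int fd = open("/proc/cluster/net", O_RDONLY);
+ *	ioctl(fd, NET_IOC_ACTIVATE, &n);
+ *	// n.status is now NET_DRIVER_READY or NET_DRIVER_UNINITED
+ */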
+
+#define GSD_MESSAGE   130
+#define GSD_ACTION_ADD_GROUP        (0x01)
+#define GSD_ACTION_ADD_GROUP_NODE   (0x02)
+
+typedef struct _gsd_message
+{
+	u16 from;
+	u8 action;
+	u8 namelen;
+	u8 name[NM_MAX_NAME_LEN];
+} gsd_message;
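+/* a gsd_message is sent whole (sizeof(gsd_message) bytes) as the payload
+ * of a GSD_MESSAGE net message; "from" is the sender's global node number */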
+
+#endif /* CLUSTER_TCP_H */

Added: branches/dlm-glue/cluster/test.c
===================================================================
--- branches/dlm-glue/cluster/test.c	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/test.c	2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,811 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * test.c
+ *
+ * test module
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <net/sock.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/proc_fs.h>
+
+#include <asm/uaccess.h>
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+#include "dlmmod.h"
+
+#include "compat_libfs.h"
+
+#define testprintk(x, arg...)    printk("TEST: (%d) " x, current->pid, ##arg)
+#define testprintk0(x)           printk("TEST: (%d) " x, current->pid)
+
+
+static ssize_t write_net_register(struct file *file, char *buf, size_t size);
+static ssize_t write_net_send(struct file *file, char *buf, size_t size);
+static ssize_t write_net_get_num(struct file *file, char *buf, size_t size);
+static ssize_t write_dlm_poop(struct file *file, char *buf, size_t size);
+static ssize_t write_dlm_poop2(struct file *file, char *buf, size_t size);
+static ssize_t write_dlm_poop3(struct file *file, char *buf, size_t size);
+static ssize_t write_dlm_register(struct file *file, char *buf, size_t size);
+
+enum {
+	TEST_Root = 1,
+	TEST_NetRegister,
+	TEST_NetSend,
+	TEST_NetGetNum,
+	TEST_DLMPoop,
+	TEST_DLMPoop2,
+	TEST_DLMPoop3,
+	TEST_DLMRegister
+};
+
+extern spinlock_t net_state_lock;
+extern u32 net_driver_state;
+extern struct file_operations transaction_ops;
+extern char *nm_nodename;
+extern u32 net_num_dispatched;
+
+
+static void test_teardown(void);
+
+int test_small_msg_func(net_msg *msg, u32 len, void *data);
+
+static int test_net_send(int arg);
+static int test_net_register(int arg);
+static int test_net_get_num(int arg);
+static int test_dlm_poop(int arg);
+static int test_dlm_poop2(int arg);
+static int test_dlm_poop3(int arg);
+static int test_dlm_register(int arg);
+
+
+
+int test_small_msg_func(net_msg *msg, u32 len, void *data)
+{
+	testprintk("got a message!  type=%u, len=%u, data=%d\n", msg->msg_type, len, *(int *)data);
+	return 0;
+}
+
+#define TEST_MSG_TYPE1    87654321
+#define TEST_KEY1         12378534
+	
+int test_data1 = 723123123;
+
+static int test_net_register(int arg)
+{
+	int ret;
+	struct inode *dest_inode;
+	u16 dest_node_num = (u16)arg;
+
+	testprintk("running test_net_register: will contact node %u\n", dest_node_num);
+
+	dest_inode = nm_get_node_by_num(dest_node_num);
+	if (!dest_inode) {
+		testprintk("eeek! failed to find node %u\n", dest_node_num);
+		return 0;
+	}
+	{       
+		struct dentry *dentry = list_entry(dest_inode->i_dentry.next, struct dentry, d_alias);
+		testprintk("found node %u, name %*s\n", dest_node_num, dentry->d_name.len, dentry->d_name.name);
+	}
+
+	ret = net_register_handler(TEST_MSG_TYPE1, TEST_KEY1, 0, 0,
+				   test_small_msg_func, &test_data1, NULL);
+	if (ret < 0) {
+		testprintk0("eek!  register failed!\n");
+		return -1;
+	}
+	ret = net_register_handler(TEST_MSG_TYPE1, TEST_KEY1, 0, 0,
+				   test_small_msg_func, &test_data1, NULL);
+	if (ret >= 0) {
+		testprintk0("eek!  re-register was supposed to fail but didnt!!!\n");
+		return -1;
+	}
+	testprintk0("sweet.  re-register failed like it should have.\n");
+
+	testprintk0("creating socket now...\n");
+	ret = net_init_tcp_sock(dest_inode);
+	if (ret < 0) {
+		testprintk0("failed to make socket\n");
+		return -1;
+	}
+	testprintk("net_init_tcp_sock returned %d\n", ret);
+
+	testprintk0("leaving test_net_register!\n");
+	return 0;
+}
+
+
+static int test_net_send(int arg)
+{
+	int ret;
+	struct inode *dest_inode;
+	u16 dest_node_num = (u16)arg;
+
+	testprintk("running test_net_send: will contact node %u\n", dest_node_num);
+
+	dest_inode = nm_get_node_by_num(dest_node_num);
+	if (!dest_inode) {
+		testprintk("eeek! failed to find node %u\n", dest_node_num);
+		return 0;
+	}
+	{
+		struct dentry *dentry = list_entry(dest_inode->i_dentry.next, struct dentry, d_alias);
+		testprintk("found node %u, name %*s\n", dest_node_num, dentry->d_name.len, dentry->d_name.name);
+	}
+
+	testprintk0("packaging message now\n");
+
+	{
+		testprintk0("woo!  made a message packet... lets try sending it to ourself...\n");
+		testprintk0("waiting for socket to be created\n");
+		while (1) {
+			printk(".");
+			spin_lock(&net_state_lock);
+			if (net_driver_state == NET_DRIVER_READY) {
+				spin_unlock(&net_state_lock);
+				break;
+			}
+			spin_unlock(&net_state_lock);
+			util_sleep (100);
+		}
+		printk(".  done... let's go!\n");
+		ret = net_send_message(TEST_MSG_TYPE1, TEST_KEY1, NULL, 0, dest_inode, NULL);
+		testprintk("sent!!!! ret=%d\n", ret);
+	}
+	testprintk0("leaving test_net_send!\n");
+	return 0;
+	
+}
+
+static int test_net_get_num(int arg)
+{
+	testprintk("number of messages dispatched: %u\n", net_num_dispatched);
+	return 0;
+}
+
+void my_ast(void *data);
+void my_bast(void *data, int blocked_type);
+	
+dlm_lockstatus lksb1, lksb2;
+wait_queue_head_t convert_wq;
+atomic_t convert_flag;
+
+dlm_ctxt *the_dlm = NULL;
+
+static int test_dlm_poop(int arg)
+{
+	testprintk("calling dlm_dump_dlm(%p)\n", the_dlm);
+	if (the_dlm)
+		dlm_dump_dlm(the_dlm);
+
+#if 0
+	dlm_ctxt *dlm;
+	dlm_status status;
+	void *data1 = &lksb1;
+	void *data2 = &lksb2;
+	int ret;
+
+	memset(&lksb1, 0, sizeof(dlm_lockstatus));
+	memset(&lksb2, 0, sizeof(dlm_lockstatus));
+
+	testprintk0("calling dlm_register_domain...\n");
+	dlm = dlm_register_domain("mylittledomain", "grupo2", 0x6543abcd);
+	testprintk("dlm_register_domain returned %p\n", dlm);
+
+	testprintk0("calling dlmlock...\n");
+	status = dlmlock(dlm, LKM_EXMODE, &lksb1, 0, "lock1", my_ast, data1, my_bast);
+	testprintk("dlmlock returned %d.  lksb.status=%d, lock=%p\n", status, lksb1.status, lksb1.lockid);
+
+	testprintk0("calling dlmlock to do a convert...\n");
+	status = dlmlock(dlm, LKM_PRMODE, &lksb1, LKM_CONVERT, "lock1", my_ast, data1, my_bast);
+	testprintk("dlmlock returned %d\n", status);
+
+	init_waitqueue_head (&convert_wq);
+	atomic_set(&convert_flag, 0);
+
+	testprintk0("calling second dlmlock...\n");
+	status = dlmlock(dlm, LKM_EXMODE, &lksb2, 0, "lock1", my_ast, data2, my_bast);
+	testprintk("dlmlock returned %d.  lksb.status=%d, lock=%p\n", status, lksb2.status, lksb2.lockid);
+
+	testprintk0("sleeping now!\n");
+	ret = util_wait_atomic_eq(&convert_wq, &convert_flag, 1, 20000);
+	testprintk("wait returned %d\n", ret);
+
+	testprintk0("calling dlmlock to do a convert the blocking lock to NL...\n");
+	status = dlmlock(dlm, LKM_NLMODE, &lksb1, LKM_CONVERT, "lock1", my_ast, data2, my_bast);
+	testprintk("dlmlock returned %d\n", status);
+
+	testprintk0("sleeping\n");
+	util_sleep(10000);
+	testprintk0("DONE!\n");
+#endif
+	return 0;
+}
+
+
+void my_ast(void *data)
+{
+	dlm_lockstatus *l = data;
+	dlm_lock *lock = l->lockid;
+	dlm_lock_resource *res = lock->lockres;
+
+	testprintk("AST!!!:   lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n", 
+	       l, lock, res, res->lockname.len, res->lockname.name, lock->type);
+}
+
+void my_bast(void *data, int blocked_type)
+{
+	dlm_lockstatus *l = data;
+	dlm_lock *lock = l->lockid;
+	dlm_lock_resource *res = lock->lockres;
+
+	testprintk("BAST!!!:   blocked=%d, lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n", 
+	       blocked_type, l, lock, res, res->lockname.len, res->lockname.name, lock->type);
+	atomic_set(&convert_flag, 1);
+	wake_up(&convert_wq);
+}
+
+atomic_t finish;
+
+typedef struct _poo
+{
+	struct task_struct *task;
+	dlm_ctxt *dlm;
+	dlm_lockstatus *lksb;
+	wait_queue_head_t wq;
+	atomic_t ast_flag;
+	atomic_t bast_flag;
+	struct completion complete;
+} poo;
+void my_ast2(void *data);
+void my_bast2(void *data, int blocked_type);
+int test_dlm_thread(void *data);
+atomic_t asts_fired, basts_fired;
+
+typedef union _my_timing_t
+{
+	__u64 q;
+	__u32 lohi[2];
+} my_timing_t;
+
+
+static int test_dlm_poop2(int arg)
+{
+	dlm_ctxt *dlm;
+	int pid1, pid2;
+	poo *poo1, *poo2;
+	my_timing_t t1, t2, t3;
+
+	poo1 = kmalloc(sizeof(poo), GFP_KERNEL);
+	testprintk("poo1=%p\n", poo1);
+	poo2 = kmalloc(sizeof(poo), GFP_KERNEL);
+	testprintk("poo2=%p\n", poo2);
+	if (!poo1 || !poo2) {
+		kfree(poo1);	/* kfree(NULL) is a no-op */
+		kfree(poo2);
+		return -ENOMEM;
+	}
+
+	atomic_set(&finish, 0);
+	atomic_set(&asts_fired, 0);
+	atomic_set(&basts_fired, 0);
+
+	testprintk0("calling dlm_register_domain...\n");
+	dlm = dlm_register_domain("mylittledomain", "grupo2", 0x6543abcd);
+	testprintk("dlm_register_domain returned %p\n", dlm);
+	
+	poo1->dlm = dlm;
+	poo2->dlm = dlm;
+	init_completion(&poo1->complete);
+	init_completion(&poo2->complete);
+
+	rdtsc(t1.lohi[0], t1.lohi[1]);
+	pid1 = kernel_thread (test_dlm_thread, poo1, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+	if (pid1 < 0) {
+		printk("unable to launch thread, error=%d", pid1);
+		return -EINVAL;
+	}
+	pid2 = kernel_thread (test_dlm_thread, poo2, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+	if (pid2 < 0) {
+		printk("unable to launch thread, error=%d", pid2);
+		return -EINVAL;
+	}
+	testprintk("dlm threads running for %s...\n", dlm->name);
+	testprintk("poo1->dlm=%p, ->task=%p\n", poo1->dlm, poo1->task);
+	testprintk("poo2->dlm=%p, ->task=%p\n", poo2->dlm, poo2->task);
+	//testprintk("poo1->dlm=%p, ->task=%p\n", poo1->dlm, poo1->task);
+	//testprintk("poo2->dlm=%p, ->task=%p\n", poo2->dlm, poo2->task);
+	//testprintk("sending sigint now...\n");
+	//send_sig (SIGINT, poo1->task, 0);
+	//send_sig (SIGINT, poo2->task, 0);
+	//atomic_set(&finish, 1);
+	while (1) {
+		util_sleep(30000);
+		rdtsc(t3.lohi[0], t3.lohi[1]);
+		testprintk("another 30 sec: asts=%d, basts=%d, diff=%llu\n", 
+			   atomic_read(&asts_fired), atomic_read(&basts_fired), 
+			    t3.q - t1.q);
+		if (atomic_read(&finish)==1) {
+			printk("finish set!\n");
+			break;
+		}
+	}
+	wait_for_completion (&poo1->complete);
+	wait_for_completion (&poo2->complete);
+	rdtsc(t2.lohi[0], t2.lohi[1]);
+	kfree(poo1);
+	kfree(poo2);
+	testprintk("leaving!   asts=%d, basts=%d, diff=%llu\n", atomic_read(&asts_fired), atomic_read(&basts_fired), 
+		    t2.q - t1.q);
+	return 0;
+}
+
+
+int test_dlm_thread(void *data)
+{
+	dlm_status status;
+	int ret;
+	dlm_lockstatus *lksb;
+	poo *mypoo = data;
+	dlm_ctxt *dlm = mypoo->dlm;
+
+	testprintk("mypoo=%p, dlm=%p\n", mypoo, dlm);
+	mypoo->task = current;
+	lksb = kmalloc(sizeof(dlm_lockstatus), GFP_KERNEL);
+	if (!lksb) {
+		/* wake the waiting parent even on failure so it doesn't hang */
+		atomic_set(&finish, 1);
+		complete(&mypoo->complete);
+		return -ENOMEM;
+	}
+	memset(lksb, 0, sizeof(dlm_lockstatus));
+
+	mypoo->lksb = lksb;
+	init_waitqueue_head(&mypoo->wq);
+
+	atomic_set(&mypoo->ast_flag, 0);
+	atomic_set(&mypoo->bast_flag, 0);
+	
+	testprintk("mypoo=%p, dlm=%p, task=%p\n", mypoo, dlm, mypoo->task);
+
+	testprintk("calling dlmlock(%p, %d, %p, 0, \"lock1\", %p, %p, %p) to create the lock...\n",
+		    dlm, LKM_EXMODE, lksb, my_ast2, data, my_bast2);
+	status = dlmlock(dlm, LKM_EXMODE, lksb, 0, "lock1", my_ast2, data, my_bast2);
+	testprintk("dlmlock returned %d.  lksb.status=%d, lock=%p\n", status, lksb->status, lksb->lockid);
+
+again:
+	ret = util_wait_atomic_eq(&mypoo->wq, &mypoo->ast_flag, 1, 0);
+	if (ret < 0) {
+		testprintk("1: waiting on ast converting to EX, ret=%d, type=%d, convtype=%d\n", 
+		       ret, lksb->lockid->type, lksb->lockid->convert_type);
+		if (ret == -EINTR)
+			goto leave;
+		goto again;
+	}
+	atomic_set(&mypoo->ast_flag, 0);
+
+
+
+wait_bast:	
+	ret = util_wait_atomic_eq(&mypoo->wq, &mypoo->bast_flag, 1, 0);
+	if (ret < 0) {
+		testprintk("2: waiting on bast after converting to EX, ret=%d, type=%d, convtype=%d\n", 
+		       ret, lksb->lockid->type, lksb->lockid->convert_type);
+		if (ret == -EINTR)
+			goto leave;
+		goto wait_bast;
+	}
+	atomic_set(&mypoo->bast_flag, 0);
+
+
+
+
+	atomic_set(&mypoo->ast_flag, 0);
+
+	status = dlmlock(dlm, LKM_NLMODE, lksb, LKM_CONVERT, "lock1", my_ast2, data, my_bast2);
+
+wait_ast:
+	ret = util_wait_atomic_eq(&mypoo->wq, &mypoo->ast_flag, 1, 0);
+	if (ret < 0) {
+		testprintk("3: waiting on ast converting to NL, ret=%d, type=%d, convtype=%d\n", 
+		       ret, lksb->lockid->type, lksb->lockid->convert_type);
+		if (ret == -EINTR)
+			goto leave;
+		goto wait_ast;
+	}
+
+	atomic_set(&mypoo->ast_flag, 0);
+	atomic_set(&mypoo->bast_flag, 0);
+
+	status = dlmlock(dlm, LKM_EXMODE, lksb, LKM_CONVERT, "lock1", my_ast2, data, my_bast2);
+
+
+	if (atomic_read(&finish) == 0)
+		goto again;
+leave:
+
+	atomic_set(&finish, 1);
+	kfree(mypoo->lksb);
+	complete (&mypoo->complete);
+	testprintk0("exiting thread\n");
+	return 0;
+}
+
+
+void my_ast2(void *data)
+{
+	poo *mypoo = data;
+	dlm_lockstatus *l = mypoo->lksb;
+	dlm_lock *lock = l->lockid;
+	dlm_lock_resource *res = lock->lockres;
+
+	atomic_inc(&asts_fired);
+	//testprintk("AST!!!:   lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n", 
+	//       l, lock, res, res->lockname.len, res->lockname.name, lock->type);
+	atomic_set(&mypoo->ast_flag, 1);
+	wake_up(&mypoo->wq);
+}
+
+void my_bast2(void *data, int blocked_type)
+{
+	poo *mypoo = data;
+	dlm_lockstatus *l = mypoo->lksb;
+	dlm_lock *lock = l->lockid;
+	dlm_lock_resource *res = lock->lockres;
+
+	atomic_inc(&basts_fired);
+	//testprintk("BAST!!!:   blocked=%d, lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n", 
+	//       blocked_type, l, lock, res, res->lockname.len, res->lockname.name, lock->type);
+	atomic_set(&mypoo->bast_flag, 1);
+	wake_up(&mypoo->wq);
+}
+
+wait_queue_head_t wq3;
+atomic_t ast_flag3, bast_flag3;
+dlm_lockstatus *lksb3;
+
+void my_bast3(void *data, int blocked_type);
+void my_ast3(void *data);
+
+void my_ast3(void *data)
+{
+	dlm_lock *lock = lksb3->lockid;
+	dlm_lock_resource *res = lock->lockres;
+
+	atomic_inc(&asts_fired);
+	testprintk("AST!!!:   lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n", 
+	       lksb3, lock, res, res->lockname.len, res->lockname.name, lock->type);
+	atomic_set(&ast_flag3, 1);
+	wake_up(&wq3);
+}
+
+void my_bast3(void *data, int blocked_type)
+{
+	dlm_lock *lock = lksb3->lockid;
+	dlm_lock_resource *res = lock->lockres;
+
+	atomic_inc(&basts_fired);
+	testprintk("BAST!!!:   blocked=%d, lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n", 
+	       blocked_type, lksb3, lock, res, res->lockname.len, res->lockname.name, lock->type);
+	atomic_set(&bast_flag3, 1);
+	wake_up(&wq3);
+}
+
+static int test_dlm_poop3(int arg)
+{
+	dlm_ctxt *dlm;
+	dlm_status status;
+	int ret, i;
+	my_timing_t t1, t2, t3, t4;
+
+	atomic_set(&finish, 0);
+	atomic_set(&asts_fired, 0);
+	atomic_set(&basts_fired, 0);
+
+	dlm = dlm_register_domain("mylittledomain", "grupo2", 0x6543abcd);
+
+	lksb3 = kmalloc(sizeof(dlm_lockstatus), GFP_KERNEL);
+	if (!lksb3)
+		return -ENOMEM;
+	memset(lksb3, 0, sizeof(dlm_lockstatus));
+
+	init_waitqueue_head(&wq3);
+
+	atomic_set(&ast_flag3, 0);
+	atomic_set(&bast_flag3, 0);
+
+	i = 0;
+	rdtsc(t1.lohi[0], t1.lohi[1]);
+
+	/* CREATE -> NL */	
+	testprintk0("creating lock\n");
+	rdtsc(t3.lohi[0], t3.lohi[1]);
+	status = dlmlock(dlm, LKM_NLMODE, lksb3, 0, "lock1", my_ast3, NULL, my_bast3);
+
+	while (1) {
+		testprintk("%d: waiting on ast\n", i);
+		ret = util_wait_atomic_eq(&wq3, &ast_flag3, 1, 0);
+		if (ret == -EINTR)
+			break;
+		rdtsc(t4.lohi[0], t4.lohi[1]);
+		testprintk("%d: ->NL took: %llu\n", i, t4.q - t3.q);
+		testprintk("%d: no bast for NL\n", i);
+	
+		atomic_set(&ast_flag3, 0);
+		atomic_set(&bast_flag3, 0);
+
+		if (i == 10) {
+			testprintk("%d: reached 10, goodbye\n", i);
+			break;
+		}
+		dlm_dump_dlm(dlm);
+	
+		/* CONVERT -> EX */	
+		testprintk("%d: converting dlmlock->EX\n", i);
+		rdtsc(t3.lohi[0], t3.lohi[1]);
+		status = dlmlock(dlm, LKM_EXMODE, lksb3, LKM_CONVERT, "lock1", my_ast3, NULL, my_bast3);
+
+		testprintk("%d: waiting on ast\n", i);
+		ret = util_wait_atomic_eq(&wq3, &ast_flag3, 1, 0);
+		if (ret == -EINTR)
+			break;
+		rdtsc(t4.lohi[0], t4.lohi[1]);
+		testprintk("%d: ->EX took: %llu\n", i, t4.q - t3.q);
+		atomic_set(&ast_flag3, 0);	
+	
+		testprintk("%d: waiting on bast\n", i);
+		ret = util_wait_atomic_eq(&wq3, &bast_flag3, 1, 0);
+		if (ret == -EINTR)
+			break;
+		atomic_set(&ast_flag3, 0);
+		atomic_set(&bast_flag3, 0);
+	
+		/* CONVERT -> NL */	
+		testprintk("%d: converting dlmlock->NL\n", i);
+		rdtsc(t3.lohi[0], t3.lohi[1]);
+		status = dlmlock(dlm, LKM_NLMODE, lksb3, LKM_CONVERT, "lock1", my_ast3, NULL, my_bast3);
+	
+		/* WAIT ON AST AGAIN */
+		i++;
+	}
+	
+	/* DOWNCONVERT LAST TIME */
+	/* TODO: replace with dlmunlock once implemented */
+	atomic_set(&ast_flag3, 0);
+	status = dlmlock(dlm, LKM_NLMODE, lksb3, LKM_CONVERT, "lock1", my_ast3, NULL, my_bast3);
+
+	/* wait for the final AST so my_ast3 cannot touch lksb3 after the free */
+	util_wait_atomic_eq(&wq3, &ast_flag3, 1, 0);
+
+	kfree(lksb3);
+
+	rdtsc(t2.lohi[0], t2.lohi[1]);
+	testprintk("leaving!   asts=%d, basts=%d, diff=%llu\n", atomic_read(&asts_fired), atomic_read(&basts_fired), 
+		    t2.q - t1.q);
+	return 0;
+}
+
+
+static int test_dlm_register(int arg)
+{
+	dlm_ctxt *dlm;
+
+	testprintk0("calling dlm_register_domain...\n");
+	dlm = dlm_register_domain("mylittledomain", "grupo2", 0x6543abcd);
+	testprintk("dlm_register_domain returned %p\n", dlm);
+
+	the_dlm = dlm;	
+	testprintk0("leaving!\n");
+	return 0;
+}
+
+
+
+
+/*
+ * module stuff
+ */
+
+
+static ssize_t write_net_register(struct file *file, char *buf, size_t size)
+{
+	int arg = 0, tmpret, ret;
+	if (size > 0)
+		arg = simple_strtoul(buf, NULL, 0);
+	printk("calling test_net_register(%d)\n", arg);
+	tmpret = test_net_register(arg);
+	ret = sprintf(buf, "test_net_register(%d) returned: %d\n", arg, tmpret);
+	return ret;
+}
+static ssize_t write_net_send(struct file *file, char *buf, size_t size)
+{
+	int arg = 0, tmpret, ret;
+	if (size > 0)
+		arg = simple_strtoul(buf, NULL, 0);
+	printk("calling test_net_send(%d)\n", arg);
+	tmpret = test_net_send(arg);
+	ret = sprintf(buf, "test_net_send(%d) returned: %d\n", arg, tmpret);
+	return ret;
+}
+static ssize_t write_net_get_num(struct file *file, char *buf, size_t size)
+{
+	int arg = 0, tmpret, ret;
+	if (size > 0)
+		arg = simple_strtoul(buf, NULL, 0);
+	printk("calling test_net_get_num(%d)\n", arg);
+	tmpret = test_net_get_num(arg);
+	ret = sprintf(buf, "test_net_get_num(%d) returned: %d\n", arg, tmpret);
+	return ret;
+}
+static ssize_t write_dlm_poop(struct file *file, char *buf, size_t size)
+{
+	int arg = 0, tmpret, ret;
+	if (size > 0)
+		arg = simple_strtoul(buf, NULL, 0);
+	printk("calling test_dlm_poop(%d)\n", arg);
+	tmpret = test_dlm_poop(arg);
+	ret = sprintf(buf, "test_dlm_poop(%d) returned: %d\n", arg, tmpret);
+	return ret;
+}
+static ssize_t write_dlm_poop2(struct file *file, char *buf, size_t size)
+{
+	int arg = 0, tmpret, ret;
+	if (size > 0)
+		arg = simple_strtoul(buf, NULL, 0);
+	printk("calling test_dlm_poop2(%d)\n", arg);
+	tmpret = test_dlm_poop2(arg);
+	ret = sprintf(buf, "test_dlm_poop2(%d) returned: %d\n", arg, tmpret);
+	return ret;
+}
+static ssize_t write_dlm_poop3(struct file *file, char *buf, size_t size)
+{
+	int arg = 0, tmpret, ret;
+	if (size > 0)
+		arg = simple_strtoul(buf, NULL, 0);
+	printk("calling test_dlm_poop3(%d)\n", arg);
+	tmpret = test_dlm_poop3(arg);
+	ret = sprintf(buf, "test_dlm_poop3(%d) returned: %d\n", arg, tmpret);
+	return ret;
+}
+
+static ssize_t write_dlm_register(struct file *file, char *buf, size_t size)
+{
+	int arg = 0, tmpret, ret;
+	if (size > 0)
+		arg = simple_strtoul(buf, NULL, 0);
+	printk("calling test_dlm_register(%d)\n", arg);
+	tmpret = test_dlm_register(arg);
+	ret = sprintf(buf, "test_dlm_register(%d) returned: %d\n", arg, tmpret);
+	return ret;
+}
+
+
+
+
+
+/*----------------------------------------------------------------------------*/
+/*
+ *	populating the filesystem.
+ */
+static int test_fill_super(struct super_block * sb, void * data, int silent)
+{	
+	int ret, sz;
+	TA_write_ops *ops;
+	static struct tree_descr test_files[] = {
+		[TEST_NetRegister] = {"net-register", &transaction_ops, S_IWUSR},
+		[TEST_NetSend] = {"net-send", &transaction_ops, S_IWUSR},
+		[TEST_NetGetNum] = {"net-get-num", &transaction_ops, S_IWUSR},
+		[TEST_DLMPoop] = {"dlm-poop", &transaction_ops, S_IWUSR},
+		[TEST_DLMPoop2] = {"dlm-poop2", &transaction_ops, S_IWUSR},
+		[TEST_DLMPoop3] = {"dlm-poop3", &transaction_ops, S_IWUSR},
+		[TEST_DLMRegister] = {"dlm-register", &transaction_ops, S_IWUSR},
+		/* last one */ {""}
+	};
+	
+	sz = sizeof(test_files) / sizeof(struct tree_descr);
+	ops = kmalloc(sizeof(TA_write_ops) + (sz * sizeof(TA_write_op *)), GFP_KERNEL);
+	if (!ops)
+		return -ENOMEM;
+
+	memset(ops, 0, sizeof(TA_write_ops) + (sz * sizeof(TA_write_op *)));
+	ops->num_ops = sz;
+	ops->write_op[TEST_NetRegister] = write_net_register;
+	ops->write_op[TEST_NetSend] = write_net_send;
+	ops->write_op[TEST_NetGetNum] = write_net_get_num;
+	ops->write_op[TEST_DLMPoop] = write_dlm_poop;
+	ops->write_op[TEST_DLMPoop2] = write_dlm_poop2;
+	ops->write_op[TEST_DLMPoop3] = write_dlm_poop3;
+	ops->write_op[TEST_DLMRegister] = write_dlm_register;
+
+	printk("calling simple_fill_super...\n");
+	ret = simple_fill_super(sb, 0x12beAf00L, test_files);
+	if (ret >= 0) {
+		TA_GENERIC_SB_MEMBER(sb) = ops;
+	} else {
+		kfree(ops);
+	}
+	return ret;
+}
+
+static struct super_block *test_read_super (struct super_block *sb, void *data, int silent)
+{
+	printk("welcome to test_read_super!!!\n");
+	return (test_fill_super(sb, data, silent) < 0) ? NULL : sb;
+}
+
+
+static DECLARE_FSTYPE (test_fs_type, "test", test_read_super, FS_SINGLE|FS_LITTER);
+
+static void test_teardown(void);
+
+static int __init init_test(void)
+{
+	int retval;
+	void *ret;
+
+	printk("loading test module: nodename is %s\n", nm_nodename);
+
+	ret = proc_mkdir("cluster/test", NULL);
+	printk("proc_mkdir of cluster/test returned %p\n", ret);
+
+	printk("calling register_filesystem\n");
+	retval = register_filesystem(&test_fs_type);
+	printk("done calling register_filesystem: ret=%d\n", retval);
+	if (retval) {
+		printk("oopsy that did not work\n");
+		test_teardown();
+	} else
+		printk("woot.  good to go.\n");
+	return retval;
+}
+
+static void __exit exit_test(void)
+{
+	test_teardown();
+	unregister_filesystem(&test_fs_type);
+	printk("unloading test module\n");
+}
+
+static void test_teardown(void)
+{
+	printk("removing cluster/test\n");
+	remove_proc_entry("cluster/test", NULL);
+}
+
+
+
+
+
+MODULE_LICENSE("GPL");
+module_init(init_test);
+module_exit(exit_test);
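
(For reference: the transaction files above are driven by writing the test
argument to the matching file, with the "test" filesystem mounted somewhere
convenient.  A rough userspace sketch follows -- the /proc/cluster/test
mount point is an assumption, not something this commit guarantees.)

/* hypothetical driver for the test files above; adjust the path to
 * wherever the "test" filesystem is actually mounted */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *arg = (argc > 1) ? argv[1] : "0";
	int fd = open("/proc/cluster/test/dlm-poop3", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* the write handler parses the argument, runs the test, and
	 * sprintf()s a one-line reply back into the buffer */
	if (write(fd, arg, strlen(arg)) < 0)
		perror("write");
	close(fd);
	return 0;
}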

Added: branches/dlm-glue/cluster/util.c
===================================================================
--- branches/dlm-glue/cluster/util.c	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/util.c	2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,349 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * util.c
+ *
+ * General purpose code
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include "util.h"
+
+static void util_timeout_func(unsigned long data);
+
+/* block all but 'mask' sigs, optionally saving off our previous
+ * signal state. */
+void util_block_sigs(sigset_t *oldsigs, unsigned long mask)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+	sigset_t tmpsig;
+
+	siginitsetinv(&tmpsig, mask);
+	sigprocmask(SIG_BLOCK, &tmpsig, oldsigs);
+#else
+#ifdef HAVE_NPTL
+	spin_lock_irq (&current->sighand->siglock);
+	if (oldsigs)
+		*oldsigs = current->blocked;
+	siginitsetinv (&current->blocked, mask);
+	recalc_sigpending ();
+	spin_unlock_irq (&current->sighand->siglock);
+#else
+	spin_lock_irq (&current->sigmask_lock);
+	if (oldsigs)
+		*oldsigs = current->blocked;
+	siginitsetinv (&current->blocked, mask);
+	recalc_sigpending (current);
+	spin_unlock_irq (&current->sigmask_lock);
+#endif
+#endif
+}
+
+void util_unblock_sigs(sigset_t newsig)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+	sigprocmask(SIG_SETMASK, &newsig, NULL);
+#else
+#ifdef HAVE_NPTL
+	spin_lock_irq (&current->sighand->siglock);
+	current->blocked = newsig;
+	recalc_sigpending ();
+	spin_unlock_irq (&current->sighand->siglock);
+#else
+	spin_lock_irq (&current->sigmask_lock);
+	current->blocked = newsig;
+	recalc_sigpending (current);
+	spin_unlock_irq (&current->sigmask_lock);
+#endif
+#endif
+}
+
+/*
+ * util_daemonize() 
+ *
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+/* yes, len is unused but kept here for backwards compatibility. */
+void util_daemonize (char *name, int len, int shutdown_sigs)
+{
+	sigset_t tmpsig;
+
+	daemonize (name);
+
+	if (shutdown_sigs) {
+		/* Unblock SIGKILL, SIGHUP, SIGINT and SIGQUIT */
+		sigemptyset(&tmpsig);
+		sigaddsetmask(&tmpsig, SHUTDOWN_SIGS);
+		sigprocmask(SIG_UNBLOCK, &tmpsig, NULL);
+	}
+
+	return;
+}				/* util_daemonize */
+#else
+void util_daemonize (char *name, int len, int shutdown_sigs)
+{
+	daemonize ();
+	reparent_to_init ();
+
+	if (len > 0) {
+		if (len > 15)
+			BUG();
+		strncpy (current->comm, name, len);
+		current->comm[len] = '\0';
+	}
+
+	if (shutdown_sigs)
+		util_block_sigs(NULL, SHUTDOWN_SIGS);
+	else
+		util_block_sigs(NULL, 0);
+	return;
+}				/* util_daemonize */
+#endif
+
+/*
+ * util_sleep()
+ *
+ * The interval time is in milliseconds
+ *
+ * This function needs to be removed.
+ * Instead call schedule_timeout() directly and handle signals.
+ */
+int util_sleep (__u32 ms)
+{
+	__u32 numJiffies;
+
+	/* minimum resolution is one jiffy (10ms when HZ=100) */
+	numJiffies = ms * HZ / 1000;
+	numJiffies = (numJiffies < 1) ? 1 : numJiffies;
+
+	set_current_state (TASK_INTERRUPTIBLE);
+	numJiffies = schedule_timeout (numJiffies);
+
+	return 0;
+}				/* util_sleep */
+
+/* prefetch is defined here so that debug builds still work on
+ * architectures that do not provide their own implementation */
+#ifdef DEBUG
+#ifndef ARCH_HAS_PREFETCH
+inline void prefetch (const void *x)
+{
+}
+#endif
+#endif
+
+
+static void util_timeout_func(unsigned long data)
+{
+	util_timeout *to = (util_timeout *)data; 
+
+	to->timed_out = 1;
+	wake_up(&to->wait);
+}
+
+void util_init_timeout(util_timeout *to)
+{
+	init_timer(&to->timer);
+	to->timer.data = (unsigned long)to;
+	to->timer.function = util_timeout_func;
+	to->timed_out = 0;
+	init_waitqueue_head(&to->wait);
+}
+
+void util_set_timeout(util_timeout *to, __u32 timeout)
+{
+	__u32 how_long;
+
+	if (!timeout) {
+		to->timed_out = 1;
+		return ;
+	}
+
+	how_long = (timeout * HZ / 1000);
+	if (how_long < 1)
+		how_long = 1;
+
+	to->timer.expires = jiffies + how_long;
+	add_timer(&to->timer);
+}
+
+void util_clear_timeout(util_timeout *to)
+{
+	del_timer_sync(&to->timer);
+}
+
+int __util_wait_atomic_eq(wait_queue_head_t *wq, atomic_t *var, int val, int ms)
+{
+	int ret;
+	util_timeout timeout;
+	DECLARE_WAITQUEUE(wait, current);
+	DECLARE_WAITQUEUE(to_wait, current);
+
+	util_init_timeout(&timeout);
+
+	if (ms) {
+		util_set_timeout(&timeout, ms);
+		if (timeout.timed_out) {
+			util_clear_timeout(&timeout);
+		}
+	}
+	add_wait_queue(wq, &wait);
+	add_wait_queue(&timeout.wait, &to_wait);
+	do { 
+		ret = 0;
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (atomic_read(var) == val)
+			break;
+		ret = -ETIMEDOUT;
+		if (timeout.timed_out)
+			break;
+		schedule();
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+	} while (1);
+	
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(wq, &wait);
+	remove_wait_queue(&timeout.wait, &to_wait);
+
+	if (ms)
+		util_clear_timeout(&timeout);
+
+	return ret;
+}
+
+/* resizable (using chained pages) array stuff */
+void util_init_rarray(util_rarray *arr, u16 elem_size)
+{
+	arr->elements = 0;
+	arr->max_elem = 0;
+	arr->elem_size = elem_size;
+	arr->page = NULL;
+}	
+
+
+void * util_rarray_idx_to_slot(util_rarray *arr, int idx)
+{
+	int pgnum, pgoff;
+	util_rarray_page *pg;
+	
+	if (idx >= arr->max_elem) {
+		printk("eek! asked for %d, but only %d elements\n", 
+		       idx, arr->max_elem);
+		return NULL;
+	}
+	
+	pgnum = idx / UTIL_RARRAY_ELEM_PER_BUF(arr);
+	pgoff = idx % UTIL_RARRAY_ELEM_PER_BUF(arr);
+	pg = (util_rarray_page *)arr->page;
+	while (pgnum--) {
+		if (!pg->next) {
+			printk("eeek! no next page!\n");
+			return NULL;
+		}
+		pg = pg->next;
+	}
+	return (((char *)pg->buf) + (pgoff * arr->elem_size));
+}
+
+
+void * util_get_new_rarray_slot(util_rarray *arr, int *index)
+{
+	char *tmp;
+	util_rarray_page *newpg, *pg;
+	
+	if (arr->max_elem == arr->elements) {
+		newpg = (util_rarray_page *) __get_free_page(GFP_KERNEL);
+		if (!newpg) {
+			printk("could not grow array!!!\n");
+			return NULL;
+		}
+		memset(newpg, 0, PAGE_SIZE);
+		if (arr->page) {
+			pg = (util_rarray_page *)arr->page;
+			while (pg->next)
+				pg = pg->next;
+			pg->next = newpg;
+		} else
+			arr->page = newpg;
+		arr->max_elem += UTIL_RARRAY_ELEM_PER_BUF(arr);
+	}
+
+	tmp = util_rarray_idx_to_slot(arr, arr->elements);
+	if (tmp) {
+		if (index)
+			*index = arr->elements;
+		arr->elements++;
+	}
+	return tmp;
+}
+
+
+int util_add_to_rarray(util_rarray *arr, void *new)
+{
+	void *slot;
+	int idx;
+
+	slot = util_get_new_rarray_slot(arr, &idx);
+	if (slot == NULL) 
+		return -EINVAL;
+	memcpy(slot, new, arr->elem_size);
+	return idx;
+}
+
+/* resizes rarray to at least newelem elements */
+int util_resize_rarray(util_rarray *arr, int newelem)
+{
+	util_rarray_page *newpg, *pg;
+
+	printk("util_resize_rarray: newsize=%d, maxelem=%d\n", newelem, arr->max_elem);
+	while (arr->max_elem < newelem) {
+		newpg = (util_rarray_page *) __get_free_page(GFP_KERNEL);
+		if (!newpg) {
+			printk("could not grow array!!!\n");
+			return -ENOMEM;
+		}
+		memset(newpg, 0, PAGE_SIZE);
+		if (arr->page) {
+			pg = (util_rarray_page *)arr->page;
+			while (pg->next)
+				pg = pg->next;
+			pg->next = newpg;
+		} else
+			arr->page = newpg;
+		arr->max_elem += UTIL_RARRAY_ELEM_PER_BUF(arr);
+	}
+	printk("leaving util_resize_rarray: newsize=%d, maxelem=%d\n", newelem, arr->max_elem);
+
+	return 0;
+}
+
+
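
(A quick sketch of how the two main facilities above fit together: the
wait-until-atomic-equals helper and the page-chained rarray.  The demo_*
names are invented for illustration; util_wait_atomic_eq() is the inline
wrapper declared in util.h below.)

static wait_queue_head_t demo_wq;
static atomic_t demo_flag;

static void demo_usage(void)
{
	util_rarray arr;
	int idx, val = 42;

	/* waiter side: blocks until some other thread does
	 * atomic_set(&demo_flag, 1); wake_up(&demo_wq); */
	init_waitqueue_head(&demo_wq);
	atomic_set(&demo_flag, 0);
	if (util_wait_atomic_eq(&demo_wq, &demo_flag, 1, 500) == -ETIMEDOUT)
		printk("demo: no wakeup within 500ms\n");

	/* rarray side: grows one page at a time and chains pages, so
	 * existing slots never move and an index stays valid */
	util_init_rarray(&arr, sizeof(int));
	idx = util_add_to_rarray(&arr, &val);
	if (idx >= 0)
		printk("demo: slot %d holds %d\n", idx,
		       *(int *)util_rarray_idx_to_slot(&arr, idx));
}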

Added: branches/dlm-glue/cluster/util.h
===================================================================
--- branches/dlm-glue/cluster/util.h	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/util.h	2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,109 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * util.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_UTIL_H
+#define CLUSTER_UTIL_H
+
+#ifdef __KERNEL__
+#define SHUTDOWN_SIGS   (sigmask(SIGKILL) | sigmask(SIGHUP) | \
+			 sigmask(SIGINT) | sigmask(SIGQUIT))
+
+/* timeout structure taken from Ben's aio.c */
+typedef struct _util_timeout {
+	struct timer_list	timer;
+	int			timed_out;
+	wait_queue_head_t	wait;
+} util_timeout;
+
+void util_clear_timeout(util_timeout *to);
+void util_daemonize(char *name, int len, int shutdown_sigs);
+void util_init_timeout(util_timeout *to);
+void util_set_timeout(util_timeout *to, __u32 timeout);
+void util_show_stack(unsigned long *esp);
+void util_show_trace(unsigned long *stack);
+int util_sleep(__u32 ms);
+int __util_wait_atomic_eq(wait_queue_head_t *wq, atomic_t *var, int val, int ms);
+void util_block_sigs(sigset_t *oldsigs, unsigned long mask);
+void util_unblock_sigs(sigset_t newsig);
+
+/* exits when var == val, or on timeout */
+static inline int util_wait_atomic_eq(wait_queue_head_t *wq, atomic_t *var, int val, int timeout)
+{
+	int ret = 0;
+	if (atomic_read(var) != val)
+		ret = __util_wait_atomic_eq(wq, var, val, timeout);
+	return ret;
+}
+
+#endif  /* __KERNEL__ */
+
+/* resizable array */
+typedef struct _util_rarray
+{
+	void *page;
+	u16 elements;
+	u16 max_elem;
+	u16 elem_size;
+	u16 reserved1;
+} util_rarray;
+
+#define UTIL_RARRAY_PAGE_BUF_SIZE    (PAGE_SIZE - offsetof(util_rarray_page, buf))
+#define UTIL_RARRAY_ELEM_PER_BUF(r)  ((UTIL_RARRAY_PAGE_BUF_SIZE) / (r)->elem_size)
+typedef struct _util_rarray_page
+{
+	void *next;
+	char buf[0];
+} util_rarray_page;
+
+void util_init_rarray(util_rarray *arr, u16 elem_size);
+void * util_get_new_rarray_slot(util_rarray *arr, int *index);
+int util_add_to_rarray(util_rarray *arr, void *new);
+void * util_rarray_idx_to_slot(util_rarray *arr, int idx);
+int util_resize_rarray(util_rarray *arr, int newelem);
+
+#ifdef __KERNEL__
+typedef struct _util_thread_info
+{
+	wait_queue_head_t thread_wq;
+	atomic_t woken;
+	struct task_struct *task;
+	struct completion complete;
+	int pid;
+} util_thread_info;
+
+
+static inline void util_thread_info_init(util_thread_info *info)
+{
+	init_waitqueue_head(&info->thread_wq);
+	atomic_set(&info->woken, 0);
+	info->task = NULL;
+	info->pid = -1;
+	init_completion(&info->complete);
+}
+#endif /* __KERNEL__ */
+
+#endif /* CLUSTER_UTIL_H */
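
(util_thread_info and util_daemonize() above are the building blocks for
the kernel daemons in this branch.  A sketch of the intended lifecycle,
assuming the usual kernel_thread() startup; the demo_* names are invented.)

static util_thread_info demo_info;
static atomic_t demo_stop;

static int demo_thread(void *unused)
{
	util_daemonize("demo_thread", strlen("demo_thread"), 1);
	demo_info.task = current;

	while (!atomic_read(&demo_stop))
		util_sleep(100);	/* do some work, poll every 100ms */

	complete(&demo_info.complete);	/* tell the stopper we are gone */
	return 0;
}

static void demo_start_and_stop(void)
{
	util_thread_info_init(&demo_info);
	atomic_set(&demo_stop, 0);
	demo_info.pid = kernel_thread(demo_thread, NULL,
				      CLONE_FS | CLONE_FILES);
	/* ... later: ask the thread to exit and wait until it has */
	atomic_set(&demo_stop, 1);
	wait_for_completion(&demo_info.complete);
}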

Added: branches/dlm-glue/cluster/warning_hack.h
===================================================================
--- branches/dlm-glue/cluster/warning_hack.h	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/warning_hack.h	2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,40 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * warning_hack.h
+ *
+ * just to get rid of stupid warnings
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef WARNING_HACK_H
+#define WARNING_HACK_H
+
+struct mem_dqinfo;
+struct request;
+
+extern __inline__ int generic_fls(int x);
+extern __inline__ int get_bitmask_order(unsigned int count);
+extern inline void mark_info_dirty(struct mem_dqinfo *info);
+extern inline int rq_data_dir(struct request *rq);
+	
+
+#endif /* WARNING_HACK_H */

Modified: branches/dlm-glue/configure.in
===================================================================
--- branches/dlm-glue/configure.in	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/configure.in	2004-12-04 02:54:01 UTC (rev 1692)
@@ -82,19 +82,6 @@
   AC_MSG_ERROR(GCC is required)
 fi
 
-AC_MSG_CHECKING(for cluster support headers)
-AC_ARG_WITH(cluster-support, [  --with-cluster-support=dir Path to the cluster support headers [[none]]], clusterinc="$withval", clusterinc="not found")
-AC_MSG_RESULT($clusterinc)
-
-CLUSTERINC=
-if test -f "$clusterinc/dlmcommon.h"; then
-  CLUSTERINC=$clusterinc
-else
-  AC_MSG_ERROR([Cluster support headers not found, please use --with-cluster-support=/path/to/headers])
-fi
-
-AC_SUBST(CLUSTERINC)
-
 AC_MSG_CHECKING(for debugging)
 AC_ARG_ENABLE(debug, [  --enable-debug=[yes/no]         Turn on debugging [default=yes]],,enable_debug=yes)
 OCFS_DEBUG=

Modified: branches/dlm-glue/src/Makefile
===================================================================
--- branches/dlm-glue/src/Makefile	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/src/Makefile	2004-12-04 02:54:01 UTC (rev 1692)
@@ -188,7 +188,7 @@
 BASE_DEFINES = -DMODULE -DLINUX -D__KERNEL__ 
 DEFINES += $(BASE_DEFINES) $(GLOBAL_DEFINES)
 
-INCLUDES = -I. -I$(KERNELINC) -I$(GCCINC) -I$(CLUSTERINC)
+INCLUDES = -I. -I$(TOPDIR) -I$(KERNELINC) -I$(GCCINC)
 
 CFLAGS = $(OPTS) $(MACH_CFLAGS) -pipe -nostdinc -fno-strict-aliasing \
 	-fno-common -fomit-frame-pointer $(MODVERSIONS) $(WARNINGS)
@@ -237,8 +237,8 @@
 INSTALL_RULES = install-ocfs
 
 install-ocfs: $(INSTALL_MODULE)
-	$(TOPDIR)/mkinstalldirs $(DESTDIR)$(MODULEDIR)
-	$(INSTALL_DATA) $< $(DESTDIR)$(MODULEDIR)/$<
+	$(TOPDIR)/mkinstalldirs $(DESTDIR)$(MODULEDIR)/ocfs2
+	$(INSTALL_DATA) $< $(DESTDIR)$(MODULEDIR)/ocfs2/$<
 
 include $(TOPDIR)/Postamble.make
 

Modified: branches/dlm-glue/src/dlmglue.c
===================================================================
--- branches/dlm-glue/src/dlmglue.c	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/src/dlmglue.c	2004-12-04 02:54:01 UTC (rev 1692)
@@ -30,12 +30,12 @@
 #include <linux/highmem.h>
 #include <linux/smp_lock.h>
 
-#include <dlmutil.h>
-#include <dlmcommon.h>
-#include <dlmhb.h>
-#include <dlmnm.h>
-#include <dlmtcp.h>
-#include <dlmmod.h>
+#include <cluster/util.h>
+#include <cluster/dlmcommon.h>
+#include <cluster/heartbeat.h>
+#include <cluster/nodemanager.h>
+#include <cluster/tcp.h>
+#include <cluster/dlmmod.h>
 
 #include "ocfs_log.h"
 #include "ocfs.h"

Modified: branches/dlm-glue/src/heartbeat.c
===================================================================
--- branches/dlm-glue/src/heartbeat.c	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/src/heartbeat.c	2004-12-04 02:54:01 UTC (rev 1692)
@@ -31,9 +31,9 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 
-#include <dlmutil.h>
-#include <dlmcommon.h>
-#include <dlmhb.h>
+#include <cluster/util.h>
+#include <cluster/dlmcommon.h>
+#include <cluster/heartbeat.h>
 
 #include "ocfs_log.h"
 #include "ocfs.h"

Modified: branches/dlm-glue/src/ocfs.h
===================================================================
--- branches/dlm-glue/src/ocfs.h	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/src/ocfs.h	2004-12-04 02:54:01 UTC (rev 1692)
@@ -42,11 +42,11 @@
 # include <linux/tqueue.h>
 #endif
 
-#include <dlmutil.h>
-#include <dlmcommon.h>
-#include <dlmnm.h>
-#include <dlmtcp.h>
-#include <dlmmod.h>
+#include <cluster/util.h>
+#include <cluster/dlmcommon.h>
+#include <cluster/nodemanager.h>
+#include <cluster/tcp.h>
+#include <cluster/dlmmod.h>
 
 /* convenience macro */
 

Modified: branches/dlm-glue/src/super.c
===================================================================
--- branches/dlm-glue/src/super.c	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/src/super.c	2004-12-04 02:54:01 UTC (rev 1692)
@@ -42,9 +42,9 @@
 #include <linux/socket.h>
 #include <linux/inet.h>
 
-#include <dlmutil.h>
-#include <dlmcommon.h>
-#include <dlmnm.h>
+#include <cluster/util.h>
+#include <cluster/dlmcommon.h>
+#include <cluster/nodemanager.h>
 
 #include "ocfs_log.h"
 #include "ocfs.h"

Modified: branches/dlm-glue/src/vote.c
===================================================================
--- branches/dlm-glue/src/vote.c	2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/src/vote.c	2004-12-04 02:54:01 UTC (rev 1692)
@@ -30,12 +30,12 @@
 #include <linux/highmem.h>
 #include <linux/smp_lock.h>
 
-#include <dlmutil.h>
-#include <dlmcommon.h>
-#include <dlmhb.h>
-#include <dlmnm.h>
-#include <dlmtcp.h>
-#include <dlmmod.h>
+#include <cluster/util.h>
+#include <cluster/dlmcommon.h>
+#include <cluster/heartbeat.h>
+#include <cluster/nodemanager.h>
+#include <cluster/tcp.h>
+#include <cluster/dlmmod.h>
 
 #include "ocfs_log.h"
 #include "ocfs.h"



More information about the Ocfs2-commits mailing list