[Ocfs2-commits] mfasheh commits r1693 - in trunk: . cluster src

svn-commits at oss.oracle.com svn-commits at oss.oracle.com
Mon Dec 6 15:45:34 CST 2004


Author: mfasheh
Date: 2004-12-06 15:45:32 -0600 (Mon, 06 Dec 2004)
New Revision: 1693

Added:
   trunk/cluster/
   trunk/cluster/Makefile
   trunk/cluster/compat_libfs.c
   trunk/cluster/compat_libfs.h
   trunk/cluster/dlm_compat.h
   trunk/cluster/dlmcommon.h
   trunk/cluster/dlmmaster.c
   trunk/cluster/dlmmod.c
   trunk/cluster/dlmmod.h
   trunk/cluster/dlmrecovery.c
   trunk/cluster/dlmthread.c
   trunk/cluster/heartbeat.c
   trunk/cluster/heartbeat.h
   trunk/cluster/nodemanager.c
   trunk/cluster/nodemanager.h
   trunk/cluster/tcp.c
   trunk/cluster/tcp.h
   trunk/cluster/test.c
   trunk/cluster/util.c
   trunk/cluster/util.h
   trunk/cluster/warning_hack.h
   trunk/src/dlmglue.c
   trunk/src/dlmglue.h
   trunk/src/slot_map.c
   trunk/src/slot_map.h
Removed:
   trunk/src/dlm.c
   trunk/src/dlm.h
   trunk/src/lockres.c
   trunk/src/lockres.h
   trunk/src/nm.c
   trunk/src/nm.h
   trunk/src/ocfs2_disk_dlm.h
   trunk/src/volcfg.c
   trunk/src/volcfg.h
Modified:
   trunk/Makefile
   trunk/src/Makefile
   trunk/src/alloc.c
   trunk/src/aops.c
   trunk/src/dcache.c
   trunk/src/dir.c
   trunk/src/file.c
   trunk/src/heartbeat.c
   trunk/src/heartbeat.h
   trunk/src/inode.c
   trunk/src/inode.h
   trunk/src/journal.c
   trunk/src/localalloc.c
   trunk/src/namei.c
   trunk/src/ocfs.h
   trunk/src/ocfs1_fs_compat.h
   trunk/src/ocfs2.h
   trunk/src/ocfs2_fs.h
   trunk/src/ocfs_journal.h
   trunk/src/ocfs_log.h
   trunk/src/proc.c
   trunk/src/suballoc.c
   trunk/src/super.c
   trunk/src/sysfile.c
   trunk/src/sysfile.h
   trunk/src/util.c
   trunk/src/util.h
   trunk/src/vote.c
   trunk/src/vote.h
Log:
* merge the dlm-glue branch back to trunk.



Modified: trunk/Makefile
===================================================================
--- trunk/Makefile	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/Makefile	2004-12-06 21:45:32 UTC (rev 1693)
@@ -2,7 +2,7 @@
 
 include $(TOPDIR)/Preamble.make
 
-SUBDIRS = src docs patches vendor
+SUBDIRS = cluster src docs patches vendor
 
 DIST_FILES = \
 	COPYING		\

Added: trunk/cluster/Makefile
===================================================================
--- trunk/cluster/Makefile	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/Makefile	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,226 @@
+# See if we are being included by the 2.6 kernel build system.
+ifeq ($(KERNELRELEASE),)
+# Normal build that is being called locally
+TOPDIR = ..
+
+include $(TOPDIR)/Preamble.make
+
+else # ifeq ($(KERNELRELEASE),)
+# We are being included by the 2.6.x kernel build system
+
+# Global parameter so we know where our stuff is
+CLUSTER_SRC_DIR	:= $(M)
+
+include $(CLUSTER_SRC_DIR)/../Config.make
+endif
+
+#-*******************************************************
+# Now do stuff which is global for 2.4.x and 2.6.x builds
+
+#ifdef OCFS_DEBUG
+OPTS += -g
+#endif
+
+#ifdef OCFS_DEBUG
+GLOBAL_DEFINES += -DDEBUG
+#endif
+
+ifdef OCFS_TRACE
+GLOBAL_DEFINES += -DTRACE
+endif
+
+ifdef HAVE_NPTL
+GLOBAL_DEFINES += -DHAVE_NPTL
+endif
+
+CFILES = \
+	compat_libfs.c	\
+	dlmmaster.c	\
+	dlmmod.c	\
+	dlmrecovery.c	\
+	dlmthread.c	\
+	heartbeat.c	\
+	nodemanager.c	\
+	tcp.c		\
+	util.c		\
+	test.c			
+
+HFILES = \
+	compat_libfs.h	\
+	dlm_compat.h	\
+	dlmcommon.h	\
+	dlmmod.h	\
+	heartbeat.h	\
+	nodemanager.h	\
+	tcp.h		\
+	util.h		\
+	warning_hack.h
+
+CLEAN_RULES = clean-cluster
+
+OBJS = $(subst .c,.o,$(CFILES))
+
+# End of stuff which is global for 2.4.x and 2.6.x kernels
+#-********************************************************
+
+# See if we are being included by the 2.6 kernel build system.
+ifeq ($(KERNELRELEASE),)
+# Normal build that is being called locally
+# Preliminary 2.6.x kernel support.  See if we are building for the 2.6.x
+# kernel
+ifndef KERNEL_26
+# Building for a 2.4.x kernel
+
+WARNINGS = -Wall -Wstrict-prototypes
+
+ifneq ($(OCFS_PROCESSOR),x86_64)
+WARNINGS += -Wmissing-prototypes -Wmissing-declarations
+endif
+
+ifeq ($(KVER),vmware)
+  KERNELINC = /usr/src/linux-2.4/include
+endif
+
+ifeq ($(KVER),suse)
+  GLOBAL_DEFINES += -DSUSE
+endif
+ifeq ($(KVER),hugemem)
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_HUGEMEM=1
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_ENTERPRISE=0
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_SMP=0
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_UP=0
+endif
+ifeq ($(KVER),smp)
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_HUGEMEM=0
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_ENTERPRISE=0 
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_SMP=1 
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_UP=0 
+endif
+ifeq ($(KVER),ent)
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_HUGEMEM=0
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_ENTERPRISE=1 
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_SMP=0 
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_UP=0 
+endif
+ifeq ($(KVER),up)
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_HUGEMEM=0
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_ENTERPRISE=0 
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_SMP=0 
+  GLOBAL_DEFINES += -D__BOOT_KERNEL_UP=1 
+endif
+
+ifeq ($(OCFS_PROCESSOR),ppc64)
+  MACH_CFLAGS += -m64 -fsigned-char -fno-builtin -msoft-float -mminimal-toc
+  LDADD += -m elf64ppc
+endif
+ifeq ($(OCFS_PROCESSOR),x86_64)
+  MACH_CFLAGS += -m64 -mcmodel=kernel
+endif
+
+BASE_DEFINES = -DMODULE -DLINUX -D__KERNEL__ 
+DEFINES += $(BASE_DEFINES) $(GLOBAL_DEFINES)
+
+INCLUDES = -I. -I$(KERNELINC) -I$(GCCINC)
+
+CFLAGS = $(OPTS) $(MACH_CFLAGS) -pipe -nostdinc -fno-strict-aliasing \
+	-fno-common -fomit-frame-pointer $(MODVERSIONS) $(WARNINGS)
+LDADD = -nostdlib
+
+OPTIMIZE = -O2
+
+CFLAGS += $(OPTIMIZE)
+
+MODULES = ocfs2_dlm.o ocfs2_heartbeat.o ocfs2_nodemanager.o ocfs2_tcp.o
+TEST_MODULES = ocfs2_cluster_test.o
+
+INSTALL_MODULES = $(MODULES)
+
+# Make dependencies work
+$(CFILES): $(HFILES)
+$(OBJS): $(HFILES)
+
+build-cluster: $(MODULES)
+
+ocfs2_cluster_test.o: test.o util.o compat_libfs.o
+	$(CC) $(OPTS) -Wl,-r -o $@ $^ $(LDADD)
+
+ocfs2_dlm.o: dlmmod.o dlmthread.o dlmrecovery.o util.o compat_libfs.o dlmmaster.o
+	$(CC) $(OPTS) -Wl,-r -o $@ $^ $(LDADD)
+
+ocfs2_nodemanager.o: nodemanager.o util.o compat_libfs.o
+	$(CC) $(OPTS) -Wl,-r -o $@ $^ $(LDADD)
+
+ocfs2_heartbeat.o: heartbeat.o util.o compat_libfs.o
+	$(CC) $(OPTS) -Wl,-r -o $@ $^ $(LDADD)
+
+ocfs2_tcp.o: tcp.o util.o compat_libfs.o
+	$(CC) $(OPTS) -Wl,-r -o $@ $^ $(LDADD)
+
+clean-cluster:
+	rm -f *.o *.p *.s
+
+else # ifndef KERNEL_26
+# The 2.6.x kernel makefile
+
+# This Makefile has two ways through it.  They are:
+#   1.	We are being included by the local Makefile to do a 2.6 kernel build.
+#	In this method we will call the kernel make system to build our module.
+#	This will cause the kernel make system to call back into our makefile
+#	(2nd way).
+
+INSTALL_MODULE = ocfs2.ko
+
+#ALL_RULES = stamp-md5 build-ocfs
+ALL_RULES = build-cluster
+
+build-ocfs:
+	$(MAKE) -C $(KERNELDIR) M=$(CURDIR) modules
+
+clean-ocfs:
+	$(MAKE) -C $(KERNELDIR) M=$(CURDIR) clean
+
+endif # ifndef KERNEL_26
+
+INSTALL_RULES = install-cluster
+
+install-cluster: $(INSTALL_MODULES)
+	$(TOPDIR)/mkinstalldirs $(DESTDIR)$(MODULEDIR)/ocfs2
+	@for file in $(INSTALL_MODULES); do \
+	  $(INSTALL_DATA) $$file $(DESTDIR)$(MODULEDIR)/ocfs2/$$file \
+        done
+
+include $(TOPDIR)/Postamble.make
+
+else # ifeq ($(KERNELRELEASE),)
+# We are being included by the 2.6 kernel build system.  So we will include the
+# 2.6.x Makefile and skip everything else.
+# The 2.6.x kernel makefile
+
+# This Makefile has two ways through it.  They are:
+#   1.	We are being included by the local Makefile to do a 2.6 kernel build.
+#	In this method we will call the kernel make system to build our module.
+#	This will cause the kernel make system to call back into our makefile
+#	(2nd way).
+#
+#   2.	We are being included by the kernel make system.  So in this method we
+#	just setup the variables that the make system wants and then the kernel
+#	make system will take care of the build.
+
+# 2nd method.  The kernel make system is including us.  We need to setup the
+# various parameters for the kernel make system and then it will take care of
+# building us.
+
+STAMP_DIR = $(OCFS_SRC_DIR)
+include $(OCFS_SRC_DIR)/../Versioning.make
+
+EXTRA_CFLAGS += $(GLOBAL_DEFINES)
+
+CFLAGS_$(VERSION_OBJ) += $(VERDEFS)
+
+# Kernel Module file to produce
+obj-m += ocfs2.o
+
+# list of object files that are used to create our module
+ocfs2-objs := $(OBJS)
+
+endif # ifneq ($(KERNELRELEASE),)

Added: trunk/cluster/compat_libfs.c
===================================================================
--- trunk/cluster/compat_libfs.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/compat_libfs.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,705 @@
+/* -----------------------------------------------------------------*/
+
+
+/*
+ *	compat_libfs.c
+ *	Library for filesystems writers.
+ *	PLUS... transaction file stuff stolen from nfsd
+ */
+
+#include "warning_hack.h"
+
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <asm/uaccess.h>
+#include <linux/slab.h>
+
+#include "compat_libfs.h"
+
+#define kstatfs statfs
+#define __user
+
+
+int simple_statfs(struct super_block *sb, struct statfs *buf);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd);
+#else
+struct dentry *simple_lookup(struct inode *dir,struct dentry *dentry);
+#endif
+
+int simple_sync_file(struct file *file, struct dentry *dentry, int datasync);
+int dcache_dir_open(struct inode *inode, struct file *file);
+int dcache_dir_close(struct inode *inode, struct file *file);
+loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin);
+ssize_t generic_read_dir(struct file *filp, char *buf, size_t siz, loff_t *ppos);
+int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry);
+int simple_empty(struct dentry *dentry);
+int simple_unlink(struct inode *dir, struct dentry *dentry);
+int simple_rmdir(struct inode *dir, struct dentry *dentry);
+int simple_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry);
+int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files);
+
+
+
+#if 0
+int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		   struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	generic_fillattr(inode, stat);
+	stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9);
+	return 0;
+}
+#endif
+
+int simple_statfs(struct super_block *sb, struct kstatfs *buf)
+{
+	buf->f_type = sb->s_magic;
+	buf->f_bsize = PAGE_CACHE_SIZE;
+	buf->f_namelen = NAME_MAX;
+	return 0;
+}
+
+/*
+ * Lookup the data. This is trivial - if the dentry didn't already
+ * exist, we know it is negative.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+	d_add(dentry, NULL);
+	return NULL;
+}
+#else
+struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry)
+{
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+	d_add(dentry, NULL);
+	return NULL;
+}
+#endif
+
+
+struct dentry * simple_find_child(struct dentry *dentry, struct qstr *name)
+{
+	struct list_head *iter;
+	struct dentry *child = NULL;
+
+	spin_lock(&dcache_lock);
+	list_for_each(iter, &dentry->d_subdirs) {
+		child = list_entry(iter, struct dentry, d_child);
+		if (child->d_name.len == name->len &&
+		    memcmp(child->d_name.name, name->name, name->len)==0)
+			break;
+		child = NULL;
+	}
+	if (child)
+		dget_locked(child);
+	spin_unlock(&dcache_lock);
+	return child;
+}
+
+
+
+int simple_sync_file(struct file * file, struct dentry *dentry, int datasync)
+{
+	return 0;
+}
+ 
+int dcache_dir_open(struct inode *inode, struct file *file)
+{
+	static struct qstr cursor_name = {.len = 1, .name = "."};
+
+	file->private_data = d_alloc(file->f_dentry, &cursor_name);
+
+	return file->private_data ? 0 : -ENOMEM;
+}
+
+int dcache_dir_close(struct inode *inode, struct file *file)
+{
+	dput(file->private_data);
+	return 0;
+}
+
+loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
+{
+	down(&file->f_dentry->d_inode->i_sem);
+	switch (origin) {
+		case 1:
+			offset += file->f_pos;
+		case 0:
+			if (offset >= 0)
+				break;
+		default:
+			up(&file->f_dentry->d_inode->i_sem);
+			return -EINVAL;
+	}
+	if (offset != file->f_pos) {
+		file->f_pos = offset;
+		if (file->f_pos >= 2) {
+			struct list_head *p;
+			struct dentry *cursor = file->private_data;
+			loff_t n = file->f_pos - 2;
+
+			spin_lock(&dcache_lock);
+			list_del(&cursor->d_child);
+			p = file->f_dentry->d_subdirs.next;
+			while (n && p != &file->f_dentry->d_subdirs) {
+				struct dentry *next;
+				next = list_entry(p, struct dentry, d_child);
+				if (!d_unhashed(next) && next->d_inode)
+					n--;
+				p = p->next;
+			}
+			list_add_tail(&cursor->d_child, p);
+			spin_unlock(&dcache_lock);
+		}
+	}
+	up(&file->f_dentry->d_inode->i_sem);
+	return offset;
+}
+
+/* Relationship between i_mode and the DT_xxx types */
+static inline unsigned char dt_type(struct inode *inode)
+{
+	return (inode->i_mode >> 12) & 15;
+}
+
+/*
+ * Directory is locked and all positive dentries in it are safe, since
+ * for ramfs-type trees they can't go away without unlink() or rmdir(),
+ * both impossible due to the lock on directory.
+ */
+
+int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_dentry;
+	struct dentry *cursor = filp->private_data;
+	struct list_head *p, *q = &cursor->d_child;
+	ino_t ino;
+	int i = filp->f_pos;
+
+	switch (i) {
+		case 0:
+			ino = dentry->d_inode->i_ino;
+			if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
+				break;
+			filp->f_pos++;
+			i++;
+			/* fallthrough */
+		case 1:
+			ino = dentry->d_parent->d_inode->i_ino;
+			if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
+				break;
+			filp->f_pos++;
+			i++;
+			/* fallthrough */
+		default:
+			spin_lock(&dcache_lock);
+			if (filp->f_pos == 2) {
+				list_del(q);
+				list_add(q, &dentry->d_subdirs);
+			}
+			for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
+				struct dentry *next;
+				next = list_entry(p, struct dentry, d_child);
+				if (d_unhashed(next) || !next->d_inode)
+					continue;
+
+				spin_unlock(&dcache_lock);
+				if (filldir(dirent, next->d_name.name, next->d_name.len, filp->f_pos, next->d_inode->i_ino, dt_type(next->d_inode)) < 0)
+					return 0;
+				spin_lock(&dcache_lock);
+				/* next is still alive */
+				list_del(q);
+				list_add(q, p);
+				p = q;
+				filp->f_pos++;
+			}
+			spin_unlock(&dcache_lock);
+	}
+	return 0;
+}
+
+ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos)
+{
+	return -EISDIR;
+}
+
+struct file_operations simple_dir_operations = {
+	.open		= dcache_dir_open,
+	.release	= dcache_dir_close,
+	.llseek		= dcache_dir_lseek,
+	.read		= generic_read_dir,
+	.readdir	= dcache_readdir,
+};
+
+struct inode_operations simple_dir_inode_operations = {
+	.lookup		= simple_lookup,
+};
+
+#if 0
+/*
+ * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
+ * will never be mountable)
+ */
+struct super_block *
+get_sb_pseudo(struct file_system_type *fs_type, char *name,
+	struct super_operations *ops, unsigned long magic)
+{
+	struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
+	static struct super_operations default_ops = {.statfs = simple_statfs};
+	struct dentry *dentry;
+	struct inode *root;
+	struct qstr d_name = {.name = name, .len = strlen(name)};
+
+	if (IS_ERR(s))
+		return s;
+
+	s->s_flags = MS_NOUSER;
+	s->s_maxbytes = ~0ULL;
+	s->s_blocksize = 1024;
+	s->s_blocksize_bits = 10;
+	s->s_magic = magic;
+	s->s_op = ops ? ops : &default_ops;
+	root = new_inode(s);
+	if (!root)
+		goto Enomem;
+	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
+	root->i_uid = root->i_gid = 0;
+	root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
+	dentry = d_alloc(NULL, &d_name);
+	if (!dentry) {
+		iput(root);
+		goto Enomem;
+	}
+	dentry->d_sb = s;
+	dentry->d_parent = dentry;
+	d_instantiate(dentry, root);
+	s->s_root = dentry;
+	s->s_flags |= MS_ACTIVE;
+	return s;
+
+Enomem:
+	up_write(&s->s_umount);
+	deactivate_super(s);
+	return ERR_PTR(-ENOMEM);
+}
+#endif
+
+int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = old_dentry->d_inode;
+
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	inode->i_nlink++;
+	atomic_inc(&inode->i_count);
+	dget(dentry);
+	d_instantiate(dentry, inode);
+	return 0;
+}
+
+static inline int simple_positive(struct dentry *dentry)
+{
+	return dentry->d_inode && !d_unhashed(dentry);
+}
+
+int simple_empty(struct dentry *dentry)
+{
+	struct dentry *child;
+	int ret = 0;
+
+	spin_lock(&dcache_lock);
+	list_for_each_entry(child, &dentry->d_subdirs, d_child)
+		if (simple_positive(child))
+			goto out;
+	ret = 1;
+out:
+	spin_unlock(&dcache_lock);
+	return ret;
+}
+
+int simple_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	inode->i_nlink--;
+	dput(dentry);
+	return 0;
+}
+
+int simple_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	if (!simple_empty(dentry))
+		return -ENOTEMPTY;
+
+	dentry->d_inode->i_nlink--;
+	simple_unlink(dir, dentry);
+	dir->i_nlink--;
+	return 0;
+}
+
+int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
+		struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode *inode = old_dentry->d_inode;
+	int they_are_dirs = S_ISDIR(old_dentry->d_inode->i_mode);
+
+	if (!simple_empty(new_dentry))
+		return -ENOTEMPTY;
+
+	if (new_dentry->d_inode) {
+		simple_unlink(new_dir, new_dentry);
+		if (they_are_dirs)
+			old_dir->i_nlink--;
+	} else if (they_are_dirs) {
+		old_dir->i_nlink--;
+		new_dir->i_nlink++;
+	}
+
+	old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime =
+		new_dir->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	return 0;
+}
+
+#if 0
+int simple_readpage(struct file *file, struct page *page)
+{
+	void *kaddr;
+
+	if (PageUptodate(page))
+		goto out;
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	memset(kaddr, 0, PAGE_CACHE_SIZE);
+	kunmap_atomic(kaddr, KM_USER0);
+	flush_dcache_page(page);
+	SetPageUptodate(page);
+out:
+	unlock_page(page);
+	return 0;
+}
+
+int simple_prepare_write(struct file *file, struct page *page,
+			unsigned from, unsigned to)
+{
+	if (!PageUptodate(page)) {
+		if (to - from != PAGE_CACHE_SIZE) {
+			void *kaddr = kmap_atomic(page, KM_USER0);
+			memset(kaddr, 0, from);
+			memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
+			flush_dcache_page(page);
+			kunmap_atomic(kaddr, KM_USER0);
+		}
+		SetPageUptodate(page);
+	}
+	return 0;
+}
+
+int simple_commit_write(struct file *file, struct page *page,
+			unsigned offset, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold the i_sem.
+	 */
+	if (pos > inode->i_size)
+		i_size_write(inode, pos);
+	set_page_dirty(page);
+	return 0;
+}
+#endif
+
+void d_genocide(struct dentry *root);
+
+void d_genocide(struct dentry *root)
+{
+	struct dentry *this_parent = root;
+	struct list_head *next;
+	spin_lock(&dcache_lock);
+repeat:
+	next = this_parent->d_subdirs.next;
+resume:
+	while (next != &this_parent->d_subdirs) {
+		struct list_head *tmp = next;
+		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
+		next = tmp->next;
+		if (d_unhashed(dentry)||!dentry->d_inode)
+			continue;
+		if (!list_empty(&dentry->d_subdirs)) {
+			this_parent = dentry;
+			goto repeat;
+		}
+		atomic_dec(&dentry->d_count);
+	}
+	if (this_parent != root) {
+		next = this_parent->d_child.next;
+		atomic_dec(&this_parent->d_count);
+		this_parent = this_parent->d_parent;
+		goto resume;
+	}
+	spin_unlock(&dcache_lock);
+}
+
+static void simple_read_inode(struct inode * inode)
+{
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+}
+
+
+int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files)
+{
+	static struct super_operations s_ops = {
+		.statfs = simple_statfs,
+		.read_inode = simple_read_inode
+	};
+	struct inode *inode;
+	struct dentry *root;
+	struct dentry *dentry;
+	int i;
+
+	s->s_blocksize = PAGE_CACHE_SIZE;
+	s->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	s->s_magic = magic;
+	s->s_op = &s_ops;
+
+	inode = new_inode(s);
+	if (!inode)
+		return -ENOMEM;
+	inode->i_mode = S_IFDIR | 0755;
+	inode->i_uid = inode->i_gid = 0;
+	inode->i_blksize = PAGE_CACHE_SIZE;
+	inode->i_blocks = 0;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_op = &simple_dir_inode_operations;
+	inode->i_fop = &simple_dir_operations;
+	root = d_alloc_root(inode);
+	if (!root) {
+		iput(inode);
+		return -ENOMEM;
+	}
+	for (i = 0; !files->name || files->name[0]; i++, files++) {
+		struct qstr name;
+		if (!files->name)
+			continue;
+		name.name = files->name;
+		name.len = strlen(name.name);
+		printk("adding file %*s\n", name.len, name.name);
+		name.hash = full_name_hash(name.name, name.len);
+		dentry = d_alloc(root, &name);
+		if (!dentry)
+			goto out;
+		inode = new_inode(s);
+		if (!inode)
+			goto out;
+		inode->i_mode = S_IFREG | files->mode;
+		inode->i_uid = inode->i_gid = 0;
+		inode->i_blksize = PAGE_CACHE_SIZE;
+		inode->i_blocks = 0;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		inode->i_fop = files->ops;
+		inode->i_ino = i;
+		d_add(dentry, inode);
+	}
+	s->s_root = root;
+	return 0;
+out:
+	d_genocide(root);
+	dput(root);
+	return -ENOMEM;
+}
+
+#if 0
+static spinlock_t pin_fs_lock = SPIN_LOCK_UNLOCKED;
+
+int simple_pin_fs(char *name, struct vfsmount **mount, int *count)
+{
+	struct vfsmount *mnt = NULL;
+	spin_lock(&pin_fs_lock);
+	if (unlikely(!*mount)) {
+		spin_unlock(&pin_fs_lock);
+		mnt = do_kern_mount(name, 0, name, NULL);
+		if (IS_ERR(mnt))
+			return PTR_ERR(mnt);
+		spin_lock(&pin_fs_lock);
+		if (!*mount)
+			*mount = mnt;
+	}
+	mntget(*mount);
+	++*count;
+	spin_unlock(&pin_fs_lock);
+	mntput(mnt);
+	return 0;
+}
+
+void simple_release_fs(struct vfsmount **mount, int *count)
+{
+	struct vfsmount *mnt;
+	spin_lock(&pin_fs_lock);
+	mnt = *mount;
+	if (!--*count)
+		*mount = NULL;
+	spin_unlock(&pin_fs_lock);
+	mntput(mnt);
+}
+
+ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
+				const void *from, size_t available)
+{
+	loff_t pos = *ppos;
+	if (pos < 0)
+		return -EINVAL;
+	if (pos >= available)
+		return 0;
+	if (count > available - pos)
+		count = available - pos;
+	if (copy_to_user(to, from + pos, count))
+		return -EFAULT;
+	*ppos = pos + count;
+	return count;
+}
+
+EXPORT_SYMBOL(dcache_dir_close);
+EXPORT_SYMBOL(dcache_dir_lseek);
+EXPORT_SYMBOL(dcache_dir_open);
+EXPORT_SYMBOL(dcache_readdir);
+EXPORT_SYMBOL(generic_read_dir);
+EXPORT_SYMBOL(simple_commit_write);
+EXPORT_SYMBOL(simple_empty);
+EXPORT_SYMBOL(simple_fill_super);
+EXPORT_SYMBOL(simple_getattr);
+EXPORT_SYMBOL(simple_link);
+EXPORT_SYMBOL(simple_lookup);
+EXPORT_SYMBOL(simple_pin_fs);
+EXPORT_SYMBOL(simple_prepare_write);
+EXPORT_SYMBOL(simple_readpage);
+EXPORT_SYMBOL(simple_release_fs);
+EXPORT_SYMBOL(simple_rename);
+EXPORT_SYMBOL(simple_rmdir);
+EXPORT_SYMBOL(simple_statfs);
+EXPORT_SYMBOL(simple_sync_file);
+EXPORT_SYMBOL(simple_unlink);
+EXPORT_SYMBOL(simple_read_from_buffer);
+EXPORT_SYMBOL(get_sb_pseudo);
+#endif
+
+/* -----------------------------------------------------------------*/
+
+
+
+/* transaction file support */
+
+/*
+ * transaction based IO methods.
+ * The file expects a single write which triggers the transaction, and then
+ * possibly a read which collects the result - which is stored in a 
+ * file-local buffer.
+ */
+static ssize_t TA_write(struct file *file, const char __user *buf, size_t size, loff_t *pos)
+{
+	ino_t ino =  file->f_dentry->d_inode->i_ino;
+	struct argresp *ar;
+	ssize_t rv = 0;
+	struct super_block *sb = file->f_dentry->d_inode->i_sb;
+	TA_write_ops *ops = TA_GENERIC_SB_MEMBER(sb);
+	TA_write_op *write_op;
+
+	printk("welcome to TA_write: num_ops=%d, op[%d]=%p, private=%p, size=%u\n", 
+	       ops->num_ops, (int)ino, ops->write_op[ino], file->private_data, size);
+	if (ino >= ops->num_ops || ops->write_op[ino] == NULL)
+		return -EINVAL;
+	write_op = ops->write_op[ino];
+	if (file->private_data) 
+		return -EINVAL; /* only one write allowed per open */
+	if (size > PAGE_SIZE - sizeof(struct argresp))
+		return -EFBIG;
+
+	ar = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!ar)
+		return -ENOMEM;
+	ar->size = 0;
+	down(&file->f_dentry->d_inode->i_sem);
+	if (file->private_data)
+		rv = -EINVAL;
+	else
+		file->private_data = ar;
+	up(&file->f_dentry->d_inode->i_sem);
+	if (rv) {
+		kfree(ar);
+		return rv;
+	}
+	if (copy_from_user(ar->data, buf, size))
+		return -EFAULT;
+
+	printk("now calling write_op...\n");	
+	rv = write_op(file, ar->data, size);
+	printk("write_op returned %d\n", rv);
+	if (rv>0) {
+		ar->size = rv;
+		rv = size;
+	}
+	return rv;
+}
+
+
+static ssize_t TA_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
+{
+	struct argresp *ar;
+	ssize_t rv = 0;
+	
+	if (file->private_data == NULL)
+		rv = TA_write(file, buf, 0, pos);
+	if (rv < 0)
+		return rv;
+
+	ar = file->private_data;
+	if (!ar)
+		return 0;
+	if (*pos >= ar->size)
+		return 0;
+	if (*pos + size > ar->size)
+		size = ar->size - *pos;
+	if (copy_to_user(buf, ar->data + *pos, size))
+		return -EFAULT;
+	*pos += size;
+	return size;
+}
+
+static int TA_open(struct inode *inode, struct file *file)
+{
+	file->private_data = NULL;
+	return 0;
+}
+
+static int TA_release(struct inode *inode, struct file *file)
+{
+	void *p = file->private_data;
+	file->private_data = NULL;
+	kfree(p);
+	return 0;
+}
+
+
+
+
+
+
+
+
+struct file_operations transaction_ops = {
+	.write		= TA_write,
+	.read		= TA_read,
+	.open		= TA_open,
+	.release	= TA_release,
+};

Added: trunk/cluster/compat_libfs.h
===================================================================
--- trunk/cluster/compat_libfs.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/compat_libfs.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,36 @@
+#ifndef CLUSTER_COMPAT_LIBFS_H
+#define CLUSTER_COMPAT_LIBFS_H
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#define TA_GENERIC_SB_MEMBER(sb)  ((sb)->s_fs_info)
+#else
+#define TA_GENERIC_SB_MEMBER(sb)  ((sb)->u.generic_sbp)
+#endif
+
+
+/* an argresp is stored in an allocated page and holds the 
+ * size of the argument or response, along with its content
+ */
+struct argresp {
+	ssize_t size;
+	char data[0];
+};
+
+typedef ssize_t (TA_write_op)(struct file *, char *, size_t);
+typedef struct _TA_write_ops
+{
+	int num_ops;
+	TA_write_op *write_op[0];
+} TA_write_ops;
+
+struct tree_descr 
+{ 
+	char *name; 
+	struct file_operations *ops; 
+	int mode; 
+};
+
+int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files);
+struct dentry * simple_find_child(struct dentry *dentry, struct qstr *name);
+
+#endif  /* CLUSTER_COMPAT_LIBFS_H */

Added: trunk/cluster/dlm_compat.h
===================================================================
--- trunk/cluster/dlm_compat.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/dlm_compat.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,119 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlm_compat.h
+ *
+ * Compatibility stuff for 2.4
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version
+ * 2 of the License.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_DLM_COMPAT_H
+#define CLUSTER_DLM_COMPAT_H
+
+#include <linux/version.h>
+#include <linux/types.h>
+#include <linux/kdev_t.h>
+#include <linux/sched.h>
+#include <linux/compiler.h>
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+# include <linux/locks.h>
+#else
+# include <linux/buffer_head.h>
+#endif
+
+
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+
+#ifdef __ia64__
+extern inline void prefetch(const void *x);
+extern inline void prefetchw(const void *x);
+#else
+static inline void prefetch(const void *x);
+static inline void prefetchw(const void *x);
+#endif
+extern inline int generic_fls(int x);
+extern inline int get_bitmask_order(unsigned int count);
+/* XXX Hack to avoid warning */
+struct mem_dqinfo;
+extern inline void mark_info_dirty(struct mem_dqinfo *info);
+
+
+
+
+#define flush_scheduled_work	flush_scheduled_tasks
+#define work_struct		tq_struct
+#define INIT_WORK(w, f, d)	INIT_TQUEUE(w, f, d)
+#define schedule_work(w)	schedule_task(w)
+
+#ifdef HAVE_NPTL
+static inline void dequeue_signal_lock(struct task_struct *task,
+				       sigset_t *blocked, siginfo_t *info)
+{
+	spin_lock_irq(&task->sighand->siglock);
+	dequeue_signal(blocked, info);
+	spin_unlock_irq(&task->sighand->siglock);
+}
+#else
+static inline void dequeue_signal_lock(struct task_struct *task,
+				       sigset_t *blocked, siginfo_t *info)
+{
+	spin_lock_irq(&task->sigmask_lock);
+	dequeue_signal(blocked, info);
+	spin_unlock_irq(&task->sigmask_lock);
+}
+#endif
+#define kstatfs statfs
+
+
+
+/*
+ * Copied right out of the 2.6.2 kernel's buffer_head.h:
+ * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
+ * and buffer_foo() functions.
+ */
+#define BUFFER_FNS(bit, name)						\
+static inline void set_buffer_##name(struct buffer_head *bh)		\
+{									\
+	set_bit(BH_##bit, &(bh)->b_state);				\
+}									\
+static inline void clear_buffer_##name(struct buffer_head *bh)		\
+{									\
+	clear_bit(BH_##bit, &(bh)->b_state);				\
+}									\
+static inline int buffer_##name(struct buffer_head *bh)			\
+{									\
+	return test_bit(BH_##bit, &(bh)->b_state);			\
+}
+
+#undef buffer_uptodate
+#undef buffer_dirty
+BUFFER_FNS(Uptodate, uptodate)
+BUFFER_FNS(Dirty, dirty)
+
+#define clear_buffer_dirty  mark_buffer_clean
+
+#endif  /* LINUX_VERSION_CODE < 2.6 */
+
+
+#endif  /* CLUSTER_DLM_COMPAT_H */
+

Added: trunk/cluster/dlmcommon.h
===================================================================
--- trunk/cluster/dlmcommon.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/dlmcommon.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,52 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmcommon.h
+ *
+ * Common stuff
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_DLMCOMMON_H
+#define CLUSTER_DLMCOMMON_H
+
+/* Fail loudly: log file/line and BUG() the kernel when the condition
+ * is false.  Unlike BUG_ON this always prints the location first. */
+#define DLM_ASSERT(x)       ({  if (!(x)) { printk("assert failed! %s:%d\n", __FILE__, __LINE__); BUG(); } })
+
+/* Opaque context handles; the full definitions live with the
+ * node-manager, dlm and heartbeat subsystems respectively. */
+typedef struct _nm_ctxt nm_ctxt;
+typedef struct _dlm_ctxt dlm_ctxt;
+typedef struct _heartbeat_ctxt heartbeat_ctxt;
+
+#define CLUSTER_DISK_UUID_LEN      32      // 16 byte binary == 32 char hex string
+
+/* Describes the shared disk region used for cluster heartbeat. */
+typedef struct _cluster_disk
+{
+	// uuid of disk, as a NUL-terminated hex string
+	char uuid[CLUSTER_DISK_UUID_LEN+1];
+	// all the rest are for heartbeat
+	kdev_t dev;
+	u32 blocksize_bits;
+	u32 num_blocks;
+	u64 start_block;
+	util_rarray slots;
+} cluster_disk;
+
+
+#endif /* CLUSTER_DLMCOMMON_H */

Added: trunk/cluster/dlmmaster.c
===================================================================
--- trunk/cluster/dlmmaster.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/dlmmaster.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,967 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmmaster.c
+ *
+ * standalone DLM module
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+
+
+#include "heartbeat.h"
+#include "nodemanager.h"
+#include "tcp.h"
+#include "dlmmod.h"
+
+
+
+spinlock_t dlm_master_lock = SPIN_LOCK_UNLOCKED;
+LIST_HEAD(dlm_master_list);
+
+
+static int dlm_init_mle(dlm_master_list_entry *mle, int type, dlm_ctxt *dlm, 
+			 dlm_lock_resource *res, struct qstr *name, int locked);
+
+/* Initialize a master list entry of the given @type.
+ *
+ * DLM_MLE_MASTER entries point at the lock resource being mastered
+ * (@res); other types copy the lock name from @name instead.  The
+ * dlm's current node_map is snapshotted into the mle's node_map and
+ * vote_map (with our own bit cleared) and heartbeat up/down callbacks
+ * are registered so membership changes are tracked per-mle.
+ *
+ * @locked: nonzero if the caller already holds dlm->spinlock.
+ * Returns 0 on success, -EINVAL if callback registration fails.
+ *
+ * NOTE(review): the strncpy() for non-master types copies only the
+ * name bytes -- it does not NUL-terminate and never sets
+ * mle->u.name.len or .hash on the (unzeroed) kmalloc'd entry;
+ * presumably dlm_mle_equal() or a later assignment covers this --
+ * confirm against dlmmod.h.
+ */
+static int dlm_init_mle(dlm_master_list_entry *mle, int type, dlm_ctxt *dlm, 
+			 dlm_lock_resource *res, struct qstr *name, int locked)
+{
+	int ret = 0;
+	
+	mle->dlm = dlm;
+	mle->type = type;
+	INIT_LIST_HEAD(&mle->list);
+	memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
+	spin_lock_init(&mle->spinlock);
+	init_waitqueue_head(&mle->wq);
+	atomic_set(&mle->woken, 0);
+	atomic_set(&mle->refcnt, 1);
+	memset(mle->response_map, 0, sizeof(mle->response_map));
+	mle->master = NM_MAX_NODES;
+	mle->error = 0;
+
+	if (mle->type == DLM_MLE_MASTER) 
+		mle->u.res = res;
+	else 
+		strncpy(mle->u.name.name, name->name, name->len);
+		
+	if (!locked)
+		spin_lock(&dlm->spinlock);
+
+	/* copy off the node_map and register hb callbacks on our copy */
+	memcpy(mle->node_map, dlm->node_map, sizeof(mle->node_map));
+	memcpy(mle->vote_map, dlm->node_map, sizeof(mle->vote_map));
+	clear_bit(dlm->group_index, mle->vote_map);
+	clear_bit(dlm->group_index, mle->node_map);
+
+#warning cannot do this here cuz this kmallocs and we are under a spinlock dammit
+	if (hb_register_callback(HB_NODE_DOWN_CB, dlm_mle_node_down, mle, DLM_HB_NODE_DOWN_PRI+1) ||
+	    hb_register_callback(HB_NODE_UP_CB, dlm_mle_node_up, mle, DLM_HB_NODE_UP_PRI+1)) {
+		ret = -EINVAL;
+	}
+
+	if (!locked)
+		spin_unlock(&dlm->spinlock);
+
+	return ret;
+}
+
+
+
+
+/////////////////////////////////////////////////
+//
+// TODO: change these comments to reflect reality
+// 
+//    master_request(target=me)
+//    wait for all responses
+//    if maybe_map is 0 there are no others in progress
+//        assert_master(me)
+//    else (maybe_map has some nodes in it)
+//        (nodes in maybe_map had better be < my node num)
+//        wait for assert_master
+//    endif     
+//
+//    
+//    receive:
+//        master_request(target):
+//            if i own it, return YES
+//            if i dont know anything about it, return NO
+//            if i have it in progress
+//                if my node number is lower
+//                    return MAYBE
+//                else
+//                    if target < lowest_so_far, lowest_so_far=target
+//                    return NO
+//
+//        assert_master(master):
+//            if i own it, BUG()!!!
+//            if i have it, but owner!=master, BUG()!!!
+//            if i dont know anything about it, ignore
+//            if i have it in progress
+//                if lowest_so_far != master
+//                    BUG()!!!
+//                else
+//                    set the owner, DONE
+//
+/////////////////////////////////////////////////
+
+
+/* Drop a reference on an mle.  On the final put the entry is removed
+ * from dlm_master_list, its heartbeat callbacks are unregistered and
+ * it is freed.  atomic_dec_and_lock() only acquires dlm_master_lock
+ * when the count actually reaches zero, so non-final puts never take
+ * the lock. */
+void dlm_put_mle(dlm_master_list_entry *mle)
+{
+	if (atomic_dec_and_lock(&mle->refcnt, &dlm_master_lock)) {
+		list_del(&mle->list);
+		spin_unlock(&dlm_master_lock);
+		hb_unregister_callback(HB_NODE_DOWN_CB, dlm_mle_node_down, mle);
+		hb_unregister_callback(HB_NODE_UP_CB, dlm_mle_node_up, mle);
+		kfree(mle);
+	}
+}
+
+
+
+
+/*
+ * lookup a lock resource by name.
+ * may already exist in the hashtable.
+ * 
+ * if not, allocate enough for the lockres and for
+ * the temporary structure used in doing the mastering.
+ * 
+ * also, do a lookup in the dlm_master_list to see
+ * if another node has begun mastering the same lock.
+ * if so, there should be a block entry in there
+ * for this name, and we should *not* attempt to master
+ * the lock here.   need to wait around for that node
+ * to assert_master (or die).
+ *
+ */
+/* Returns the lockres with res->spinlock held, or NULL on allocation
+ * failure.  See the comment block above for the lookup/mastering
+ * protocol. */
+dlm_lock_resource * dlm_get_lock_resource(dlm_ctxt *dlm, struct qstr *lockname, int flags)
+{
+	dlm_lock_resource *tmpres=NULL, *res=NULL;
+	struct list_head *bucket;
+	dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
+	struct list_head *iter;
+	int blocked = 0;
+	int map_changed = 0, restart = 0, assert = 0;
+	int ret, start, bit;
+	
+	bucket = &(dlm->resources[lockname->hash & DLM_HASH_MASK]);
+
+	/* pre-allocate a dlm_lock_resource and master stuff */
+	mle = kmalloc(sizeof(dlm_master_list_entry), GFP_KERNEL);
+	res = kmalloc(sizeof(dlm_lock_resource), GFP_KERNEL);
+	if (!mle || !res) {
+		printk("could not allocate memory for new lock resource!\n");
+		if (mle)
+			kfree(mle);
+		if (res)
+			kfree(res);
+		return NULL;
+	}
+
+	/* check for pre-existing lock */
+	spin_lock(&dlm->spinlock);
+	tmpres = __dlm_lookup_lock(dlm, lockname);
+	if (tmpres) {
+		spin_unlock(&dlm->spinlock);
+		/* TODO: return error, or return the lockres ?!? */
+		kfree(res);
+		kfree(mle);
+		/* waits for any outstanding work to finish 
+		 * will hold tmpres->spinlock on exit */
+		dlm_wait_on_lockres(tmpres);
+		return tmpres;
+	}
+
+	dlm_init_lockres(res, lockname);
+
+	if (flags & LKM_LOCAL) {
+		/* caller knows it's safe to assume it's not mastered elsewhere
+		 * DONE!  return right away */
+		/* BUGFIX: the pre-allocated mle is unused on this fast
+		 * path and was previously leaked here */
+		kfree(mle);
+		list_add_tail(&res->list, bucket);
+		res->owner = dlm->group_index;
+		res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	
+		/* return with res->spinlock held */
+
+		/* lock ordering note: this lockres will not be 
+		 * visible until i release dlm->spinlock, so it 
+		 * is ok to release dlm->spinlock out of order here */
+		spin_lock(&res->spinlock);
+		
+		spin_unlock(&dlm->spinlock);
+		return res;
+	}
+		
+	/* look in master list to see if another node has started mastering this */
+	spin_lock(&dlm_master_lock);
+	list_for_each(iter, &dlm_master_list) {
+		tmpmle = list_entry(iter, dlm_master_list_entry, list);
+		if (!dlm_mle_equal(dlm, tmpmle, lockname))
+			continue;
+
+		if (tmpmle->type == DLM_MLE_MASTER) {
+			printk("impossible!  master entry for nonexistent lock!\n");
+			BUG();
+		}
+		dlm_get_mle(tmpmle);
+		blocked = 1;
+		// found a block!  must wait for lock to be mastered by another node
+		break;
+	}
+
+	if (!blocked) {
+		/* go ahead and try to master lock on this node */
+		if (dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 1)) {
+			printk("bug! failed to register hb callbacks\n");
+			BUG();
+		}
+		list_add(&mle->list, &dlm_master_list);
+	}
+	spin_unlock(&dlm_master_lock);
+
+	/* at this point there is either a DLM_MLE_BLOCK or a DLM_MLE_MASTER 
+	 * on the master list, so it's safe to add the lockres to the hashtable.
+	 * anyone who finds the lock will still have to wait on the IN_PROGRESS. 
+	 * also, any new nodes that try to join at this point will have to wait
+	 * until my dlm_master_lock list is empty, so they cannot possibly 
+	 * do any master requests yet... TODO
+	 * ?? should i have a special type of mle just for joining nodes ?? 
+	 * ?? could allow them to come in and put their mle on the list and sleep ?? */
+
+	/* finally add the lockres to its hash bucket */
+	list_add_tail(&res->list, bucket);
+	spin_unlock(&dlm->spinlock);
+
+	if (blocked) {
+		/* must wait for lock to be mastered elsewhere */
+		kfree(mle);
+		mle = tmpmle;
+		goto wait;
+	}
+
+	/* send a master request to every other live node; stop early if
+	 * one of them already owns the lock */
+	ret = -EINVAL;
+	start = 0;
+	while (1) {
+		bit = find_next_bit (mle->vote_map, NM_MAX_NODES, start);
+		if (bit >= NM_MAX_NODES) {
+			printk("no more nodes\n");
+			break;
+		}
+		
+		ret = dlm_do_master_request(mle, bit);
+		if (ret < 0) {
+			// TODO
+			//printk("dlm_do_master_request returned %d!\n", ret);
+		}
+		if (mle->master != NM_MAX_NODES) {
+			// found a master!
+			break;
+		}
+		start = bit+1;
+	}
+
+wait:
+	/* wait until either another node asserts mastership or every
+	 * response is in and this node can assert itself */
+	while (1) {
+		spin_lock(&res->spinlock);
+		if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			// another node has become the owner
+			spin_unlock(&res->spinlock);
+			break;
+		}
+		spin_unlock(&res->spinlock);
+
+		spin_lock(&mle->spinlock);
+		if (mle->master != NM_MAX_NODES) {
+			u16 m = mle->master;
+			// printk("node %u is the master!\n", m);
+			spin_unlock(&mle->spinlock);
+
+			spin_lock(&res->spinlock);
+			res->owner = m;
+			spin_unlock(&res->spinlock);
+			break;
+		}
+		restart = 0;
+		map_changed = (memcmp(mle->vote_map, mle->node_map, sizeof(mle->vote_map)) != 0);
+		if (memcmp(mle->vote_map, mle->response_map, sizeof(mle->vote_map)) == 0) {
+			// printk("every node has responded...\n");
+			if (map_changed) {
+				printk("eek! got all original nodes, but nodemap changed while collecting responses\n");
+				restart = 1;
+			}
+
+			if (mle->error) {
+				printk("ugh.  some node hit an error (-ENOMEM).  try the whole thing again\n"); 
+				mle->error = 0;
+				/* TODO: treat this just like the dead node case below,
+				 * cleanup and start over, but keep the error node around */
+				restart = 1;
+			}
+
+			if ((bit = find_next_bit (mle->maybe_map, NM_MAX_NODES, 0)) >= NM_MAX_NODES) {
+				/* no other nodes are in-progress */
+				/* those nodes should all be locking out this lockid until I assert */
+				/* they should have put a dummy entry on dlm_master_list */
+				/* need to assert myself as the master */
+				
+				// printk("I am the only node in-progress!  asserting myself as master\n");
+				assert = 1;
+			} else {
+				/* other nodes are in-progress */
+				if (map_changed && !test_bit(bit, mle->node_map)) {
+					/* TODO: need to copy the node_map into the vote_map, zero 
+					 * everything out and start over */
+					printk("need to handle this case!  winning node %u just died!\n", bit);
+					restart = 1;
+				}
+
+				if (bit > dlm->group_index) {
+					// printk("next in-progress node (%u) is higher than me (%u)\n",
+					//        bit, dlm->group_index);
+
+					/* nodes not in-progress should be locking out this lockid until I assert */
+					/* in-progress nodes should match me up with their lowest maybe_map bit */
+					/* need to assert myself as the master */
+
+					// printk("I am the lowest node!  asserting myself as master\n");
+					assert = 1;
+				} else {
+					/* need to sit around and wait for assert */
+					/* my lowest maybe_map bit should be the one to assert */
+					/* just fall through and sleep. should be woken by the handler */
+
+					// printk("sleeping while waiting for %u to assert himself as master\n", bit);
+				}
+			}
+		} else {
+			if (map_changed) {
+				/* TODO: need to handle this */
+				printk("eek! nodemap changed while collecting responses\n");
+				restart = 1;
+			}
+			// printk("still waiting for all nodes to respond...\n");
+		}
+
+		if (restart && assert)
+			assert = 0;
+
+		/* make sure to tell any other nodes that i am mastering this */
+		if (assert)
+			mle->master = dlm->group_index;
+
+		spin_unlock(&mle->spinlock);
+		
+		if (assert) {
+			ret = dlm_do_assert_master(mle);
+			// printk("assert returned %d!\n", ret);
+			if (ret == 0) {
+				spin_lock(&res->spinlock);
+				res->owner = dlm->group_index;
+				spin_unlock(&res->spinlock);
+				// printk("wooo!  i am the owner.  phew!\n");
+				break;
+			} else 
+				restart = 1;
+		}
+		if (restart) {
+			printk("something happened such that the master process needs to be restarted!\n");
+			/* TODO: clear it all out and start over */
+		}
+
+		atomic_set(&mle->woken, 0);
+		ret = util_wait_atomic_eq(&mle->wq, &mle->woken, 1, 5000);
+	}
+	dlm_put_mle(mle);
+
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	wake_up(&res->wq);
+
+	/* exits holding res->spinlock */
+	return res;
+}
+	
+
+
+
+/*
+ * locks that can be taken here:
+ * dlm->spinlock
+ * res->spinlock
+ * mle->spinlock
+ * dlm_master_list
+ *
+ * if possible, TRIM THIS DOWN!!!
+ */
+/* Network handler for an incoming master request.
+ *
+ * Decides this node's answer about who masters the named lock:
+ *   YES   - this node owns the lockres
+ *   NO    - another known node owns it, or this node is blocked on it
+ *   MAYBE - this node is itself in the middle of mastering it
+ *   ERROR - allocation failure on this node
+ * The requesting node's bit is recorded in the matching mle's
+ * maybe_map so the mastering protocol can see who is in progress.
+ * The response code is the handler's return value, which is sent back
+ * to the requester by the net layer. */
+int dlm_master_request_handler(net_msg *msg, u32 len, void *data)
+{
+	u8 response = DLM_MASTER_RESP_MAYBE;
+	dlm_ctxt *dlm = data;
+	dlm_lock_resource *res;
+	dlm_master_request *request = (dlm_master_request *) msg->buf;
+	dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
+	struct qstr lockname = { .name=request->name, .len=request->namelen };
+	int found;
+	struct list_head *iter;
+
+	lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+/* restart point: after dropping the locks to kmalloc a new mle, the
+ * whole lookup must be redone since the world may have changed */
+way_up_top:	
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lock(dlm, &lockname);
+	if (res) {
+		spin_unlock(&dlm->spinlock);
+
+		/* take care of the easy cases up front */
+		spin_lock(&res->spinlock);
+		if (res->owner == dlm->group_index) {
+			spin_unlock(&res->spinlock);
+			// printk("this node is the master\n");
+			response = DLM_MASTER_RESP_YES;
+			if (mle)
+				kfree(mle);
+			goto send_response;
+		} else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			spin_unlock(&res->spinlock);
+			// printk("node %u is the master\n", res->owner);
+			response = DLM_MASTER_RESP_NO;
+			if (mle)
+				kfree(mle);
+			goto send_response;
+		}
+
+		/* ok, there is no owner.  either this node is 
+		 * being blocked, or it is actively trying to
+		 * master this lock. */
+		if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
+			printk("bug! lock with no owner should be in-progress!\n");
+			BUG();
+		}
+
+		// printk("lockres is in progress...\n");
+		found = 0;
+		spin_lock(&dlm_master_lock);
+		list_for_each(iter, &dlm_master_list) {
+			tmpmle = list_entry(iter, dlm_master_list_entry, list);
+			if (!dlm_mle_equal(dlm, tmpmle, &lockname))
+				continue;
+
+			/* an mle exists: answer NO if we are merely
+			 * blocked, MAYBE if we are mastering, and record
+			 * the requester as in-progress either way */
+			dlm_get_mle(tmpmle);
+			spin_lock(&tmpmle->spinlock);
+			if (tmpmle->type == DLM_MLE_BLOCK) {
+				// printk("this node is waiting for lockres to be mastered\n");
+				response = DLM_MASTER_RESP_NO;
+			} else {
+				// printk("this node is attempting to master lockres\n");
+				response = DLM_MASTER_RESP_MAYBE;
+			}
+			set_bit(request->node_idx, tmpmle->maybe_map);
+			spin_unlock(&tmpmle->spinlock);
+
+			spin_unlock(&dlm_master_lock);
+			spin_unlock(&res->spinlock);
+
+			dlm_put_mle(tmpmle);
+			if (mle)
+				kfree(mle);
+			goto send_response;
+		}
+		spin_unlock(&dlm_master_lock);
+		spin_unlock(&res->spinlock);
+		printk("bug bug bug!!!  no mle found for this lock!\n");
+		BUG();
+	}
+	
+	/* 
+	 * lockres doesn't exist on this node 
+	 * if there is an MLE_BLOCK, return NO 
+	 * if there is an MLE_MASTER, return MAYBE
+	 * otherwise, add an MLE_BLOCK, return NO 
+	 */
+	found = 0;
+	spin_lock(&dlm_master_lock);
+	list_for_each(iter, &dlm_master_list) {
+		tmpmle = list_entry(iter, dlm_master_list_entry, list);
+		if (!dlm_mle_equal(dlm, tmpmle, &lockname))
+			continue;
+		dlm_get_mle(tmpmle);
+		found = 1;
+		break;
+	}
+
+	if (!found) {
+		/* this lockid has never been seen on this node yet */
+		// printk("no mle found\n");
+		if (!mle) {
+			/* cannot kmalloc under the spinlocks; drop them,
+			 * allocate, then redo the lookup from the top */
+			spin_unlock(&dlm_master_lock);
+			spin_unlock(&dlm->spinlock);
+	
+			mle = kmalloc(sizeof(dlm_master_list_entry) + lockname.len, GFP_KERNEL);
+			if (!mle) {
+				// bad bad bad... this sucks.
+				response = DLM_MASTER_RESP_ERROR;
+				goto send_response;
+			}
+			if (dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, &lockname, 0)) {
+				printk("eeek!\n");
+				response = DLM_MASTER_RESP_ERROR;
+				dlm_put_mle(mle);
+				goto send_response;
+			}
+			goto way_up_top;
+		}
+
+		// printk("this is second time thru, already allocated, add the block.\n");
+		set_bit(request->node_idx, mle->maybe_map);
+		list_add(&mle->list, &dlm_master_list);
+		response = DLM_MASTER_RESP_NO;
+	} else {
+		// printk("mle was found\n");
+		spin_lock(&tmpmle->spinlock);
+		if (tmpmle->type == DLM_MLE_BLOCK)
+			response = DLM_MASTER_RESP_NO;
+		else
+			response = DLM_MASTER_RESP_MAYBE;
+		set_bit(request->node_idx, tmpmle->maybe_map);
+		spin_unlock(&tmpmle->spinlock);
+		dlm_put_mle(tmpmle);
+	}
+	spin_unlock(&dlm_master_lock);
+	spin_unlock(&dlm->spinlock);
+
+send_response:
+	//ret = dlm_do_master_request_resp(dlm, &lockname, response, request->node_idx);
+	//printk("response returned %d\n", ret);
+	
+	// printk("sending response %d to other node\n", response);
+	return response;
+}
+
+/* NOTE: when doing node recovery, run the dlm_master_list looking for the dead node in 
+ * any maybe_map... clear that bit, and if now empty, clear the whole thing */
+
+/*
+ * locks that can be taken here:
+ * mle->spinlock
+ * dlm_master_list
+ *
+ */
+/* Network handler for a master request response.
+ *
+ * Finds the in-progress mle matching the lock name, records the
+ * responding node's answer in the response/maybe maps, and wakes the
+ * waiter in dlm_get_lock_resource() when a definitive answer is in
+ * (a YES, an error, or a complete set of responses).
+ * Always returns 0. */
+int dlm_master_request_resp_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	dlm_master_list_entry *mle = NULL;
+	dlm_master_request_resp *resp = (dlm_master_request_resp *) msg->buf;
+	int found = 0, wake = 0;
+	struct list_head *iter;
+	struct qstr lockname = { .name=resp->name, .len=resp->namelen };
+	
+
+	lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+	spin_lock(&dlm_master_lock);
+	list_for_each(iter, &dlm_master_list) {
+		mle = list_entry(iter, dlm_master_list_entry, list);
+		if (!dlm_mle_equal(dlm, mle, &lockname)) {
+			mle = NULL;
+			continue;
+		}
+
+		dlm_get_mle(mle);
+		/* BUGFIX: mark the mle found as soon as the reference is
+		 * taken.  previously the DLM_MLE_BLOCK break below left
+		 * found==0, leaking the reference just taken and falsely
+		 * printing the "no matching request" message. */
+		found = 1;
+		if (mle->type == DLM_MLE_BLOCK) {
+			printk("eek! cannot get a response for a block!\n");
+			break;
+		}
+		wake = 0;
+		spin_lock(&mle->spinlock);
+		switch (resp->response) {
+			case DLM_MASTER_RESP_YES:
+				set_bit(resp->node_idx, mle->response_map);
+				// printk("woot!  node %u is the master!\n", resp->node_idx);
+				mle->master = resp->node_idx;
+				wake = 1;
+				break;
+			case DLM_MASTER_RESP_NO:
+				// printk("node %u is not the master, not in-progress\n", resp->node_idx);
+				set_bit(resp->node_idx, mle->response_map);
+				/* wake only once every voting node has answered */
+				if (memcmp(mle->response_map, mle->vote_map, sizeof(mle->vote_map))==0)
+					wake = 1;
+				break;
+			case DLM_MASTER_RESP_MAYBE:
+				// printk("node %u is not the master, but IS in-progress\n", resp->node_idx);
+				set_bit(resp->node_idx, mle->response_map);
+				set_bit(resp->node_idx, mle->maybe_map);
+				if (memcmp(mle->response_map, mle->vote_map, sizeof(mle->vote_map))==0)
+					wake = 1;
+				break;
+			case DLM_MASTER_RESP_ERROR:
+				printk("node %u hit an -ENOMEM!  try this whole thing again\n", resp->node_idx);
+				mle->error = 1;
+				wake = 1;
+				break;
+			default:
+				printk("bad response! %u\n", resp->response);
+				break;
+		}
+		if (wake) {		
+			atomic_set(&mle->woken, 1);
+			wake_up(&mle->wq);
+		}
+		spin_unlock(&mle->spinlock);
+		break;
+	}
+	spin_unlock(&dlm_master_lock);
+
+	if (found)
+		dlm_put_mle(mle);
+	else
+		printk("hrrm... got a master resp but found no matching request\n");
+	return 0;
+}
+
+/*
+ * locks that can be taken here:
+ * dlm->spinlock
+ * res->spinlock
+ * mle->spinlock
+ * dlm_master_list
+ *
+ * if possible, TRIM THIS DOWN!!!
+ */
+/* Network handler for an assert_master message.
+ *
+ * Sanity-checks that the asserting node is the one this node expected
+ * (the lowest set bit in the matching mle's maybe_map, and consistent
+ * with any existing lockres owner), then records the asserter as
+ * master in the mle and wakes the waiter in dlm_get_lock_resource().
+ * Inconsistencies are treated as fatal (BUG).  Always returns 0. */
+int dlm_assert_master_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	dlm_master_list_entry *mle = NULL;
+	dlm_assert_master *assert = (dlm_assert_master *)msg->buf;
+	dlm_lock_resource *res;
+	int bit;
+	struct list_head *iter;
+	struct qstr lockname = { .name=assert->name, .len=assert->namelen };
+
+	lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+	spin_lock(&dlm->spinlock);
+
+	/* find the MLE */
+	spin_lock(&dlm_master_lock);
+	list_for_each(iter, &dlm_master_list) {
+		mle = list_entry(iter, dlm_master_list_entry, list);
+		if (dlm_mle_equal(dlm, mle, &lockname)) {
+			dlm_get_mle(mle);
+			break;
+		}
+		mle = NULL;
+	}
+	if (!mle) {
+		printk("EEEEEEK!  just got an assert_master from %u, but no MLE for it!\n",
+		       assert->node_idx);
+		spin_unlock(&dlm_master_lock);
+		goto check_lockres;
+	}
+	/* the asserter must be the lowest in-progress node we know of */
+	if ((bit = find_next_bit (mle->maybe_map, NM_MAX_NODES, 0)) >= NM_MAX_NODES) {
+		printk("EEK! no bits set in the maybe_map, but %u is asserting!\n",
+		       assert->node_idx);
+		BUG();
+	} else if (bit != assert->node_idx) {
+		/* TODO: is this ok?  */
+		printk("EEK! expected %u to be the master, but %u is asserting!\n", 
+		       bit, assert->node_idx);
+		BUG();
+	}
+	spin_unlock(&dlm_master_lock);
+
+	/* ok everything checks out with the MLE
+	 * now check to see if there is a lockres */
+check_lockres:
+	res = __dlm_lookup_lock(dlm, &lockname);
+	if (res) {
+		spin_lock(&res->spinlock);
+		if (!mle) {
+			/* no mle: the assert is only legal if the asserter
+			 * already owns the lockres */
+			if (res->owner != assert->node_idx) {
+				printk("EEEEeeEEeeEEEK!  assert_master from %u, but current owner is %u!\n",
+				       assert->node_idx, res->owner);
+				BUG();
+			}
+		} else {
+			/* mle exists: the lockres must still be unowned
+			 * and marked in-progress */
+			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+				printk("EEEEEEEEEEEEEEEEEK!!! got assert_master from node %u, but %u is the owner!\n",
+			       		assert->node_idx, res->owner);
+				printk("goodnite!\n");
+				BUG();
+			}
+			if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
+				printk("bug! got assert from %u, but lock with no owner should be in-progress!\n",
+			       		assert->node_idx);
+				BUG();
+			}
+		}
+		spin_unlock(&res->spinlock);
+	}
+	spin_unlock(&dlm->spinlock);
+
+	// printk("woo!  got an assert_master from node %u!\n", assert->node_idx);
+	if (mle) {
+		spin_lock(&mle->spinlock);
+		mle->master = assert->node_idx;
+		atomic_set(&mle->woken, 1);
+		wake_up(&mle->wq);
+		spin_unlock(&mle->spinlock);
+	
+		/* if this is the last put, it will be removed from the list */
+		dlm_put_mle(mle);
+	}
+	return 0;
+}
+
+
+/* Send a master request for the mle's lock to node @to and record the
+ * synchronous response in the mle's response/maybe maps (same handling
+ * as dlm_master_request_resp_handler).
+ * Returns the net_send_message result, or -EINVAL if the target node's
+ * inode cannot be found or the response code is unrecognized. */
+int dlm_do_master_request(dlm_master_list_entry *mle, int to)
+{
+	struct inode *inode = NULL;
+	dlm_ctxt *dlm = mle->dlm;
+	dlm_master_request request;
+	int ret, response=0;
+
+	/* request is zeroed first, so the strncpy'd name stays
+	 * NUL-terminated as long as namelen < sizeof(request.name) */
+	memset(&request, 0, sizeof(request));
+	request.node_idx = dlm->group_index;
+	if (mle->type == DLM_MLE_BLOCK) {
+		request.namelen = mle->u.name.len;
+		strncpy(request.name, mle->u.name.name, request.namelen);
+	} else {
+		request.namelen = mle->u.res->lockname.len;
+		strncpy(request.name, mle->u.res->lockname.name, request.namelen);
+	}
+
+	ret = -EINVAL;
+	inode = nm_get_group_node_by_index(dlm->group, to);
+	if (inode) {
+		ret = net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, sizeof(request), inode, &response);
+		iput(inode);
+		if (ret >= 0) {
+			spin_lock(&mle->spinlock);
+			switch (response) {
+				case DLM_MASTER_RESP_YES:
+					set_bit(to, mle->response_map);
+					// printk("woot!  node %u is the master!\n", to);
+					mle->master = to;
+					break;
+				case DLM_MASTER_RESP_NO:
+					// printk("node %u is not the master, not in-progress\n", to);
+					set_bit(to, mle->response_map);
+					break;
+				case DLM_MASTER_RESP_MAYBE:
+					// printk("node %u is not the master, but IS in-progress\n", to);
+					set_bit(to, mle->response_map);
+					set_bit(to, mle->maybe_map);
+					break;
+				case DLM_MASTER_RESP_ERROR:
+					printk("node %u hit an -ENOMEM!  try this whole thing again\n", to);
+					mle->error = 1;
+					break;
+				default:
+					printk("bad response! %u\n", response);
+					ret = -EINVAL;
+					break;
+			}
+			spin_unlock(&mle->spinlock);
+		} else {
+			printk("net_send_message returned %d!\n", ret);
+		}
+	} else {
+		printk("nm_get_group_node_by_index failed to find inode for node %d!\n", to);
+	}	
+	return ret;
+}
+
+/* Send an explicit master-request response for @name to node @to.
+ * Returns the net_send_message result, or -EINVAL when the target
+ * node's inode cannot be looked up. */
+int dlm_do_master_request_resp(dlm_ctxt *dlm, struct qstr *name, int response, int to)
+{
+	dlm_master_request_resp resp;
+	struct inode *node;
+	int status;
+
+	node = nm_get_group_node_by_index(dlm->group, to);
+	if (!node)
+		return -EINVAL;
+
+	/* zeroed first so the copied name stays NUL-terminated */
+	memset(&resp, 0, sizeof(resp));
+	resp.node_idx = dlm->group_index;
+	resp.response = response;
+	resp.namelen = name->len;
+	strncpy(resp.name, name->name, name->len);
+
+	status = net_send_message(DLM_MASTER_REQUEST_RESP_MSG, dlm->key,
+				  &resp, sizeof(resp), node, NULL);
+	iput(node);
+	return status;
+}
+
+/*
+ * NOTE: this can be used for debugging
+ * can periodically run all locks owned by this node
+ * and re-assert across the cluster...
+ */
+/* Broadcast an assert_master for the mle's lock to every node in its
+ * vote_map.  Stops and returns the first error; returns 0 when every
+ * node was reached. */
+int dlm_do_assert_master(dlm_master_list_entry *mle)
+{
+	struct inode *inode = NULL;
+	dlm_ctxt *dlm = mle->dlm;
+	dlm_assert_master assert;
+	int to, start = 0, ret = 0, tmpret;
+
+	/* the payload is identical for every target, so build it once
+	 * instead of on each loop iteration.  memset first keeps the
+	 * strncpy'd name NUL-terminated. */
+	memset(&assert, 0, sizeof(assert));
+	assert.node_idx = dlm->group_index;
+	if (mle->type == DLM_MLE_BLOCK) {
+		assert.namelen = mle->u.name.len;
+		strncpy(assert.name, mle->u.name.name, assert.namelen);
+	} else {
+		assert.namelen = mle->u.res->lockname.len;
+		strncpy(assert.name, mle->u.res->lockname.name, assert.namelen);
+	}
+
+	while (1) {
+		to = find_next_bit (mle->vote_map, NM_MAX_NODES, start);
+		if (to >= NM_MAX_NODES) {
+			// printk("no more nodes\n");
+			break;
+		}
+		// printk("sending assert master to %d\n", to);
+
+		inode = nm_get_group_node_by_index(dlm->group, to);
+		if (!inode) {
+			tmpret = -EINVAL;
+			printk("could not get nm info for node %d!  need to retry this whole thing\n", to);
+			ret = tmpret;
+			break;
+		}
+		/* BUGFIX: this was sent as DLM_MASTER_REQUEST_MSG, which
+		 * dispatches to dlm_master_request_handler on the remote
+		 * node even though the payload is a dlm_assert_master.
+		 * send it with the assert message type so it reaches
+		 * dlm_assert_master_handler. */
+		tmpret = net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, &assert, sizeof(assert), inode, NULL);
+		iput(inode);
+
+		if (tmpret < 0) {
+			// TODO
+			// printk("assert_master returned %d!\n", tmpret);
+			ret = tmpret;
+			break;
+		}
+		start = to+1;
+	}
+
+	return ret;
+}
+
+
+
+
+
+
+/* Heartbeat callback: node @idx in @group went down.  Clears the
+ * node's bit in the mle's node_map so the mastering loop can notice
+ * that the membership changed. */
+void dlm_mle_node_down(struct inode *group, struct inode *node, int idx, void *data)
+{
+	dlm_master_list_entry *entry = data;
+	dlm_ctxt *dlm;
+
+	if (!entry) {
+		printk("eek! NULL mle!\n");
+		return;
+	}
+	dlm = entry->dlm;
+	if (!dlm) {
+		printk("eek! NULL dlm\n");
+		return;
+	}
+	/* ignore events for other groups */
+	if (dlm->group != group)
+		return;
+
+	spin_lock(&entry->spinlock);
+	if (test_bit(idx, entry->node_map))
+		clear_bit(idx, entry->node_map);
+	else
+		printk("node %u already removed from nodemap!\n", idx);
+	spin_unlock(&entry->spinlock);
+}
+
+/* Heartbeat callback: node @idx in @group came up.  Sets the node's
+ * bit in the mle's node_map so the mastering loop can notice that the
+ * membership changed. */
+void dlm_mle_node_up(struct inode *group, struct inode *node, int idx, void *data)
+{
+	dlm_master_list_entry *entry = data;
+	dlm_ctxt *dlm;
+
+	if (!entry) {
+		printk("eek! NULL mle!\n");
+		return;
+	}
+	dlm = entry->dlm;
+	if (!dlm) {
+		printk("eek! NULL dlm\n");
+		return;
+	}
+	/* ignore events for other groups */
+	if (dlm->group != group)
+		return;
+
+	spin_lock(&entry->spinlock);
+	if (test_bit(idx, entry->node_map))
+		printk("node %u already in node map!!!\n", idx);
+	else
+		set_bit(idx, entry->node_map);
+	spin_unlock(&entry->spinlock);
+}

Added: trunk/cluster/dlmmod.c
===================================================================
--- trunk/cluster/dlmmod.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/dlmmod.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,1652 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmmod.c
+ *
+ * standalone DLM module
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+
+
+#include "heartbeat.h"
+#include "nodemanager.h"
+#include "tcp.h"
+#include "dlmmod.h"
+
+MODULE_LICENSE ("GPL");
+MODULE_AUTHOR("Oracle Corporation");
+//MODULE_DESCRIPTION("Oracle DLM");
+
+
+/*
+ *
+ * spinlock lock ordering: if multiple locks are needed, always obey this ordering:
+ *    dlm_domain_lock -> dlm_ctxt -> dlm_lock_resource -> dlm_lock
+ *
+ */
+
+
+static int __init dlm_driver_entry (void);
+static int dlm_read_params(void);
+static void __exit dlm_driver_exit (void);
+
+
+
+/* list of all active dlm domains, protected by dlm_domain_lock */
+LIST_HEAD(dlm_domains);
+spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED;
+/* this node's global slot number, filled in once at module load */
+u16 dlm_global_index = NM_MAX_NODES;
+/* protects dlm_next_cookie, the per-node lock cookie generator */
+static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED;
+static u64 dlm_next_cookie = 1;
+
+dlm_status dlm_send_remote_convert_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type);
+dlm_status dlm_send_remote_lock_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags);
+/* FIX: this prototype was declared twice in the original */
+int dlm_lock_owner_broadcast(dlm_ctxt *dlm, dlm_lock_resource *res);
+static dlm_ctxt * __dlm_lookup_domain(char *domain);
+int dlm_send_proxy_ast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int type, int blocked_type);
+
+void dlm_wait_on_lockres(dlm_lock_resource *res);
+void __dlm_wait_on_lockres(dlm_lock_resource *res);
+
+
+/* ----------------------------------------------------------------- */
+
+extern spinlock_t dlm_master_lock;
+extern struct list_head dlm_master_list;
+
+/* network message bodies for lock operations.  the lock name travels
+ * inline as "namelen" bytes in "name" (not guaranteed NUL-terminated);
+ * "cookie" is the 64-bit id that uniquely identifies the lock on its
+ * owning node, "node_idx" the sender's slot number. */
+
+/* new-lock request sent to the lockres owner */
+typedef struct _dlm_create_lock
+{
+	u16 node_idx;
+	s8 requested_type;
+	u8 namelen;
+	u8 name[NM_MAX_NAME_LEN];
+	u64 cookie;
+} dlm_create_lock;
+
+/* convert request: requested_type is the mode to convert to */
+typedef struct _dlm_convert_lock
+{
+	u16 node_idx;
+	s8 requested_type;
+	u8 namelen;
+	u8 name[NM_MAX_NAME_LEN];
+	u64 cookie;
+} dlm_convert_lock;
+
+/* unlock/cancel request; "flags" carries LKM_CANCEL etc. */
+typedef struct _dlm_unlock_lock
+{
+	u32 flags;
+	u16 node_idx;
+	u8 namelen;
+	u8 name[NM_MAX_NAME_LEN];
+	u64 cookie;
+} dlm_unlock_lock;
+
+/* proxy ast/bast delivered to a remote lock holder */
+typedef struct _dlm_proxy_ast
+{
+	u16 node_idx;
+	u8 type;
+	u8 blocked_type;
+	u8 namelen;
+	u8 name[NM_MAX_NAME_LEN];
+	u64 cookie;
+} dlm_proxy_ast;
+
+/* net message handlers for the structs above */
+int dlm_create_lock_handler(net_msg *msg, u32 len, void *data);
+int dlm_convert_lock_handler(net_msg *msg, u32 len, void *data);
+int dlm_proxy_ast_handler(net_msg *msg, u32 len, void *data);
+
+int dlm_unlock_lock_handler(net_msg *msg, u32 len, void *data);
+dlm_status dlm_send_remote_unlock_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, dlm_lockstatus *lksb, int flags);
+
+/* ----------------------------------------------------------------- */
+
+
+
+
+/*
+ * dlm_driver_entry()
+ *
+ * Driver entry point. Called on insmod.  Reads module parameters and
+ * caches this node's global slot number; fails if either is invalid.
+ */
+static int __init dlm_driver_entry (void)
+{
+	printk("Loaded dlm Driver module\n");
+
+	if (dlm_read_params() < 0)
+		return -1;
+
+	/* remember who we are; NM_MAX_NODES means "unknown node" */
+	dlm_global_index = nm_this_node(NULL);
+	return (dlm_global_index == NM_MAX_NODES) ? -1 : 0;
+}				/* dlm_driver_entry */
+
+/*
+ * dlm_read_params()
+ *
+ * Read insmod params.  There are no parameters yet, so this always
+ * succeeds.
+ */
+static int dlm_read_params(void)
+{
+	return 0;
+}				/* dlm_read_params */
+
+
+/*
+ * dlm_driver_exit()
+ *
+ * Called on rmmod.  Nothing to tear down yet.
+ */
+static void __exit dlm_driver_exit (void)
+{
+	printk("Unloaded dlm Driver module\n");
+}				/* dlm_driver_exit */
+
+
+/*
+ * dlmlock: request a new lock, or (with LKM_CONVERT) convert an
+ * existing one to a new mode.
+ *
+ * New lock: the name is hashed, the lockres looked up or created, and
+ * do_dlmlock() attaches a fresh dlm_lock via lksb->lockid.
+ * Convert: lksb->lockid must already point at the lock, and the
+ * ast/bast/astdata/lksb must match the original request exactly.
+ * Returns a dlm_status; lksb->status mirrors it on the error paths.
+ */
+dlm_status dlmlock(dlm_ctxt *dlm, int mode, dlm_lockstatus *lksb, int flags, char *name, 
+		   dlm_astlockfunc_t *ast, void *data, dlm_bastlockfunc_t *bast)
+{
+	dlm_status status;
+	dlm_lock_resource *res;
+	dlm_lock *lock = NULL;
+	char *buf = NULL;
+	int convert = 0, recovery = 0;
+	struct qstr q;
+
+	if (!lksb)
+		return DLM_BADARGS;
+
+	status = DLM_BADPARAM;
+	if (mode != LKM_EXMODE && mode != LKM_PRMODE && mode != LKM_NLMODE)
+		goto error_status;
+
+	if (flags & ~LKM_VALID_FLAGS)
+		goto error_status;
+
+	convert = (flags & LKM_CONVERT);
+	recovery = (flags & LKM_RECOVERY);
+
+	/* LKM_RECOVERY is only valid on the special recovery lock name
+	 * and never together with a convert */
+	if (recovery && (!dlm_is_recovery_lock(name, strlen(name)) ||
+		 convert) ) {
+		goto error_status;
+	}
+
+
+	if (convert) {
+		/* if converting, must pass in a valid dlm_lock */
+		if (!lksb->lockid || !lksb->lockid->lockres)
+			goto error_status;
+		lock = lksb->lockid;
+	
+		/* XXX: for ocfs2 purposes, the ast/bast/astdata/lksb are essentially
+	 	 * static after the original lock call.  convert requests will check
+	 	 * to ensure that everything is the same and pass DLM_BADARGS if not.
+	 	 * this means that DLM_DENIED_NOASTS will never be returned.
+	 	 */
+#warning differs from spec here!
+
+		if (lock->lksb != lksb || lock->ast != ast ||
+		    lock->bast != bast || lock->astdata != data) {
+			status = DLM_BADARGS;
+			printk("ERROR new args:  lksb=%p, ast=%p, bast=%p, astdata=%p\n", 
+			       lksb, ast, bast, data);
+			printk("      orig args: lksb=%p, ast=%p, bast=%p, astdata=%p\n", 
+			       lock->lksb, lock->ast, lock->bast, lock->astdata);
+			goto error_status;
+		}
+		res = lock->lockres;
+
+		down_read(&dlm->recovery_sem);
+		spin_lock(&res->spinlock);
+		if (flags & LKM_LOCAL) {
+			printk("strange LOCAL convert request!\n");
+			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+				spin_unlock(&res->spinlock);
+				status = DLM_BADPARAM;
+				goto up_error;
+			}
+			res->owner = dlm->group_index;
+			printk("set owner to this node.  you SURE thats what you want!?\n");
+		}
+		/* do_dlmconvert drops res->spinlock */
+		status = do_dlmconvert(dlm, res, lock, flags, mode);
+	} else {
+		status = DLM_BADARGS;
+		if (!name)
+			goto error;
+		
+		status = DLM_IVBUFLEN;
+		q.len = strlen(name);
+		if (q.len > DLM_LOCKID_NAME_MAX)
+			goto error;
+
+		/* the lockres keeps a pointer to the name, so make a
+		 * NUL-terminated private copy */
+		status = DLM_SYSERR;
+		buf = kmalloc(q.len+1, GFP_KERNEL);
+		if (!buf)
+			goto error;
+
+		memcpy(buf, name, q.len);
+		buf[q.len] = 0;
+		q.name = buf;
+		q.hash = full_name_hash(q.name, q.len);
+
+		if (!recovery)		
+			down_read(&dlm->recovery_sem);
+{
+	/* debug instrumentation: time the lookup with the TSC */
+	union {
+		u64 q;
+		u32 hilo[2];
+	} u1, u2;
+	rdtsc(u1.hilo[0], u1.hilo[1]);
+		res = dlm_get_lock_resource(dlm, &q, flags);
+	rdtsc(u2.hilo[0], u2.hilo[1]);
+	printk("dlm_get_lock_resource took %llu cycles\n", u2.q-u1.q);
+}
+		if (!res) {
+			status = DLM_IVLOCKID;
+			goto up_error;
+		}
+		status = do_dlmlock(dlm, res, lksb, flags, mode, ast, bast, data);
+		if (status != DLM_NORMAL)
+			goto up_error;
+	}
+
+	/* TODO: lvb */
+	if (!recovery)
+		up_read(&dlm->recovery_sem);
+	return status;
+
+up_error:
+	if (!recovery)
+		up_read(&dlm->recovery_sem);
+error:
+	if (buf)
+		kfree(buf);
+	/* FIX: only clear lksb->lockid when this was a NEW lock request.
+	 * On a failed convert the caller's existing dlm_lock is still
+	 * queued on the lockres and must remain reachable through the
+	 * lksb; the original code unconditionally NULLed it here. */
+	if (!convert)
+		lksb->lockid = NULL;
+
+error_status:
+	// this is kind of unnecessary
+	lksb->status = status;
+	return status;
+}
+
+/*
+ * do_dlmlock: allocate and initialize a new dlm_lock for a NEW lock
+ * request (converts never come here), stamp it with a node-unique
+ * cookie, then dispatch to the local or remote lock path depending
+ * on who owns the lockres.  On failure the lock is freed and
+ * lksb->lockid is reset to NULL.
+ */
+dlm_status do_dlmlock(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lockstatus *lksb, int flags,
+		      int type, dlm_astlockfunc_t *ast, dlm_bastlockfunc_t *bast, void *data)
+{
+	dlm_lock *tmplock;
+       	dlm_status status;
+	u8 *c;
+
+	dlmprintk("type=%d\n", type);
+
+	status = DLM_SYSERR;
+	tmplock = kmalloc(sizeof(dlm_lock), GFP_KERNEL);
+	if (!tmplock)
+		goto error;
+
+	memset(tmplock, 0, sizeof(dlm_lock));
+	INIT_LIST_HEAD(&tmplock->list);
+	INIT_LIST_HEAD(&tmplock->ast_list);
+	spin_lock_init(&tmplock->spinlock);
+	tmplock->lockres = res;
+	tmplock->type = type;
+	/* LKM_IVMODE == "no conversion pending" */
+	tmplock->convert_type = LKM_IVMODE;
+	tmplock->highest_blocked = LKM_IVMODE;
+	tmplock->node = dlm->group_index;
+	tmplock->ast = ast;
+	tmplock->bast = bast;
+	tmplock->astdata = data;
+	tmplock->lksb = lksb;
+
+	lksb->lockid = tmplock;
+
+	c = (u8 *)(&tmplock->cookie);
+
+	/* take the next per-node cookie, then tag byte 7 with this
+	 * node's number so cookies are unique cluster-wide.
+	 * NOTE(review): treating c[7] as the cookie's top byte assumes
+	 * a little-endian u64 layout -- confirm for other arches */
+	spin_lock(&dlm_cookie_lock);
+	tmplock->cookie = dlm_next_cookie;
+	dlm_next_cookie++;
+	if (dlm_next_cookie & 0xff00000000000000ull) {
+		printk("eek! this node's cookie will now wrap!\n");
+		dlm_next_cookie = 1;
+	}
+	c[7] = (u8)(tmplock->node & 0x00ff);
+	spin_unlock(&dlm_cookie_lock);
+
+	/* both callees are entered holding res->spinlock per their
+	 * contract; NOTE(review): that lock is not visibly taken here --
+	 * verify the locking against dlm_get_lock_resource */
+	if (res->owner == dlm->group_index)
+		status = dlmlock_local(dlm, res, tmplock, flags);
+	else 
+		status = dlmlock_remote(dlm, res, tmplock, flags);
+error:
+	if (status != DLM_NORMAL) {
+		if (tmplock)
+			kfree(tmplock);
+		lksb->lockid = NULL;
+	}
+	return status;
+}
+
+
+
+
+/* must be already holding lockres->spinlock; drops it before return.
+ * Grants the new lock immediately if it is compatible with everything
+ * on the granted and converting queues, otherwise parks it on the
+ * blocked queue (or rejects it outright under LKM_NOQUEUE). */
+dlm_status dlmlock_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags)
+{
+	struct list_head *iter;
+	dlm_lock *tmplock;
+	int got_it = 0;
+
+	BUG_ON(!lock);
+	BUG_ON(!res);
+	BUG_ON(!dlm);
+
+	/* a locally held lock must carry a valid lksb */
+	if (lock->node == dlm->group_index) {
+		BUG_ON(!lock->lksb);
+	}
+
+	dlmprintk("type=%d\n", lock->type);
+
+	/* any incompatible granted or converting lock blocks us */
+	list_for_each(iter, &res->granted) {
+		tmplock = list_entry(iter, dlm_lock, list);
+		if (!dlm_lock_compatible(tmplock->type, lock->type)) {
+			list_add_tail(&lock->list, &res->blocked);
+			goto done;
+		}
+	}
+
+	list_for_each(iter, &res->converting) {
+		tmplock = list_entry(iter, dlm_lock, list);
+		if (!dlm_lock_compatible(tmplock->type, lock->type)) {
+			list_add_tail(&lock->list, &res->blocked);
+			goto done;
+		}
+	}
+
+	/* got it right away */
+
+	/* if it is a remote request, proxy 
+	 * handler will set the lksb status */
+	if (lock->node == dlm->group_index)
+		lock->lksb->status = DLM_NORMAL;
+
+	list_add_tail(&lock->list, &res->granted);
+
+	if (dlm_do_ast(dlm, res, lock) < 0)
+		printk("eek\n");
+	got_it = 1;
+
+done:
+	/* FIX: under LKM_NOQUEUE a blocked request is rejected, and the
+	 * caller (do_dlmlock) then kfree()s the lock -- so it must not
+	 * remain on the blocked queue or that free is a use-after-free.
+	 * Unlink it while we still hold res->spinlock.  The original
+	 * left it queued. */
+	if (!got_it && (flags & LKM_NOQUEUE)) {
+		list_del(&lock->list);
+		spin_unlock(&res->spinlock);
+		dlm_kick_thread(dlm, res);
+		return DLM_NOTQUEUED;
+	}
+	spin_unlock(&res->spinlock);
+	dlm_kick_thread(dlm, res);
+	return DLM_NORMAL;
+}
+
+/* must be already holding lockres->spinlock; drops and re-takes it
+ * around the network call, and drops it before return.
+ *
+ * Queue a new lock request with the remote lockres owner.  The lock
+ * is parked on the local blocked queue first so recovery can see it;
+ * if the remote request fails it is unlinked again. */
+dlm_status dlmlock_remote(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags)
+{
+	dlm_status status = DLM_DENIED;
+	
+	dlmprintk("type=%d\n", lock->type);
+
+	/* don't talk to a dead/recovering owner */
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		status = DLM_RECOVERING;
+		goto bail;
+	}
+
+	/* will exit this call with spinlock held */
+	__dlm_wait_on_lockres(res);
+	/* mark the lockres busy so others wait on us */
+	res->state |= DLM_LOCK_RES_IN_PROGRESS;
+	/* add lock to local (secondary) queue */
+	list_add_tail(&lock->list, &res->blocked);
+	spin_unlock(&res->spinlock);
+
+	/* spec seems to say that you will get DLM_NORMAL when the lock 
+	 * has been queued, meaning we need to wait for a reply here. */
+	status = dlm_send_remote_lock_request(dlm, res, lock, flags);
+	
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	if (status != DLM_NORMAL) {
+		/* remove from local queue if it failed */
+		list_del(&lock->list);
+	}
+bail:
+	spin_unlock(&res->spinlock);
+	return status;
+}
+
+
+/* must be already holding lockres->spinlock (released by the callee).
+ * Dispatch a convert request to the local or remote path based on
+ * lockres ownership, timing the call with the TSC (debug only). */
+dlm_status do_dlmconvert(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type)
+{
+	dlm_status status;
+	union {
+		u64 q;
+		u32 hilo[2];
+	} t_begin, t_finish;
+
+	rdtsc(t_begin.hilo[0], t_begin.hilo[1]);
+
+	status = (res->owner == dlm->group_index) ?
+		dlmconvert_local(dlm, res, lock, flags, type) :
+		dlmconvert_remote(dlm, res, lock, flags, type);
+
+	rdtsc(t_finish.hilo[0], t_finish.hilo[1]);
+	printk("dlmconvert took %llu cycles\n", t_finish.q-t_begin.q);
+
+	return status;
+}
+
+/* must be already holding lockres->spinlock; always drops it (and the
+ * lock's own spinlock) before returning.
+ *
+ * Convert a locally mastered lock.  Downconverts are granted in
+ * place; upconverts are granted in place only if compatible with
+ * every other granted/converting lock, otherwise the lock moves to
+ * the converting queue (unless LKM_NOQUEUE).  Locks held by remote
+ * nodes are never converted in place -- they go to the head of the
+ * converting queue instead. */
+dlm_status dlmconvert_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type)
+{
+	dlm_status status = DLM_NORMAL;
+	struct list_head *iter;
+	dlm_lock *tmplock=NULL;
+	int remote_in_place = 0;
+
+	dlmprintk("type=%d, convert_type=%d, new convert_type=%d\n", lock->type, lock->convert_type, type);
+
+	spin_lock(&lock->spinlock);
+
+	/* already converting? */
+	if (lock->convert_type != LKM_IVMODE) {
+		printk("attempted to convert a lock with a lock conversion pending\n");
+		spin_unlock(&lock->spinlock);
+		spin_unlock(&res->spinlock);
+		return DLM_DENIED;
+	}
+
+	/* must be on grant queue to convert */
+	if (!dlm_lock_on_list(&res->granted, lock)) {
+		printk("attempted to convert a lock not on grant queue\n");
+		spin_unlock(&lock->spinlock);
+		spin_unlock(&res->spinlock);
+		return DLM_DENIED;
+	}
+	
+	
+	/* in-place downconvert? */
+	if (type <= lock->type)
+		goto grant;
+
+	/* upconvert from here on */
+	status = DLM_NORMAL;
+	/* the new mode must be compatible with every other holder */
+	list_for_each(iter, &res->granted) {
+		tmplock = list_entry(iter, dlm_lock, list);
+		if (tmplock == lock)
+			continue;
+		if (!dlm_lock_compatible(tmplock->type, type))
+			goto switch_queues;
+	}
+
+	list_for_each(iter, &res->converting) {
+		tmplock = list_entry(iter, dlm_lock, list);
+		if (!dlm_lock_compatible(tmplock->type, type))
+			goto switch_queues;
+		/* existing conversion requests take precedence */
+		if (!dlm_lock_compatible(tmplock->convert_type, type))
+			goto switch_queues;
+	}
+
+	/* fall thru to grant */
+
+grant:
+	/* only locally held locks can be granted in place */
+	if (lock->node != dlm->group_index) {
+		dlmprintk0("no in-place convert for nonlocal locks :(  see if this helps...\n");
+		remote_in_place = 1;
+		goto switch_queues;
+	}
+
+	/* immediately grant the new lock type */
+	//printk("doing in-place %sconvert from %d to %d\n", 
+	//       type > lock->type ? "up" : "down", lock->type, type);
+	lock->type = type;
+	status = DLM_NORMAL;
+
+	/* if it is a remote request, proxy 
+	 * handler will set the lksb status */
+	if (lock->node == dlm->group_index)
+		lock->lksb->status = DLM_NORMAL;
+
+	if (dlm_do_ast(dlm, res, lock) < 0)
+		printk("eek\n");
+
+	spin_unlock(&lock->spinlock);
+	spin_unlock(&res->spinlock);
+
+	/* if successful, kick the queue runner */
+	if (status == DLM_NORMAL) {
+		dlm_kick_thread(dlm, res);
+	}
+
+	return status;
+
+switch_queues:
+	if (flags & LKM_NOQUEUE) {
+		spin_unlock(&lock->spinlock);
+		spin_unlock(&res->spinlock);
+		return DLM_NOTQUEUED;
+	}
+
+	/* park on the converting queue; the dlm thread grants later */
+	lock->convert_type = type;
+	list_del(&lock->list);
+	/* make sure the remote in-place convert gets handled right away */
+	if (remote_in_place)
+		list_add(&lock->list, &res->converting);
+	else
+		list_add_tail(&lock->list, &res->converting);
+	
+	spin_unlock(&lock->spinlock);
+	spin_unlock(&res->spinlock);
+	
+	dlm_kick_thread(dlm, res);
+	return status;
+}
+
+/* must be already holding lockres->spinlock; drops and re-takes it
+ * around the network call, and drops it before return.
+ *
+ * Convert a lock mastered by a remote node: move it to the local
+ * converting queue, send the request, and roll the move back if the
+ * remote node refuses. */
+dlm_status dlmconvert_remote(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type)
+{
+	dlm_status status = DLM_DENIED;
+	
+	dlmprintk("type=%d, convert_type=%d\n", lock->type, lock->convert_type);
+	
+	/* don't talk to a dead/recovering owner */
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		status = DLM_RECOVERING;
+		goto bail;
+	}
+	/* will exit this call with spinlock held */
+	__dlm_wait_on_lockres(res);
+
+	/* mark the lockres busy so others wait on us */
+	res->state |= DLM_LOCK_RES_IN_PROGRESS;
+
+	/* move lock to local convert queue */
+	list_del(&lock->list);
+	list_add_tail(&lock->list, &res->converting);
+	if (lock->convert_type != LKM_IVMODE) {
+		printk("error! converting a remote lock that is already converting!\n");
+		/* TODO: return correct error */
+		BUG();
+	}
+	lock->convert_type = type;
+	spin_unlock(&res->spinlock);
+
+	/* spec seems to say that you will get DLM_NORMAL when the lock 
+	 * has been queued, meaning we need to wait for a reply here. */
+	status = dlm_send_remote_convert_request(dlm, res, lock, flags, type);
+	
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+
+	/* if it failed, move it back to granted queue */
+	if (status != DLM_NORMAL) {
+		list_del(&lock->list);
+		list_add_tail(&lock->list, &res->granted);
+		lock->convert_type = LKM_IVMODE;
+	}
+bail:
+	spin_unlock(&res->spinlock);
+	return status;
+}
+
+
+
+/* there is no async path here: both the local work and (for remote
+ * owners) the network round trip happen synchronously inside
+ * dlmunlock_local, so the unlockast is simply invoked by hand once
+ * it returns. */
+dlm_status dlmunlock(dlm_ctxt *dlm, dlm_lockstatus *lksb, int flags, dlm_astunlockfunc_t *unlockast, void *data)
+{
+	dlm_lock *lock;
+	dlm_lock_resource *res;
+	dlm_status status;
+	int call_ast = 0;
+
+	/* argument sanity: need an lksb, only unlock flags allowed,
+	 * and the lksb must reference a live lock */
+	if (!lksb)
+		return DLM_BADARGS;
+
+	if (flags & ~(LKM_CANCEL | LKM_VALBLK | LKM_INVVALBLK))
+		return DLM_BADPARAM;
+
+	if ((flags & (LKM_VALBLK | LKM_CANCEL)) == (LKM_VALBLK | LKM_CANCEL)) {
+		printk("VALBLK given with CANCEL: ignoring VALBLK\n");
+		flags &= ~LKM_VALBLK;
+	}
+
+	if (!lksb->lockid || !lksb->lockid->lockres)
+		return DLM_BADPARAM;
+
+	lock = lksb->lockid;
+	res = lock->lockres;
+
+	status = dlmunlock_local(dlm, res, lock, lksb, flags, &call_ast);
+	if (call_ast)
+		(*unlockast)(data, lksb->status);
+	return status;
+}
+
+
+/*
+ * dlmunlock_local: core unlock/cancel state machine.  Classifies the
+ * request by which queue the lock sits on, performs the remote round
+ * trip when this node does not own the lockres, then applies the
+ * queue changes and (optionally) frees the lock.
+ *
+ * Acquires res->spinlock and lock->spinlock itself -- callers must
+ * not hold either.  *call_ast tells the caller whether to fire the
+ * unlockast.  lksb->status is always set.
+ */
+dlm_status dlmunlock_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, dlm_lockstatus *lksb, int flags, int *call_ast)
+{
+	dlm_status status;
+	int free_lock = 0, remote_ready = 0;
+	int local = 0, remove = 0, regrant = 0;
+
+	/* according to spec and opendlm code
+	 *  flags & LKM_CANCEL != 0: must be converting or blocked
+	 *  flags & LKM_CANCEL == 0: must be granted
+	 * iow, to unlock a converting lock, you must first LKM_CANCEL
+	 * the convert, then call the unlock again with no LKM_CANCEL
+	 */
+	*call_ast = 0;
+
+recheck:
+	spin_lock(&res->spinlock);
+	spin_lock(&lock->spinlock);
+
+	local = (res->owner == dlm->group_index);
+
+	/* decide status / free / remove / regrant from which queue the
+	 * lock currently sits on */
+	if (flags & LKM_CANCEL) {
+		/* cancel request */
+		if (dlm_lock_on_list(&res->blocked, lock)) {
+			/* cancel this outright */
+			lksb->status = DLM_NORMAL;
+			status = DLM_NORMAL;
+			free_lock = 1;
+			*call_ast = 1;
+			remove = 1;
+			regrant = 0;
+		} else if (dlm_lock_on_list(&res->converting, lock)) {
+			/* cancel the request, put back on granted */
+			lksb->status = DLM_NORMAL;
+			status = DLM_NORMAL;
+			free_lock = 0;
+			*call_ast = 1;
+			remove = 1;
+			regrant = 1;
+		} else if (dlm_lock_on_list(&res->granted, lock)) {
+			/* too late, already granted.  DLM_CANCELGRANT */
+			lksb->status = DLM_CANCELGRANT;
+			status = DLM_NORMAL;
+			free_lock = 0;
+			*call_ast = 1;
+			remove = 0;
+			regrant = 0;
+		} else {
+			/* err. um. eek! */
+			printk("lock to cancel is not on any list!  bug!\n");
+			lksb->status = DLM_IVLOCKID;
+			status = DLM_IVLOCKID;
+			free_lock = 0;
+			*call_ast = 0;
+			remove = 0;
+			regrant = 0;
+		}
+	} else {
+		/* unlock request */
+		if (!dlm_lock_on_list(&res->granted, lock)) {
+			lksb->status = DLM_DENIED;
+			status = DLM_DENIED;
+			free_lock = 0;
+			*call_ast = 0;
+			remove = 0;
+			regrant = 0;
+		} else {
+			/* unlock granted lock */
+			lksb->status = DLM_NORMAL;
+			status = DLM_NORMAL;
+			free_lock = 1;
+			*call_ast = 1;
+			remove = 1;
+			regrant = 0;
+		}
+	}
+
+	/* remote owner: tell the master before touching the queues */
+	if (!local) {
+		/* safe since nothing can change on this 
+		 * seconndary queue without lockres lock */
+		spin_unlock(&lock->spinlock);
+
+		/* if there was an outstanding change on the
+		 * lockres, conditions could have changed */
+		if (!remote_ready &&
+		    res->state & DLM_LOCK_RES_IN_PROGRESS) {
+			/* wait for the in-progress op, claim the lockres
+			 * ourselves, then redo the classification above */
+			__dlm_wait_on_lockres(res);
+			res->state |= DLM_LOCK_RES_IN_PROGRESS;
+			remote_ready = 1;
+			spin_unlock(&res->spinlock);
+			goto recheck;
+		}
+
+		if (res->state & DLM_LOCK_RES_RECOVERING) {
+			/* !!!!! */
+			spin_unlock(&res->spinlock);
+			return DLM_RECOVERING;
+		} else {
+			/* dropping the spinlock across the network call;
+			 * the send updates lksb->status with the master's
+			 * answer */
+			spin_unlock(&res->spinlock);
+			status = dlm_send_remote_unlock_request(dlm, res, lock, lksb, flags);
+			spin_lock(&res->spinlock);
+			res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+		}
+		spin_lock(&lock->spinlock);
+	}
+
+	/* apply the queue changes decided above */
+	if (remove)
+		list_del(&lock->list);
+	if (regrant)
+		list_add_tail(&lock->list, &res->granted);
+
+	spin_unlock(&lock->spinlock);
+	spin_unlock(&res->spinlock);
+
+	if (free_lock) {
+#warning this must change to proper refcounting
+		/* TODO: refcounting... tho for now this will work because 
+		 * the middle layer is keeping track of everything */
+		kfree(lock);
+		lksb->lockid = NULL;
+	}
+	return status;
+}
+	
+
+/*
+ * dlm_send_remote_unlock_request: ship an unlock/cancel message for
+ * this lock to the lockres owner and translate the reply into a
+ * dlm_status.  lksb->status is set to match the outcome; if the
+ * owner cannot be resolved, DLM_NOLOCKMGR is returned.
+ */
+dlm_status dlm_send_remote_unlock_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, dlm_lockstatus *lksb, int flags)
+{
+	struct inode *inode = NULL;
+	dlm_unlock_lock unlock;
+	int tmpret;
+	dlm_status ret;
+	int status = 0;
+
+	dlmprintk0("\n");
+
+	memset(&unlock, 0, sizeof(unlock));
+	unlock.node_idx = dlm->group_index;
+	unlock.flags = flags;
+	unlock.cookie = lock->cookie;
+	unlock.namelen = res->lockname.len;
+	/* FIX: the wire name is a fixed-length byte string of namelen
+	 * bytes with no NUL terminator guarantee; memcpy copies exactly
+	 * that many bytes, whereas the original strncpy would stop
+	 * short at an embedded NUL byte */
+	memcpy(unlock.name, res->lockname.name, unlock.namelen);
+
+	/* assume no lock manager until the owner actually answers */
+	ret = DLM_NOLOCKMGR;
+	lksb->status = DLM_NOLOCKMGR;
+	inode = nm_get_group_node_by_index(dlm->group, res->owner);
+	if (inode) {
+		tmpret = net_send_message(DLM_UNLOCK_LOCK_MSG, dlm->key, &unlock, sizeof(unlock), inode, &status);
+		if (tmpret >= 0) {
+			// successfully sent and received
+			if (status == DLM_CANCELGRANT)
+				ret = DLM_NORMAL;
+			else
+				ret = status;
+			lksb->status = status;
+		} else {
+			printk("error occurred in net_send_message: %d\n", tmpret);
+			ret = dlm_err_to_dlm_status(tmpret);
+			lksb->status = ret;
+		}
+		iput(inode);
+	}
+
+	return ret;
+}
+
+/*
+ * dlm_unlock_lock_handler: net handler for DLM_UNLOCK_LOCK_MSG.
+ * Finds the (cookie, node) lock on the named lockres and runs the
+ * local unlock path on the master's behalf.  Returns the resulting
+ * dlm_status (DLM_IVLOCKID if lockres or lock is unknown).
+ */
+int dlm_unlock_lock_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	dlm_unlock_lock *unlock = (dlm_unlock_lock *)msg->buf;
+	dlm_lock_resource *res;
+	struct list_head *iter, *queue;
+	dlm_lock *lock = NULL;
+	dlm_status status = DLM_NORMAL;
+	int found = 0;
+	dlm_lockstatus lksb;
+	int ignore;
+	struct qstr lockname = { .name=unlock->name, .len=unlock->namelen };
+
+	dlmprintk0("\n");
+
+	lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+	status = DLM_IVLOCKID;
+	res = dlm_lookup_lock(dlm, &lockname);
+	if (res) {
+		/* scan granted, then converting, then blocked, stopping
+		 * at the first (cookie, node) match */
+		spin_lock(&res->spinlock);
+		queue = &res->granted;
+again:
+		list_for_each(iter, queue) {
+			lock = list_entry(iter, dlm_lock, list);
+			if (lock->cookie == unlock->cookie &&
+			    lock->node == unlock->node_idx) {
+				found = 1;
+				break;
+			}
+		}
+		/* FIX: only advance to the next queue while NOT found --
+		 * the original advanced unconditionally and kept scanning
+		 * after the lock had already been handed to (and possibly
+		 * freed by) dlmunlock_local */
+		if (!found) {
+			if (queue == &res->granted) {
+				queue = &res->converting;
+				goto again;
+			} else if (queue == &res->converting) {
+				queue = &res->blocked;
+				goto again;
+			}
+		}
+		/* FIX: drop res->spinlock BEFORE calling dlmunlock_local,
+		 * which takes res->spinlock itself -- the original called
+		 * it with the spinlock held and would self-deadlock */
+		spin_unlock(&res->spinlock);
+		if (found) {
+			/* unlockast only called on originating node */
+			status = dlmunlock_local(dlm, res, lock, &lksb, unlock->flags, &ignore);
+		}
+	}
+	if (!found)
+		printk("failed to find lock to unlock!  cookie=%llu\n", unlock->cookie);
+	else
+		status = lksb.status;
+
+	return status;
+}
+
+
+
+
+
+/* caller must hold dlm_domain_lock.  Returns the dlm ctxt whose name
+ * matches "domain" (first NM_MAX_NAME_LEN bytes), or NULL. */
+static dlm_ctxt * __dlm_lookup_domain(char *domain)
+{
+	struct list_head *pos;
+	dlm_ctxt *ctxt;
+
+	list_for_each(pos, &dlm_domains) {
+		ctxt = list_entry(pos, dlm_ctxt, list);
+		if (strncmp(ctxt->name, domain, NM_MAX_NAME_LEN) == 0)
+			return ctxt;
+	}
+	return NULL;
+}
+
+/* locked wrapper around __dlm_lookup_domain */
+dlm_ctxt * dlm_lookup_domain(char *domain)
+{
+	dlm_ctxt *found;
+
+	spin_lock(&dlm_domain_lock);
+	found = __dlm_lookup_domain(domain);
+	spin_unlock(&dlm_domain_lock);
+
+	return found;
+}
+
+/* caller must hold dlm->spinlock.  Hash into the resource table and
+ * scan that bucket for a lockres with an exactly matching name;
+ * returns NULL when none exists. */
+dlm_lock_resource * __dlm_lookup_lock(dlm_ctxt *dlm, struct qstr *lockname)
+{
+	struct list_head *bucket, *pos;
+	dlm_lock_resource *res;
+
+	bucket = &(dlm->resources[lockname->hash & DLM_HASH_MASK]);
+
+	/* check for pre-existing lock */
+	list_for_each(pos, bucket) {
+		res = list_entry(pos, dlm_lock_resource, list);
+		if (res->lockname.len == lockname->len &&
+		    strncmp(res->lockname.name, lockname->name, lockname->len) == 0)
+			return res;
+	}
+	return NULL;
+}
+
+/* locked wrapper around __dlm_lookup_lock */
+dlm_lock_resource * dlm_lookup_lock(dlm_ctxt *dlm, struct qstr *lockname)
+{
+	dlm_lock_resource *found;
+
+	spin_lock(&dlm->spinlock);
+	found = __dlm_lookup_lock(dlm, lockname);
+	spin_unlock(&dlm->spinlock);
+
+	return found;
+}
+
+
+
+/*
+ * dlm_register_domain: one-time setup per "domain".
+ *
+ * Looks up (or creates) the dlm ctxt for "domain", wires up heartbeat
+ * callbacks and all net message handlers, and launches the dlm
+ * thread.  Returns the ctxt on success, NULL on failure.  The nm
+ * group reference is kept by the ctxt; it is only dropped when
+ * registration fails.
+ */
+dlm_ctxt * dlm_register_domain(char *domain, char *group_name, u32 key)
+{
+	dlm_ctxt *tmp = NULL, *dlm = NULL;
+	struct inode *group = NULL;
+	int tmpret, i;
+	char *netbuf;
+
+	if (strlen(domain) > NM_MAX_NAME_LEN) {
+		printk("domain name length too long\n");
+		goto leave;
+	}
+
+	group = nm_get_group_by_name(group_name);
+	if (!group) {
+		printk("no nm group %s for domain %s!\n", group_name, domain);
+		goto leave;
+	}
+
+	/* 
+	 * TODO: should i do some type of dlm-group-join business here?
+	 * I need to have new nodes communicate with other dlm nodes to 
+	 * wait until their master lists are empty before allowing me to
+	 * join.  does this belong here?  or in hb?
+	 * seems like stuff that heartbeat shouldn't care about, cuz we
+	 * would actually be preventing a node that is "UP" from being 
+	 * part of the dlm group.
+	 */ 
+	dlm = dlm_lookup_domain(domain);
+	if (dlm) {
+		/* found a pre-existing domain.
+		 * NOTE(review): this path returns with the extra group
+		 * reference from nm_get_group_by_name still held (leave:
+		 * only iputs when dlm is NULL) -- confirm the intended
+		 * reference semantics */
+		goto leave;
+	}
+
+	dlm = kmalloc(sizeof(dlm_ctxt), GFP_KERNEL);
+	if (dlm == NULL) {
+		printk("could not allocate dlm_ctxt\n");
+		goto leave;
+	}
+	memset(dlm, 0, sizeof(dlm_ctxt));
+	dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL);
+	if (dlm->name == NULL) {
+		kfree(dlm);
+		dlm = NULL;
+		printk("could not allocate dlm domain name\n");
+		goto leave;
+	}
+	dlm->net_buf = (char *) __get_free_page(GFP_KERNEL);
+	if (!dlm->net_buf) {
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+		printk("could not allocate dlm network temporary buffer\n");
+		goto leave;
+	}
+	dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL);
+	if (!dlm->resources) {
+		/* FIX: release the net_buf page BEFORE kfree(dlm) -- the
+		 * original read dlm->net_buf after freeing dlm, a
+		 * use-after-free */
+		free_page((unsigned long)dlm->net_buf);
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+		printk("could not allocate dlm hash\n");
+		goto leave;
+	}
+	memset(dlm->resources, 0, PAGE_SIZE);
+	
+	for (i=0; i<DLM_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&dlm->resources[i]);
+
+	strcpy(dlm->name, domain);
+	spin_lock_init(&dlm->spinlock);
+	INIT_LIST_HEAD(&dlm->list);
+	INIT_LIST_HEAD(&dlm->dirty_list);
+	INIT_LIST_HEAD(&dlm->reco.resources);
+	INIT_LIST_HEAD(&dlm->reco.received);
+	util_thread_info_init(&dlm->thread);
+	util_thread_info_init(&dlm->reco.thread);
+	init_rwsem(&dlm->recovery_sem);
+	dlm->group = group;
+	dlm->group_index = nm_this_node(group);
+	dlm->key = key;
+	dlm->reco.new_master = NM_INVALID_SLOT_NUM;
+	dlm->reco.dead_node = NM_INVALID_SLOT_NUM;
+	dlm->reco.sending_node = NM_INVALID_SLOT_NUM;
+	dlm->reco.next_seq = 0;
+
+	spin_lock(&dlm_domain_lock);
+	tmp = __dlm_lookup_domain(domain);
+	if (tmp) {
+		spin_unlock(&dlm_domain_lock);
+		/* lost a registration race: found a pre-existing domain.
+		 * FIX: the original leaked the net_buf and resources
+		 * pages on this path; free everything we allocated. */
+		free_page((unsigned long)dlm->net_buf);
+		free_page((unsigned long)dlm->resources);
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+		goto leave;
+	}
+
+	/* add the new domain */
+	list_add_tail(&dlm->list, &dlm_domains);
+	spin_unlock(&dlm_domain_lock);
+
+	tmpret = hb_register_callback(HB_NODE_DOWN_CB, dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
+	if (tmpret)
+		goto error;
+	tmpret = hb_register_callback(HB_NODE_UP_CB, dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
+	if (tmpret)
+		goto error;
+
+	/* TODO: need to use hb_fill_node_map to fill a temporary votemap
+	 * then communicate with each of these nodes that I want to come up
+	 * FOR THIS DLM.  there may be many nodes in this group heartbeating
+	 * but they may not care about this particular dlm instance.  once
+	 * everyone has come back with a response that i have been added or 
+	 * that they are not a member I can put together the REAL node map
+	 * for this dlm in dlm->node_map */
+	/* TODO: I guess we can fill this here as a superset of possible nodes
+	 * so that the hb_callbacks above have something to work on in the meantime
+	 * then trim out the nodes that are not part of this dlm once we know */
+	/* TODO: I may need to register a special net handler on insmod of dlm.o
+	 * with a key of 0 so that I can respond to requests even if I am not
+	 * part of a dlm group.  this would still leave a gap in time between the
+	 * start of heartbeating and the insmod dlm.o, unless I change the module 
+	 * loading stuff in clusterbo to include dlm.o (which would work fine) */
+#warning WRONG WRONG WRONG
+	tmpret = hb_fill_node_map(group, dlm->node_map, NM_MAX_NODES);
+	if (tmpret)
+		goto error;
+
+	/* register each message handler, carving its receive buffer out
+	 * of the single net_buf page (cache-line aligned slots) */
+	netbuf = dlm->net_buf;
+	tmpret = net_register_handler(DLM_MASTER_REQUEST_RESP_MSG, key, 0, 
+				      sizeof(dlm_master_request_resp), 
+				      dlm_master_request_resp_handler,
+				      dlm, netbuf);
+	if (tmpret)
+		goto error;
+	netbuf += L1_CACHE_ALIGN(sizeof(dlm_master_request_resp));
+
+	tmpret = net_register_handler(DLM_MASTER_REQUEST_MSG, key, 0, 
+				      sizeof(dlm_master_request), 
+				      dlm_master_request_handler,
+				      dlm, netbuf);
+	if (tmpret)
+		goto error;
+	netbuf += L1_CACHE_ALIGN(sizeof(dlm_master_request));
+
+	tmpret = net_register_handler(DLM_ASSERT_MASTER_MSG, key, 0, 
+				      sizeof(dlm_assert_master), 
+				      dlm_assert_master_handler,
+				      dlm, netbuf);
+	if (tmpret)
+		goto error;
+	netbuf += L1_CACHE_ALIGN(sizeof(dlm_assert_master));
+
+	tmpret = net_register_handler(DLM_CREATE_LOCK_MSG, key, 0, 
+				      sizeof(dlm_create_lock), 
+				      dlm_create_lock_handler,
+				      dlm, netbuf);
+	if (tmpret)
+		goto error;
+	netbuf += L1_CACHE_ALIGN(sizeof(dlm_create_lock));
+
+	tmpret = net_register_handler(DLM_CONVERT_LOCK_MSG, key, 0, 
+				      sizeof(dlm_convert_lock), 
+				      dlm_convert_lock_handler,
+				      dlm, netbuf);
+	if (tmpret)
+		goto error;
+	netbuf += L1_CACHE_ALIGN(sizeof(dlm_convert_lock));
+
+	tmpret = net_register_handler(DLM_UNLOCK_LOCK_MSG, key, 0,
+				      sizeof(dlm_unlock_lock),
+				      dlm_unlock_lock_handler,
+				      dlm, netbuf);
+	if (tmpret)
+		goto error;
+	netbuf += L1_CACHE_ALIGN(sizeof(dlm_unlock_lock));
+				
+	tmpret = net_register_handler(DLM_PROXY_AST_MSG, key, 0, 
+				      sizeof(dlm_proxy_ast), 
+				      dlm_proxy_ast_handler,
+				      dlm, netbuf);
+	if (tmpret)
+		goto error;
+	netbuf += L1_CACHE_ALIGN(sizeof(dlm_proxy_ast));
+	
+	tmpret = dlm_launch_thread(dlm);
+	if (tmpret == 0)
+		goto leave;
+
+error:	
+	/* NOTE(review): callbacks may not all have been registered when
+	 * we get here; assumes hb_unregister_callback tolerates that */
+	hb_unregister_callback(HB_NODE_UP_CB, dlm_hb_node_up_cb, dlm);
+	hb_unregister_callback(HB_NODE_DOWN_CB, dlm_hb_node_down_cb, dlm);
+	spin_lock(&dlm_domain_lock);
+	list_del(&dlm->list);
+	spin_unlock(&dlm_domain_lock);
+	free_page((unsigned long)dlm->net_buf);
+	free_page((unsigned long)dlm->resources);
+	kfree(dlm->name);
+	kfree(dlm);
+	dlm = NULL;
+
+leave:
+	if (!dlm && group)
+	       	iput(group);
+	return dlm;
+}
+
+/* TODO: unimplemented stub.  Should undo everything done by
+ * dlm_register_domain: unregister the net handlers and heartbeat
+ * callbacks, stop the dlm thread, free the name/pages/ctxt, and drop
+ * the nm group reference. */
+void dlm_unregister_domain(dlm_ctxt *dlm)
+{
+	// fill me in please
+}
+
+/* zero a lockres and set up its name, waitqueue, spinlock and the
+ * lock-state list heads.  A fresh lockres starts with no known owner
+ * and is marked IN_PROGRESS until the first master lookup finishes. */
+void dlm_init_lockres(dlm_lock_resource *res, struct qstr *lockname)
+{
+	memset(res, 0, sizeof(dlm_lock_resource));
+
+	res->lockname.name = lockname->name;
+	res->lockname.len = lockname->len;
+	res->lockname.hash = lockname->hash;
+
+	spin_lock_init(&res->spinlock);
+	init_waitqueue_head(&res->wq);
+
+	INIT_LIST_HEAD(&res->list);
+	INIT_LIST_HEAD(&res->granted);
+	INIT_LIST_HEAD(&res->converting);
+	INIT_LIST_HEAD(&res->blocked);
+	INIT_LIST_HEAD(&res->dirty);
+	INIT_LIST_HEAD(&res->recovering);
+
+	res->owner = DLM_LOCK_RES_OWNER_UNKNOWN;
+	res->state |= DLM_LOCK_RES_IN_PROGRESS;
+}
+
+
+
+	
+/* will exit holding res->spinlock, but may drop in function */
+/* Sleep until DLM_LOCK_RES_IN_PROGRESS clears on this lockres.
+ * Called WITHOUT res->spinlock held; returns WITH it held.
+ * Uninterruptible: signals will not break the wait. */
+void dlm_wait_on_lockres(dlm_lock_resource *res)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue(&res->wq, &wait);
+repeat:
+	/* set the task state before testing the flag so a wakeup that
+	 * races between the test and schedule() is not lost */
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
+		spin_unlock(&res->spinlock);
+		schedule();
+		goto repeat;
+	}
+	remove_wait_queue(&res->wq, &wait);
+	current->state = TASK_RUNNING;
+}
+
+/* will exit holding res->spinlock, but may drop in function */
+/* Same wait as dlm_wait_on_lockres(), but the caller must already
+ * hold res->spinlock on entry (note the unlock before schedule() and
+ * relock after); exits with the spinlock held. */
+void __dlm_wait_on_lockres(dlm_lock_resource *res)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue(&res->wq, &wait);
+repeat:
+	/* state change precedes the flag test to avoid a lost wakeup */
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
+		spin_unlock(&res->spinlock);
+		schedule();
+		spin_lock(&res->spinlock);
+		goto repeat;
+	}
+	remove_wait_queue(&res->wq, &wait);
+	current->state = TASK_RUNNING;
+}
+
+  
+
+/* Fire the AST for a lock: run the local callback if the lock belongs
+ * to this node, otherwise proxy the AST over the network to the node
+ * that holds it.  Returns 0 on success or a negative error. */
+int dlm_do_ast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock)
+{
+	dlm_astlockfunc_t *fn = lock->ast;
+
+	dlmprintk0("\n");
+
+	if (lock->node != dlm->group_index) {
+		/* lock is held by a remote node: deliver by proxy message */
+		return dlm_send_proxy_ast(dlm, res, lock, DLM_AST, 0);
+	}
+	if (!fn) {
+		/* fix: use "%.*s" -- lockname is length-counted, not
+		 * NUL-terminated, so "%*s" over-read past the name */
+		printk("eek! lock has no ast %.*s!  cookie=%llu\n", 
+		       res->lockname.len, res->lockname.name, lock->cookie);
+		return -EINVAL;
+	}
+	(*fn)(lock->astdata);
+	return 0;
+}
+
+
+/* Fire the BAST (blocking AST) for a lock: run the local callback if
+ * the lock belongs to this node, otherwise proxy it to the holder.
+ * blocked_type is the mode of the request this lock is blocking.
+ * Returns 0 on success or a negative error. */
+int dlm_do_bast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int blocked_type)
+{
+	dlm_bastlockfunc_t *fn = lock->bast;
+	
+	dlmprintk0("\n");
+
+	if (lock->node != dlm->group_index) {
+		/* remote holder: deliver by proxy message */
+		return dlm_send_proxy_ast(dlm, res, lock, DLM_BAST, blocked_type);
+	}
+
+	if (!fn) {
+		/* fix: "%.*s" -- lockname is length-counted, not
+		 * NUL-terminated, so "%*s" over-read past the name */
+		printk("eek! lock has no bast %.*s!  cookie=%llu\n", 
+		       res->lockname.len, res->lockname.name, lock->cookie);
+		return -EINVAL;
+	}
+	(*fn)(lock->astdata, blocked_type);
+	return 0;
+}
+
+/* Deliver an AST or BAST for a lock held by a remote node: package the
+ * type, blocked_type, cookie and lock name into a dlm_proxy_ast and
+ * ship it via DLM_PROXY_AST_MSG.  Returns 0 or a negative error. */
+int dlm_send_proxy_ast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int type, int blocked_type)
+{
+	dlm_proxy_ast ast_msg;
+	struct inode *target = NULL;
+	int status;
+
+	dlmprintk("to=%u, type=%d, blocked_type=%d\n", lock->node, type, blocked_type);
+
+	ast_msg.node_idx = dlm->group_index;
+	ast_msg.type = type;
+	ast_msg.blocked_type = blocked_type;
+	ast_msg.cookie = lock->cookie;
+	ast_msg.namelen = res->lockname.len;
+	strncpy(ast_msg.name, res->lockname.name, ast_msg.namelen);
+
+	target = nm_get_group_node_by_index(dlm->group, lock->node);
+	if (!target) {
+		status = -EINVAL;
+	} else {
+		status = net_send_message(DLM_PROXY_AST_MSG, dlm->key, &ast_msg, sizeof(ast_msg), target, NULL);
+		iput(target);
+	}
+	if (status < 0) {
+		printk("(%d) dlm_send_proxy_ast: returning %d\n", current->pid, status);
+	}
+	return status;
+}
+
+/* Network handler for DLM_PROXY_AST_MSG: the master of a lockres is
+ * telling this node to fire an AST or BAST on one of its locks.  The
+ * lock is located by cookie on the converting queue first, then on
+ * blocked (for AST) or granted (for BAST), and the callback is run.
+ * Always returns 0: malformed or stale messages are logged, not
+ * failed, since the sender can do nothing useful with an error. */
+int dlm_proxy_ast_handler(net_msg *msg, u32 len, void *data)
+{
+	int status;
+	dlm_ctxt *dlm = data;
+	dlm_lock_resource *res;
+	dlm_lock *lock = NULL;
+	dlm_proxy_ast *past = (dlm_proxy_ast *) msg->buf;
+	struct qstr lockname = { .name=past->name, .len=past->namelen };
+	struct list_head *iter, *head=NULL;
+	u64 cookie = past->cookie;
+
+	lockname.hash = full_name_hash(lockname.name, lockname.len);
+	
+	dlmprintk("type=%d, blocked_type=%d\n", past->type, past->blocked_type);
+
+	if (past->type != DLM_AST && 
+	    past->type != DLM_BAST) {
+		/* fix: "%.*s" -- lock names are length-counted, not
+		 * NUL-terminated, so "%*s" over-read past the name */
+		printk("Eeeek unknown ast type! %d, cookie=%llu, name=%.*s\n", 
+		       past->type, cookie, lockname.len, lockname.name);
+		return 0;
+	}
+
+	res = dlm_lookup_lock(dlm, &lockname);
+	if (!res) {
+		printk("eek! got %sast for unknown lockres!  cookie=%llu, name=%.*s, namelen=%d\n", 
+		       past->type == DLM_AST ? "" : "b", cookie, lockname.len, lockname.name, lockname.len);
+		return 0;
+	}
+
+	/* the $RECOVERY lock skips the recovery rwsem: it is used while
+	 * recovery itself is in progress */
+	if (!dlm_is_recovery_lock(past->name, past->namelen))
+		down_read(&dlm->recovery_sem);
+	spin_lock(&res->spinlock);
+
+	/* try convert queue for both ast/bast */
+	head = &res->converting;
+	lock = NULL;
+	list_for_each(iter, head) {
+		lock = list_entry (iter, dlm_lock, list);
+		if (lock->cookie == cookie)
+			goto do_ast;
+	}
+
+	/* if not on convert, try blocked for ast, granted for bast */
+	if (past->type == DLM_AST)
+		head = &res->blocked;
+	else 
+		head = &res->granted;
+
+	list_for_each(iter, head) {
+		lock = list_entry (iter, dlm_lock, list);
+		if (lock->cookie == cookie)
+			goto do_ast;
+	}
+
+	printk("eek! got %sast for unknown lock!  cookie=%llu, name=%.*s, namelen=%d\n", 
+	       past->type == DLM_AST ? "" : "b", cookie, lockname.len, lockname.name, lockname.len);
+	spin_unlock(&res->spinlock);
+	if (!dlm_is_recovery_lock(past->name, past->namelen))
+		up_read(&dlm->recovery_sem);
+	return 0;
+		
+do_ast:
+	if (past->type == DLM_AST) {
+		/* an AST means the lock (or its conversion) was granted:
+		 * move it to the granted queue and commit the new mode */
+		list_del(&lock->list);
+		list_add_tail(&lock->list, &res->granted);
+		dlmprintk("ast: adding to granted list... type=%d, convert_type=%d\n",
+			  lock->type, lock->convert_type);
+		if (lock->convert_type != LKM_IVMODE) {
+			lock->type = lock->convert_type;
+			lock->convert_type = LKM_IVMODE;
+		} else {
+			// should already be there....
+		}
+		
+		lock->lksb->status = DLM_NORMAL;
+
+		status = dlm_do_ast(dlm, res, lock);
+		dlmprintk("ast done: now... type=%d, convert_type=%d\n",
+			  lock->type, lock->convert_type);
+	} else {
+		dlmprintk("bast: before... type=%d, convert_type=%d\n",
+			  lock->type, lock->convert_type);
+		status = dlm_do_bast(dlm, res, lock, past->blocked_type);
+		dlmprintk("bast: after... type=%d, convert_type=%d\n",
+			  lock->type, lock->convert_type);
+	}
+
+	if (status < 0)
+		printk("eeek: ast/bast returned %d\n", status);
+
+	spin_unlock(&res->spinlock);
+	if (!dlm_is_recovery_lock(past->name, past->namelen))
+		up_read(&dlm->recovery_sem);
+	return 0;
+}
+
+
+
+
+
+
+
+/*
+ * message handlers should just return status.
+ * this will get send back to the calling node if it
+ * requested a status return.
+ */
+
+
+/* remote lock creation */
+/* Ask the lockres master (res->owner) to create a new lock on our
+ * behalf.  Returns the master's dlm_status on delivery, or a status
+ * derived from the network error when the message could not be sent. */
+dlm_status dlm_send_remote_lock_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags)
+{
+	dlm_create_lock request;
+	struct inode *master = NULL;
+	dlm_status result = DLM_NOLOCKMGR;
+	int status = 0;
+	int err;
+
+	dlmprintk0("\n");
+
+	memset(&request, 0, sizeof(request));
+	request.node_idx = dlm->group_index;
+	request.requested_type = lock->type;
+	request.cookie = lock->cookie;
+	request.namelen = res->lockname.len;
+	strncpy(request.name, res->lockname.name, request.namelen);
+
+	master = nm_get_group_node_by_index(dlm->group, res->owner);
+	if (master) {
+		err = net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &request, sizeof(request), master, &status);
+		if (err >= 0) {
+			/* delivered: the remote handler's dlm_status came
+			 * back in status */
+			result = status;
+		} else {
+			printk("error occurred in net_send_message: %d\n", err);
+			result = dlm_err_to_dlm_status(err);
+		}
+		iput(master);
+	}
+
+	return result;
+}
+
+/* Network handler for DLM_CREATE_LOCK_MSG: a remote node asks us (the
+ * master) to create a lock on one of our lock resources.  Returns a
+ * dlm_status: DLM_SYSERR on allocation failure, DLM_IVLOCKID if the
+ * lockres is unknown, otherwise whatever dlmlock_local() reports. */
+int dlm_create_lock_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	dlm_create_lock *create = (dlm_create_lock *)msg->buf;
+	dlm_lock_resource *res;
+	dlm_lock *newlock;
+	dlm_status status = DLM_NORMAL;
+	struct qstr lockname = { .name=create->name, .len=create->namelen };
+	
+	dlmprintk0("\n");
+
+	lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+	newlock = kmalloc(sizeof(dlm_lock), GFP_KERNEL);
+	if (!newlock)
+		return DLM_SYSERR;
+	
+	memset(newlock, 0, sizeof(dlm_lock));
+	INIT_LIST_HEAD(&newlock->list);
+	INIT_LIST_HEAD(&newlock->ast_list);
+	spin_lock_init(&newlock->spinlock);
+	newlock->type = create->requested_type;
+	newlock->convert_type = LKM_IVMODE;
+	newlock->highest_blocked = LKM_IVMODE;
+	newlock->node = create->node_idx;
+	/* no local callbacks: this lock belongs to a remote node, so
+	 * asts/basts for it are delivered by proxy message instead */
+	newlock->ast = NULL;
+	newlock->bast = NULL;
+	newlock->astdata = NULL;
+	newlock->cookie = create->cookie;
+
+	status = DLM_IVLOCKID;
+	res = dlm_lookup_lock(dlm, &lockname);
+	if (res) {
+		spin_lock(&res->spinlock);
+		newlock->lockres = res;
+		status = dlmlock_local(dlm, res, newlock, 0);
+		spin_unlock(&res->spinlock);
+	} else {
+		/* fix: the old code leaked newlock whenever the lockres
+		 * lookup failed */
+		kfree(newlock);
+	}
+
+	return status;
+}
+
+/* remote lock conversion */
+/* Ask the lockres master (res->owner) to convert our existing lock to
+ * the requested mode.  Returns the master's dlm_status on delivery, or
+ * a status derived from the network error when the send failed. */
+dlm_status dlm_send_remote_convert_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type)
+{
+	dlm_convert_lock request;
+	struct inode *master = NULL;
+	dlm_status result = DLM_NOLOCKMGR;
+	int status = 0;
+	int err;
+
+	dlmprintk0("\n");
+
+	memset(&request, 0, sizeof(request));
+	request.node_idx = dlm->group_index;
+	request.requested_type = type;
+	request.cookie = lock->cookie;
+	request.namelen = res->lockname.len;
+	strncpy(request.name, res->lockname.name, request.namelen);
+
+	master = nm_get_group_node_by_index(dlm->group, res->owner);
+	if (master) {
+		err = net_send_message(DLM_CONVERT_LOCK_MSG, dlm->key, &request, sizeof(request), master, &status);
+		if (err >= 0) {
+			/* delivered: the remote handler's dlm_status came
+			 * back in status */
+			result = status;
+		} else {
+			printk("error occurred in net_send_message: %d\n", err);
+			result = dlm_err_to_dlm_status(err);
+		}
+		iput(master);
+	}
+
+	return result;
+}
+
+/* Network handler for DLM_CONVERT_LOCK_MSG: a remote node asks us (the
+ * master) to convert one of its granted locks to a new mode.  Returns
+ * DLM_IVLOCKID when the lockres or lock cannot be found, otherwise the
+ * result of dlmconvert_local().  The rdtsc/u1..u7 stamps are temporary
+ * profiling instrumentation. */
+int dlm_convert_lock_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	dlm_convert_lock *convert = (dlm_convert_lock *)msg->buf;
+	dlm_lock_resource *res;
+	struct list_head *iter;
+	dlm_lock *lock;
+	dlm_status status = DLM_NORMAL;
+	int found = 0;
+	struct qstr lockname = { .name=convert->name, .len=convert->namelen };
+	/* fix: zero-init so the timing printk at the end never reads
+	 * uninitialized stamps on the no-res / not-found paths */
+	union {
+		u64 q;
+		u32 hilo[2];
+	} u1 = {0}, u2 = {0}, u3 = {0}, u4 = {0}, u5 = {0}, u6 = {0}, u7 = {0};
+
+
+	dlmprintk0("\n");
+	rdtsc(u1.hilo[0], u1.hilo[1]);
+
+	lockname.hash = full_name_hash(lockname.name, lockname.len);
+	rdtsc(u2.hilo[0], u2.hilo[1]);
+
+	status = DLM_IVLOCKID;
+	res = dlm_lookup_lock(dlm, &lockname);
+	rdtsc(u3.hilo[0], u3.hilo[1]);
+	if (res) {
+		spin_lock(&res->spinlock);
+	rdtsc(u4.hilo[0], u4.hilo[1]);
+		/* conversions only make sense for locks already granted */
+		list_for_each(iter, &res->granted) {
+			lock = list_entry(iter, dlm_lock, list);
+			if (lock->cookie == convert->cookie &&
+			    lock->node == convert->node_idx) {
+				found = 1;
+	rdtsc(u5.hilo[0], u5.hilo[1]);
+				status = dlmconvert_local(dlm, res, lock, 0, convert->requested_type);
+	rdtsc(u6.hilo[0], u6.hilo[1]);
+				break;
+			}
+		}
+		spin_unlock(&res->spinlock);
+	}
+	if (!found)
+		printk("failed to find lock to convert on grant queue!  cookie=%llu\n", convert->cookie);
+
+	rdtsc(u7.hilo[0], u7.hilo[1]);
+	dlmprintk("1-2:%llu 2-3:%llu 3-4:%llu 4-5:%llu 5-6:%llu 6-7:%llu\n",
+		  u2.q-u1.q, u3.q-u2.q, u4.q-u3.q, u5.q-u4.q, u6.q-u5.q, u7.q-u6.q);
+	return status;
+}
+
+/* Debugging aid: walk the global domain list and dump the full lock
+ * state of every registered dlm domain on this node. */
+void dlm_dump_everything(void)
+{
+	struct list_head *pos;
+
+	printk("dumping ALL dlm state for node %s\n", system_utsname.nodename);
+
+	spin_lock(&dlm_domain_lock);
+	list_for_each(pos, &dlm_domains)
+		dlm_dump_dlm(list_entry(pos, dlm_ctxt, list));
+	spin_unlock(&dlm_domain_lock);
+}
+
+/* Debugging aid: dump every lock resource in one domain, with the
+ * contents of its granted/converting/blocked queues.  Holds
+ * dlm->spinlock for the whole walk, so this is expensive. */
+void dlm_dump_dlm(dlm_ctxt *dlm)
+{
+	dlm_lock_resource *res;
+	dlm_lock *lock;
+	struct list_head *iter, *iter2;
+	struct list_head *bucket;
+	int i;
+
+	/* fix: validate before dereferencing -- the old code printed
+	 * dlm->name first and only then checked dlm for NULL */
+	if (!dlm || !dlm->name) {
+		printk("wtf... dlm=%p\n", dlm);
+		return;
+	}
+	printk("dlm_ctxt: %s, group=%u, key=%u\n", dlm->name, dlm->group_index, dlm->key);
+		
+	spin_lock(&dlm->spinlock);
+	for (i=0; i<DLM_HASH_SIZE; i++) {
+		bucket = &(dlm->resources[i]);
+		list_for_each(iter, bucket) {
+			res = list_entry(iter, dlm_lock_resource, list);
+			/* fix: "%.*s" -- lockname is length-counted, not
+			 * NUL-terminated */
+			printk("lockres: %.*s, owner=%u, state=%u\n", res->lockname.len, res->lockname.name,
+			       res->owner, res->state);
+			spin_lock(&res->spinlock);
+			printk("  granted queue: \n");
+			list_for_each(iter2, &res->granted) {
+				lock = list_entry(iter2, dlm_lock, list);
+				spin_lock(&lock->spinlock);
+				printk("    type=%d, conv=%d, node=%u, cookie=%llu\n", 
+				       lock->type, lock->convert_type, lock->node, lock->cookie);
+				spin_unlock(&lock->spinlock);
+			}
+			printk("  converting queue: \n");
+			list_for_each(iter2, &res->converting) {
+				lock = list_entry(iter2, dlm_lock, list);
+				spin_lock(&lock->spinlock);
+				printk("    type=%d, conv=%d, node=%u, cookie=%llu\n", 
+				       lock->type, lock->convert_type, lock->node, lock->cookie);
+				spin_unlock(&lock->spinlock);
+			}
+			printk("  blocked queue: \n");
+			list_for_each(iter2, &res->blocked) {
+				lock = list_entry(iter2, dlm_lock, list);
+				spin_lock(&lock->spinlock);
+				printk("    type=%d, conv=%d, node=%u, cookie=%llu\n", 
+				       lock->type, lock->convert_type, lock->node, lock->cookie);
+				spin_unlock(&lock->spinlock);
+			}
+			spin_unlock(&res->spinlock);
+		}
+	}
+	spin_unlock(&dlm->spinlock);
+}
+
+/* register module entry/exit points with the kernel */
+module_init (dlm_driver_entry);
+module_exit (dlm_driver_exit);

Added: trunk/cluster/dlmmod.h
===================================================================
--- trunk/cluster/dlmmod.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/dlmmod.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,467 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmmod.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_DLMMOD_H
+#define CLUSTER_DLMMOD_H
+
+
+
+/* Debug logging: dlmprintk() prefixes each message with the caller's
+ * pid, function and line number.  Flip the "#if 0" to compile all dlm
+ * debug output away.  dlmprintk0 is the zero-vararg variant for plain
+ * format strings. */
+#if 0
+#define dlmprintk(x, arg...)
+#define dlmprintk0(x)
+#else
+#define dlmprintk(x, arg...)    printk("(%d)(%s:%d) " x, current->pid, __FUNCTION__, __LINE__, ##arg)
+#define dlmprintk0(x)           printk("(%d)(%s:%d) " x, current->pid, __FUNCTION__, __LINE__)
+#endif
+
+
+
+
+#define DLM_HB_NODE_DOWN_PRI     (0xf000000)
+#define DLM_HB_NODE_UP_PRI       (0x8000000)  
+
+#define DLM_LVB_LEN  64
+#define DLM_LOCKID_NAME_MAX    32
+
+#define DLM_DOMAIN_NAME_MAX_LEN    255
+#define DLM_LOCK_RES_OWNER_UNKNOWN     NM_MAX_NODES
+#define DLM_THREAD_SHUFFLE_INTERVAL    5     // flush everything every 5 passes
+#define DLM_THREAD_MS                  200   // flush at least every 200 ms
+
+#define DLM_HASH_BITS     7
+#define DLM_HASH_SIZE     (1 << DLM_HASH_BITS)
+#define DLM_HASH_MASK     (DLM_HASH_SIZE - 1)
+
+/* Kinds of asynchronous callback that can be delivered for a lock. */
+typedef enum _dlm_ast_type {
+	DLM_AST = 0,     /* grant notification */
+	DLM_BAST,        /* blocking notification */
+	DLM_ASTUNLOCK    /* unlock notification */
+} dlm_ast_type;
+
+
+#define LKM_IVMODE      (-1)            /* invalid mode */
+#define LKM_NLMODE      0               /* null lock */
+#define LKM_CRMODE      1               /* concurrent read */    /* unsupported */
+#define LKM_CWMODE      2               /* concurrent write */    /* unsupported */
+#define LKM_PRMODE      3               /* protected read */
+#define LKM_PWMODE      4               /* protected write */    /* unsupported */
+#define LKM_EXMODE      5               /* exclusive */
+#define LKM_MAXMODE     5
+#define LKM_MODEMASK    0xff
+
+
+/* TODO: Flags which OCFS2 will require: 
+ *       - LKM_LOCAL
+ *       - LKM_VALBLK
+ *       - LKM_NOQUEUE
+ *       - LKM_CONVERT
+ *       - LKM_CANCEL   */
+#define LKM_ORPHAN      0x10            /* this lock is orphanable */    /* unsupported */
+#define LKM_PARENTABLE  0x20            /* this lock was orphaned */    /* unsupported */
+#define LKM_BLOCK       0x40            /* blocking lock request */    /* unsupported */
+#define LKM_LOCAL       0x80            /* local lock request */    
+#define LKM_VALBLK      0x100           /* lock value block request */
+#define LKM_NOQUEUE     0x200           /* non blocking request */
+#define LKM_CONVERT     0x400           /* conversion request */
+#define LKM_NODLCKWT    0x800           /* this lock wont deadlock */    /* unsupported */
+#define LKM_UNLOCK      0x1000          /* deallocate this lock */
+#define LKM_CANCEL      0x2000          /* cancel conversion request */
+#define LKM_DEQALL      0x4000          /* remove all locks held by proc */    /* unsupported */
+#define LKM_INVVALBLK   0x8000          /* invalidate lock value block */
+#define LKM_SYNCSTS     0x10000         /* return synchronous status if poss */    /* unsupported */
+#define LKM_TIMEOUT     0x20000         /* lock request contains timeout */    /* unsupported */
+#define LKM_SNGLDLCK    0x40000         /* request can self-deadlock */    /* unsupported */
+#define LKM_FINDLOCAL   0x80000         /* find local lock request */    /* unsupported */
+#define LKM_PROC_OWNED  0x100000        /* owned by process, not group */    /* unsupported */
+#define LKM_XID         0x200000        /* use transaction id for deadlock */    /* unsupported */
+#define LKM_XID_CONFLICT 0x400000       /* do not allow lock inheritance */    /* unsupported */
+#define LKM_FORCE       0x800000        /* force unlock flag */
+#define LKM_REVVALBLK   0x1000000       /* temporary solution: re-validate lock value block */    /* unsupported */
+
+#define LKM_RECOVERY    0x80000000      /* extension: flag for recovery lock, used to avoid recovery rwsem */
+
+#define LKM_VALID_FLAGS (LKM_VALBLK | LKM_CONVERT | LKM_UNLOCK | \
+			 LKM_CANCEL | LKM_INVVALBLK | LKM_FORCE | \
+			 LKM_RECOVERY | LKM_LOCAL | LKM_NOQUEUE)
+
+#define DLM_RECOVERY_LOCK_NAME       "$RECOVERY"
+#define DLM_RECOVERY_LOCK_NAME_LEN   9
+
+/* Return nonzero iff (lock_name, name_len) names the special
+ * "$RECOVERY" lock used to serialize recovery. */
+static inline int dlm_is_recovery_lock(char *lock_name, int name_len)
+{
+	return (name_len == DLM_RECOVERY_LOCK_NAME_LEN &&
+		strncmp(lock_name, DLM_RECOVERY_LOCK_NAME,
+			DLM_RECOVERY_LOCK_NAME_LEN) == 0);
+}
+
+/* Status codes returned by DLM operations (the classic VMS-style DLM
+ * status set, with local extensions at the end). */
+typedef enum _dlm_status {
+	DLM_NORMAL,               /* request in progress */
+	DLM_GRANTED,              /* request granted */
+	DLM_DENIED,               /* request denied */
+	DLM_DENIED_NOLOCKS,       /* request denied, out of system resources */
+	DLM_WORKING,              /* async request in progress */
+	DLM_BLOCKED,              /* lock request blocked */
+	DLM_BLOCKED_ORPHAN,       /* lock request blocked by an orphan lock */
+	DLM_DENIED_GRACE_PERIOD,  /* topological change in progress */
+	DLM_SYSERR,               /* system error */
+	DLM_NOSUPPORT,            /* unsupported */
+	DLM_CANCELGRANT,          /* can't cancel convert: already granted */
+	DLM_IVLOCKID,             /* bad lockid */
+	DLM_SYNC,                 /* synchronous request granted */
+	DLM_BADTYPE,              /* bad resource type */
+	DLM_BADRESOURCE,          /* bad resource handle */
+	DLM_MAXHANDLES,           /* no more resource handles */
+	DLM_NOCLINFO,             /* can't contact cluster manager */
+	DLM_NOLOCKMGR,            /* can't contact lock manager */
+	DLM_NOPURGED,             /* can't contact purge daemon */
+	DLM_BADARGS,              /* bad api args */
+	DLM_VOID,                 /* no status */
+	DLM_NOTQUEUED,            /* NOQUEUE was specified and request failed */
+	DLM_IVBUFLEN,             /* invalid resource name length */
+	DLM_CVTUNGRANT,           /* attempted to convert ungranted lock */
+	DLM_BADPARAM,             /* invalid lock mode specified */
+	DLM_VALNOTVALID,          /* value block has been invalidated */
+	DLM_REJECTED,             /* request rejected, unrecognized client */
+	DLM_ABORT,                /* blocked lock request cancelled */
+	DLM_CANCEL,               /* conversion request cancelled */
+	DLM_IVRESHANDLE,          /* invalid resource handle */
+	DLM_DEADLOCK,             /* deadlock recovery refused this request */
+	DLM_DENIED_NOASTS,        /* failed to allocate AST */
+	DLM_FORWARD,              /* request must wait for primary's response */
+	DLM_TIMEOUT,              /* timeout value for lock has expired */
+	DLM_IVGROUPID,            /* invalid group specification */
+	DLM_VERS_CONFLICT,        /* version conflicts prevent request handling */
+	DLM_BAD_DEVICE_PATH,      /* Locks device does not exist or path wrong */
+	DLM_NO_DEVICE_PERMISSION, /* Client has insufficient perms for device */
+	DLM_NO_CONTROL_DEVICE,    /* Cannot set options on opened device */
+	DLM_MAXSTATS,             /* upper limit for return code validation */
+	
+	DLM_RECOVERING            /* our lame addition to allow caller to fail a lock 
+				     request if it is being recovered */
+} dlm_status;
+
+
+
+/* Per-domain recovery state, used while remastering locks after a
+ * node death (see dlmrecovery.c).  Field roles below are inferred
+ * from their names -- confirm against the recovery code. */
+typedef struct _dlm_recovery_ctxt
+{
+	struct list_head resources;  /* lockres undergoing recovery */
+	struct list_head received;   // list of dlm_reco_lock_infos received from other nodes during recovery
+	u16 new_master;              /* node taking over the dead node's locks */
+	u16 dead_node;               /* node being recovered */
+	u16 sending_node;
+	u32 next_seq;
+	util_thread_info thread;     /* recovery worker thread */
+} dlm_recovery_ctxt;
+
+
+/* One DLM domain: an instance of the lock manager identified by name
+ * and network key. */
+struct _dlm_ctxt
+{
+	struct list_head list;        /* linkage on the global dlm_domains list */
+	struct list_head *resources;  /* lockres hash table: DLM_HASH_SIZE bucket heads */
+	struct list_head dirty_list;  /* lockres waiting for queue shuffling */
+	spinlock_t spinlock;          /* protects the hash table and lists */
+	struct rw_semaphore recovery_sem;  /* read-held by normal ops; excluded during recovery */
+	char *name;                   /* domain name (kmalloc'd) */
+	char *net_buf;                /* page carved into per-handler scratch buffers */
+	util_thread_info thread;      /* dlm worker thread */
+	struct inode *group;          /* nodemanager group inode */
+	u32 key;                      /* network message key for this domain */
+	u16 group_index;              /* this node's index within the group */
+	u32 node_map[8];              /* 256-bit node bitmap -- presumably live nodes; confirm */
+	u32 recovery_map[8];          /* 256-bit bitmap of nodes needing recovery */
+	dlm_recovery_ctxt reco;       /* recovery state */
+};
+
+/* dlm_lock_resource.state flag bits */
+#define DLM_LOCK_RES_UNINITED             0x00000001
+#define DLM_LOCK_RES_RECOVERING           0x00000002
+#define DLM_LOCK_RES_READY                0x00000004
+#define DLM_LOCK_RES_DIRTY                0x00000008
+#define DLM_LOCK_RES_IN_PROGRESS          0x00000010 
+
+/* A named lock resource with its three lock queues; lives on one of
+ * the dlm->resources hash buckets via "list". */
+typedef struct _dlm_lock_resource
+{
+	struct list_head list;        /* hash bucket linkage */
+	struct list_head granted;     /* locks currently granted */
+	struct list_head converting;  /* granted locks waiting on a mode change */
+	struct list_head blocked;     /* requests not yet granted */
+	struct list_head dirty;       /* linkage on dlm->dirty_list */
+	struct list_head recovering; // dlm_recovery_ctxt.resources list
+	spinlock_t spinlock;          /* protects queues, owner and state */
+	wait_queue_head_t wq;         /* waiters for DLM_LOCK_RES_IN_PROGRESS to clear */
+	u16 owner;              // node which owns the lock resource, or unknown
+	u16 state;              /* DLM_LOCK_RES_* flag bits */
+	struct qstr lockname;   /* name/len/hash; name is NOT NUL-terminated */
+	char lvb[DLM_LVB_LEN];  /* lock value block */
+} dlm_lock_resource;
+
+typedef void (dlm_astlockfunc_t)(void *);
+typedef void (dlm_bastlockfunc_t)(void *, int);
+typedef void (dlm_astunlockfunc_t)(void *, dlm_status);
+
+typedef struct _dlm_lockstatus dlm_lockstatus;
+
+/* One lock on a lock resource, linked on exactly one of the lockres
+ * granted/converting/blocked queues via "list". */
+typedef struct _dlm_lock
+{
+	struct list_head list;       /* queue linkage on the lockres */
+	struct list_head ast_list;
+	dlm_lock_resource *lockres;  /* resource this lock belongs to */
+	spinlock_t spinlock;
+
+	s8 type;             /* granted mode (LKM_*MODE), or requested while blocked */
+	s8 convert_type;     /* pending conversion target, LKM_IVMODE if none */
+	s8 highest_blocked;
+	s8 reserved1;
+	u16 node;            /* node that holds this lock */
+	u16 reserved2;
+
+	dlm_astlockfunc_t *ast;     // ast and bast must be callable while holding a spinlock!
+	dlm_bastlockfunc_t *bast;
+	void *astdata;              /* opaque argument passed to ast/bast */
+	u64 cookie;                 /* unique lock id, used to find the lock in messages */
+	dlm_lockstatus *lksb;       /* caller's status block */
+} dlm_lock;
+
+
+/* Caller-visible lock status block: operation status, the lock handle,
+ * and the lock value block. */
+struct _dlm_lockstatus {
+	dlm_status status;
+	dlm_lock *lockid;
+	char lvb[DLM_LVB_LEN];
+};
+
+/* master list entry flavors -- see dlm_master_list_entry.type */
+enum {
+	DLM_MLE_BLOCK,
+	DLM_MLE_MASTER
+};
+
+/* Inline-stored lock name (length byte + name bytes, no NUL), used by
+ * DLM_MLE_BLOCK master list entries that have no lockres yet. */
+typedef struct _dlm_lock_name
+{
+	u8 len;
+	u8 name[0];   // [DLM_LOCKID_NAME_MAX]
+} dlm_lock_name;
+
+/* good god this needs to be trimmed down */
+/* Tracks an in-flight mastery negotiation for one lock name.
+ * Refcounted via dlm_get_mle()/dlm_put_mle().  The bitmap roles are
+ * inferred from their names -- confirm against dlmmaster.c. */
+typedef struct _dlm_master_list_entry
+{
+	struct list_head list;
+	dlm_ctxt *dlm;          /* owning domain */
+	spinlock_t spinlock;
+	wait_queue_head_t wq;
+	atomic_t woken;
+	atomic_t refcnt;        /* see dlm_get_mle()/dlm_put_mle() */
+	u32 maybe_map[8];       /* 256-bit node bitmaps */
+	u32 vote_map[8];
+	u32 response_map[8];
+	u32 node_map[8];
+	u16 master;             /* resolved master node */
+	u8 error;
+	u8 type;    // BLOCK or MASTER
+	union {
+		dlm_lock_resource *res;  /* DLM_MLE_MASTER */
+		dlm_lock_name name;      /* DLM_MLE_BLOCK: inline name */
+	} u;
+} dlm_master_list_entry;
+
+void dlm_put_mle(dlm_master_list_entry *mle);
+/* take a reference on a master list entry; paired with dlm_put_mle() */
+static inline void dlm_get_mle(dlm_master_list_entry *mle)
+{
+	atomic_inc(&mle->refcnt);
+}
+
+
+/* dlm network message types (handlers registered per-domain) */
+#define DLM_MASTER_REQUEST_MSG  	500
+#define DLM_MASTER_REQUEST_RESP_MSG	501
+#define DLM_ASSERT_MASTER_MSG		502
+#define DLM_CREATE_LOCK_MSG		503
+#define DLM_CONVERT_LOCK_MSG		504
+#define DLM_PROXY_AST_MSG		505
+#define DLM_UNLOCK_LOCK_MSG		506
+
+
+/* answers to a master request */
+enum {
+	DLM_MASTER_RESP_NO,
+	DLM_MASTER_RESP_YES,
+	DLM_MASTER_RESP_MAYBE,
+	DLM_MASTER_RESP_ERROR
+};
+
+/* wire format: "who masters this name?" (name is length-counted) */
+typedef struct _dlm_master_request
+{
+	u16 node_idx;      /* sender's node index */
+	u8 namelen;
+	u8 name[NM_MAX_NAME_LEN];
+} dlm_master_request;
+
+/* wire format: reply to a master request */
+typedef struct _dlm_master_request_resp
+{
+	u16 node_idx;      /* responder's node index */
+	u8 response;       /* DLM_MASTER_RESP_* */
+	u8 namelen;
+	u8 name[NM_MAX_NAME_LEN];
+} dlm_master_request_resp;
+
+/* wire format: "I am the master of this name" */
+typedef struct _dlm_assert_master
+{
+	u16 node_idx;      /* asserting node's index */
+	u8 namelen;
+	u8 name[NM_MAX_NAME_LEN];
+} dlm_assert_master;
+
+
+
+
+
+void dlm_shuffle_lists(dlm_ctxt *dlm, dlm_lock_resource *res);
+void dlm_thread_run_lock_resources(dlm_ctxt *dlm);
+int dlm_thread(void *data);
+int dlm_launch_thread(dlm_ctxt *dlm);
+void dlm_complete_thread(dlm_ctxt *dlm);
+
+dlm_status dlmlock(dlm_ctxt *dlm, int mode, dlm_lockstatus *lksb, int flags, char *name, 
+		   dlm_astlockfunc_t *ast, void *data, dlm_bastlockfunc_t *bast);
+		   
+
+dlm_status do_dlmlock(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lockstatus *lksb,
+		      int flags, int type, dlm_astlockfunc_t *ast, 
+		      dlm_bastlockfunc_t *bast, void *data);
+dlm_status dlmlock_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags);
+dlm_status dlmlock_remote(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags);
+
+dlm_status do_dlmconvert(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type);
+dlm_status dlmconvert_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type);
+dlm_status dlmconvert_remote(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type);
+
+dlm_status dlmunlock(dlm_ctxt *dlm, dlm_lockstatus *lksb, int flags, dlm_astunlockfunc_t *unlockast, void *data);
+dlm_status dlmunlock_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, dlm_lockstatus *lksb, int flags, int *call_ast);
+
+dlm_ctxt * dlm_register_domain(char *domain, char *group_name, u32 key);
+void dlm_unregister_domain(dlm_ctxt *dlm);
+dlm_lock_resource * dlm_get_lock_resource(dlm_ctxt *dlm, struct qstr *lockname, int flags);
+int dlm_lock_owner_broadcast(dlm_ctxt *dlm, dlm_lock_resource *res);
+int dlm_refresh_lock_resource(dlm_ctxt *dlm, dlm_lock_resource *res);
+int dlm_do_ast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock);
+int dlm_do_bast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int blocked_type);
+u16 dlm_nm_this_node(dlm_ctxt *dlm);
+void dlm_kick_thread(dlm_ctxt *dlm, dlm_lock_resource *res);
+
+int dlm_nm_init(dlm_ctxt *dlm);
+int dlm_heartbeat_init(dlm_ctxt *dlm);
+
+dlm_lock_resource * dlm_lookup_lock(dlm_ctxt *dlm, struct qstr *lockname);
+dlm_ctxt * dlm_lookup_domain(char *domain);
+
+void dlm_hb_node_down_cb(struct inode *group, struct inode *node, int idx, void *data);
+void dlm_hb_node_up_cb(struct inode *group, struct inode *node, int idx, void *data);
+int dlm_hb_node_dead(dlm_ctxt *dlm, int node);
+int dlm_hb_node_up(dlm_ctxt *dlm, int node);
+int __dlm_hb_node_dead(dlm_ctxt *dlm, int node);
+int __dlm_hb_node_up(dlm_ctxt *dlm, int node);
+
+int dlm_lock_owner_broadcast(dlm_ctxt *dlm, dlm_lock_resource *res);
+int dlm_master_request_handler(net_msg *msg, u32 len, void *data);
+int dlm_master_request_resp_handler(net_msg *msg, u32 len, void *data);
+int dlm_assert_master_handler(net_msg *msg, u32 len, void *data);
+int dlm_do_master_request(dlm_master_list_entry *mle, int to);
+int dlm_do_master_request_resp(dlm_ctxt *dlm, struct qstr *name, int response, int to);
+int dlm_do_assert_master(dlm_master_list_entry *mle);
+void dlm_mle_node_down(struct inode *group, struct inode *node, int idx, void *data);
+void dlm_mle_node_up(struct inode *group, struct inode *node, int idx, void *data);
+dlm_lock_resource * __dlm_lookup_lock(dlm_ctxt *dlm, struct qstr *lockname);
+void dlm_init_lockres(dlm_lock_resource *res, struct qstr *lockname);
+void dlm_wait_on_lockres(dlm_lock_resource *res);
+void dlm_dump_everything(void);
+void dlm_dump_dlm(dlm_ctxt *dlm);
+
+/* Can a lock of mode "request" be granted alongside a lock of mode
+ * "existing"?  Only NL-with-anything and PR-with-PR are compatible in
+ * this implementation (CR/CW/PW are unsupported). */
+static inline int dlm_lock_compatible(int existing, int request)
+{
+	/* null locks conflict with nothing */
+	if (existing == LKM_NLMODE || request == LKM_NLMODE)
+		return 1;
+
+	/* an exclusive request conflicts with every non-null holder */
+	if (request == LKM_EXMODE)
+		return 0;
+
+	/* request is PR here: compatible only with an existing PR */
+	return (existing == LKM_PRMODE) ? 1 : 0;
+}
+
+/* Return 1 if "lock" is currently linked on the given queue, else 0. */
+static inline int dlm_lock_on_list(struct list_head *head, dlm_lock *lock)
+{
+	struct list_head *pos;
+
+	list_for_each(pos, head) {
+		if (list_entry(pos, dlm_lock, list) == lock)
+			return 1;
+	}
+	return 0;
+}
+
+/* Does this master list entry refer to "lockname" within domain "dlm"?
+ * A BLOCK entry stores the name inline; a MASTER entry points at the
+ * lockres, whose precomputed hash lets us reject mismatches cheaply. */
+static inline int dlm_mle_equal(dlm_ctxt *dlm, dlm_master_list_entry *mle, struct qstr *lockname)
+{
+	dlm_lock_resource *res;
+
+	if (mle->dlm != dlm)
+		return 0;
+
+	if (mle->type == DLM_MLE_BLOCK)
+		return (lockname->len == mle->u.name.len &&
+			strncmp(lockname->name, mle->u.name.name,
+				lockname->len) == 0);
+
+	res = mle->u.res;
+	return (res->lockname.hash == lockname->hash &&
+		res->lockname.len == lockname->len &&
+		strncmp(res->lockname.name, lockname->name,
+			lockname->len) == 0);
+}
+
+/* Map a negative errno from the network layer onto a dlm_status.
+ * Checks are kept in the original order because net_link_down() may
+ * match errnos tested earlier. */
+static inline dlm_status dlm_err_to_dlm_status(int err)
+{
+	if (err == -ENOMEM)
+		return DLM_SYSERR;
+	if (err == -ETIMEDOUT || net_link_down(err, NULL))
+		return DLM_NOLOCKMGR;
+	if (err == -EINVAL)
+		return DLM_BADPARAM;
+	if (err == -ENAMETOOLONG)
+		return DLM_IVBUFLEN;
+	return DLM_BADARGS;
+}
+
+#endif /* CLUSTER_DLMMOD_H */

Added: trunk/cluster/dlmrecovery.c
===================================================================
--- trunk/cluster/dlmrecovery.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/dlmrecovery.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,705 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmrecovery.c
+ *
+ * recovery stuff
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+
+
+#include "heartbeat.h"
+#include "nodemanager.h"
+#include "tcp.h"
+#include "dlmmod.h"
+
+static void dlm_do_local_recovery_cleanup(dlm_ctxt *dlm, u16 dead_node, int locked);
+
+int dlm_recovery_thread(void *data);
+void dlm_complete_recovery_thread(dlm_ctxt *dlm);
+int dlm_launch_recovery_thread(dlm_ctxt *dlm);
+void dlm_kick_recovery_thread(dlm_ctxt *dlm);
+			
+u16 dlm_pick_recovery_master(dlm_ctxt *dlm, u16 *new_dead_node);
+static int dlm_remaster_locks_local(dlm_ctxt *dlm);
+int dlm_init_recovery_area(dlm_ctxt *dlm, u16 dead_node, u16 num_nodes);
+int dlm_request_all_locks(dlm_ctxt *dlm, u16 request_from, u16 dead_node);
+void dlm_destroy_recovery_area(dlm_ctxt *dlm, u16 dead_node);
+
+#define DLM_RECOVERY_THREAD_MS  2000
+
+#if 0
+/*
+ * RECOVERY THREAD
+ */
+
+/* Knock the recovery thread out of its timed wait: set the "woken"
+ * flag it polls and wake its waitqueue. */
+void dlm_kick_recovery_thread(dlm_ctxt *dlm)
+{
+	/* wake the recovery thread */
+	atomic_set(&dlm->reco.thread.woken, 1);
+	wake_up(&dlm->reco.thread.thread_wq);
+}
+
+/* Launch the recovery thread */
+/* Spawn the per-domain recovery daemon.  Returns 0 on success or
+ * -EINVAL if kernel_thread() failed (its errno is logged, not
+ * propagated). */
+int dlm_launch_recovery_thread(dlm_ctxt *dlm)
+{
+	printk("starting recovery thread...\n");
+	dlm->reco.thread.pid = kernel_thread (dlm_recovery_thread, dlm, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+	if (dlm->reco.thread.pid < 0) {
+		/* fix: message was missing its trailing newline */
+		printk("unable to launch recovery thread, error=%d\n", dlm->reco.thread.pid);
+		return -EINVAL;
+	}
+	printk("recovery thread running...\n");
+	return 0;
+}
+
+/* Tear down the recovery thread: interrupt its wait with SIGINT and
+ * block until it signals completion, then clear the task pointer. */
+void dlm_complete_recovery_thread(dlm_ctxt *dlm)
+{
+	printk ("waiting for recovery thread to exit....");
+	/* SIGINT makes util_wait_atomic_eq() in the thread return an
+	 * error, which breaks its main loop */
+	send_sig (SIGINT, dlm->reco.thread.task, 0);
+	wait_for_completion (&dlm->reco.thread.complete);
+	printk ("recovery thread exited\n");
+	dlm->reco.thread.task = NULL;
+}
+
+	/* 
+	 * this is lame, but here's how recovery works...
+	 * 1) all recovery threads cluster wide will work on recovering
+	 *    ONE node at a time
+	 * 2) negotiate who will take over all the locks for the dead node.
+	 *    that's right... ALL the locks.
+	 * 3) once a new master is chosen, everyone scans all locks
+	 *    and moves aside those mastered by the dead guy
+	 * 4) each of these locks should be locked until recovery is done
+	 * 5) the new master collects up all of the secondary lock queue info
+	 *    one lock at a time, forcing each node to communicate back
+	 *    before continuing
+	 * 6) each secondary lock queue responds with the full known lock info
+	 * 7) once the new master has run all its locks, it sends an ALLDONE! 
+	 *    message to everyone
+	 * 8) upon receiving this message, the secondary queue node unlocks
+	 *    and responds to the ALLDONE
+	 * 9) once the new master gets responses from everyone, it unlocks 
+	 *    everything and recovery for this dead node is done
+	 *10) go back to 2) while there are still dead nodes
+	 *
+	 */
+
+
+
+/* Per-domain recovery daemon.  Each pass: pick a dead node from the
+ * recovery map, elect a recovery master, then either remaster the dead
+ * node's locks locally (if this node won) or wait for the elected
+ * master to finish.  Exits when the wait is interrupted by a signal
+ * (see dlm_complete_recovery_thread).
+ * Fixes: removed seven unused locals; sprintf -> snprintf since
+ * "dlmreco-%03u" overflows the 12-byte name for dlm_num >= 1000. */
+int dlm_recovery_thread(void *data)
+{
+	int status;
+	int dlm_num;
+	char name[12];
+	dlm_ctxt *dlm = data;
+
+	dlm_num = nm_get_group_global_index(dlm->group);
+	snprintf(name, sizeof(name), "dlmreco-%03u", dlm_num);
+	util_daemonize (name, strlen(name), 1);
+	dlm->reco.thread.task = current;
+
+	while (1) {
+		spin_lock(&dlm->spinlock);
+
+		/* check to see if the new master has died */
+		if (dlm->reco.new_master != NM_INVALID_SLOT_NUM &&
+		    test_bit(dlm->reco.new_master, dlm->recovery_map)) {
+			printk("new master %u died while recovering %u!\n",
+			       dlm->reco.new_master, dlm->reco.dead_node);
+			/* unset the new_master, leave dead_node */
+			dlm->reco.new_master = NM_INVALID_SLOT_NUM;
+		}
+
+		/* select a target to recover */
+		if (dlm->reco.dead_node == NM_INVALID_SLOT_NUM) {
+			dlm->reco.dead_node = find_next_bit (dlm->recovery_map, NM_MAX_NODES, 0);
+			if (dlm->reco.dead_node >= NM_MAX_NODES)
+				dlm->reco.dead_node = NM_INVALID_SLOT_NUM;
+		} else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
+			/* BUG? a chosen target should stay in the map
+			 * until we clear it ourselves */
+			printk("dead_node %u no longer in recovery map!\n",
+			       dlm->reco.dead_node);
+			dlm->reco.dead_node = NM_INVALID_SLOT_NUM;
+		}
+
+		spin_unlock(&dlm->spinlock);
+
+		if (dlm->reco.dead_node == NM_INVALID_SLOT_NUM) {
+			printk("nothing to recover!  sleeping now!\n");
+			goto sleep;
+		}
+
+		/* take write barrier */
+		/* (stops the list reshuffling thread, proxy ast handling) */
+		down_write(&dlm->recovery_sem);
+
+		/* choose a new master */
+		if (dlm->reco.new_master == NM_INVALID_SLOT_NUM) {
+			u16 new_dead_node = dlm->reco.dead_node;
+			dlm->reco.new_master = dlm_pick_recovery_master(dlm, &new_dead_node);
+			if (new_dead_node != dlm->reco.dead_node) {
+				/* master wants to recover a different node */
+				dlm->reco.dead_node = new_dead_node;
+				
+				/* do local cleanup if heartbeat has not added
+				 * the node to the recovery map yet */
+				spin_lock(&dlm->spinlock);
+				if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
+					dlm_do_local_recovery_cleanup(dlm, dlm->reco.dead_node, 1);
+					set_bit(dlm->reco.dead_node, dlm->recovery_map);
+					clear_bit(dlm->reco.dead_node, dlm->node_map);
+				}
+				spin_unlock(&dlm->spinlock);
+			}
+		}
+		
+
+		if (dlm->reco.new_master == dlm->group_index) {
+			/* this node is the recovery master */
+			status = dlm_remaster_locks_local(dlm);
+			if (status < 0) {
+				printk("error remastering locks for node %u!!!!  retrying!\n",
+				       dlm->reco.dead_node);
+			} else {
+				/* success!  reset state and see if any
+				 * other nodes need recovery */
+				spin_lock(&dlm->spinlock);
+				clear_bit(dlm->reco.dead_node, dlm->recovery_map);
+				spin_unlock(&dlm->spinlock);
+				dlm->reco.dead_node = NM_INVALID_SLOT_NUM;
+				dlm->reco.new_master = NM_INVALID_SLOT_NUM;
+				dlm->reco.sending_node = NM_INVALID_SLOT_NUM;
+				dlm->reco.next_seq = 0;
+			}
+			up_write(&dlm->recovery_sem);
+			/* pick another dead node */
+			continue;
+		} else {
+			/* sit around until new_master is dead or done;
+			 * we will get signalled by the waitqueue either way */
+			printk("new_master %u is recovering dead_node %u... waiting...\n",
+			       dlm->reco.new_master, dlm->reco.dead_node);
+		}
+
+		up_write(&dlm->recovery_sem);
+
+sleep:
+		atomic_set(&dlm->reco.thread.woken, 0);
+		status = util_wait_atomic_eq(&dlm->reco.thread.thread_wq, 
+					     &dlm->reco.thread.woken, 
+					     1, DLM_RECOVERY_THREAD_MS);
+		if (status == 0 || status == -ETIMEDOUT) {
+			if (atomic_read(&dlm->reco.thread.woken))
+				printk("aha!!! recovery thread woken!\n");
+			else 
+				printk("timed out waiting, running again\n");
+			continue;
+		}
+		/* any other status means a signal: shut down */
+		printk("recovery thread got %d while waiting\n", status);
+		break;
+	}
+
+	flush_scheduled_work();
+	complete (&dlm->reco.thread.complete);
+	printk("quitting recovery thread!!!!!!\n");
+	return 0;
+}
+
+/* +- if this node is NOT the new master... */
+/* +--- if master's dead_node is not the one we chose, do local cleanup again with proper dead_node */
+/* +---	wait for poll messages from new master: register net message handler, it will do the work */
+/* +--- check for death of new master */
+/* +--- if dead, unregister the handler, unset new_master, keep dead_node and goto "select a target" */
+/* |- on request, send header with number of packets, get response, then start blasting packets */
+/* |- retransmit any missed packets on request */
+/* |- once ALL DONE is received, run all locks again */
+/* +--- unset the RECOVERING flag */
+/* +--- set the new owner as new_master */
+/* +--- remove dead_node from recovery map */
+/* +--- unset new_master and dead_node and start all over */
+
+
+/* This node won the recovery election: pull lock state for the dead
+ * node from every live node, one node at a time.
+ * NOTE(review): dlm_init_recovery_area() is called with num_nodes
+ * still at its 255 placeholder; the real slot count is only fetched
+ * afterwards -- confirm that is intended.
+ * NOTE(review): node_map is u32[8] (256 bits) but test_bit() expects
+ * unsigned long bitmaps; verify this is safe on 64-bit. */
+static int dlm_remaster_locks_local(dlm_ctxt *dlm)
+{
+	int num_nodes = 255, i, status = 0;
+	u32 node_map[8];
+
+
+/* +- if this node is the new master, init the temp recovery area */
+/* |- poll each live node for lock state */
+/* |- collect the data from each node until node says it's done, or dead */
+/* +--- if node died, throw away temp recovery area, keep new_master and dead_node, goto "select a target" */
+/* |- apply all temp area changes to real lock */
+/* +- send ALL DONE message to each node */
+
+
+	status = dlm_init_recovery_area(dlm, dlm->reco.dead_node, num_nodes);
+	if (status < 0)
+		return status;
+
+	/* snapshot the live-node map under the domain lock */
+	spin_lock(&dlm->spinlock);
+	num_nodes = nm_get_group_max_slots(dlm->group);
+	memcpy(node_map, dlm->node_map, sizeof(node_map));
+	spin_unlock(&dlm->spinlock);
+
+	for (i=0; i<num_nodes; i++) {
+		if (test_bit(i, node_map)) {
+			/* record who we are pulling from so the packet
+			 * handler can validate incoming lock arrays */
+			spin_lock(&dlm->spinlock);
+			dlm->reco.sending_node = i;
+			dlm->reco.next_seq = 0;
+			spin_unlock(&dlm->spinlock);
+			status = dlm_request_all_locks(dlm, i, dlm->reco.dead_node);
+			if (status < 0) {
+				/* abort: reset transfer state and drop
+				 * the temp recovery area */
+				spin_lock(&dlm->spinlock);
+				dlm->reco.sending_node = NM_INVALID_SLOT_NUM;
+				dlm->reco.next_seq = 0;
+				spin_unlock(&dlm->spinlock);
+				dlm_destroy_recovery_area(dlm, dlm->reco.dead_node);
+				return status;
+			}
+		}
+	}
+	return status;
+}
+
+/* Ask "request_from" to ship us all lock state it holds for
+ * "dead_node".  Currently a stub: only logs the intent; the actual
+ * messaging and wait-for-completion are not yet implemented. */
+int dlm_request_all_locks(dlm_ctxt *dlm, u16 request_from, u16 dead_node)
+{
+	printk("dlm_request_all_locks: dead node is %u, sending request to %u\n",
+	       dead_node, request_from);
+	// send message
+	// sleep until all received or error
+	return 0;
+}
+
+#endif
+
+#if 0
+
+int dlm_recovery_request_handler(net_msg *msg, u32 len, void *data);
+int dlm_recovery_response_handler(net_msg *msg, u32 len, void *data);
+int dlm_recovery_lock_arr_req_handler(net_msg *msg, u32 len, void *data);
+
+/* Wire-format record for one lock shipped during recovery.
+ * NOTE(review): "list" is the source queue index (0..2, validated by
+ * dlm_reco_lock_info_valid) and is smuggled through astdata by the
+ * receiving handler. */
+typedef struct _dlm_reco_lock_info
+{
+	u16 node;
+	u16 unused1;
+	u64 cookie;
+	s8 type;
+	s8 convert_type;
+	u8 list;
+	u8 lockname_len;
+	u8 lockname[DLM_LOCKID_NAME_MAX];
+} dlm_reco_lock_info;
+
+/* recovery negotiation request types */
+enum {
+	DLM_RECO_MASTER_REQUEST, 
+	DLM_RECO_XMIT_LOCKS_REQUEST,
+	DLM_RECO_XMIT_LOCK_HDR_REQUEST,
+	DLM_RECO_XMIT_LOCK_ARR_REQUEST,
+	DLM_RECO_XMIT_COMPLETE_REQUEST,
+	DLM_RECO_ALL_DONE_REQUEST
+};
+
+/* yes/no recovery response codes */
+enum {
+	DLM_RECO_NO_RESPONSE,
+	DLM_RECO_YES_RESPONSE
+};
+
+/* max lock records carried in one packet */
+#define DLM_LOCKS_PER_PACKET   40
+
+/* a sequenced batch of lock records for one dead node */
+typedef struct _dlm_reco_lock_arr_req
+{
+	u8 request_type;
+	u8 num_locks;
+	u16 dead_node;
+	u32 seqnum;
+	dlm_reco_lock_info lock[DLM_LOCKS_PER_PACKET];
+} dlm_reco_lock_arr_req;
+
+/* generic recovery request header */
+typedef struct _dlm_reco_request
+{
+	u8 request_type;
+	u8 unused1;
+	u16 dead_node;
+	u32 num;
+} dlm_reco_request;
+
+/* generic recovery response */
+typedef struct _dlm_reco_response
+{
+	u8 response_type;
+	u8 unused1[7];
+} dlm_reco_response;
+
+/* Sanity-check a wire-format lock record: both mode fields must be
+ * one of NL/PR/EX and the queue index must be 0..2.  Returns 1 when
+ * the record is acceptable, 0 otherwise. */
+static inline int dlm_reco_lock_info_valid(dlm_reco_lock_info *info)
+{
+	switch (info->type) {
+	case LKM_NLMODE:
+	case LKM_PRMODE:
+	case LKM_EXMODE:
+		break;
+	default:
+		return 0;
+	}
+	switch (info->convert_type) {
+	case LKM_NLMODE:
+	case LKM_PRMODE:
+	case LKM_EXMODE:
+		break;
+	default:
+		return 0;
+	}
+	return (info->list <= 2);
+}
+
+static inline int dlm_check_reco_lock_arr_msg(net_msg *msg, dlm_ctxt *dlm, int *out_of_order);
+
+/* Validate an incoming lock-array packet against the current recovery
+ * state.  Caller must hold dlm->spinlock.  Returns 0 if acceptable,
+ * -EINVAL otherwise; *out_of_order is set only for a sequence-number
+ * mismatch so the caller can distinguish that case. */
+static inline int dlm_check_reco_lock_arr_msg(net_msg *msg, dlm_ctxt *dlm, int *out_of_order)
+{
+	int ret = -EINVAL;
+	dlm_reco_lock_arr_req *req = (dlm_reco_lock_arr_req *)msg->buf;
+	
+	/* check a bunch of ugly conditions */
+	*out_of_order = 0;
+	if (req->num_locks > DLM_LOCKS_PER_PACKET) {
+		printk("num_locks too large! %u\n", req->num_locks);
+	} else if (req->seqnum != dlm->reco.next_seq) {
+		/* fix: seqnums are u32, so %u not %lu */
+		printk("expected seq %u from node %u, got %u\n",
+		       dlm->reco.next_seq, msg->src_node,
+		       req->seqnum);
+		*out_of_order = 1;
+	} else if (dlm->reco.dead_node != req->dead_node) {
+		/* fix: the old code passed the *comparison result* as
+		 * a single argument to a two-argument format string */
+		printk("bad lock array: dead node=%u, sent=%u\n",
+		       dlm->reco.dead_node, req->dead_node);
+	} else if (dlm->reco.new_master != dlm->group_index) {
+		printk("this node is not the recovery master!\n");
+	} else if (dlm->reco.sending_node != msg->src_node ||
+		 dlm->group_index == msg->dest_node) {
+		printk("eek. sending_node=%u, actual=%u, dest=%u, me=%u\n",
+		       dlm->reco.sending_node, msg->src_node, 
+		       msg->dest_node, dlm->group_index);
+	} else
+		ret = 0;
+	return ret;
+}
+
+
+/* 
+ * gawd i hate udp
+ */
+/* Handle an incoming array of recovered locks from the sending node.
+ * Allocates local dlm_lock copies, revalidates the recovery state
+ * under the domain lock, and splices the copies onto the "received"
+ * list.  On any failure every partially-built lock is released.
+ * TODO: despite the label, no response packet is actually sent yet.
+ * Fixes: INIT_LIST_HEAD (runtime initializer) instead of
+ * LIST_HEAD_INIT (a static initializer, not a statement); free back
+ * to dlm_lock_cache -- the cache the locks were allocated from --
+ * instead of dlm_reco_lock_info_cache; dropped the unused "res". */
+int dlm_recovery_lock_arr_req_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	dlm_reco_lock_arr_req *req = (dlm_reco_lock_arr_req *)msg->buf;
+	dlm_reco_lock_info *info;
+	dlm_lock **newlocks = NULL;
+	dlm_lock *lock = NULL;
+	int ret, i, out_of_order = 0;
+	
+	// TODO: ntoh(req)
+
+	ret = 0;
+	if (req->num_locks == 0)
+		goto send_response;
+
+	/* check to see if it's worth kmallocing */
+	spin_lock(&dlm->spinlock);
+	ret = dlm_check_reco_lock_arr_msg(msg, dlm, &out_of_order);
+	spin_unlock(&dlm->spinlock);
+	if (ret < 0)
+		goto send_response;
+
+	newlocks = kmalloc(req->num_locks * sizeof(dlm_lock *), GFP_KERNEL);
+	if (!newlocks) {
+		printk("failed to alloc temp lock array!\n");
+		ret = -ENOMEM;
+		goto send_response;
+	}
+	memset(newlocks, 0, req->num_locks * sizeof(dlm_lock *));
+	for (i=0; i<req->num_locks; i++) {
+		info = &(req->lock[i]);
+		if (!dlm_reco_lock_info_valid(info)) {
+			ret = -EINVAL;
+			goto send_response;
+		}
+		lock = newlocks[i] = kmem_cache_alloc(dlm_lock_cache, GFP_KERNEL);
+		if (!newlocks[i]) {
+			ret = -ENOMEM;
+			goto send_response;
+		}
+		memset(lock, 0, sizeof(dlm_lock));
+		INIT_LIST_HEAD(&lock->list);
+		INIT_LIST_HEAD(&lock->ast_list);
+		spin_lock_init(&lock->spinlock);
+		lock->type = info->type;
+		lock->convert_type = info->convert_type;
+		lock->node = dlm->group_index;
+		lock->ast = NULL;
+		lock->bast = NULL;
+		lock->astdata = (void *)info->list;   // cheating here...
+		lock->cookie = info->cookie;	
+	}
+
+	spin_lock(&dlm->spinlock);
+	/* ok now that everything is allocated and the lock has
+	 * been taken again, recheck all those stupid conditions */
+	ret = dlm_check_reco_lock_arr_msg(msg, dlm, &out_of_order);
+	if (ret < 0) {
+		spin_unlock(&dlm->spinlock);
+		goto send_response;
+	}
+	for (i=0; i<req->num_locks; i++) {
+		lock = newlocks[i];
+		list_add_tail(&lock->list, &dlm->reco.received);
+	}
+	spin_unlock(&dlm->spinlock);
+
+send_response:
+	if (newlocks) {
+		if (ret < 0) {
+			/* free back to the cache they came from */
+			for (i=0; i<req->num_locks; i++)
+				if (newlocks[i])
+					kmem_cache_free(dlm_lock_cache, newlocks[i]);
+		}
+		kfree(newlocks);
+	}
+
+	return ret;
+}
+/* Stub handler for incoming recovery requests; not yet implemented.
+ * Fix: the function returned no value (UB when the caller reads the
+ * result) -- report success as a no-op for now. */
+int dlm_recovery_request_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	(void)dlm;	/* silence unused-variable warning until implemented */
+	return 0;
+}
+/* Stub handler for incoming recovery responses; not yet implemented.
+ * Fix: missing return in a non-void function. */
+int dlm_recovery_response_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	(void)dlm;	/* silence unused-variable warning until implemented */
+	return 0;
+}
+
+
+
+
+
+/* Package a recovery request and push it to node "to" over UDP.
+ * Returns 0 on success or a negative errno.
+ * NOTE(review): only sizeof(*buf) bytes are handed to
+ * net_send_udp_msg() although net_package_message() prepends a
+ * net_msg header -- confirm the transport accounts for that. */
+static int dlm_send_reco_request(dlm_ctxt *dlm, dlm_reco_request *buf, u16 to, struct inode *node)
+{
+	int ret;
+	net_msg *msg = net_package_message(DLM_NET_RECOVERY_REQUEST_MSG_TYPE, 
+				  dlm->key, buf, sizeof(*buf), 
+				  dlm->group_index, to);
+	if (!msg)
+		return -ENOMEM;
+	ret = net_send_udp_msg (node, msg, sizeof(*buf));
+	/* packaged message is a one-shot heap copy */
+	kfree(msg);
+	return ret;
+}
+
+/* Stub: domain-wide recovery entry point, not yet implemented;
+ * always reports success. */
+static int dlm_recover_domain(dlm_ctxt *dlm)
+{
+
+	
+	return 0;
+}
+
+
+#endif
+
+#warning may need to change kfree to put_lock and refcounting here
+/* Free every lock on "queue" that was held on behalf of dead_node.
+ * Caller must hold res->spinlock.  (Factored out of three identical
+ * open-coded loops over granted/converting/blocked.) */
+static void dlm_purge_dead_locks(struct list_head *queue, u16 dead_node)
+{
+	struct list_head *iter, *tmpiter;
+	dlm_lock *lock;
+
+	list_for_each_safe(iter, tmpiter, queue) {
+		lock = list_entry (iter, dlm_lock, list);
+		if (lock->node == dead_node) {
+			list_del(&lock->list);
+			kfree(lock);
+		}
+	}
+}
+
+/* Scrub all local lock state that referenced dead_node: resources
+ * mastered by the dead node are flagged RECOVERING and moved to the
+ * recovery list; locks the dead node held on locally-mastered
+ * resources are discarded.  "locked" tells whether the caller already
+ * holds dlm->spinlock. */
+static void dlm_do_local_recovery_cleanup(dlm_ctxt *dlm, u16 dead_node, int locked)
+{
+	struct list_head *iter;
+	dlm_lock_resource *res;
+	int i;
+	struct list_head *bucket;
+	
+	if (!locked)	
+		spin_lock(&dlm->spinlock);
+
+	for (i=0; i<DLM_HASH_SIZE; i++) {
+		bucket = &(dlm->resources[i]);
+		list_for_each(iter, bucket) {
+			res = list_entry (iter, dlm_lock_resource, list);
+			spin_lock(&res->spinlock);
+			if (res->owner == dead_node) {
+				res->state |= DLM_LOCK_RES_RECOVERING;
+				/* move onto the recovery list; assumes
+				 * res->recovering is always a valid
+				 * (initialized) list node */
+				list_del(&res->recovering);
+				list_add_tail(&res->recovering, &dlm->reco.resources);
+			} else if (res->owner == dlm->group_index) {
+				dlm_purge_dead_locks(&res->granted, dead_node);
+				dlm_purge_dead_locks(&res->converting, dead_node);
+				dlm_purge_dead_locks(&res->blocked, dead_node);
+			}
+			spin_unlock(&res->spinlock);
+		}
+	}
+
+	if (!locked)
+		spin_unlock(&dlm->spinlock);
+}
+
+
+/* Heartbeat callback: slot "idx" died.  Drop it from the live-node
+ * map and queue it for recovery; repeat callbacks for the same node
+ * are logged and otherwise ignored. */
+void dlm_hb_node_down_cb(struct inode *group, struct inode *node, int idx, void *data)
+{
+	dlm_ctxt *dlm = data;
+	
+	spin_lock(&dlm->spinlock);
+
+	if (test_bit(idx, dlm->node_map))
+		clear_bit(idx, dlm->node_map);
+	else
+		printk("node %u already removed from nodemap!\n", idx);
+
+	if (!test_bit(idx, dlm->recovery_map)) {
+		set_bit(idx, dlm->recovery_map);
+		/* already hold dlm->spinlock, hence locked=1 */
+		dlm_do_local_recovery_cleanup(dlm, idx, 1);
+	} else {
+		printk("node %u already added to recovery map!\n", idx);
+	}
+	spin_unlock(&dlm->spinlock);
+}
+
+/* Heartbeat callback: slot "idx" came (back) up.  Refuses to mark a
+ * node alive while it is still being recovered. */
+void dlm_hb_node_up_cb(struct inode *group, struct inode *node, int idx, void *data)
+{
+	dlm_ctxt *dlm = data;
+
+	spin_lock(&dlm->spinlock);
+
+	if (test_bit(idx, dlm->recovery_map))
+		printk("BUG!!! node up message on node in recovery (%u)!!!\n", idx);
+	else if (test_bit(idx, dlm->node_map))
+		printk("node %u already in node map!!!\n", idx);
+	else
+		set_bit(idx, dlm->node_map);
+
+	spin_unlock(&dlm->spinlock);
+}
+
+/* Lockless check: 1 iff "node" is in the recovery map.  Caller must
+ * hold dlm->spinlock. */
+int __dlm_hb_node_dead(dlm_ctxt *dlm, int node)
+{
+	return test_bit(node, dlm->recovery_map) ? 1 : 0;
+}
+
+/* Lockless check: 1 iff "node" is in the live-node map.  Caller must
+ * hold dlm->spinlock. */
+int __dlm_hb_node_up(dlm_ctxt *dlm, int node)
+{
+	return test_bit(node, dlm->node_map) ? 1 : 0;
+}
+
+/* Locked wrapper around __dlm_hb_node_dead(). */
+int dlm_hb_node_dead(dlm_ctxt *dlm, int node)
+{
+	int dead;
+
+	spin_lock(&dlm->spinlock);
+	dead = __dlm_hb_node_dead(dlm, node);
+	spin_unlock(&dlm->spinlock);
+	return dead;
+}
+
+/* Locked wrapper around __dlm_hb_node_up(). */
+int dlm_hb_node_up(dlm_ctxt *dlm, int node)
+{
+	int up;
+
+	spin_lock(&dlm->spinlock);
+	up = __dlm_hb_node_up(dlm, node);
+	spin_unlock(&dlm->spinlock);
+	return up;
+}
+
+/* Elect the node that will remaster the dead node's locks.
+ * Currently a stub: always returns slot 0 and never updates
+ * *new_dead_node.  The intended dlmlock()-based election is
+ * sketched out under #if 0 below. */
+u16 dlm_pick_recovery_master(dlm_ctxt *dlm, u16 *new_dead_node)
+{
+	u16 master = 0;
+#if 0
+	dlm_status ret;
+	dlm_lockstatus lksb;
+
+	ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, 
+		      DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast);
+
+	if (ret == DLM_NORMAL) {
+		// I am master
+		// send message to all nodes saying that I am beginning a recovery session for node XX,
+		//   then call dlmunlock???
+
+	} else if (ret == DLM_NOTQUEUED) {
+		// another node is master
+		// wait on reco.new_master != NM_INVALID_SLOT_NUM
+	} 
+
+	// at this point, every node in this domain should have reco.new_master and .dead_node set, even
+	//   if they have not discovered the dead node on their own
+	//
+	//
+	// atomic_set(&dlm->reco.thread.woken, 0);
+	//     232                 status = util_wait_atomic_eq(&dlm->reco.thread.thread_wq,
+	//         233                                              &dlm->reco.thread.woken,
+	//             234                                              1, DLM_RECOVERY_THREAD_MS);
+	//
+#endif
+	return master;
+}

Added: trunk/cluster/dlmthread.c
===================================================================
--- trunk/cluster/dlmthread.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/dlmthread.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,329 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmthread.c
+ *
+ * standalone DLM module
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+
+
+#include "heartbeat.h"
+#include "nodemanager.h"
+#include "tcp.h"
+#include "dlmmod.h"
+
+extern spinlock_t dlm_domain_lock;
+extern struct list_head dlm_domains;
+extern u16 dlm_global_index;
+
+#define dlm_lock_is_remote(dlm, lock)     ((lock)->node != (dlm)->group_index)
+
+/*
+ * DLM THREAD
+ */
+
+/* Scan one queue for locks whose granted mode conflicts with the
+ * requested mode.  Each newly-blocking lock is appended to bast_list
+ * and its highest_blocked is raised to the mode being blocked, so the
+ * eventual bast reports the strongest request it is holding up.
+ * Caller must hold res->spinlock.
+ * (Fix: the old open-coded loops stored lock->type -- the lock's OWN
+ * granted mode -- into highest_blocked, and on the convert path
+ * compared against target->type instead of the requested
+ * convert_type.) */
+static void dlm_collect_basts(struct list_head *head, dlm_lock *target,
+			      s8 request, struct list_head *bast_list)
+{
+	struct list_head *iter;
+	dlm_lock *lock;
+
+	list_for_each(iter, head) {
+		lock = list_entry(iter, dlm_lock, list);
+		if (lock == target)
+			continue;
+		if (!dlm_lock_compatible(lock->type, request)) {
+			if (lock->highest_blocked == LKM_IVMODE)
+				list_add(&lock->ast_list, bast_list);
+			if (lock->highest_blocked < request)
+				lock->highest_blocked = request;
+		}
+	}
+}
+
+/* Re-run the lock queues of a locally-mastered resource: grant any
+ * convert or blocked request that no longer conflicts (delivering its
+ * ast), and deliver basts to the locks standing in the way.  Takes
+ * res->spinlock for the duration; call with no locks held. */
+void dlm_shuffle_lists(dlm_ctxt *dlm, dlm_lock_resource *res)
+{
+	dlm_lock *lock, *target;
+	struct list_head *iter, *tmpiter;
+	LIST_HEAD(bast_list);
+	s8 hi;
+
+	spin_lock(&res->spinlock);
+
+converting:
+	if (list_empty(&res->converting))
+		goto blocked;
+	target = list_entry(res->converting.next, dlm_lock, list);
+	if (target->convert_type == LKM_IVMODE) {
+		printk("eeek!!! converting a lock with no convert_type!!!!\n");
+		BUG();
+	}
+	/* who blocks this conversion?  check granted and the other
+	 * converting locks against the *requested* mode */
+	dlm_collect_basts(&res->granted, target, target->convert_type, &bast_list);
+	dlm_collect_basts(&res->converting, target, target->convert_type, &bast_list);
+	
+	/* we can convert the lock */
+	if (list_empty(&bast_list)) {
+		spin_lock(&target->spinlock);
+		DLM_ASSERT(target->highest_blocked == LKM_IVMODE);	
+		
+		dlmprintk("calling ast for converting lock: %*s, have: %d, granting: %d, node: %u\n", 
+			  res->lockname.len, res->lockname.name, target->type, target->convert_type, target->node);
+
+		target->type = target->convert_type;
+		target->convert_type = LKM_IVMODE;
+		list_del(&target->list);
+		list_add_tail(&target->list, &res->granted);
+
+		if (target->node == dlm->group_index) {
+			DLM_ASSERT(target->lksb);
+			DLM_ASSERT(target->lksb->status);
+
+			target->lksb->status = DLM_NORMAL;
+		} else {
+			dlmprintk0("nonlocal lock, not setting DLM_NORMAL in lksb\n");
+		}
+
+		spin_unlock(&target->spinlock);
+
+		if (dlm_do_ast(dlm, res, target) < 0)
+			printk("eek\n");
+		/* go back and check for more */
+		goto converting;
+	}
+
+blocked:
+	if (list_empty(&res->blocked)) {
+		goto basts;
+	}
+	target = list_entry(res->blocked.next, dlm_lock, list);
+
+	dlm_collect_basts(&res->granted, target, target->type, &bast_list);
+	dlm_collect_basts(&res->converting, target, target->type, &bast_list);
+	
+	/* we can grant the blocked lock (only 
+	 * possible if converting list empty) */
+	if (list_empty(&bast_list)) {
+		spin_lock(&target->spinlock);
+		DLM_ASSERT(target->highest_blocked == LKM_IVMODE);
+		
+		dlmprintk("calling ast for blocked lock: %*s, granting: %d, node: %u\n", 
+			  res->lockname.len, res->lockname.name, target->type, target->node);
+
+		// target->type is already correct
+		list_del(&target->list);
+		list_add_tail(&target->list, &res->granted);
+
+		if (target->node == dlm->group_index) {
+			DLM_ASSERT(target->lksb);
+			DLM_ASSERT(target->lksb->status);
+		
+			target->lksb->status = DLM_NORMAL;
+		} else {
+			dlmprintk0("nonlocal lock, not setting DLM_NORMAL in lksb\n");
+		}
+		
+		spin_unlock(&target->spinlock);
+
+		if (dlm_do_ast(dlm, res, target) < 0)
+			printk("eek\n");
+		/* go back and check for more */
+		goto converting;
+	}
+
+basts:
+	/* deliver one bast per blocking lock, at the highest mode it
+	 * blocked, then clear its pending state */
+	list_for_each_safe(iter, tmpiter, &bast_list) {
+		lock = list_entry(iter, dlm_lock, ast_list);
+		spin_lock(&lock->spinlock);
+		DLM_ASSERT(lock->highest_blocked > LKM_IVMODE);
+		hi = lock->highest_blocked;
+		lock->highest_blocked = LKM_IVMODE;
+		list_del(&lock->ast_list);
+		spin_unlock(&lock->spinlock);
+
+		if (dlm_do_bast(dlm, res, lock, hi) < 0)
+			printk("eeek\n");
+	}
+	spin_unlock(&res->spinlock);
+}
+
+
+/* must have NO locks when calling this */
+/* Optionally queue "res" on the domain's dirty list (if it is not
+ * already dirty), then poke the dlm worker thread awake. */
+void dlm_kick_thread(dlm_ctxt *dlm, dlm_lock_resource *res)
+{
+	if (res != NULL) {
+		spin_lock(&dlm->spinlock);
+		spin_lock(&res->spinlock);
+		if (!(res->state & DLM_LOCK_RES_DIRTY)) {
+			res->state |= DLM_LOCK_RES_DIRTY;
+			list_add_tail(&res->dirty, &dlm->dirty_list);
+		}
+		spin_unlock(&res->spinlock);
+		spin_unlock(&dlm->spinlock);
+	}
+
+	/* wake the dlm thread */
+	atomic_set(&dlm->thread.woken, 1);
+	wake_up(&dlm->thread.thread_wq);
+}
+
+/* Launch the NM thread for the mounted volume */
+/* Spawn the per-domain dlm worker thread.  Returns 0 on success or
+ * -EINVAL if kernel_thread() failed (its errno is logged, not
+ * propagated). */
+int dlm_launch_thread(dlm_ctxt *dlm)
+{
+	printk("starting dlm thread...\n");
+	dlm->thread.pid = kernel_thread (dlm_thread, dlm, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+	if (dlm->thread.pid < 0) {
+		/* fix: message was missing its trailing newline */
+		printk("unable to launch dlm thread, error=%d\n", dlm->thread.pid);
+		return -EINVAL;
+	}
+	printk("dlm thread running for %s...\n", dlm->name);
+	return 0;
+}
+
+/* Tear down the dlm worker thread: interrupt its wait with SIGINT and
+ * block until it signals completion, then clear the task pointer. */
+void dlm_complete_thread(dlm_ctxt *dlm)
+{
+	printk ("waiting for dlm thread to exit....");
+	/* SIGINT makes util_wait_atomic_eq() in the thread return an
+	 * error, which breaks its main loop */
+	send_sig (SIGINT, dlm->thread.task, 0);
+	wait_for_completion (&dlm->thread.complete);
+	printk ("dlm thread exited\n");
+	dlm->thread.task = NULL;
+}
+
+
+
+
+/* Main dlm worker: drains the dirty list, reshuffling the queues of
+ * each locally-mastered resource, then sleeps until kicked or until
+ * DLM_THREAD_MS elapses.  Exits when the wait is interrupted by a
+ * signal (see dlm_complete_thread).
+ * NOTE(review): resources NOT mastered locally are skipped with
+ * "continue" but remain on dirty_list with DLM_LOCK_RES_DIRTY set,
+ * so they are rescanned on every pass -- confirm that is intended. */
+int dlm_thread(void *data)
+{
+	int status;
+	struct list_head *iter, *tmpiter;
+	dlm_lock_resource *res;
+	dlm_ctxt *dlm = data;
+
+	util_daemonize ("dlm_thread", strlen("dlm_thread"), 1);
+	dlm->thread.task = current;
+
+	while (1) {
+		/* reader side: recovery takes this for writing to
+		 * freeze reshuffling while it runs */
+		down_read(&dlm->recovery_sem);
+		spin_lock(&dlm->spinlock);
+		list_for_each_safe(iter, tmpiter, &dlm->dirty_list) {
+			res = list_entry(iter, dlm_lock_resource, dirty);
+			/* don't shuffle secondary queues */
+			if (res->owner != dlm->group_index)
+				continue;
+			dlm_shuffle_lists(dlm, res);
+			spin_lock(&res->spinlock);
+			list_del(&res->dirty);
+			res->state &= ~DLM_LOCK_RES_DIRTY;
+			spin_unlock(&res->spinlock);
+		}
+		spin_unlock(&dlm->spinlock);
+		up_read(&dlm->recovery_sem);
+			
+		atomic_set(&dlm->thread.woken, 0);
+		status = util_wait_atomic_eq(&dlm->thread.thread_wq, 
+					     &dlm->thread.woken, 
+					     1, DLM_THREAD_MS);
+
+		if (status == 0 || status == -ETIMEDOUT) {
+#if 0
+			if (atomic_read(&dlm->thread.woken))
+				printk("aha!!! dlm thread woken!\n");
+			else 
+				printk("timed out waiting, running again\n");
+#endif
+			continue;
+		}
+	
+		/* any other status means a signal: shut down */
+		printk("DLM thread got %d while waiting\n", status);
+		break;
+	}
+
+	flush_scheduled_work();
+	complete (&dlm->thread.complete);
+	printk("quitting DLM thread!!!!!!\n");
+	return 0;
+}

Added: trunk/cluster/heartbeat.c
===================================================================
--- trunk/cluster/heartbeat.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/heartbeat.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,869 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.c
+ *
+ * Keeps track of alive nodes in the cluster.
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+
+#include "warning_hack.h"
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/module.h>
+
+#include <linux/linkage.h>
+#include <linux/time.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/net.h>
+#include <linux/unistd.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+
+#include <asm/uaccess.h>
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+
+#include "compat_libfs.h"
+
+#ifndef __user
+#define __user
+#endif
+
+
+static void hb_teardown(void);
+static void hb_nm_group_node_add_cb(void *ptr1, void *ptr2, u16 idx);
+static void hb_nm_group_node_del_cb(void *ptr1, void *ptr2, u16 idx);
+static void hb_nm_node_add_cb(void *ptr1, void *ptr2, u16 idx);
+static void hb_nm_group_add_cb(void *ptr1, void *ptr2, u16 idx);
+static int hb_init_disk_hb_group(struct inode *group, kdev_t dev, u32 bits, u32 blocks, u64 start);
+static ssize_t write_disk(struct file *file, char *buf, size_t size);
+static void hb_do_callbacks(int type, void *ptr1, void *ptr2, int idx);
+static void hb_end_buffer_io_sync(struct buffer_head *bh, int uptodate);
+static int hb_do_node_down(struct inode *group, struct inode *node, int idx);
+static int hb_do_node_up(struct inode *group, struct inode *node, int idx);
+static int hb_do_disk_heartbeat(void *page);
+static int hb_thread(void *data);
+static void hb_complete_thread(void);
+static void hb_kick_thread(void);
+static int hb_launch_thread(void);
+static inline int hb_wait_on_callback_state(int type);
+
+
+
+/* globals */
+extern char *nm_nodename;
+static spinlock_t hb_lock = SPIN_LOCK_UNLOCKED;
+static LIST_HEAD(hb_net_groups);
+static LIST_HEAD(hb_disk_groups);
+static int hb_callback_state[HB_NUM_CB];
+struct list_head hb_callbacks[HB_NUM_CB];
+static spinlock_t hb_cb_lock = SPIN_LOCK_UNLOCKED;
+static struct task_struct *hb_task = NULL;
+static atomic_t hb_thread_woken = ATOMIC_INIT(0);
+static DECLARE_WAIT_QUEUE_HEAD(hb_thread_wait_queue);
+static struct completion hb_complete;
+static int hb_pid = -1;
+
+static wait_queue_head_t hb_cb_wq;
+static atomic_t hb_cb_ready = ATOMIC_INIT(0);
+
+
+/* b_end_io completion for heartbeat block I/O: mirror the I/O result
+ * into the buffer's uptodate flag and unlock it so that
+ * wait_on_buffer() callers in hb_do_disk_heartbeat() proceed. */
+static void hb_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+{
+	if (uptodate)
+		set_buffer_uptodate(bh);
+	else {
+		printk("eek!  EIO!\n");
+		clear_buffer_uptodate(bh);
+	}
+	unlock_buffer(bh);
+}
+
+
+
+/* Handle a node that stopped heartbeating: run the registered
+ * HB_NODE_DOWN_CB callbacks.  Actually removing the node from the
+ * group is intentionally disabled (see commented-out call).
+ * NOTE(review): 'ret' is declared but never assigned or used while the
+ * nm_remove_node_from_group() call is commented out. */
+static int hb_do_node_down(struct inode *group, struct inode *node, int idx)
+{
+	int ret;
+	printk("hb_do_node_down: group=%lu, node=%lu\n", group->i_ino, node->i_ino);
+	printk("NOT removing node from group\n");
+	//ret = nm_remove_node_from_group(group, node);
+	hb_do_callbacks(HB_NODE_DOWN_CB, group, node, 0);
+	return 0;
+}
+
+/* Handle a node whose heartbeat was seen for the first time: run the
+ * registered HB_NODE_UP_CB callbacks.  Always returns 0. */
+static int hb_do_node_up(struct inode *group, struct inode *node, int idx)
+{
+	printk("hb_do_node_up: group=%lu, node=%lu\n", group->i_ino, node->i_ino);
+	hb_do_callbacks(HB_NODE_UP_CB, group, node, 0);
+	return 0;
+}
+
+/* Debug wrapper around submit_bh(): log direction, block number and
+ * mapped state before queueing the I/O. */
+static inline void hb_submit_bh(int rw, struct buffer_head *bh)
+{
+	printk("submit_bh: rw=%s, blocknr=%lu, mapped=%s\n",
+	       rw==WRITE?"write":"read", bh->b_blocknr, 
+	       buffer_mapped(bh) ? "yes" : "no");
+	submit_bh(rw, bh);
+}
+
+
+/* One disk-heartbeat pass over every registered disk group.
+ *
+ * For each group: write our own timestamp block, read every other
+ * node's block, then compare each node's timestamp against the value
+ * cached in its slot.  A changed timestamp refreshes the node's miss
+ * margin (and marks a first-time node as up); an unchanged one
+ * decrements the margin, and a node whose margin reaches zero is
+ * declared dead.  Node-up/down processing happens after i_sem is
+ * dropped, via hb_do_node_up()/hb_do_node_down().
+ *
+ * 'page' is one scratch page, split into two parallel arrays of inode
+ * pointers indexed by global node number (dead and live nodes).
+ * NOTE(review): the arrays are sized 256 entries each; on a 64-bit
+ * machine 2 * 256 * sizeof(void *) exactly fills a 4K page -- verify
+ * this still fits if NM_MAX_NODES or PAGE_SIZE changes. */
+static int hb_do_disk_heartbeat(void *page)
+{
+	nm_group_inode_private *priv;
+	struct inode *group, *node;
+	struct list_head *iter;
+	struct buffer_head *bh;
+	hb_disk_slot *slot;
+	hb_disk_heartbeat_block *hb_block;
+	int ino, idx, ret, i;
+	struct inode **dead_nodes, **live_nodes;
+	LIST_HEAD(tmplist);
+	u64 blkno;
+	cluster_disk *disk;
+
+	// NM_MAX_NODES is 255
+	dead_nodes = page;
+	live_nodes = page + (sizeof(struct inode *) * 256);
+	
+	/* take the group list private so hb_lock isn't held during I/O */
+	spin_lock(&hb_lock);
+	list_splice_init(&hb_disk_groups, &tmplist);
+	spin_unlock(&hb_lock);
+
+	list_for_each(iter, &tmplist) {
+		priv = list_entry(iter, nm_group_inode_private, disk_list);
+		group = priv->inode;
+		disk = &priv->disk;
+
+		/* clear both node arrays for this group */
+		memset(page, 0, PAGE_SIZE);
+		down(&group->i_sem);
+
+		/* pass 1: queue I/O for every slot -- write our own
+		 * block, read everyone else's */
+		idx = 0;
+		while ((slot = nm_iterate_group_disk_slots(group, &idx))) {
+retry_submit:
+			bh = slot->bh;
+			node = slot->inode;
+
+			ino = nm_get_node_global_index(node);
+
+			if (ino == nm_this_node(group)) {
+				lock_buffer(bh);
+				if (!buffer_mapped(bh)) {
+					/* buffer lost its mapping; re-get
+					 * the block and retry this slot */
+					blkno = (unsigned long long) bh->b_blocknr;
+					unlock_buffer(bh);
+					brelse(bh);
+					slot->bh = getblk(disk->dev,
+							  blkno,
+							  (1 << disk->blocksize_bits));
+					goto retry_submit;
+				}
+				memset(bh->b_data, 0, bh->b_size);
+				hb_block = (hb_disk_heartbeat_block *)bh->b_data;
+				hb_block->time = CURRENT_TIME;
+				/* 0 is reserved as "never heartbeated" */
+				if (!hb_block->time)
+					hb_block->time = 1;
+				set_buffer_uptodate(bh);
+				clear_buffer_dirty(bh);
+				bh->b_end_io = hb_end_buffer_io_sync;
+				hb_submit_bh(WRITE, bh);
+			} else {
+				lock_buffer(bh);
+				if (!buffer_mapped(bh)) {
+					blkno = (unsigned long long) bh->b_blocknr;
+					unlock_buffer(bh);
+					brelse(bh);
+					slot->bh = getblk(disk->dev,
+							  blkno,
+							  (1 << disk->blocksize_bits));
+					goto retry_submit;
+				}
+				/* force a fresh read from disk */
+				clear_buffer_uptodate(bh);
+				bh->b_end_io = hb_end_buffer_io_sync;
+				hb_submit_bh(READ, bh);
+			}
+			idx++;
+		}
+	
+		/* pass 2: wait for the I/O and evaluate each node */
+		idx = 0;
+		while ((slot = nm_iterate_group_disk_slots(group, &idx))) {
+			bh = slot->bh;
+			node = slot->inode;
+
+			ino = nm_get_node_global_index(node);
+
+			wait_on_buffer(bh);
+			hb_block = (hb_disk_heartbeat_block *)bh->b_data;
+			if (hb_block->time != slot->last_time) {
+				if (slot->state == HB_NODE_STATE_INIT) {
+					printk("first time for this node!\n");
+					live_nodes[ino] = node;
+					slot->state = HB_NODE_STATE_UP;
+				}
+				node->i_atime = hb_block->time;
+				slot->last_time = hb_block->time;
+				slot->margin = HB_DISK_MARGIN;
+				hb_do_callbacks(HB_NODE_RESPONDED_CB, group, node, HB_TYPE_DISK);
+			} else {
+				slot->margin--;
+				printk("node %d missed.  margin=%d\n", ino, slot->margin);
+			}
+
+			if (ino != nm_this_node(group) && slot->margin <= 0) {
+				printk("node %d JUST DIED!!!!\n", ino);
+				dead_nodes[ino] = node;
+				slot->state = HB_NODE_STATE_DOWN;
+			}
+			idx++;
+		}
+
+		up(&group->i_sem);
+
+		/* Don't hold group i_sem while doing node-up/down.
+		 * Changes may need to be made to the group, so the
+		 * callbacks will want to take i_sem themselves... */
+		for (i=0; i<NM_MAX_NODES; i++) {
+			if (live_nodes[i])
+				ret = hb_do_node_up(group, live_nodes[i], i);
+			else if (dead_nodes[i])
+				ret = hb_do_node_down(group, dead_nodes[i], i);
+		}
+	}
+	
+	/* put the groups back on the global list */
+	spin_lock(&hb_lock);
+	list_splice(&tmplist, &hb_disk_groups);
+	spin_unlock(&hb_lock);
+	return 0;
+}
+
+
+/* Main loop of the heartbeat kernel thread: run a disk-heartbeat pass,
+ * then sleep until hb_kick_thread() wakes it or HB_THREAD_MS elapses.
+ * Any other wait status (e.g. -EINTR from hb_complete_thread()'s
+ * SIGINT) ends the loop; the thread then signals hb_complete and
+ * exits.  Uses one scratch page for hb_do_disk_heartbeat()'s
+ * dead/live node arrays.
+ * NOTE(review): the page is never freed on exit -- presumably leaked;
+ * also the return value of hb_do_disk_heartbeat() is immediately
+ * overwritten by the wait status. */
+static int hb_thread(void *data)
+{
+	int status;
+	void *page;
+	
+	page = (void *) __get_free_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	util_daemonize ("hb_thread", strlen("hb_thread"), 1);
+	hb_task = current;
+
+	while (1) {
+		status = hb_do_disk_heartbeat(page);
+
+		atomic_set(&hb_thread_woken, 0);
+		status = util_wait_atomic_eq(&hb_thread_wait_queue, 
+					     &hb_thread_woken, 
+					     1, HB_THREAD_MS);
+
+		if (status == 0 || status == -ETIMEDOUT) {
+#if 0
+			if (atomic_read(&hb_thread_woken))
+				printk("aha!!! hb thread woken!\n");
+			else 
+				printk("hb thread timed out waiting, running again\n");
+#endif
+			continue;
+		}
+		printk("hb thread got %d while waiting\n", status);
+		break;
+	}
+
+	flush_scheduled_work();
+	complete (&hb_complete);
+	printk("quitting hb thread!!!!!!\n");
+	return 0;
+}
+
+
+/* Wake the heartbeat thread immediately instead of waiting for its
+ * periodic HB_THREAD_MS timeout. */
+static void hb_kick_thread(void)
+{
+	atomic_set(&hb_thread_woken, 1);
+	wake_up(&hb_thread_wait_queue);
+}
+
+/* Launch the hb thread for the mounted volume */
+static int hb_launch_thread(void)
+{
+	hb_pid = -1;
+	hb_task = NULL;
+	init_completion (&hb_complete);
+
+	printk("starting hb thread...\n");
+	hb_pid = kernel_thread (hb_thread, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+	if (hb_pid < 0) {
+		printk("unable to launch hb thread, error=%d", hb_pid);
+		return -EINVAL;
+	}
+	printk("hb thread running...\n");
+	return 0;
+}
+
+/* Ask the heartbeat thread to exit (via SIGINT, which its wait loop
+ * treats as a quit request) and wait until it signals hb_complete.
+ * NOTE(review): assumes hb_task was set by hb_thread() before this
+ * runs; if the thread never started, send_sig() gets a NULL task. */
+static void hb_complete_thread(void)
+{
+	printk ("waiting for hb thread to exit....");
+	send_sig (SIGINT, hb_task, 0);
+	wait_for_completion (&hb_complete);
+	printk ("hb thread exited\n");
+	hb_task = NULL;
+}
+
+
+
+
+
+
+
+/* Initialise disk heartbeating for a group: record the device,
+ * blocksize, block count (clamped to NM_MAX_NODES) and starting block
+ * in the group's cluster_disk, set up the slot array, and mark the
+ * group ready for node additions.  Idempotent: returns 0 immediately
+ * if the group is already NM_GROUP_READY.
+ * NOTE(review): on the !priv path we goto leave with ret < 0 and call
+ * iput(group) even though igrab() has not been taken yet -- that iput
+ * looks unbalanced against the caller's reference; confirm. */
+static int hb_init_disk_hb_group(struct inode *group, kdev_t dev, u32 bits, u32 blocks, u64 start)
+{
+	int ret = -EINVAL;
+	cluster_disk *disk;
+	nm_group_inode_private *priv;
+
+	priv = group->u.generic_ip;
+	if (!priv)
+		goto leave;
+
+	if (priv->state == NM_GROUP_READY)
+		return 0;
+
+	/* hold an extra ref as long as hb keeps track of the group */
+	igrab(group);
+
+	disk = &priv->disk;
+	if (blocks > NM_MAX_NODES)
+	       blocks = NM_MAX_NODES;
+	disk->dev = dev;
+	disk->blocksize_bits = bits;
+	disk->num_blocks = blocks;
+	disk->start_block = start;
+	util_init_rarray(&disk->slots, sizeof(hb_disk_slot));
+
+	/* start allowing group additions */
+	ret = nm_make_group_ready(group);
+
+leave:
+	if (ret < 0)
+		iput(group);
+
+	return ret;
+}
+	
+
+/* Transaction-file write handler for the hb filesystem's ".disk" file.
+ *
+ * 'buf' carries an hb_op request and is reused for the reply text
+ * (written back as "<errno>: message", or "0: ..." plus a binary node
+ * map for HB_OP_GET_NODE_MAP).  Supported opcodes:
+ *   HB_OP_START_DISK_HEARTBEAT - validate blocksize bits (9..12),
+ *     group number, disk uuid and fd, then start disk heartbeat via
+ *     hb_init_disk_hb_group().
+ *   HB_OP_GET_NODE_MAP - copy the group's live-node bitmap after the
+ *     "0: " prefix.
+ * Returns the number of reply bytes placed in buf, or -EINVAL for a
+ * short or badly-tagged request.
+ * NOTE(review): on the successful START_DISK_HEARTBEAT path the filp
+ * from fget() is never fput() nor stored anywhere visible here --
+ * looks like a file reference leak; confirm against the rest of the
+ * module. */
+static ssize_t write_disk(struct file *file, char *buf, size_t size)
+{
+	hb_op *data;
+	struct inode *group = NULL;
+	struct file *filp = NULL;
+	kdev_t dev;
+	int ret, tmpret;
+	nm_group_inode_private *priv;
+	u32 tmpmap[8];
+	
+	printk("write_disk\n");
+
+        if (size < sizeof(*data))
+                return -EINVAL;
+	data = (hb_op *) buf; if (data->magic != HB_OP_MAGIC)
+		return -EINVAL;
+
+	switch (data->opcode)
+	{
+		case HB_OP_START_DISK_HEARTBEAT:
+			if (data->bits < 9 || data->bits > 12) {
+				ret = sprintf(buf, "%d: bad blocksize bits! %u", -EINVAL, data->bits);
+				break;
+			}
+			group = nm_get_group_by_num(data->group_num);
+			if (!group || !group->u.generic_ip) {
+				ret = sprintf(buf, "%d: bad group number! %u", -EINVAL, data->group_num);
+				break;
+			}
+			priv = group->u.generic_ip;
+			if (strncmp(priv->disk.uuid, data->disk_uuid, CLUSTER_DISK_UUID_LEN) != 0) {
+				ret = sprintf(buf, "%d: bad disk uuid!", -EINVAL);
+				break;
+			}
+			filp = fget(data->fd);
+			if (!filp) {
+				ret = sprintf(buf, "%d: bad fd!", -EINVAL);
+				break;
+			}
+			/* heartbeat region lives on the device backing
+			 * the caller-supplied fd */
+			dev = filp->f_dentry->d_inode->i_rdev;
+			tmpret = hb_init_disk_hb_group(group, dev, data->bits, data->blocks, data->start);
+			if (tmpret < 0) {
+				fput(filp);
+				ret = sprintf(buf, "%d: failed to init disk heartbeat for group %u!", 
+					      -EINVAL, data->group_num);
+			} else {
+				ret = sprintf(buf, "0: disk heartbeat started for group %u!", 
+					      data->group_num);
+			}
+			break;
+
+		case HB_OP_GET_NODE_MAP:
+			group = nm_get_group_by_num(data->group_num);
+			if (!group || !group->u.generic_ip) {
+				ret = sprintf(buf, "%d: bad group number! %u", -EINVAL, data->group_num);
+				break;
+			}
+			
+			/* reply is "0: " followed by the raw bitmap */
+			if ((ret = hb_fill_node_map(group, tmpmap, sizeof(tmpmap))) == 0) {
+				ret = sprintf(buf, "0: ");
+				buf += ret;
+				memcpy(buf, tmpmap, sizeof(tmpmap));
+				ret += sizeof(tmpmap);
+			} else {
+				ret = sprintf(buf, "%d: error occurred in hb_fill_node_map", ret);
+			}
+			break;
+
+		default:
+			ret = sprintf(buf, "%d: bad opcode! %u", -EINVAL, data->opcode);
+			break;
+	}
+
+	if (group)
+		iput(group);
+	
+	return ret;
+}
+
+
+extern struct file_operations transaction_ops;
+
+/*----------------------------------------------------------------------------*/
+/*
+ *	populating the filesystem.
+ */
+/* Populate the hb pseudo-filesystem: one ".disk" transaction file
+ * whose writes are routed (via compat_libfs's transaction_ops) to
+ * write_disk().  The TA_write_ops table is attached to the superblock
+ * on success and freed on failure.  Returns simple_fill_super()'s
+ * result.
+ * NOTE(review): ops is sized for a single write op but num_ops is set
+ * to HB_WriteOpArraySize -- fine while HB_Disk is the only op, but
+ * fragile if more are added; confirm. */
+static int hb_fill_super(struct super_block * sb, void * data, int silent)
+{
+	int ret;
+	TA_write_ops *ops;
+	static struct tree_descr hb_files[] = {
+		[HB_Disk] = {".disk", &transaction_ops, S_IWUSR},
+		/* last one */ {""}
+	};
+	
+	ops = kmalloc(sizeof(TA_write_ops) + (1 * sizeof(TA_write_op *)), GFP_KERNEL);
+	if (!ops)
+		return -ENOMEM;
+
+	memset(ops, 0, sizeof(TA_write_ops) + (1 * sizeof(TA_write_op *)));
+	ops->num_ops = HB_WriteOpArraySize;
+	ops->write_op[HB_Disk] = write_disk;
+
+	printk("calling simple_fill_super...\n");
+	ret = simple_fill_super(sb, 0x5551212f, hb_files);
+	if (ret >= 0)
+		TA_GENERIC_SB_MEMBER(sb) = ops;
+	else 
+		kfree(ops);
+	return ret;
+}
+
+/* 2.4-style read_super entry point: adapt hb_fill_super()'s int
+ * return to the struct super_block * convention (NULL on failure). */
+static struct super_block *hb_read_super (struct super_block *sb, void *data, int silent)
+{
+	printk("welcome to hb_read_super!!!\n");
+	return (hb_fill_super(sb, data, silent) < 0) ? NULL : sb;
+}
+
+
+static DECLARE_FSTYPE (hb_fs_type, "hb", hb_read_super, FS_SINGLE|FS_LITTER);
+
+
+/* TODO: make callbacks all return int */
+static void hb_nm_group_node_add_cb(void *ptr1, void *ptr2, u16 idx)
+{
+	hb_disk_slot *slot;
+	struct inode *group = ptr1;
+	struct inode *node = ptr2;
+	cluster_disk *disk;
+	nm_group_inode_private *priv;
+	int ino, ret = 0;
+	u64 block;
+
+	printk("hb_nm_group_node_add_cb: group=%lu, node=%lu, idx=%u\n",
+	       group->i_ino, node->i_ino, idx);
+
+	down(&group->i_sem);	
+	priv = group->u.generic_ip;
+	if (!priv) {
+		printk("eek! bad group inode!\n");
+		goto leave;
+	}
+	disk = &priv->disk;
+	if (disk->uuid[0]) {
+		ret = util_resize_rarray(&disk->slots, idx+1);
+		if (ret < 0) {
+			printk("eeeeeeek!!!! failed to resize disk state data\n");
+			goto leave;
+		}
+	
+		ino = nm_get_node_global_index(node);
+		if (ino > disk->num_blocks) {
+			printk("disk heartbeat area does not have enough blocks!\n");
+			goto leave;
+		}
+		block = ino + disk->start_block;
+	
+		slot = util_rarray_idx_to_slot(&disk->slots, idx);
+		if (!slot) {
+			printk("eeeeeeek!!!! failed to get disk state data pointer: %d\n", idx);
+			goto leave;
+		}
+		slot->inode = igrab(node);
+		slot->last_time = 0;
+		slot->margin = HB_INITIAL_DISK_MARGIN;
+#warning needs to change for 2.6
+		slot->bh = getblk(disk->dev, (int)block, (1 << disk->blocksize_bits));
+		slot->state = HB_NODE_STATE_INIT;
+	} else {
+		printk("doing nothing for group add for non-disk heartbeat group\n");
+	}
+	
+leave:
+	up(&group->i_sem);
+	return;	
+}
+
+/* nodemanager callback: a node was removed from a group.  Undo
+ * hb_nm_group_node_add_cb(): drop the slot's inode reference, wait
+ * for and release its buffer_head, and zero the slot.  Errors are
+ * logged and swallowed (callback returns void). */
+static void hb_nm_group_node_del_cb(void *ptr1, void *ptr2, u16 idx)
+{
+	hb_disk_slot *slot;
+	struct inode *group = ptr1;
+	struct inode *node = ptr2;
+	cluster_disk *disk;
+	nm_group_inode_private *priv;
+	int ret = -EINVAL;
+
+	printk("hb_nm_group_node_del_cb: group=%lu, node=%lu, idx=%u\n",
+	       group->i_ino, node->i_ino, idx);
+
+	down(&group->i_sem);
+	priv = group->u.generic_ip;
+	if (!priv) {
+		printk("eek! bad group inode!\n");
+		goto leave;
+	}
+	disk = &priv->disk;
+	slot = util_rarray_idx_to_slot(&disk->slots, idx);
+	if (!slot) {
+		printk("eeeeeeek!!!! failed to get disk state data pointer: %d\n", idx);
+		goto leave;
+	}
+	/* sanity: the slot must still refer to the node being removed */
+	if (slot->inode!=node) {
+		printk("eeeeeeek!!!! node inode changed!\n");
+		goto leave;
+	}
+	iput(node);
+	if (slot->bh) {
+		/* let any in-flight heartbeat I/O finish first */
+		wait_on_buffer(slot->bh);
+		brelse(slot->bh);
+	}
+	memset(slot, 0, sizeof(hb_disk_slot));
+	ret = 0;
+leave:
+
+	up(&group->i_sem);
+	printk("hb_nm_group_node_del_cb done: %d\n", ret);
+	return;
+}
+
+/* nodemanager callback: a node was added cluster-wide.  Currently a
+ * no-op placeholder. */
+static void hb_nm_node_add_cb(void *ptr1, void *ptr2, u16 idx)
+{
+	//struct inode *node = ptr1;
+}
+
+/* nodemanager callback: a new group was created.  Register it with
+ * heartbeat: every group goes on the net list; groups with a disk
+ * uuid also go on the disk list polled by hb_do_disk_heartbeat(). */
+static void hb_nm_group_add_cb(void *ptr1, void *ptr2, u16 idx)
+{
+	struct inode *group = ptr1;
+	nm_group_inode_private *priv;
+
+	printk("hb_nm_group_add_cb: group=%lu, idx=%u\n",
+	       group->i_ino, idx);
+	
+	priv = group->u.generic_ip;
+	if (!priv) {
+		printk("eek! bad group inode!\n");
+		return;
+	}
+
+	spin_lock(&hb_lock);
+	list_add_tail(&priv->net_list, &hb_net_groups);
+	if (priv->disk.uuid[0]) {
+		printk("adding priv=%p inode=%p to disk group list\n", priv, group);
+		list_add_tail(&priv->disk_list, &hb_disk_groups);
+	}
+	spin_unlock(&hb_lock);
+}
+
+enum {
+	HB_CB_STATE_FROZEN = 0,
+	HB_CB_STATE_READY
+};
+
+/* Module init: create the proc dir, initialise the callback lists and
+ * wait queue, hook into nodemanager's add/del callbacks, launch the
+ * heartbeat thread, and register the "hb" filesystem.  On any failure
+ * hb_teardown() undoes the registrations.
+ * NOTE(review): failures before register_filesystem() return the
+ * initial -1 rather than a real errno; also a launched hb thread is
+ * not stopped if register_filesystem() then fails -- confirm. */
+static int __init init_hb(void)
+{
+	int retval=-1, i;
+	printk("loading heartbeat module: nodename is %s\n", nm_nodename);
+
+	if (proc_mkdir("cluster/heartbeat", 0)) {
+		// ???
+	}
+
+	//hb_net_timestamps = __get_free_page(GFP_KERNEL);
+	//if (!hb_net_timestamps)
+	//	goto done;
+
+	for (i=HB_NODE_DOWN_CB; i<HB_NUM_CB; i++)
+		INIT_LIST_HEAD(&hb_callbacks[i]);
+	init_waitqueue_head(&hb_cb_wq);
+	for (i=HB_NODE_DOWN_CB; i<HB_NUM_CB; i++)
+		hb_callback_state[i] = HB_CB_STATE_READY;
+
+	if (nm_register_callback(NM_GROUP_NODE_DEL_CB, hb_nm_group_node_del_cb))
+		goto done;
+	if (nm_register_callback(NM_GROUP_NODE_ADD_CB, hb_nm_group_node_add_cb))
+		goto done;
+	if (nm_register_callback(NM_NODE_ADD_CB, hb_nm_node_add_cb))
+		goto done;
+	if (nm_register_callback(NM_GROUP_ADD_CB, hb_nm_group_add_cb))
+		goto done;
+ 
+	if (hb_launch_thread() < 0)
+		goto done;
+ 
+	retval = register_filesystem(&hb_fs_type);
+done:
+	if (retval)
+		hb_teardown();
+	return retval;
+}
+
+/* Module exit: freeze every callback type so no new hb callbacks can
+ * start, stop the heartbeat thread, unhook from nodemanager, and
+ * unregister the filesystem.
+ * NOTE(review): hb_wait_on_callback_state() can return -EINTR, which
+ * is ignored here -- the type would then be frozen while a callback
+ * may still be running; confirm acceptable at unload time. */
+static void __exit exit_hb(void)
+{
+	int i;
+	spin_lock(&hb_cb_lock);
+	for (i=HB_NODE_DOWN_CB; i<HB_NUM_CB; i++) {
+		hb_wait_on_callback_state(i);
+		hb_callback_state[i] = HB_CB_STATE_FROZEN;
+	}
+	spin_unlock(&hb_cb_lock);
+
+	hb_complete_thread();
+	hb_teardown();
+	unregister_filesystem(&hb_fs_type);
+	printk("unloading heartbeat module\n");
+}
+
+/* Undo init_hb()'s registrations: detach from nodemanager callbacks
+ * and remove the proc entry.  Safe to call on a partially-initialised
+ * module (unregistering a never-registered callback is assumed to be
+ * tolerated by nm_unregister_callback). */
+static void hb_teardown(void)
+{
+	nm_unregister_callback(NM_GROUP_NODE_DEL_CB, hb_nm_group_node_del_cb);
+	nm_unregister_callback(NM_GROUP_NODE_ADD_CB, hb_nm_group_node_add_cb);
+	nm_unregister_callback(NM_NODE_ADD_CB, hb_nm_node_add_cb);
+	nm_unregister_callback(NM_GROUP_ADD_CB, hb_nm_group_add_cb);
+	remove_proc_entry("cluster/heartbeat", NULL);
+	//if (hb_net_timestamps)
+	//	kfree(hb_net_timestamps);
+}
+
+module_init(init_hb)
+module_exit(exit_hb)
+
+
+/* Fill 'map' with a bitmap of live nodes in 'group': for a disk
+ * heartbeat group, set one bit per slot whose state is
+ * HB_NODE_STATE_UP; otherwise copy the group's slot bitmap directly.
+ * 'size' is the buffer size in bytes.  Returns 0, or -EINVAL if an
+ * index would overflow the buffer.
+ * NOTE(review): the bound check compares a node index against
+ * 'size-1' bytes, but set_bit() indexes bits -- a byte-sized buffer
+ * holds 8x that many bits, so the check looks overly strict (or the
+ * units are confused); confirm intended semantics. */
+int hb_fill_node_map(struct inode *group, void *map, int size)
+{
+	hb_disk_slot *slot;
+	int idx = 0;
+	nm_group_inode_private *priv;
+	
+	priv = group->u.generic_ip;
+
+	memset(map, 0, size);
+	down(&group->i_sem);
+
+	if (priv->disk.uuid[0]) {
+		while ((slot = nm_iterate_group_disk_slots(group, &idx))) {
+			if (idx >= size-1) {
+				printk("map size (%d) too small for idx (%d)\n",
+			       	size, idx);
+				up(&group->i_sem);
+				return -EINVAL;
+			}
+			if (slot->state == HB_NODE_STATE_UP)
+				set_bit(idx, map);
+			idx++;
+		}
+	} else {
+		printk("filling straight from slot bitmap for non-disk heartbeat group\n");
+		memcpy(map, priv->slot_bitmap, size);
+	}
+
+	up(&group->i_sem);
+
+	return 0;
+}
+		
+
+/* Wait until callback type 'type' leaves HB_CB_STATE_FROZEN.
+ * Must be entered with hb_cb_lock held; the lock is dropped while
+ * sleeping and reacquired before rechecking, so it is held again on
+ * both return paths EXCEPT the -EINTR one, where the lock is NOT
+ * held -- callers on that path must not unlock... actually note:
+ * existing callers do spin_unlock after -EINTR; verify against
+ * util_wait_atomic_eq's contract.  Returns 0 (lock held, state not
+ * frozen) or -EINTR (interrupted). */
+static inline int hb_wait_on_callback_state(int type)
+{
+	while (hb_callback_state[type] == HB_CB_STATE_FROZEN) {
+		spin_unlock(&hb_cb_lock);
+		atomic_set(&hb_cb_ready, 0);
+		if (util_wait_atomic_eq(&hb_cb_wq, &hb_cb_ready, 1, 0) == -EINTR) {
+			return -EINTR;
+		}
+		spin_lock(&hb_cb_lock);
+	}
+	return 0;
+}
+
+/* Register 'func' for heartbeat events of 'type', ordered by
+ * ascending 'priority' (inserted before the first existing entry with
+ * a higher priority, appended otherwise).  'data' is passed back to
+ * the callback.  Returns 0, -EINVAL for a bad type, -ENOMEM on
+ * allocation failure, or -EINTR if interrupted while the type was
+ * frozen by a concurrent unregister/dispatch. */
+int hb_register_callback(int type, hb_cb_func *func, void *data, int priority)
+{
+	hb_callback_func *f, *tmp;
+	struct list_head *iter;
+	int ret;
+
+	if (type < HB_NODE_DOWN_CB || type >= HB_NUM_CB)
+		return -EINVAL;
+	f = kmalloc(sizeof(hb_callback_func), GFP_KERNEL);
+	if (f == NULL)
+		return -ENOMEM;
+	memset(f, 0, sizeof(hb_callback_func));
+	f->func = func;
+	f->data = data;
+	f->priority = priority;
+
+	spin_lock(&hb_cb_lock);
+	ret = hb_wait_on_callback_state(type);
+	if (ret < 0) {
+		spin_unlock(&hb_cb_lock);
+		kfree(f);
+		return ret;
+	}
+	
+	/* keep the list sorted: insert before the first higher priority */
+	list_for_each(iter, &hb_callbacks[type]) {
+		tmp = list_entry (iter, hb_callback_func, list);
+		if (priority < tmp->priority) {
+			list_add_tail(&f->list, iter);
+			spin_unlock(&hb_cb_lock);
+			return 0;
+		}
+	}
+	list_add_tail(&f->list, &hb_callbacks[type]);
+	spin_unlock(&hb_cb_lock);
+	return 0;
+}
+
+/* Remove the (func, data) registration for 'type'.  The type is
+ * briefly frozen so the list can be walked without hb_cb_lock (the
+ * same protocol hb_do_callbacks() uses), then thawed and waiters
+ * woken.  Returns 0 if found and removed, -EINVAL for a bad type or
+ * no match, -EINTR if interrupted while waiting for the type to
+ * unfreeze. */
+int hb_unregister_callback(int type, hb_cb_func *func, void *data)
+{
+	struct list_head *iter, *tmpiter;
+	int ret = -EINVAL;
+	hb_callback_func *f;
+
+	if (type < HB_NODE_DOWN_CB || type >= HB_NUM_CB)
+		return -EINVAL;
+
+	spin_lock(&hb_cb_lock);
+	ret = hb_wait_on_callback_state(type);
+	if (ret < 0) {
+		spin_unlock(&hb_cb_lock);
+		return ret;
+	}
+	/* freeze the type: list is now ours without the lock */
+	hb_callback_state[type] = HB_CB_STATE_FROZEN;
+	spin_unlock(&hb_cb_lock);
+
+	list_for_each_safe(iter, tmpiter, &hb_callbacks[type]) {
+		f = list_entry (iter, hb_callback_func, list);
+		if (f->func == func && f->data == data) {
+			list_del(&f->list);
+			kfree(f);
+			ret = 0;
+			break;
+		}
+	}
+
+	/* thaw and wake anyone waiting in hb_wait_on_callback_state() */
+	spin_lock(&hb_cb_lock);
+	hb_callback_state[type] = HB_CB_STATE_READY;
+	atomic_set(&hb_cb_ready, 1);
+	wake_up(&hb_cb_wq);
+	spin_unlock(&hb_cb_lock);
+	return ret;
+}
+
+
+
+/* Invoke every callback registered for 'type', in priority order,
+ * passing (ptr1, ptr2, idx, registered data).  The type is frozen
+ * while the list is walked (same protocol as hb_unregister_callback)
+ * so registrations can't race the walk; if the wait for an unfrozen
+ * state is interrupted, the whole dispatch is skipped with a log
+ * message. */
+static void hb_do_callbacks(int type, void *ptr1, void *ptr2, int idx)
+{
+	struct list_head *iter;
+	hb_callback_func *f;
+	int ret;
+	
+	spin_lock(&hb_cb_lock);
+	ret = hb_wait_on_callback_state(type);
+	if (ret < 0) {
+		spin_unlock(&hb_cb_lock);
+		printk("missed hb callback(%d) due to EINTR!\n", type);
+		return;
+	}
+	hb_callback_state[type] = HB_CB_STATE_FROZEN;
+	spin_unlock(&hb_cb_lock);
+
+	/* list is stable while frozen; callbacks may sleep */
+	list_for_each(iter, &hb_callbacks[type]) {
+		f = list_entry (iter, hb_callback_func, list);
+		(f->func) (ptr1, ptr2, idx, f->data);
+	}
+
+	spin_lock(&hb_cb_lock);
+	hb_callback_state[type] = HB_CB_STATE_READY;
+	atomic_set(&hb_cb_ready, 1);
+	wake_up(&hb_cb_wq);
+	spin_unlock(&hb_cb_lock);
+}

Added: trunk/cluster/heartbeat.h
===================================================================
--- trunk/cluster/heartbeat.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/heartbeat.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,129 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_HEARTBEAT_H
+#define CLUSTER_HEARTBEAT_H
+
+
+/* Per-node heartbeat slot state, as tracked in hb_disk_slot.state. */
+enum {
+	HB_NODE_STATE_INIT = 0,
+	HB_NODE_STATE_DOWN,
+	HB_NODE_STATE_UP
+};
+
+/* Placeholder context type; currently unused. */
+struct _heartbeat_ctxt
+{
+	int dummy;
+};
+
+/* Per-node bookkeeping for one slot of a group's disk heartbeat
+ * region (see hb_do_disk_heartbeat). */
+typedef struct _hb_disk_slot
+{
+	struct inode *inode;          /* pinned node inode (igrab'd on add) */
+	struct buffer_head *bh;       /* buffer for this node's hb block */
+	struct list_head list;
+	/* NOTE(review): last_time is unsigned long but compared against
+	 * hb_disk_heartbeat_block.time which is u64 -- truncates on
+	 * 32-bit; confirm intended. */
+	unsigned long last_time;      /* timestamp seen on the last pass */
+	u16 margin;                   /* remaining allowed missed beats */
+	u16 state;                    /* HB_NODE_STATE_* */
+} hb_disk_slot;
+
+
+
+#define HB_THREAD_MS                  2000   // every 2 seconds
+
+
+/* Request format written to the hb filesystem's ".disk" file. */
+#define HB_OP_MAGIC      0xf00d
+enum {
+	HB_OP_START_DISK_HEARTBEAT=371,
+	HB_OP_GET_NODE_MAP
+};
+
+typedef struct _hb_op
+{
+	u16 magic;                    /* must be HB_OP_MAGIC */
+	u16 opcode;                   /* HB_OP_* */
+	unsigned int fd;              /* fd of the heartbeat device */
+	char disk_uuid[CLUSTER_DISK_UUID_LEN+1];
+	u16 group_num;
+	u32 bits;                     /* blocksize bits (9..12 accepted) */
+	u32 blocks;                   /* number of heartbeat blocks */
+	u64 start;                    /* first block of the hb region */
+} hb_op;
+
+/* Heartbeat transport type, passed as 'idx' to RESPONDED callbacks. */
+enum {
+	HB_TYPE_DISK = 0,
+	HB_TYPE_NET
+};
+
+
+/* callback stuff */
+
+enum {
+	HB_NODE_DOWN_CB = 0,
+	HB_NODE_UP_CB,
+	HB_NODE_RESPONDED_CB,    // this one is very chatty
+	HB_NUM_CB
+};
+
+/* Callback signature: (group inode, node inode, idx, registered data). */
+typedef void (hb_cb_func)(struct inode *, struct inode *, int, void *);
+
+/* One registered callback; kept sorted by ascending priority. */
+typedef struct _hb_callback_func
+{
+	struct list_head list;
+	hb_cb_func *func;
+	void *data;
+	int priority;
+} hb_callback_func;
+
+
+/* Inode numbers for the hb pseudo-filesystem's tree_descr table. */
+enum {
+	HB_Root = 1,
+	HB_Disk,
+	HB_WriteOpArraySize
+};
+
+/* On-disk layout of one node's heartbeat block: a single timestamp
+ * (0 is reserved to mean "never heartbeated"). */
+typedef struct _hb_disk_heartbeat_block
+{
+	u64 time;
+} hb_disk_heartbeat_block;
+
+
+// number of initial allowed misses 
+#define HB_INITIAL_DISK_MARGIN     60
+#define HB_INITIAL_NET_MARGIN      60
+
+// number of allowed misses in steady state
+#define HB_DISK_MARGIN             30
+#define HB_NET_MARGIN              30
+
+
+int hb_unregister_callback(int type, hb_cb_func *func, void *data);
+int hb_register_callback(int type, hb_cb_func *func, void *data, int priority);
+int hb_fill_node_map(struct inode *group, void *map, int size);
+
+
+
+#endif /* CLUSTER_HEARTBEAT_H */

Added: trunk/cluster/nodemanager.c
===================================================================
--- trunk/cluster/nodemanager.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/nodemanager.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,1330 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * nodemanager.c
+ *
+ * totally lame static node management placeholder
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/module.h>
+
+#include <linux/linkage.h>
+#include <linux/time.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/net.h>
+#include <linux/unistd.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/pagemap.h>
+#include <linux/hash.h>
+
+#include <asm/uaccess.h>
+
+#include "tcp.h"
+#include "dlmmod.h"
+#include "nodemanager.h"
+#include "heartbeat.h"
+
+#include "compat_libfs.h"
+
+#ifndef __user
+#define __user
+#endif
+
+
+/*
+ * This nm module is similar to nfsd/nfsctl.c in that it uses
+ * transaction files (in /proc/cluster/nm) to communicate with
+ * the kernel module instead of ioctls or other means.
+ *
+ * Files involved:
+ *  /proc/cluster/nm/cluster - used to create/destroy cluster, adds 
+ *                             nodes/groups to the cluster, queries info
+ *                             about the cluster
+ *  /proc/cluster/nm/group   - adds/removes nodes from a group, queries
+ *                             info about a group
+ *  /proc/cluster/nm/node    - changes info for a node, queries info about
+ *                             a node
+ *
+ * This nm implementation basically allows this node to live in exactly one 
+ * cluster.  All "clustered" nodes that are known to this node should be
+ * added to the cluster, and all nodes should see the same list of nodes in
+ * the same order at all times.  The "slot" number given to a node in this 
+ * global cluster list is fixed and never changes.  Groups can be dynamically
+ * created within a cluster (TODO: currently static only) and be made up of 
+ * one or more nodes (listed at most once) in the global list.  A node may exist
+ * in many groups.  Also, a group may have an optional disk UUID which is simply
+ * stored for later use by the heartbeat service.  (The heartbeat service will
+ * do disk heartbeating only for those groups with valid UUIDs.)  
+ *
+ * USAGE:
+ * For our purposes, the nm service can be autoloaded by an fstab entry or manually
+ * through mount (mount -t nm none /proc/cluster/nm).  Once that is done, an init
+ * script (or single executable on an initrd) should be run to create the static
+ * cluster info, possibly from a file like /etc/nm.conf or similar.  We should 
+ * probably create a "dlm" or "everyone" group (with NO disk heartbeating) so that 
+ * the dlm service can be used with the network only.  This group should contain 
+ * all known nodes.  After this is done, the net, hb and dlm modules can come up.
+ * The nm service is now ready for use, since groups don't need to be created till 
+ * later.
+ * 
+ * A group services daemon can be written (by someone!? ;-) to run at this point.
+ * Since the "dlm" group has everything it needs for full dlmming (since it uses 
+ * only network), the dlm itself can be used to arbitrate for group creation, 
+ * and additions/deletions from groups.  Callbacks should be registered with nm by
+ * other services that care on each of these events.  For instance, heartbeat should
+ * register a callback with nm for group creation, and addition and deletion from 
+ * a group so that it can make any necessary changes to its heartbeating (primarily
+ * so that it can begin/end disk heartbeat for any group/node that needs it).
+ *   
+ * NOTE NOTE NOTE !!!!:
+ * This is intended to be a quickie implementation.  (translation: lame)  I do not
+ * want to step on anyone's toes who may have implemented something wayyy better.
+ * If something out there "wins", we will plug into that instead.  If nothing really
+ * takes off, we at least have a (lame) reference to work off of.  However, since this 
+ * implementation exists solely to make ocfs2 work, and one of the major advantages
+ * of ocfs version 1 was ease of setup, we don't want to move to something 
+ * substantially more complicated than this (one conf file).
+ *
+ */ 
+
+
+
+/* globals */
+nm_cluster cluster;			/* the single cluster this node participates in */
+struct super_block *single_sb;		/* sb of the one mounted nm filesystem (FS_SINGLE) */
+char *nm_nodename;			/* this node's utsname, set at module load */
+static spinlock_t nm_lock = SPIN_LOCK_UNLOCKED;		/* protects cluster state + iget lookups */
+static spinlock_t nm_cb_lock = SPIN_LOCK_UNLOCKED;	/* protects nm_callbacks lists */
+struct list_head nm_callbacks[NM_NUM_CB];
+
+
+static void nm_teardown(void);
+static int nm_create_cluster(char *buf);
+static void nm_init_cluster(nm_cluster *cluster);
+int nm_create_node(char *buf, nm_op *data);
+int nm_name_cluster(char *buf, nm_op *data);
+int nm_destroy_cluster(char *buf);
+int nm_get_cluster_num_nodes(char *buf);
+int nm_get_cluster_num_groups(char *buf);
+int nm_get_node_info(char *buf, nm_op *data);
+int nm_get_group_info(char *buf, nm_op *data);
+nm_cluster *nm_get_cluster(void);
+struct inode *nm_get_group_by_name(char *node_name);
+struct inode *nm_get_node_by_name(char *node_name);
+int nm_init(dlm_ctxt *dlm);
+static void nm_do_callbacks(int type, void *ptr1, void *ptr2, u16 idx);
+
+/* support for adding files, dirs, hardlinks in /proc/cluster/nm/... */
+extern struct file_operations simple_dir_operations;
+extern struct inode_operations simple_dir_inode_operations;
+extern struct file_operations transaction_ops;
+
+static inline int nm_find_next_slot(void *bitmap, int max, int request);
+static struct dentry * nm_add_file(struct super_block *s, struct dentry *parent, struct tree_descr *file, int ino);
+static struct dentry * nm_add_link(struct super_block *s, struct dentry *parent, struct tree_descr *file, int ino);
+
+static ssize_t write_node(struct file *file, char *buf, size_t size);
+static ssize_t write_group(struct file *file, char *buf, size_t size);
+static ssize_t write_cluster(struct file *file, char *buf, size_t size);
+
+static struct inode * __nm_get_group_by_num(u16 group_num);
+static struct inode * __nm_get_node_by_num(u16 node_num);
+
+
+static u16 nm_get_group_index(struct inode *group, struct inode *inode, struct dentry **child);
+
+#define NM_HASH_BITS     7
+#define NM_HASH_SIZE     (1 << NM_HASH_BITS)
+#define NM_HASH_MASK     (NM_HASH_SIZE - 1)
+
+static struct list_head *nm_ip_hash = NULL;
+static spinlock_t nm_ip_hash_lock;
+
+static int nm_init_ip_hash(void);
+static void nm_destroy_ip_hash(void);
+
+
+/* release the node-IP hash table page; safe to call if it was never set up */
+static void nm_destroy_ip_hash(void)
+{
+	int bucket;
+
+	if (nm_ip_hash == NULL)
+		return;
+	for (bucket = 0; bucket < NM_HASH_SIZE; bucket++) {
+		/* TODO: cleanup */
+	}
+	free_page((unsigned long)nm_ip_hash);
+}
+
+/* allocate and initialize the node-IP hash table (one page of list heads) */
+static int nm_init_ip_hash(void)
+{
+	int bucket;
+
+	/* the whole table must fit in a single page */
+	if (NM_HASH_SIZE > (PAGE_SIZE / sizeof(struct list_head))) {
+		printk("eek!  hash size too big for this arch!\n");
+		BUG();
+	}
+
+	nm_ip_hash = (struct list_head *) __get_free_page(GFP_KERNEL);
+	if (nm_ip_hash == NULL)
+		return -ENOMEM;
+	for (bucket = 0; bucket < NM_HASH_SIZE; bucket++)
+		INIT_LIST_HEAD(&nm_ip_hash[bucket]);
+	spin_lock_init(&nm_ip_hash_lock);
+	return 0;
+}
+
+
+
+
+
+/* claim a free slot in 'bitmap'.  with request == NM_INVALID_SLOT_NUM any
+ * free slot is taken; otherwise exactly 'request' must be free.  returns
+ * the slot claimed (bit set as a side effect) or -1.  caller must hold
+ * the lock protecting the bitmap. */
+static inline int nm_find_next_slot(void *bitmap, int max, int request)
+{
+	int slot, from = 0;
+
+	if (request != NM_INVALID_SLOT_NUM)
+		from = request;
+	slot = find_next_zero_bit(bitmap, max, from);
+	if (slot >= max)
+		return -1;
+	/* a specific request must be satisfied exactly */
+	if (request != NM_INVALID_SLOT_NUM && slot != request)
+		return -1;
+	set_bit(slot, bitmap);
+	return slot;
+}
+
+
+
+
+/* create a file (or directory, if file->mode has S_IFDIR) named after
+ * 'file' under 'parent', backed by a brand-new inode numbered 'ino'.
+ * returns the hashed dentry, or ERR_PTR(-EINVAL) on any failure.
+ * NOTE(review): errors are collapsed to -EINVAL even for allocation
+ * failures; the dentry reference is intentionally kept by the caller. */
+static struct dentry * nm_add_file(struct super_block *s, struct dentry *parent, struct tree_descr *file, int ino)
+{
+	struct qstr name;
+	struct dentry *dentry = ERR_PTR(-EINVAL);
+	struct inode *inode;
+
+	if (!file->name)
+		goto out;
+	name.name = file->name;
+	name.len = strlen(name.name);
+	printk("adding file %*s\n", name.len, name.name);
+	name.hash = full_name_hash(name.name, name.len);
+	dentry = d_alloc(parent, &name);
+	if (!dentry) {
+		dentry = ERR_PTR(-EINVAL);
+		goto out;
+	}
+	inode = new_inode(s);
+	if (!inode) {
+		dput(dentry);
+		dentry = ERR_PTR(-EINVAL);
+		goto out;
+	}
+	inode->i_mode = file->mode;
+	inode->i_uid = inode->i_gid = 0;
+	inode->i_blksize = PAGE_CACHE_SIZE;
+	inode->i_blocks = 0;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	if (file->mode & S_IFDIR) {
+		/* directories get the stock libfs dir ops */
+		inode->i_op = &simple_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+	} else {
+		inode->i_fop = file->ops;
+	}		    
+	/* force the chosen inode number so iget(sb, ino) can find it later */
+	inode->i_ino = ino;
+	insert_inode_hash(inode);
+	d_add(dentry, inode);
+
+out:
+	return dentry;
+}
+
+
+/* create a hard link named file->name under 'parent' to the EXISTING
+ * inode numbered 'ino' (which must already carry private data in
+ * u.generic_ip).  bumps i_nlink; returns the new dentry or
+ * ERR_PTR(-EINVAL). */
+static struct dentry * nm_add_link(struct super_block *s, struct dentry *parent, struct tree_descr *file, int ino)
+{
+	struct qstr name;
+	struct dentry *dentry = ERR_PTR(-EINVAL);
+	struct inode *inode;
+
+	if (!file->name)
+		goto out;
+	name.name = file->name;
+	name.len = strlen(name.name);
+	printk("adding link %*s\n", name.len, name.name);
+	name.hash = full_name_hash(name.name, name.len);
+	dentry = d_alloc(parent, &name);
+	if (!dentry) {
+		printk("failed to d_alloc\n");
+		dentry = ERR_PTR(-EINVAL);
+		goto out;
+	}
+	/* look up the already-hashed target inode; takes a reference */
+	inode = iget(s, ino);
+	if (!inode) {
+		printk("failed to iget\n");
+		dput(dentry);
+		dentry = ERR_PTR(-EINVAL);
+		goto out;
+	}
+	/* an inode with no private data was never populated -- reject it */
+	if (!inode->u.generic_ip) {
+		printk("bad inode: %d\n", ino);
+		iput(inode);
+		dput(dentry);
+		dentry = ERR_PTR(-EINVAL);
+		goto out;
+	}
+	inode->i_nlink++;
+	d_add(dentry, inode);
+
+out:
+	return dentry;
+}
+
+
+
+
+
+/* cluster, node and group transaction files. 
+ * here's where the actual work of nm takes place. */
+
+/* bring the (single, static) cluster up.  writes a "status: text" reply
+ * into buf and returns its length. */
+static int nm_create_cluster(char *buf)
+{
+	int ret;
+
+	printk("create cluster...\n");
+
+	spin_lock(&nm_lock);
+	if (cluster.state != NM_CLUSTER_UP) {
+		cluster.state = NM_CLUSTER_UP;
+		ret = sprintf(buf, "0: cluster state: UP");
+	} else {
+		ret = sprintf(buf, "%d: cluster already up\n", -EINVAL);
+	}
+	spin_unlock(&nm_lock);
+	return ret;
+}
+
+
+
+/* create a new group directory in the nm filesystem, claim a group slot
+ * in the cluster bitmap, and attach the group's private state.  the reply
+ * (success or failure text) is written into buf; returns the reply length
+ * or a negative errno.
+ *
+ * FIX: on nm_add_file() failure the old code left 'dentry' holding the
+ * ERR_PTR cookie, which the exit path then passed to dput() -- an oops.
+ * reset it to NULL before jumping to the cleanup label. */
+int nm_create_group(char *buf, nm_op *data)
+{
+	struct tree_descr desc;
+	struct dentry *dentry = NULL;
+	struct inode *inode = NULL;
+	int ino, group_num;
+	int ret = -EINVAL;
+	nm_group_inode_private *g = NULL;
+
+	printk("create group...\n");
+
+	data->arg_u.gc.name[NM_MAX_NAME_LEN] = '\0';
+	inode = nm_get_group_by_name(data->arg_u.gc.name);
+	if (inode) {
+		ret = sprintf(buf, "%d: group %u (%s) already exists", -EEXIST, 
+			      nm_get_group_global_index(inode), data->arg_u.gc.name);
+		iput(inode);
+		return ret;
+	}
+
+	group_num = data->arg_u.gc.group_num;
+	if (group_num > NM_INVALID_SLOT_NUM)
+		goto leave;
+
+	spin_lock(&cluster.bitmap_lock);
+	group_num = nm_find_next_slot(&(cluster.group_bitmap[0]), 255, group_num);
+	spin_unlock(&cluster.bitmap_lock);
+
+	if (group_num < 0) {
+		printk("out of group slots!\n");
+		goto leave;
+	}
+
+	/* inode number encodes the group slot */
+	ino = group_num + NM_GROUP_INODE_START;
+
+	desc.name = data->arg_u.gc.name;
+	desc.ops = NULL;
+	desc.mode = S_IFDIR | 0755;
+	dentry = nm_add_file(single_sb, single_sb->s_root, &desc, ino);
+	if (IS_ERR(dentry)) {
+		/* never let the ERR_PTR cookie reach dput() below */
+		dentry = NULL;
+		goto leave;
+	}
+	inode = igrab(dentry->d_inode);
+	if (!inode) {
+		printk("igrab failed!\n");
+		goto leave;
+	}
+
+	g = kmalloc(sizeof(nm_group_inode_private), GFP_KERNEL);
+	if (!g)
+		goto leave;
+
+	memset(g, 0, sizeof(nm_group_inode_private));
+	memcpy(g->disk.uuid, data->arg_u.gc.disk_uuid, CLUSTER_DISK_UUID_LEN);
+	spin_lock_init(&g->bitmap_lock);
+	/* a group with a disk uuid stays NOT_READY until its disk is discovered */
+	if (g->disk.uuid[0])
+		g->state = NM_GROUP_NOT_READY;
+	else
+		g->state = NM_GROUP_READY;
+	g->inode = inode;
+	inode->u.generic_ip = g;
+
+	ret = sprintf(buf, "0: group %u (%s) added, uuid: %s", group_num,
+		      data->arg_u.gc.name, g->disk.uuid);
+	nm_do_callbacks(NM_GROUP_ADD_CB, inode, NULL, group_num);
+
+leave:
+	if (ret < 0) {
+		if (inode) {
+			if (inode->u.generic_ip)
+				kfree(inode->u.generic_ip);
+			iput(inode);
+		}
+		if (dentry)
+			dput(dentry);
+	}
+	return ret;
+}
+
+
+/* add a node to the cluster: claim a node slot, create its file in the
+ * nm filesystem, hash its primary IP for nm_get_node_by_ip(), and attach
+ * the node's private state.  reply text goes into buf; returns the reply
+ * length or a negative errno.
+ *
+ * FIX: on nm_add_file() failure the old code left 'dentry' holding the
+ * ERR_PTR cookie, which the exit path then passed to dput() -- an oops.
+ * reset it to NULL before jumping to the cleanup label. */
+int nm_create_node(char *buf, nm_op *data)
+{
+	struct tree_descr desc;
+	struct dentry *dentry = NULL;
+	struct inode *inode = NULL;
+	int ino, node_num, bucket;
+	int ret = -EINVAL;
+	nm_node_inode_private *n = NULL;
+
+	printk("add cluster node ...\n");
+
+	data->arg_u.node.node_name[NM_MAX_NAME_LEN] = '\0';
+	inode = nm_get_node_by_name(data->arg_u.node.node_name);
+	if (inode) {
+		ret = sprintf(buf, "%d: node %u (%s) already exists", -EEXIST, 
+			      nm_get_node_global_index(inode), 
+			      data->arg_u.node.node_name);
+		iput(inode);
+		return ret;
+	}
+
+	node_num = data->arg_u.node.node_num;
+	if (node_num > NM_INVALID_SLOT_NUM) {
+		printk("bad node_num: %d\n", node_num);
+		goto leave;
+	}
+
+	spin_lock(&cluster.bitmap_lock);
+	node_num = nm_find_next_slot(&(cluster.node_bitmap[0]), 255, node_num);
+	spin_unlock(&cluster.bitmap_lock);
+
+	if (node_num < 0) {
+		printk("out of node slots!\n");
+		goto leave;
+	}
+
+	/* inode number encodes the node slot */
+	ino = node_num + NM_NODE_INODE_START;
+
+	desc.name = data->arg_u.node.node_name;
+	desc.ops = NULL;
+	desc.mode = S_IFREG | S_IWUSR;
+	dentry = nm_add_file(single_sb, single_sb->s_root, &desc, ino);
+	if (IS_ERR(dentry)) {
+		printk("bad dentry\n");
+		/* never let the ERR_PTR cookie reach dput() below */
+		dentry = NULL;
+		goto leave;
+	}
+	inode = igrab(dentry->d_inode);
+	if (!inode) {
+		printk("igrab failed!\n");
+		goto leave;
+	}
+
+	n = kmalloc(sizeof(nm_node_inode_private), GFP_KERNEL);
+	if (!n) {
+		printk("could not kmalloc\n");
+		goto leave;
+	}
+	memcpy(&n->node, &data->arg_u.node, sizeof(nm_node_info));
+	INIT_LIST_HEAD(&n->ip_hash);
+	n->net.sock = NULL;
+	INIT_LIST_HEAD(&n->net.list);
+	spin_lock_init(&n->net.sock_lock);
+	n->net.flags = 0;
+
+	/* hash on first ip address */
+	spin_lock(&nm_ip_hash_lock);
+	bucket = hash_long(n->node.ifaces[0].addr_u.ip_addr4, NM_HASH_BITS);
+	list_add_tail(&n->ip_hash, &nm_ip_hash[bucket]);
+	spin_unlock(&nm_ip_hash_lock);
+	printk("hashed ip %d.%d.%d.%d to bucket %d\n", NIPQUAD(n->node.ifaces[0].addr_u.ip_addr4), bucket);
+	n->inode = inode;
+	inode->u.generic_ip = n;
+
+	ret = sprintf(buf, "0: node %u (%s) added", node_num, n->node.node_name);
+	nm_do_callbacks(NM_NODE_ADD_CB, inode, NULL, node_num);
+
+leave:
+	if (ret < 0) {
+		if (inode) {
+			if (inode->u.generic_ip)
+				kfree(inode->u.generic_ip);
+			iput(inode);
+		}
+		if (dentry)
+			dput(dentry);
+	}
+	return ret;
+}
+
+/* flip a group to READY once its heartbeat disk has been discovered */
+int nm_make_group_ready(struct inode *group)
+{
+	nm_group_inode_private *g = group->u.generic_ip;
+
+	if (g == NULL)
+		return -EINVAL;
+	g->state = NM_GROUP_READY;
+	return 0;
+}
+
+/* add an existing cluster node to a group: claim a slot in the group's
+ * bitmap and hard-link the node's inode into the group directory under
+ * the zero-padded slot number ("000".."254").  reply text goes into buf;
+ * returns the reply length or a negative errno. */
+int nm_add_node_to_group(char *buf, nm_op *data)
+{
+	struct tree_descr desc;
+	struct inode *inode = NULL;
+	struct dentry *dentry = NULL, *child = NULL;
+	nm_group_inode_private *g = NULL;
+	int group_num, slot_num;
+	int ret = -EINVAL;
+	u16 ino;
+	char tmpname[6];
+
+	printk("add node to group...\n");
+
+	group_num = data->arg_u.gc.group_num;
+	ino = data->arg_u.gc.node_num;
+	slot_num = data->arg_u.gc.slot_num;
+
+	/* request a certain slot, or NM_INVALID_SLOT_NUM for any slot */
+	if (slot_num > NM_INVALID_SLOT_NUM)
+		goto leave;
+	
+	if (ino >= NM_INVALID_SLOT_NUM || group_num >= NM_INVALID_SLOT_NUM)
+		goto leave;
+
+       	inode = __nm_get_group_by_num(group_num);
+	if (!inode)
+		goto leave;
+	/* grab the group directory's dentry through its alias list */
+	if (list_empty(&inode->i_dentry))
+		goto leave;
+	dentry = dget(list_entry(inode->i_dentry.next, struct dentry, d_alias));
+	if (!dentry)
+		goto leave;
+	g = inode->u.generic_ip;
+	if (!g)
+		goto leave;
+
+	if (g->state == NM_GROUP_NOT_READY) {
+		ret = sprintf(buf, "%d: group disk has not been discovered.  cannot add nodes.", -EROFS);
+		goto leave;
+	}
+
+	spin_lock(&g->bitmap_lock);
+	slot_num = nm_find_next_slot(&(g->slot_bitmap[0]), 255, slot_num);
+	spin_unlock(&g->bitmap_lock);
+	if (slot_num < 0)
+		goto leave;
+
+	/* create hardlink to ino with name "slot_num" */
+	sprintf(tmpname, "%03u", slot_num);
+	desc.name = &(tmpname[0]);
+	desc.ops = NULL;
+	desc.mode = 0;
+	child = nm_add_link(single_sb, dentry, &desc, 
+			    NM_NODE_INODE_START+ino);
+	if (IS_ERR(child)) {
+		printk("error adding link for %s\n", tmpname);
+		/* clear the ERR_PTR so the cleanup dput() below is safe */
+		child = NULL;
+		goto leave;
+	}
+
+	ret = sprintf(buf, "0: node %u added to group: %*s", 
+		      ino, dentry->d_name.len, dentry->d_name.name);
+
+	/* hold an extra inode ref across the callbacks */
+	if (!igrab(child->d_inode))
+		goto leave;
+	nm_do_callbacks(NM_GROUP_NODE_ADD_CB, inode, child->d_inode, slot_num);
+	iput(child->d_inode);
+
+leave:
+	if (dentry)
+		dput(dentry);
+	if (child)
+		dput(child);
+	if (inode)
+		iput(inode);
+	return ret;
+}
+
+
+/* undo nm_add_node_to_group(): find the slot link for 'node' inside the
+ * group directory, delete it, fire the NODE_DEL callbacks, and release
+ * the slot bit.  returns 0 or -EINVAL. */
+int nm_remove_node_from_group(struct inode *group, struct inode *node)
+{
+	struct dentry *child = NULL;
+	nm_group_inode_private *g = NULL;
+	int slot_num;
+	int ret = -EINVAL;
+
+	printk("remove node from group...\n");
+
+	/* also takes a ref on 'child' (the slot-name dentry) on success */
+	slot_num = nm_get_group_index(group, node, &child);
+
+	if (slot_num == NM_MAX_NODES || !child)
+		goto leave;
+
+	g = group->u.generic_ip;
+	if (!g)
+		goto leave;
+		
+	printk("killing the dentry now!!\n");
+	/* i_zombie serializes against concurrent directory operations (2.4) */
+	down(&group->i_zombie);
+	node->i_nlink--;
+	d_delete(child);
+	up(&group->i_zombie);
+	printk("done killing the dentry!!\n");
+
+
+	/* hold an extra ref on the node across the callbacks */
+	if (!igrab(node))
+		goto leave;
+	nm_do_callbacks(NM_GROUP_NODE_DEL_CB, group, node, slot_num);
+	iput(node);
+	
+	spin_lock(&g->bitmap_lock);
+	clear_bit(slot_num, (void *)(&g->slot_bitmap[0]));
+	spin_unlock(&g->bitmap_lock);
+
+	ret = 0;
+
+leave:
+	if (child)
+		dput(child);
+	return ret;
+}
+
+
+
+/* set the cluster name; refused once the cluster is up.  reply text goes
+ * into buf, returns the reply length. */
+int nm_name_cluster(char *buf, nm_op *data)
+{
+	int ret;
+
+	printk("name cluster...\n");
+	spin_lock(&nm_lock);
+	if (cluster.state == NM_CLUSTER_UP) {
+		ret = sprintf(buf, "%d: cluster name could not be set.  cluster already up.", -EINVAL);
+	} else {
+		memset(cluster.name, 0, NM_MAX_NAME_LEN+1);
+		memcpy(cluster.name, data->arg_u.name, NM_MAX_NAME_LEN);
+		ret = sprintf(buf, "0: cluster name set: %s", cluster.name);
+	}
+	spin_unlock(&nm_lock);
+	return ret;
+}
+
+/* tear the cluster description down by re-initializing the global state.
+ * NOTE(review): does NOT free node/group inodes or private data -- see
+ * the TODO; existing groups leak until module unload. */
+int nm_destroy_cluster(char *buf)
+{
+	int ret;
+	printk("destroy cluster...\n");
+
+	/* TODO */
+	spin_lock(&nm_lock);
+	nm_init_cluster(&cluster);
+	ret = sprintf(buf, "0: rudely destroyed cluster!!!");
+	spin_unlock(&nm_lock);
+	return ret;
+}
+
+/* reply with the number of nodes in the cluster (population count of the
+ * node bitmap: 8 x 32-bit words).  returns the reply length. */
+int nm_get_cluster_num_nodes(char *buf)
+{
+	int i, count = 0;
+
+	printk("get cluster num nodes...\n");
+
+	spin_lock(&cluster.bitmap_lock);
+	for (i = 0; i < 8; i++)
+		count += hweight32(cluster.node_bitmap[i]);
+	spin_unlock(&cluster.bitmap_lock);
+
+	return sprintf(buf, "0: %d", count);
+}
+
+/* reply with the number of groups in the cluster (population count of the
+ * group bitmap).  returns the reply length. */
+int nm_get_cluster_num_groups(char *buf)
+{
+	int i, count = 0;
+
+	printk("get cluster num groups...\n");
+
+	spin_lock(&cluster.bitmap_lock);
+	for (i = 0; i < 8; i++)
+		count += hweight32(cluster.group_bitmap[i]);
+	spin_unlock(&cluster.bitmap_lock);
+
+	return sprintf(buf, "0: %d", count);
+}
+
+/* count the nodes currently occupying slots in this group's bitmap.
+ * returns the count, or -EINVAL if the group carries no private data. */
+int nm_get_group_num_nodes(struct inode *group)
+{
+	nm_group_inode_private *g;
+	int i, count = 0;
+
+	printk("get group num nodes...\n");
+
+	g = group->u.generic_ip;
+	if (g == NULL)
+		return -EINVAL;
+
+	/* 8 x 32-bit words cover the 255 possible slots */
+	spin_lock(&g->bitmap_lock);
+	for (i = 0; i < 8; i++)
+		count += hweight32(g->slot_bitmap[i]);
+	spin_unlock(&g->bitmap_lock);
+
+	return count;
+}
+
+/* return one past the highest occupied slot number in the group (i.e. the
+ * number of slots a scan must cover), or -EINVAL without private data.
+ *
+ * FIX: the per-word offset must be in BITS.  the old code added
+ * i * sizeof(word) -- bytes, i.e. 4 -- instead of i * 32, so any slot
+ * above 31 was reported wildly low.  also fix the copy-pasted printk. */
+int nm_get_group_max_slots(struct inode *group)
+{
+	int last=0, i;
+	nm_group_inode_private *g;
+	
+	printk("get group max slots...\n");
+	
+	g = group->u.generic_ip;
+	if (!g)
+		return -EINVAL;
+
+#warning need to change this for 64 bit 
+	spin_lock(&g->bitmap_lock);
+	/* scan from the top word down; fls() gives the 1-based index of the
+	 * highest set bit within that word */
+	for (i=7; i>=0; i--) {
+		if (g->slot_bitmap[i]) {
+			last = fls(g->slot_bitmap[i]);
+			last += i * sizeof(g->slot_bitmap[i]) * 8;
+			break;
+		}
+	}
+	spin_unlock(&g->bitmap_lock);
+
+	return last;
+}
+
+/* advance *idx to the next occupied slot at or after its current value and
+ * return that slot's disk-heartbeat record, or NULL when the scan is done
+ * (or the group has no private data). */
+void * nm_iterate_group_disk_slots(struct inode *group, int *idx)
+{
+	nm_group_inode_private *g;
+	int slot;
+
+	if (*idx >= 255)
+		return NULL;
+	g = group->u.generic_ip;
+	if (g == NULL)
+		return NULL;
+	slot = find_next_bit(g->slot_bitmap, 255, *idx);
+	if (slot >= 255)
+		return NULL;
+	*idx = slot;
+	return util_rarray_idx_to_slot(&g->disk.slots, slot);
+}
+
+/* write a node's details into buf: global index, name, and each interface
+ * with a valid ip version (port/version/ipv4 address).  returns the reply
+ * length, or -EINVAL for an unknown node number.
+ * NOTE(review): caller is assumed to hold nm_lock for __nm_get_node_by_num;
+ * buf size is never bounds-checked here. */
+int nm_get_node_info(char *buf, nm_op *data)
+{
+	int ret, tmpret, i;
+	nm_node_inode_private *priv;
+	nm_network_iface *n;
+	struct inode *inode = NULL;
+	struct dentry *dentry;
+	u16 node_num;
+	u16 vers;
+
+	ret = -EINVAL;
+	node_num = data->arg_u.index;
+	inode = __nm_get_node_by_num(node_num);
+	if (inode) {
+		/* name comes from the inode's (single) dentry alias */
+		dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
+		priv = inode->u.generic_ip;
+		ret = sprintf(buf, "0: global_index=%u\n"
+			           "name=%*s\n",
+				priv->node.node_num, dentry->d_name.len, 
+				dentry->d_name.name);
+		buf += ret;
+		for (i=0; i<NM_MAX_IFACES; i++) {
+			n = &priv->node.ifaces[i];
+			/* ip_version is stored in network byte order */
+			vers = ntohs(n->ip_version);
+			printk("ip_version=%u, vers=%u\n", n->ip_version, vers);
+			if (vers!=4 && vers!=6)
+				continue;
+			/* TODO: how to print ipv6? */
+			tmpret = sprintf(buf, "iface%d.port=%u\n"
+				            "iface%d.version=%d\n"
+					    "iface%d.addr=%d.%d.%d.%d\n",
+				      i, ntohs(n->ip_port), i, vers, i,
+				      NIPQUAD(n->addr_u.ip_addr4));
+			buf += tmpret;
+			ret += tmpret;
+		}
+		iput(inode);
+	}
+	return ret;
+}
+
+/* write a group's details into buf: number, name, disk uuid, and one line
+ * per member (the slot-name children of the group directory).  returns the
+ * reply length, or -EINVAL for an unknown group number. */
+int nm_get_group_info(char *buf, nm_op *data)
+{
+	int ret, tmpret;
+	nm_group_inode_private *g = NULL;
+	struct inode *inode = NULL;
+	u16 group_num;
+	struct dentry *dentry, *child;
+
+	ret = -EINVAL;
+	group_num = data->arg_u.index;
+	/* guarantees inode->u.generic_ip is non-NULL on success */
+	inode = __nm_get_group_by_num(group_num);
+	if (inode) {
+		g = inode->u.generic_ip;
+		dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
+		ret = sprintf(buf, "0: group_num=%u\n"
+		        	   "name=%*s\n"
+				   "disk_uuid=%s\n",
+			      group_num, dentry->d_name.len, 
+			      dentry->d_name.name, g->disk.uuid);
+		buf += ret;
+
+		/* walk the directory's children under dcache_lock */
+		spin_lock(&dcache_lock);
+		list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+			tmpret = sprintf(buf, "%*s\n", child->d_name.len, 
+					 child->d_name.name);
+			buf += tmpret;
+			ret += tmpret;
+		}
+		spin_unlock(&dcache_lock);
+		iput(inode);
+	}
+	return ret;
+}
+
+	
+
+/* dispatcher for writes to the .cluster transaction file.  buf holds an
+ * nm_op on entry and is overwritten in place with a "<status>: <text>"
+ * reply; the return value is the reply length or a negative errno. */
+static ssize_t write_cluster(struct file *file, char *buf, size_t size)
+{
+	nm_op *data;
+	int ret;
+	u16 me;
+	
+	printk("write_cluster\n");
+
+        if (size < sizeof(*data))
+                return -EINVAL;
+        data = (nm_op *) buf;
+	if (data->magic != NM_OP_MAGIC)
+		return -EINVAL;
+
+	switch (data->opcode) {
+		case NM_OP_CREATE_CLUSTER:
+			ret = nm_create_cluster(buf);
+			break;
+		case NM_OP_CREATE_GROUP:
+			ret = nm_create_group(buf, data);
+			break;
+		case NM_OP_NAME_CLUSTER:
+			ret = nm_name_cluster(buf, data);
+			break;
+		case NM_OP_DESTROY_CLUSTER:
+			ret = nm_destroy_cluster(buf);
+			break;
+		case NM_OP_ADD_CLUSTER_NODE:
+			ret = nm_create_node(buf, data);
+			break;
+		case NM_OP_GET_CLUSTER_NUM_NODES:
+			ret = nm_get_cluster_num_nodes(buf);
+			break;
+		case NM_OP_GET_GLOBAL_NODE_NUM:
+			ret = 0;
+			me = nm_this_node(NULL);
+			/* on failure the reply's status field carries -EINVAL;
+			 * the sprintf result (reply length) is what is returned */
+			if (me >= NM_MAX_NODES)
+				ret = -EINVAL;
+			ret = sprintf(buf, "%d: %u", ret, me);
+			break;
+		default:
+			ret = sprintf(buf, "%d: bad opcode: %u", -EINVAL, data->opcode);
+			break;
+	}
+	printk("leaving!\n");
+	return ret;
+}
+
+/* dispatcher for writes to the .node transaction file; buf is reused for
+ * the reply.  returns the reply length or a negative errno. */
+static ssize_t write_node(struct file *file, char *buf, size_t size)
+{
+	nm_op *op;
+	int ret;
+
+	printk("write_node\n");
+
+	if (size < sizeof(*op))
+		return -EINVAL;
+	op = (nm_op *) buf;
+	if (op->magic != NM_OP_MAGIC)
+		return -EINVAL;
+
+	if (op->opcode == NM_OP_GET_NODE_INFO)
+		ret = nm_get_node_info(buf, op);
+	else
+		ret = sprintf(buf, "%d: bad opcode: %u", -EINVAL, op->opcode);
+	printk("leaving!\n");
+	return ret;
+}
+
+/* dispatcher for writes to the .group transaction file; buf is reused for
+ * the reply.  returns the reply length or a negative errno. */
+static ssize_t write_group(struct file *file, char *buf, size_t size)
+{
+	nm_op *op;
+	int ret;
+
+	printk("write_group\n");
+
+	if (size < sizeof(*op))
+		return -EINVAL;
+	op = (nm_op *) buf;
+	if (op->magic != NM_OP_MAGIC)
+		return -EINVAL;
+
+	printk("opcode is %u, add_group is %u\n", op->opcode, NM_OP_ADD_GROUP_NODE);
+	if (op->opcode == NM_OP_GET_GROUP_INFO)
+		ret = nm_get_group_info(buf, op);
+	else if (op->opcode == NM_OP_ADD_GROUP_NODE)
+		ret = nm_add_node_to_group(buf, op);
+	else
+		ret = sprintf(buf, "%d: bad opcode: %u", -EINVAL, op->opcode);
+	printk("leaving!\n");
+	return ret;
+}
+
+
+
+/* look up a group inode by slot number.  returns a referenced inode whose
+ * u.generic_ip is guaranteed non-NULL, or NULL.  caller holds nm_lock. */
+static struct inode * __nm_get_group_by_num(u16 group_num)
+{
+	struct inode *inode;
+
+	inode = iget(single_sb, group_num + NM_GROUP_INODE_START);
+	if (inode && !inode->u.generic_ip) {
+		/* never-populated inode: not a real group */
+		iput(inode);
+		inode = NULL;
+	}
+	return inode;
+}
+
+/* look up a node inode by slot number.  returns a referenced inode whose
+ * u.generic_ip is guaranteed non-NULL, or NULL.  caller holds nm_lock. */
+static struct inode * __nm_get_node_by_num(u16 node_num)
+{
+	struct inode *inode;
+
+	inode = iget(single_sb, node_num + NM_NODE_INODE_START);
+	if (inode && !inode->u.generic_ip) {
+		/* never-populated inode: not a real node */
+		iput(inode);
+		inode = NULL;
+	}
+	return inode;
+}
+
+/* ipv4 only for now... */
+struct inode * nm_get_node_by_ip(u32 addr)
+{
+	struct list_head *pos;
+	nm_node_inode_private *priv;
+	struct inode *found = NULL;
+	int bucket = hash_long(addr, NM_HASH_BITS);
+
+	/* nodes are hashed on their FIRST interface's ipv4 address only */
+	spin_lock(&nm_ip_hash_lock);
+	list_for_each(pos, &nm_ip_hash[bucket]) {
+		priv = list_entry(pos, nm_node_inode_private, ip_hash);
+		if (priv->node.ifaces[0].addr_u.ip_addr4 != addr)
+			continue;
+		found = igrab(priv->inode);
+		break;
+	}
+	spin_unlock(&nm_ip_hash_lock);
+	return found;
+}
+
+
+/* locked wrapper around __nm_get_group_by_num() */
+struct inode * nm_get_group_by_num(u16 group_num)
+{
+	struct inode *ret;
+
+	spin_lock(&nm_lock);
+	ret = __nm_get_group_by_num(group_num);
+	spin_unlock(&nm_lock);
+	return ret;
+}
+
+/* accessor for the single static cluster descriptor */
+nm_cluster * nm_get_cluster(void)
+{
+	return &cluster;
+}
+
+/* locked wrapper around __nm_get_node_by_num() */
+struct inode * nm_get_node_by_num(u16 node_num)
+{
+	struct inode *ret;
+
+	spin_lock(&nm_lock);
+	ret = __nm_get_node_by_num(node_num);
+	spin_unlock(&nm_lock);
+	return ret;
+}
+
+/* resolve slot 'index' inside the group directory (link name "%03u") to a
+ * referenced node inode, or NULL.  on success the slot dentry's reference
+ * is deliberately retained (matches the original behavior).
+ *
+ * FIX: igrab() returns NULL for an inode that is being freed; the old
+ * code dereferenced the result unconditionally. */
+struct inode * nm_get_group_node_by_index(struct inode *group, u16 index)
+{
+	struct dentry *dentry = NULL, *parent;
+	struct inode *inode = NULL;
+	char tmpname[6];
+
+	if (list_empty(&group->i_dentry))
+		return NULL;
+	parent = dget(list_entry(group->i_dentry.next, struct dentry, d_alias));
+	if (!parent)
+		return NULL;
+	
+	sprintf(tmpname, "%03u", index);
+	dentry = lookup_one_len(tmpname, parent, strlen(tmpname));
+	if (!IS_ERR(dentry)) {
+		inode = dentry->d_inode;
+		if (inode) {
+			inode = igrab(inode);
+			/* igrab() may return NULL -- check before dereferencing */
+			if (inode &&
+			    (!inode->u.generic_ip || !S_ISREG (inode->i_mode))) {
+				printk("bad inode!\n");
+				iput(inode);
+				inode = NULL;
+			}
+		}
+		if (!inode)
+			dput(dentry);
+	}
+	dput(parent);
+	return inode;
+}
+
+
+/* look a node (dir == 0) or group (dir != 0) up by name at the root of
+ * the nm filesystem and return a referenced inode, or NULL.
+ *
+ * FIX: igrab() returns NULL for an inode that is being freed; the old
+ * code dereferenced the result unconditionally.
+ * NOTE(review): the dentry reference from lookup_one_len() is never
+ * dropped here, matching nm_get_group_node_by_index()'s keep-pinned
+ * behavior -- confirm whether that pinning is intentional. */
+struct inode * __nm_get_node_by_name(char *node_name, int dir)
+{
+	struct dentry *dentry = NULL;
+	struct inode *inode = NULL;
+	
+	dentry = lookup_one_len(node_name, single_sb->s_root, strlen(node_name));
+	if (!IS_ERR(dentry)) {
+		inode = dentry->d_inode;
+		if (inode) {
+			inode = igrab(inode);
+			/* igrab() may return NULL -- check before dereferencing */
+			if (inode &&
+			    (!inode->u.generic_ip ||
+			     (dir && !S_ISDIR (inode->i_mode)) ||
+			     (!dir && !S_ISREG (inode->i_mode)))) {
+				printk("bad inode!\n");
+				iput(inode);
+				inode = NULL;
+			}
+		}
+	}
+	return inode;
+}
+
+
+/* 
+ * if group is NULL: return the global index for this node
+ * if group is non NULL: return the index within the group of this node
+ *
+ * NOTE: currently getting the group index is slow
+ *       will need to change this somehow
+ */
+u16 nm_this_node(struct inode *group)
+{
+	struct inode *inode = NULL;
+	struct dentry *child = NULL;
+	u16 node_num = NM_MAX_NODES;	/* NM_MAX_NODES doubles as "not found" */
+
+	/* our own node entry is keyed by the utsname captured at load time */
+       	inode = nm_get_node_by_name(nm_nodename);
+	if (inode && inode->u.generic_ip) {
+		if (group)
+			node_num = nm_get_group_index(group, inode, &child);
+		else 
+			node_num = nm_get_node_global_index(inode);
+
+	}
+	/* NOTE(review): relies on iput()/dput() tolerating NULL */
+	iput(inode);
+	dput(child);
+	//printk("for group=%p, this node is %u\n", group, node_num);
+	return node_num;
+}
+
+/* slow */
+/* slow */
+/* linear-scan the group directory's children for the dentry aliasing
+ * 'inode' and parse its name ("000".."254") as the slot number.  returns
+ * the slot, or NM_MAX_NODES on failure; on success *child gets an extra
+ * reference to the slot dentry which the caller must dput(). */
+static u16 nm_get_group_index(struct inode *group, struct inode *inode, struct dentry **child)
+{
+	struct dentry *tmp = NULL, *parent = NULL;
+	u16 slot_num = NM_MAX_NODES;
+	struct list_head *iter;
+	char tmpname[6];
+	char *err;
+
+	*child = NULL;
+	parent = NULL;
+	if (list_empty(&group->i_dentry))
+		goto leave;
+	parent = dget(list_entry(group->i_dentry.next, struct dentry, d_alias));
+	if (!parent)
+		goto leave;
+		
+	/* child list walk requires dcache_lock; ref the match before dropping it */
+	spin_lock(&dcache_lock);
+	list_for_each(iter, &parent->d_subdirs) {
+		tmp = list_entry(iter, struct dentry, d_child);
+		if (tmp->d_inode == inode)
+			break;
+		tmp = NULL;
+	}
+	if (tmp)
+		dget_locked(tmp);
+	spin_unlock(&dcache_lock);
+
+	/* slot names are at most 3 digits */
+	if (!tmp || tmp->d_name.len > 3)
+		goto leave;
+	strncpy(tmpname, tmp->d_name.name, tmp->d_name.len);
+	tmpname[tmp->d_name.len] = '\0';
+	err=NULL;
+	slot_num = simple_strtoul(tmpname, &err, 10);
+	
+	if (*err != '\0')
+		slot_num = NM_MAX_NODES;  // error
+	else
+		*child = dget(tmp);  // done, get extra ref for child
+		
+leave:
+	dput(parent);
+	dput(tmp);
+
+	return slot_num;
+}
+
+/* per-dlm nm setup: currently a no-op placeholder */
+int nm_init(dlm_ctxt *dlm)
+{
+	return 0;
+}
+
+/* register 'func' to be invoked on nm events of the given type.
+ * returns 0, -EINVAL for a bad type, or -ENOMEM. */
+int nm_register_callback(int type, void (*func)(void *, void *, u16))
+{
+	nm_callback_func *cb;
+
+	if (type < NM_NODE_ADD_CB || type > NM_GROUP_NODE_DEL_CB)
+		return -EINVAL;
+	cb = kmalloc(sizeof(nm_callback_func), GFP_KERNEL);
+	if (cb == NULL)
+		return -ENOMEM;
+	memset(cb, 0, sizeof(nm_callback_func));
+	cb->func = func;
+	spin_lock(&nm_cb_lock);
+	list_add_tail(&cb->list, &nm_callbacks[type]);
+	spin_unlock(&nm_cb_lock);
+	return 0;
+}
+
+#warning need to change nm callbacks to be like hb callbacks... no locks when calling.
+/* remove the first registration of 'func' for the given event type.
+ * returns 0 on success, -EINVAL if not found or the type is bad. */
+int nm_unregister_callback(int type, void (*func)(void *, void *, u16))
+{
+	struct list_head *pos, *tmp;
+	nm_callback_func *cb;
+	int ret = -EINVAL;
+
+	if (type < NM_NODE_ADD_CB || type > NM_GROUP_NODE_DEL_CB)
+		return -EINVAL;
+
+	spin_lock(&nm_cb_lock);
+	list_for_each_safe(pos, tmp, &nm_callbacks[type]) {
+		cb = list_entry(pos, nm_callback_func, list);
+		if (cb->func != func)
+			continue;
+		list_del(&cb->list);
+		kfree(cb);
+		ret = 0;
+		break;
+	}
+	spin_unlock(&nm_cb_lock);
+	return ret;
+}
+
+
+
+/* invoke every callback registered for 'type'.  NOTE: callbacks run with
+ * nm_cb_lock (a spinlock) held, so they must not sleep -- see the
+ * #warning above about converting to hb-style lock-free invocation. */
+static void nm_do_callbacks(int type, void *ptr1, void *ptr2, u16 idx)
+{
+	struct list_head *iter;
+	nm_callback_func *f;
+	
+	spin_lock(&nm_cb_lock);
+	list_for_each(iter, &nm_callbacks[type]) {
+		f = list_entry (iter, nm_callback_func, list);
+		(f->func) (ptr1, ptr2, idx);
+	}
+	spin_unlock(&nm_cb_lock);
+}
+
+
+/* remove the /proc entries; deepest first.  remove_proc_entry tolerates
+ * entries that were never created. */
+static void nm_teardown(void)
+{
+	remove_proc_entry("cluster/nm", NULL);
+	remove_proc_entry("cluster", NULL);
+}
+
+/* reset a cluster descriptor to its pristine DOWN state and (re)initialize
+ * the global per-event callback lists */
+static void nm_init_cluster(nm_cluster *cluster)
+{
+	int type;
+
+	memset(cluster, 0, sizeof(nm_cluster));
+	cluster->state = NM_CLUSTER_DOWN;
+	spin_lock_init(&cluster->bitmap_lock);
+
+	for (type = NM_NODE_ADD_CB; type <= NM_GROUP_NODE_DEL_CB; type++)
+		INIT_LIST_HEAD(&nm_callbacks[type]);
+}
+
+
+
+
+
+/*----------------------------------------------------------------------------*/
+/*
+ *	populating the filesystem.
+ */
+static int nm_fill_super(struct super_block * sb, void * data, int silent)
+{
+	int ret, sz;
+	TA_write_ops *ops;
+	/* the three transaction files; indices match the NM_* enum */
+	static struct tree_descr nm_files[] = {
+		[NM_Cluster] = {".cluster", &transaction_ops, S_IWUSR},
+		[NM_Node] = {".node", &transaction_ops, S_IWUSR},
+		[NM_Group] = {".group", &transaction_ops, S_IWUSR},
+		/* last one */ {""}
+	};
+	
+	/* write-op dispatch table, sized to cover every tree_descr slot */
+	sz = sizeof(nm_files) / sizeof(struct tree_descr);
+	ops = kmalloc(sizeof(TA_write_ops) + (sz * sizeof(TA_write_op *)), GFP_KERNEL);
+	if (!ops)
+		return -ENOMEM;
+
+	memset(ops, 0, sizeof(TA_write_ops) + (sz * sizeof(TA_write_op *)));
+	ops->num_ops = sz;
+	ops->write_op[NM_Cluster] = write_cluster;
+	ops->write_op[NM_Node] = write_node;
+	ops->write_op[NM_Group] = write_group;
+
+	single_sb = NULL;
+	printk("calling simple_fill_super...\n");
+	ret = simple_fill_super(sb, 0x98675309, nm_files);
+	if (ret >= 0) {
+		/* publish the dispatch table and remember the (single) sb */
+		TA_GENERIC_SB_MEMBER(sb) = ops;
+		single_sb = sb;
+	} else {
+		kfree(ops);
+	}
+	return ret;
+}
+
+/* 2.4-style read_super entry point: adapt nm_fill_super's errno return to
+ * the sb-or-NULL convention */
+static struct super_block *nm_read_super (struct super_block *sb, void *data, int silent)
+{
+	printk("welcome to nm_read_super!!!\n");
+	if (nm_fill_super(sb, data, silent) < 0)
+		return NULL;
+	return sb;
+}
+
+
+static DECLARE_FSTYPE (nm_fs_type, "nm", nm_read_super, FS_SINGLE|FS_LITTER);
+
+/* module init: capture the node name, build the IP hash, reset the cluster
+ * descriptor, create the /proc/cluster/nm directories and register the nm
+ * filesystem type.
+ *
+ * FIX: the old code leaked nm_nodename when the IP-hash allocation failed,
+ * and leaked both the nodename and the hash page when register_filesystem
+ * failed.  unwind allocations on every error path. */
+static int __init init_nm(void)
+{
+	int retval;
+
+	nm_nodename = kmalloc(strlen(system_utsname.nodename) + 1, GFP_KERNEL);
+	if (nm_nodename==NULL) {
+		printk("could not allocate a few bytes for nodename!\n");
+		return -ENOMEM;
+	}
+	strcpy(nm_nodename, system_utsname.nodename);
+	printk("loading nm module: nodename is %s\n", nm_nodename);
+
+	if (nm_init_ip_hash() < 0) {
+		printk("failed to allocate node IP hash\n");
+		retval = -ENOMEM;
+		goto out_free_nodename;
+	}
+
+	nm_init_cluster(&cluster);
+
+	if (proc_mkdir("cluster", 0)) {
+		if (proc_mkdir("cluster/nm", 0)) {
+		}
+	}
+	printk("calling register_filesystem\n");
+	retval = register_filesystem(&nm_fs_type);
+	printk("done calling register_filesystem: ret=%d\n", retval);
+	if (retval) {
+		nm_teardown();
+		goto out_free_hash;
+	}
+	return 0;
+
+out_free_hash:
+	nm_destroy_ip_hash();
+out_free_nodename:
+	kfree(nm_nodename);
+	nm_nodename = NULL;
+	return retval;
+}
+
+/* module exit: remove /proc entries, unregister the fs type, then free
+ * the IP hash page and the cached node name */
+static void __exit exit_nm(void)
+{
+	nm_teardown();
+	unregister_filesystem(&nm_fs_type);
+	nm_destroy_ip_hash();
+	kfree(nm_nodename);
+	printk("unloading nm module\n");
+}
+
+
+
+
+MODULE_LICENSE("GPL");
+module_init(init_nm)
+module_exit(exit_nm)

Added: trunk/cluster/nodemanager.h
===================================================================
--- trunk/cluster/nodemanager.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/nodemanager.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,252 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * nodemanager.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_NODEMANAGER_H
+#define CLUSTER_NODEMANAGER_H
+
+
+
+/* placeholder context; currently holds no real state */
+struct _nm_ctxt
+{
+	int dummy;
+};
+
+#define NM_MAX_IFACES            2
+#define NM_MAX_NODES             255
+#define NM_INVALID_SLOT_NUM      255
+
+/* host name, group name, cluster name all 64 bytes */
+#define NM_MAX_NAME_LEN          64    // __NEW_UTS_LEN
+
+
+/* fixed inode-number bases for group and node inodes in the nm fs */
+#define NM_GROUP_INODE_START    200000
+#define NM_NODE_INODE_START     100000
+
+/* cluster state (nm_cluster.state) */
+enum {
+	NM_CLUSTER_DOWN=0,
+	NM_CLUSTER_UP
+};
+
+/* group state (nm_group_inode_private.state) */
+enum {
+	NM_GROUP_NOT_READY=0,
+	NM_GROUP_READY
+};
+
+/* well-known inode numbers for the nm fs top-level entries */
+enum {
+	NM_Root = 1,
+	NM_Cluster,
+	NM_Node,
+	NM_Group,
+};
+
+
+
+
+/* one network interface of a node; port and addresses are stored in
+ * network byte order */
+typedef struct _nm_network_iface
+{
+	u16 ip_port;			/* for simplicity, just define exactly one port for this if */
+	u16 ip_version;
+	union {
+		u32 ip_addr4;		/* IPv4 address in NBO */
+		u32 ip_addr6[4];	/* IPv6 address in NBO */
+	} addr_u;
+} nm_network_iface;
+
+/* static description of one cluster node */
+typedef struct _nm_node_info 
+{
+	u16 node_num;
+	char node_name[NM_MAX_NAME_LEN+1];
+	nm_network_iface ifaces[NM_MAX_IFACES];
+} nm_node_info;
+
+
+/* cluster-wide state; bitmap_lock protects both bitmaps below */
+typedef struct _nm_cluster
+{
+	char name[NM_MAX_NAME_LEN+1];
+	int state;
+	spinlock_t bitmap_lock;
+	u32 group_bitmap[8];
+	u32 node_bitmap[8];
+} nm_cluster;
+
+
+/* per-group inode payload; state is NM_GROUP_{NOT_READY,READY} and
+ * bitmap_lock protects slot_bitmap */
+typedef struct _nm_group_inode_private
+{
+	struct inode *inode;
+	struct list_head net_list;
+	struct list_head disk_list;
+	cluster_disk disk;
+	int state;
+	spinlock_t bitmap_lock;
+	u32 slot_bitmap[8];
+} nm_group_inode_private;
+
+#ifdef __KERNEL__
+/* TODO: move this */
+#define NET_FLAG_CREATING_SOCKET   0x00000001
+/* per-node live networking state; sock_lock protects sock and flags */
+typedef struct _net_inode_private
+{
+	struct socket *sock;
+	wait_queue_t sleep;
+	spinlock_t sock_lock;
+	struct list_head handlers;
+	struct list_head list;
+	int flags;
+} net_inode_private;
+
+/* per-node inode payload: static node info plus live net state */
+typedef struct _nm_node_inode_private
+{
+	struct inode *inode;
+	nm_node_info node;
+	struct list_head ip_hash;
+	net_inode_private net;
+} nm_node_inode_private;
+#endif
+
+/* transaction file nm_op stuff */
+
+#define NM_OP_MAGIC      0xbeaf
+/* opcodes accepted through the nm transaction file */
+enum {
+	NM_OP_CREATE_CLUSTER=123,
+	NM_OP_DESTROY_CLUSTER,
+	NM_OP_NAME_CLUSTER,
+	NM_OP_ADD_CLUSTER_NODE,
+	NM_OP_GET_CLUSTER_NUM_NODES,
+	NM_OP_GET_NODE_INFO,
+	NM_OP_CREATE_GROUP,
+	NM_OP_GET_GROUP_INFO,
+	NM_OP_ADD_GROUP_NODE,
+	NM_OP_GET_GLOBAL_NODE_NUM
+};
+
+/* describes a node joining (or leaving) a group */
+typedef struct _nm_group_change
+{
+	u16 group_num;
+	u16 node_num;
+	u16 slot_num;
+	char disk_uuid[CLUSTER_DISK_UUID_LEN+1];
+	char name[NM_MAX_NAME_LEN+1];
+} nm_group_change;
+
+/* one request written to the transaction file; magic must be
+ * NM_OP_MAGIC and opcode one of the NM_OP_* values above */
+typedef struct _nm_op
+{
+	u16 magic;
+	u16 opcode;
+	union {
+		u16 index;
+		char name[NM_MAX_NAME_LEN+1];
+		nm_node_info node;
+		nm_group_change gc;
+	} arg_u;
+} nm_op;
+
+
+/* callback stuff */
+
+/* event types that can be registered via nm_register_callback() */
+enum {
+	NM_NODE_ADD_CB = 0,
+	NM_NODE_DEL_CB,
+	NM_GROUP_ADD_CB,
+	NM_GROUP_DEL_CB,
+	NM_GROUP_NODE_ADD_CB,
+	NM_GROUP_NODE_DEL_CB,
+	NM_NUM_CB
+};
+
+typedef void (nm_cb_func)(void *, void *, u16);
+
+/* one registered callback, linked on a per-type list */
+typedef struct _nm_callback_func
+{
+	struct list_head list;
+	nm_cb_func *func;
+	//void (*func)(void *, void *, u16);
+} nm_callback_func;
+
+
+
+
+u16 nm_this_node(struct inode *group);
+int nm_init(struct _dlm_ctxt *dlm);
+nm_cluster * nm_get_cluster(void);
+int nm_register_callback(int type, void (*func)(void *, void *, u16));
+int nm_unregister_callback(int type, void (*func)(void *, void *, u16));
+int nm_get_group_num_nodes(struct inode *group);
+int nm_get_group_max_slots(struct inode *group);
+int nm_make_group_ready(struct inode *group);
+void * nm_iterate_group_disk_slots(struct inode *group, int *idx);
+int nm_remove_node_from_group(struct inode *group, struct inode *node);
+int nm_create_group(char *buf, nm_op *data);
+int nm_add_node_to_group(char *buf, nm_op *data);
+
+#ifdef __KERNEL__
+
+
+struct inode * nm_get_group_by_num(u16 group_num);
+struct inode * nm_get_node_by_num(u16 node_num);
+struct inode * __nm_get_node_by_name(char *node_name, int dir);
+struct inode * nm_get_node_by_ip(u32 addr);
+struct inode * nm_get_group_node_by_index(struct inode *group, u16 index);
+
+/* look up a node inode by name; presumably returns a referenced inode
+ * the caller must iput() -- TODO confirm in nodemanager.c */
+static inline struct inode * nm_get_node_by_name(char *node_name)
+{
+	return __nm_get_node_by_name(node_name, 0);
+}
+/* look up a group inode by name (dir=1 selects the group namespace) */
+static inline struct inode * nm_get_group_by_name(char *group_name)
+{
+	return __nm_get_node_by_name(group_name, 1);
+}
+
+
+/* map a node inode back to its global node number */
+static inline int nm_get_node_global_index(struct inode *node)
+{
+	return (node->i_ino - NM_NODE_INODE_START);
+}
+/* map a group inode back to its global group number */
+static inline int nm_get_group_global_index(struct inode *group)
+{
+	return (group->i_ino - NM_GROUP_INODE_START);
+}
+#endif
+
+/* true if ino falls in the node or group inode ranges; note both
+ * ranges span NM_MAX_NODES entries */
+static inline int nm_valid_ino(int ino)
+{
+#if 0
+	// these should never be referred to in kernel
+	if (ino >= NM_Cluster && ino <= NM_Group)
+		return 1;
+#endif
+	if (ino >= NM_NODE_INODE_START &&
+	    ino < NM_NODE_INODE_START + NM_MAX_NODES)
+		return 1;
+	if (ino >= NM_GROUP_INODE_START &&
+	    ino < NM_GROUP_INODE_START + NM_MAX_NODES)
+		return 1;
+	return 0;
+}
+
+
+	
+#endif /* CLUSTER_NODEMANAGER_H */

Added: trunk/cluster/tcp.c
===================================================================
--- trunk/cluster/tcp.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/tcp.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,1614 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * tcp.c
+ *
+ * tcp network stuff
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+#include "warning_hack.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/proc_fs.h>
+#include <linux/file.h>
+
+#include <asm/uaccess.h>
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+
+//#if 0
+#define netprintk(x, arg...)    printk("(%d) " x, current->pid, ##arg)
+#define netprintk0(x)           printk("(%d) " x, current->pid)
+//#else
+#if 0
+#define netprintk(x, arg...)    
+#define netprintk0(x)           
+#endif
+
+struct socket *recv_sock = NULL;
+static u16 ip_version, ip_port;
+static void *net_junk_buf = NULL;
+static struct inode *net_inode = NULL;
+static u16 net_node_num;
+
+char *gsd_buf = NULL;
+char *gsd_handler_buf = NULL;
+
+
+static spinlock_t net_handler_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t net_list_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t net_status_lock = SPIN_LOCK_UNLOCKED;
+static LIST_HEAD(net_handlers);
+static LIST_HEAD(net_recv_list);
+static LIST_HEAD(net_dispatch_list);
+static LIST_HEAD(net_status_list);
+
+static DECLARE_WAIT_QUEUE_HEAD(net_disp_thread_wait_queue);
+static DECLARE_WAIT_QUEUE_HEAD(net_recv_thread_wait_queue);
+static int net_recv_pid = -1;
+static struct task_struct *net_recv_task = NULL;
+static struct completion net_recv_complete;
+
+
+
+/////////////////////
+static void net_shutdown(void);
+static int net_startup(void);
+static int __init net_driver_entry (void);
+static int net_init_driver(void);
+static void __exit net_driver_exit (void);
+static void net_remove_handlers(void);
+static int net_check_message_valid(net_msg *msg, u32 len);
+static void net_dump_and_close_sock(struct socket *sock, struct inode *inode);
+static void net_dump_msg(struct socket *sock, struct inode *inode);
+static int net_recv_message_header(net_msg *hdr, struct socket *sock);
+static int net_init_tcp_recv_sock(void);
+static int net_receive_thread(void *data);
+static int net_receive(void);
+static int net_accept_tcp_connections(void);
+static void net_release_tcp_sock(void);
+static int net_dispatch_message(struct inode *inode, struct socket *sock, net_msg *hdr, net_msg_handler *hnd);
+static int net_ioctl (struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg);
+
+int gsd_message_action(gsd_message *g);
+int gsd_message_handler(net_msg *msg, u32 len, void *data);
+void gsd_teardown(void);
+int gsd_setup(void);
+
+
+//////////////////////
+
+
+
+
+/* use if already holding net_handler_lock */
+static inline void __net_get_handler(net_msg_handler *nmh)
+{
+	atomic_inc(&nmh->refcnt);
+}
+
+static inline void net_get_handler(net_msg_handler *nmh)
+{
+	spin_lock(&net_handler_lock);
+	__net_get_handler(nmh);
+	spin_unlock(&net_handler_lock);
+}
+
+
+/* use if already holding net_handler_lock */
+static inline void __net_put_handler(net_msg_handler *nmh)
+{
+	atomic_dec(&nmh->refcnt);
+	if (!atomic_read(&nmh->refcnt)) {
+		if (net_handler_in_use(nmh))
+			netprintk0("EEEEK! killing inuse handler! bugbug!\n");
+		kfree(nmh);
+	}
+}
+
+static inline void net_put_handler(net_msg_handler *nmh)
+{
+	if (atomic_dec_and_lock(&nmh->refcnt, &net_handler_lock)) {
+		if (net_handler_in_use(nmh))
+			netprintk0("EEEEK! killing inuse handler! bugbug!\n");
+		kfree(nmh);
+		spin_unlock(&net_handler_lock);
+	}
+}
+
+
+
+DECLARE_MUTEX(net_state_lock);
+u32 net_driver_state = NET_DRIVER_UNINITED;
+u32 net_num_dispatched = 0;
+
+
+/*
+ * net_driver_entry()
+ *
+ * Driver entry point. Called on insmod.
+ */
+static int __init net_driver_entry (void)
+{
+	struct proc_dir_entry *de;
+	de = proc_mkdir("cluster/net", 0);
+	if (!de)
+		return -1;
+	/* NOTE(review): this pokes an ioctl into the proc entry's shared
+	 * file_operations instead of installing private fops -- verify no
+	 * other proc dir is affected by this write */
+	de->proc_fops->ioctl = net_ioctl;
+	
+	netprintk0("Loaded net Driver module\n");
+	return 0;
+}				/* net_driver_entry */
+
+/* ioctl entry for /proc/cluster/net: driver activation/state queries
+ * and the two group-service (GSD) operations.  Returns 0 or a
+ * negative errno. */
+static int net_ioctl (struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	net_ioc data;
+	gsd_ioc gsd_data;
+	int ret = 0;
+	gsd_message g;
+	int response = 0;
+	struct inode *to = NULL;
+	struct file *file = NULL;
+
+	if (_IOC_TYPE (cmd) != NET_IOC_MAGIC) {
+		ret = -ENOTTY;
+		goto exit_ioctl;
+	}
+
+	switch (cmd) {
+	    case NET_IOC_ACTIVATE:
+		    memset(&data, 0, sizeof(net_ioc));
+		    down(&net_state_lock);
+		    data.status = net_driver_state;
+		    if (net_driver_state == NET_DRIVER_UNINITED) {
+			    ret = net_init_driver();
+			    if (ret < 0) {
+				    netprintk("error trying to activate net driver: %d\n", ret);
+				    data.status = NET_DRIVER_UNINITED;
+			    } else {
+				    netprintk0("activated net driver!\n");
+				    net_driver_state = data.status = NET_DRIVER_READY;
+			    }
+		    }
+		    up(&net_state_lock);
+
+		    /* fix: copy_to_user returns the number of bytes NOT
+		     * copied, not an errno; translate failure to -EFAULT
+		     * instead of returning a positive residue */
+		    if (copy_to_user ((net_ioc *) arg, &data,
+				      sizeof (net_ioc)))
+			    ret = -EFAULT;
+		    break;
+	    case NET_IOC_GETSTATE:
+		    memset(&data, 0, sizeof(net_ioc));
+		    down(&net_state_lock);
+		    data.status = net_driver_state;
+		    up(&net_state_lock);
+		    if (copy_to_user ((net_ioc *) arg, &data,
+				      sizeof (net_ioc)))
+			    ret = -EFAULT;
+		    break;
+
+	    case GSD_IOC_CREATE_GROUP:
+		    memset(&gsd_data, 0, sizeof(gsd_ioc));
+		    /* fix: the copy_from_user result was ignored; bail out
+		     * rather than acting on uninitialized stack data */
+		    if (copy_from_user(&gsd_data, (gsd_ioc *)arg, sizeof(gsd_ioc))) {
+			    ret = -EFAULT;
+			    break;
+		    }
+		    /* fix: namelen comes from userspace; bound it before
+		     * memcpy'ing into the fixed-size g.name buffer */
+		    if ((unsigned)gsd_data.namelen > sizeof(g.name)) {
+			    ret = -EINVAL;
+			    break;
+		    }
+
+		    file = fget(gsd_data.fd); 
+		    if (!file || !file->f_dentry || !file->f_dentry->d_inode) { 
+			    ret = -EINVAL;
+			    break; 
+		    } 
+		    to = file->f_dentry->d_inode;
+
+		    g.action = GSD_ACTION_ADD_GROUP;
+		    g.from = net_node_num;
+		    g.namelen = gsd_data.namelen;
+		    memcpy(g.name, gsd_data.name, gsd_data.namelen);
+
+		    if (to == net_inode) { 
+			    /* create the group locally */
+			    ret = gsd_message_action(&g);
+		    } else { 
+			    /* create the group on remote node */
+			    ret = net_send_message(GSD_MESSAGE, 0, &g, sizeof(g), to, &response); 
+			    if (ret == 0) 
+				    ret = response;
+		    }
+
+		    memset(&gsd_data, 0, sizeof(gsd_ioc));
+		    gsd_data.status = ret;
+		    if (copy_to_user((gsd_ioc *)arg, &gsd_data, sizeof(gsd_ioc)))
+			    ret = -EFAULT;
+		    break;
+
+	    case GSD_IOC_ADD_GROUP_NODE:
+		    memset(&gsd_data, 0, sizeof(gsd_ioc));
+		    if (copy_from_user(&gsd_data, (gsd_ioc *)arg, sizeof(gsd_ioc))) {
+			    ret = -EFAULT;
+			    break;
+		    }
+		    if ((unsigned)gsd_data.namelen > sizeof(g.name)) {
+			    ret = -EINVAL;
+			    break;
+		    }
+
+		    file = fget(gsd_data.fd); 
+		    if (!file || !file->f_dentry || !file->f_dentry->d_inode) { 
+			    ret = -EINVAL;
+			    break; 
+		    } 
+		    to = file->f_dentry->d_inode;
+
+		    g.action = GSD_ACTION_ADD_GROUP_NODE;
+		    g.from = net_node_num;
+		    g.namelen = gsd_data.namelen;
+		    memcpy(g.name, gsd_data.name, gsd_data.namelen);
+
+		    if (to == net_inode) {
+			    /* add the node locally */
+			    ret = gsd_message_action(&g);
+		    } else { 
+			    /* add the node on the remote node */
+			    ret = net_send_message(GSD_MESSAGE, 0, &g, sizeof(g), to, &response); 
+			    if (ret == 0) 
+				    ret = response;
+		    }
+		    memset(&gsd_data, 0, sizeof(gsd_ioc));
+		    gsd_data.status = ret;
+		    if (copy_to_user((gsd_ioc *)arg, &gsd_data, sizeof(gsd_ioc)))
+			    ret = -EFAULT;
+		    break;
+	    default:
+		    ret = -ENOTTY;
+		    break;
+	}
+
+exit_ioctl:
+
+	/* fget() succeeded at most once per call; drop the reference */
+	if (file)
+		fput(file);
+
+	return ret;
+}				/* net_ioctl */
+
+/* bring up the network layer: resolve the local node's inode and
+ * interface, start the receive thread, register the GSD handler.
+ * Returns 0 on success, -1 on failure. */
+static int net_init_driver(void)
+{
+	nm_node_info *info;
+	nm_node_inode_private *priv;
+
+	/* get the global node number for this node */
+	net_node_num = nm_this_node(NULL);
+	if (net_node_num >= NM_MAX_NODES) {
+		netprintk0("local nm node number not initialized!\n");
+		return -1;
+	}
+	net_inode = nm_get_node_by_num(net_node_num);
+	if (!net_inode) {
+		netprintk0("local nm node inode not initialized!\n");
+		return -1;
+	}
+	priv = (nm_node_inode_private *)net_inode->u.generic_ip;
+	if (!priv) {
+		iput(net_inode);
+		net_inode = NULL;
+		netprintk0("local nm node info not initialized!\n");
+		return -1;
+	}
+	info = &priv->node;
+	ip_version = info->ifaces[0].ip_version;
+	ip_port = info->ifaces[0].ip_port;
+
+	if (net_startup() < 0)
+		goto error;
+
+	if (gsd_setup() < 0)
+		goto error;
+
+	return 0;
+
+error:
+	/* fix: on failure the caller leaves net_driver_state UNINITED, so
+	 * net_driver_exit() would never iput this inode -- drop the
+	 * reference here instead of leaking it */
+	iput(net_inode);
+	net_inode = NULL;
+	return -1;
+}				/* net_init_driver*/
+
+
+/*
+ * net_driver_exit()
+ *
+ * Called on rmmod
+ */
+static void __exit net_driver_exit (void)
+{
+	down(&net_state_lock);
+	if (net_driver_state == NET_DRIVER_READY) {
+		netprintk0("shutting down network\n");
+		net_shutdown();
+		netprintk0("removing all net driver handlers\n");
+		net_remove_handlers();
+		gsd_teardown();
+		/* drop the local node inode taken in net_init_driver() */
+		if (net_inode)
+			iput(net_inode);
+		net_driver_state = NET_DRIVER_UNINITED;
+	}
+	up(&net_state_lock);
+	remove_proc_entry("cluster/net", NULL);
+	netprintk0("Unloading net driver module\n");
+	return;
+}				/* net_driver_exit */
+
+
+/* allocate the scratch page and spawn the receive thread.
+ * Returns 0 or a negative errno. */
+static int net_startup(void)
+{
+	net_recv_pid = -1;
+	net_recv_task = NULL;
+	init_completion (&net_recv_complete);
+
+	net_junk_buf = (void *) __get_free_page(GFP_KERNEL);
+	if (!net_junk_buf)
+		return -ENOMEM;
+
+	netprintk0("starting net receive thread...\n");
+	net_recv_pid = kernel_thread (net_receive_thread, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+	if (net_recv_pid < 0) {
+		netprintk("unable to launch net receive thread, error=%d", net_recv_pid);
+		/* fix: the thread never started, so calling net_shutdown()
+		 * here would send_sig to a NULL net_recv_task and block on
+		 * a completion nobody would ever complete; just undo our
+		 * own allocation and fail */
+		free_page((unsigned long)net_junk_buf);
+		net_junk_buf = NULL;
+		return -EINVAL;
+	}
+
+	netprintk0("net thread running...\n");
+	return 0;
+}
+
+/* stop the receive thread (if it ever started) and release the
+ * scratch page */
+static void net_shutdown(void)
+{
+	/* fix: guard against being called before the receive thread was
+	 * spawned -- send_sig on a NULL task would oops and the
+	 * completion would never be signalled */
+	if (net_recv_task) {
+		netprintk ("waiting for net thread to exit....");
+		send_sig (SIGINT, net_recv_task, 0);
+		wait_for_completion (&net_recv_complete);
+		netprintk ("net thread exited\n");
+	}
+	if (net_junk_buf) {
+		free_page((unsigned long)net_junk_buf);
+		net_junk_buf = NULL;
+	}
+}
+
+
+/* receive thread main loop: accepts new connections on recv_sock and
+ * drains readable sockets, until woken by a signal (SIGINT sent from
+ * net_shutdown()) */
+static int net_receive_thread(void *data)
+{
+	int status = 0;
+	DECLARE_WAITQUEUE(main_wait, current);
+
+	util_daemonize ("netrecv", strlen("netrecv"), 1);
+	net_recv_task = current;
+
+	status = net_init_tcp_recv_sock();
+       	if (status >= 0 && recv_sock) {
+		add_wait_queue_exclusive(recv_sock->sk->sleep, &main_wait);
+		while (1) {
+			status = 0;
+			if (recv_sock->sk->tp_pinfo.af_tcp.accept_queue)
+				status = net_accept_tcp_connections();
+			if (!list_empty(&net_recv_list))
+				status = net_receive();
+
+			/* sleep up to 20s; socket activity on recv_sock
+			 * wakes us earlier via the wait queue */
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(20*HZ);
+			current->state = TASK_RUNNING;
+
+			if (signal_pending(current)) {
+				netprintk0("net recv thread got signal!\n");
+				break;
+			}
+		}
+		remove_wait_queue(recv_sock->sk->sleep, &main_wait);
+	} else {
+		netprintk0("failed to initialize net_thread!\n");
+	}
+
+	/* Flush all scheduled tasks */
+	flush_scheduled_work();
+	net_release_tcp_sock();
+	net_recv_task = NULL;
+	complete (&net_recv_complete);
+	return 0;
+}
+
+typedef union _my_timing_t
+{
+	__u64 q;
+	__u32 lohi[2];
+} my_timing_t;
+
+
+/* placeholder validation -- currently accepts every message */
+static int net_check_message_valid(net_msg *msg, u32 len)
+{
+	return 1;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+/* for lack of a better place to do this */
+
+/* allocate the two GSD scratch pages and register the GSD_MESSAGE
+ * handler.  Returns 0 or a negative errno; on failure nothing is
+ * left allocated. */
+int gsd_setup()
+{
+	int ret;
+	gsd_buf = (char *) __get_free_page(GFP_KERNEL);
+	if (!gsd_buf)
+		return -ENOMEM;
+	/* need this stupidity until I can divorce the actual nm actions
+	 * from the output they send to their user buffer */
+	gsd_handler_buf = (char *) __get_free_page(GFP_KERNEL);
+	if (!gsd_handler_buf) {
+		/* fix: do not leak the first page on this error path */
+		free_page((unsigned long)gsd_buf);
+		gsd_buf = NULL;
+		return -ENOMEM;
+	}
+
+	ret = net_register_handler(GSD_MESSAGE, 0, 0, sizeof(gsd_message),
+				   gsd_message_handler, NULL, gsd_buf);
+	if (ret < 0) {
+		/* fix: unwind both pages if registration fails; setup
+		 * failure means gsd_teardown() will never run */
+		free_page((unsigned long)gsd_handler_buf);
+		free_page((unsigned long)gsd_buf);
+		gsd_handler_buf = NULL;
+		gsd_buf = NULL;
+	}
+
+	return ret;
+}
+
+/* release the pages allocated by gsd_setup(); free_page(0) is a no-op
+ * so this is safe even if setup never ran */
+void gsd_teardown()
+{
+	free_page((unsigned long)gsd_buf);
+	free_page((unsigned long)gsd_handler_buf);
+}
+
+/* net handler for GSD_MESSAGE: payload is a gsd_message.
+ * NOTE(review): len is not validated against sizeof(gsd_message) here;
+ * presumably the handler's registered max_len bounds it -- confirm */
+int gsd_message_handler(net_msg *msg, u32 len, void *data)
+{
+	return gsd_message_action((gsd_message *)msg->buf);
+}
+
+/* apply a group-service request, whether generated locally (ioctl) or
+ * received from a remote node.  Returns 0 on success, negative errno
+ * on bad input or unknown action. */
+int gsd_message_action(gsd_message *g)
+{
+	int ret;
+	nm_op op;
+	int namelen = g->namelen;
+	struct inode *node=NULL, *group=NULL;
+	char name[NM_MAX_NAME_LEN+1];
+	
+	if (namelen > NM_MAX_NAME_LEN)
+		return -EINVAL;
+	strncpy(name, g->name, namelen);
+	name[namelen] = '\0';
+	
+	memset(&op, 0, sizeof(op));
+	switch (g->action) {
+		case GSD_ACTION_ADD_GROUP:
+			/* group already exists: treat as success */
+			group = nm_get_group_by_name(name);
+			if (group) {
+				ret = 0;
+				break;
+			}
+			op.arg_u.gc.group_num = NM_INVALID_SLOT_NUM;
+			memcpy(op.arg_u.gc.name, name, namelen);
+			/* NOTE(review): the group name doubles as the disk
+			 * uuid here -- confirm that is intentional */
+			memcpy(op.arg_u.gc.disk_uuid, name, namelen);
+
+			ret = nm_create_group(gsd_handler_buf, &op);
+			if (ret >= 0)
+				ret = 0;
+			break;
+
+		case GSD_ACTION_ADD_GROUP_NODE:
+			group = nm_get_group_by_name(name);
+			if (!group) {
+				ret = -EINVAL;
+				break;
+			}
+			/* node already present: succeed only if it maps back
+			 * to the same global index */
+			node = nm_get_group_node_by_index(group, g->from);
+			if (node) {
+				ret = 0;
+				if (nm_get_node_global_index(node) != g->from)
+					ret = -EINVAL;
+				break;
+			}
+			op.arg_u.gc.group_num = nm_get_group_global_index(group);
+			op.arg_u.gc.node_num = g->from;
+			op.arg_u.gc.slot_num = g->from;
+			ret = nm_add_node_to_group(gsd_handler_buf, &op);
+			if (ret >= 0)
+				ret = 0;
+			break;
+		default:
+			ret = -EINVAL;
+			break;
+	}
+
+	/* drop inode references taken by the lookups above */
+	if (node)
+		iput(node);
+	if (group)
+		iput(group);
+	return ret;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+
+/* register a handler for (msg_type, key).  max_len is the largest
+ * payload accepted; buf must be a caller-owned buffer of at least that
+ * size (messages are received into it).  Returns 0, or -EINVAL on bad
+ * arguments, -ENOMEM, -EEXIST if already registered. */
+int net_register_handler(u32 msg_type, u32 key, int flags, u32 max_len, 
+			 net_msg_handler_func *func, void *data, void *buf)
+{
+	net_msg_handler *nmh, *found=NULL;
+	u32 packet_len = sizeof(net_msg) + max_len;
+
+	if (packet_len < NET_MIN_MSG_LEN || packet_len > NET_MAX_MSG_LEN) {
+		netprintk("max_len for message handler out of range: %u\n", 
+			max_len);
+		return -EINVAL;
+	}
+
+	/* if expecting any message payload, must pass a prealloced buffer */
+	if (!buf && max_len) {
+		netprintk("max_len > 0 (%u), but no buffer supplied!\n",
+		       max_len);
+		return -EINVAL;
+	}
+
+	if (!msg_type) {
+		netprintk("no message type provided: %u, %p\n", msg_type, func);
+		return -EINVAL;
+
+	}
+	if (!func) {
+		netprintk("no message handler provided: %u, %p\n",
+		       msg_type, func);
+		return -EINVAL;
+	}
+
+       	nmh = kmalloc(sizeof(net_msg_handler), GFP_KERNEL);
+	if (!nmh) {
+		return -ENOMEM;
+	}
+	memset(nmh, 0, sizeof(net_msg_handler));
+	nmh->func = func;
+	nmh->data = data;
+	nmh->msg_type = msg_type;
+	nmh->max_len = max_len;
+	nmh->key = key;
+	spin_lock_init(&nmh->lock);
+	atomic_set(&nmh->refcnt, 0);
+	if (max_len == 0) {
+		/* zero-payload messages receive straight into the header */
+		nmh->buf = &nmh->hdr;
+	} else {
+		nmh->buf = buf;
+	}
+	nmh->flags = flags;
+	INIT_LIST_HEAD(&nmh->list);
+	net_get_handler(nmh);
+
+	
+	/* add the new handler, checking for pre-existing */
+	spin_lock(&net_handler_lock);
+	found = net_lookup_handler(msg_type, key);
+	if (!found) {
+		list_add_tail(&nmh->list, &net_handlers);
+	} else {
+		spin_unlock(&net_handler_lock);
+		/* drop the reference the lookup took */
+		net_put_handler(found);
+		netprintk("message handler for type %u, key %u already exists!!!\n",
+		       msg_type, key);
+		/* this should destroy it */
+		net_put_handler(nmh);
+		return -EEXIST;
+	}
+	spin_unlock(&net_handler_lock);
+	return 0;
+}
+
+
+
+/* net_handler_lock should be held */
+/* find the handler registered for (msg_type, key); takes a reference
+ * on the handler, which the caller must drop with net_put_handler() */
+net_msg_handler * net_lookup_handler(u32 msg_type, u32 key)
+{
+	net_msg_handler *ret;
+	struct list_head *iter;
+
+	list_for_each(iter, &net_handlers) {
+		ret = list_entry(iter, net_msg_handler, list);
+		if (ret->msg_type == msg_type && ret->key == key) {
+			__net_get_handler(ret);
+			return ret;
+		}
+	}
+	return NULL;
+}
+
+
+
+/* allocate and fill a net_msg packet for (msg_type, key), validating
+ * len against the registered handler.  Caller kfree()s the result.
+ * Returns NULL on unknown type, bad length, or allocation failure. */
+net_msg * net_package_message(u32 msg_type, u32 key, void *data, u32 len)
+{
+	net_msg *ret = NULL;
+	net_msg_handler *handler = NULL;
+	u32 packet_len;
+
+	spin_lock(&net_handler_lock);
+	handler = net_lookup_handler(msg_type, key);
+	spin_unlock(&net_handler_lock);
+	
+	if (!handler) {
+		netprintk("no such message type: %u/%u\n", msg_type, key);
+		return NULL;
+	}
+	if (net_handler_msg_len_ok(handler, len)) {
+		netprintk("len for message type %u incorrect: %u, should be %u\n", 
+		       msg_type, len, handler->max_len);
+		goto done;
+	}
+	packet_len = len + sizeof(net_msg);
+	ret = kmalloc(packet_len, GFP_KERNEL);
+	if (!ret) {
+		netprintk("failed to allocate %u bytes for message!\n", packet_len);
+		goto done;
+	}
+	memset(ret, 0, packet_len);
+	ret->magic = NET_MSG_MAGIC;
+	ret->data_len = len;
+	ret->msg_type = msg_type;
+	ret->key = key;
+	if (len > 0)
+		memcpy(&(ret->buf[0]), data, len);
+
+done:
+	/* drop the lookup reference */
+	if (handler)
+		net_put_handler(handler);
+	return ret;
+}
+
+/* TODO Fix */
+/* stub: registered handlers are currently not freed at shutdown */
+static void net_remove_handlers(void)
+{
+	/* TODO: make an iterator in nm for running over each global inode
+	 * do I have this already?  then call destroy on each.  last put
+	 * will do the work.  doesnt matter if it's slow.  this is only
+	 * on shutdown... */
+}
+
+
+
+
+/*
+ * net_recv_tcp_msg()
+ *
+ */
+/* receive up to *packet_len bytes from the node's socket (or the one
+ * supplied); on success *packet_len is updated to the received length.
+ * Returns 0, -EBADF on shutdown, -EINVAL otherwise. */
+int net_recv_tcp_msg (struct inode *inode, struct socket *sock, void *data, u32 *packet_len)
+{
+	nm_node_inode_private *priv;
+	nm_node_info *node;
+	int status = -EINVAL, error;
+	mm_segment_t oldfs;
+	struct sockaddr_in sin;
+	struct iovec iov = { 
+		.iov_len = *packet_len, 
+		.iov_base = data 
+	};
+	struct msghdr msg = { 
+		.msg_control = NULL, 
+		.msg_controllen = 0, 
+		.msg_iovlen = 1, 
+		.msg_iov = &iov, 
+		.msg_name = (struct sockaddr *) &sin, 
+		.msg_namelen = sizeof (sin),
+       		.msg_flags = 0 
+	};
+
+
+	priv = (nm_node_inode_private *)inode->u.generic_ip;
+	node = &priv->node;
+	if (!sock) {
+		spin_lock(&priv->net.sock_lock);
+		/* TODO: sock refcounting... i think we can get/put the sk */
+		sock = priv->net.sock;
+		/* fix: the original returned -EINVAL here with sock_lock
+		 * still held; always drop the lock before bailing out */
+		spin_unlock(&priv->net.sock_lock);
+		if (!sock)
+			return -EINVAL;
+	}
+	
+	memset (&sin, 0, sizeof (sin));
+	oldfs = get_fs ();
+	set_fs (get_ds ());
+	error = sock_recvmsg (sock, &msg, *packet_len, msg.msg_flags);
+	set_fs (oldfs);
+
+	status = 0;
+	if (error < 0) {
+		if (error == -ERESTARTSYS) {
+			status = -EBADF;
+			netprintk ("Shutting down\n");
+		} else {
+			status = -EINVAL;
+			netprintk ("unable to recvmsg, error=%d\n", error);
+		}
+		goto bail;
+	} else {
+		*packet_len = iov.iov_len;
+		status = 0;
+		netprintk("woot.  recevied len=%d\n", *packet_len);
+		if (!net_check_message_valid(data, *packet_len)) {
+			netprintk0("eeeek bad net message!\n");
+			status = -EINVAL;
+		}
+	}
+
+	//netprintk ("Received packet from: %d.%d.%d.%d\n",
+	//		NIPQUAD (sin.sin_addr.s_addr));
+
+bail:
+	return status;
+}				/* net_recv_tcp_msg */
+
+
+/*
+ * net_send_tcp_msg()
+ *
+ */
+/* send packet_len bytes to the node's socket (or the one supplied).
+ * Returns 0 on success, -EINVAL on send failure or missing socket. */
+int net_send_tcp_msg (struct inode *inode, struct socket *sock, void *data, u32 packet_len)
+{
+	int status = 0, error;
+	struct sockaddr_in sin;
+	mm_segment_t oldfs;
+	nm_node_inode_private *priv;
+	nm_node_info *node;
+
+	priv = (nm_node_inode_private *)inode->u.generic_ip;
+	node = &priv->node;
+	if (!sock) {
+		spin_lock(&priv->net.sock_lock);
+		/* TODO: sock refcounting... i think we can get/put the sk */
+		sock = priv->net.sock;
+		spin_unlock(&priv->net.sock_lock);
+	}
+
+	oldfs = get_fs ();
+	netprintk("Sending msg to node=%u, name=%s\n", node->node_num, node->node_name);
+	memset (&sin, 0, sizeof (sin));
+	sin.sin_family = net_ip_version_to_family(node->ifaces[0].ip_version);
+	sin.sin_addr.s_addr = node->ifaces[0].addr_u.ip_addr4;
+	sin.sin_port = node->ifaces[0].ip_port;
+	
+
+	status = -EINVAL;
+	if (sock) {
+		struct iovec iov = {
+			.iov_base = data,
+			.iov_len = packet_len
+		};
+		struct msghdr msg = {
+			.msg_iov = &iov,
+			.msg_iovlen = 1,
+			.msg_control = NULL,
+			.msg_controllen = 0,
+			.msg_name = (struct sockaddr *) &sin,
+			.msg_namelen = sizeof (sin),
+			.msg_flags = 0
+		};
+		
+		status = 0;	
+		set_fs (get_ds ());
+		error = sock_sendmsg (sock, &msg, packet_len);
+		set_fs (oldfs);
+	
+		if (error < 0) {
+			netprintk ("unable to sendmsg, error=%d\n", error);
+			status = -EINVAL;
+		} 
+	}
+	if (status < 0)
+		netprintk ("bad status: %d\n", status);
+
+	/* fix: the original unconditionally reset status to 0 here, so
+	 * send failures and a missing socket were reported as success to
+	 * callers; propagate the real status instead */
+	return status;
+}				/* net_send_tcp_msg */
+
+static spinlock_t net_msg_num_lock = SPIN_LOCK_UNLOCKED;
+static u64 net_msg_num = 1;
+
+/*
+ * net_send_message
+ *
+ *   - this is probably the function you are looking for
+ *   - it will package up the message for you, verifying that
+ *       the message handler is there and the length is ok,
+ *       connect to the other node if there is not already a
+ *       socket for it, and optionally wait on a status return
+ *       from the other node 
+ *   - all you need prior to this call is to have inited the
+ *       net stuff, to have a valid inode for the node to contact 
+ *       in nm, and to have registered the message handler
+ */
+int net_send_message(u32 msg_type, u32 key, void *data, u32 len, struct inode *inode, int *status)
+{
+	int ret = 0, tmpret;
+	net_msg *msg = NULL;
+	net_msg_handler *handler = NULL;
+	u32 packet_len;
+	net_status_ctxt nsc;
+	wait_queue_t sleep;
+	nm_node_inode_private *priv = NULL;
+	net_inode_private *net = NULL;
+
+	if (!inode || !inode->u.generic_ip) {
+		netprintk0("bad inode, cannot send message\n");
+		return -EINVAL;
+	}
+	priv = (nm_node_inode_private *)inode->u.generic_ip;
+	net = &priv->net;
+	spin_lock(&net->sock_lock);
+	if (!net->sock) {
+		/* fix: the original fell through to a second spin_unlock
+		 * after already dropping the lock on this path (double
+		 * unlock of sock_lock) */
+		spin_unlock(&net->sock_lock);
+		ret = net_init_tcp_sock(inode);
+		if (!(ret == 0 || ret == -EEXIST)) {
+			netprintk0("failed to create socket!");
+			return -EINVAL;
+		}
+	} else
+		spin_unlock(&net->sock_lock);
+
+	spin_lock(&net_handler_lock);
+	handler = net_lookup_handler(msg_type, key);
+	spin_unlock(&net_handler_lock);
+	
+	if (!handler) {
+		netprintk("no such message type: %u/%u\n", msg_type, key);
+		return -EINVAL;
+	}
+
+	if (net_handler_msg_len_ok(handler, len)) {
+		netprintk("len for message type %u incorrect: %u, should be %u\n", 
+		       msg_type, len, handler->max_len);
+		ret = -EINVAL;
+		goto done;
+	}
+	packet_len = len + sizeof(net_msg);
+	msg = kmalloc(packet_len, GFP_KERNEL);
+	if (!msg) {
+		netprintk("failed to allocate %u bytes for message!\n", packet_len);
+		ret = -ENOMEM;
+		goto done;
+	}
+	memset(msg, 0, packet_len);
+	msg->magic = NET_MSG_MAGIC;
+	msg->data_len = len;
+	msg->msg_type = msg_type;
+	msg->key = key;
+	spin_lock(&net_msg_num_lock);
+	msg->msg_num = net_msg_num;
+	/* fix: actually advance the counter -- status returns are matched
+	 * by msg_num, so every in-flight message needs a distinct number */
+	if (net_msg_num == NET_MSG_NUM_MAX) {
+		printk("eek!  net_msg_num wrapping to 1 now...\n");
+		net_msg_num = 1;
+	} else
+		net_msg_num++;
+	spin_unlock(&net_msg_num_lock);
+	if (len > 0)
+		memcpy(&(msg->buf[0]), data, len);
+
+	/* does the caller want to wait for a simple status? */
+	if (status) {
+		msg->status = 1;
+
+		INIT_LIST_HEAD(&nsc.list);
+		init_waitqueue_head(&nsc.wq);
+		atomic_set(&nsc.woken, 0);
+		nsc.msg_num = msg->msg_num;
+		nsc.status = 0;
+		spin_lock(&net_status_lock);
+		list_add(&nsc.list, &net_status_list);
+		spin_unlock(&net_status_lock);
+
+		init_waitqueue_entry(&sleep, current);
+		spin_lock(&net->sock_lock);
+		if (!net->sock) {
+			spin_unlock(&net->sock_lock);
+			netprintk0("caller wanted status return but socket went away!\n");
+			/* fix: nsc lives on this stack frame; unlink it
+			 * before returning or net_status_list would point
+			 * at dead stack memory.  Also drop the handler
+			 * reference the original leaked here. */
+			spin_lock(&net_status_lock);
+			list_del(&nsc.list);
+			spin_unlock(&net_status_lock);
+			net_put_handler(handler);
+			kfree(msg);
+			return -EINVAL;
+		}
+		add_wait_queue(net->sock->sk->sleep, &sleep);
+		spin_unlock(&net->sock_lock);
+	}
+{
+	union {
+		u64 q;
+		u32 hilo[2];
+	} u1, u2;
+	rdtsc(u1.hilo[0], u1.hilo[1]);
+
+
+	ret = net_send_tcp_msg(inode, NULL, msg, packet_len);
+
+	rdtsc(u2.hilo[0], u2.hilo[1]);
+	netprintk("net_send_tcp_msg took %llu cycles\n", u2.q-u1.q);
+	if (status) {
+		if (ret >= 0) {
+			/* wait on other node's handler */
+			rdtsc(u1.hilo[0], u1.hilo[1]);
+			tmpret = util_wait_atomic_eq(&nsc.wq, &nsc.woken, 1, 0);
+			rdtsc(u2.hilo[0], u2.hilo[1]);
+			netprintk("waiting on status took %llu cycles\n", u2.q-u1.q);
+			*status = nsc.status;
+			netprintk("status return requested, status is %d\n", *status);
+			/* fix: remove from the wait queue we added to above
+			 * (net->sock's), not the listen socket recv_sock's.
+			 * NOTE(review): nsc is presumably unlinked from
+			 * net_status_list by net_do_status_return() before
+			 * the wakeup -- confirm, or this stack entry
+			 * dangles after return */
+			remove_wait_queue(net->sock->sk->sleep, &sleep);
+		} else {
+			netprintk("status return requested, and error returned from net_send_tcp_msg=%d\n", ret);
+			/* return bad status right away */
+			*status = ret;
+			/* fix: we never slept, so unwind both the status
+			 * context and the queued wait entry (the original
+			 * left both linked to dead stack memory) */
+			spin_lock(&net_status_lock);
+			list_del(&nsc.list);
+			spin_unlock(&net_status_lock);
+			remove_wait_queue(net->sock->sk->sleep, &sleep);
+		}
+	} else if (ret < 0) {
+		netprintk("no status return requested, but error returned from net_send_tcp_msg=%d\n", ret);
+	}
+}
+	
+done:
+	if (handler)
+		net_put_handler(handler);
+	if (msg)
+		kfree(msg);
+	return ret;
+}
+
+
+
+
+
+/*
+ * net_receive: receive from and dispatch all sockets with data pending
+ */
+static int net_receive(void)
+{
+	struct inode *inode;
+	struct list_head *iter, *tmpiter;
+	nm_node_inode_private *priv;
+	net_inode_private *net;
+	struct socket *sock;
+	struct sock *sk;
+	net_msg hdr;
+	net_msg_handler *hnd = NULL;
+	int err = 0;
+	int tmperr;
+	/* rdtsc timing scratch: two 32-bit halves read back as one u64 */
+	union {
+		u64 q;
+		u32 hilo[2];
+	} u1, u2, u3, u4, u5, u6;
+
+
+/* Walk net_recv_list; for each socket with queued data, pull it off the
+ * list (so only this pass services it), read and dispatch one message,
+ * re-add it, and restart the scan.  The restart is required because
+ * net_list_lock is dropped while the message is processed. */
+start_over:	
+	spin_lock(&net_list_lock);
+	list_for_each_safe(iter, tmpiter, &net_recv_list) {
+		net = list_entry(iter, net_inode_private, list);
+		priv = container_of(net, nm_node_inode_private, net);
+	       	inode = priv->inode;
+		sock = net->sock;
+		
+		if (!sock) {
+			//netprintk0("no socket yet....\n");
+			continue;
+		}
+
+		/* close and forget sockets that are no longer usable */
+		if (sock->sk->state != TCP_ESTABLISHED &&
+		    sock->sk->state != TCP_CLOSE_WAIT) {
+			netprintk0("kill it and continue\n");
+			net_dump_and_close_sock(sock, inode);
+			continue;
+		}
+	
+		sk = sock->sk;
+		if (skb_queue_empty(&sk->receive_queue)) {
+			//netprintk("queue empty for %lu\n", inode->i_ino);
+			continue;
+		}
+	
+			
+
+		/* take this socket off the list while we service it */
+		list_del(&net->list);
+		spin_unlock(&net_list_lock);
+	
+		/* peek at the header only; the payload is consumed later by
+		 * net_dispatch_message (or discarded by net_dump_msg) */
+		memset(&hdr, 0, sizeof(net_msg));
+		err = net_recv_message_header(&hdr, sock);
+		if (err < 0) {
+			netprintk0("failed to receive message!\n");
+			goto error;
+		}
+		netprintk("received message header... magic=%u type=%u key=%u\n", 
+			  hdr.magic, hdr.msg_type, hdr.key);
+
+		if (hdr.magic == NET_MSG_STATUS_MAGIC) {
+rdtsc(u1.hilo[0], u1.hilo[1]);
+			net_dump_msg(sock, inode);
+			/* special type for returning message status */
+rdtsc(u2.hilo[0], u2.hilo[1]);
+			net_do_status_return(hdr.msg_num, hdr.status);
+rdtsc(u3.hilo[0], u3.hilo[1]);
+printk("status return: net_dump_msg took %llu, net_do_status_return took %llu\n", u2.q-u1.q, u3.q-u2.q);
+			err = 0;
+			goto error;
+		} else if (hdr.magic != NET_MSG_MAGIC) {
+			netprintk("bad magic: %u\n", hdr.magic);
+			goto error;
+		}
+		
+		if (net_is_valid_error_type(hdr.msg_type)) {
+			/* do error handling */
+			netprintk("this is a standard error message: type=%d\n", hdr.msg_type);
+			if (hdr.msg_type == NET_ALREADY_CONNECTED) {
+				netprintk0("error: there is already a socket for this connection\n");
+			} else if (hdr.msg_type == NET_UNKNOWN_HOST) {
+				netprintk0("error: unknown host\n");
+			}
+			net_dump_msg(sock, inode);
+			err = 0;
+			goto error;
+		}
+
+		/* find a handler for it */
+		spin_lock(&net_handler_lock);
+		hnd = net_lookup_handler(hdr.msg_type, hdr.key);
+		spin_unlock(&net_handler_lock);
+		
+		if (!hnd) {
+			err = -EINVAL;
+			netprintk0("no handler for message.\n");
+			goto error;
+		}
+rdtsc(u1.hilo[0], u1.hilo[1]);
+		err = net_dispatch_message(inode, sock, &hdr, hnd);
+rdtsc(u2.hilo[0], u2.hilo[1]);
+printk("net_dispatch_message took %llu\n", u2.q-u1.q);
+
+		/* if node has requested status return, do it now */
+		if (hdr.status) {
+#ifdef BIG_NET_MSG
+			/* reflect the header back with src/dst swapped */
+			u16 n = hdr.src_node;
+			hdr.src_node = hdr.dst_node;
+			hdr.dst_node = n;
+#endif
+			hdr.status = err;
+			hdr.magic = NET_MSG_STATUS_MAGIC;  // twiddle the magic
+rdtsc(u3.hilo[0], u3.hilo[1]);
+			tmperr = net_send_tcp_msg(inode, sock, &hdr, sizeof(net_msg));
+rdtsc(u4.hilo[0], u4.hilo[1]);
+printk("status return (net_send_tcp_msg) took %llu\n", u4.q-u3.q);
+		} else if (err < 0) {
+			netprintk("dispatch (%u/%u) returned %d\n",
+				  hdr.msg_type, hdr.key, err);
+		}
+
+
+		net_put_handler(hnd);
+
+		// re-add this socket
+		spin_lock(&net_list_lock);
+		list_add_tail(&net->list, &net_recv_list);
+		spin_unlock(&net_list_lock);
+		goto start_over;
+
+error:
+		/* on link-down errors the socket is closed and NOT re-added;
+		 * every other outcome puts it back on the receive list */
+		if (err < 0) {
+			if (net_link_down(err, sock)) {
+				// do NOT re-add this socket
+				netprintk("link down! err=%d\n", err);
+				net_dump_and_close_sock(sock, inode);
+			} else {
+				netprintk("bad message... node=%lu.\n", inode->i_ino);
+				net_dump_msg(sock, inode);
+				// re-add this socket
+				spin_lock(&net_list_lock);
+				list_add_tail(&net->list, &net_recv_list);
+				spin_unlock(&net_list_lock);
+			}
+		} else {
+			// re-add this socket
+			spin_lock(&net_list_lock);
+			list_add_tail(&net->list, &net_recv_list);
+			spin_unlock(&net_list_lock);
+		}
+		goto start_over;
+	}
+	spin_unlock(&net_list_lock);
+
+	return 0;
+}
+
+
+void net_do_status_return(u64 msg_num, s32 status)
+{
+	net_status_ctxt *nsc;
+	struct list_head *iter;
+
+	/* Find the waiter registered for this message number, hand it the
+	 * remote handler's status, and wake it.  The ctxt is unlinked
+	 * under net_status_lock, so a duplicate status return for the
+	 * same msg_num finds nothing and is silently ignored. */
+	spin_lock(&net_status_lock);
+	list_for_each(iter, &net_status_list) {
+		nsc = list_entry(iter, net_status_ctxt, list);
+		if (nsc->msg_num == msg_num) {
+			nsc->status = status;
+			/* set woken before wake_up so the waiter's condition
+			 * check cannot miss the event */
+			atomic_set(&nsc->woken, 1);
+			list_del(&nsc->list);
+			spin_unlock(&net_status_lock);
+			wake_up(&nsc->wq);
+			return;
+		}
+	}
+	spin_unlock(&net_status_lock);
+}
+
+static int net_dispatch_message(struct inode *inode, struct socket *sock, net_msg *hdr, net_msg_handler *hnd)
+{
+	int ret = -EINVAL;
+	int len, packet_len;
+
+	/* Pull the full packet (header + payload) off the socket into the
+	 * handler's preallocated buffer and invoke the handler function.
+	 * Returns the handler's result, the net_recv_tcp_msg error, or
+	 * -EINVAL for a busy handler or an oversized payload. */
+	len = hdr->data_len;
+	packet_len = len + sizeof(net_msg);
+
+	/* handlers are single-threaded: refuse re-entry while IN_USE */
+	spin_lock(&hnd->lock);
+	if (net_handler_in_use(hnd)) {
+		netprintk0("EEEEEK!  handler in use! bugbug\n");
+		spin_unlock(&hnd->lock);
+		return -EINVAL;
+	}
+	if (len > hnd->max_len) {
+		netprintk("eek! advertised message data len is too large %u (max: %u)\n",
+		       len, hnd->max_len);
+		spin_unlock(&hnd->lock);
+		return -EINVAL;
+	}
+	hnd->flags |= (1 << NET_HND_IN_USE);
+	spin_unlock(&hnd->lock);
+
+	/* buf was sized at registration for header + max_len payload */
+	memset(hnd->buf, 0, packet_len);
+	ret = net_recv_tcp_msg(inode, sock, hnd->buf, &packet_len);
+	if (ret < 0) {
+		netprintk("net_recv_tcp_msg returned: %d\n", ret);
+	} else {
+		net_num_dispatched++;
+		ret = (hnd->func)((net_msg *)hnd->buf, packet_len, hnd->data);
+	}
+	
+	spin_lock(&hnd->lock);
+	hnd->flags &= ~(1 << NET_HND_IN_USE);
+	spin_unlock(&hnd->lock);
+
+	return ret;
+}
+
+
+
+/*
+ * net_accept_tcp_connections()
+ *
+ */
+static int net_accept_tcp_connections(void)
+{
+	int error, slen;
+	struct sockaddr_in sin;
+	struct socket *sock;
+	struct inode *inode;
+
+	/* Accept every pending connection on the global listening socket.
+	 * A connection from a known node is attached to that node's
+	 * net_inode_private and queued on net_recv_list for the receive
+	 * thread; an unknown host gets a NET_UNKNOWN_HOST error message
+	 * and is dropped.  Returns 0 when the backlog is drained, or the
+	 * last socket-layer error. */
+	if (!recv_sock) {
+		netprintk0("no socket!\n");
+		return 0;
+	}
+	
+	if (!recv_sock->sk->tp_pinfo.af_tcp.accept_queue) {
+		//netprintk0("no connections on the queue\n");
+		return 0;
+	}
+	error = 0;
+	while (error >= 0) {
+		sock = sock_alloc();
+		if (!sock)
+			break;
+
+		sock->type = recv_sock->type;
+		sock->ops = recv_sock->ops;
+		error = recv_sock->ops->accept(recv_sock, sock, O_NONBLOCK);
+		if (error < 0) {
+			sock_release(sock);
+			break;
+		}
+		if (sock->sk->state == TCP_CLOSE) {
+			sock_release(sock);
+			continue;
+		}
+			
+		slen = sizeof(sin);
+		error = sock->ops->getname(sock, (struct sockaddr *) &sin, &slen, 1);
+		if (error < 0) {
+			/* fix: release the accepted socket instead of
+			 * leaking it when getname fails */
+			sock_release(sock);
+			break;
+		}
+		
+		netprintk("attempt to connect from %u.%u.%u.%u:%04x\n", 
+			NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
+
+		inode = nm_get_node_by_ip(sin.sin_addr.s_addr);
+		if (inode) {
+			int exists = 1;
+			nm_node_inode_private *priv = inode->u.generic_ip;
+			net_inode_private *net = NULL;
+
+			if (priv) {
+				net = &priv->net;
+				netprintk("connect from known host: %s\n",
+				      priv->node.node_name);
+				if (ntohs(sin.sin_port) >= 1024)
+					netprintk("warning: connect from unprivileged port: %u.%u.%u.%u:%d\n",
+						NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
+				/* attach the socket if this node doesn't
+				 * already have one */
+	 			spin_lock(&priv->net.sock_lock); 
+				if (!priv->net.sock) {
+					netprintk("new sock, doesnt exist\n");
+					exists = 0;
+					priv->net.sock = sock;
+					/* the receive thread, not this task,
+					 * must be woken on socket activity */
+					if (current != net_recv_task) {
+						netprintk("net_recv_task=%p... maybe i should add THAT instead\n", net_recv_task);
+						if (net_recv_task == NULL) 
+							BUG();
+						init_waitqueue_entry(&priv->net.sleep, net_recv_task);
+					} else {
+						netprintk("process %p added to waitqueue\n", current);
+						init_waitqueue_entry(&priv->net.sleep, current);
+					}
+					add_wait_queue(sock->sk->sleep, &(priv->net.sleep));
+				}
+	 			spin_unlock(&priv->net.sock_lock); 
+
+				if (exists) {
+					netprintk0("already a socket for this connection!\n");
+					net_send_error(sock, NET_ALREADY_CONNECTED);
+					net_dump_and_close_sock(sock, inode);
+				} else {
+					spin_lock(&net_list_lock);
+					netprintk("added inode %lu to net_recv_list\n", inode->i_ino);
+					if (list_empty(&net->list))
+						list_add_tail(&net->list, &net_recv_list);
+					spin_unlock(&net_list_lock);
+				}
+			}
+
+			iput(inode);
+		} else {
+			/* inode is NULL here; net_dump_and_close_sock
+			 * tolerates a NULL inode */
+			netprintk0("connect from unknown host...\n");
+			net_send_error(sock, NET_UNKNOWN_HOST);
+			net_dump_and_close_sock(sock, inode);
+		}
+	}
+	return error;
+}
+
+
+int net_send_error(struct socket *sock, u32 err_type)
+{
+        struct msghdr   msg;
+        mm_segment_t    oldfs;
+        struct iovec    iov;
+        int             len;
+	net_msg		err;	/* fix: was static -- concurrent callers
+				 * raced on one shared buffer while
+				 * sock_sendmsg was in flight; a stack
+				 * copy is private to each call */
+
+	/* Send a bare net_msg carrying one of the standard error types
+	 * (NET_ALREADY_CONNECTED / NET_UNKNOWN_HOST) to the peer.
+	 * Returns bytes sent, or a negative errno from sock_sendmsg,
+	 * or -EINVAL for an unknown error type. */
+	if (!net_is_valid_error_type(err_type)) {
+		netprintk("bug! bad error type! %u\n", err_type);
+		return -EINVAL;
+	}
+	memset(&err, 0, sizeof(net_msg));	
+	err.magic        = NET_MSG_MAGIC;
+	err.msg_type     = err_type;
+
+        msg.msg_name     = 0;
+        msg.msg_namelen  = 0;
+        msg.msg_iov      = &iov;
+        msg.msg_iovlen   = 1;
+        msg.msg_control  = NULL;
+        msg.msg_controllen = 0;
+        msg.msg_flags    = MSG_NOSIGNAL;
+        msg.msg_iov->iov_len = (__kernel_size_t)sizeof(net_msg);
+        msg.msg_iov->iov_base = (char*) &err;
+
+	/* kernel-space buffer: widen the addr-limit around sendmsg */
+        oldfs = get_fs(); set_fs(KERNEL_DS);
+        len = sock_sendmsg(sock, &msg, (size_t)(sizeof(net_msg)));
+        set_fs(oldfs);
+
+        return len;
+}
+
+
+static int net_recv_message_header(net_msg *hdr, struct socket *sock)
+{
+	int status;
+	mm_segment_t oldfs;
+	struct iovec iov = {
+		.iov_base = hdr,
+		.iov_len = sizeof(net_msg)
+	};
+	struct msghdr msg = {
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+		.msg_control = NULL,
+		.msg_controllen = 0,
+		.msg_name = 0,    // (struct sockaddr *) &sin,
+		.msg_namelen = 0, // sizeof (sin),
+		.msg_flags = 0
+	};
+
+	/* Peek (MSG_PEEK) one net_msg header off the socket without
+	 * consuming it; the dispatcher reads the full packet later.
+	 * Returns bytes received, -EBADF on shutdown (ERESTARTSYS), or
+	 * -EINVAL on any other receive error. */
+	status = 0;
+	oldfs = get_fs(); set_fs(KERNEL_DS);
+	status = sock_recvmsg(sock, &msg, sizeof(net_msg), MSG_PEEK);
+	set_fs(oldfs);
+
+	if (status < 0) {
+		if (status == -ERESTARTSYS) {
+			status = -EBADF;
+			netprintk ("Shutting down\n");
+		} else {
+			/* fix: log the real recvmsg error before collapsing
+			 * it to -EINVAL -- the old code overwrote status
+			 * first and always printed -22 */
+			netprintk ("unable to recvmsg, error=%d\n", status);
+			status = -EINVAL;
+		}
+	}
+	// error or bytes received
+	return status;
+}
+
+static void net_dump_and_close_sock(struct socket *sock, struct inode *inode)
+{
+	nm_node_inode_private *priv = NULL;
+
+	/* Drain any queued bytes, detach the socket from the node's
+	 * net_inode_private (clearing net->sock under sock_lock so other
+	 * paths see it gone), and release it.  inode may be NULL for
+	 * connections from unknown hosts. */
+	net_dump_msg(sock, inode);
+
+	if (sock->sk) {
+		if (inode) {
+	       		priv = inode->u.generic_ip;
+			if (priv) {
+	 			spin_lock(&priv->net.sock_lock); 
+				remove_wait_queue(sock->sk->sleep, &(priv->net.sleep));
+				priv->net.sock = NULL;
+	 			spin_unlock(&priv->net.sock_lock); 
+			}
+		}
+	}
+	sock_release(sock);
+}
+
+static void net_dump_msg(struct socket *sock, struct inode *inode)
+{
+	struct msghdr           msg;
+	struct iovec            iov;
+	int                     len;
+	mm_segment_t            oldfs;
+
+	/* Drain and discard every byte currently queued on the socket:
+	 * non-blocking reads into the shared junk page until recvmsg
+	 * returns zero or an error. */
+	if (!sock->sk)
+		return;
+
+	do {
+		memset(&msg, 0, sizeof(msg));
+		msg.msg_iov      = &iov;
+		msg.msg_iovlen   = 1;
+		msg.msg_flags    = MSG_DONTWAIT;
+		iov.iov_base = net_junk_buf;
+		iov.iov_len  = (__kernel_size_t)PAGE_SIZE;
+
+		oldfs = get_fs();
+		set_fs(KERNEL_DS);
+		len = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT);
+		set_fs(oldfs);
+	} while (len > 0);
+}
+
+
+int net_init_tcp_sock(struct inode *inode)
+{
+	nm_node_inode_private *priv;
+	nm_node_info *node;
+	net_inode_private *net = NULL;
+	struct sockaddr_in myaddr, remoteaddr;
+	int err = -EINVAL;
+	int i;
+	struct sock *sk;
+	struct socket *sock = NULL;
+
+	/* Actively connect a TCP socket to the node described by this nm
+	 * inode.  On success the socket is stored in net->sock, the
+	 * receive thread is registered on its wait queue, and the inode
+	 * joins net_recv_list.  Returns 0 or -errno; -EEXIST when a
+	 * socket already exists or another caller is mid-creation. */
+	priv = inode->u.generic_ip;
+	if (!priv) {
+		netprintk0("bad inode\n");
+		return -EINVAL;
+	}
+	net = &priv->net;
+	node = &priv->node;
+	
+	if ((err = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
+		netprintk("can't create socket: err=%d\n", err);
+		return err;
+	}
+
+	/* claim the creation slot so concurrent callers back off */
+	spin_lock(&net->sock_lock); 
+	if (net->sock || net->flags & NET_FLAG_CREATING_SOCKET) {
+		netprintk("socket already created or creating for inode %lu\n", inode->i_ino);
+		spin_unlock(&net->sock_lock);
+		sock_release(sock);
+		return -EEXIST;
+	}
+	net->flags |= NET_FLAG_CREATING_SOCKET;
+	spin_unlock(&net->sock_lock);
+
+	memset(&myaddr, 0, sizeof(myaddr));
+	myaddr.sin_family = AF_INET;
+	myaddr.sin_port = htons(0);  // any port
+	/* NOTE(review): bind's return value is discarded -- err is
+	 * overwritten by connect() below; confirm a bind failure here is
+	 * truly ignorable */
+	err = sock->ops->bind(sock, (struct sockaddr *) &myaddr, sizeof(myaddr));
+	
+	/* remote address/port come from the node manager's config */
+	memset (&remoteaddr, 0, sizeof (remoteaddr));
+	remoteaddr.sin_family = net_ip_version_to_family(node->ifaces[0].ip_version);
+	remoteaddr.sin_addr.s_addr = node->ifaces[0].addr_u.ip_addr4;
+	remoteaddr.sin_port = node->ifaces[0].ip_port;
+
+	//netprintk("connecting new socket: ip %d.%d.%d.%d, port %d\n", NIPQUAD(remoteaddr.sin_addr.s_addr), remoteaddr.sin_port);
+	err = sock->ops->connect(sock, (struct sockaddr *) &remoteaddr, 
+					sizeof(remoteaddr), 0); /* TODO put this back!  O_NONBLOCK); */
+	//netprintk("connect status %d\n", err);
+	
+	if (err >= 0) {
+		/* connected immediately: publish the socket and hook the
+		 * receive thread onto its wait queue */
+		spin_lock(&net->sock_lock);
+		net->sock = sock;
+		net->flags &= ~NET_FLAG_CREATING_SOCKET;
+
+		netprintk0("1) ok this node is actively trying to connect, add to waitqueue\n");
+		if (current != net_recv_task) {
+			netprintk("net_recv_task=%p... maybe i should add THAT instead\n", net_recv_task);
+			if (net_recv_task == NULL) 
+				BUG();
+			init_waitqueue_entry(&net->sleep, net_recv_task);
+		} else {
+			netprintk("process %p added to waitqueue\n", current);
+			init_waitqueue_entry(&net->sleep, current);
+		}
+		add_wait_queue(sock->sk->sleep, &net->sleep);
+
+		spin_unlock(&net->sock_lock);
+		goto out;
+	}
+
+	sk = sock->sk;
+	switch (err) {
+		case -EALREADY:
+		case -EINPROGRESS:
+					
+			/* TODO: awful awful awful */
+			/* connect in progress: poll the TCP state up to
+			 * 100 times, 100ms apart, for ESTABLISHED */
+			for (i=0; i<100; i++) {
+				/* Protect against TCP socket state changes */
+				lock_sock(sk);
+				if (sk->state == TCP_ESTABLISHED) {
+					release_sock(sk);
+					netprintk0("woo!  connected...\n");
+					err = 0;
+					spin_lock(&net->sock_lock);
+					net->flags &= ~NET_FLAG_CREATING_SOCKET;
+					net->sock = sock;
+
+					netprintk0("2) ok this node is actively trying to connect, add to waitqueue\n");
+					if (current != net_recv_task) {
+						netprintk("net_recv_task=%p... maybe i should add THAT instead\n", net_recv_task);
+						if (net_recv_task == NULL) 
+							BUG();
+						init_waitqueue_entry(&net->sleep, net_recv_task);
+					} else {
+						netprintk("process %p added to waitqueue\n", current);
+						init_waitqueue_entry(&net->sleep, current);
+					}
+					add_wait_queue(sock->sk->sleep, &net->sleep);
+
+					spin_unlock(&net->sock_lock);
+					break;
+				} else {
+					netprintk("waiting for connection: pass %d, state %d\n", i, sk->state);
+					/* TODO */
+#if 0
+					task->tk_timeout = RPC_CONNECT_TIMEOUT;
+					/* if the socket is already closing, delay briefly */
+					if ((1<<sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV))
+						task->tk_timeout = RPC_REESTABLISH_TIMEOUT;
+					rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL);
+#endif
+					/* TODO: this is awful... change it later */
+				}
+				release_sock(sk);
+				util_sleep(100);
+			}
+			break;
+		case -ECONNREFUSED:
+		case -ECONNRESET:
+		case -ENOTCONN:
+			netprintk("conn refused, reset or not connected\n");
+			break;
+		default:
+			/* Report myriad other possible returns.  If this file
+			* system is soft mounted, just error out, like Solaris.  */
+			netprintk("error %d connecting to server\n", err);
+			/* TODO */
+#if 0
+			/* This will prevent anybody else from connecting */
+			rpc_delay(task, RPC_REESTABLISH_TIMEOUT);
+			task->tk_status = status;
+#endif
+			break;
+	}
+
+out:
+	/* failure: drop the CREATING flag and release the socket;
+	 * success: make sure the inode is on the receive list */
+	if (err < 0) {
+		if (net) {
+			spin_lock(&net->sock_lock);
+			if (net->sock)
+				netprintk0("wha?! there's a socket there already!!!!\n");
+			net->flags &= ~NET_FLAG_CREATING_SOCKET;
+			spin_unlock(&net->sock_lock);
+		}
+	       	if (sock) 
+			sock_release(sock);
+	} else {
+		/* add this inode to the receive list, if not already */
+		spin_lock(&net_list_lock);
+		if (list_empty(&net->list))
+			list_add_tail(&net->list, &net_recv_list);
+		spin_unlock(&net_list_lock);
+	}
+
+	return err;
+}
+
+
+
+/*
+ * net_init_tcp_recv_sock()
+ *
+ */
+static int net_init_tcp_recv_sock(void)
+{
+	struct sockaddr_in sin;
+	int status = -EINVAL;
+
+	/* Create, bind, and listen on the single global socket used to
+	 * accept connections from all other cluster nodes.  Returns 0 or
+	 * the first failing socket-layer status. */
+	status = sock_create(net_ip_version_to_family(ip_version),
+			     SOCK_STREAM, IPPROTO_TCP,
+			     &recv_sock);
+	if (status < 0) {
+		netprintk ("unable to create socket, error=%d", status);
+		goto bail;
+	}
+
+
+	/* Bind Receive Socket to the configured well-known port on any
+	 * local address; ip_port is already in network byte order. */
+	memset(&sin, 0, sizeof(sin));
+	sin.sin_family = net_ip_version_to_family(ip_version);
+	sin.sin_addr.s_addr = htonl(INADDR_ANY);
+	sin.sin_port = ip_port;
+
+	status = recv_sock->ops->bind(recv_sock,
+					 (struct sockaddr *)&sin,
+					 sizeof(sin));
+	if (status < 0) {
+		netprintk ("unable to bind socket to port %d, error=%d", 
+			ntohs(ip_port), status);
+		/* fix: bail instead of falling through -- listening on an
+		 * unbound socket would silently pick an ephemeral port,
+		 * which no peer knows about */
+		goto bail;
+	}
+
+	/* !!! dunno about these... */
+	recv_sock->sk->reuse = 1;
+	status = recv_sock->ops->listen(recv_sock, 64);
+
+bail:
+	return status;
+}				/* net_init_tcp_recv_sock */
+
+
+static void net_release_tcp_sock(void)
+{
+	/* Tear down the global listening socket, if one was created. */
+	if (recv_sock == NULL)
+		return;
+	sock_release (recv_sock);
+	recv_sock = NULL;
+}
+
+
+module_init (net_driver_entry);
+module_exit (net_driver_exit);

Added: trunk/cluster/tcp.h
===================================================================
--- trunk/cluster/tcp.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/tcp.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,236 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * tcp.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_TCP_H
+#define CLUSTER_TCP_H
+
+#include <linux/socket.h>
+#ifdef __KERNEL__
+#include <net/sock.h>
+#else
+#include <sys/socket.h>
+#endif
+#include <linux/inet.h>
+#include <linux/in.h>
+
+#include "nodemanager.h"
+
+
+#ifdef __KERNEL__
+
+#define NET_DISP_THREAD_MS   5000   /* TODO */
+#define NET_RECV_THREAD_MS   5000   /* TODO */
+
+#ifdef BIG_NET_MSG
+#define NET_MSG_MAGIC           ((u32)0xbc0ffa55)
+#define NET_MSG_STATUS_MAGIC    ((u32)0xbc0ffa56)
+#define NET_MSG_NUM_MAX         ((u64)0xffffffffffffffffULL)
+/* Wire header preceding every message (64-bit msg_num variant); the
+ * payload, if any, follows immediately in buf[]. */
+typedef struct _net_msg
+{
+	__u32 magic;	/* NET_MSG_MAGIC, or NET_MSG_STATUS_MAGIC on a status return */
+	__u32 data_len;	/* bytes of payload in buf[] */
+	__u16 src_node;
+	__u16 dst_node;
+	__u32 msg_type;
+	__u32 key;
+	__s32 status;	/* request flag on send; handler result on status return */
+	__u64 msg_num;	/* matches a status return to its waiting sender */
+	__u8  buf[0];
+} net_msg;
+#else
+
+#define NET_MSG_MAGIC           ((u16)0xfa55)
+#define NET_MSG_STATUS_MAGIC    ((u16)0xfa56)
+#define NET_MSG_NUM_MAX         ((u32)0xffffffffUL)
+/* Compact wire header (default build): 16-bit magic/type/status and a
+ * 32-bit msg_num; payload follows in buf[]. */
+typedef struct _net_msg
+{
+	__u16 magic;	/* NET_MSG_MAGIC, or NET_MSG_STATUS_MAGIC on a status return */
+	__u16 data_len;	/* bytes of payload in buf[] */
+	__u16 msg_type;
+	__s16 status;	/* request flag on send; handler result on status return */
+	__u32 key;
+	__u32 msg_num;	/* matches a status return to its waiting sender */
+	__u8  buf[0];
+} net_msg;
+
+#endif
+
+typedef int (net_msg_handler_func)(net_msg *msg, u32 len, void *data);
+
+/* Registered callback for one (msg_type, key) pair. */
+typedef struct _net_msg_handler
+{
+	struct list_head list;	/* hangs on the global handler list */
+	u32 msg_type;
+	u32 key;
+	net_msg_handler_func *func;
+	void *data;		/* opaque pointer passed through to func */
+	net_msg hdr;
+	u32 max_len;		/* largest payload this handler accepts */
+	void *buf;		/* preallocated receive buffer (header + payload) */
+	spinlock_t lock;	/* protects flags (IN_USE handshake) */
+	atomic_t refcnt;
+	int flags;		/* NET_HND_* bits */
+} net_msg_handler;
+
+/* One sender blocked waiting for the status return of message msg_num. */
+typedef struct _net_status_ctxt
+{
+	struct list_head list;	/* hangs on net_status_list */
+	s32 status;		/* filled in by net_do_status_return() */
+	u64 msg_num;
+	wait_queue_head_t wq;
+	atomic_t woken;		/* set before wake_up to avoid a lost wakeup */
+} net_status_ctxt;
+
+void net_do_status_return(u64 msg_num, s32 status);
+
+/* no clue for these yet... */
+#define NET_MIN_MSG_LEN  (0)
+#define NET_MAX_MSG_LEN  (8192)
+	
+
+#define NET_ALREADY_CONNECTED   2
+#define NET_UNKNOWN_HOST        3
+	
+
+static inline int net_is_valid_error_type(u32 err_type)
+{
+	/* Only these two standard error message types exist today. */
+	switch (err_type) {
+		case NET_ALREADY_CONNECTED:
+		case NET_UNKNOWN_HOST:
+			return 1;
+		default:
+			return 0;
+	}
+}
+		       
+enum {
+	NET_HND_VAR_LEN = 0,
+	NET_HND_IN_USE,
+};
+
+#define net_handler_variable_len(h)   ((h)->flags & (1 << NET_HND_VAR_LEN))
+#define net_handler_in_use(h)         ((h)->flags & (1 << NET_HND_IN_USE))
+
+/* NOTE(review): despite the "_ok" name this returns NONZERO when the
+ * length is NOT acceptable (larger than max_len for a variable-length
+ * handler, or not an exact match for a fixed-length one) -- verify
+ * against callers before renaming or inverting the sense. */
+static inline int net_handler_msg_len_ok(net_msg_handler *handler, u32 len)
+{
+	return (net_handler_variable_len(handler) ? 
+		len > handler->max_len : len != handler->max_len);
+}
+
+
+static inline int net_ip_version_to_family(u16 ip_version)
+{
+	/* ip_version arrives in network byte order.  Only IPv4 is
+	 * supported, so every caller gets PF_INET; the old version
+	 * switch after the unconditional return was unreachable dead
+	 * code and has been removed (behavior unchanged). */
+	printk("ip_version passed: %u, host byteorder: %u\n", ip_version, ntohs(ip_version));
+	return PF_INET;
+}
+
+
+
+/* TODO: figure this out.... */
+static inline int net_link_down(int err, struct socket *sock)
+{
+	/* A link counts as down when the TCP state is neither ESTABLISHED
+	 * nor CLOSE_WAIT, or when err is one of the errno values that
+	 * indicate the peer went away. */
+	if (sock &&
+	    sock->sk->state != TCP_ESTABLISHED &&
+	    sock->sk->state != TCP_CLOSE_WAIT)
+		return 1;
+
+	if (err >= 0)
+		return 0;
+
+	/* ????????????????????????? */
+	/* When the server has died, an ICMP port unreachable
+	 * message prompts ECONNREFUSED. */
+	return (err == -ERESTARTSYS ||
+		err == -EBADF ||
+		err == -ECONNREFUSED ||
+		err == -ENOTCONN ||
+		err == -ECONNRESET ||
+		err == -EPIPE) ? 1 : 0;
+}
+
+enum {
+	NET_DRIVER_UNINITED,
+	NET_DRIVER_READY,
+};
+
+
+int net_register_handler(u32 msg_type, u32 key, int flags, 
+			 u32 max_len, net_msg_handler_func *func, void *data, void *buf);
+net_msg * net_package_message(u32 msg_type, u32 key, void *data, u32 len);
+int net_recv_tcp_msg (struct inode *inode, struct socket *sock, void *data, u32 *packet_len);
+int net_send_tcp_msg (struct inode *inode, struct socket *sock, void *data, u32 packet_len);
+int net_send_error(struct socket *sock, u32 err_type);
+int net_init_tcp_sock(struct inode *inode);
+int net_send_message(u32 msg_type, u32 key, void *data, u32 len, struct inode *inode, int *status);
+int net_broadcast_message(u32 msg_type, u32 key, void *data, u32 len, struct inode *group);
+net_msg_handler * net_lookup_handler(u32 msg_type, u32 key);
+
+#endif /* __KERNEL__ */
+
+typedef struct _net_ioc
+{
+	u32 status;
+} net_ioc;
+
+typedef struct _gsd_ioc
+{
+	int fd;
+	int namelen;
+	char name[NM_MAX_NAME_LEN+1];
+	int status;
+} gsd_ioc;
+
+#define  NET_IOC_MAGIC          'O'
+#define  NET_IOC_ACTIVATE       _IOR(NET_IOC_MAGIC, 1, net_ioc)
+#define  NET_IOC_GETSTATE       _IOR(NET_IOC_MAGIC, 2, net_ioc)
+#define  GSD_IOC_CREATE_GROUP   _IOR(NET_IOC_MAGIC, 3, gsd_ioc)
+#define  GSD_IOC_ADD_GROUP_NODE _IOR(NET_IOC_MAGIC, 4, gsd_ioc)
+
+#define GSD_MESSAGE   130
+#define GSD_ACTION_ADD_GROUP        (0x01)
+#define GSD_ACTION_ADD_GROUP_NODE   (0x02)
+
+typedef struct _gsd_message
+{
+	u16 from;
+	u8 action;
+	u8 namelen;
+	u8 name[NM_MAX_NAME_LEN];
+} gsd_message;
+
+#endif /* CLUSTER_TCP_H */

Added: trunk/cluster/test.c
===================================================================
--- trunk/cluster/test.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/test.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,811 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * test.c
+ *
+ * test module
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <net/sock.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/proc_fs.h>
+
+#include <asm/uaccess.h>
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+#include "dlmmod.h"
+
+#include "compat_libfs.h"
+
+#define testprintk(x, arg...)    printk("TEST: (%d) " x, current->pid, ##arg)
+#define testprintk0(x)           printk("TEST: (%d) " x, current->pid)
+
+
+static ssize_t write_net_register(struct file *file, char *buf, size_t size);
+static ssize_t write_net_send(struct file *file, char *buf, size_t size);
+static ssize_t write_net_get_num(struct file *file, char *buf, size_t size);
+static ssize_t write_dlm_poop(struct file *file, char *buf, size_t size);
+static ssize_t write_dlm_poop2(struct file *file, char *buf, size_t size);
+static ssize_t write_dlm_poop3(struct file *file, char *buf, size_t size);
+static ssize_t write_dlm_register(struct file *file, char *buf, size_t size);
+
+enum {
+	TEST_Root = 1,
+	TEST_NetRegister,
+	TEST_NetSend,
+	TEST_NetGetNum,
+	TEST_DLMPoop,
+	TEST_DLMPoop2,
+	TEST_DLMPoop3,
+	TEST_DLMRegister
+};
+
+extern spinlock_t net_state_lock;
+extern u32 net_driver_state;
+extern struct file_operations transaction_ops;
+extern char *nm_nodename;
+extern u32 net_num_dispatched;
+
+
+static void test_teardown(void);
+
+int test_small_msg_func(net_msg *msg, u32 len, void *data);
+
+static int test_net_send(int arg);
+static int test_net_register(int arg);
+static int test_net_get_num(int arg);
+static int test_dlm_poop(int arg);
+static int test_dlm_poop2(int arg);
+static int test_dlm_poop3(int arg);
+static int test_dlm_register(int arg);
+
+
+
+/* Handler registered under TEST_MSG_TYPE1/TEST_KEY1; just logs the
+ * incoming message and the int pointed to by its registration data. */
+int test_small_msg_func(net_msg *msg, u32 len, void *data)
+{
+	testprintk("got a message!  type=%u, len=%u, data=%d\n", msg->msg_type, len, *(int *)data);
+	return 0;
+}
+
+#define TEST_MSG_TYPE1    87654321
+#define TEST_KEY1         12378534
+	
+int test_data1 = 723123123;
+
+/* Test: register the TEST_MSG_TYPE1/TEST_KEY1 handler (checking that a
+ * duplicate registration fails) and open a TCP socket to node `arg`. */
+static int test_net_register(int arg)
+{
+	int ret;
+	struct inode *dest_inode;
+	u16 dest_node_num = (u16)arg;
+
+	testprintk("running test_net_register: will contact node %u\n", dest_node_num);
+
+	dest_inode = nm_get_node_by_num(dest_node_num);
+	if (!dest_inode) {
+		testprintk("eeek! failed to find node %u\n", dest_node_num);
+		return 0;
+	}
+	{       
+		/* log the node's name via its first dentry alias */
+		struct dentry *dentry = list_entry(dest_inode->i_dentry.next, struct dentry, d_alias);
+		testprintk("found node %u, name %*s\n", dest_node_num, dentry->d_name.len, dentry->d_name.name);
+	}
+
+	ret = net_register_handler(TEST_MSG_TYPE1, TEST_KEY1, 0, 0,
+				   test_small_msg_func, &test_data1, NULL);
+	if (ret < 0) {
+		testprintk0("eek!  register failed!\n");
+		return -1;
+	}
+	/* second registration of the same (type, key) must be rejected */
+	ret = net_register_handler(TEST_MSG_TYPE1, TEST_KEY1, 0, 0,
+				   test_small_msg_func, &test_data1, NULL);
+	if (ret >= 0) {
+		testprintk0("eek!  re-register was supposed to fail but didnt!!!\n");
+		return -1;
+	}
+	testprintk0("sweet.  re-register failed like it should have.\n");
+
+	testprintk0("creating socket now...\n");
+	ret = net_init_tcp_sock(dest_inode);
+	if (ret < 0) {
+		testprintk0("failed to make socket\n");
+		return -1;
+	}
+	testprintk("net_init_tcp_sock returned %d\n", ret);
+
+	/* NOTE(review): dest_inode from nm_get_node_by_num is never
+	 * iput() here -- confirm whether the reference is leaked */
+	testprintk0("leaving test_net_register!\n");
+	return 0;
+}
+
+
+/* Test: wait for the net driver to come up, then send one empty
+ * TEST_MSG_TYPE1 message to node `arg` (no status return requested). */
+static int test_net_send(int arg)
+{
+	int ret;
+	struct inode *dest_inode;
+	u16 dest_node_num = (u16)arg;
+
+	testprintk("running test_net_send: will contact node %u\n", dest_node_num);
+
+	dest_inode = nm_get_node_by_num(dest_node_num);
+	if (!dest_inode) {
+		testprintk("eeek! failed to find node %u\n", dest_node_num);
+		return 0;
+	}
+	{
+		/* log the node's name via its first dentry alias */
+		struct dentry *dentry = list_entry(dest_inode->i_dentry.next, struct dentry, d_alias);
+		testprintk("found node %u, name %*s\n", dest_node_num, dentry->d_name.len, dentry->d_name.name);
+	}
+
+	testprintk0("packaging message now\n");
+
+	{
+		testprintk0("woo!  made a message packet... lets try sending it to ourself...\n");
+		testprintk0("waiting for socket to be created\n");
+		/* poll net_driver_state until the driver reports ready */
+		while (1) {
+			printk(".");
+			spin_lock(&net_state_lock);
+			if (net_driver_state == NET_DRIVER_READY) {
+				spin_unlock(&net_state_lock);
+				break;
+			}
+			spin_unlock(&net_state_lock);
+			util_sleep (100);
+		}
+		printk(".  done... let's go!\n");
+		ret = net_send_message(TEST_MSG_TYPE1, TEST_KEY1, NULL, 0, dest_inode, NULL);
+		testprintk("sent!!!! ret=%d\n", ret);
+	}
+	testprintk0("leaving test_net_send!\n");
+	return 0;
+	
+}
+
+/* Test: report the global count of messages dispatched so far. */
+static int test_net_get_num(int arg)
+{
+	testprintk("number of messages dispatched: %u\n", net_num_dispatched);
+	return 0;
+}
+
+void my_ast(void *data);
+void my_bast(void *data, int blocked_type);
+	
+dlm_lockstatus lksb1, lksb2;
+wait_queue_head_t convert_wq;
+atomic_t convert_flag;
+
+dlm_ctxt *the_dlm = NULL;
+
+/* Test: dump the state of the most recently created dlm context, if
+ * any.  The original lock/convert exercise is retained under #if 0. */
+static int test_dlm_poop(int arg)
+{
+	testprintk("calling dlm_dump_dlm(%p)\n", the_dlm);
+	if (the_dlm)
+		dlm_dump_dlm(the_dlm);
+
+#if 0
+	dlm_ctxt *dlm;
+	dlm_status status;
+	void *data1 = &lksb1;
+	void *data2 = &lksb2;
+	int ret;
+
+	memset(&lksb1, 0, sizeof(dlm_lockstatus));
+	memset(&lksb1, 0, sizeof(dlm_lockstatus));
+
+	testprintk0("calling dlm_register_domain...\n");
+	dlm = dlm_register_domain("mylittledomain", "grupo2", 0x6543abcd);
+	testprintk("dlm_register_domain returned %p\n", dlm);
+
+	testprintk0("calling dlmlock...\n");
+	status = dlmlock(dlm, LKM_EXMODE, &lksb1, 0, "lock1", my_ast, data1, my_bast);
+	testprintk("dlmlock returned %d.  lksb.status=%d, lock=%p\n", status, lksb1.status, lksb1.lockid);
+
+	testprintk0("calling dlmlock to do a convert...\n");
+	status = dlmlock(dlm, LKM_PRMODE, &lksb1, LKM_CONVERT, "lock1", my_ast, data1, my_bast);
+	testprintk("dlmlock returned %d\n", status);
+
+	init_waitqueue_head (&convert_wq);
+	atomic_set(&convert_flag, 0);
+
+	testprintk0("calling second dlmlock...\n");
+	status = dlmlock(dlm, LKM_EXMODE, &lksb2, 0, "lock1", my_ast, data2, my_bast);
+	testprintk("dlmlock returned %d.  lksb.status=%d, lock=%p\n", status, lksb2.status, lksb2.lockid);
+
+	testprintk0("sleeping now!\n");
+	ret = util_wait_atomic_eq(&convert_wq, &convert_flag, 1, 20000);
+	testprintk("wait returned %d\n", ret);
+
+	testprintk0("calling dlmlock to do a convert the blocking lock to NL...\n");
+	status = dlmlock(dlm, LKM_NLMODE, &lksb1, LKM_CONVERT, "lock1", my_ast, data2, my_bast);
+	testprintk("dlmlock returned %d\n", status);
+
+	testprintk0("sleeping\n");
+	util_sleep(10000);
+	testprintk0("DONE!\n");
+#endif
+	return 0;
+}
+
+
+/* AST callback for the dlm tests: logs the granted lock's identity. */
+void my_ast(void *data)
+{
+	dlm_lockstatus *l = data;
+	dlm_lock *lock = l->lockid;
+	dlm_lock_resource *res = lock->lockres;
+
+	testprintk("AST!!!:   lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n", 
+	       l, lock, res, res->lockname.len, res->lockname.name, lock->type);
+}
+
+/* BAST callback for the dlm tests: logs the blocking request, then
+ * signals the waiter in test_dlm_poop via convert_flag/convert_wq. */
+void my_bast(void *data, int blocked_type)
+{
+	dlm_lockstatus *l = data;
+	dlm_lock *lock = l->lockid;
+	dlm_lock_resource *res = lock->lockres;
+
+	testprintk("BAST!!!:   blocked=%d, lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n", 
+	       blocked_type, l, lock, res, res->lockname.len, res->lockname.name, lock->type);
+	/* flag is set before wake_up so the waiter cannot miss it */
+	atomic_set(&convert_flag, 1);
+	wake_up(&convert_wq);
+}
+
+atomic_t finish;	/* set to 1 to stop the test_dlm_poop2 main loop */
+
+/* Per-thread state handed to test_dlm_thread(). */
+typedef struct _poo
+{
+	struct task_struct *task;	/* filled in by the thread itself */
+	dlm_ctxt *dlm;
+	dlm_lockstatus *lksb;
+	wait_queue_head_t wq;
+	atomic_t ast_flag;
+	atomic_t bast_flag;
+	struct completion complete;	/* signalled when the thread exits */
+} poo;
+void my_ast2(void *data);
+void my_bast2(void *data, int blocked_type);
+int test_dlm_thread(void *data);
+atomic_t asts_fired, basts_fired;
+
+/* rdtsc result: one 64-bit cycle count or its two 32-bit halves */
+typedef union _my_timing_t
+{
+	__u64 q;
+	__u32 lohi[2];
+} my_timing_t;
+
+
+/* Test: register a dlm domain, launch two test_dlm_thread workers
+ * hammering it, and report ast/bast counts every 30 seconds until the
+ * global `finish` flag is set; then join both threads. */
+static int test_dlm_poop2(int arg)
+{
+	dlm_ctxt *dlm;
+	dlm_status status;
+	void *data1 = &lksb1;
+	void *data2 = &lksb2;
+	int ret;
+	int pid1, pid2;
+	poo *poo1, *poo2;
+	my_timing_t t1, t2, t3;
+
+	/* NOTE(review): kmalloc results are not checked before use, and
+	 * poo1/poo2 leak on the early-return error paths below */
+	poo1 = kmalloc(sizeof(poo), GFP_KERNEL);
+testprintk("poo1=%p\n", poo1);	
+	poo2 = kmalloc(sizeof(poo), GFP_KERNEL);
+testprintk("poo2=%p\n", poo2);	
+
+	atomic_set(&finish, 0);
+	atomic_set(&asts_fired, 0);
+	atomic_set(&basts_fired, 0);
+
+	testprintk0("calling dlm_register_domain...\n");
+	dlm = dlm_register_domain("mylittledomain", "grupo2", 0x6543abcd);
+	testprintk("dlm_register_domain returned %p\n", dlm);
+	
+	poo1->dlm = dlm;
+	poo2->dlm = dlm;
+	init_completion(&poo1->complete);
+	init_completion(&poo2->complete);
+
+	/* t1: start-of-run timestamp for the cycle-count reports */
+	rdtsc(t1.lohi[0], t1.lohi[1]);
+	pid1 = kernel_thread (test_dlm_thread, poo1, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+	if (pid1 < 0) {
+		printk("unable to launch thread, error=%d", pid1);
+		return -EINVAL;
+	}
+	pid2 = kernel_thread (test_dlm_thread, poo2, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+	if (pid2 < 0) {
+		printk("unable to launch thread, error=%d", pid2);
+		return -EINVAL;
+	}
+	testprintk("dlm threads running for %s...\n", dlm->name);
+	testprintk("poo1->dlm=%p, ->task=%p\n", poo1->dlm, poo1->task);
+	testprintk("poo2->dlm=%p, ->task=%p\n", poo2->dlm, poo2->task);
+	//testprintk("poo1->dlm=%p, ->task=%p\n", poo1->dlm, poo1->task);
+	//testprintk("poo2->dlm=%p, ->task=%p\n", poo2->dlm, poo2->task);
+	//testprintk("sending sigint now...\n");
+	//send_sig (SIGINT, poo1->task, 0);
+	//send_sig (SIGINT, poo2->task, 0);
+	//atomic_set(&finish, 1);
+	/* progress report loop: every 30s until `finish` is set */
+	while (1) {
+		util_sleep(30000);
+		rdtsc(t3.lohi[0], t3.lohi[1]);
+		testprintk("another 30 sec: asts=%d, basts=%d, diff=%llu\n", 
+			   atomic_read(&asts_fired), atomic_read(&basts_fired), 
+			    t3.q - t1.q);
+		if (atomic_read(&finish)==1) {
+			printk("finish set!\n");
+			break;
+		}
+	}
+	wait_for_completion (&poo1->complete);
+	wait_for_completion (&poo2->complete);
+	rdtsc(t2.lohi[0], t2.lohi[1]);
+	kfree(poo1);
+	kfree(poo2);
+	testprintk("leaving!   asts=%d, basts=%d, diff=%llu\n", atomic_read(&asts_fired), atomic_read(&basts_fired), 
+		    t2.q - t1.q);
+	return 0;
+}
+
+
+/*
+ * Worker for test_dlm_poop2().  Each thread takes "lock1" at EX, waits
+ * for a BAST from the other thread, downconverts to NL, then upconverts
+ * back to EX, looping until 'finish' is set or a signal arrives.
+ *
+ * NOTE(review): kmalloc() of lksb is unchecked; test-only code.
+ */
+int test_dlm_thread(void *data)
+{
+	dlm_status status;
+	int ret;
+	dlm_lockstatus *lksb;
+	poo *mypoo = data;
+	dlm_ctxt *dlm = mypoo->dlm;
+
+	testprintk("mypoo=%p, dlm=%p\n", mypoo, dlm);
+	mypoo->task = current;
+	lksb = kmalloc(sizeof(dlm_lockstatus), GFP_KERNEL);
+	memset(lksb, 0, sizeof(dlm_lockstatus));
+
+	mypoo->lksb = lksb;
+	init_waitqueue_head(&mypoo->wq);
+
+	atomic_set(&mypoo->ast_flag, 0);
+	atomic_set(&mypoo->bast_flag, 0);
+	
+	testprintk("mypoo=%p, dlm=%p, task=%p\n", mypoo, dlm, mypoo->task);
+
+	/* initial acquire at EX; 'data' (the poo) is the callback arg */
+	testprintk("calling dlmlock(%p, %d, %p, 0, \"lock1\", %p, %p, %p) to create the lock...\n",
+		    dlm, LKM_EXMODE, lksb, my_ast2, data, my_bast2);
+	status = dlmlock(dlm, LKM_EXMODE, lksb, 0, "lock1", my_ast2, data, my_bast2);
+	testprintk("dlmlock returned %d.  lksb.status=%d, lock=%p\n", status, lksb->status, lksb->lockid);
+
+again:
+	/* wait (interruptibly, no timeout) for the EX grant AST */
+	ret = util_wait_atomic_eq(&mypoo->wq, &mypoo->ast_flag, 1, 0);
+	if (ret < 0) {
+		testprintk("1: waiting on ast converting to EX, ret=%d, type=%d, convtype=%d\n", 
+		       ret, lksb->lockid->type, lksb->lockid->convert_type);
+		if (ret == -EINTR)
+			goto leave;
+		goto again;
+	}
+	atomic_set(&mypoo->ast_flag, 0);
+
+
+
+wait_bast:	
+	/* hold EX until the other thread's request triggers a BAST */
+	ret = util_wait_atomic_eq(&mypoo->wq, &mypoo->bast_flag, 1, 0);
+	if (ret < 0) {
+		testprintk("2: waiting on bast after converting to EX, ret=%d, type=%d, convtype=%d\n", 
+		       ret, lksb->lockid->type, lksb->lockid->convert_type);
+		if (ret == -EINTR)
+			goto leave;
+		goto wait_bast;
+	}
+	atomic_set(&mypoo->bast_flag, 0);
+
+
+
+
+	atomic_set(&mypoo->ast_flag, 0);
+
+	/* downconvert to NL so the other thread can get EX */
+	status = dlmlock(dlm, LKM_NLMODE, lksb, LKM_CONVERT, "lock1", my_ast2, data, my_bast2);
+
+wait_ast:
+	ret = util_wait_atomic_eq(&mypoo->wq, &mypoo->ast_flag, 1, 0);
+	if (ret < 0) {
+		testprintk("3: waiting on ast converting to NL, ret=%d, type=%d, convtype=%d\n", 
+		       ret, lksb->lockid->type, lksb->lockid->convert_type);
+		if (ret == -EINTR)
+			goto leave;
+		goto wait_ast;
+	}
+
+	atomic_set(&mypoo->ast_flag, 0);
+	atomic_set(&mypoo->bast_flag, 0);
+
+	/* re-request EX and loop back to the grant wait */
+	status = dlmlock(dlm, LKM_EXMODE, lksb, LKM_CONVERT, "lock1", my_ast2, data, my_bast2);
+
+
+	if (atomic_read(&finish) == 0)
+		goto again;
+leave:
+
+	/* tell the driver (and the sibling thread) to stop */
+	atomic_set(&finish, 1);
+	kfree(mypoo->lksb);
+	complete (&mypoo->complete);
+	testprintk0("exiting thread\n");
+	return 0;
+}
+
+
+/*
+ * AST callback for test_dlm_thread(): bumps the global counter and
+ * wakes the owning thread via its per-poo flag/waitqueue.
+ */
+void my_ast2(void *data)
+{
+	poo *mypoo = data;
+	dlm_lockstatus *l = mypoo->lksb;
+	dlm_lock *lock = l->lockid;
+	dlm_lock_resource *res = lock->lockres;
+
+	atomic_inc(&asts_fired);
+	//testprintk("AST!!!:   lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n", 
+	//       l, lock, res, res->lockname.len, res->lockname.name, lock->type);
+	atomic_set(&mypoo->ast_flag, 1);
+	wake_up(&mypoo->wq);
+}
+
+/*
+ * BAST callback for test_dlm_thread(): same as my_ast2() but sets
+ * bast_flag so the holder knows another node/thread wants the lock.
+ */
+void my_bast2(void *data, int blocked_type)
+{
+	poo *mypoo = data;
+	dlm_lockstatus *l = mypoo->lksb;
+	dlm_lock *lock = l->lockid;
+	dlm_lock_resource *res = lock->lockres;
+
+	atomic_inc(&basts_fired);
+	//testprintk("BAST!!!:   blocked=%d, lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n", 
+	//       blocked_type, l, lock, res, res->lockname.len, res->lockname.name, lock->type);
+	atomic_set(&mypoo->bast_flag, 1);
+	wake_up(&mypoo->wq);
+}
+
+/* shared state for the single-threaded poop3 test below */
+wait_queue_head_t wq3;
+atomic_t ast_flag3, bast_flag3;
+dlm_lockstatus *lksb3;
+
+void my_bast3(void *data, int blocked_type);
+void my_ast3(void *data);
+
+/*
+ * AST callback for test_dlm_poop3(): uses the global lksb3 (the dlmlock
+ * data arg is NULL in that test), logs the grant and wakes the waiter.
+ */
+void my_ast3(void *data)
+{
+	dlm_lock *lock = lksb3->lockid;
+	dlm_lock_resource *res = lock->lockres;
+
+	atomic_inc(&asts_fired);
+	testprintk("AST!!!:   lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n", 
+	       lksb3, lock, res, res->lockname.len, res->lockname.name, lock->type);
+	atomic_set(&ast_flag3, 1);
+	wake_up(&wq3);
+}
+
+/*
+ * BAST callback for test_dlm_poop3(): as my_ast3() but for blocking
+ * notifications.
+ */
+void my_bast3(void *data, int blocked_type)
+{
+	dlm_lock *lock = lksb3->lockid;
+	dlm_lock_resource *res = lock->lockres;
+
+	atomic_inc(&basts_fired);
+	testprintk("BAST!!!:   blocked=%d, lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n", 
+	       blocked_type, lksb3, lock, res, res->lockname.len, res->lockname.name, lock->type);
+	atomic_set(&bast_flag3, 1);
+	wake_up(&wq3);
+}
+
+/*
+ * Single-thread timing test: create "lock1" at NL, then cycle
+ * NL -> EX -> (wait for bast) -> NL ten times, rdtsc-timing each
+ * convert.  Expects a peer elsewhere to generate the BASTs.
+ *
+ * NOTE(review): kmalloc()/dlm_register_domain() returns unchecked, and
+ * the final downconvert's AST is not waited for before kfree(lksb3) --
+ * test-only code, left as committed (see TODO re: dlmunlock).
+ */
+static int test_dlm_poop3(int arg)
+{
+	dlm_ctxt *dlm;
+	dlm_status status;
+	int ret, i;
+	my_timing_t t1, t2, t3, t4;
+
+	atomic_set(&finish, 0);
+	atomic_set(&asts_fired, 0);
+	atomic_set(&basts_fired, 0);
+
+	dlm = dlm_register_domain("mylittledomain", "grupo2", 0x6543abcd);
+
+	lksb3 = kmalloc(sizeof(dlm_lockstatus), GFP_KERNEL);
+	memset(lksb3, 0, sizeof(dlm_lockstatus));
+
+	init_waitqueue_head(&wq3);
+
+	atomic_set(&ast_flag3, 0);
+	atomic_set(&bast_flag3, 0);
+
+	i = 0;
+	/* t1/t2 bracket the whole run; t3/t4 bracket each convert */
+	rdtsc(t1.lohi[0], t1.lohi[1]);
+
+	/* CREATE -> NL */	
+	testprintk0("creating lock\n");
+rdtsc(t3.lohi[0], t3.lohi[1]);
+	status = dlmlock(dlm, LKM_NLMODE, lksb3, 0, "lock1", my_ast3, NULL, my_bast3);
+
+	while (1) {
+		testprintk("%d: waiting on ast\n", i);
+		ret = util_wait_atomic_eq(&wq3, &ast_flag3, 1, 0);
+		if (ret == -EINTR)
+			break;
+rdtsc(t4.lohi[0], t4.lohi[1]);
+testprintk("%d: ->NL took: %llu\n", i, t4.q - t3.q);
+		testprintk("%d: no bast for NL\n", i);
+	
+		atomic_set(&ast_flag3, 0);
+		atomic_set(&bast_flag3, 0);
+
+		if (i == 10) {
+			testprintk("%d: reached 10, goodbye\n", i);
+			break;
+		}
+		dlm_dump_dlm(dlm);
+	
+		/* CONVERT -> EX */	
+		testprintk("%d: converting dlmlock->EX\n", i);
+rdtsc(t3.lohi[0], t3.lohi[1]);
+		status = dlmlock(dlm, LKM_EXMODE, lksb3, LKM_CONVERT, "lock1", my_ast3, NULL, my_bast3);
+
+		testprintk("%d: waiting on ast\n", i);
+		ret = util_wait_atomic_eq(&wq3, &ast_flag3, 1, 0);
+		if (ret == -EINTR)
+			break;
+rdtsc(t4.lohi[0], t4.lohi[1]);
+testprintk("%d: ->EX took: %llu\n", i, t4.q - t3.q);
+		atomic_set(&ast_flag3, 0);	
+	
+		/* hold EX until someone else wants the lock */
+		testprintk("%d: waiting on bast\n", i);
+		ret = util_wait_atomic_eq(&wq3, &bast_flag3, 1, 0);
+		if (ret == -EINTR)
+			break;
+		atomic_set(&ast_flag3, 0);
+		atomic_set(&bast_flag3, 0);
+	
+		/* CONVERT -> NL */	
+		testprintk("%d: converting dlmlock->NL\n", i);
+rdtsc(t3.lohi[0], t3.lohi[1]);
+		status = dlmlock(dlm, LKM_NLMODE, lksb3, LKM_CONVERT, "lock1", my_ast3, NULL, my_bast3);
+	
+		/* WAIT ON AST AGAIN */
+		i++;
+	}
+	
+	/* DOWNCONVERT LAST TIME */
+	/* TODO: replace with dlmunlock once implemented */
+	status = dlmlock(dlm, LKM_NLMODE, lksb3, LKM_CONVERT, "lock1", my_ast3, NULL, my_bast3);
+
+	kfree(lksb3);
+
+	rdtsc(t2.lohi[0], t2.lohi[1]);
+	testprintk("leaving!   asts=%d, basts=%d, diff=%llu\n", atomic_read(&asts_fired), atomic_read(&basts_fired), 
+		    t2.q - t1.q);
+	return 0;
+}
+
+
+/*
+ * Register the test dlm domain and stash the context in the global
+ * 'the_dlm' for later poking.  Return value of dlm_register_domain()
+ * is logged but not error-checked (test-only code).
+ */
+static int test_dlm_register(int arg)
+{
+	dlm_ctxt *dlm;
+
+	testprintk0("calling dlm_register_domain...\n");
+	dlm = dlm_register_domain("mylittledomain", "grupo2", 0x6543abcd);
+	testprintk("dlm_register_domain returned %p\n", dlm);
+
+	the_dlm = dlm;	
+	testprintk0("leaving!\n");
+	return 0;
+}
+
+
+
+
+/*
+ * module stuff
+ */
+
+
+/*
+ * compat_libfs transaction-file write handlers.  Each one parses an
+ * optional integer argument from buf, runs the matching test_* routine,
+ * then sprintf's the result back into the same buffer -- the
+ * transaction machinery returns that text on the subsequent read.
+ */
+static ssize_t write_net_register(struct file *file, char *buf, size_t size)
+{
+	int arg = 0, tmpret, ret;
+       	if (size > 0)
+		arg = simple_strtoul(buf, NULL, 0);
+	printk("calling test_net_register(%d)\n", arg);
+	tmpret = test_net_register(arg);
+	ret = sprintf(buf, "test_net_register(%d) returned: %d\n", arg, tmpret);
+	return ret;
+}
+static ssize_t write_net_send(struct file *file, char *buf, size_t size)
+{
+	int arg = 0, tmpret, ret;
+       	if (size > 0)
+		arg = simple_strtoul(buf, NULL, 0);
+	printk("calling test_net_send(%d)\n", arg);
+	tmpret = test_net_send(arg);
+	ret = sprintf(buf, "test_net_send(%d) returned: %d\n", arg, tmpret);
+	return ret;
+}
+static ssize_t write_net_get_num(struct file *file, char *buf, size_t size)
+{
+	int arg = 0, tmpret, ret;
+       	if (size > 0)
+		arg = simple_strtoul(buf, NULL, 0);
+	printk("calling test_net_get_num(%d)\n", arg);
+	tmpret = test_net_get_num(arg);
+	ret = sprintf(buf, "test_net_get_num(%d) returned: %d\n", arg, tmpret);
+	return ret;
+}
+static ssize_t write_dlm_poop(struct file *file, char *buf, size_t size)
+{
+	int arg = 0, tmpret, ret;
+       	if (size > 0)
+		arg = simple_strtoul(buf, NULL, 0);
+	printk("calling test_dlm_poop(%d)\n", arg);
+	tmpret = test_dlm_poop(arg);
+	ret = sprintf(buf, "test_dlm_poop(%d) returned: %d\n", arg, tmpret);
+	return ret;
+}
+static ssize_t write_dlm_poop2(struct file *file, char *buf, size_t size)
+{
+	int arg = 0, tmpret, ret;
+       	if (size > 0)
+		arg = simple_strtoul(buf, NULL, 0);
+	printk("calling test_dlm_poop2(%d)\n", arg);
+	tmpret = test_dlm_poop2(arg);
+	ret = sprintf(buf, "test_dlm_poop2(%d) returned: %d\n", arg, tmpret);
+	return ret;
+}
+static ssize_t write_dlm_poop3(struct file *file, char *buf, size_t size)
+{
+	int arg = 0, tmpret, ret;
+       	if (size > 0)
+		arg = simple_strtoul(buf, NULL, 0);
+	printk("calling test_dlm_poop3(%d)\n", arg);
+	tmpret = test_dlm_poop3(arg);
+	ret = sprintf(buf, "test_dlm_poop3(%d) returned: %d\n", arg, tmpret);
+	return ret;
+}
+
+static ssize_t write_dlm_register(struct file *file, char *buf, size_t size)
+{
+	int arg = 0, tmpret, ret;
+       	if (size > 0)
+		arg = simple_strtoul(buf, NULL, 0);
+	printk("calling test_dlm_register(%d)\n", arg);
+	tmpret = test_dlm_register(arg);
+	ret = sprintf(buf, "test_dlm_register(%d) returned: %d\n", arg, tmpret);
+	return ret;
+}
+
+
+
+
+
+/*----------------------------------------------------------------------------*/
+/*
+ *	populating the filesystem.
+ */
+/*
+ * Populate the "test" filesystem: one transaction file per test hook.
+ * Builds the TA_write_ops dispatch table (freed only on failure; on
+ * success it is owned by the superblock via TA_GENERIC_SB_MEMBER).
+ */
+static int test_fill_super(struct super_block * sb, void * data, int silent)
+{	
+	int ret, sz;
+	TA_write_ops *ops;
+	static struct tree_descr test_files[] = {
+		[TEST_NetRegister] = {"net-register", &transaction_ops, S_IWUSR},
+		[TEST_NetSend] = {"net-send", &transaction_ops, S_IWUSR},
+		[TEST_NetGetNum] = {"net-get-num", &transaction_ops, S_IWUSR},
+		[TEST_DLMPoop] = {"dlm-poop", &transaction_ops, S_IWUSR},
+		[TEST_DLMPoop2] = {"dlm-poop2", &transaction_ops, S_IWUSR},
+		[TEST_DLMPoop3] = {"dlm-poop3", &transaction_ops, S_IWUSR},
+		[TEST_DLMRegister] = {"dlm-register", &transaction_ops, S_IWUSR},
+		/* last one */ {""}
+	};
+	
+	/* ops is a header plus one write_op slot per tree_descr entry */
+	sz = sizeof(test_files) / sizeof(struct tree_descr);
+	ops = kmalloc(sizeof(TA_write_ops) + (sz * sizeof(TA_write_op *)), GFP_KERNEL);
+	if (!ops)
+		return -ENOMEM;
+
+	memset(ops, 0, sizeof(TA_write_ops) + (sz * sizeof(TA_write_op *)));
+	ops->num_ops = sz;
+	ops->write_op[TEST_NetRegister] = write_net_register;
+	ops->write_op[TEST_NetSend] = write_net_send;
+	ops->write_op[TEST_NetGetNum] = write_net_get_num;
+	ops->write_op[TEST_DLMPoop] = write_dlm_poop;
+	ops->write_op[TEST_DLMPoop2] = write_dlm_poop2;
+	ops->write_op[TEST_DLMPoop3] = write_dlm_poop3;
+	ops->write_op[TEST_DLMRegister] = write_dlm_register;
+
+	printk("calling simple_fill_super...\n");
+	ret = simple_fill_super(sb, 0x12beAf00L, test_files);
+	if (ret >= 0) {
+		TA_GENERIC_SB_MEMBER(sb) = ops;
+	} else {
+		kfree(ops);
+	}
+	return ret;
+}
+
+/* 2.4-style read_super wrapper around test_fill_super() */
+static struct super_block *test_read_super (struct super_block *sb, void *data, int silent)
+{
+	printk("welcome to test_read_super!!!\n");
+	return (test_fill_super(sb, data, silent) < 0) ? NULL : sb;
+}
+
+
+static DECLARE_FSTYPE (test_fs_type, "test", test_read_super, FS_SINGLE|FS_LITTER);
+
+/*
+ * Module init: create /proc/cluster/test and register the "test"
+ * filesystem.  proc_mkdir() failure is only logged, not fatal.
+ */
+static int __init init_test(void)
+{
+	int retval;
+	void *ret;
+
+	printk("loading test module: nodename is %s\n", nm_nodename);
+
+	ret = proc_mkdir("cluster/test", 0);
+	printk("proc_mkdir of cluster/test returned %p\n", ret);
+
+	printk("calling register_filesystem\n");
+	retval = register_filesystem(&test_fs_type);
+	printk("done calling register_filesystem: ret=%d\n", retval);
+	if (retval) {
+		printk("oopsy that did not work\n");
+		test_teardown();
+	} else
+		printk("woot.  good to go.\n");
+	return retval;
+}
+
+/* Module exit: undo init_test() in reverse order. */
+static void __exit exit_test(void)
+{
+	test_teardown();
+	unregister_filesystem(&test_fs_type);
+	printk("unloading test module\n");
+}
+
+/* Remove the /proc/cluster/test directory created at init.
+ * NOTE(review): defined after its callers -- presumably a forward
+ * declaration exists earlier in test.c (outside this hunk); verify. */
+static void test_teardown(void)
+{
+	printk("removing cluster/test\n");
+	remove_proc_entry("cluster/test", NULL);
+}
+
+
+
+
+
+MODULE_LICENSE("GPL");
+module_init(init_test)
+module_exit(exit_test)

Added: trunk/cluster/util.c
===================================================================
--- trunk/cluster/util.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/util.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,349 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * util.c
+ *
+ * General purpose code
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include "util.h"
+
+static void util_timeout_func(unsigned long data);
+
+/* block all but 'mask' sigs, optionally saving off our previous
+ * signal state. */
+void util_block_sigs(sigset_t *oldsigs, unsigned long mask)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+	sigset_t tmpsig;
+
+	/* 2.6: sigprocmask does the locking and saves the old set */
+	siginitsetinv(&tmpsig, mask);
+	sigprocmask(SIG_BLOCK, &tmpsig, oldsigs);
+#else
+#ifdef HAVE_NPTL
+	/* 2.4 + NPTL backport: siglock lives in ->sighand */
+	spin_lock_irq (&current->sighand->siglock);
+	if (oldsigs)
+		*oldsigs = current->blocked;
+	siginitsetinv (&current->blocked, mask);
+	recalc_sigpending ();
+	spin_unlock_irq (&current->sighand->siglock);
+#else
+	/* plain 2.4: per-task sigmask_lock */
+	spin_lock_irq (&current->sigmask_lock);
+	if (oldsigs)
+		*oldsigs = current->blocked;
+	siginitsetinv (&current->blocked, mask);
+	recalc_sigpending (current);
+	spin_unlock_irq (&current->sigmask_lock);
+#endif
+#endif
+}
+
+/* Restore a signal mask previously saved by util_block_sigs(). */
+void util_unblock_sigs(sigset_t newsig)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+	sigprocmask(SIG_SETMASK, &newsig, NULL);
+#else
+#ifdef HAVE_NPTL
+       	spin_lock_irq (&current->sighand->siglock);
+	current->blocked = newsig;
+	recalc_sigpending ();
+	spin_unlock_irq (&current->sighand->siglock);
+#else
+	spin_lock_irq (&current->sigmask_lock);
+	current->blocked = newsig;
+	recalc_sigpending (current);
+	spin_unlock_irq (&current->sigmask_lock);
+#endif
+#endif
+}
+
+/*
+ * util_daemonize() 
+ *
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+/* yes, len is unused but kept here for backwards compatibility. */
+void util_daemonize (char *name, int len, int shutdown_sigs)
+{
+	sigset_t tmpsig;
+
+	/* 2.6 daemonize() sets comm from name and blocks all signals */
+	daemonize (name);
+
+	if (shutdown_sigs) {
+		/* Unblock SIGKILL, SIGSTOP, SIGHUP and SIGINT */
+		sigemptyset(&tmpsig);
+		sigaddsetmask(&tmpsig, SHUTDOWN_SIGS);
+		sigprocmask(SIG_UNBLOCK, &tmpsig, NULL);
+	}
+
+	return;
+}				/* util_daemonize */
+#else
+/* 2.4 variant: set comm by hand (15-char limit enforced via BUG())
+ * and block everything except, optionally, SHUTDOWN_SIGS. */
+void util_daemonize (char *name, int len, int shutdown_sigs)
+{
+	daemonize ();
+	reparent_to_init ();
+
+	if (len > 0) {
+		if (len > 15)
+			BUG();
+		strncpy (current->comm, name, len);
+		current->comm[len] = '\0';
+	}
+
+	if (shutdown_sigs)
+		util_block_sigs(NULL, SHUTDOWN_SIGS);
+	else
+		util_block_sigs(NULL, 0);
+	return;
+}				/* util_daemonize */
+#endif
+
+/*
+ * util_sleep()
+ *
+ * The interval time is in milliseconds
+ *
+ * This function needs to be removed.
+ * Instead call schedule_timeout() directly and handle signals.
+ */
+int util_sleep (__u32 ms)
+{
+	__u32 numJiffies;
+
+	/* 10ms = 1 jiffy, minimum resolution is one jiffy */
+	numJiffies = ms * HZ / 1000;
+	numJiffies = (numJiffies < 1) ? 1 : numJiffies;
+
+	/* interruptible: a signal ends the sleep early, but this is
+	 * not reported -- the function always returns 0 */
+	set_current_state (TASK_INTERRUPTIBLE);
+	numJiffies = schedule_timeout (numJiffies);
+
+	return 0;
+}				/* util_sleep */
+
+/* prefetch has been declared to allow to build in debug mode */
+#ifdef DEBUG
+#ifndef ARCH_HAS_PREFETCH
+/* no-op stub when the arch supplies no prefetch implementation */
+inline void prefetch (const void *x)
+{;
+}
+#endif
+#endif
+
+
+/* Timer callback: mark the util_timeout expired and wake any waiter. */
+static void util_timeout_func(unsigned long data)
+{
+	util_timeout *to = (util_timeout *)data; 
+
+	to->timed_out = 1;
+	wake_up(&to->wait);
+}
+
+/* Prepare a util_timeout for use; must precede util_set_timeout(). */
+void util_init_timeout(util_timeout *to)
+{
+	init_timer(&to->timer);
+	to->timer.data = (unsigned long)to;
+	to->timer.function = util_timeout_func;
+	to->timed_out = 0;
+	init_waitqueue_head(&to->wait);
+}
+
+/* Arm the timeout for 'timeout' ms (minimum one jiffy).  A zero
+ * timeout marks it expired immediately without arming the timer. */
+void util_set_timeout(util_timeout *to, __u32 timeout)
+{
+	__u32 how_long;
+
+	if (!timeout) {
+		to->timed_out = 1;
+		return ;
+	}
+
+	how_long = (timeout * HZ / 1000);
+	if (how_long < 1)
+		how_long = 1;
+
+	to->timer.expires = jiffies + how_long;
+	add_timer(&to->timer);
+}
+
+/* Cancel the timer, waiting out a concurrently-running handler. */
+void util_clear_timeout(util_timeout *to)
+{
+	del_timer_sync(&to->timer);
+}
+
+/*
+ * Sleep on 'wq' until *var == val.  With ms != 0 a timer bounds the
+ * wait.  Returns 0 when the condition is met, -ETIMEDOUT on expiry,
+ * -EINTR if a signal arrived.  Callers use the util_wait_atomic_eq()
+ * wrapper (util.h), which skips the sleep when already equal.
+ */
+int __util_wait_atomic_eq(wait_queue_head_t *wq, atomic_t *var, int val, int ms)
+{
+	int ret;
+	util_timeout timeout;
+	DECLARE_WAITQUEUE(wait, current);
+	DECLARE_WAITQUEUE(to_wait, current);
+
+	util_init_timeout(&timeout);
+
+	if (ms) {
+		util_set_timeout(&timeout, ms);
+		if (timeout.timed_out) {
+			util_clear_timeout(&timeout);
+		}
+	}
+	/* queued on both the caller's wq and the timeout's wq, so
+	 * either event wakes us */
+	add_wait_queue(wq, &wait);
+	add_wait_queue(&timeout.wait, &to_wait);
+	do { 
+		ret = 0;
+		/* state must be set before the condition re-check to
+		 * avoid missing a wakeup */
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (atomic_read(var)==val)
+			break;
+		ret = -ETIMEDOUT;
+		if (timeout.timed_out)
+			break;
+		schedule();
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+	} while (1);
+	
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(wq, &wait);
+	remove_wait_queue(&timeout.wait, &to_wait);
+
+	if (ms)
+		util_clear_timeout(&timeout);
+
+	return ret;
+}
+
+/* resizable (using chained pages) array stuff */
+/* Initialize an empty resizable array of elem_size-byte elements. */
+void util_init_rarray(util_rarray *arr, u16 elem_size)
+{
+	arr->elements = 0;
+	arr->max_elem = 0;
+	arr->elem_size = elem_size;
+	arr->page = NULL;
+}	
+
+
+/*
+ * Map index -> element pointer by walking the chain of pages.  O(n) in
+ * pages.  Returns NULL (after a printk) for an out-of-range index or a
+ * broken chain.
+ */
+void * util_rarray_idx_to_slot(util_rarray *arr, int idx)
+{
+	int pgnum, pgoff;
+	util_rarray_page *pg;
+	
+	if (idx >= arr->max_elem) {
+		printk("eek! asked for %d, but only %d elements\n", 
+		       idx, arr->max_elem);
+		return NULL;
+	}
+	
+	pgnum = idx / UTIL_RARRAY_ELEM_PER_BUF(arr);
+	pgoff = idx % UTIL_RARRAY_ELEM_PER_BUF(arr);
+	pg = (util_rarray_page *)arr->page;
+	while (pgnum--) {
+		if (!pg->next) {
+			printk("eeek! no next page!\n");
+			return NULL;
+		}
+		pg = pg->next;
+	}
+	return (((char *)pg->buf) + (pgoff * arr->elem_size));
+}
+
+
+/*
+ * Return a pointer to the next free slot, growing the array by one
+ * page if full.  On success *index (if non-NULL) receives the slot's
+ * index and 'elements' is bumped.  Returns NULL on allocation failure.
+ */
+void * util_get_new_rarray_slot(util_rarray *arr, int *index)
+{
+	char *tmp;
+	util_rarray_page *newpg, *pg;
+	
+	if (arr->max_elem == arr->elements) {
+		newpg = (util_rarray_page *) __get_free_page(GFP_KERNEL);
+		if (!newpg) {
+			printk("could not grow array!!!\n");
+			return NULL;
+		}
+		memset(newpg, 0, PAGE_SIZE);
+		/* append the fresh page to the end of the chain */
+		if (arr->page) {
+			pg = (util_rarray_page *)arr->page;
+			while (pg->next)
+				pg = pg->next;
+			pg->next = newpg;
+		} else
+			arr->page = newpg;
+		arr->max_elem += UTIL_RARRAY_ELEM_PER_BUF(arr);
+	}
+
+	tmp = util_rarray_idx_to_slot(arr, arr->elements);
+	if (tmp) {
+		if (index)
+			*index = arr->elements;
+		arr->elements++;
+	}
+	return tmp;
+}
+
+
+/* Copy *new into a fresh slot; returns its index, or -EINVAL if the
+ * array could not be grown. */
+int util_add_to_rarray(util_rarray *arr, void *new)
+{
+	void *slot;
+	int idx;
+
+	slot = util_get_new_rarray_slot(arr, &idx);
+	if (slot == NULL) 
+		return -EINVAL;
+	memcpy(slot, new, arr->elem_size);
+	return idx;
+}
+
+/* resizes rarray to at least newelem elements */
+int util_resize_rarray(util_rarray *arr, int newelem)
+{
+	util_rarray_page *newpg, *pg;
+
+	printk("util_resize_rarray: newsize=%d, maxelem=%d\n", newelem, arr->max_elem);
+	/* add whole pages until capacity reaches newelem; 'elements'
+	 * (the used count) is deliberately left untouched */
+	while (arr->max_elem < newelem) {
+		newpg = (util_rarray_page *) __get_free_page(GFP_KERNEL);
+		if (!newpg) {
+			printk("could not grow array!!!\n");
+			return -ENOMEM;
+		}
+		memset(newpg, 0, PAGE_SIZE);
+		if (arr->page) {
+			pg = (util_rarray_page *)arr->page;
+			while (pg->next)
+				pg = pg->next;
+			pg->next = newpg;
+		} else
+			arr->page = newpg;
+		arr->max_elem += UTIL_RARRAY_ELEM_PER_BUF(arr);
+	}
+	printk("leaving util_resize_rarray: newsize=%d, maxelem=%d\n", newelem, arr->max_elem);
+
+	return 0;
+}
+
+

Added: trunk/cluster/util.h
===================================================================
--- trunk/cluster/util.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/util.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,109 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * util.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_UTIL_H
+#define CLUSTER_UTIL_H
+
+#ifdef __KERNEL__
+/* signals a daemonized cluster thread still responds to */
+#define SHUTDOWN_SIGS   (sigmask(SIGKILL) | sigmask(SIGHUP) | \
+			 sigmask(SIGINT) | sigmask(SIGQUIT))
+
+/* timeout structure taken from Ben's aio.c */
+typedef struct _util_timeout {
+	struct timer_list	timer;		/* fires util_timeout_func */
+	int			timed_out;	/* set by the timer handler */
+	wait_queue_head_t	wait;		/* woken on expiry */
+} util_timeout;
+
+void util_clear_timeout(util_timeout *to);
+void util_daemonize(char *name, int len, int shutdown_sigs);
+void util_init_timeout(util_timeout *to);
+void util_set_timeout(util_timeout *to, __u32 timeout);
+void util_show_stack(unsigned long *esp);
+void util_show_trace(unsigned long *stack);
+int util_sleep(__u32 ms);
+int __util_wait_atomic_eq(wait_queue_head_t *wq, atomic_t *var, int val, int ms);
+void util_block_sigs(sigset_t *oldsigs, unsigned long mask);
+void util_unblock_sigs(sigset_t newsig);
+
+/* exits when var == val, or on timeout */
+static inline int util_wait_atomic_eq(wait_queue_head_t *wq, atomic_t *var, int val, int timeout)
+{
+	int ret = 0;
+	/* fast path: skip the waitqueue dance when already equal */
+	if (atomic_read(var) != val)
+		ret = __util_wait_atomic_eq(wq, var, val, timeout);
+	return ret;
+}
+
+#endif  /* __KERNEL__ */
+
+/* resizable array */
+typedef struct _util_rarray
+{
+	void *page;		/* head of chained util_rarray_page list */
+	u16 elements;		/* slots in use */
+	u16 max_elem;		/* slots allocated */
+	u16 elem_size;		/* bytes per element */
+	u16 reserved1;
+} util_rarray;
+
+#define UTIL_RARRAY_PAGE_BUF_SIZE    (PAGE_SIZE - offsetof(util_rarray_page, buf))
+#define UTIL_RARRAY_ELEM_PER_BUF(r)  ((UTIL_RARRAY_PAGE_BUF_SIZE) / (r)->elem_size)
+/* one page of the chain: link pointer followed by element storage */
+typedef struct _util_rarray_page
+{
+	void *next;
+	char buf[0];
+} util_rarray_page;
+
+void util_init_rarray(util_rarray *arr, u16 elem_size);
+void * util_get_new_rarray_slot(util_rarray *arr, int *index);
+int util_add_to_rarray(util_rarray *arr, void *new);
+void * util_rarray_idx_to_slot(util_rarray *arr, int idx);
+int util_resize_rarray(util_rarray *arr, int newelem);
+
+#ifdef __KERNEL__
+/* bookkeeping for a kernel helper thread: wakeup channel, identity
+ * and exit notification */
+typedef struct _util_thread_info
+{
+	wait_queue_head_t thread_wq;
+	atomic_t woken;
+	struct task_struct *task;
+	struct completion complete;
+	int pid;
+} util_thread_info;
+
+
+static inline void util_thread_info_init(util_thread_info *info)
+{
+	init_waitqueue_head(&info->thread_wq);
+	atomic_set(&info->woken, 0);
+	info->task = NULL;
+	info->pid = -1;
+	init_completion(&info->complete);
+}
+#endif /* __KERNEL__ */
+
+#endif /* CLUSTER_UTIL_H */

Added: trunk/cluster/warning_hack.h
===================================================================
--- trunk/cluster/warning_hack.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/warning_hack.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,40 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * warning_hack.h
+ *
+ * just to get rid of stupid warnings
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef WARNING_HACK_H
+#define WARNING_HACK_H
+
+struct mem_dqinfo;
+struct request;
+
+/* forward-declare these kernel inlines so including headers in an
+ * unusual order does not produce implicit-declaration warnings */
+extern __inline__ int generic_fls(int x);
+extern __inline__ int get_bitmask_order(unsigned int count);
+extern inline void mark_info_dirty(struct mem_dqinfo *info);
+extern inline int rq_data_dir(struct request *rq);
+	
+
+#endif /* WARNING_HACK_H */

Modified: trunk/src/Makefile
===================================================================
--- trunk/src/Makefile	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/Makefile	2004-12-06 21:45:32 UTC (rev 1693)
@@ -69,7 +69,7 @@
 	buffer_head_io.c	\
 	dcache.c		\
 	dir.c			\
-	dlm.c			\
+	dlmglue.c		\
 	extent_map.c		\
 	file.c			\
 	heartbeat.c		\
@@ -77,22 +77,18 @@
 	ioctl.c			\
 	journal.c		\
 	localalloc.c		\
-	lockres.c		\
 	namei.c			\
-	nm.c			\
 	proc.c			\
+	slot_map.c		\
 	suballoc.c		\
 	super.c			\
 	symlink.c		\
 	sysfile.c		\
 	util.c			\
 	ver.c			\
-	volcfg.c		\
 	vote.c
-
 HFILES = \
 	ocfs2_fs.h		\
-	ocfs2_disk_dlm.h	\
 	ocfs1_fs_compat.h	\
 	ocfs.h			\
 	ocfs_log.h		\
@@ -102,7 +98,7 @@
 	alloc.h			\
 	dcache.h		\
 	dir.h			\
-	dlm.h			\
+	dlmglue.h		\
 	extent_map.h		\
 	file.h			\
 	heartbeat.h		\
@@ -110,19 +106,16 @@
 	ioctl.h			\
 	journal.h		\
 	localalloc.h		\
-	lockres.h		\
 	namei.h			\
-	nm.h			\
 	proc.h			\
+	slot_map.h		\
 	suballoc.h		\
 	super.h			\
 	symlink.h		\
 	sysfile.h		\
 	util.h			\
 	ver.h			\
-	volcfg.h		\
 	vote.h
-
 VERSION_FILES = $(CFILES) $(HFILES)
 VERSION_SRC = ver.c
 VERSION_PREFIX = OCFS
@@ -195,7 +188,7 @@
 BASE_DEFINES = -DMODULE -DLINUX -D__KERNEL__ 
 DEFINES += $(BASE_DEFINES) $(GLOBAL_DEFINES)
 
-INCLUDES = -I. -I$(KERNELINC) -I$(GCCINC)
+INCLUDES = -I. -I$(TOPDIR) -I$(KERNELINC) -I$(GCCINC)
 
 CFLAGS = $(OPTS) $(MACH_CFLAGS) -pipe -nostdinc -fno-strict-aliasing \
 	-fno-common -fomit-frame-pointer $(MODVERSIONS) $(WARNINGS)
@@ -244,8 +237,8 @@
 INSTALL_RULES = install-ocfs
 
 install-ocfs: $(INSTALL_MODULE)
-	$(TOPDIR)/mkinstalldirs $(DESTDIR)$(MODULEDIR)
-	$(INSTALL_DATA) $< $(DESTDIR)$(MODULEDIR)/$<
+	$(TOPDIR)/mkinstalldirs $(DESTDIR)$(MODULEDIR)/ocfs2
+	$(INSTALL_DATA) $< $(DESTDIR)$(MODULEDIR)/ocfs2/$<
 
 include $(TOPDIR)/Postamble.make
 
@@ -271,7 +264,7 @@
 STAMP_DIR = $(OCFS_SRC_DIR)
 include $(OCFS_SRC_DIR)/../Versioning.make
 
-EXTRA_CFLAGS += $(GLOBAL_DEFINES)
+EXTRA_CFLAGS += $(GLOBAL_DEFINES) -I$(CLUSTERINC)
 
 CFLAGS_$(VERSION_OBJ) += $(VERDEFS)
 

Modified: trunk/src/alloc.c
===================================================================
--- trunk/src/alloc.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/alloc.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -35,7 +35,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
-#include "dlm.h"
+#include "dlmglue.h"
 #include "extent_map.h"
 #include "inode.h"
 #include "localalloc.h"
@@ -210,7 +210,7 @@
 			/* we always use node zeros suballocator */
 			eb->h_suballoc_node = 0;
 #else
-			eb->h_suballoc_node = osb->node_num;
+			eb->h_suballoc_node = osb->slot_num;
 #endif
 			eb->h_suballoc_bit = suballoc_bit_start;
 			eb->h_list.l_count = ocfs2_extent_recs_per_eb(osb->sb);
@@ -1170,12 +1170,16 @@
 
 	down_write(&OCFS_I(inode)->ip_alloc_sem);
 
+	target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
+						     inode->i_size);
+
+	/* the extent map gets truncated in ocfs_do_truncate */
+	ocfs2_lvb_set_trunc_clusters(inode, target_i_clusters);
+
 	last_eb_bh = tc->tc_last_eb_bh;
 	tc->tc_last_eb_bh = NULL;
 	handle = tc->tc_handle;
 
-	target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
-						     inode->i_size);
 	fe = (ocfs2_dinode *) fe_bh->b_data;
 
 	if (fe->id2.i_list.l_tree_depth) {
@@ -1236,6 +1240,14 @@
 			LOG_ERROR_STATUS(status);
 			goto bail;
 		}
+		/* Since we got our cluster lock from caller and we
+		 * don't add it to the handle: */
+		ocfs_set_inode_lock_trans(osb->journal, inode);
+
+		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+		status = ocfs_mark_inode_dirty(handle, inode, fe_bh);
+		if (status < 0)
+			LOG_ERROR_STATUS(status);
 	} else {
 		status = ocfs_extend_trans(handle, credits);
 		if (status < 0) {
@@ -1346,15 +1358,15 @@
 		}
 
 		ocfs_handle_add_inode(handle, ext_alloc_inode);
-		status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 0, 
-					   &ext_alloc_bh, ext_alloc_inode);
+		status = ocfs2_meta_lock(ext_alloc_inode,
+					handle,
+					&ext_alloc_bh,
+					1);
 		if (status < 0) {
 			if (status != -EINTR)
 				LOG_ERROR_STATUS (status);
 			goto bail;
 		}
-		ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
-				     0, ext_alloc_inode);
 	}
 
 	data_alloc_inode = ocfs_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, -1);
@@ -1365,15 +1377,12 @@
 	}
 
 	ocfs_handle_add_inode(handle, data_alloc_inode);
-	status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 
-				   0, &data_alloc_bh, data_alloc_inode);
+	status = ocfs2_meta_lock(data_alloc_inode, handle, &data_alloc_bh, 1);
 	if (status < 0) {
 		if (status != -EINTR)
 			LOG_ERROR_STATUS (status);
 		goto bail;
 	}
-	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
-			     0, data_alloc_inode);
 
 	(*tc)->tc_bitmap_inode    = data_alloc_inode;
 	(*tc)->tc_bitmap_bh       = data_alloc_bh;

Modified: trunk/src/aops.c
===================================================================
--- trunk/src/aops.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/aops.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -83,7 +83,7 @@
 
 	/* We don't use the page cache to create symlink data, so if
 	 * need be, copy it over from the buffer cache. */
-	if (!buffer_uptodate(bh_result) && ocfs_inode_is_new(osb, inode)) {
+	if (!buffer_uptodate(bh_result) && ocfs_inode_is_new(inode)) {
 		buffer_cache_bh = sb_getblk(osb->sb, 
 					    fe->id2.i_list.l_recs[0].e_blkno + iblock);
 		if (!buffer_cache_bh) {
@@ -96,7 +96,7 @@
 		 * the bh, even if it commits while we're doing the
 		 * copy, the data is still good. */
 		if (buffer_jbd(buffer_cache_bh) 
-		    && ocfs_inode_is_new(osb, inode)) {
+		    && ocfs_inode_is_new(inode)) {
 			kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
 			if (!kaddr) {
 				LOG_ERROR_ARGS("couldn't kmap!\n");
@@ -125,12 +125,11 @@
 }
 
 static int ocfs_get_block(struct inode *inode, sector_t iblock,
-		struct buffer_head *bh_result, int create)
+			  struct buffer_head *bh_result, int create)
 {
 	int err = -EIO;
 	u64 vbo = 0;
 	u64 p_blkno;
-	int open_direct;
 
 	LOG_ENTRY_ARGS("(0x%p, %llu, 0x%p, %d)\n", inode,
 			(unsigned long long)iblock, bh_result, create);
@@ -140,8 +139,6 @@
 		       inode, inode->i_ino);
 	}
 
-	open_direct = OCFS_I(inode)->ip_open_flags & OCFS_OIN_OPEN_FOR_DIRECTIO;
-
 	if (S_ISLNK(inode->i_mode)) {
 		/* this always does I/O for some reason. */
 		err = ocfs_symlink_get_block (inode, iblock, bh_result, 
@@ -162,13 +159,8 @@
 	}
 	spin_unlock(&OCFS_I(inode)->ip_lock);
 
-	if (!open_direct)
-		down_read(&OCFS_I(inode)->ip_node_extend_sem);
-
 	err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
 					   NULL);
-	if (!open_direct)
-		up_read(&OCFS_I(inode)->ip_node_extend_sem);
 
 	if (err) {
 		LOG_ERROR_ARGS("Error %d from get_blocks(0x%p, %llu, 1, %llu, NULL)\n",
@@ -500,55 +492,39 @@
  * called like this: dio->get_blocks(dio->inode, fs_startblk,
  * 					fs_count, map_bh, dio->rw == WRITE);
  */
-static int ocfs_direct_IO_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks, struct buffer_head *bh_result, int create)
+static int ocfs_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
+				     unsigned long max_blocks,
+				     struct buffer_head *bh_result, int create)
 {
 	int ret = -1;
 	int status;
-	ocfs_super *osb = NULL;
 	u64 vbo_max; /* file offset, max_blocks from iblock */
 	u64 p_blkno;
 	int contig_blocks;
-	int set_new = 0; /* flag */
 	unsigned char blocksize_bits;
 
 	if (!inode || !bh_result) {
-		LOG_ERROR_STR("ocfs_direct_IO_get_blocks: inode or bh_result is null");
+		LOG_ERROR_STR("inode or bh_result is null");
 		return -EIO;
 	}
-	osb = OCFS_SB(inode->i_sb);
 
 	blocksize_bits = inode->i_sb->s_blocksize_bits;
-	/* make sure we're up to date... */
-	if (atomic_read(&OCFS_I(inode)->ip_needs_verification)) {
-		LOG_TRACE_STR ("ocfs_direct_IO_get_blocks: verify oin.");
-		status = ocfs_verify_update_inode (osb, inode);
-		if (status < 0) {
-			LOG_TRACE_STR ("ocfs_verify_update_inode failed");
-			ret = -EIO;
-			goto bail;
-		}
-	}
 
 	/* This function won't even be called if the request isn't all
 	 * nicely aligned and of the right size, so there's no need
 	 * for us to check any of that. */
 
-	vbo_max = (u64)(iblock + max_blocks) << blocksize_bits;
+	vbo_max = ((u64)iblock + max_blocks) << blocksize_bits;
 
-	/* NOTE: create flag is set when we ?may? have to allocate some
-	   blocks for the file. */
-	if (create &&
-	    (vbo_max > ocfs2_clusters_to_bytes(inode->i_sb,
-					       OCFS_I(inode)->ip_clusters))) {
-		/* WARNING: How much do we really want to extend the file? */
-		status = ocfs_extend_file(osb, inode, vbo_max);
-		if (status < 0) {
-			status = -ENOSPC;
-			LOG_ERROR_STR("ocfs_direct_IO_get_blocks: failed to extend the file!");
-			goto bail;
-		}
-		set_new = 1;
+	spin_lock(&OCFS_I(inode)->ip_lock);
+	if ((iblock + max_blocks) >
+	    ocfs2_clusters_to_blocks(inode->i_sb,
+				     OCFS_I(inode)->ip_clusters)) {
+		spin_unlock(&OCFS_I(inode)->ip_lock);
+		err = -EIO;
+		goto bail;
 	}
+	spin_unlock(&OCFS_I(inode)->ip_lock);
 
 	/* This figure out the size of the next contiguous block, and
 	 * our logical offset */	
@@ -561,16 +537,7 @@
 		goto bail;
 	}
 
-	/* Do whatever we need to the buffer_head */
-	if (set_new) {
-		set_buffer_new(bh_result);
-		/* Do we really want to set bh_result->b_blocknr here too? */
-		bh_result->b_blocknr = p_blkno;
-	} else {
-		clear_buffer_new(bh_result);
-		/* is the last argument here correct? */
-		map_bh(bh_result, inode->i_sb, p_blkno);
-	}
+	map_bh(bh_result, inode->i_sb, p_blkno);
 
 	/* make sure we don't map more than max_blocks blocks here as
 	   that's all the kernel will handle at this point. */

Modified: trunk/src/dcache.c
===================================================================
--- trunk/src/dcache.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/dcache.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -38,7 +38,6 @@
 #include "alloc.h"
 #include "dcache.h"
 #include "file.h"
-#include "vote.h"
 
 #define OCFS_DEBUG_CONTEXT    OCFS_DEBUG_CONTEXT_DCACHE
 
@@ -72,13 +71,16 @@
 		goto bail;
 	}
 
+	spin_lock(&OCFS_I(inode)->ip_lock);
 	/* did we or someone else delete this inode? */
 	if (INODE_DELETED(inode)) {
+		spin_unlock(&OCFS_I(inode)->ip_lock);
 		LOG_TRACE_ARGS("dentry_revalidate: inode (%llu) deleted, "
 			       "returning false\n",
 			       OCFS_I(inode)->ip_blkno);
 		goto bail;
 	}
+	spin_unlock(&OCFS_I(inode)->ip_lock);
 
 #warning "should we do this for all files?"
 	if (S_ISDIR(inode->i_mode) && (!inode->i_nlink)) {
@@ -88,23 +90,8 @@
 		goto bail;
 	}
 
-	if (ocfs_node_map_is_only(osb, &osb->publ_map, osb->node_num)) {
-		LOG_TRACE_STR ("Only node alive.  revalidate=true.");
-		ret = 1;
-		goto bail;
-	}
-
-	/* if I hold cache lock, no revalidate needed */
-	if (ocfs_is_local_cache_lock(osb, inode)) {
-		ret = 1;
-		goto bail;
-	}
-
 	ret = 1;
 
-	/* TODO: Is this really necessary? */
-	atomic_set(&OCFS_I(inode)->ip_needs_verification, 1);
-
 bail:
 	LOG_EXIT_INT (ret);
 

Modified: trunk/src/dir.c
===================================================================
--- trunk/src/dir.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/dir.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -49,7 +49,7 @@
 
 #include "alloc.h"
 #include "dir.h"
-#include "dlm.h"
+#include "dlmglue.h"
 #include "extent_map.h"
 #include "file.h"
 #include "inode.h"
@@ -85,7 +85,6 @@
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct super_block * sb = inode->i_sb;
 	int have_disk_lock = 0;
-	ocfs_super *osb = OCFS_SB(sb);
 
 	LOG_SET_CONTEXT(READDIR);
 
@@ -94,7 +93,7 @@
 	stored = 0;
 	bh = NULL;
 
-	error = ocfs_acquire_lock_ro(osb, inode);
+	error = ocfs2_meta_lock(inode, NULL, NULL, 0);
 	if (error < 0) {
 		if (error != -EINTR)
 			LOG_ERROR_STATUS (error);
@@ -201,11 +200,8 @@
 
 	stored = 0;
 bail:
-	if (have_disk_lock) {
-		error = ocfs_release_lock_ro (osb, inode);
-		if (error < 0)
-			LOG_ERROR_STATUS (error);
-	}
+	if (have_disk_lock)
+		ocfs2_meta_unlock(inode, 0);
 
 	LOG_EXIT_STATUS(stored);
 	LOG_CLEAR_CONTEXT();
@@ -224,7 +220,6 @@
 			    struct ocfs2_dir_entry **dirent)
 {
 	int status = -ENOENT;
-	int tmpstat;
 	int lock_acq = 0;
 
 	LOG_ENTRY_ARGS ("(osb=%p, parent=%llu, name='%*s', blkno=%p, inode=%p)\n",
@@ -232,7 +227,7 @@
 
 	if (take_lock) {
 		/* Get a lock on the directory... */
-		status = ocfs_acquire_lock_ro (osb, inode);
+		status = ocfs2_meta_lock(inode, NULL, NULL, 0);
 		if (status < 0) {
 			/* Volume should be disabled in this case */
 			if (status != -EINTR)
@@ -254,13 +249,7 @@
 leave:
 
 	if (take_lock && lock_acq)
-	{
-		tmpstat = ocfs_release_lock_ro (osb, inode);
-		if (tmpstat < 0) {
-			LOG_ERROR_STATUS (tmpstat);
-			/* Volume should be disabled in this case */
-		}
-	}
+		ocfs2_meta_unlock(inode, 0);
 
 	if (status < 0) {
 		*dirent = NULL;

Deleted: trunk/src/dlm.c
===================================================================
--- trunk/src/dlm.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/dlm.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -1,732 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlm.c
- *
- * Distributed lock manager. Requests and processes lock votes.
- *
- * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include "ocfs_compat.h"
-
-#include <linux/fs.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-#include <linux/random.h>
-
-#include "ocfs_log.h"
-#include "ocfs.h"
-#include "ocfs2.h"
-
-#include "alloc.h"
-#include "dcache.h"
-#include "dlm.h"
-#include "inode.h"
-#include "lockres.h"
-#include "nm.h"
-#include "util.h"
-#include "vote.h"
-
-#include "ocfs_journal.h"
-#include "buffer_head_io.h"
-
-#define WAIT_FOR_VOTE_INCREMENT  200
-
-/* Tracing */
-#define OCFS_DEBUG_CONTEXT OCFS_DEBUG_CONTEXT_DLM
-
-/* inode is definitely non NULL */
-static inline int ocfs_wait_for_readonly_drop(ocfs_super *osb, struct inode *inode)
-{
-	int status = 0;
-	ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
-
-	if (ocfs_node_map_is_empty(&lockres->readonly_map))
-		return status;
-	status = ocfs_drop_readonly_cache_lock(osb, inode, 0);
-	return status;	
-}
-
-/*
- * ocfs_update_disk_lock()
- * inode is definitely non NULL
- */
-void ocfs_update_disk_lock (ocfs_super * osb, 
-			    struct buffer_head *bh, 
-			    struct inode *inode)
-{
-	ocfs2_dinode *fe = NULL;
-	ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
-
-	LOG_ENTRY ();
-
-	fe = (ocfs2_dinode *) bh->b_data;
-
-	/* We do our own I/O here to lock out dirty readers from
-	 * refreshing the bh when we're in the middle of changing
-	 * it. We shouldn't ever get here if it's a journalled buffer
-	 * so io_sem is not necessary. */
-	if (buffer_jbd(bh)) {
-		printk("Ugh, block %llu has the JBD bit set!\n",
-		       (unsigned long long)bh->b_blocknr);
-		BUG();
-	}
-
-	if ((DISK_LOCK(fe)->dl_master == lockres->master_node_num)
-	    && (DISK_LOCK(fe)->dl_level == lockres->lock_type))
-		goto skip_write;
-
-	lock_buffer(bh);
-
-	if (buffer_jbd(bh)) {
-		printk("Ugh, block %llu has the JBD bit set!\n",
-		       (unsigned long long)bh->b_blocknr);
-		BUG();
-	}
-
-	DISK_LOCK(fe)->dl_master = lockres->master_node_num;
-	DISK_LOCK(fe)->dl_level = lockres->lock_type;
-
-	set_buffer_uptodate(bh);
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)	
-	/*
-	 * mark_buffer_clean() doesn't exist in 2.6.x kernels.
-	 * Not many places actually used mark_buffer_clean, but
-	 * at least reiserfs uses clear_buffer_dirty() as
-	 * a replacment.
-	 */
-	clear_buffer_dirty(bh);
-#else
-	mark_buffer_clean(bh);
-#endif
-	bh->b_end_io = ocfs_end_buffer_io_sync;
-	submit_bh(WRITE, bh);
-	wait_on_buffer(bh);
-	SET_BH_SEQNUM(inode, bh);
-
-skip_write:
-	LOG_EXIT ();
-}				/* ocfs_update_disk_lock */
-
-int ocfs_notify_cluster(ocfs_super *osb, 
-			struct inode *inode,
-			u32 message_flags)
-{
-	int status = -EAGAIN;
-	ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
-	u32 flags;
-
-	LOG_ENTRY_ARGS("(inode = %llu, flags = 0x%x)\n", 
-		       OCFS_I(inode)->ip_blkno, message_flags);
-
-	while (status == -EAGAIN) {
-		ocfs_acquire_lockres_write(inode);
-
-		flags = message_flags;
-		if (ocfs_inode_is_new(osb, inode))
-			flags |= FLAG_FAST_PATH_LOCK;
-
-		if (ocfs_task_interruptible ()) {
-			ocfs_release_lockres_write (inode);
-			LOG_TRACE_ARGS("interrupted... inode = %llu\n",
-				       OCFS_I(inode)->ip_blkno);
-			status = -EINTR;
-			goto bail;
-		}
-		
-		status = new_lock_function(osb, lockres->lock_type, flags, NULL, inode);
-
-		if (status < 0) {
-			if (status != -EAGAIN)
-				LOG_ERROR_STATUS (status);
-			ocfs_release_lockres_write (inode); // ocfs_file_open ocfs_symlink
-			if (status == -EAGAIN || status == -ETIMEDOUT) {
-				ocfs_sleep (50);
-				status = -EAGAIN;
-				continue;
-			}
-
-			goto bail;
-		}
-		ocfs_release_lockres_write (inode); // ocfs_file_open 
-	}
-bail:
-	LOG_EXIT_STATUS (status);
-	return status;
-}
-
-enum {
-	invalid_path = 0,
-	fast_path, 
-	become_master, 
-	get_x, 
-	wait_for_release, 
-	master_request,
-	num_paths
-};
-
-static const char *lock_path_strs[] = {
-	"invalid_path", "fast_path", "become_master",
-	"get_x", "wait_for_release", "master_request"
-};
-
-static inline const char * lock_path_str(int lock_path);
-static inline const char * lock_path_str(int lock_path)
-{
-	if (lock_path >= num_paths || lock_path <= invalid_path)
-		return lock_path_strs[0];
-	return lock_path_strs[lock_path];
-}
-
-/*
- * ocfs_acquire_lock()
- * inode is definitely non NULL
- */
-int ocfs_acquire_lock (ocfs_super * osb, __u32 lock_type,
-		   __u32 flags, struct buffer_head **bh, struct inode *inode)
-{
-	int status;
-	__u32 updated;
-	ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
-	int k = 0;
-	int no_owner = 0, owner_dead = 0, wait_on_recovery = 0;
-	__u32 extra_lock_flags = 0;
-	__u64 lock_id;
-
-	LOG_ENTRY_ARGS ("(0x%p, %u, %u, 0x%p)\n",
-			osb, lock_type, flags, bh);
-
-	OCFS_ASSERT(lock_type != OCFS_LKM_NLMODE);
-	OCFS_ASSERT(inode);
-	OCFS_ASSERT(bh);
-	OCFS_ASSERT(!journal_current_handle());
-
-	lock_id = OCFS_I(inode)->ip_blkno;
-	LOG_TRACE_ARGS("lock_id = %llu\n", lock_id);
-
-	flags |= FLAG_ACQUIRE_LOCK;
-
-	*bh = sb_getblk(osb->sb, OCFS_I(inode)->ip_blkno);
-	if (*bh == NULL) {
-		status = -EIO;
-		LOG_ERROR_STATUS(status);
-		goto finally;
-	}
-
-	updated = 0;
-again:
-	ocfs_acquire_lockres_write (inode);
-
-	LOG_TRACE_ARGS("attempting to get lock, pass: %d\n", ++k);
-
-	/* if updated = 1 then we've read a valid bh so skip the
-	 * update_lockres if we can trust it. */
-	if (updated && (lockres->master_node_num != osb->node_num))
-		updated = 0;
-
-	if (!updated) {
-		status = ocfs_update_lockres(osb, *bh, inode, 1);
-		if (status < 0) {
-			ocfs_release_lockres_write (inode);
-			LOG_ERROR_STATUS (status);
-			goto finally;
-		}
-		updated = 1;
-	}
-
-reevaluate:
-	no_owner = (lockres->master_node_num == OCFS_INVALID_NODE_NUM);
-
-	/* master node is an invalid node */
-	if (unlikely(lockres->master_node_num >= osb->max_nodes && !no_owner)) {
-		LOG_ERROR_ARGS("lockres: master_node=%d, owner=%s, lockid=%llu\n",
-			       lockres->master_node_num, no_owner?"no":"yes",
-			       lock_id);
-		LOG_ERROR_STATUS (status = -EINVAL);
-		ocfs_release_lockres_write (inode); // ocfs_acquire_lock
-		goto finally;
-	}
-
-	wait_on_recovery =
-		ocfs_node_is_recovering(osb, lockres->master_node_num);
-	owner_dead = !(no_owner ||
-		       ocfs_node_is_alive(&osb->publ_map, 
-				 	  lockres->master_node_num));
-	if (owner_dead || wait_on_recovery) {
-		// if owner is dead or in recovery and the lockres 
-		// has the readonly flag set, clear it
-		clear_bit(LOCK_STATE_READONLY, &lockres->readonly_state);
-	}
-
-	status = 0;
-	extra_lock_flags = 0;
-
-	if (flags & FLAG_READONLY) {
-		if (test_bit(LOCK_STATE_READONLY, &lockres->readonly_state) ||
-		    (lockres->master_node_num == osb->node_num &&
-		     lockres->lock_type == OCFS_LKM_EXMODE)) {
-			/* already readonly or local node is master */
-			/* THIS node will see it as readonly, but OTHER
-			 * nodes will have to wait until lock_holders drops 
-			 * to 0 (to finish journal flush on this inode) */
-#ifdef VERBOSE_LOCKING_TRACE
-			printk("acquire_lock: lockid %llu, setting readonly\n",
-			       lock_id);
-#endif
-			set_bit(LOCK_STATE_READONLY, &lockres->readonly_state);
-			goto skip_lock_write;
-		}
-
-		if (lockres->master_node_num == OCFS_INVALID_NODE_NUM ||
-			   owner_dead || wait_on_recovery) {
-			/* no master or dead master */
-			extra_lock_flags = FLAG_REMASTER;
-		} else {
-			/* valid master, but either not cachelock or elsewhere */
-			if (lockres->lock_type != OCFS_LKM_EXMODE) {
-				/* treat just like a normal master change request */
-				extra_lock_flags = FLAG_CHANGE_MASTER;
-			}
-		}
-		goto do_lock;
-	} 
-
-#warning NEED MORE HANDLING HERE NOW FOR DROPPING LOCAL READONLY!!!
-	// anything else is NOT a readonly request
-	if (lockres->master_node_num != osb->node_num)
-		clear_bit(LOCK_STATE_READONLY, &lockres->readonly_state);
-
-	status = ocfs_wait_for_readonly_drop(osb, inode);
-	if (status < 0) {
-		if (status == -EAGAIN) {
-			// the rodrop thread is already running and needs the lockres
-			ocfs_release_lockres_write(inode);
-			ocfs_sleep (OCFS_NM_HEARTBEAT_TIME / 10);
-			ocfs_acquire_lockres_write(inode);
-			goto reevaluate;
-		}
-		LOG_ERROR_STATUS(status);
-		goto finally;
-	}
-
-	if (ocfs_inode_is_new(osb, inode)) {
-	    if (lockres->master_node_num != osb->node_num) {
-		printk("inode is new, but lockres is out of date! "
-			"owner = %d, type = %d\n", 
-		       lockres->master_node_num, lockres->lock_type);
-
-		BUG();
-	    }
-	    extra_lock_flags |= FLAG_FAST_PATH_LOCK;
-	}
-
-	/* some lock requests are simple messages and don't require a
-	 * master change. */
-	if (flags & FLAG_TRUNCATE_PAGES)
-		goto do_lock;
-
-	if ((lockres->master_node_num != osb->node_num)
-	    && (wait_on_recovery || no_owner || owner_dead)) {
-		extra_lock_flags |= FLAG_REMASTER;
-	} else if (lockres->master_node_num != osb->node_num) {
-		extra_lock_flags |= FLAG_CHANGE_MASTER;
-	}
-
-do_lock:
-	LOG_TRACE_ARGS("lockres: master=%d, locktype=%d, flags: %08x\n",
-		       lockres->master_node_num, lockres->lock_type, 
-		       flags|extra_lock_flags);
-
-#ifdef VERBOSE_LOCKING_TRACE
-	printk("acquire_lock: lockid=%llu, this=%d, master=%d, locktype=%d, "
-	       "flags=%08x, readonly=%s\n", lock_id, osb->node_num,
-	       lockres->master_node_num, lockres->lock_type, flags|extra_lock_flags,
-	       test_bit(LOCK_STATE_READONLY, &lockres->readonly_state) ? "yes" : "no");
-#endif	
-	if (wait_on_recovery 
-	    && !((flags|extra_lock_flags) & FLAG_FILE_RECOVERY)) {
-		int waitcnt = 0;
-		LOG_TRACE_ARGS("Waiting on node %u to be recovered\n",
-			       	lockres->master_node_num);
-		while (1) {
-			LOG_TRACE_ARGS("waitcnt = %d\n", waitcnt);
-			if (!ocfs_node_is_recovering(osb, 
-						     lockres->master_node_num))
-				break;
-			ocfs_sleep(500);
-		}
-	}
-
-	if (ocfs_task_interruptible ()) {
-		ocfs_release_lockres_write (inode);
-		LOG_TRACE_ARGS("interrupted... inode %llu\n",
-			       OCFS_I(inode)->ip_blkno);
-		status = -EINTR;
-		goto finally;
-	}
-	status = new_lock_function(osb, lock_type, flags|extra_lock_flags, *bh, inode);
-
-	if (status < 0) {
-		ocfs_release_lockres_write (inode); // ocfs_acquire_lock
-		if (status == -EAGAIN || status == -ETIMEDOUT) {
-			if (status == -ETIMEDOUT)
-				LOG_ERROR_ARGS("Timed out acquiring lock for inode "
-					       "%llu, retrying...\n", OCFS_I(inode)->ip_blkno);
-			ocfs_sleep (50);
-			goto again;
-		}
-		goto finally;
-	} 
-
-	/* We got the lock */
-	status = 0;
-
-skip_lock_write:
-	OCFS_ASSERT(status == 0);
-	lockres->lock_holders++;
-	if ((extra_lock_flags & FLAG_FAST_PATH_LOCK)
-	    && ((flags & FLAG_FILE_EXTEND) || (flags & FLAG_FILE_TRUNCATE)))
-		lockres->uncommitted_holders++;
-	LOG_TRACE_ARGS("lockres->lock_holders = %u\n", lockres->lock_holders);
-	LOG_TRACE_ARGS("lockres->uncommitted_holders = %u\n", 
-		       lockres->uncommitted_holders);
-	ocfs_release_lockres_write (inode); // ocfs_acquire_lock
-
-	if (atomic_read(&OCFS_I(inode)->ip_needs_verification)) {
-		ocfs2_dinode *fe = (ocfs2_dinode *) (*bh)->b_data;
-		status = ocfs_refresh_inode(inode, fe);
-		if (status < 0)
-			LOG_ERROR_STATUS(status);
-		status = 0;
-	}
-finally:
-
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_acquire_lock */
-
-
-/*
- * ocfs_release_lock_full()
- * inode is definitely non NULL
- */
-int ocfs_release_lock_full (ocfs_super * osb, __u32 lock_type, __u32 flags, struct inode *inode, __u32 num_ident)
-{
-	int status = 0;
-	int vote_status = 0;
-	ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
-	__u64 lock_id;
-	__u32 num_to_send;
-	ocfs_node_map votemap;
-
-	LOG_ENTRY_ARGS ("(0x%p, %u, %u, 0x%p)\n",
-			osb, lock_type, flags, lockres);
-
-	OCFS_ASSERT(inode);
-	OCFS_ASSERT(num_ident);
-
-	lock_id = OCFS_I(inode)->ip_blkno;
-	LOG_TRACE_ARGS("lock_id = %llu", lock_id);
-
-	flags |= FLAG_RELEASE_LOCK;
-
-	ocfs_acquire_lockres_write(inode);
-
-	if ((lockres->lock_type == OCFS_LKM_EXMODE) &&
-	    (lockres->master_node_num == osb->node_num) &&
-	    !(flags & (FLAG_FILE_EXTEND|FLAG_FILE_TRUNCATE))) {
-		status = 0;
-		goto finally;
-	}
-
-	if (flags & FLAG_READONLY) {
-		if (lockres->lock_type != OCFS_LKM_EXMODE ||
-		    lockres->master_node_num == OCFS_INVALID_NODE_NUM ||
-		    !(test_bit(LOCK_STATE_READONLY, &lockres->readonly_state))) {
-			LOG_ERROR_ARGS("READONLY release has issues! type=%d, master=%d, readonly=%s\n",
-				       lockres->lock_type, lockres->master_node_num, 
-				       test_bit(LOCK_STATE_READONLY, &lockres->readonly_state) ?  "yes" : "no");
-			BUG();
-		}
-		status = 0;
-		goto finally;
-	}
-
-	OCFS_ASSERT(lockres->uncommitted_holders <= lockres->lock_holders);
-
-	num_to_send = num_ident;
-	/* we don't want to send over a count for any size change
-	 * which includes acquires which we also didn't broadcast. */
-	if ((flags & FLAG_FILE_EXTEND) || (flags & FLAG_FILE_TRUNCATE)) {
-		if (num_ident <= lockres->uncommitted_holders) {
-			/* it breaks the rules to send zero or
-			 * negative lock releases! */
-			num_to_send = 0;
-			lockres->uncommitted_holders -= num_ident;
-			status = 0;
-			goto finally;
-		} 
-		num_to_send -= lockres->uncommitted_holders;
-		lockres->uncommitted_holders = 0;
-	}
-
-	OCFS_ASSERT(num_to_send);
-
-	ocfs_node_map_dup(osb, &votemap, &osb->publ_map);
-	ocfs_node_map_clear_bit(&votemap, osb->node_num);
-	if (ocfs_node_map_is_empty(&votemap))
-		goto finally;
-
-	if (!(flags & FLAG_FILE_UPDATE_OIN))
-		goto finally;
-
-	status = -EAGAIN;
-	while (status == -EAGAIN) {
-		// remove dead nodes
-		ocfs_node_map_and(&votemap, &osb->publ_map);
-		if (ocfs_node_map_is_empty(&votemap)) {
-			// last node in map died, so this node gets the lock
-			status = 0;
-			break;
-		}
-		status = ocfs_send_dlm_request_msg (osb, lock_id, lock_type, flags, 
-						    &votemap, inode, num_to_send, &vote_status);
-		if (status >= 0 || status == -EAGAIN) {
-			if (status != -EAGAIN)
-				status = vote_status;
-
-			if (status >= 0) {
-				break;
-			} else if (status == -EAGAIN) {
-				LOG_TRACE_ARGS ("EAGAIN on net vote, id=%llu\n", lock_id);
-				continue;
-			} else {
-				LOG_ERROR_STATUS (status);
-				break;
-			}
-		} else if (status == -ETIMEDOUT) {
-			LOG_TRACE_ARGS ("ETIMEDOUT on net vote, id=%llu\n", lock_id);
-			status = -EAGAIN;
-
-			LOG_ERROR_ARGS("Timed out releasing lock for inode %llu, retrying...\n", OCFS_I(inode)->ip_blkno);
-			ocfs_release_lockres_write(inode);
-			ocfs_sleep(200);
-			ocfs_acquire_lockres_write(inode);
-			continue;
-		} else 
-			LOG_ERROR_STATUS (status);
-	}
-
-finally:
-	if (lockres->lock_holders - num_ident < 0) {
-		printk("About to decrement lock_holders one too many! lockid "
-		       "= %llu\n", lock_id);
-		BUG();
-	}
-#warning "is this wise, or shouldn't we be retrying the lock release later?"
-	lockres->lock_holders -= num_ident;
-	LOG_TRACE_ARGS("lockres->lock_holders = %u\n", lockres->lock_holders);
-
-	ocfs_release_lockres_write (inode);
-  	LOG_EXIT_STATUS (status);
-	return (status);
-}				/* ocfs_release_lock_full */
-
-/* inode is definitely non NULL */
-int new_lock_function(ocfs_super * osb, __u32 requested_lock, __u32 flags, struct buffer_head *bh, struct inode *inode)	
-{
-	ocfs_node_map vote_map;
-	ocfs2_dinode *fe = NULL;
-	__u64 lock_id;
-	__u32 lock_type = requested_lock;
-	int need_to_zap_buffers = 0, need_lock_write = 1;
-	int is_readonly = (flags & FLAG_READONLY) ? 1 : 0;
-	int status = 0, vote_status = 0;
-	ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
-
-	LOG_ENTRY ();
-
-	lock_id = OCFS_I(inode)->ip_blkno;
-
-	if (flags & FLAG_READONLY) {
-		if (flags & (FLAG_CHANGE_MASTER | FLAG_REMASTER)) {
-			/* not currently readonly.  treat like normal change master. */
-			flags &= ~FLAG_READONLY;
-		}
-	} else if (flags & FLAG_CHANGE_MASTER) {
-		/* non-readonly with CHANGE_MASTER should have no readonly flag */
-		if (test_bit(LOCK_STATE_READONLY, &lockres->readonly_state)) {
-			LOG_ERROR_ARGS("change_master but currently readonly\n");
-			clear_bit(LOCK_STATE_READONLY, &lockres->readonly_state);
-		}
-	}
-	if (flags & (FLAG_CHANGE_MASTER | FLAG_REMASTER)) {
-		/* on a master change... */
-		need_to_zap_buffers = 1; /* need to dump local buffers */
-		need_lock_write = 1;     /* and rewrite the lock */
-	} else if (flags & (FLAG_DROP_READONLY | FLAG_TRUNCATE_PAGES)) {
-		need_lock_write = 0;
-		need_to_zap_buffers = 0;
-	} else if (flags & FLAG_READONLY) {
-		need_lock_write = 0;
-		need_to_zap_buffers = 1;
-	} else if (!bh) {
-		need_lock_write = 0;
-		need_to_zap_buffers = 0;
-	} else {
-		fe = (ocfs2_dinode *) bh->b_data;
-		/* may not need to rewrite the lock later if
-		 * we already have a cachelock */
-		if ((DISK_LOCK(fe)->dl_master == osb->node_num)
-		    && (DISK_LOCK(fe)->dl_level != requested_lock))
-			need_lock_write = 1;
-		else
-			need_lock_write = 0;
-		need_to_zap_buffers = 0; 
-	}
-
-	/* that's why it's called fast path */
-	if (flags & FLAG_FAST_PATH_LOCK)
-		goto vote_success;
-
-
-#define BROADCAST_FLAGS (FLAG_FILE_DELETE | FLAG_FILE_RENAME | FLAG_RELEASE_DENTRY | FLAG_FILE_EXTEND | FLAG_FILE_TRUNCATE | FLAG_FILE_UPDATE_OIN | FLAG_TRUNCATE_PAGES | FLAG_DROP_READONLY | FLAG_REMASTER)
-
-	/* figure out who to vote with */
-	if (flags & BROADCAST_FLAGS) {
-		ocfs_node_map_dup(osb, &vote_map, &osb->publ_map); /* broadcast */
-		/* only nodes that see this is readonly */
-		if (flags & FLAG_DROP_READONLY)
-			ocfs_node_map_and(&vote_map, &lockres->readonly_map);
-	} else {
-		ocfs_node_map_init(osb, &vote_map);
-		ocfs_node_map_set_bit(&vote_map, lockres->master_node_num); /* just owner */
-		lock_type = lockres->lock_type;
-	}
-	ocfs_node_map_clear_bit(&vote_map, osb->node_num);
-
-	// remove dead nodes
-	ocfs_node_map_and(&vote_map, &osb->publ_map);
-
-	if (ocfs_node_map_is_empty(&vote_map)) {
-		/* As this is the only node alive, make it master of the lock */
-		goto vote_success;
-	}
-
-	status = ocfs_send_dlm_request_msg (osb, lock_id, lock_type,
-					    flags, &vote_map,
-					    inode, 1,
-					    &vote_status);
-	if (status >= 0) {
-		status = vote_status;
-	}
-
-	if (status < 0) {
-		if (status != -EAGAIN &&
-		    status != -ETIMEDOUT &&
-		    status != -EINTR &&
-		    status != -EBUSY)
-			LOG_ERROR_STATUS(status);
-		goto bail;
-	}
-
-vote_success:
-	if (need_to_zap_buffers)
-		ocfs_inc_inode_seq(osb, inode);
-
-	/* just alerting owner on open */
-	if (flags & FLAG_TRUNCATE_PAGES)
-		goto bail;
-	
-	/* converted EX to readonly EX */
-	if (flags & FLAG_READONLY)
-		goto bail;
-
-	/* drop readonly should remove anyone who has responded */
-       	if (flags & FLAG_DROP_READONLY) {
-		ocfs_node_map_clear_bits(&lockres->readonly_map, &vote_map);
-		goto bail;
-	}
-
-	/* update the disk lock */
-	if (need_lock_write) {
-		lockres->lock_type = requested_lock;
-		lockres->master_node_num = osb->node_num;
-		if (!bh) {
-			printk("We're trying to write a lock but I wasn't "
-			       "passed a buffer: inode %llu, flags %u\n", 
-			       OCFS_I(inode)->ip_blkno, flags);
-			BUG();
-		}
-
-		/* want to refresh the lock from the latest on disk
-		 * state before writing it back out. */
-		status = ocfs_read_block(osb, lock_id, &bh, 0, inode);
-		if (!status)
-			ocfs_update_disk_lock(osb, bh, inode);
-
-		if (status < 0)
-			LOG_ERROR_STATUS(status);
-
-		atomic_set(&OCFS_I(inode)->ip_needs_verification, 1);
-	}
-
-bail:
-	/* if we removed FLAG_READONLY above, or converted an
-	 * EX to readonly, set the readonly state now */
-	if (status >= 0 && (is_readonly || flags & FLAG_READONLY)) {
-		set_bit(LOCK_STATE_READONLY, &lockres->readonly_state);
-	}
-
-	LOG_EXIT_STATUS (status);
-	return status;
-}
-
-void ocfs_compute_dlm_stats(int status, int vote_status, ocfs_dlm_stats *stats)	
-{
-	atomic_inc (&stats->total);
-	if (status == -ETIMEDOUT)
-		atomic_inc (&stats->etimedout);
-	else {
-		switch (vote_status) {
-			case -EAGAIN:
-			case FLAG_VOTE_UPDATE_RETRY:
-				atomic_inc (&stats->eagain);
-				break;
-			case -ENOENT:
-			case FLAG_VOTE_FILE_DEL:
-				atomic_inc (&stats->enoent);
-				break;
-			case -EBUSY:
-			case -ENETUNREACH:
-			case FLAG_VOTE_OIN_ALREADY_INUSE:
-				atomic_inc (&stats->efail);
-				break;
-			case 0:
-			case FLAG_VOTE_NODE:
-			case FLAG_VOTE_OIN_UPDATED:
-				atomic_inc (&stats->okay);
-				break;
-			default:
-				atomic_inc (&stats->def);
-				break;
-		}
-	}
-}

Deleted: trunk/src/dlm.h
===================================================================
--- trunk/src/dlm.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/dlm.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -1,78 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlm.h
- *
- * Function prototypes
- *
- * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef OCFS2_DLM_H
-#define OCFS2_DLM_H
-
-int new_lock_function(ocfs_super *osb, __u32 requested_lock,
-		      __u32 flags, struct buffer_head *bh,
-		      struct inode *inode);
-int ocfs_acquire_lock(ocfs_super *osb, __u32 lock_type,
-		      __u32 flags, struct buffer_head **bh,
-		      struct inode *inode);
-void ocfs_compute_dlm_stats(int status, int vote_status,
-			    ocfs_dlm_stats *stats);
-#define ocfs_release_lock(osb, lock_type, flags, inode)  \
-	ocfs_release_lock_full(osb, lock_type, flags, inode, 1)
-int ocfs_release_lock_full(ocfs_super *osb, __u32 lock_type,
-			   __u32 flags, struct inode *inode, __u32 num_ident);
-
-#define ocfs_acquire_lock_ro(osb, inode)  \
-({ \
- 	int status; \
-	struct buffer_head *junkbh = NULL;\
-	status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, FLAG_READONLY, &junkbh, inode);\
- 	if (junkbh) \
-	 	brelse(junkbh);\
- 	(status); \
- })
-
-#define ocfs_release_lock_ro(osb, inode)   ocfs_release_lock(osb, OCFS_LKM_EXMODE, FLAG_READONLY, inode)
-  
-
-void ocfs_get_publish_vote_map(ocfs_super *osb, ocfs_publish *publish,
-			       ocfs_node_map *vote_map);
-int ocfs_notify_cluster(ocfs_super *osb, 
-			struct inode *inode,
-			u32 message_flags);
-static inline int ocfs_notify_on_rename(ocfs_super *osb, struct inode *inode)
-{
-	/* whatcha tryin' to do to us! */
-	OCFS_ASSERT(!S_ISDIR(inode->i_mode));
-
-	return(ocfs_notify_cluster(osb, 
-				   inode, 
-				   FLAG_RELEASE_DENTRY|FLAG_FILE_RENAME));
-}
-static inline int ocfs_notify_on_open(ocfs_super *osb, struct inode *inode)
-{
-	return(ocfs_notify_cluster(osb, 
-				   inode, 
-				   FLAG_TRUNCATE_PAGES));
-}
-void ocfs_update_disk_lock (ocfs_super * osb, 
-			    struct buffer_head *bh, 
-			    struct inode *inode);
-#endif /* OCFS2_DLM_H */

Added: trunk/src/dlmglue.c
===================================================================
--- trunk/src/dlmglue.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/dlmglue.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,1818 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmglue.c
+ *
+ * Code which implements an OCFS2 specific interface to our DLM.
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ocfs_compat.h"
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/smp_lock.h>
+
+#include <cluster/util.h>
+#include <cluster/dlmcommon.h>
+#include <cluster/heartbeat.h>
+#include <cluster/nodemanager.h>
+#include <cluster/tcp.h>
+#include <cluster/dlmmod.h>
+
+#include "ocfs_log.h"
+#include "ocfs.h"
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "slot_map.h"
+#include "util.h"
+#include "vote.h"
+
+#include "ocfs_journal.h"
+#include "buffer_head_io.h"
+
+#define OCFS_DEBUG_CONTEXT    OCFS_DEBUG_CONTEXT_DLMGLUE
+
+/* lock ids are made up in the following manner:
+ * name[0]     --> type
+ * name[1-6]   --> 6 pad characters, reserved for now
+ * name[7-22]  --> block number, expressed in hex as 16 chars
+ * name[23-30] --> i_generation, expressed in hex 8 chars
+ * name[31]    --> '\0' */
+#define OCFS2_LOCK_ID_MAX_LEN  32
+#define OCFS2_LOCK_ID_PAD "000000"
+
+/* Single-character lock-name prefix for each lock type (see the lock id
+ * layout comment above). Use standard C99 designated initializers; the
+ * GNU "[idx] value" form without '=' is an obsolete extension. */
+static char ocfs2_lock_type_char[OCFS_NUM_LOCK_TYPES] = {
+	[OCFS_TYPE_META]	= 'M',
+	[OCFS_TYPE_DATA]	= 'D',
+	[OCFS_TYPE_SUPER]	= 'S'
+};
+
+static int ocfs2_build_lock_name(enum ocfs2_lock_type type,
+				 u64 blkno,
+				 u32 generation,
+				 char **ret);
+
+static void ocfs2_inode_ast_func(void *opaque);
+static void ocfs2_inode_bast_func(void *opaque, int level);
+static void ocfs2_super_ast_func(void *opaque);
+static void ocfs2_super_bast_func(void *opaque, int level);
+/* so far, all locks have gotten along with the same unlock ast */
+static void ocfs2_unlock_ast_func(void *opaque,
+				  dlm_status status);
+static int ocfs2_do_unblock_meta(struct inode *inode,
+				 int *requeue);
+static int ocfs2_unblock_meta(ocfs2_lock_res *lockres,
+			      int *requeue);
+static int ocfs2_unblock_data(ocfs2_lock_res *lockres,
+			      int *requeue);
+static int ocfs2_unblock_super(ocfs2_lock_res *lockres,
+			       int *requeue);
+typedef void (ocfs2_convert_worker_t)(ocfs2_lock_res *, int);
+static int ocfs2_generic_unblock_lock(ocfs_super *osb,
+				      ocfs2_lock_res *lockres,
+				      int *requeue,
+				      ocfs2_convert_worker_t *worker);
+
+/* Per-lock-type callback table: AST/BAST handlers handed to dlmlock(),
+ * the (shared) unlock AST, and the vote thread's "unblock" hook used
+ * when another node asks us to downconvert. */
+struct ocfs2_lock_res_ops {
+	void (*ast)(void *);
+	void (*bast)(void *, int);
+	void (*unlock_ast)(void *, dlm_status);
+	int  (*unblock)(ocfs2_lock_res *, int *);
+};
+
+/* Callbacks for an inode's metadata lock. */
+struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
+	.ast = ocfs2_inode_ast_func,
+	.bast = ocfs2_inode_bast_func,
+	.unlock_ast = ocfs2_unlock_ast_func,
+	.unblock = ocfs2_unblock_meta,
+};
+
+static void ocfs2_data_convert_worker(ocfs2_lock_res *lockres,
+				      int blocking);
+
+/* Callbacks for an inode's data lock; shares the inode AST/BAST with
+ * the metadata lock but downconverts via ocfs2_unblock_data. */
+struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
+	.ast = ocfs2_inode_ast_func,
+	.bast = ocfs2_inode_bast_func,
+	.unlock_ast = ocfs2_unlock_ast_func,
+	.unblock = ocfs2_unblock_data,
+};
+
+/* Callbacks for the superblock lock. */
+struct ocfs2_lock_res_ops ocfs2_super_lops = {
+	.ast = ocfs2_super_ast_func,
+	.bast = ocfs2_super_bast_func,
+	.unlock_ast = ocfs2_unlock_ast_func,
+	.unblock = ocfs2_unblock_super,
+};
+
+/* Nonzero if @lockres protects inode state (metadata or data). */
+static inline int ocfs2_is_inode_lock(ocfs2_lock_res *lockres)
+{
+	switch (lockres->l_type) {
+	case OCFS_TYPE_META:
+	case OCFS_TYPE_DATA:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/* Nonzero if @lockres is the superblock lock. */
+static inline int ocfs2_is_super_lock(ocfs2_lock_res *lockres)
+{
+	return (lockres->l_type == OCFS_TYPE_SUPER) ? 1 : 0;
+}
+
+/* Recover the ocfs_super a superblock lock was initialized against
+ * (stashed in l_priv by ocfs2_super_lock_res_init). */
+static inline ocfs_super * ocfs2_lock_res_super(ocfs2_lock_res *lockres)
+{
+	OCFS_ASSERT(ocfs2_is_super_lock(lockres));
+
+	return (ocfs_super *) lockres->l_priv;
+}
+
+/* Recover the inode an inode lock was initialized against (stashed in
+ * l_priv by ocfs2_inode_lock_res_init). */
+static inline struct inode * ocfs2_lock_res_inode(ocfs2_lock_res *lockres)
+{
+	OCFS_ASSERT(ocfs2_is_inode_lock(lockres));
+
+	return (struct inode *) lockres->l_priv;
+}
+
+static void ocfs2_lock_res_init_common(ocfs2_lock_res *res,
+				       enum ocfs2_lock_type type,
+				       void *priv);
+static int ocfs2_lock_create(ocfs_super *osb,
+			     ocfs2_lock_res *lockres,
+			     int level,
+			     int flags);
+static inline int ocfs2_may_continue_on_blocked_lock(ocfs2_lock_res *lockres,
+						     int wanted);
+static int ocfs2_cluster_lock(ocfs_super *osb,
+			      ocfs2_lock_res *lockres,
+			      int level,
+			      int lkm_flags);
+static void ocfs2_cluster_unlock(ocfs_super *osb,
+				 ocfs2_lock_res *lockres,
+				 int level);
+static inline void ocfs2_generic_handle_downconvert_action(ocfs2_lock_res *lockres);
+static inline void ocfs2_generic_handle_convert_action(ocfs2_lock_res *lockres);
+static inline void ocfs2_generic_handle_attach_action(ocfs2_lock_res *lockres);
+static void ocfs2_generic_handle_bast(ocfs2_lock_res *lockres, int level);
+static inline void ocfs2_handle_meta_convert_action(struct inode *inode,
+						    ocfs2_lock_res *lockres);
+static void ocfs2_inc_inode_seq(ocfs_super *osb,
+				struct inode *inode);
+static void ocfs2_schedule_blocked_lock(ocfs_super *osb,
+					ocfs2_lock_res *lockres);
+static void ocfs2_schedule_blocked_inode_lock(struct inode *inode,
+					      ocfs2_lock_res *lockres);
+static inline void ocfs2_recover_from_dlm_error(ocfs2_lock_res *lockres,
+						int convert);
+static void ocfs2_vote_on_unlock(ocfs_super *osb,
+				 ocfs2_lock_res *lockres);
+/* Called after we refresh our inode, only has any effect if we have
+ * an EX lock. This populates the LVB with the initial values for our
+ * change set. */
+static void ocfs2_reset_meta_lvb_values(struct inode *inode);
+static void __ocfs2_stuff_meta_lvb(struct inode *inode);
+static void ocfs2_refresh_inode_from_lvb(struct inode *inode);
+static void __ocfs2_lvb_on_downconvert(ocfs2_lock_res *lockres,
+				       int new_level);
+static int ocfs2_meta_lock_update(struct inode *inode,
+				  struct buffer_head **bh);
+static int __ocfs2_drop_lock(ocfs_super *osb,
+			     ocfs2_lock_res *lockres);
+static void ocfs2_drop_super_lock(ocfs_super *osb);
+static inline int ocfs2_highest_compat_lock_level(int level);
+static int __ocfs2_downconvert_lock(ocfs_super *osb,
+				    ocfs2_lock_res *lockres,
+				    int new_level,
+				    int lvb);
+static int __ocfs2_cancel_convert(ocfs_super *osb,
+				  ocfs2_lock_res *lockres);
+static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
+						  ocfs2_lock_res *lockres,
+						  int new_level);
+
+/* A lock value block is trustworthy iff it carries a nonzero sequence
+ * number that matches the sequence we last recorded locally. */
+static inline int ocfs2_lvb_is_trustable(ocfs2_lock_res *lockres)
+{
+	int trustable;
+	ocfs2_lvb *lvb = (ocfs2_lvb *) lockres->l_lksb.lvb;
+
+	spin_lock(&lockres->l_lock);
+	trustable = (lvb->lvb_seq != 0) &&
+		    (lockres->l_local_seq == lvb->lvb_seq);
+	spin_unlock(&lockres->l_lock);
+
+	return trustable;
+}
+
+/* Mirror the LVB sequence number into l_local_seq, unless the LVB has
+ * never been written (sequence still zero). */
+static inline void ocfs2_set_local_seq_from_lvb(ocfs2_lock_res *lockres)
+{
+	ocfs2_lvb *lvb = (ocfs2_lvb *) lockres->l_lksb.lvb;
+
+	spin_lock(&lockres->l_lock);
+	if (lvb->lvb_seq != 0)
+		lockres->l_local_seq = lvb->lvb_seq;
+	spin_unlock(&lockres->l_lock);
+}
+
+/* Read values out of the *meta* lock's LVB; fill in new values here as
+ * we add them to the lvb. Any out-parameter may be NULL to skip it. */
+static inline void ocfs2_meta_lvb_get_values(ocfs2_lock_res *lockres,
+					     unsigned int *trunc_clusters)
+{
+	ocfs2_meta_lvb *lvb;
+	/* This accessor casts the LVB to ocfs2_meta_lvb and is called on
+	 * ip_meta_lockres (see ocfs2_meta_lock_update), so the lock must
+	 * be of META type -- the original asserted OCFS_TYPE_DATA, which
+	 * would trip on every legitimate caller. */
+	OCFS_ASSERT(lockres->l_type == OCFS_TYPE_META);
+
+	spin_lock(&lockres->l_lock);
+	OCFS_ASSERT(lockres->l_level > LKM_NLMODE);
+
+	lvb = (ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+	if (trunc_clusters)
+		*trunc_clusters = lvb->lvb_trunc_clusters;
+
+	spin_unlock(&lockres->l_lock);
+}
+
+/* Build a lock name in the format described at the top of this file.
+ * Returns the name length (always OCFS2_LOCK_ID_MAX_LEN - 1) on
+ * success or -ENOMEM. On success *ret points to a kmalloc'd, NUL
+ * terminated string which the caller must free (ocfs2_lock_res_free). */
+static int ocfs2_build_lock_name(enum ocfs2_lock_type type,
+				 u64 blkno,
+				 u32 generation,
+				 char **ret)
+{
+	int len;
+	char *name = NULL;
+
+	LOG_ENTRY();
+
+	OCFS_ASSERT(type < OCFS_NUM_LOCK_TYPES);
+
+	name = kmalloc(OCFS2_LOCK_ID_MAX_LEN, GFP_KERNEL);
+	if (!name) {
+		len = -ENOMEM;
+		goto bail;
+	}
+	memset(name, 0, OCFS2_LOCK_ID_MAX_LEN);
+
+	/* Pass the full buffer size: the formatted id is exactly 31
+	 * chars + NUL. The original passed MAX_LEN - 1, silently
+	 * truncating the last generation nibble. Cast blkno so %llx
+	 * matches regardless of how u64 is typedef'd. */
+	len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
+		       ocfs2_lock_type_char[type], OCFS2_LOCK_ID_PAD,
+		       (unsigned long long) blkno, generation);
+
+	/* The original used "=" (assignment) here, turning the assert
+	 * into a no-op that also overwrote the return value. */
+	OCFS_ASSERT(len == (OCFS2_LOCK_ID_MAX_LEN - 1));
+
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+	printk("built lock resource with name: %s\n", name);
+#endif
+	*ret = name;
+bail:
+	LOG_EXIT();
+	return (len);
+}
+
+/* Initialization shared by all lock resource types: zero the struct,
+ * set up the spinlock, waitqueue and blocked list, and record the type
+ * and private pointer. Callers fill in l_ops and l_name afterwards. */
+static void ocfs2_lock_res_init_common(ocfs2_lock_res *res,
+				       enum ocfs2_lock_type type,
+				       void *priv)
+{
+	memset(res, 0, sizeof(*res));
+	spin_lock_init(&res->l_lock);
+	init_waitqueue_head(&res->l_event);
+	INIT_LIST_HEAD(&res->l_blocked_list);
+	res->l_type = type;
+	res->l_priv = priv;
+	res->l_level = LKM_IVMODE;
+}
+
+/* Initialize an inode's meta or data lock resource, picking the
+ * matching ops table and building the lock name from the inode's block
+ * number and generation. Returns 0 or negative errno (-ENOMEM). */
+int ocfs2_inode_lock_res_init(ocfs2_lock_res *res,
+			      enum ocfs2_lock_type type,
+			      struct inode *inode)
+{
+	int status;
+
+	LOG_ENTRY();
+
+	OCFS_ASSERT(type == OCFS_TYPE_META ||
+		    type == OCFS_TYPE_DATA);
+
+	ocfs2_lock_res_init_common(res, type, inode);
+
+	if (type == OCFS_TYPE_META)
+		res->l_ops = &ocfs2_inode_meta_lops;
+	else
+		res->l_ops = &ocfs2_inode_data_lops;
+
+	/* build_lock_name returns the name length on success; we only
+	 * care about failure here. */
+	status = ocfs2_build_lock_name(type,
+				       OCFS_I(inode)->ip_blkno,
+				       inode->i_generation,
+				       &res->l_name);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+bail:
+	LOG_EXIT_STATUS(status);
+	return status;
+}
+
+/* Initialize the (per-osb) superblock lock resource. The name is keyed
+ * on the well-known superblock block number with generation 0. */
+int ocfs2_super_lock_res_init(ocfs2_lock_res *res,
+     			      ocfs_super *osb)
+{
+	enum ocfs2_lock_type type = OCFS_TYPE_SUPER;
+	int status;
+
+	LOG_ENTRY();
+
+	ocfs2_lock_res_init_common(res, type, osb);
+
+	res->l_ops = &ocfs2_super_lops;
+
+	status = ocfs2_build_lock_name(type,
+				       OCFS2_SUPER_BLOCK_BLKNO,
+				       0,
+				       &res->l_name);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+bail:
+	LOG_EXIT_STATUS(status);
+	return status;
+}
+
+/* Release the kmalloc'd lock name. kfree(NULL) is a no-op, so the
+ * NULL guard the original carried is unnecessary. */
+void ocfs2_lock_res_free(ocfs2_lock_res *res)
+{
+	kfree(res->l_name);
+}
+
+/* Bump the holder count matching @level. Only EX and PR may be held;
+ * any other level is a programming error. Caller holds l_lock. */
+static inline void ocfs2_inc_holders(ocfs2_lock_res *lockres,
+				     int level)
+{
+	OCFS_ASSERT(lockres);
+
+	if (level == LKM_EXMODE)
+		lockres->l_ex_holders++;
+	else if (level == LKM_PRMODE)
+		lockres->l_ro_holders++;
+	else
+		BUG();
+}
+
+/* Drop the holder count matching @level, asserting it was nonzero.
+ * Caller holds l_lock. */
+static inline void ocfs2_dec_holders(ocfs2_lock_res *lockres,
+				     int level)
+{
+	OCFS_ASSERT(lockres);
+
+	if (level == LKM_EXMODE) {
+		OCFS_ASSERT(lockres->l_ex_holders);
+		lockres->l_ex_holders--;
+	} else if (level == LKM_PRMODE) {
+		OCFS_ASSERT(lockres->l_ro_holders);
+		lockres->l_ro_holders--;
+	} else
+		BUG();
+}
+
+/* Complete a downconvert whose AST just fired: adopt the requested
+ * level, clear the blocked/busy state and wake waiters. Caller holds
+ * l_lock. */
+static inline void ocfs2_generic_handle_downconvert_action(ocfs2_lock_res *lockres)
+{
+	OCFS_ASSERT(lockres->l_flags & OCFS2_LOCK_BUSY);
+	OCFS_ASSERT(lockres->l_flags & OCFS2_LOCK_ATTACHED);
+	OCFS_ASSERT(lockres->l_flags & OCFS2_LOCK_BLOCKED);
+
+	lockres->l_level = lockres->l_requested;
+	lockres->l_blocking = LKM_NLMODE;
+	lockres->l_flags &= ~OCFS2_LOCK_BLOCKED;
+	lockres->l_flags &= ~OCFS2_LOCK_BUSY;
+	wake_up(&lockres->l_event);
+}
+
+/* Advance the osb-global clean-buffer sequence and stamp the inode
+ * with the new value. NOTE(review): the "13 bits" comment below
+ * presumably matches STATE_BIT_MAX -- confirm against its definition. */
+static void ocfs2_inc_inode_seq(ocfs_super *osb,
+				struct inode *inode)
+{
+	atomic_t *seq = GET_INODE_CLEAN_SEQ(inode);
+
+	LOG_TRACE_ARGS("incrementing inode seq... current is %d\n", 
+		       atomic_read(seq));
+
+	/* wrap to ONE after 13 bits, will need a spinlock */
+	spin_lock (&osb->clean_buffer_lock);
+	if ((atomic_read(&osb->clean_buffer_seq)+1) % STATE_BIT_MAX == 0)
+		atomic_set(&osb->clean_buffer_seq, 1);
+	else
+		atomic_inc(&osb->clean_buffer_seq);
+	spin_unlock (&osb->clean_buffer_lock);
+
+	/* doesn't matter if another process has already incremented
+	 * the global seq -- we just copy whatever it is now. */
+	atomic_set(seq, atomic_read(&osb->clean_buffer_seq));
+
+	LOG_TRACE_ARGS("done incrementing inode seq... new is %d\n", 
+		       atomic_read(seq));
+}
+
+/* Complete an upconvert whose AST just fired. Caller holds l_lock. */
+static inline void ocfs2_generic_handle_convert_action(ocfs2_lock_res *lockres)
+{
+	OCFS_ASSERT(lockres->l_flags & OCFS2_LOCK_BUSY);
+	OCFS_ASSERT(lockres->l_flags & OCFS2_LOCK_ATTACHED);
+
+	/* Convert from RO to EX doesn't really need anything as our
+	 * information is already up to date. Convert from NL to
+	 * *anything* however should mark ourselves as needing an
+	 * update */
+	if (lockres->l_level == LKM_NLMODE)
+		lockres->l_flags |= OCFS2_LOCK_NEEDS_REFRESH;
+
+	lockres->l_level = lockres->l_requested;
+	lockres->l_flags &= ~OCFS2_LOCK_BUSY;
+}
+
+/* Meta-lock variant of the convert action: additionally bump the inode
+ * sequence when we're coming up from NL. Caller holds l_lock. */
+static inline void ocfs2_handle_meta_convert_action(struct inode *inode,
+						    ocfs2_lock_res *lockres)
+{
+	ocfs_super *osb = OCFS2_SB(inode->i_sb);
+
+	/* generic_handle_convert_action will set the refresh flag for us. */
+	if (lockres->l_level == LKM_NLMODE)
+		ocfs2_inc_inode_seq(osb, inode);
+	ocfs2_generic_handle_convert_action(lockres);
+}
+
+/* Complete the initial attach of a lock. A non-NL attach (unless the
+ * lock was created LOCAL, i.e. for a brand new inode) means our cached
+ * state may be stale. Caller holds l_lock. */
+static inline void ocfs2_generic_handle_attach_action(ocfs2_lock_res *lockres)
+{
+	OCFS_ASSERT(lockres->l_flags & OCFS2_LOCK_BUSY);
+	OCFS_ASSERT(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
+
+	if (lockres->l_requested > LKM_NLMODE &&
+	    !(lockres->l_flags & OCFS2_LOCK_LOCAL))
+		lockres->l_flags |= OCFS2_LOCK_NEEDS_REFRESH;
+
+	lockres->l_level = lockres->l_requested;
+	lockres->l_flags |= OCFS2_LOCK_ATTACHED;
+	lockres->l_flags &= ~OCFS2_LOCK_BUSY;
+}
+
+/* AST fired by the DLM when a lock request on an inode lock (meta or
+ * data) completes. Dispatches on l_action, which the requesting path
+ * set before calling dlmlock(), then invalidates it so a spurious
+ * second AST is caught by the BUG() below. */
+static void ocfs2_inode_ast_func(void *opaque)
+{
+	ocfs2_lock_res *lockres = opaque;
+	struct inode *inode = ocfs2_lock_res_inode(lockres);
+	ocfs_super *osb = OCFS2_SB(inode->i_sb);
+	dlm_lockstatus *lksb;
+
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+	printk("AST fired for inode %llu\n", OCFS_I(inode)->ip_blkno);
+#endif
+	OCFS_ASSERT(ocfs2_is_inode_lock(lockres));
+
+	spin_lock(&lockres->l_lock);
+	lksb = &(lockres->l_lksb);
+	/* a failed request leaves all state (including BUSY) untouched;
+	 * NOTE(review): no waiter is woken on this path -- confirm the
+	 * unlock/recovery path handles it. */
+	if (lksb->status != DLM_NORMAL) {
+		printk("ocfs2_inode_ast_func: lksb status value of %u on "
+		       "inode %llu\n", lksb->status, OCFS_I(inode)->ip_blkno);
+		spin_unlock(&lockres->l_lock);
+		return;
+	}
+
+	switch(lockres->l_action) {
+	case OCFS2_AST_ATTACH:
+		if (lockres->l_type == OCFS_TYPE_META &&
+		    lockres->l_requested > LKM_NLMODE &&
+		    !(lockres->l_flags & OCFS2_LOCK_LOCAL))
+			ocfs2_inc_inode_seq(osb, inode);
+
+		ocfs2_generic_handle_attach_action(lockres);
+		break;
+	case OCFS2_AST_CONVERT:
+		if (lockres->l_type == OCFS_TYPE_META)
+			ocfs2_handle_meta_convert_action(inode, lockres);
+		else
+			ocfs2_generic_handle_convert_action(lockres);
+		break;
+	case OCFS2_AST_DOWNCONVERT:
+		ocfs2_generic_handle_downconvert_action(lockres);
+		break;
+	default:
+		BUG();
+	}
+
+	/* data locking ignores refresh flag for now. */
+	if (lockres->l_type == OCFS_TYPE_DATA)
+		lockres->l_flags &= ~OCFS2_LOCK_NEEDS_REFRESH;
+
+	/* set it to something invalid so if we get called again we
+	 * can catch it. */
+	lockres->l_action = OCFS2_AST_INVALID;
+	spin_unlock(&lockres->l_lock);
+	wake_up(&lockres->l_event);
+}
+
+/* Record a blocking request against @lockres, remembering the highest
+ * level any remote node has asked us to make way for. */
+static void ocfs2_generic_handle_bast(ocfs2_lock_res *lockres, int level)
+{
+	spin_lock(&lockres->l_lock);
+	lockres->l_flags |= OCFS2_LOCK_BLOCKED;
+	if (lockres->l_blocking < level)
+		lockres->l_blocking = level;
+	spin_unlock(&lockres->l_lock);
+}
+
+/* BAST fired when another node wants a level conflicting with ours on
+ * an inode lock: mark the lock blocked, queue it for the vote thread
+ * and kick that thread. */
+static void ocfs2_inode_bast_func(void *opaque, int level)
+{
+	ocfs2_lock_res *lockres = opaque;
+	struct inode *inode = ocfs2_lock_res_inode(lockres);
+	ocfs_super *osb = OCFS2_SB(inode->i_sb);
+
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+	printk("BAST fired for inode %llu\n", OCFS_I(inode)->ip_blkno);
+#endif
+	ocfs2_generic_handle_bast(lockres, level);
+
+	ocfs2_schedule_blocked_inode_lock(inode, lockres);
+	ocfs2_kick_vote_thread(osb);
+}
+
+/* AST fired when a request on the superblock lock completes. Same
+ * l_action dispatch as the inode AST, minus the inode-specific work. */
+static void ocfs2_super_ast_func(void *opaque)
+{
+	ocfs2_lock_res *lockres = opaque;
+	dlm_lockstatus *lksb;
+
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+	/* the original printk referenced "inode", which does not exist
+	 * in this function -- it failed to compile with tracing on. */
+	printk("Superblock AST fired\n");
+#endif
+	OCFS_ASSERT(ocfs2_is_super_lock(lockres));
+
+	spin_lock(&lockres->l_lock);
+	lksb = &(lockres->l_lksb);
+	if (lksb->status != DLM_NORMAL) {
+		printk("ocfs2_super_ast_func: lksb status value of %u!\n",
+		       lksb->status);
+		spin_unlock(&lockres->l_lock);
+		return;
+	}
+
+	switch(lockres->l_action) {
+	case OCFS2_AST_ATTACH:
+		ocfs2_generic_handle_attach_action(lockres);
+		break;
+	case OCFS2_AST_CONVERT:
+		ocfs2_generic_handle_convert_action(lockres);
+		break;
+	case OCFS2_AST_DOWNCONVERT:
+		ocfs2_generic_handle_downconvert_action(lockres);
+		break;
+	default:
+		BUG();
+	}
+	/* set it to something invalid so if we get called again we
+	 * can catch it. */
+	lockres->l_action = OCFS2_AST_INVALID;
+	spin_unlock(&lockres->l_lock);
+	wake_up(&lockres->l_event);
+}
+
+/* BAST for the superblock lock: mark blocked, queue for the vote
+ * thread and kick it. */
+static void ocfs2_super_bast_func(void *opaque, int level)
+{
+	ocfs2_lock_res *lockres = opaque;
+	ocfs_super *osb = ocfs2_lock_res_super(lockres);
+
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+	printk("Superblock BAST fired\n");
+#endif
+	ocfs2_generic_handle_bast(lockres, level);
+
+	ocfs2_schedule_blocked_lock(osb, lockres);
+	ocfs2_kick_vote_thread(osb);
+}
+
+/* Roll back in-memory state after dlmlock()/dlmunlock() returned an
+ * error: clear BUSY and invalidate the pending action (@convert
+ * nonzero for lock/convert requests, zero for unlock requests).
+ * NOTE(review): waiters on l_event are not woken here -- confirm
+ * callers handle that. */
+static inline void ocfs2_recover_from_dlm_error(ocfs2_lock_res *lockres,
+						int convert)
+{
+	spin_lock(&lockres->l_lock);
+	lockres->l_flags &= ~OCFS2_LOCK_BUSY;
+	if (convert)
+		lockres->l_action = OCFS2_AST_INVALID;
+	else
+		lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+	spin_unlock(&lockres->l_lock);
+}
+
+/* Attach a new DLM lock at @level for @lockres (asynchronously; the
+ * AST completes the attach). A lock that is already attached succeeds
+ * immediately. Returns 0 or -ENOENT on a DLM error. */
+static int ocfs2_lock_create(ocfs_super *osb,
+			     ocfs2_lock_res *lockres,
+			     int level,
+			     int flags)
+{
+	int ret = 0;
+	dlm_status status;
+
+	LOG_ENTRY();
+
+	spin_lock(&lockres->l_lock);
+	if (lockres->l_flags & OCFS2_LOCK_ATTACHED) {
+		spin_unlock(&lockres->l_lock);
+		goto bail;
+	}
+
+	/* publish the pending request before dropping l_lock so the AST
+	 * knows what to complete. */
+	lockres->l_action = OCFS2_AST_ATTACH;
+	lockres->l_requested = level;
+	lockres->l_flags |= OCFS2_LOCK_BUSY;
+	spin_unlock(&lockres->l_lock);
+
+	status = dlmlock(osb->dlm,
+			 level,
+			 &lockres->l_lksb,
+			 flags,
+			 lockres->l_name,
+			 lockres->l_ops->ast,
+			 lockres,
+			 lockres->l_ops->bast);
+	if (status != DLM_NORMAL) {
+		LOG_ERROR_ARGS("Dlm returns %d\n", status);
+		ret = -ENOENT;
+		ocfs2_recover_from_dlm_error(lockres, 1);
+	}
+
+bail:
+	LOG_EXIT_STATUS(ret);
+	return ret;
+}
+
+/* Sample @flag from l_flags under the lock; used as the condition in
+ * the wait_event helpers below. */
+static inline int ocfs2_check_wait_flag(ocfs2_lock_res *lockres,
+					int flag)
+{
+	int set;
+
+	spin_lock(&lockres->l_lock);
+	set = lockres->l_flags & flag;
+	spin_unlock(&lockres->l_lock);
+
+	return set;
+}
+
+/* Sleep until no dlmlock() request is in flight on @lockres.
+ * NOTE(review): the wait_event_interruptible return value is ignored,
+ * so a signal returns with the lock possibly still busy -- callers
+ * re-check state in their retry loops. */
+static inline void ocfs2_wait_on_busy_lock(ocfs2_lock_res *lockres)
+
+{
+	wait_event_interruptible(lockres->l_event,
+				 !ocfs2_check_wait_flag(lockres,
+							OCFS2_LOCK_BUSY));
+}
+
+/* Sleep until @lockres is no longer blocked on behalf of another node.
+ * Same signal caveat as ocfs2_wait_on_busy_lock. */
+static inline void ocfs2_wait_on_blocked_lock(ocfs2_lock_res *lockres)
+
+{
+	wait_event_interruptible(lockres->l_event,
+				 !ocfs2_check_wait_flag(lockres,
+							OCFS2_LOCK_BLOCKED));
+}
+
+/* Sleep until the node refreshing @lockres clears REFRESHING and wakes
+ * l_event (see ocfs2_complete_lock_res_refresh). Formatting fix: the
+ * original fused the closing brace onto the condition line. */
+static inline void ocfs2_wait_on_refreshing_lock(ocfs2_lock_res *lockres)
+{
+	wait_event_interruptible(lockres->l_event,
+				 !ocfs2_check_wait_flag(lockres,
+							OCFS2_LOCK_REFRESHING));
+}
+
+/* predict what lock level we'll be dropping down to on behalf
+ * of another node, and return true if the currently wanted
+ * level will be compatible with it. */
+static inline int ocfs2_may_continue_on_blocked_lock(ocfs2_lock_res *lockres,
+						     int wanted)
+{
+	OCFS_ASSERT(lockres->l_flags & OCFS2_LOCK_BLOCKED);
+
+	return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking);
+}
+
+/* Acquire @lockres at @level, creating/attaching the DLM lock on first
+ * use and upconverting as needed. Retries from the top ("again") each
+ * time it has to drop l_lock and sleep, since any state may have
+ * changed meanwhile. Returns 0, -EINTR on signal, -EAGAIN when
+ * LKM_NOQUEUE was requested and the DLM refused to queue, or -ENOENT
+ * on other DLM errors. */
+static int ocfs2_cluster_lock(ocfs_super *osb,
+			      ocfs2_lock_res *lockres,
+			      int level,
+			      int lkm_flags)
+{
+	int ret;
+	dlm_status status;
+
+	LOG_ENTRY();
+
+again:
+	if (signal_pending(current)) {
+		ret = -EINTR;
+		goto bail;
+	}
+
+	spin_lock(&lockres->l_lock);
+
+	/* We only compare against the currently granted level
+	 * here. If the lock is blocked waiting on a downconvert,
+	 * we'll get caught below. */
+	if (lockres->l_flags & OCFS2_LOCK_BUSY &&
+	    level > lockres->l_level) {
+		/* is someone sitting in dlm_lock? If so, wait on
+		 * them. */
+		spin_unlock(&lockres->l_lock);
+		ocfs2_wait_on_busy_lock(lockres);
+		goto again;
+	}
+
+	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+		/* lock has not been created yet. */
+		spin_unlock(&lockres->l_lock);
+		ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
+		if (ret < 0) {
+			LOG_ERROR_STATUS(ret);
+			goto bail;
+		}
+		goto again;
+	}
+
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
+	    !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
+		/* is the lock is currently blocked on behalf of
+		 * another node */
+		spin_unlock(&lockres->l_lock);
+		ocfs2_wait_on_blocked_lock(lockres);
+		goto again;
+	}
+
+	if (level > lockres->l_level) {
+		lockres->l_action = OCFS2_AST_CONVERT;
+		lockres->l_requested = level;
+		lockres->l_flags |= OCFS2_LOCK_BUSY;
+		spin_unlock(&lockres->l_lock);
+
+		/* call dlm_lock to upgrade lock now */
+		status = dlmlock(osb->dlm,
+				 level,
+				 &lockres->l_lksb,
+				 lkm_flags|LKM_CONVERT|LKM_VALBLK,
+				 lockres->l_name,
+				 lockres->l_ops->ast,
+				 lockres,
+				 lockres->l_ops->bast);
+		if (status != DLM_NORMAL) {
+			if ((lkm_flags & LKM_NOQUEUE) &&
+			    (status == DLM_NOTQUEUED))
+				ret = -EAGAIN;
+			else {
+				LOG_ERROR_ARGS("Dlm returns %d\n", status);
+				ret = -ENOENT;
+			}
+			ocfs2_recover_from_dlm_error(lockres, 1);
+			goto bail;
+		}
+
+		ocfs2_wait_on_busy_lock(lockres);
+		goto again;
+	}
+
+	/* Ok, if we get here then we're good to go. */
+	ocfs2_inc_holders(lockres, level);
+
+	spin_unlock(&lockres->l_lock);
+
+	ret = 0;
+bail:
+	LOG_EXIT_STATUS(ret);
+	return ret;
+}
+
+/* Drop one holder reference at @level; the DLM lock itself is kept and
+ * only downconverted later by the vote thread. May kick the vote
+ * thread if a remote node is known to be waiting. */
+static void ocfs2_cluster_unlock(ocfs_super *osb,
+				 ocfs2_lock_res *lockres,
+				 int level)
+{
+	spin_lock(&lockres->l_lock);
+	ocfs2_dec_holders(lockres, level);
+	ocfs2_vote_on_unlock(osb, lockres);
+	spin_unlock(&lockres->l_lock);
+}
+
+/* Grants us an EX lock on the data and metadata resources, skipping
+ * the normal cluster directory lookup. Use this ONLY on newly created
+ * inodes which other nodes can't possibly see, and which haven't been
+ * hashed in the inode hash yet. This can give us a good performance
+ * increase as it'll skip the network broadcast normally associated
+ * with creating a new lock resource. */
+int ocfs2_create_new_inode_locks(struct inode *inode)
+{
+	int status;
+	ocfs_super *osb = OCFS2_SB(inode->i_sb);
+	ocfs2_lock_res *lockres;
+
+	OCFS_ASSERT(inode);
+	OCFS_ASSERT(ocfs_inode_is_new(inode));
+
+	LOG_ENTRY();
+
+	/* NOTE: That we don't increment any of the holder counts, nor
+	 * do we add anything to a journal handle. Since this is
+	 * supposed to be a new inode which the cluster doesn't know
+	 * about yet, there is no need to.  As far as the LVB handling
+	 * is concerned, this is basically like acquiring an EX lock
+	 * on a resource which has an invalid one -- we'll set it
+	 * valid when we release the EX. */
+
+	/* l_flags is modified without l_lock here; safe only because no
+	 * other node (and no other thread) can see this inode yet. */
+	lockres = &OCFS_I(inode)->ip_meta_lockres;
+	OCFS_ASSERT(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
+	lockres->l_flags |= OCFS2_LOCK_LOCAL;
+
+	status = ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL);
+	lockres->l_flags &= ~OCFS2_LOCK_LOCAL;
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	lockres = &OCFS_I(inode)->ip_data_lockres;
+	OCFS_ASSERT(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
+	lockres->l_flags |= OCFS2_LOCK_LOCAL;
+
+	status = ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL);
+	lockres->l_flags &= ~OCFS2_LOCK_LOCAL;
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	LOG_EXIT_STATUS(status);
+	return status;
+}
+
+int ocfs2_data_lock(struct inode *inode,
+		    int write)
+{
+	int status, level;
+	ocfs2_lock_res *lockres;
+
+	OCFS_ASSERT(inode);
+
+	LOG_ENTRY();
+
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+	printk("ocfs2: (%u) inode %llu, take %s DATA lock\n",
+	       current->pid, OCFS_I(inode)->ip_blkno,
+	       write ? "EXMODE" : "PRMODE");
+#endif
+
+	lockres = &OCFS_I(inode)->ip_data_lockres;
+
+	level = write ? LKM_EXMODE : LKM_PRMODE;
+
+	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0);
+	if (status < 0 && status != -EINTR)
+		LOG_ERROR_STATUS(status);
+
+	LOG_EXIT_STATUS(status);
+	return status;
+}
+
+/* Called (with l_lock held) on every holder drop; decides whether the
+ * vote thread should be poked to service a pending downconvert. */
+static void ocfs2_vote_on_unlock(ocfs_super *osb,
+				 ocfs2_lock_res *lockres)
+{
+	int kick = 0;
+
+	/* If we know that another node is waiting on our lock, kick
+	 * the vote thread pre-emptively when we reach a release
+	 * condition. */
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
+		switch(lockres->l_blocking) {
+		case LKM_EXMODE:
+			/* a remote EX request needs us fully released */
+			if (!lockres->l_ex_holders && !lockres->l_ro_holders)
+				kick = 1;
+			break;
+		case LKM_PRMODE:
+			/* a remote PR request only conflicts with EX */
+			if (!lockres->l_ex_holders)
+				kick = 1;
+			break;
+		default:
+			BUG();
+		}
+	}
+
+	if (kick)
+		ocfs2_kick_vote_thread(osb);
+}
+
+/* Drop one DATA-lock holder reference (EX for writers, PR readers). */
+void ocfs2_data_unlock(struct inode *inode,
+		       int write)
+{
+	int level = write ? LKM_EXMODE : LKM_PRMODE;
+	ocfs2_lock_res *lockres = &OCFS_I(inode)->ip_data_lockres;
+
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+	/* the original passed (blkno, pid) against the "(%u) inode
+	 * %llu" format -- swapped varargs, undefined printk output.
+	 * Pass pid first, matching ocfs2_data_lock(). */
+	printk("ocfs2: (%u) inode %llu drop %s DATA lock\n",
+	       current->pid, OCFS_I(inode)->ip_blkno,
+	       write ? "EXMODE" : "PRMODE");
+#endif
+
+	ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+}
+
+/* Block until the recovery map is empty (no node is being recovered).
+ * Returns 0, or -EINTR if interrupted by a signal. */
+static inline int ocfs2_wait_on_recovery(ocfs_super *osb)
+{
+	wait_event_interruptible(osb->recovery_event,
+				 ocfs_node_map_is_empty(osb,
+							&osb->recovery_map));
+
+	if (signal_pending(current))
+		return -EINTR;
+
+	return 0;
+}
+
+/* Call this with the lockres locked. I am reasonably sure we don't
+ * need ip_lock in this function as anyone who would be changing those
+ * values is supposed to be blocked in ocfs2_meta_lock right now.
+ *
+ * Copies the inode's cached metadata into the meta lock's LVB so other
+ * nodes can pick it up without a disk read. */
+static void __ocfs2_stuff_meta_lvb(struct inode *inode)
+{
+	ocfs_inode_private *oip = OCFS_I(inode);
+	ocfs2_lock_res *lockres = &oip->ip_meta_lockres;
+	ocfs2_meta_lvb *lvb     = (ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+
+	lvb->lvb_iclusters = oip->ip_clusters;
+	lvb->lvb_iuid      = inode->i_uid;
+	lvb->lvb_igid      = inode->i_gid;
+	lvb->lvb_isize     = inode->i_size;
+	lvb->lvb_imode     = inode->i_mode;
+	lvb->lvb_inlink    = inode->i_nlink;
+	lvb->lvb_iatime    = ocfs_get_seconds(inode->i_atime);
+	lvb->lvb_ictime    = ocfs_get_seconds(inode->i_ctime);
+	lvb->lvb_imtime    = ocfs_get_seconds(inode->i_mtime);
+}
+
+/* Inverse of __ocfs2_stuff_meta_lvb: repopulate the in-memory inode
+ * from a trusted LVB, avoiding a disk read. */
+static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
+{
+	ocfs_inode_private *oip = OCFS_I(inode);
+	ocfs2_lock_res *lockres = &oip->ip_meta_lockres;
+	ocfs2_meta_lvb *lvb     = (ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+
+	/* We're safe here without the lockres lock... */
+	spin_lock(&oip->ip_lock);
+	oip->ip_clusters = lvb->lvb_iclusters;
+	inode->i_uid     = lvb->lvb_iuid;
+	inode->i_gid     = lvb->lvb_igid;
+	inode->i_size    = lvb->lvb_isize;
+	inode->i_mode    = lvb->lvb_imode;
+	inode->i_nlink   = lvb->lvb_inlink;
+	/* i_blocks derived from i_size, rounded up to fs blocksize */
+	inode->i_blocks  = (inode->i_size + inode->i_sb->s_blocksize - 1) 
+		>> inode->i_sb->s_blocksize_bits;
+	OCFS_SET_INODE_TIME(inode, i_atime, lvb->lvb_iatime);
+	OCFS_SET_INODE_TIME(inode, i_ctime, lvb->lvb_ictime);
+	OCFS_SET_INODE_TIME(inode, i_mtime, lvb->lvb_imtime);
+	spin_unlock(&oip->ip_lock);
+}
+
+/* Called after we refresh our inode: if (and only if) we hold the EX,
+ * seed the LVB change set with the current cluster count. */
+static void ocfs2_reset_meta_lvb_values(struct inode *inode)
+{
+	ocfs2_lock_res *lockres = &OCFS_I(inode)->ip_meta_lockres;
+	ocfs2_meta_lvb *lvb = (ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+	u32 i_clusters;
+
+	spin_lock(&OCFS_I(inode)->ip_lock);
+	i_clusters = OCFS_I(inode)->ip_clusters;
+	spin_unlock(&OCFS_I(inode)->ip_lock);
+
+	spin_lock(&lockres->l_lock);
+	if (lockres->l_level == LKM_EXMODE)
+		lvb->lvb_trunc_clusters = i_clusters;
+	spin_unlock(&lockres->l_lock);
+}
+
+/* Update LVB / local sequence numbers as we downconvert from
+ * l_level to @new_level. Dropping from EX bumps the LVB sequence (we
+ * were the writer); the extra l_local_seq bump marks our own copy
+ * stale when we fall all the way to NL. Caller holds l_lock. */
+static void __ocfs2_lvb_on_downconvert(ocfs2_lock_res *lockres,
+				       int new_level)
+{
+	ocfs2_lvb *lvb = (ocfs2_lvb *) lockres->l_lksb.lvb;
+
+	if (lockres->l_level == LKM_EXMODE) {
+		lvb->lvb_seq++;
+		/* Overflow? Zero means "never written", so skip it. */
+		if (!lvb->lvb_seq)
+			lvb->lvb_seq = 1;
+		lockres->l_local_seq = lvb->lvb_seq;
+		if (new_level == LKM_NLMODE)
+			lockres->l_local_seq++;
+	} else if (lockres->l_level == LKM_PRMODE) {
+		if (lvb->lvb_seq)
+			lockres->l_local_seq++;
+	}
+}
+
+/* Determine whether a lock resource needs to be refreshed, and
+ * arbitrate who gets to refresh it.
+ *
+ * -1 means error, 0 means no refresh needed, > 0 means you need to
+ *   refresh this and you MUST call ocfs2_complete_lock_res_refresh
+ *   afterwards. */
+static int ocfs2_should_refresh_lock_res(ocfs2_lock_res *lockres)
+{
+
+	int status = 0;
+	LOG_ENTRY();
+
+refresh_check:
+	spin_lock(&lockres->l_lock);
+	if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
+		spin_unlock(&lockres->l_lock);
+		goto bail;
+	}
+
+	/* someone else is refreshing; wait for them and re-check, since
+	 * they may fail and leave NEEDS_REFRESH set. */
+	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
+		spin_unlock(&lockres->l_lock);
+		if (signal_pending(current)) {
+			status = -EINTR;
+			goto bail;
+		}
+		ocfs2_wait_on_refreshing_lock(lockres);
+		goto refresh_check;
+	}
+
+	/* Ok, I'll be the one to refresh this lock. */
+	lockres->l_flags |= OCFS2_LOCK_REFRESHING;
+	spin_unlock(&lockres->l_lock);
+
+	status = 1;
+bail:
+	LOG_EXIT_STATUS(status);
+	return status;
+}
+
+/* If status is non zero, I'll mark it as not being in refresh
+ * anymore, but I won't clear the needs-refresh flag (so the next
+ * waiter retries the refresh). Always wakes waiters. */
+static inline void ocfs2_complete_lock_res_refresh(ocfs2_lock_res *lockres,
+						   int status)
+{
+	spin_lock(&lockres->l_lock);
+	lockres->l_flags &= ~OCFS2_LOCK_REFRESHING;
+	if (!status)
+		lockres->l_flags &= ~OCFS2_LOCK_NEEDS_REFRESH;
+	spin_unlock(&lockres->l_lock);
+
+	wake_up(&lockres->l_event);
+}
+
+/* may or may not return a bh if it went to disk.
+ *
+ * Refresh the inode after acquiring its meta lock: from the LVB when
+ * trustworthy (fast path), otherwise from disk. Also truncates the
+ * extent map to what we can still trust and re-seeds the LVB. */
+static int ocfs2_meta_lock_update(struct inode *inode,
+				  struct buffer_head **bh)
+{
+	int status;
+	u32 trustable_clusters = 0;
+	ocfs2_lock_res *lockres;
+	ocfs2_dinode *fe;
+
+	lockres = &OCFS_I(inode)->ip_meta_lockres;
+
+	/* 0 = no refresh needed, <0 = error, >0 = we do the refresh and
+	 * must call ocfs2_complete_lock_res_refresh below. */
+	status = ocfs2_should_refresh_lock_res(lockres);
+	if (!status)
+		goto bail;
+	if (status < 0) {
+		if (status != -EINTR)
+			LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	/* we don't want to use the LVB for bitmap files as the
+	 * used/set bit union is not currently sent over the wire. */
+	if (!(OCFS_I(inode)->ip_flags & OCFS_INODE_BITMAP) &&
+	    ocfs2_lvb_is_trustable(lockres)) {
+		/* yay, fastpath! */
+		ocfs2_meta_lvb_get_values(lockres, &trustable_clusters);
+		ocfs2_refresh_inode_from_lvb(inode);
+	} else {
+		/* Boo, we have to go to disk. */
+		/* read bh, cast, ocfs_refresh_inode */
+		status = ocfs_read_block(OCFS2_SB(inode->i_sb),
+					 OCFS_I(inode)->ip_blkno, bh,
+					 OCFS_BH_CACHED, inode);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+		fe = (ocfs2_dinode *) (*bh)->b_data;
+
+		/* This is a good chance to make sure we're not
+		 * locking an invalid object. */
+		OCFS_ASSERT(IS_VALID_FILE_ENTRY(fe));
+		OCFS_ASSERT(inode->i_generation == 
+			    le32_to_cpu(fe->i_generation));
+		if ((fe->i_dtime) || (!(fe->i_flags & OCFS2_VALID_FL)))
+			BUG();
+
+		ocfs_refresh_inode(inode, fe);
+	}
+
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+	printk("inode %llu, I can only trust %u clusters\n",
+	       OCFS_I(inode)->ip_blkno, trustable_clusters);
+#endif
+
+	ocfs2_extent_map_trunc(inode, trustable_clusters);
+
+	ocfs2_set_local_seq_from_lvb(lockres);
+	ocfs2_reset_meta_lvb_values(inode);
+
+	ocfs2_complete_lock_res_refresh(lockres, 0);
+bail:
+	return status;
+}
+
+/* Take the inode's META lock (EX or PR), wait out recovery (unless
+ * OCFS2_META_LOCK_RECOVERY), refresh the inode, optionally return the
+ * inode's buffer head and attach the lock to a journal handle.
+ * Returns 0, -EINTR, -EAGAIN (NOQUEUE refused) or other negative
+ * errno. */
+int ocfs2_meta_lock_flags(struct inode *inode,
+			  ocfs_journal_handle *handle,
+			  struct buffer_head **ret_bh,
+			  int ex,
+			  int flags)
+{
+	int status, level, dlm_flags;
+	ocfs2_lock_res *lockres;
+	ocfs_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *bh = NULL;
+
+	OCFS_ASSERT(inode);
+
+	/* a journaled lock must be exclusive */
+	if (handle && !ex)
+		BUG();
+
+	LOG_ENTRY();
+
+/* was "#ifdef VERBOSE_LOCKING_TRACE" -- every other trace block in
+ * this file uses the OCFS2_ prefixed name, so this one could never be
+ * enabled with them. Arguments were also swapped vs the format. */
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+	printk("ocfs2: (%u) inode %llu, take %s META lock\n",
+	       current->pid, OCFS_I(inode)->ip_blkno,
+	       ex ? "EXMODE" : "PRMODE");
+#endif
+
+	if (!(flags & OCFS2_META_LOCK_RECOVERY)) {
+		status = ocfs2_wait_on_recovery(osb);
+		if (status < 0)
+			goto bail;
+	}
+
+	lockres = &OCFS_I(inode)->ip_meta_lockres;
+	level = ex ? LKM_EXMODE : LKM_PRMODE;
+	dlm_flags = 0;
+	if (flags & OCFS2_META_LOCK_NOQUEUE)
+		dlm_flags |= LKM_NOQUEUE;
+
+	status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags);
+	if (status < 0) {
+		if (status != -EINTR && status != -EAGAIN)
+			LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	/* re-check: recovery may have started while we slept in
+	 * ocfs2_cluster_lock. */
+	if (!(flags & OCFS2_META_LOCK_RECOVERY)) {
+		status = ocfs2_wait_on_recovery(osb);
+		if (status < 0)
+			goto bail;
+	}
+
+	status = ocfs2_meta_lock_update(inode, &bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	if (ret_bh && !bh) {
+		/* caller wants a buffer head but we haven't read it yet. */
+		status = ocfs_read_block(osb, OCFS_I(inode)->ip_blkno, &bh,
+					 OCFS_BH_CACHED, inode);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+	}
+	if (ret_bh) {
+		/* extra ref for the caller; our own ref is dropped below */
+		*ret_bh = bh;
+		get_bh(*ret_bh);
+	}
+	if (handle) {
+		status = ocfs_handle_add_lock(handle, inode);
+		if (status < 0)
+			LOG_ERROR_STATUS(status);
+	}
+bail:
+	if (bh)
+		brelse(bh);
+
+	LOG_EXIT_STATUS(status);
+	return status;
+}
+
+/* Drop the metadata cluster lock on an inode. `ex` must match the
+ * level the lock was taken at (nonzero for EXMODE, zero for PRMODE). */
+void ocfs2_meta_unlock(struct inode *inode,
+		       int ex)
+{
+	int level = ex ? LKM_EXMODE : LKM_PRMODE;
+	ocfs2_lock_res *lockres = &OCFS_I(inode)->ip_meta_lockres;
+
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+	/* pid matches %u and blkno matches %llu -- the old code passed
+	 * them in the wrong order. */
+	printk("ocfs2: (%u) inode %llu drop %s META lock\n",
+	       current->pid, OCFS_I(inode)->ip_blkno,
+	       ex ? "EXMODE" : "PRMODE");
+#endif
+
+	ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+}
+
+/* Take the superblock cluster lock. If the lock resource reports that
+ * a refresh is needed (ocfs2_should_refresh_lock_res), re-read the
+ * slot map block here before completing the refresh.
+ * Returns 0 on success, negative error otherwise. */
+int ocfs2_super_lock(ocfs_super *osb,
+		     int ex)
+{
+	int status;
+	int level = ex ? LKM_EXMODE : LKM_PRMODE;
+	ocfs2_lock_res *lockres = &osb->super_lockres;
+	struct buffer_head *bh;
+	ocfs2_slot_info *si = osb->slot_info;
+
+	LOG_ENTRY();
+
+	status = ocfs2_cluster_lock(osb, lockres, level, 0);
+	if (status < 0) {
+		if (status != -EINTR)
+			LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	/* The super block lock path is really in the best position to
+	 * know when resources covered by the lock need to be
+	 * refreshed, so we do it here. Of course, making sense of
+	 * everything is up to the caller :) */
+	status = ocfs2_should_refresh_lock_res(lockres);
+	if (status < 0) {
+		if (status != -EINTR)
+			LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	if (status) {
+		/* re-read the slot map into its cached buffer; the read
+		 * status (0 on success) becomes our return value and is
+		 * also reported to the refresh completion below. */
+		bh = si->si_bh;
+		status = ocfs_read_block(osb, bh->b_blocknr, &bh, 0,
+					 si->si_inode);
+		if (status < 0)
+			LOG_ERROR_STATUS(status);
+
+		ocfs2_complete_lock_res_refresh(lockres, status);
+	}
+bail:
+	LOG_EXIT_STATUS(status);
+	return status;
+}
+
+/* Release the superblock cluster lock at the level it was taken
+ * (nonzero ex means EXMODE, zero means PRMODE). */
+void ocfs2_super_unlock(ocfs_super *osb,
+			int ex)
+{
+	ocfs2_cluster_unlock(osb, &osb->super_lockres,
+			     ex ? LKM_EXMODE : LKM_PRMODE);
+}
+
+/* Bring up per-mount DLM state: start the vote thread, join the DLM
+ * domain named after the volume's group name, and initialize the
+ * superblock lock resource. Returns 0 on success, negative error
+ * otherwise. */
+int ocfs2_dlm_init(ocfs_super *osb)
+{
+	int status, pid;
+	u32 dlm_key;
+	dlm_ctxt *dlm = NULL;
+
+	LOG_ENTRY();
+
+	/* launch vote thread */
+	init_completion (&osb->vote_event_init);
+	pid = kernel_thread(ocfs2_vote_thread, osb,
+			    CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+	if (pid < 0) {
+		status = pid;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	/* don't proceed until the thread has finished initializing */
+	wait_for_completion(&osb->vote_event_init);
+
+	/* used by the dlm code to make message headers unique, each
+	 * node in this domain must agree on this. For now we'll just
+	 * yank that off uuid. */
+	memcpy(&dlm_key, osb->uuid, sizeof(dlm_key));
+
+	/* for now, group_name == domain */
+	dlm = dlm_register_domain(osb->group_name, osb->group_name, dlm_key);
+	if (!dlm) {
+		/* This is a best guess on return value... */
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	osb->dlm = dlm;
+
+	status = ocfs2_super_lock_res_init(&osb->super_lockres, osb);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+bail:
+
+	LOG_EXIT_STATUS(status);
+	return status;
+}
+
+/* Tear down per-mount DLM state: drop the super lock, ask the vote
+ * thread to exit (and wait for it if it was running), free the super
+ * lock resource, then leave the DLM domain. */
+void ocfs2_dlm_shutdown(ocfs_super *osb)
+{
+	int wait_on_vote_task = 0;
+
+	ocfs2_drop_super_lock(osb);
+
+	/* needs to be able to deal with the dlm being in many
+	 * different states. */
+	spin_lock(&osb->vote_task_lock);
+	if (osb->vote_task) {
+		/* flag the thread to exit, then wake it so it notices */
+		osb->vote_exit = 1;
+		ocfs2_kick_vote_thread(osb);
+		wait_on_vote_task = 1;
+	}
+	spin_unlock(&osb->vote_task_lock);
+
+	/* wait outside the spinlock -- the thread signals completion */
+	if (wait_on_vote_task)
+		wait_for_completion(&osb->vote_event_complete);
+
+	ocfs2_lock_res_free(&osb->super_lockres);
+	dlm_unregister_domain(osb->dlm);
+}
+
+/* Unlock AST: invoked by the DLM when a dlmunlock() request (either a
+ * full lock drop or a convert-cancel) completes. Clears the pending
+ * unlock action, marks the lockres no longer busy, and wakes anyone
+ * waiting on l_event. `opaque` is the ocfs2_lock_res. */
+static void ocfs2_unlock_ast_func(void *opaque, dlm_status status)
+{
+	ocfs2_lock_res *lockres = opaque;
+
+	if (status != DLM_NORMAL)
+		LOG_ERROR_ARGS("Dlm returns status %d\n", status);
+
+	spin_lock(&lockres->l_lock);
+	switch(lockres->l_unlock_action) {
+	case OCFS2_UNLOCK_CANCEL_CONVERT:
+		/* the pending convert was cancelled -- forget it */
+		lockres->l_action = OCFS2_AST_INVALID;
+		break;
+	case OCFS2_UNLOCK_DROP_LOCK:
+		/* the lock is gone; we hold no level any more */
+		lockres->l_level = LKM_IVMODE;
+		break;
+	default:
+		BUG();
+	}
+	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+	lockres->l_flags &= ~OCFS2_LOCK_BUSY;
+	spin_unlock(&lockres->l_lock);
+
+	wake_up(&lockres->l_event);
+}
+
+/* BEWARE: called with lockres lock, and always drops it. */
+/* Detach and destroy a lock resource's DLM lock, writing the LVB back
+ * (LKM_VALBLK) as part of the unlock, then wait for the unlock AST.
+ * Returns 0 on success, -ENOENT on DLM failure, or -EINTR if a signal
+ * arrived while waiting. A lockres that was never attached is a no-op. */
+static int __ocfs2_drop_lock(ocfs_super *osb,
+			     ocfs2_lock_res *lockres)
+{
+	int ret = 0;
+	dlm_status status;
+
+	/* these indicate teardown racing with active use -- warn only */
+	if (lockres->l_flags & OCFS2_LOCK_BUSY)
+		printk("ocfs2: destroying busy lock!\n");
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+		printk("ocfs2: destroying blocked lock!\n");
+
+	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+		spin_unlock(&lockres->l_lock);
+		goto bail;
+	}
+
+	lockres->l_flags &= ~OCFS2_LOCK_ATTACHED;
+
+	/* make sure we never get here while waiting for an ast to
+	 * fire. */
+	OCFS_ASSERT(lockres->l_action == OCFS2_AST_INVALID);
+
+	/* is this necessary? */
+	lockres->l_flags |= OCFS2_LOCK_BUSY;
+	lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
+	spin_unlock(&lockres->l_lock);
+
+	/* LKM_VALBLK: publish our lock value block with the unlock */
+	status = dlmunlock(osb->dlm,
+			   &lockres->l_lksb,
+			   LKM_VALBLK,
+			   lockres->l_ops->unlock_ast,
+			   lockres);
+	if (status != DLM_NORMAL) {
+		LOG_ERROR_ARGS("Dlm returns %d\n", status);
+		ret = -ENOENT;
+		goto bail;
+	}
+
+	/* block until the unlock AST clears OCFS2_LOCK_BUSY */
+	ocfs2_wait_on_busy_lock(lockres);
+	if (signal_pending(current)) {
+		printk("ocfs2_drop_lock: Signal caught!\n");
+		ret = -EINTR;
+	}
+bail:
+	LOG_EXIT_STATUS(ret);
+	return ret;
+}
+
+/* Fully drop the superblock lock resource (shutdown path). Takes the
+ * lockres spinlock; __ocfs2_drop_lock always releases it. */
+static void ocfs2_drop_super_lock(ocfs_super *osb)
+{
+	int status;
+	ocfs2_lock_res *lockres;
+
+	lockres = &osb->super_lockres;
+
+	spin_lock(&lockres->l_lock);
+	status = __ocfs2_drop_lock(osb, lockres);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+}
+
+/* Drop both the data and metadata cluster locks on an inode (inode
+ * teardown path). For the meta lock, first stuff our inode state into
+ * the LVB (if we held EX) and run the lvb downconvert bookkeeping as
+ * if we were dropping to NL, so the value block we publish is current.
+ * Returns the first error encountered, or 0. */
+int ocfs2_drop_inode_locks(struct inode *inode)
+{
+	int status, err;
+	ocfs2_lock_res *lockres;
+
+	lockres = &OCFS_I(inode)->ip_data_lockres;
+	spin_lock(&lockres->l_lock);
+	err = __ocfs2_drop_lock(OCFS2_SB(inode->i_sb), lockres);
+	if (err < 0)
+		LOG_ERROR_STATUS(err);
+
+	status = err;
+
+	/* the metadata lock requires a bit more work as we have an
+	 * LVB to worry about. */
+	lockres = &OCFS_I(inode)->ip_meta_lockres;
+
+	spin_lock(&lockres->l_lock);
+	if (lockres->l_flags & OCFS2_LOCK_ATTACHED) {
+		if (lockres->l_level == LKM_EXMODE)
+			__ocfs2_stuff_meta_lvb(inode);
+		/* fake an NLMODE downconvert for the lvb code. */
+		__ocfs2_lvb_on_downconvert(lockres, LKM_NLMODE);
+	}
+	err = __ocfs2_drop_lock(OCFS2_SB(inode->i_sb), lockres);
+	if (err < 0)
+		LOG_ERROR_STATUS(err);
+	/* preserve the first error; report the second only via the log */
+	if (err < 0 && !status)
+		status = err;
+
+	return status;
+}
+
+/* Return the most permissive mode we may hold alongside a request at
+ * `level`: EX forces us to NL, PR coexists with PR, anything else
+ * leaves us at EX.
+ *
+ * WARNING: This function lives in a world where the only three lock
+ * levels are EX, PR, and NL. It *will* have to be adjusted when more
+ * lock types are added. */
+static inline int ocfs2_highest_compat_lock_level(int level)
+{
+	if (level == LKM_EXMODE)
+		return LKM_NLMODE;
+	if (level == LKM_PRMODE)
+		return LKM_PRMODE;
+	return LKM_EXMODE;
+}
+
+/* called with the spinlock held, and WILL drop it. */
+/* Ask the DLM to convert our lock down to new_level; when `lvb` is
+ * nonzero the lock value block is written back with the convert
+ * (LKM_VALBLK). Returns 0 once the convert is queued -- completion
+ * arrives via the AST -- or -ENOENT after undoing our bookkeeping on
+ * DLM failure. */
+static int __ocfs2_downconvert_lock(ocfs_super *osb,
+				    ocfs2_lock_res *lockres,
+				    int new_level,
+				    int lvb)
+{
+	int status, flags = LKM_CONVERT;
+	OCFS_ASSERT(lockres->l_blocking > LKM_NLMODE);
+	OCFS_ASSERT(lockres->l_level > new_level);
+
+	/* record what we're doing before dropping the spinlock so the
+	 * AST can recognize this as a downconvert */
+	lockres->l_action = OCFS2_AST_DOWNCONVERT;
+	lockres->l_requested = new_level;
+	lockres->l_flags |= OCFS2_LOCK_BUSY;
+	spin_unlock(&lockres->l_lock);
+
+	if (lvb)
+		flags |= LKM_VALBLK;
+
+	status = dlmlock(osb->dlm,
+			 new_level,
+			 &lockres->l_lksb,
+			 flags,
+			 lockres->l_name,
+			 lockres->l_ops->ast,
+			 lockres,
+			 lockres->l_ops->bast);
+	if (status != DLM_NORMAL) {
+		LOG_ERROR_ARGS("Dlm returns %d\n", status);
+		status = -ENOENT;
+		ocfs2_recover_from_dlm_error(lockres, 1);
+		goto bail;
+	}
+	status = 0;
+bail:
+	return status;
+}
+
+/* called with the spinlock held, and WILL drop it. */
+/* Cancel an in-flight lock convert (we received a BAST while our own
+ * convert was pending). The unlock AST clears the pending action.
+ * Returns 0 on success -- including the DLM_CANCELGRANT race where
+ * the convert's AST fired before we could cancel -- or -ENOENT on a
+ * genuine DLM failure. */
+static int __ocfs2_cancel_convert(ocfs_super *osb,
+				  ocfs2_lock_res *lockres)
+{
+	int status;
+
+	/* were we in a convert when we got the bast fire? */
+	OCFS_ASSERT(lockres->l_action == OCFS2_AST_CONVERT ||
+		    lockres->l_action == OCFS2_AST_DOWNCONVERT);
+	/* set things up for the unlockast to know to just
+	 * clear out the ast_action and unset busy, etc. */
+	lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;
+	spin_unlock(&lockres->l_lock);
+
+	status = dlmunlock(osb->dlm,
+			   &lockres->l_lksb,
+			   LKM_CANCEL,
+			   lockres->l_ops->unlock_ast,
+			   lockres);
+	if (status == DLM_NORMAL) {
+		/* cancel submitted; the unlock AST finishes the job */
+		status = 0;
+	} else if (status == DLM_CANCELGRANT) {
+		/* If we got this, then the ast was fired
+		 * before we could cancel. We cleanup our
+		 * state, and restart the function. */
+		spin_lock(&lockres->l_lock);
+		lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+		spin_unlock(&lockres->l_lock);
+		status = 0;
+	} else {
+		/* the previous version rewrote DLM_NORMAL to 0 before
+		 * testing for DLM_CANCELGRANT, so every successful
+		 * cancel fell into this error branch and returned
+		 * -ENOENT; the three-way split above fixes that. */
+		LOG_ERROR_ARGS("Dlm returns %d\n", status);
+		status = -ENOENT;
+		ocfs2_recover_from_dlm_error(lockres, 0);
+	}
+
+	return status;
+}
+
+/* Decide whether the inode's meta lock may be downconverted to
+ * new_level right now. A downconvert to PR is allowed once no local
+ * EX holders remain; to NL, once no holders of any kind remain. In
+ * both cases the inode's journaled state must be fully checkpointed
+ * to disk first, since other nodes will re-read it. */
+static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
+						  ocfs2_lock_res *lockres,
+						  int new_level)
+{
+	int ret;
+
+	OCFS_ASSERT(new_level == LKM_NLMODE || new_level == LKM_PRMODE);
+	if (new_level == LKM_PRMODE)
+		ret = !lockres->l_ex_holders && 
+			ocfs_inode_fully_checkpointed(inode);
+	else /* Must be NLMODE we're converting to. */
+		ret = !lockres->l_ro_holders && !lockres->l_ex_holders &&
+			ocfs_inode_fully_checkpointed(inode);
+
+	return ret;
+}
+
+/* Try to resolve a blocked metadata lock: cancel our own in-flight
+ * convert if the lockres is busy, otherwise downconvert to the
+ * highest level compatible with the blocked request -- stuffing our
+ * inode state into the LVB when giving up EX. Sets *requeue to 1 when
+ * the vote thread should try again later. Returns 0 or negative error. */
+static int ocfs2_do_unblock_meta(struct inode *inode,
+				 int *requeue)
+{
+	int new_level;
+	int set_lvb = 0;
+	ocfs2_lock_res *lockres = &OCFS_I(inode)->ip_meta_lockres;
+
+	spin_lock(&lockres->l_lock);
+	if (!(lockres->l_flags & OCFS2_LOCK_BLOCKED)) {
+		spin_unlock(&lockres->l_lock);
+		return 0;
+	}
+
+	OCFS_ASSERT(lockres->l_level == LKM_EXMODE || 
+		    lockres->l_level == LKM_PRMODE);
+
+	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		/* was "*requeue++", which advanced the pointer and left
+		 * the caller's flag unset -- assign through it instead,
+		 * matching ocfs2_generic_unblock_lock(). */
+		*requeue = 1;
+		if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
+			/* If we're already trying to cancel a lock conversion
+			 * then just drop the spinlock and requeue ourselves
+			 * to check again later. */
+			spin_unlock(&lockres->l_lock);
+			return 0;
+		}
+
+		return __ocfs2_cancel_convert(OCFS2_SB(inode->i_sb),
+					      lockres);
+	}
+
+	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
+	if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) {
+		if (lockres->l_level == LKM_EXMODE) {
+			/* publish our inode state before losing EX */
+			__ocfs2_stuff_meta_lvb(inode);
+			set_lvb = 1;
+		}
+		__ocfs2_lvb_on_downconvert(lockres, new_level);
+		return __ocfs2_downconvert_lock(OCFS2_SB(inode->i_sb),
+						lockres, new_level,
+						set_lvb);
+	}
+	/* can't downconvert yet -- kick a checkpoint and retry later */
+	if (!ocfs_inode_fully_checkpointed(inode))
+		ocfs_start_checkpoint(OCFS2_SB(inode->i_sb));
+
+	/* second "*requeue++" pointer-increment bug, fixed as above */
+	*requeue = 1;
+	spin_unlock(&lockres->l_lock);
+
+	return 0;
+}
+
+/* Generic unblock path shared by lock types without LVB handling:
+ * cancel our own pending convert if busy, requeue while incompatible
+ * local holders remain, optionally run a pre-downconvert `worker`
+ * (which may sleep), then downconvert to the highest compatible
+ * level. *requeue is always written: 1 means "try again later".
+ * Called with the spinlock NOT held; every return path has dropped it. */
+static int ocfs2_generic_unblock_lock(ocfs_super *osb,
+				      ocfs2_lock_res *lockres,
+				      int *requeue,
+				      ocfs2_convert_worker_t *worker)
+{
+	int blocking;
+	int new_level;
+
+	spin_lock(&lockres->l_lock);
+	if (!(lockres->l_flags & OCFS2_LOCK_BLOCKED)) {
+		spin_unlock(&lockres->l_lock);
+		*requeue = 0;
+		return 0;
+	}
+
+	/* on entry the lockres must not be busy; the recheck label below
+	 * exists because the worker runs unlocked and things can change */
+	OCFS_ASSERT(!(lockres->l_flags & OCFS2_LOCK_BUSY));
+
+recheck:
+	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		*requeue = 1;
+		if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
+			/* If we're already trying to cancel a lock conversion
+			 * then just drop the spinlock and requeue ourselves
+			 * to check again later. */
+			spin_unlock(&lockres->l_lock);
+			return 0;
+		}
+
+		return __ocfs2_cancel_convert(osb, lockres);
+	}
+
+	/* if we're blocking an exclusive and we have *any* holders,
+	 * then requeue. */
+	if ((lockres->l_blocking == LKM_EXMODE) 
+	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
+		spin_unlock(&lockres->l_lock);
+		*requeue = 1;
+		return 0;
+	}
+
+	/* If it's a PR we're blocking, then only
+	 * requeue if we've got any EX holders */
+	if (lockres->l_blocking == LKM_PRMODE &&
+	    lockres->l_ex_holders) {
+		spin_unlock(&lockres->l_lock);
+		*requeue = 1;
+		return 0;
+	}
+
+	/* If we get here, then we know that there are no more
+	 * incompatible holders (and anyone asking for an incompatible
+	 * lock is blocked). We can now downconvert the lock */
+	if (!worker)
+		goto downconvert;
+
+	/* Some lockres types want to do a bit of work before
+	 * downconverting a lock. Allow that here. The worker function
+	 * may sleep, so we save off a copy of what we're blocking as
+	 * it may change while we're not holding the spin lock. */
+	blocking = lockres->l_blocking;
+	spin_unlock(&lockres->l_lock);
+
+	worker(lockres, blocking);
+
+	spin_lock(&lockres->l_lock);
+	if (blocking != lockres->l_blocking) {
+		/* If this changed underneath us, then we can't drop
+		 * it just yet. */
+		goto recheck;
+	}
+
+downconvert:
+	*requeue = 0;
+	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
+	return __ocfs2_downconvert_lock(osb, lockres, new_level, 0);
+}
+
+/* Pre-downconvert worker for data locks: flush dirty buffers, and
+ * when another node wants EX (it will modify the file) also drop our
+ * cached pages. Body re-indented with tabs -- the original used
+ * spaces, inconsistent with the rest of the file. */
+static void ocfs2_data_convert_worker(ocfs2_lock_res *lockres,
+				      int blocking)
+{
+	struct inode *inode = ocfs2_lock_res_inode(lockres);
+
+	sync_mapping_buffers(inode->i_mapping);
+	if (blocking == LKM_EXMODE)
+		ocfs_truncate_inode_pages(inode, 0);
+}
+
+/* Unblock handler for inode data locks. Handles requeueing locally
+ * (via ocfs2_schedule_blocked_inode_lock, which takes an inode ref)
+ * instead of propagating it, and always clears *requeue before
+ * returning. Drops the inode ref the queueing code took for us. */
+int ocfs2_unblock_data(ocfs2_lock_res *lockres,
+			       int *requeue)
+{
+	int status;
+	struct inode *inode = ocfs2_lock_res_inode(lockres);
+	ocfs_super *osb = OCFS2_SB(inode->i_sb);
+
+	status = ocfs2_generic_unblock_lock(osb,
+					    lockres,
+					    requeue,
+					    ocfs2_data_convert_worker);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+
+	/* because of inode ref counting, we never want to propagate
+	 * up requeue requests for inode locks. Instead we do it
+	 * ourselves here, and lose the extra ref we got from queueing
+	 * when we came in. */
+	if (*requeue)
+		ocfs2_schedule_blocked_inode_lock(inode, lockres);
+
+	iput(inode);
+	*requeue = 0;
+
+	return status;
+}
+
+/* Unblock handler for inode metadata locks; defers the real work to
+ * ocfs2_do_unblock_meta and handles requeue/inode-ref bookkeeping the
+ * same way as ocfs2_unblock_data. */
+int ocfs2_unblock_meta(ocfs2_lock_res *lockres,
+		       int *requeue)
+{
+	int status;
+	struct inode *inode = ocfs2_lock_res_inode(lockres);
+
+	status = ocfs2_do_unblock_meta(inode, requeue);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+
+	/* if you're confused by this, see the comment in
+	 * ocfs2_unblock_data */
+	if (*requeue)
+		ocfs2_schedule_blocked_inode_lock(inode, lockres);
+
+	iput(inode);
+	*requeue = 0;
+
+	return status;
+}
+
+/* Unblock handler for the superblock lock. No pre-downconvert worker
+ * is needed, so this is a straight pass-through to the generic path;
+ * requeue propagates to the caller unchanged. */
+static int ocfs2_unblock_super(ocfs2_lock_res *lockres,
+			       int *requeue)
+{
+	int status = ocfs2_generic_unblock_lock(ocfs2_lock_res_super(lockres),
+						lockres,
+						requeue,
+						NULL);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+
+	return status;
+}
+
+/* Vote-thread entry point for a blocked lock: run the lockres's
+ * type-specific unblock handler and put it back on the blocked list
+ * if the handler asks for another pass. */
+void ocfs2_process_blocked_lock(ocfs_super *osb,
+				ocfs2_lock_res *lockres)
+{
+	int status;
+	int requeue = 0;
+
+	OCFS_ASSERT(lockres);
+	OCFS_ASSERT(lockres->l_ops);
+	OCFS_ASSERT(lockres->l_ops->unblock);
+
+	status = lockres->l_ops->unblock(lockres, &requeue);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+
+	if (requeue)
+		ocfs2_schedule_blocked_lock(osb, lockres);
+}
+
+/* Put a lockres on the vote thread's blocked-lock list. Idempotent:
+ * the list_empty() check keeps a lockres from being queued twice. */
+static void ocfs2_schedule_blocked_lock(ocfs_super *osb,
+					ocfs2_lock_res *lockres)
+{
+	spin_lock(&osb->vote_task_lock);
+	if (list_empty(&lockres->l_blocked_list)) {
+		list_add_tail(&lockres->l_blocked_list,
+			      &osb->blocked_lock_list);
+		osb->blocked_lock_count++;
+	}
+	spin_unlock(&osb->vote_task_lock);
+}
+
+/* needed for inodes as we have to take a reference on them.. */
+/* Like ocfs2_schedule_blocked_lock(), but pins the inode (igrab) so
+ * it cannot be freed while one of its locks sits on the blocked list.
+ * If the inode is already being cleared, log and skip queueing; the
+ * matching iput() happens in the unblock handlers. */
+static void ocfs2_schedule_blocked_inode_lock(struct inode *inode,
+					      ocfs2_lock_res *lockres)
+{
+	if (!igrab(inode)) {
+		LOG_ERROR_ARGS("Inode %llu asked to be scheduled during "
+			       "clear_inode!\n", OCFS_I(inode)->ip_blkno);
+		return;
+	}
+
+	ocfs2_schedule_blocked_lock(OCFS2_SB(inode->i_sb), lockres);
+}

Added: trunk/src/dlmglue.h
===================================================================
--- trunk/src/dlmglue.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/dlmglue.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,131 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmglue.h
+ *
+ * Glue between the OCFS2 file system and the DLM: cluster lock
+ * resources, lock value block (LVB) layout, and the locking API.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+
+#ifndef MIDDLE_H
+#define MIDDLE_H
+
+/*
+ * LVB Sequence number rules:
+ * local seq and lvb seq are initialized to zero.
+ *
+ * Note that the lvb is basically invalid until the 1st EX downconvert
+ * as he's the only guy that can set it valid. This is ok though as PR
+ * holders would have to do an I/O under lock anyway.
+ *
+ * NL->PR:
+ * NL->EX:
+ * If LVB is valid:
+ *   if local seq == lvb seq, then we are up to date with the contents.
+ *   otherwise, we take the slow path to get up to date and then set our
+ *   local seq to the lvb seq.
+ *
+ * PR->NL: 
+ * If LVB is valid:
+ *   We increment our local seq. -- this allows up to
+ *   one set of changes to the lvb before we consider ourselves
+ *   invalid.
+ *
+ * PR->EX:
+ *   Do nothing.
+ *
+ * EX->NL:
+ * EX->PR:
+ * Set the LVB as valid.
+ * Populate the LVB contents (this is lock type specific)
+ * Increment the LVB seq.
+ * Set my local seq to the LVB seq.
+ * if (EX->NL)
+ *   do an additional increment of my local seq.
+ */
+/* Common LVB header: a sequence number compared against the local
+ * seq (rules above) to decide whether LVB contents can be trusted. */
+typedef struct _ocfs2_lvb {
+	u32 lvb_seq;
+} ocfs2_lvb;
+/* Metadata LVB: inode state carried in the lock value block so a node
+ * re-acquiring the meta lock can refresh its in-core inode without
+ * disk I/O. NOTE(review): this layout is shared by every node in the
+ * cluster -- changing it is a protocol change; verify before touching. */
+typedef struct _ocfs2_meta_lvb {
+	ocfs2_lvb lvb;
+	u32       lvb_trunc_clusters;
+	u32       lvb_iclusters;
+	u32       lvb_iuid;
+	u32       lvb_igid;
+	u64       lvb_isize;
+	u16       lvb_imode;
+	u16       lvb_inlink;
+	u64       lvb_iatime;
+	u64       lvb_ictime;
+	u64       lvb_imtime;
+} ocfs2_meta_lvb;
+
+int ocfs2_dlm_init(ocfs_super *osb);
+void ocfs2_dlm_shutdown(ocfs_super *osb);
+int ocfs2_inode_lock_res_init(ocfs2_lock_res *res,
+			      enum ocfs2_lock_type type,
+			      struct inode *inode);
+int ocfs2_super_lock_res_init(ocfs2_lock_res *res,
+			      ocfs_super *osb);
+void ocfs2_lock_res_free(ocfs2_lock_res *res);
+int ocfs2_create_new_inode_locks(struct inode *inode);
+int ocfs2_drop_inode_locks(struct inode *inode);
+int ocfs2_data_lock(struct inode *inode,
+		    int write);
+void ocfs2_data_unlock(struct inode *inode,
+		       int write);
+/* don't wait on recovery. */
+#define OCFS2_META_LOCK_RECOVERY	(0x01)
+/* Instruct the dlm not to queue ourselves on the other node. */
+#define OCFS2_META_LOCK_NOQUEUE		(0x02)
+/* 99% of the time we don't want to supply any additional flags --
+ * those are for very specific cases only. */
+#define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_flags(i, h, b, e, 0)
+int ocfs2_meta_lock_flags(struct inode *inode,
+			  ocfs_journal_handle *handle,
+			  struct buffer_head **ret_bh,
+			  int ex,
+			  int flags);
+void ocfs2_meta_unlock(struct inode *inode,
+		       int ex);
+int ocfs2_super_lock(ocfs_super *osb,
+		     int ex);
+void ocfs2_super_unlock(ocfs_super *osb,
+			int ex);
+/* for the vote thread */
+void ocfs2_process_blocked_lock(ocfs_super *osb,
+				ocfs2_lock_res *lockres);
+
+/* Record a truncate in the meta LVB: clamp lvb_trunc_clusters down to
+ * trunc_clusters (it only ever shrinks) so other nodes know how far
+ * back their cached extent state is still trustable. Caller must hold
+ * the meta lock at EX -- asserted below. */
+static inline void ocfs2_lvb_set_trunc_clusters(struct inode *inode,
+						unsigned int trunc_clusters)
+{
+	ocfs2_lock_res *lockres = &OCFS_I(inode)->ip_meta_lockres;
+	ocfs2_meta_lvb *lvb;
+
+	spin_lock(&lockres->l_lock);
+	OCFS_ASSERT(lockres->l_level == LKM_EXMODE);
+
+	lvb = (ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+	if (lvb->lvb_trunc_clusters > trunc_clusters)
+		lvb->lvb_trunc_clusters = trunc_clusters;
+	spin_unlock(&lockres->l_lock);
+}
+
+#endif

Modified: trunk/src/file.c
===================================================================
--- trunk/src/file.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/file.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -38,7 +38,7 @@
 
 #include "alloc.h"
 #include "dir.h"
-#include "dlm.h"
+#include "dlmglue.h"
 #include "extent_map.h"
 #include "file.h"
 #include "sysfile.h"
@@ -63,188 +63,62 @@
 					     ocfs2_dinode *fe,
 					     u64 new_size);
 
-static void ocfs_fe_set_attributes(ocfs2_dinode *fe, struct iattr *attr)
-{
-	if (attr->ia_valid & ATTR_SIZE)
-		fe->i_size = attr->ia_size;
-	if (attr->ia_valid & ATTR_UID)
-		fe->i_uid = attr->ia_uid;
-	if (attr->ia_valid & ATTR_GID)
-		fe->i_gid = attr->ia_gid;
-	if (attr->ia_valid & ATTR_MODE)
-		fe->i_mode = attr->ia_mode;
-	if (attr->ia_valid & ATTR_CTIME)
-		fe->i_ctime = ocfs_get_seconds(attr->ia_ctime);
-	if (attr->ia_valid & ATTR_ATIME)
-		fe->i_atime = ocfs_get_seconds(attr->ia_atime);
-	if (attr->ia_valid & ATTR_MTIME)
-		fe->i_mtime = ocfs_get_seconds(attr->ia_mtime);
-}
-
 int ocfs_sync_inode(struct inode *inode)
 {
 	filemap_fdatawrite(inode->i_mapping);
 	return sync_mapping_buffers(inode->i_mapping);
 }
 
-static inline int ocfs_wait_on_first_open(ocfs_super *osb, 
-					   struct inode *inode) 
+/* Checks an open request against our currently open mode */
+static inline int ocfs2_valid_open(int mode, int open_direct)
 {
-	int status = 0;
-	sigset_t tmpsig;
+	int ret = 1;
 
-	ocfs_block_sigs(&tmpsig, SHUTDOWN_SIGS);
-again:
-	if (signal_pending(current)) {
-		status = -EINTR;
-		goto bail;
+	if (mode & O_DIRECT) {
+		if (!open_direct)
+			ret = 0;
+	} else {
+		if (open_direct && !(mode & O_RDONLY))
+			ret = 0;
 	}
-
-	spin_lock(&OCFS_I(inode)->ip_lock);
-	if (!(OCFS_I(inode)->ip_open_flags & OCFS_IN_FIRST_OPEN))
-		goto bail;
-	spin_unlock(&OCFS_I(inode)->ip_lock);
-
-	interruptible_sleep_on(&osb->open_event);
-	goto again;
-
-bail:
-	spin_unlock(&OCFS_I(inode)->ip_lock);
-	ocfs_unblock_sigs(tmpsig);
-
-	return(status);
+	return ret;
 }
 
-static inline void ocfs_notify_openers(ocfs_super *osb) 
-{
-	wake_up(&osb->open_event);
-}
-
 /*
  * ocfs_file_open()
  *
  */
 static int ocfs_file_open(struct inode *inode, struct file *file)
 {
-	int ret =0, err = 0, status = 0, first_open = 0;
+	int status;
 	int mode = file->f_flags;
-	ocfs_super *osb = OCFS_SB(inode->i_sb);
 	ocfs_inode_private *oip = OCFS_I(inode);
 
 	LOG_ENTRY_ARGS ("(0x%p, 0x%p, '%*s')\n", inode, file, 
 			file->f_dentry->d_name.len, 
 			file->f_dentry->d_name.name);
 
-	if (osb->osb_flags & OCFS_OSB_FLAGS_SHUTDOWN) {
-		LOG_ERROR_STR ("Volume has been shutdown");
-		status = -EACCES;
-		goto leave;
-	}
+	status = -EACCES;
 
-	if (atomic_read(&oip->ip_needs_verification)) {
-		down(&inode->i_sem);
-		status = ocfs_verify_update_inode (osb, inode);
-		up(&inode->i_sem);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto leave;
-		}
-	}
-
 	spin_lock(&oip->ip_lock);
-	if (!oip->ip_open_cnt++) {
-		first_open = 1;
-		oip->ip_open_flags |= OCFS_IN_FIRST_OPEN;
+	if (oip->ip_open_cnt &&
+	    !ocfs2_valid_open(mode, 
+			      oip->ip_flags & OCFS_INODE_OPEN_DIRECT)) {
+		spin_unlock(&oip->ip_lock);
+		goto leave;
 	}
-	spin_unlock(&oip->ip_lock);
 
-	if (!first_open)
-		status = ocfs_wait_on_first_open(osb, inode);
+	if (mode & O_DIRECT)
+		oip->ip_flags |= OCFS_INODE_OPEN_DIRECT;
+	else
+		oip->ip_flags &= ~OCFS_INODE_OPEN_DIRECT;
 
-	if (status < 0) {
-		if (status != -EINTR)
-			LOG_ERROR_STATUS(status);
-		spin_lock(&oip->ip_lock);
-		oip->ip_open_cnt--;
-		goto leave_unlock;
-	}
-
-	/* TODO: if we're not opening for write then lets send an additional
-	 * flag over to tell the other node it's not necessary to do the
-	 * truncate_inode_pages (he just has to sync). */
-
-	status = 0;
-	if (!(mode & O_DIRECT) 
-	    && (first_open || (mode & (O_WRONLY|O_RDWR))))
-		status = ocfs_notify_on_open(osb, inode);
-
-	spin_lock(&oip->ip_lock);
-	if (first_open) {
-		oip->ip_open_flags &= ~OCFS_IN_FIRST_OPEN;
-		ocfs_notify_openers(osb);
-	}
-
-	if (status < 0) {
-		oip->ip_open_cnt--;
-		if (status != -EINTR)
-			LOG_ERROR_STATUS(status);
-		goto leave_unlock;
-	}
-
-	if (oip->ip_open_cnt > 1) {
-		/*  We're not the only person who has it open right
-		 *  now so lets check whether the requested
-		 *  access/share access conflicts with the existing
-		 *  open operations. */
-
-		LOG_TRACE_ARGS ("oin->ip_open_cnt > 0! : %u\n", 
-				oip->ip_open_cnt);
-		if (!(mode & O_DIRECT)) {
-			if ((oip->ip_open_flags & OCFS_OIN_OPEN_FOR_DIRECTIO) && !(mode & O_RDONLY)) {
-				oip->ip_open_cnt--;
-				status = -EACCES;
-				LOG_TRACE_STR("file is already open O_DIRECT, "
-						"cannot open non O_DIRECT");
-				goto leave_unlock;
-			}
-		} else if (mode & O_DIRECT) {
-			if (!(oip->ip_open_flags & OCFS_OIN_OPEN_FOR_DIRECTIO)) {
-				oip->ip_open_cnt--;
-				status = -EACCES;
-				LOG_TRACE_STR("file is already open non "  \
-					      "O_DIRECT, cannot open "     \
-					      "O_DIRECT");
-				goto leave_unlock;
-			}
-		}
-		status = 0;
-	} else {
-		if (mode & O_DIRECT)
-			OCFS_SET_FLAG(oip->ip_open_flags, OCFS_OIN_OPEN_FOR_DIRECTIO);
-		else
-			OCFS_CLEAR_FLAG(oip->ip_open_flags, OCFS_OIN_OPEN_FOR_DIRECTIO);
-	}
-
-leave_unlock:
+	oip->ip_open_cnt++;
 	spin_unlock(&oip->ip_lock);
-
+	status = 0;
 leave:
-	if (status < 0) {
-		if (status != -ENOENT && status != -ENOMEM &&
-			status != -EACCES && status != -EINTR) {
-			LOG_ERROR_STATUS (status);
-			ret = -EACCES;
-		} else
-			ret = status;
-	} else {
-		ret = 0;
-	}
-
-	LOG_TRACE_ARGS
-	    ("exiting file_open: file=%p dentry=%p inode=%p kiovec=%d\n",
-	     file, file->f_dentry, file->f_dentry->d_inode, err);
-	LOG_EXIT_INT (ret);
-	return ret;
+	LOG_EXIT_STATUS(status);
+	return status;
 }				/* ocfs_file_open */
 
 static int ocfs_file_release(struct inode *inode, struct file *file)
@@ -258,7 +132,7 @@
 
 	spin_lock(&oip->ip_lock);
 	if (!--oip->ip_open_cnt)
-		oip->ip_open_flags &= ~OCFS_OIN_OPEN_FOR_DIRECTIO;
+		oip->ip_flags &= ~OCFS_INODE_OPEN_DIRECT;
 	spin_unlock(&oip->ip_lock);
 
 	LOG_EXIT_INT(0);
@@ -271,7 +145,7 @@
  *
  */
 static int ocfs_sync_file(struct file *file, struct dentry *dentry,
-		int datasync)
+			  int datasync)
 {
 	int err = 0;
 	journal_t *journal;
@@ -303,87 +177,8 @@
 	return (err < 0) ? -EIO : 0;
 }				/* ocfs_sync_file */
 
-/* ocfs_change_file_attrib()
- *
- */
-static int ocfs_change_file_attrib(ocfs_super *osb, struct iattr *attr,
-				   struct inode *inode)
-{
-	int status = 0;
-	ocfs2_dinode *fe = NULL;
-	struct buffer_head *bh = NULL;
-	ocfs_journal_handle *handle = NULL;
 
-	LOG_ENTRY ();
-
 #ifdef PURE_EVIL
-	if (evil_filename_check(EVIL_INODE, inode)) {
-		LOG_ERROR_STR("EVIL ATTRIB");
-	}
-#endif
-
-	handle = ocfs_alloc_handle(osb);
-	if (handle == NULL) {
-		LOG_ERROR_STATUS(status);
-		goto leave;
-	}
-
-	status = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE, 0, &bh, inode);
-	if (status < 0) {
-		if (status != -EINTR)
-			LOG_ERROR_STATUS (status);
-		goto leave;
-	}
-	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, FLAG_FILE_UPDATE_OIN, 
-			     inode);
-
-	/* Start a transaction - need a minimal amount of block credits (1) */
-	handle = ocfs_start_trans(osb, handle, OCFS_INODE_UPDATE_CREDITS);
-	if (handle == NULL) {
-		LOG_ERROR_STATUS(status);
-		goto leave;
-	}
-
-	fe = (ocfs2_dinode *) bh->b_data;
-
-	OCFS_ASSERT_RO(IS_VALID_FILE_ENTRY(fe));
-
-	status = ocfs_journal_access(handle, inode, bh, 
-				     OCFS_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		LOG_ERROR_STATUS(status);
-		goto leave;
-	}
-
-	fe = (ocfs2_dinode *) bh->b_data;
-
-	fe->i_mtime = OCFS_CURRENT_TIME;
-
-	ocfs_fe_set_attributes(fe, attr);
-
-	status = ocfs_journal_dirty(handle, bh);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto leave;
-	}
-
-leave:
-	if (handle)
-		ocfs_commit_trans(handle);
-
-	if (bh != NULL)
-		brelse(bh);
-
-	if (status < 0)
-		if (status != -ENOSPC && status != -EINTR)
-			LOG_ERROR_STATUS (status);
-
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_change_file_attrib */
-
-
-#ifdef PURE_EVIL
 int evil_filename_check(int type, void *ptr)
 {
 	struct file *filp = ptr;
@@ -423,7 +218,7 @@
 		size_t count, loff_t *ppos)
 {
 	int ret = 0;
-	int writingAtEOF = 0;
+	int extended = 0;
 	ocfs_super *osb = NULL;
 	struct dentry *dentry = filp->f_dentry;
 	struct inode *inode = dentry->d_inode;
@@ -433,24 +228,15 @@
 	int do_direct_io = 0;
 	int sector_size;
 	int have_i_sem = 0;
+	int level = filp->f_flags & O_APPEND;
+	loff_t saved_ppos;
 
 	LOG_SET_CONTEXT(WRITE);
 
 	LOG_ENTRY_ARGS ("(0x%p, 0x%p, %u, '%*s')\n", filp, buf,
 			(unsigned int)count,
-			filp->f_dentry->d_name.len, filp->f_dentry->d_name.name);
-
-#ifdef PURE_EVIL
-	if (evil_filename_check(EVIL_DENTRY, dentry)) {
-		int z;
-		LOG_ERROR_ARGS("EVIL FILE_WRITE: count=%u, ppos=%llu, flags=%d\n", (unsigned int)count, *ppos, filp->f_flags);
-		for (z=0; z<(count<16?count:16); z++) {
-			printk("data[%d]=%02x ", z, ((char)buf[z]) & 0xff);
-		}
-		printk("\n");
-	}
-#endif
-
+			filp->f_dentry->d_name.len, 
+			filp->f_dentry->d_name.name);
 	/* happy write of zero bytes */
 	if (count == 0) {
 		ret = 0;
@@ -466,19 +252,25 @@
 	osb = OCFS_SB(inode->i_sb);
 	sector_size = 1 << osb->s_sectsize_bits;
 
-	if (osb->osb_flags & OCFS_OSB_FLAGS_SHUTDOWN) {
-		LOG_TRACE_STR ("Volume has already started shutdown");
-		ret = -EIO;
-		goto bail;
-	}
-	
 	down(&inode->i_sem);
 	have_i_sem = 1;
 
+lock:
+	status = ocfs2_meta_lock(inode, NULL, NULL, level);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		ret = status;
+		goto bail;
+	}
+
+	/* work on a copy of ppos until we're sure that we won't have
+	 * to recalculate it due to relocking. */
+	saved_ppos = *ppos;
+
 	if (filp->f_flags & O_APPEND) {
 		LOG_TRACE_ARGS("O_APPEND: inode->i_size=%llu, ppos was %llu\n",
-			       inode->i_size, *ppos);
-		*ppos = inode->i_size;
+			       inode->i_size, saved_ppos);
+		saved_ppos = inode->i_size;
 
 		/* ugh, work around some applications which open
 		 * everything O_DIRECT + O_APPEND and really don't
@@ -490,40 +282,38 @@
 	if (filp->f_flags & O_DIRECT) {
 		/* anything special for o_direct? */
 		LOG_TRACE_STR ("O_DIRECT");
-		if (((*ppos) & (sector_size - 1)) || (count & (sector_size - 1)) || 
-		    ((unsigned long)buf & (sector_size - 1)) ) {
+		if ((saved_ppos & (sector_size - 1)) || 
+		    (count & (sector_size - 1)) || 
+		    ((unsigned long)buf & (sector_size - 1))) {
 			do_direct_io = 0;
 			filp->f_flags |= O_SYNC;
 		} else
 			do_direct_io = 1;
 	}
-	if (atomic_read(&OCFS_I(inode)->ip_needs_verification)) {
-		LOG_TRACE_STR ("OIN_NEEDS_VERIFICATION");
-		status = ocfs_verify_update_inode (osb, inode);
-		if (status < 0) {
-			LOG_TRACE_STR ("ocfs_verify_update_inode failed");
-			LOG_TRACE_STR ("TODO: disable volume");
-			ret = -EIO;
-			goto bail;
-		}
-	}
 
-	newsize = count + *ppos;
+	newsize = count + saved_ppos;
 	if (filp->f_flags & O_APPEND)
 		newsize = count + inode->i_size;
 
 	LOG_TRACE_ARGS ("ppos=%llu newsize=%llu cursize=%llu\n",
-			*ppos, newsize, inode->i_size);
+			saved_ppos, newsize, inode->i_size);
 
 	if (newsize > inode->i_size) {
-		writingAtEOF = 1;
+		if (!level) {
+			/* we want an extend, but need a higher
+			 * level cluster lock. */
+			LOG_TRACE_ARGS("inode %llu, had a PR, looping back "
+				       "for EX\n", OCFS_I(inode)->ip_blkno);
+			ocfs2_meta_unlock(inode, level);
+			level = 1;
+			goto lock;
+		}
+		extended = 1;
 
-		LOG_TRACE_ARGS
-		    ("Writing at EOF, will need more allocation: have=%llu, "
-		     "need=%llu\n",
-		     ocfs2_clusters_to_bytes(inode->i_sb,
-					     OCFS_I(inode)->ip_clusters),
-		     newsize);
+		LOG_TRACE_ARGS("Writing at EOF, will need more allocation: "
+			       "i_size=%llu, need=%llu\n",
+			       inode->i_size, newsize);
+
 		status = ocfs_extend_file(osb, inode, newsize);
 		if (status < 0) {
 			if (status != -EINTR && status != -ENOSPC) {
@@ -534,10 +324,28 @@
 				ret = -ENOSPC;
 			} else
 				ret = status;
+
+			ocfs2_meta_unlock(inode, level);
 			goto bail;
 		}
 	}
 
+	/* we've got whatever cluster lock is appropriate now, so we
+	 * can stuff *ppos back. */
+	*ppos = saved_ppos;
+
+	if (!do_direct_io) {
+		status = ocfs2_data_lock(inode, 1);
+		if (status < 0) {
+			if (status != -EINTR)
+				LOG_ERROR_STATUS(status);
+			ret = status;
+
+			ocfs2_meta_unlock(inode, level);
+			goto bail;
+		}
+	}
+
 	down_read(&OCFS_I(inode)->ip_alloc_sem);
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
 	/* 
@@ -565,7 +373,10 @@
 		ret = generic_file_write_nolock (filp, buf, count, ppos);
 #endif
 	up_read(&OCFS_I(inode)->ip_alloc_sem);
-	if (writingAtEOF) {
+	if (!do_direct_io)
+		ocfs2_data_unlock(inode, 1);
+
+	if (extended) {
 		LOG_TRACE_STR
 		    ("Generic_file_write ok, asking for OIN update now");
 		inode->i_size = newsize;
@@ -586,6 +397,7 @@
 				LOG_ERROR_ARGS("Unable to pre-zero extension of inode (%d)", status);
 		}
 	}
+	ocfs2_meta_unlock(inode, level);
 
 bail:
 	if (have_i_sem)
@@ -644,14 +456,23 @@
 		} else
 			do_direct_io = 1;
 	}
-	if (atomic_read(&OCFS_I(inode)->ip_needs_verification)) {
-		down(&inode->i_sem);
-		status = ocfs_verify_update_inode (osb, inode);
-		up(&inode->i_sem);
+
+	/* yay, PR (shared) locks all 'round :) */
+	status = ocfs2_meta_lock(inode, NULL, NULL, 0);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		/* is this ret code correct? */
+		ret = status;
+		goto bail;
+	}
+
+	if (!do_direct_io) {
+		status = ocfs2_data_lock(inode, 0);
 		if (status < 0) {
-			LOG_TRACE_STR ("ocfs_verify_update_inode failed");
-			LOG_TRACE_STR ("TODO: disable volume");
-			ret = -EIO;
+			if (status != -EINTR)
+				LOG_ERROR_STATUS(status);
+			/* is this ret code correct? */
+			ret = status;
 			goto bail;
 		}
 	}
@@ -684,6 +505,9 @@
 	if (ret == -EINVAL)
 		LOG_ERROR_STR ("Generic_file_read returned -EINVAL");
 
+	if (!do_direct_io)
+		ocfs2_data_unlock(inode, 0);
+	ocfs2_meta_unlock(inode, 0);
 bail:
 	LOG_EXIT_INT (ret);
 
@@ -729,9 +553,10 @@
 
 	grow = new_i_size > inode->i_size;
 	inode->i_size = new_i_size;
-	OCFS_SET_INODE_TIME(inode, i_mtime, OCFS_CURRENT_TIME);
 	inode->i_blocks = (new_i_size + sb->s_blocksize - 1) 
 		>> sb->s_blocksize_bits;
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
 	status = ocfs_mark_inode_dirty(handle, inode, fe_bh);
 	if (status < 0) {
 		LOG_ERROR_STATUS (status);
@@ -797,22 +622,14 @@
 	LOG_ENTRY_ARGS("(inode = %llu, new_i_size = %llu\n", 
 		       OCFS_I(inode)->ip_blkno, new_i_size);
 
-	handle = ocfs_alloc_handle(osb);
-	if (handle == NULL) {
-		LOG_ERROR_STATUS (status = -ENOMEM);
-		goto bail;
-	}
+	ocfs_truncate_inode_pages(inode, new_i_size);
 
-	status = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE, 
-				    FLAG_FILE_TRUNCATE|FLAG_FILE_UPDATE_OIN,
-				    &fe_bh, inode);
+	status = ocfs_read_block(osb, OCFS_I(inode)->ip_blkno, &fe_bh,
+				 OCFS_BH_CACHED, inode);
 	if (status < 0) {
-		if (status != -EINTR)
-			LOG_ERROR_STATUS (status);
+		LOG_ERROR_STATUS(status);
 		goto bail;
 	}
-	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
-			     FLAG_FILE_TRUNCATE|FLAG_FILE_UPDATE_OIN, inode);
 
 	fe = (ocfs2_dinode *) fe_bh->b_data;
 	OCFS_ASSERT(IS_VALID_FILE_ENTRY(fe));
@@ -841,19 +658,32 @@
 			       "truncate\n", fe->i_clusters);
 		/* No allocation change is required, so lets fast path
 		 * this truncate. */	
-		handle = ocfs_start_trans(osb, handle, 
+		handle = ocfs_start_trans(osb, NULL, 
 					  OCFS_INODE_UPDATE_CREDITS);
 		if (handle == NULL) {
 			LOG_ERROR_STATUS (status = -ENOMEM);
 			goto bail;
 		}
 
+		/* Since we got our cluster lock from caller and we
+		 * don't add it to the handle: */
+		ocfs_set_inode_lock_trans(osb->journal, inode);
+
 		status = ocfs_set_inode_size(handle, inode, fe_bh, new_i_size);
 		if (status < 0)
 			LOG_ERROR_STATUS (status);
 		goto bail;
 	}
 
+	/* This forces other nodes to sync and drop their pages */
+	status = ocfs2_data_lock(inode, 1);
+	if (status < 0) {
+		if (status != -EINTR)
+			LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	ocfs2_data_unlock(inode, 1);
+
 	/* alright, we're going to need to do a full blown alloc size
 	 * change. Orphan the inode so that recovery can complete the
 	 * truncate if necessary. This does the task of marking
@@ -1046,7 +876,7 @@
 	/* TODO: We will keep a small history of allocs on the filp
 	 * and calculate a reasonable overalloc based on that data
 	 * here. */
-	return(0);
+	return 0;
 }
 
 /* ocfs_extend_file()
@@ -1085,26 +915,20 @@
 		goto leave;
 	}
 
-	status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, FLAG_FILE_EXTEND, 
-				    &bh, inode);
+	status = ocfs_read_block(osb, OCFS_I(inode)->ip_blkno, &bh,
+				 OCFS_BH_CACHED, inode);
 	if (status < 0) {
-		if (status != -EINTR)
-			LOG_ERROR_STATUS (status);
+		LOG_ERROR_STATUS(status);
 		goto leave;
 	}
-	ocfs_handle_add_lock(handle, 
-			     OCFS_LKM_EXMODE,
-			     FLAG_FILE_EXTEND|FLAG_FILE_UPDATE_OIN,
-			     inode);
 
 	fe = (ocfs2_dinode *) bh->b_data;
 	OCFS_ASSERT(IS_VALID_FILE_ENTRY(fe));
-	OCFS_ASSERT(new_i_size >= fe->i_size);
+	OCFS_ASSERT(inode->i_size == fe->i_size);
+	OCFS_ASSERT(new_i_size >= inode->i_size);
 
-	if (fe->i_size == new_i_size) {
-		OCFS_ASSERT(inode->i_size == new_i_size);
-		goto leave;
-	}
+	if (inode->i_size == new_i_size)
+  		goto leave;
 
 	clusters_to_add = ocfs2_clusters_for_bytes(osb->sb, new_i_size) 
 		- fe->i_clusters;
@@ -1114,14 +938,14 @@
 		       OCFS_I(inode)->ip_blkno, new_i_size, inode->i_size, 
 		       fe->i_clusters, clusters_to_add);
 
-	if (!clusters_to_add) 
+	if (!clusters_to_add)
 		goto do_start_trans;
 
 	overalloc_bits = 0;
 	if (!skip_overalloc) {
-		overalloc_bits = ocfs_calc_overalloc_bits(osb, 
-							  NULL, 
-							  fe, 
+		overalloc_bits = ocfs_calc_overalloc_bits(osb,
+							  NULL,
+							  fe,
 							  new_i_size);
 		clusters_to_add += overalloc_bits;
 		skip_overalloc = 1;
@@ -1171,6 +995,9 @@
 		goto leave;
 	}
 
+	/* Since we got our cluster lock from caller and we don't add
+	 * it to the handle: */
+	ocfs_set_inode_lock_trans(osb->journal, inode);
 restarted_transaction:
 	/* reserve a write to the file entry early on - that we if we
 	 * run out of credits in the allocation path, we can still
@@ -1249,14 +1076,14 @@
 		fe->i_size = ocfs2_clusters_to_bytes(osb->sb, fe->i_clusters);
 	else
 		fe->i_size = new_i_size;
-#warning "is there a reason why we don't update i_blocks here?"
+
 	LOG_TRACE_ARGS("fe: i_clusters = %u, i_size=%llu\n", 
 		       fe->i_clusters, fe->i_size);
 
 	LOG_TRACE_ARGS("inode: ip_clusters=%u, i_size=%llu\n",
 		       OCFS_I(inode)->ip_clusters, inode->i_size);
 
-	fe->i_mtime = OCFS_CURRENT_TIME;
+	fe->i_ctime = fe->i_mtime = OCFS_CURRENT_TIME;
 
 	status = ocfs_journal_dirty(handle, bh);
 	if (status < 0) {
@@ -1299,33 +1126,18 @@
  */
 int ocfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
+	int status = 0;
+	int unlock = 0;
+	u64 newsize;
 	struct inode *inode = dentry->d_inode;
-	int error = 0;
-	__u64 newsize;
-	int status;
-	ocfs_super *osb = NULL;
 	struct super_block *sb = inode->i_sb;
+	ocfs_super *osb = OCFS2_SB(sb);
+	struct buffer_head *bh = NULL;
+	ocfs_journal_handle *handle = NULL;
 
-	LOG_SET_CONTEXT(SETATTR);
-
 	LOG_ENTRY_ARGS ("(0x%p, '%*s')\n", dentry,
 			dentry->d_name.len, dentry->d_name.name);
 
-	osb = OCFS_SB(inode->i_sb);
-
-#ifdef PURE_EVIL
-	if (evil_filename_check(EVIL_DENTRY, dentry)) {
-		LOG_ERROR_ARGS("EVIL SETATTR\n");
-	}
-#endif
-
-	if (!dentry->d_parent || !dentry->d_parent->d_inode) {
-		LOG_ERROR_STR ("bad inode or root inode");
-		goto bail;
-	}
-
-	newsize = attr->ia_size;
-
 	if (attr->ia_valid & ATTR_MODE)
 		LOG_TRACE_ARGS ("mode change: %d\n", attr->ia_mode);
 	if (attr->ia_valid & ATTR_UID)
@@ -1337,38 +1149,43 @@
 	if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
 		LOG_TRACE_STR ("time change...");
 
-	if (!(attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME |
-				ATTR_SIZE | ATTR_GID | ATTR_UID | ATTR_MODE))) {
-		LOG_TRACE_STR
-		    ("can only change mode, uid, gid, size and time.  exiting!");
-		goto bail;
+#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
+			   | ATTR_GID | ATTR_UID | ATTR_MODE)
+	if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
+		LOG_TRACE_ARGS("can't handle attrs: 0x%x\n", attr->ia_valid);
+		return 0;
 	}
 
-	error = inode_change_ok (inode, attr);
-	if (error)
+	status = inode_change_ok (inode, attr);
+	if (status)
+		return status;
+
+	newsize = attr->ia_size;
+
+	status = ocfs2_meta_lock(inode, NULL, &bh, 1);
+	if (status < 0) {
+		if (status != -EINTR)
+			LOG_ERROR_STATUS(status);
 		goto bail;
+	}
+	unlock = 1;
 
-	/* get the file and parent offsets, and the file oin if present */
-	if (attr->ia_valid & ATTR_SIZE) {
-		if (inode->i_size > newsize) {
-			ocfs_truncate_inode_pages(inode, newsize);
-			status = ocfs_truncate_file(osb, newsize, 
-						    inode);
-		} else {
+	if (attr->ia_valid & ATTR_SIZE &&
+	    newsize != inode->i_size) {
+		if (inode->i_size > newsize)
+			status = ocfs_truncate_file(osb, newsize, inode);
+		else
 			status = ocfs_extend_file(osb, inode, newsize);
-		}
 		if (status < 0) {
 			if (status != -EINTR && status != -ENOSPC)
-				LOG_ERROR_STATUS (status);
-			error = -ENOSPC;
+				LOG_ERROR_STATUS(status);
+			status = -ENOSPC;
 			goto bail;
 		}
-
 		spin_lock(&OCFS_I(inode)->ip_lock);
 		inode->i_size = newsize;
 		inode->i_blocks = (newsize + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
-		if (OCFS_I(inode)->ip_open_flags &
-		    OCFS_OIN_OPEN_FOR_DIRECTIO) {
+		if (OCFS_I(inode)->ip_flags & OCFS_INODE_OPEN_DIRECT) {
 			/* This is a total broken hack for O_DIRECT crack */
 			OCFS_I(inode)->ip_mmu_private = inode->i_size;
 		}
@@ -1380,20 +1197,42 @@
 		}
 	}
 
-	status = ocfs_change_file_attrib(osb, attr, inode);
+	handle = ocfs_start_trans(osb, NULL, OCFS_INODE_UPDATE_CREDITS);
+	if (handle == NULL) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	/* Ok, this is the last transaction we'll do for a setattr so
+	 * just add our lock to the handle and let commit_trans deal
+	 * with it. */
+	status = ocfs_handle_add_lock(handle, inode);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+	unlock = 0;
+
+	status = inode_setattr (inode, attr);
 	if (status < 0) {
-		if (status != -EINTR)
-			LOG_ERROR_STATUS (status);
-		error = -EIO;
+		LOG_ERROR_STATUS(status);
 		goto bail;
 	}
-	error = inode_setattr (inode, attr);
 
+	status = ocfs_mark_inode_dirty(handle, inode, bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS (status);
+		goto bail;
+	}
+
 bail:
-	LOG_EXIT_INT (error);
+	if (handle)
+		ocfs_commit_trans(handle);
+	if (unlock)
+		ocfs2_meta_unlock(inode, 1);
+	if (bh)
+		brelse(bh);
 
-	LOG_CLEAR_CONTEXT();
-	return error;
+	LOG_EXIT_STATUS(status);
+	return status;
 }				/* ocfs_setattr */
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)

Modified: trunk/src/heartbeat.c
===================================================================
--- trunk/src/heartbeat.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/heartbeat.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -3,7 +3,8 @@
  *
  * heartbeat.c
  *
- * Keeps track of alive nodes in the cluster.
+ * Register ourselves with the heartbeat service, keep our node maps
+ * up to date, and fire off recovery when needed.
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
  *
@@ -30,6 +31,10 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 
+#include <cluster/util.h>
+#include <cluster/dlmcommon.h>
+#include <cluster/heartbeat.h>
+
 #include "ocfs_log.h"
 #include "ocfs.h"
 #include "ocfs2.h"
@@ -37,8 +42,6 @@
 #include "alloc.h"
 #include "heartbeat.h"
 #include "util.h"
-#include "volcfg.h"
-#include "vote.h"
 
 #include "ocfs_journal.h"
 #include "buffer_head_io.h"
@@ -46,311 +49,386 @@
 /* Tracing */
 #define OCFS_DEBUG_CONTEXT      OCFS_DEBUG_CONTEXT_HEARTBEAT
 
-/*
- * ocfs_nm_heart_beat()
- *
- * @osb: ocfs super block for the volume
- * @flag: type of heart beat
- * @read_publish: if the publish sector needs to be re-read
- *
- * Updates the timestamp in the nodes publish sector.
- * NOTE: must be called while holding publish_lock!
- *
- * Returns 0 if success, < 0 if error.
- */ 
-int ocfs_nm_heart_beat (ocfs_super * osb, __u32 flag, int read_publish)
+#define OCFS2_HB_NODE_DOWN_PRI     (0x0000001)
+#define OCFS2_HB_NODE_UP_PRI	   OCFS2_HB_NODE_DOWN_PRI
+
+static void ocfs2_hb_node_down_cb(struct inode *group,
+				  struct inode *node,
+				  int node_num,
+				  void *data);
+static void ocfs2_hb_node_up_cb(struct inode *group,
+				struct inode *node,
+				int node_num,
+				void *data);
+
+static void __ocfs_node_map_dup(ocfs_super *osb,
+				ocfs_node_map *target,
+				ocfs_node_map *from);
+static inline void __ocfs_node_map_set_bit(ocfs_node_map *map,
+					   int bit);
+static inline void __ocfs_node_map_clear_bit(ocfs_node_map *map,
+					     int bit);
+static inline int __ocfs_node_map_is_empty(ocfs_node_map *map);
+static void __ocfs_node_map_dup(ocfs_super *osb,
+				ocfs_node_map *target,
+				ocfs_node_map *from);
+static void __ocfs_node_map_set(ocfs_node_map *target, ocfs_node_map *from);
+
+void ocfs2_init_node_maps(ocfs_super *osb)
 {
-	int status = 0;
-	ocfs_publish *publish = NULL;
-	int publish_idx = OCFS_VOLCFG_NEWCFG_SECTORS + osb->node_num;
-	struct buffer_head **pub_bh = &osb->autoconfig_bhs[publish_idx];
+	spin_lock_init(&osb->node_map_lock);
+	ocfs_node_map_init(osb, &osb->mounted_map);
+	ocfs_node_map_init(osb, &osb->recovery_map);
+	ocfs_node_map_init(osb, &osb->umount_map);
+}
 
-	LOG_ENTRY_ARGS ("(0x%p, %u, %s)\n", osb, flag,
-			read_publish ? "true" : "false");
+static void ocfs2_hb_node_down_cb(struct inode *group,
+				  struct inode *node,
+				  int node_num,
+				  void *data)
+{
+	ocfs_super *osb = data;
 
-	if (flag & HEARTBEAT_METHOD_DISK) {
-		if (pub_bh == NULL && !read_publish)
-			BUG();
+	if (osb->group_inode != group)
+		return;
 
-		if (read_publish) {
-			status = ocfs_read_block(osb,
-						 (osb->publish_blkno + osb->node_num),
-						 pub_bh, 0, NULL);
-			if (status < 0) {
-				LOG_ERROR_STATUS (status);
-				goto finally;
-			}
-		}
+	OCFS_ASSERT(osb->node_num != node_num);
 
-		publish = (ocfs_publish *) (*pub_bh)->b_data;
-		if ((publish->dirty) && (!osb->publish_dirty)) { 
-			LOG_TRACE_STR(("NMThread reads the bit as dirty")); 
-			publish->dirty = 0; 
-		}
-		/* Write the current time in local node's publish sector */
-		publish->time = jiffies;
-		/* Dissallow 0 */
-		if (!publish->time)
-			publish->time = 1;
-		spin_lock (&OcfsGlobalCtxt.comm_seq_lock);
-		publish->comm_seq_num = OcfsGlobalCtxt.comm_seq_num;
-		spin_unlock (&OcfsGlobalCtxt.comm_seq_lock);
+	printk("ocfs2: node down event for %d\n", node_num);
 
-		status = ocfs_write_block(osb, *pub_bh, NULL);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto finally;
-		}
+	if (ocfs_node_map_test_bit(osb, &osb->umount_map, node_num)) {
+		/* If a node is in the umount map, then we've been
+		 * expecting him to go down and we know ahead of time
+		 * that recovery is not necessary. */
+		ocfs_node_map_clear_bit(osb, &osb->umount_map, node_num);
+		return;
 	}
 
-	if (flag & HEARTBEAT_METHOD_IPC) {
-		/* Plug this in later... */
-	}
+	ocfs_recovery_thread(osb, node_num);
+}
 
-finally:
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_nm_heart_beat */
+static void ocfs2_hb_node_up_cb(struct inode *group,
+				struct inode *node,
+				int node_num,
+				void *data)
+{
+	ocfs_super *osb = data;
 
+	if (osb->group_inode != group)
+		return;
 
-/*
- * ocfs_update_publish_map()
- *
- * @osb: ocfs super block for the volume
- * @buffer: publish sectors read in the last round
- * @first_time: if true, the buffer needs to be initialized
- *
- * Reads the publish sectors and compares the timestamp of each node
- * to the one it read in the last round. As long as the timestamp keeps
- * changing, the node is marked alive. Conversely, if the timestamp does
- * not change over time, the node is marked dead. The function marks all
- * the live nodes in the publishmap.
- *
- */
-void ocfs_update_publish_map (ocfs_super * osb, struct buffer_head *bhs[], int first_time)
+	OCFS_ASSERT(osb->node_num != node_num);
+
+	printk("ocfs2: node up event for %d\n", node_num);
+	ocfs_node_map_clear_bit(osb, &osb->umount_map, node_num);
+}
+
+/* Most functions here are just stubs for now... */
+int ocfs2_register_hb_callbacks(ocfs_super *osb)
 {
-	ocfs_publish *publish;
-	ocfs_vol_node_map *node_map;
-	int i;
-	__u16 num_nodes;
+	int status;
 
-	LOG_ENTRY_ARGS ("(0x%p, 0x%p, %u)\n", osb, bhs, first_time);
+	status = hb_register_callback(HB_NODE_DOWN_CB,
+				      ocfs2_hb_node_down_cb,
+				      osb,
+				      OCFS2_HB_NODE_DOWN_PRI);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
 
-	num_nodes = osb->max_nodes;
-	node_map = osb->vol_node_map;
+	status = hb_register_callback(HB_NODE_UP_CB,
+				      ocfs2_hb_node_up_cb,
+				      osb,
+				      OCFS2_HB_NODE_UP_PRI);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
 
-	/* First time thru, update buffer with timestamps for all nodes */
-	if (first_time) {
-		/* Read the last comm_seq_num */
-		publish = (ocfs_publish *) bhs[osb->node_num]->b_data;
-		spin_lock (&OcfsGlobalCtxt.comm_seq_lock);
-		OcfsGlobalCtxt.comm_seq_num = publish->comm_seq_num + 10;
-		spin_unlock (&OcfsGlobalCtxt.comm_seq_lock);
+bail:
+	return status;
+}
 
-		/* Refresh local buffers */
-		for (i = 0;  i < num_nodes; i++) {
-			publish = (ocfs_publish *) bhs[i]->b_data;
-			node_map[i].time = publish->time;
-			if (publish->mounted && i != osb->node_num) {
-				printk("ocfs2: Adding %s (node %d) to "
-				       "clustered device (%u,%u)\n",
-				       osb->node_cfg_info[i]->node_name, i,
-				       MAJOR(osb->sb->s_dev), 
-				       MINOR(osb->sb->s_dev));
-				node_map[i].miss_cnt = 0;
-				ocfs_publish_map_set(&osb->publ_map, i);
-			}
-		}
-		goto bail;	/* exit */
-	}
+void ocfs2_clear_hb_callbacks(ocfs_super *osb)
+{
+	int status;
 
-	for (i = 0; i < num_nodes; i++) {
-		publish = (ocfs_publish *) bhs[i]->b_data;
+	status = hb_unregister_callback(HB_NODE_DOWN_CB,
+					ocfs2_hb_node_down_cb, osb);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
 
-		/* Loop if slot is unused */
-		if (publish->time == 0ULL)
-			continue;
+	status = hb_unregister_callback(HB_NODE_UP_CB,
+					ocfs2_hb_node_up_cb, osb);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
 
-		/* Check if the node is hung or not by comparing the disk */
-		/* and memory timestamp values */
-		if (node_map[i].time == publish->time) {
-			if (ocfs_node_is_alive(&osb->publ_map, i)) {
-				char *err_msg = NULL;
-				if (atomic_read (&(node_map[i].dismount))) {
-					node_map[i].miss_cnt = MISS_COUNT_NODE_DEAD;
-					atomic_set (&(node_map[i].dismount), 0);
-					ocfs_publish_map_clear(&osb->publ_map, i);
-					err_msg = "Received dismount message. Removing %s "
-						  "(node %d) from clustered device (%u,%u).\n";
-				} else {
-					(node_map[i].miss_cnt)++;
+}
 
-					if (node_map[i].miss_cnt == MISS_COUNT_WARNING)
-						err_msg = "warning: %s (node %d) may be ejected from cluster "
-						  	  "on device (%u.%u)... %d misses so far\n";
-					else if (node_map[i].miss_cnt == MISS_COUNT_EMERGENCY)
-						err_msg = "warning: %s (node %d) WILL BE EJECTED from cluster "
-						  	  "on device (%u.%u)... %d misses so far\n";
-					else if (node_map[i].miss_cnt >= MISS_COUNT_NODE_DEAD)
-						err_msg = "Removing %s (node %d) from clustered device "
-						  	  "(%u,%u) after %d misses\n";
-				}
+/* special case -1 for now
+ * TODO: should *really* make sure the calling func never passes -1!!  */
+void ocfs_node_map_init(ocfs_super *osb,
+			ocfs_node_map *map)
+{
+	map->num_nodes = osb->max_nodes;
+	memset(map->map, 0, BITS_TO_LONGS(OCFS_NODE_MAP_MAX_NODES) * 
+	       sizeof(unsigned long));
+}
 
-				if (err_msg)
-					LOG_ERROR_ARGS(err_msg, osb->node_cfg_info[i]->node_name, i,
-						       MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev),
-						       node_map[i].miss_cnt);
-				if (node_map[i].miss_cnt >= MISS_COUNT_NODE_DEAD) {
-					ocfs_recovery_map_set(osb, i);
-					ocfs_publish_map_clear(&osb->publ_map, i);
+static inline void __ocfs_node_map_set_bit(ocfs_node_map *map,
+					   int bit)
+{
+	set_bit(bit, map->map);
+}
 
-					/* Ok, we'd better recover him now...*/
-					ocfs_recovery_thread(osb, i);
-				}
-			}
-		} else {
-			if (!ocfs_node_is_alive(&osb->publ_map, i) &&
-			    (osb->node_num != i))
-				printk ("ocfs2: Adding %s (node %d) to clustered device (%u,%u)\n",
-					osb->node_cfg_info[i]->node_name, i,
-					MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
-			node_map[i].miss_cnt = 0;
-			node_map[i].time = publish->time;
-			ocfs_publish_map_set(&osb->publ_map, i);
+void ocfs_node_map_set_bit(ocfs_super *osb,
+			   ocfs_node_map *map,
+			   int bit)
+{
+	if (bit==-1)
+		return;
+	OCFS_ASSERT(bit < map->num_nodes);
+	spin_lock(&osb->node_map_lock);
+	__ocfs_node_map_set_bit(map, bit);
+	spin_unlock(&osb->node_map_lock);
+}
 
-		}
+static inline void __ocfs_node_map_clear_bit(ocfs_node_map *map,
+					     int bit)
+{
+	clear_bit(bit, map->map);
+}
+
+void ocfs_node_map_clear_bit(ocfs_super *osb,
+			     ocfs_node_map *map,
+			     int bit)
+{
+	if (bit==-1)
+		return;
+	OCFS_ASSERT(bit < map->num_nodes);
+	spin_lock(&osb->node_map_lock);
+	__ocfs_node_map_clear_bit(map, bit);
+	spin_unlock(&osb->node_map_lock);
+}
+
+int ocfs_node_map_test_bit(ocfs_super *osb,
+			   ocfs_node_map *map,
+			   int bit)
+{
+	int ret;
+	if (bit >= map->num_nodes) {
+		LOG_ERROR_ARGS("bit=%d map->num_nodes=%d\n", bit, map->num_nodes);
+		BUG();
 	}
+	spin_lock(&osb->node_map_lock);
+	ret = test_bit(bit, map->map);
+	spin_unlock(&osb->node_map_lock);
+	return ret;
+}
 
-bail:
-	LOG_EXIT ();
-	return;
-}				/* ocfs_update_publish_map */
+static inline int __ocfs_node_map_is_empty(ocfs_node_map *map)
+{
+	int bit;
+	bit = find_next_bit(map->map, map->num_nodes, 0);
+	if (bit < map->num_nodes)
+		return 0;
+	return 1;
+}
 
+int ocfs_node_map_is_empty(ocfs_super *osb,
+			   ocfs_node_map *map)
+{
+	int ret;
+	OCFS_ASSERT(map->num_nodes > 0);
+	spin_lock(&osb->node_map_lock);
+	ret = __ocfs_node_map_is_empty(map);
+	spin_unlock(&osb->node_map_lock);
+	return ret;
+}
 
-/* half a second timeout */
-#define OCFS_HEARTBEAT_JIFFIES  (HZ >> 1)
+static void __ocfs_node_map_dup(ocfs_super *osb,
+				ocfs_node_map *target,
+				ocfs_node_map *from)
+{
+	OCFS_ASSERT(from->num_nodes > 0);
+	ocfs_node_map_init(osb, target);
+	__ocfs_node_map_set(target, from);
+}
 
-/*
- * ocfs_heartbeat_thread()
- * 
- * This function is executed as a kernel thread for each mounted ocfs volume.
- */
-int ocfs_heartbeat_thread (void *arg)
+/* returns 1 if bit is the only bit set in target, 0 otherwise */
+int ocfs_node_map_is_only(ocfs_super *osb,
+			  ocfs_node_map *target,
+			  int bit)
 {
-	ocfs_super *osb;
-	char proc[16];
-	int status = 0;
-	__u8 *buffer = NULL;
-	ocfs_publish *publish;
-	unsigned long j;
-	__u16 num_nodes = 0;
-	ocfs_node_config_hdr *node_cfg_hdr = NULL;
-	__u64 cfg_seq_num;
-	struct buffer_head *bh = NULL;
-	siginfo_t info;
+	ocfs_node_map temp;
+	int ret;
 
-	LOG_ENTRY ();
+	spin_lock(&osb->node_map_lock);
+	__ocfs_node_map_dup(osb, &temp, target);
+	__ocfs_node_map_clear_bit(&temp, bit);
+	ret = __ocfs_node_map_is_empty(&temp);
+	spin_unlock(&osb->node_map_lock);
 
-	osb = arg;
+	return ret;
+}
 
-	sprintf (proc, "ocfs2nm-%d", osb->osb_id);
-	ocfs_daemonize (proc, strlen(proc), 1);
+static void __ocfs_node_map_set(ocfs_node_map *target,
+				ocfs_node_map *from)
+{
+	int num_longs, i; 
 
-	osb->dlm_task = current;
+	OCFS_ASSERT(target->num_nodes == from->num_nodes);
+	OCFS_ASSERT(target->num_nodes > 0);
 
-	osb->hbt = OCFS_HEARTBEAT_JIFFIES + jiffies;
-	/* The delay changes based on multiplier */
-	while (!(OcfsGlobalCtxt.flags & OCFS_FLAG_SHUTDOWN_VOL_THREAD) &&
-	       !(osb->osb_flags & OCFS_OSB_FLAGS_BEING_DISMOUNTED)) {
+	num_longs = BITS_TO_LONGS(target->num_nodes);
+	for (i=0; i<num_longs; i++)
+		target->map[i] = from->map[i];
+}
 
-		buffer = NULL;
+void ocfs_recovery_map_set(ocfs_super *osb,
+			   int num)
+{
+	spin_lock(&osb->node_map_lock);
+	__ocfs_node_map_clear_bit(&osb->mounted_map, num);
+	__ocfs_node_map_set_bit(&osb->recovery_map, num);
+	spin_unlock(&osb->node_map_lock);
+}
 
-		if (!time_after (jiffies, (unsigned long) (osb->hbt)))
-			goto again;
+void ocfs_recovery_map_clear(ocfs_super *osb,
+			     int num)
+{
+	ocfs_node_map_clear_bit(osb, &osb->recovery_map, num);
+}
 
-		/* lock publish to prevent overwrites from vote_req and vote_reset */
-		down (&(osb->publish_lock));
+int ocfs_node_map_iterate(ocfs_super *osb,
+			  ocfs_node_map *map,
+			  int idx)
+{
+	int i = idx;
 
-		/* Read disk for 4 autoconfig blocks + all nodes publish blocks */
-		status = ocfs_read_blocks(osb,
-					  osb->new_autoconfig_blkno,
-					  osb->total_autoconfig_blocks,
-					  osb->autoconfig_bhs, 0, NULL);
-		if (status < 0) {
-			up (&(osb->publish_lock));
-			LOG_ERROR_STATUS (status);
-			BUG();
+	idx = OCFS_INVALID_NODE_NUM;
+	spin_lock(&osb->node_map_lock);
+	if ((i != OCFS_INVALID_NODE_NUM) &&
+	    (i >= 0) &&
+	    (i < map->num_nodes)) {
+		while(i < map->num_nodes) {
+			if (test_bit(i, map->map)) {
+				idx = i;
+				break;
+			}
+			i++;
 		}
+	}
+	spin_unlock(&osb->node_map_lock);
+	return idx;
+}
 
-		bh = osb->autoconfig_bhs[OCFS_VOLCFG_NEWCFG_SECTORS + osb->node_num];
-		publish = (ocfs_publish *) bh->b_data;
-		if ((osb->check_mounted) && (publish->mounted == 0)) {
-			printk("ocfs2: Heartbeat timed out, volume has been "
-			       "recovered from another node!\n");
+#if 0
+/* unused (for now) node map functions. */
 
-			BUG();
-		}
-		bh = NULL;
+/* uses the heartbeat api to test whether a given global node num is
+ * heartbeating. Warning: this function can sleep in
+ * hb_fill_node_map() */
+int ocfs2_is_node_alive(ocfs_super *osb,
+			unsigned int node_num)
+{
+	int ret;
+	ocfs_node_map tmpmap;
 
-		ocfs_nm_heart_beat (osb, HEARTBEAT_METHOD_DISK, 0);
+	ocfs_node_map_init(osb, &tmpmap);
 
-		/* release publish lock */
-		up (&(osb->publish_lock));
+	ret = hb_fill_node_map(osb->group_inode, &tmpmap, sizeof(tmpmap.map));
+	if (ret < 0) {
+		LOG_ERROR_STATUS(ret);
+		goto bail;
+	}
 
-		/* If another node was added to the config read and update the cfg */
-		node_cfg_hdr =
-			(ocfs_node_config_hdr *) osb->autoconfig_bhs[1]->b_data;
-		num_nodes = node_cfg_hdr->num_nodes;
-		cfg_seq_num = node_cfg_hdr->cfg_seq_num;
+	ret = ocfs_node_map_test_bit(osb, &tmpmap, node_num);
 
-		if ((osb->cfg_seq_num != cfg_seq_num) ||
-		    (osb->num_cfg_nodes != num_nodes)) {
-			down (&(osb->cfg_lock));
-			status = ocfs_chk_update_config (osb);
-			up (&(osb->cfg_lock));
-			if (status < 0)
-				LOG_ERROR_STATUS (status);
-		}
+bail:
+	return ret;
+}
 
-		num_nodes = osb->max_nodes;
+static int ocfs_node_map_stringify(ocfs_node_map *map, char **str)
+{
+	int i, n;
+	char *s;
 
-		/* Refresh the publish map */
-		ocfs_update_publish_map (osb, &(osb->autoconfig_bhs[OCFS_VOLCFG_NEWCFG_SECTORS]), 0);
+	OCFS_ASSERT(map->num_nodes > 0);
 
-		/* send signal to mount thread to continue */
-		if (atomic_read (&osb->nm_init) < OCFS_HEARTBEAT_INIT) {
-			atomic_inc (&osb->nm_init);
-		} else if (atomic_read(&osb->nm_init) == OCFS_HEARTBEAT_INIT) {
-			wake_up (&osb->nm_init_event);
-			atomic_inc (&osb->nm_init);
+	*str = kmalloc( strlen("123 ") * map->num_nodes, GFP_KERNEL);
+	if (!(*str))
+		return -ENOMEM;
+
+	memset(*str, 0, strlen("123 ") * map->num_nodes);
+
+	s = *str;
+	for (i=0; i<map->num_nodes; i++) {
+		if (ocfs_node_map_test_bit(map, i)) {
+			n = sprintf(s, "%3d ", i);
+			if (n != strlen("123 ")) {
+				kfree(*str);
+				return -ENOMEM;
+			}
+			s += n;
 		}
+	}
+	return 0;
+}
 
-		osb->hbt = OCFS_HEARTBEAT_JIFFIES + jiffies;
+void ocfs_node_map_and(ocfs_node_map *target, ocfs_node_map *mask)
+{
+	int num_longs, i; 
 
-again:
-		status = 0;
+	OCFS_ASSERT(target->num_nodes == mask->num_nodes);
+	OCFS_ASSERT(target->num_nodes > 0);
 	
-		if ((OcfsGlobalCtxt.flags & OCFS_FLAG_SHUTDOWN_VOL_THREAD) ||
-		    (osb->osb_flags & OCFS_OSB_FLAGS_BEING_DISMOUNTED))
-			break;
-		j = jiffies;
-		if (time_after (j, (unsigned long) (osb->hbt))) {
-			osb->hbt = OCFS_HEARTBEAT_JIFFIES + j;
-		}
-		set_current_state (TASK_INTERRUPTIBLE);
-		schedule_timeout (osb->hbt - j);
+	num_longs = BITS_TO_LONGS(target->num_nodes);
+	for (i=0; i<num_longs; i++)
+		target->map[i] &= mask->map[i];
+}
 
-		/* ignore the actual signal */
-		if (signal_pending(current)) {
-			dequeue_signal_lock(current, &current->blocked, &info);
-		}
+int ocfs_node_map_is_equal(ocfs_node_map *map1, ocfs_node_map *map2)
+{
+	int num_longs, i; 
+
+	OCFS_ASSERT(map1->num_nodes == map2->num_nodes);
+	OCFS_ASSERT(map1->num_nodes > 0);
+	
+	num_longs = BITS_TO_LONGS(map1->num_nodes);
+	for (i=0; i<num_longs; i++) {
+		if (map1->map[i] != map2->map[i])
+			return 0;
 	}
+	return 1;
+}
 
-	/* Flush all scheduled tasks */
-	flush_scheduled_work();
 
-	complete (&(osb->dlm_complete));
+// clear all the bits in "target" which are set in "mask"
+static void __ocfs_node_map_clear_bits(ocfs_node_map *target,
+				       ocfs_node_map *mask)
+{
+	int bit, prev=0;
+	while (1) {
+		bit = find_next_bit (mask->map, mask->num_nodes, prev);
+		if (bit >= mask->num_nodes)
+			break;
+		ocfs_node_map_clear_bit(target, bit);
+		prev = bit+1;
+	}
+}
 
-	LOG_EXIT_INT (0);
-	return 0;
-}				/* ocfs_heartbeat_thread */
+// set all the bits in "target" which are set in "mask"
+void __ocfs_node_map_set_bits(ocfs_node_map *target,
+			      ocfs_node_map *mask)
+{
+	int bit, prev=0;
+	while (1) {
+		bit = find_next_bit (mask->map, mask->num_nodes, prev);
+		if (bit >= mask->num_nodes)
+			break;
+		ocfs_node_map_set_bit(target, bit);
+		prev = bit+1;
+	}
+}
+#endif
+

Modified: trunk/src/heartbeat.h
===================================================================
--- trunk/src/heartbeat.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/heartbeat.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -26,9 +26,40 @@
 #ifndef OCFS2_HEARTBEAT_H
 #define OCFS2_HEARTBEAT_H
 
-int ocfs_nm_heart_beat(ocfs_super *osb, __u32 flag, int read_publish);
-void ocfs_update_publish_map(ocfs_super *osb, struct buffer_head *bhs[],
-			     int first_time);
-int ocfs_heartbeat_thread(void *arg);
+void ocfs2_init_node_maps(ocfs_super *osb);
 
+int ocfs2_register_hb_callbacks(ocfs_super *osb);
+void ocfs2_clear_hb_callbacks(ocfs_super *osb);
+
+/* node map functions - used to keep track of mounted and in-recovery
+ * nodes. */
+void ocfs_node_map_init(ocfs_super *osb, ocfs_node_map *map);
+int ocfs_node_map_is_empty(ocfs_super *osb,
+			   ocfs_node_map *map);
+void ocfs_node_map_set_bit(ocfs_super *osb,
+			   ocfs_node_map *map,
+			   int bit);
+void ocfs_node_map_clear_bit(ocfs_super *osb,
+			     ocfs_node_map *map,
+			     int bit);
+int ocfs_node_map_test_bit(ocfs_super *osb,
+			   ocfs_node_map *map,
+			   int bit);
+int ocfs_node_map_iterate(ocfs_super *osb,
+			  ocfs_node_map *map,
+			  int idx);
+static inline int ocfs_node_map_first_set_bit(ocfs_super *osb,
+					      ocfs_node_map *map)
+{
+	return ocfs_node_map_iterate(osb, map, 0);
+}
+void ocfs_recovery_map_set(ocfs_super *osb,
+			   int num);
+void ocfs_recovery_map_clear(ocfs_super *osb,
+			     int num);
+/* returns 1 if bit is the only bit set in target, 0 otherwise */
+int ocfs_node_map_is_only(ocfs_super *osb,
+			  ocfs_node_map *target,
+			  int bit);
+
 #endif /* OCFS2_HEARTBEAT_H */

Modified: trunk/src/inode.c
===================================================================
--- trunk/src/inode.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/inode.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -39,11 +39,10 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
-#include "dlm.h"
+#include "dlmglue.h"
 #include "extent_map.h"
 #include "file.h"
 #include "inode.h"
-#include "lockres.h"
 #include "namei.h"
 #include "suballoc.h"
 #include "super.h"
@@ -57,7 +56,6 @@
 
 #define OCFS_DEBUG_CONTEXT    OCFS_DEBUG_CONTEXT_INODE
 
-extern struct semaphore recovery_list_sem;
 extern struct address_space_operations ocfs_aops;
 
 typedef struct _ocfs_find_inode_args
@@ -307,27 +305,22 @@
 
 	i->ip_flags = 0;
 	atomic_set(&i->ip_clean_buffer_seq, 0);
-	init_rwsem(&(i->ip_node_extend_sem));
 	i->ip_open_cnt = 0;
 	spin_lock_init(&i->ip_lock);
 	ocfs2_extent_map_init(inode);
-	INIT_LIST_HEAD(&i->ip_recovery_list);
 	INIT_LIST_HEAD(&i->ip_handle_list);
 	i->ip_handle = NULL;
 	i->ip_next_orphan = NULL;
 
 	init_rwsem(&i->ip_alloc_sem);
 	init_MUTEX(&(i->ip_io_sem));
-	atomic_set(&i->ip_needs_verification, 0);
-	INIT_LIST_HEAD(&i->ip_pending_locks);
-	INIT_LIST_HEAD(&i->ip_j_inode);
 
 	/* These should be set in read_inode2. */
 	i->ip_clusters = 0;
 	i->ip_blkno = 0ULL;
 	i->ip_mmu_private = 0ULL;
 
-	OCFS_SET_FLAG (i->ip_flags, OCFS_INODE_INITIALIZED);
+	i->ip_flags |= OCFS_INODE_INITIALIZED;
 	return 0;
 } /* ocfs_inode_init_private */
 
@@ -405,7 +398,10 @@
 	inode->i_nlink = fe->i_links_count;
 
 	if (le32_to_cpu(fe->i_flags) & OCFS2_LOCAL_ALLOC_FL) {
+		OCFS_I(inode)->ip_flags |= OCFS_INODE_BITMAP;
 		LOG_TRACE_ARGS("local alloc inode: i_ino=%lu\n", inode->i_ino);
+	} else if (le32_to_cpu(fe->i_flags) & OCFS2_BITMAP_FL) {
+		OCFS_I(inode)->ip_flags |= OCFS_INODE_BITMAP;
 	} else if (le32_to_cpu(fe->i_flags) & OCFS2_SUPER_BLOCK_FL) {
 		LOG_TRACE_ARGS("superblock inode: i_ino=%lu\n", inode->i_ino);
 		// we can't actually hit this as read_inode can't handle
@@ -439,7 +435,16 @@
 		    break;
 	}
 
-	status = 0;
+	status = ocfs2_inode_lock_res_init(&OCFS_I(inode)->ip_meta_lockres,
+					   OCFS_TYPE_META, inode);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	status = ocfs2_inode_lock_res_init(&OCFS_I(inode)->ip_data_lockres,
+					   OCFS_TYPE_DATA, inode);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
 bail:
 	LOG_EXIT_STATUS (status);
 	return status;
@@ -502,14 +507,8 @@
 		BUG();
 
 	if (sysfile)
-		OCFS_SET_FLAG(OCFS_I(inode)->ip_flags, OCFS_INODE_SYSTEM_FILE);
+	       OCFS_I(inode)->ip_flags |= OCFS_INODE_SYSTEM_FILE;
 
-	ocfs_init_lockres (osb, inode);
-	status = ocfs_update_lockres(osb, bh, inode, 0);
-	if (status < 0) {
-		make_bad_inode(inode);
-		goto bail;
-	}
 	status = 0;
 
 bail:
@@ -613,31 +612,51 @@
 		goto bail;
 	}
 
+	spin_lock(&OCFS_I(inode)->ip_lock);
 	if (OCFS_I(inode)->ip_flags & OCFS_INODE_SKIP_DELETE) {
+		spin_unlock(&OCFS_I(inode)->ip_lock);
 		LOG_TRACE_ARGS("Skipping delete of %lu because another node "
 			       "has done this for us.\n", inode->i_ino);
 		goto bail;
 	}
+	spin_unlock(&OCFS_I(inode)->ip_lock);
 
 	/* If we're coming from process_vote we can't go into our own
 	 * voting [hello, deadlock city!], so unforuntately we just
 	 * have to skip deleting this guy. That's OK though because
 	 * the node who's doing the actual deleting should handle it
 	 * anyway. */
-	if (osb->voting_ino == inode->i_ino) {
+	if (current == osb->vote_task) {
 		LOG_TRACE_ARGS("Skipping delete of %lu because we're currently"
 			       "in process_vote\n", inode->i_ino);
 		goto bail;
 	}
 
-	/* acquire_lock and friends will igrab / iput this guy, so we
+	/* ocfs2_meta_lock and friends might igrab / iput this guy, so we
 	 * take an extra ref. to avoid recursive calls to
 	 * delete_inode. */
 	atomic_inc(&inode->i_count);
-	status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, FLAG_FILE_DELETE, 
-				   &fe_bh, inode);
-	atomic_set(&inode->i_count, 0);
+	status = ocfs2_meta_lock(inode, NULL, &fe_bh, 1);
+	atomic_dec(&inode->i_count);
 	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	/* While we were waiting for the lock, another node might have
+	 * asked to delete the inode. Recheck our flags to catch this
+	 * race and just clear_inode instead.*/
+	spin_lock(&OCFS_I(inode)->ip_lock);
+	if (OCFS_I(inode)->ip_flags & OCFS_INODE_SKIP_DELETE) {
+		spin_unlock(&OCFS_I(inode)->ip_lock);
+		LOG_TRACE_ARGS("Skipping delete of %lu because another node "
+			       "has done this for us.\n", inode->i_ino);
+		goto bail;
+	}
+	spin_unlock(&OCFS_I(inode)->ip_lock);
+
+	status = ocfs2_request_delete_vote(inode);
+	if (status < 0) {
 		/* EBUSY here is assumed to mean that other nodes are
 		 * still using the inode. We're done here though, so
 		 * avoid doing anything on disk and let them worry
@@ -685,14 +704,11 @@
 		goto bail;
 	}
 	ocfs_handle_add_inode(handle, orphan_dir_inode);
-	status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 0,
-				   &orphan_dir_bh, orphan_dir_inode);
+	status = ocfs2_meta_lock(orphan_dir_inode, handle, &orphan_dir_bh, 1);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
 		goto bail;
 	}
-	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0,
-			     orphan_dir_inode);
 
 	/* we do this while holding the orphan dir lock because we
 	 * don't want recovery being run from another node to vote for
@@ -711,14 +727,11 @@
 		goto bail;
 	}
 	ocfs_handle_add_inode(handle, inode_alloc_inode);
-	status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 0,
-				   &inode_alloc_bh, inode_alloc_inode);
+	status = ocfs2_meta_lock(inode_alloc_inode, handle, &inode_alloc_bh, 1);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
 		goto bail;
 	}
-	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0,
-			     inode_alloc_inode);
 
 	handle = ocfs_start_trans(osb, handle, OCFS_DELETE_INODE_CREDITS);
 	if (handle == NULL) {
@@ -785,6 +798,7 @@
  */
 void ocfs_clear_inode (struct inode *inode)
 {
+	int status;
 	ocfs_super *osb;
 
 	LOG_SET_CONTEXT(CLEAR_INODE);
@@ -805,7 +819,7 @@
 		goto bail;
 	}
 
-	OCFS_CLEAR_FLAG (OCFS_I(inode)->ip_flags, OCFS_INODE_INITIALIZED);
+	OCFS_I(inode)->ip_flags &= ~OCFS_INODE_INITIALIZED;
 
 	if (OCFS_I(inode)->ip_blkno == -1)
 		BUG();
@@ -819,10 +833,12 @@
 
 	ocfs2_extent_map_drop(inode, 0);
 
-	down(&recovery_list_sem);
-	list_del(&OCFS_I(inode)->ip_recovery_list);
-	up(&recovery_list_sem);
+	status = ocfs2_drop_inode_locks(inode);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
 
+	ocfs2_lock_res_free(&OCFS_I(inode)->ip_meta_lockres);
+	ocfs2_lock_res_free(&OCFS_I(inode)->ip_data_lockres);
 	/* clean out the inode private ... why?! */
 	memset(inode->u.generic_ip, 0, sizeof(ocfs_inode_private));
 
@@ -904,7 +920,6 @@
 	struct inode *inode = dentry->d_inode;
 	int status = 0;
 	ocfs_super *osb;
-	ocfs_lock_res *lockres;
 
 	LOG_SET_CONTEXT(REVALIDATE);
 
@@ -928,28 +943,13 @@
 	}
 	spin_unlock(&OCFS_I(inode)->ip_lock);
 
-	if (ocfs_node_map_is_only(osb, &osb->publ_map, osb->node_num)) {
-		LOG_TRACE_STR ("Only node alive.");
+	status = ocfs2_meta_lock(inode, NULL, NULL, 0);
+	if (status < 0) {
+		if (status != -EINTR)
+			LOG_ERROR_STATUS(status);
 		goto bail;
 	}
-
-	lockres = GET_INODE_LOCKRES(inode);
-	/* if I hold cache lock, no revalidate needed */
-	ocfs_acquire_lockres_read(inode);
-	if (ocfs_is_local_cache_lock(osb, inode)) {
-		ocfs_release_lockres_read(inode);
-		LOG_TRACE_STR("local cache lock\n");
-		goto bail;
-	}
-	ocfs_release_lockres_read(inode);
-
-	atomic_set(&OCFS_I(inode)->ip_needs_verification, 1);
-	status = ocfs_verify_update_inode(osb, inode);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		status = -ENOENT;
-	}
-
+	ocfs2_meta_unlock(inode, 0);
 bail:
 	LOG_EXIT_STATUS(status);
 
@@ -998,10 +998,8 @@
 //	fe->i_generation = inode->i_generation;
 
 	status = ocfs_journal_dirty(handle, bh);
-	if (status < 0) {
+	if (status < 0)
 		LOG_ERROR_STATUS(status);
-		goto leave;
-	}
 
 	status = 0;
 leave:
@@ -1016,156 +1014,30 @@
  * Updates a struct inode from a disk inode.
  * does no i/o, only takes ip_lock. 
  */
-int ocfs_refresh_inode(struct inode *inode, 
-		       ocfs2_dinode *fe)
+void ocfs_refresh_inode(struct inode *inode, 
+			ocfs2_dinode *fe)
 {
-	int status = 0;
-	int drop_map = 0;
+	ocfs_inode_private *oip = OCFS_I(inode);
 	ocfs_super *osb = OCFS2_SB(inode->i_sb);
 
-	spin_lock(&OCFS_I(inode)->ip_lock);
+	spin_lock(&oip->ip_lock);
 
-	if (INODE_DELETED(inode)) {
-		LOG_TRACE_ARGS("Inode %llu was marked as deleted!", 
-			       OCFS_I(inode)->ip_blkno);
-		status = -ENOENT;
-		goto bail;
+	oip->ip_clusters = fe->i_clusters;
+	inode->i_size = fe->i_size;
+	if (S_ISREG(inode->i_mode)) {
+		oip->ip_mmu_private = inode->i_size;
 	}
+	inode->i_nlink = fe->i_links_count;
+	inode->i_blocks = (inode->i_size + osb->sb->s_blocksize - 1) >> osb->sb->s_blocksize_bits;
+	inode->i_uid = fe->i_uid;
+	inode->i_gid = fe->i_gid;
+	inode->i_mode = fe->i_mode;
+	inode->i_blksize = (__u32) osb->s_clustersize;
+	OCFS_SET_INODE_TIME(inode, i_ctime, fe->i_ctime);
+	OCFS_SET_INODE_TIME(inode, i_atime, fe->i_atime);
+	OCFS_SET_INODE_TIME(inode, i_mtime, fe->i_mtime);
 
-	/* Add checks as needed */
-	if ((fe->i_dtime) || (!(fe->i_flags & OCFS2_VALID_FL))) {
-		if (fe->i_dtime)
-			LOG_ERROR_ARGS("Inode %lu has dtime = %llu\n", 
-				       inode->i_ino, fe->i_dtime);
-		else
-			LOG_TRACE_STR ("File Entry is invalid");
-
-		status = -ENOENT;
-		goto bail;
-	}
-
-	if (inode->i_generation != le32_to_cpu(fe->i_generation)) {
-		LOG_ERROR_ARGS("Inode %llu is stale! (%u, %u)\n",
-			       OCFS_I(inode)->ip_blkno, 
-			       inode->i_generation,
-			       le32_to_cpu(fe->i_generation));
-		SET_INODE_DELETED(inode);
-		status = -ENOENT;
-		goto bail;
-	}
-
-	if ((OCFS_I(inode)->ip_clusters != fe->i_clusters) ||
-	    (inode->i_size != fe->i_size) ||
-	    inode->i_uid != fe->i_uid ||
-	    inode->i_gid != fe->i_gid || 
-	    inode->i_mode != fe->i_mode ||
-	    inode->i_nlink != fe->i_links_count){
-
-		if (OCFS_I(inode)->ip_clusters > fe->i_clusters) {
-			LOG_TRACE_ARGS("destroying extent maps for %llu, "
-				       "ip_clusters = %u, i_clusters = %u\n", 
-				       OCFS_I(inode)->ip_blkno, 
-				       OCFS_I(inode)->ip_clusters, 
-				       fe->i_clusters);
-			drop_map = 1; /* Because we have the lock here */
-		}
-
-		LOG_TRACE_STR("Allocsize, filesize or seq no did not match");
-		OCFS_I(inode)->ip_clusters = fe->i_clusters;
-		inode->i_size = fe->i_size;
-		if (S_ISREG(inode->i_mode)) {
-			OCFS_I(inode)->ip_mmu_private = inode->i_size;
-		}
-		LOG_TRACE_ARGS("verifyupdate: setting nlink from %d to %d for %llu\n", 
-			       inode->i_nlink, fe->i_links_count,
-			       OCFS_I(inode)->ip_blkno);
-		inode->i_nlink = fe->i_links_count;
-		inode->i_blocks = (inode->i_size + osb->sb->s_blocksize - 1) >> osb->sb->s_blocksize_bits;
-		inode->i_uid = fe->i_uid;
-		inode->i_gid = fe->i_gid;
-		inode->i_mode = fe->i_mode;
-		inode->i_blksize = (__u32) osb->s_clustersize;
-		OCFS_SET_INODE_TIME(inode, i_ctime, fe->i_ctime);
-		OCFS_SET_INODE_TIME(inode, i_atime, fe->i_atime);
-		OCFS_SET_INODE_TIME(inode, i_mtime, fe->i_mtime);
-
-		if (S_ISCHR(fe->i_mode) ||
-	       	    S_ISBLK(fe->i_mode) ||
-	      	    S_ISFIFO(fe->i_mode) ||
-	     	    S_ISSOCK(fe->i_mode)) {
-			inode->i_rdev = 0;
-			init_special_inode(inode, inode->i_mode,
-					   huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)));
-		}
-	}
-
-	atomic_set(&OCFS_I(inode)->ip_needs_verification, 0);
-bail:
-
-	spin_unlock(&OCFS_I(inode)->ip_lock);
-
-	if (drop_map)
-		ocfs2_extent_map_trunc(inode, fe->i_clusters);
-
-	return(status);
+	spin_unlock(&oip->ip_lock);
 }				/* ocfs_refresh_inode */
 
-/*
- * ocfs_verify_update_inode()
- */ 
-int ocfs_verify_update_inode (ocfs_super * osb, struct inode * inode)
-{
-	int status = 0;
-	struct buffer_head *fe_bh = NULL;
-	ocfs2_dinode *fe;
 
-	/* We are setting the oin Updated flag in the end. */
-	LOG_ENTRY ();
-
-	OCFS_ASSERT (inode);
-
-	if (OCFS_I(inode)->ip_blkno == 0) {
-		LOG_ERROR_ARGS("inode 0x%lu has zero blkno\n", inode->i_ino);
-		status = -EINVAL;
-		goto leave;
-	}
-
-	spin_lock(&OCFS_I(inode)->ip_lock);
-	if (INODE_DELETED(inode)) {
-		spin_unlock(&OCFS_I(inode)->ip_lock);
-		LOG_TRACE_ARGS("Inode %llu was marked as deleted!", 
-			       OCFS_I(inode)->ip_blkno);
-		status = -ENOENT;
-		goto leave;
-	}
-	spin_unlock(&OCFS_I(inode)->ip_lock);
-
-	status = ocfs_read_block(osb, OCFS_I(inode)->ip_blkno, &fe_bh, 
-				 OCFS_BH_CACHED, inode);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto leave;
-	}
-
-	fe = (ocfs2_dinode *) fe_bh->b_data;
-
-	status = ocfs_refresh_inode(inode, fe);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto leave;
-	}
-
-	ocfs_acquire_lockres_write(inode);
-	status = ocfs_update_lockres (osb, fe_bh, inode, 0);
-	ocfs_release_lockres_write(inode);
-
-	status = 0;
-leave:
-
-	if (fe_bh)
-		brelse(fe_bh);
-
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_verify_update_inode */
-

Modified: trunk/src/inode.h
===================================================================
--- trunk/src/inode.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/inode.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -41,9 +41,8 @@
 ssize_t ocfs_rw_direct(int rw, struct file *filp, char *buf,
 		       size_t size, loff_t *offp);
 void ocfs_sync_blockdev(struct super_block *sb);
-int ocfs_verify_update_inode(ocfs_super *osb, struct inode *inode);
-int ocfs_refresh_inode(struct inode *inode, 
-		       ocfs2_dinode *fe);
+void ocfs_refresh_inode(struct inode *inode, 
+			ocfs2_dinode *fe);
 int ocfs_mark_inode_dirty(ocfs_journal_handle *handle, 
 			  struct inode *inode, 
 			  struct buffer_head *bh);

Modified: trunk/src/journal.c
===================================================================
--- trunk/src/journal.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/journal.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -35,14 +35,14 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
-#include "dlm.h"
+#include "dlmglue.h"
 #include "extent_map.h"
+#include "heartbeat.h"
 #include "inode.h"
 #include "journal.h"
 #include "localalloc.h"
-#include "lockres.h"
 #include "namei.h"
-#include "nm.h"
+#include "slot_map.h"
 #include "super.h"
 #include "util.h"
 #include "vote.h"
@@ -55,124 +55,42 @@
 
 spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED;
 
-static int ocfs_reset_publish (ocfs_super * osb, __u64 node_num);
 static int ocfs_force_read_journal(struct inode *inode);
 static int ocfs_recover_node(struct _ocfs_super *osb, int node_num);
 static int __ocfs_recovery_thread(void *arg);
 static int ocfs_commit_cache (ocfs_super * osb);
 static int ocfs_wait_on_mount(ocfs_super *osb);
-static void ocfs_handle_move_locks(ocfs_journal *journal, 
-				   ocfs_journal_handle *handle);
-static void ocfs_journal_optimize_lock_list(ocfs_journal *journal);
+static int ocfs2_commit_thread_sleep(ocfs_super *osb);
+static void ocfs_handle_cleanup_locks(ocfs_journal *journal, 
+				      ocfs_journal_handle *handle,
+				      int set_id);
 static void ocfs_commit_unstarted_handle(ocfs_journal_handle *handle);
+static int ocfs_journal_toggle_dirty(ocfs_super *osb,
+				     int dirty);
+static int ocfs2_trylock_journal(ocfs_super *osb,
+				 int slot_num);
 
-static void ocfs_journal_optimize_lock_list(ocfs_journal *journal)
-{
-	ocfs_journal_lock *lock = NULL;
-	ocfs_journal_lock *first = NULL; /* 1st update_oin release for
-					  * an inode. */
-	struct inode *curr_inode = NULL;
-	struct list_head *p, *n;
-
-	/* 1st pass: optimize away all UPDATE_OIN messages into one.
-	 * 2 types of UPDATE_OIN messages as of now: with size change
-	 * (truncate / extend) or without size change. We will take
-	 * *all* of them (regardless of type) and cram it into one
-	 * message. */
-	curr_inode = NULL;
-	list_for_each_safe(p, n, &journal->checkpointing_locks) {
-		lock = list_entry(p, ocfs_journal_lock, lock_list);
-
-		if (lock->inode != curr_inode) {
-			/* Ok, new inode. */
-			first = NULL;
-
-			curr_inode = lock->inode;
-		}
-
-		/* if it's not an update oin then just continue */
-		if (!(lock->flags & FLAG_FILE_UPDATE_OIN))
-			continue;
-
-		if (!first) {
-			first = lock;
-			/* nothing else to do to the 1st update release. */
-			continue;
-		}
-
-		/* only the 1st update_oin lock stays -- the
-		 * rest are either num_ident into it or
-		 * dropped */
-		list_del(&lock->lock_list);
-		atomic_dec(&journal->num_chkpt_locks);
-
-		/* we only incement first->num_ident if the current
-		 * lock has a size change AND first has recorded at
-		 * least one size change (which can include
-		 * itself). Otherwise we'll have an off by one as a
-		 * first with no size change recorded will keep at
-		 * least 1 reference for itself. */
-		if ((lock->flags & FLAG_FILE_EXTEND) 
-		    || (lock->flags & FLAG_FILE_TRUNCATE)) {
-			if ((first->flags & FLAG_FILE_EXTEND) 
-			    || (first->flags & FLAG_FILE_TRUNCATE))
-				first->num_ident++;
-			else
-				first->drop_holders++;
-		} else {
-			/* Ok, no size change on this particular lock,
-			 * so we're discarding it without updating the
-			 * num_ident value -- that's fine but we need
-			 * to manually drop the lockres->lock_holders
-			 * value on it's behalf */
-			first->drop_holders++;
-		}
-
-		/* record a size change in first if there was one. */
-		if (lock->flags & FLAG_FILE_EXTEND)
-			first->flags = first->flags | FLAG_FILE_EXTEND;
-		if (lock->flags & FLAG_FILE_TRUNCATE)
-			first->flags = first->flags | FLAG_FILE_TRUNCATE;
-
-		iput(lock->inode);
-
-		kmem_cache_free(OcfsGlobalCtxt.lock_cache, lock);
-	}
-
-	return;
-}
-
 /* 
  * ocfs_commit_cache()
- *
- * This is in journal.c for lack of a better place.
- *
  */
 static int ocfs_commit_cache(ocfs_super *osb)
 {
-	int status = 0, tmpstat;
-	unsigned int flushed  = 0;
-	unsigned int cmt_locks;
+	int status = 0;
+	unsigned int flushed;
+	unsigned long old_id;
 	ocfs_journal * journal = NULL;
-	struct list_head *p, *n;
-	ocfs_journal_lock *lock = NULL;
-	struct inode *inode;
-	ocfs_inode_private *ip;
-	ocfs_lock_res *lockres;
 
 	LOG_ENTRY();
 
 	journal = osb->journal;
 
-	/* Step 1: flush all pending commits and checkpoint the journal. */
+	/* Flush all pending commits and checkpoint the journal. */
 	down_write(&journal->trans_barrier);
 
 	if (atomic_read(&journal->num_trans) == 0) {
+		up_write(&journal->trans_barrier);
 		LOG_TRACE_STR("No transactions for me to flush!");
-		/* now, we may have locks left to drop even though no
-		 * transactions are in the journal. */
-
-		goto drop_locks;
+		goto finally;
 	}
 
 	journal_lock_updates(journal->k_journal);
@@ -184,111 +102,18 @@
 		goto finally;
 	}
 
-	ocfs_inc_trans_id(journal);
+	old_id = ocfs_inc_trans_id(journal);
 
-drop_locks:
 	flushed = atomic_read(&journal->num_trans);
 	atomic_set(&journal->num_trans, 0);
-
-	/* Step 2: Drop any locks acquired during transactions which
-	 * have just been checkpointed.  */
-	spin_lock(&journal->cmt_lock);
-
-	cmt_locks = atomic_read(&journal->num_cmt_locks);
-
-	atomic_add(atomic_read(&journal->num_cmt_locks),
-		   &journal->num_chkpt_locks);
-	atomic_set(&journal->num_cmt_locks, 0);
-
-	/* move the locks off each inode onto the commit threads list. */
-	list_for_each_safe(p, n, &journal->committing_inodes) {
-		ip = list_entry(p, ocfs_inode_private, ip_j_inode);
-		inode = ip->ip_inode;
-
-		if (!list_empty(&OCFS_I(inode)->ip_pending_locks))
-			list_splice_init(&OCFS_I(inode)->ip_pending_locks,
-					(&journal->checkpointing_locks)->prev);
-
-		/* we can now remove the inode from the committing
-		 * list. */
-		list_del_init(&OCFS_I(inode)->ip_j_inode);
-	}
-	osb->needs_flush = 0;
-
-	spin_unlock(&journal->cmt_lock);
-
-	/* TODO: Can we assert this anymore and move these lock
-	 * releases back up?  
-	 *
-	 * Once we've got cmt_lock, we can let
-	 * transactions start again -- it should protect us against
-	 * people mucking with the committed list... */
 	up_write(&journal->trans_barrier);
 
 #ifdef VERBOSE_COMMIT_THREAD
-	if (flushed || cmt_locks)
-		printk("(%u) commit_thread: flushed %u transactions, "
-		       "releasing %u locks\n", current->pid, flushed, 
-		       cmt_locks);
+	printk("(%u) commit_thread: flushed transaction %lu (%u handles)\n",
+	       current->pid, journal->trans_id, flushed);
 #endif
 
-	ocfs_journal_optimize_lock_list(journal);
-#ifdef VERBOSE_COMMIT_THREAD
-	if (flushed || cmt_locks)
-		printk("(%u) commit_thread: after optimization, %u locks "
-		       "to release\n", current->pid, 
-		       atomic_read(&journal->num_chkpt_locks));
-#endif
-
-	p = n = NULL;
-	list_for_each_safe(p, n, &journal->checkpointing_locks) {
-		if (!atomic_read(&journal->num_chkpt_locks))
-			BUG();
-
-		lock = list_entry(p, ocfs_journal_lock, lock_list);
-
-		list_del(&(lock->lock_list));
-
-		if (!lock->inode)
-			BUG();
-
-#if 0
-		/* enable this for tons of output, which will likely
-		 * hang your box :) */
-		printk("commit_thread: release lock %u (inode %llu)\n", 
-		       atomic_read(&journal->num_chkpt_locks),
-		       OCFS_I(lock->inode)->ip_blkno);
-#endif
-		tmpstat = 0;
-		if (!INODE_DELETED(lock->inode))
-			tmpstat = ocfs_release_lock_full(osb, 
-							 lock->type,
-							 lock->flags,
-							 lock->inode,
-							 lock->num_ident);
-		else
-			LOG_ERROR_ARGS("commit_thread: Skipping release for "
-				       "inode %llu!\n", 
-				       OCFS_I(lock->inode)->ip_blkno);
-		if (tmpstat < 0)
-			LOG_ERROR_ARGS("commit_thread: release_lock status is"
-				       " %d releasing lock on inode %llu!\n",
-				       tmpstat, OCFS_I(lock->inode)->ip_blkno);
-
-		if (lock->drop_holders) {
-			lockres = GET_INODE_LOCKRES(lock->inode);
-			ocfs_acquire_lockres_write(lock->inode);
-			OCFS_ASSERT(lockres->lock_holders >= lock->drop_holders);
-			lockres->lock_holders -= lock->drop_holders;
-			ocfs_release_lockres_write(lock->inode);
-		}
-
-		iput(lock->inode);
-
-		atomic_dec(&journal->num_chkpt_locks);
-		kmem_cache_free(OcfsGlobalCtxt.lock_cache, lock);
-	}
-
+	ocfs2_kick_vote_thread(osb);
 finally:
 	LOG_EXIT_STATUS (status);
 	return status;
@@ -384,7 +209,7 @@
 {
 	ocfs_journal_handle * retval = NULL;
 
-	retval = ocfs_malloc(sizeof(*retval));
+	retval = kmalloc(sizeof(*retval), GFP_KERNEL);
 	if (!retval) {
 		LOG_ERROR_STR("Failed to allocate memory for journal handle!");
 		return(NULL);
@@ -534,10 +359,7 @@
 	/* You are allowed to add journal locks before the transaction
 	 * has started. */
 	osb = handle->osb;
-	ocfs_handle_move_locks(osb->journal, handle);
-	spin_lock(&osb->journal->cmt_lock);
-	osb->needs_flush = 1;
-	spin_unlock(&osb->journal->cmt_lock);
+	ocfs_handle_cleanup_locks(osb->journal, handle, 0);
 
 	kfree(handle);
 	LOG_EXIT();
@@ -587,10 +409,7 @@
 		BUG();
 	}
 
-	ocfs_handle_move_locks(osb->journal, handle);
-	spin_lock(&osb->journal->cmt_lock);
-	osb->needs_flush = 1;
-	spin_unlock(&osb->journal->cmt_lock);
+	ocfs_handle_cleanup_locks(osb->journal, handle, 1);
 
 	up_read(&journal->trans_barrier);
 
@@ -612,8 +431,9 @@
  * during the transaction, so make sure they were taken *before*
  * start_trans or we'll have ordering deadlocks. 
  *
- * This function would be alot simpler if we didn't have to worry
- * about abort. 
+ * WARNING2: Note that we do *not* drop trans_barrier here. This is
+ * good because transaction ids haven't yet been recorded on the
+ * cluster locks associated with this handle.
  */
 int ocfs_extend_trans(ocfs_journal_handle *handle, int nblocks)
 {
@@ -732,90 +552,65 @@
 	return(status);
 } /* ocfs_journal_dirty */
 
-void ocfs_handle_add_lock(ocfs_journal_handle *handle, __u32 type, 
-			  __u32 flags, struct inode *inode)
+/* We always assume you're adding a metadata lock at level 'ex' */
+int ocfs_handle_add_lock(ocfs_journal_handle *handle,
+			 struct inode *inode)
 {
+	int status;
 	ocfs_journal_lock *lock;
 
 	OCFS_ASSERT(inode);
-
-	LOG_ENTRY_ARGS("(inode=%llu, type=%u, flags=%u)\n",
-			OCFS_I(inode)->ip_blkno, type, flags);
-
 	lock = kmem_cache_alloc(OcfsGlobalCtxt.lock_cache, GFP_NOFS);
-	if (lock == NULL) {
-		LOG_ERROR_STR("Out of memory -- cannot add lock to release.");
+	if (!lock) {
+		status = -ENOMEM;
 		LOG_ERROR_STATUS(-ENOMEM);
-
-		BUG();
+		goto bail;
 	}
 
-	lock->type  = type;
-	lock->flags = flags;
-	lock->inode = inode;
-
-	/* stuff for commit thread optimization. */
-	lock->num_ident = 1;
-	/* this is for *additional* decrements of lock_holders, not
-	 * the one given by ocfs_release_lock... */
-	lock->drop_holders = 0;
-
 	if (!igrab(inode))
 		BUG();
+	lock->jl_inode = inode;
 
-	list_add_tail(&(lock->lock_list), &(handle->locks));
+	list_add_tail(&(lock->jl_lock_list), &(handle->locks));
 	handle->num_locks++;
-	spin_lock(&handle->journal->cmt_lock);
-	atomic_inc(&handle->journal->num_cmt_locks);
-	spin_unlock(&handle->journal->cmt_lock);
 
-	LOG_EXIT();
-	return;
+	status = 0;
+bail:
+	LOG_EXIT_STATUS(status);
+	return status;
 }
 
-/* move the locks off a journal handle and onto the
- * inode->ip_pending_locks. makes sure the inodes are on
- * journal->committing_inodes so that the commit thread can get them
- * after checkpoint.
- *
- * You want to do this before dropping trans_barrier to prevent the commit
- * thread from missing the locks. 
- * 
- * TODO: When we get rid of the last checkpointed transactions, we can
- * just put locks right on the inode in ocfs_handle_add_lock...
- */
-static void ocfs_handle_move_locks(ocfs_journal *journal, 
-				   ocfs_journal_handle *handle)
+static void ocfs_handle_cleanup_locks(ocfs_journal *journal, 
+				      ocfs_journal_handle *handle,
+				      int set_id)
 {
 	struct list_head *p, *n;
 	ocfs_journal_lock *lock;
 	struct inode *inode;
 
 	list_for_each_safe(p, n, &(handle->locks)) {
-		lock = list_entry(p, ocfs_journal_lock, lock_list);
-		list_del(&lock->lock_list);
+		lock = list_entry(p, ocfs_journal_lock, jl_lock_list);
+		list_del(&lock->jl_lock_list);
 		handle->num_locks--;
 
-		inode = lock->inode;
-
-		spin_lock(&journal->cmt_lock);
-		/* add the lock to the inode */
-		list_add_tail(&lock->lock_list, 
-			      &OCFS_I(inode)->ip_pending_locks);
-		/* and make sure the inode is on the journals list */
-		if (list_empty(&OCFS_I(inode)->ip_j_inode))
-			list_add_tail(&OCFS_I(inode)->ip_j_inode, 
-				      &journal->committing_inodes);
-		spin_unlock(&journal->cmt_lock);
+		inode = lock->jl_inode;
+		if (set_id)
+			ocfs_set_inode_lock_trans(journal, inode);
+		ocfs2_meta_unlock(inode, 1);
+		if (atomic_read(&inode->i_count) == 1)
+			LOG_ERROR_ARGS("Inode %llu, I'm doing a last iput "
+				       "for!", OCFS_I(inode)->ip_blkno);
+		iput(inode);
+		kmem_cache_free(OcfsGlobalCtxt.lock_cache, lock);
 	}
-	return;
 }
 
 #define OCFS_DEFAULT_COMMIT_INTERVAL 	(HZ * 5)
 /* 
  * Setup the journal using the journal system file
  */
-int ocfs_journal_init(ocfs_super *osb) 
+int ocfs_journal_init(ocfs_super *osb,
+		      int *dirty) 
 {
 	int status = -1;
 	struct inode *inode = NULL; /* the journal inode */
@@ -828,10 +623,9 @@
 	if (!osb)
 		BUG();
 
-	spin_lock_init(&(osb->journal->cmt_lock));
-
 	/* already have the inode for our journal */
-	inode = ocfs_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, osb->node_num);
+	inode = ocfs_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, 
+					   osb->slot_num);
 	if (inode == NULL) {
 		LOG_ERROR_STR("access error");
 		status = -EACCES;
@@ -847,8 +641,7 @@
 
 	SET_INODE_JOURNAL(inode);
 
-	status = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE,
-				    0, &bh, inode);
+	status = ocfs2_meta_lock(inode, NULL, &bh, 1);
 	if (status < 0) {
 		if (status != -EINTR)
 			LOG_ERROR_STR("Could not get lock on journal!");
@@ -893,21 +686,16 @@
 
 	/* yay, pass the proper info back to our journal structure. */
 	osb->journal->osb = osb;
-	/* eventually this will be a value passed into us */
-	osb->journal->node_num = osb->node_num;
 	osb->journal->k_journal = k_journal;
 	osb->journal->k_inode = inode;
 	osb->journal->version = OCFS_JOURNAL_CURRENT_VERSION;
 	osb->journal->lockbh = bh;
 	atomic_set(&(osb->journal->num_trans), 0);
-	atomic_set(&(osb->journal->num_cmt_locks), 0);
-	atomic_set(&(osb->journal->num_chkpt_locks), 0);
 	init_rwsem(&(osb->journal->trans_barrier));
 	osb->journal->state = OCFS_JOURNAL_LOADED;
 	osb->journal->trans_id = (unsigned long) 1;
-	INIT_LIST_HEAD(&(osb->journal->committing_inodes));
-	INIT_LIST_HEAD(&(osb->journal->checkpointing_locks));
 
+	*dirty = (fe->id1.journal1.i_flags & OCFS2_JOURNAL_DIRTY_FL);
 	status = 0;
 done:
 	if (status < 0) {
@@ -924,8 +712,34 @@
 	return(status);
 } /* ocfs_journal_init */
 
+static int ocfs_journal_toggle_dirty(ocfs_super *osb,
+				     int dirty)
+{
+	int status;
+	ocfs_journal * journal = osb->journal;
+	struct buffer_head *bh = journal->lockbh;
+	ocfs2_dinode *fe;
+
+	LOG_ENTRY();
+
+	fe = (ocfs2_dinode *) bh->b_data;
+	OCFS_ASSERT(IS_VALID_FILE_ENTRY(fe));
+
+	if (dirty)
+		fe->id1.journal1.i_flags |= OCFS2_JOURNAL_DIRTY_FL;
+	else
+		fe->id1.journal1.i_flags &= ~OCFS2_JOURNAL_DIRTY_FL;
+
+	status = ocfs_write_block(osb, bh, journal->k_inode);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+
+	LOG_EXIT_STATUS(status);
+	return status;
+}
+
 /*
-  if the journal has been ocfs_malloc'd it needs to be freed after this call.
+  if the journal has been kmalloc'd it needs to be freed after this call.
 */
 void ocfs_journal_shutdown(ocfs_super *osb) 
 {
@@ -968,8 +782,8 @@
 	if (osb->commit && osb->commit->c_task) {
 		/* Wait for the commit thread */
 		LOG_TRACE_STR ("Waiting for ocfs2commit to exit....");
-		atomic_set (&osb->flush_event_woken, 1);
-		wake_up (&osb->flush_event);
+		atomic_set (&osb->needs_checkpoint, 1);
+		wake_up (&osb->checkpoint_event);
 		wait_for_completion(&osb->commit->c_complete);
 		osb->commit->c_task = NULL;
 		kfree(osb->commit);
@@ -977,15 +791,17 @@
 
 	OCFS_ASSERT(atomic_read(&(osb->journal->num_trans)) == 0);
 
+	status = ocfs_journal_toggle_dirty(osb, 0);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+
 	/* Shutdown the kernel journal system */
 	journal_destroy(journal->k_journal);
 
 	OCFS_I(inode)->ip_open_cnt--;
 
 	/* unlock our journal */
-	status = ocfs_release_lock (osb, OCFS_LKM_EXMODE, 0, inode);
-	if (status < 0)
-		LOG_ERROR_STATUS (status);
+	ocfs2_meta_unlock(inode, 1);
 
 	brelse (journal->lockbh);
 	journal->lockbh = NULL;
@@ -1004,6 +820,7 @@
 {
 	int status = 0;
 	int olderr = 0;
+	int child_pid;
 	ocfs_super *osb;
 
 	LOG_ENTRY();
@@ -1026,6 +843,30 @@
 		journal_clear_err(journal->k_journal);
 	}
 
+	status = ocfs_journal_toggle_dirty(osb, 1);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto done;
+	}
+
+	/* Launch the commit thread */
+	osb->commit = kmalloc(sizeof(ocfs_commit_task), GFP_KERNEL);
+	if (osb->commit == NULL) {
+		LOG_ERROR_STATUS(status = -ENOMEM);
+		goto done;
+	}
+	memset(osb->commit, 0, sizeof(ocfs_commit_task));
+	child_pid = kernel_thread (ocfs_commit_thread, osb,
+				   CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+	if (child_pid < 0) {
+		LOG_ERROR_ARGS ("unable to launch ocfs2commit thread, error=%d",
+				child_pid);
+		status = child_pid;
+		goto done;
+	} else {
+		init_completion (&osb->commit->c_complete);
+	}
+
 done:
 	LOG_EXIT_STATUS(status);
 	return(status);
@@ -1044,7 +885,16 @@
 		BUG();
 
 	status = journal_wipe(journal->k_journal, full);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
 
+	status = ocfs_journal_toggle_dirty(journal->osb, 0);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+
+bail:
 	LOG_EXIT_STATUS(status);
 	return(status);
 }
@@ -1116,116 +966,137 @@
 	return(status);
 }
 
-struct ocfs_recover_arg {
-	ocfs_super *osb;
-	int node_num;
-};
-
 static int __ocfs_recovery_thread(void *arg)
 {
-	struct ocfs_recover_arg *recover_arg = arg;
-	ocfs_super *osb = recover_arg->osb;
-	int node_num = recover_arg->node_num;
+	ocfs_super *osb = arg;
 	int status = 0;
+	int node_num;
 	char proc[16];
 
-	LOG_ENTRY_ARGS("(node_num=%d, osb->node_num = %d)\n", node_num,
-		       osb->node_num);
+	LOG_ENTRY();
 
 	sprintf (proc, "ocfs2rec-%d", osb->osb_id);
 	ocfs_daemonize (proc, strlen(proc), 0);
 
-#ifdef HAVE_NPTL
-	spin_lock_irq (&current->sighand->siglock);
-	sigfillset(&current->blocked);
-	recalc_sigpending();
-	spin_unlock_irq (&current->sighand->siglock);
-#else
-	spin_lock_irq(&current->sigmask_lock);
-	sigfillset(&current->blocked);
-	recalc_sigpending(current);
-	spin_unlock_irq(&current->sigmask_lock);
-#endif	
+	status = ocfs_wait_on_mount(osb);
+	if (status < 0) {
+		if (status == -EBUSY)
+			status = 0;
+		goto bail;
+	}
 
-	status = ocfs_recover_node(osb, node_num);
-	if (status < 0)
+restart:
+	status = ocfs2_super_lock(osb, 1);
+	if (status < 0) {
 		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
 
-	LOG_EXIT_STATUS(status);
+	while(!ocfs_node_map_is_empty(osb, &osb->recovery_map)) {
+		node_num = ocfs_node_map_first_set_bit(osb,
+						       &osb->recovery_map);
+		if (node_num == OCFS_INVALID_NODE_NUM) {
+			LOG_TRACE_ARGS("Out of nodes to recover.\n");
+			break;
+		}
 
-	kfree(arg);
-	return status;
-}
+		ocfs_recovery_map_clear(osb, node_num);
+		/* TODO: Figure out how we're going to save all the
+		 * local alloc stuff for after recovery on all nodes
+		 * is complete? */
+		status = ocfs_recover_node(osb, node_num);
+		if (status < 0) {
+			printk("ocfs2: Error %d recovering node %d on device "
+				"(%u,%u)!\n", status, node_num,
+			       MAJOR(osb->sb->s_dev),MINOR(osb->sb->s_dev));
+			printk("ocfs2: Volume requires unmount.\n");
+			continue;
+		}
+	}
+	ocfs2_super_unlock(osb, 1);
 
-void ocfs_recovery_thread(ocfs_super *osb, int node_num) 
-{
-	struct ocfs_recover_arg *arg;
+bail:
+	down(&osb->recovery_lock);
+	if (!ocfs_node_map_is_empty(osb, &osb->recovery_map)) {
+		up(&osb->recovery_lock);
+		goto restart;
+	}
 
-	LOG_ENTRY_ARGS("(node_num=%d, osb->node_num = %d)\n", node_num,
-		       osb->node_num);
+	osb->recovery_launched = 0;
+	wake_up(&osb->recovery_event);
 
-	arg = ocfs_malloc(sizeof(struct ocfs_recover_arg));
-	if (arg == NULL) {
-		LOG_ERROR_STATUS(-ENOMEM);
-		goto done;
-	}
+	up(&osb->recovery_lock);
 
-	arg->osb = osb;
-	arg->node_num = node_num;
+	LOG_EXIT_STATUS(status);
+	return status;
+}
 
-	/* atomic_inc this here and let recover_vol dec it when
-	 * done. We do it this way to avoid races with umount. */
-	atomic_inc(&osb->num_recovery_threads);
+void ocfs_recovery_thread(ocfs_super *osb, int node_num)
+{
+	LOG_ENTRY_ARGS("(node_num=%d, osb->node_num = %d)\n",
+		       node_num, osb->node_num);
 
-	LOG_TRACE_STR("starting recovery thread...");
+	down(&osb->recovery_lock);
+	if (!osb->disable_recovery) {
+		/* People waiting on recovery will wait on
+		 * the recovery map to empty. */
+		ocfs_recovery_map_set(osb, node_num);
 
-	kernel_thread(__ocfs_recovery_thread, arg,
-		      CLONE_VM | CLONE_FS | CLONE_FILES);
+		LOG_TRACE_STR("starting recovery thread...");
 
-done:
+		if (!osb->recovery_launched) {
+			kernel_thread(__ocfs_recovery_thread, osb,
+				      CLONE_VM | CLONE_FS | CLONE_FILES);
+			osb->recovery_launched = 1;
+		}
+	}
+	up(&osb->recovery_lock);
+	wake_up(&osb->recovery_event);
+
 	LOG_EXIT();
 	return;
 }
 
 static int ocfs_recover_node(ocfs_super *osb, int node_num) 
 {
-	int status = -1;
-	int tmpstat;
+	int status = 0;
+//	int tmpstat;
+	int slot_num;
 	ocfs2_dinode *fe;
 	ocfs2_dinode *local_alloc = NULL;
 	struct inode *inode = NULL;
 	journal_t *k_journal = NULL;
 	struct buffer_head *bh = NULL;
 	ocfs_journal * journal = NULL;
-	int recovery_lock = 0, got_lock = 0, clean_orphans = 0;
+	int got_lock = 0, clean_orphans = 0;
+	ocfs2_slot_info *si = osb->slot_info;
 
-	LOG_ENTRY_ARGS("(node_num=%d, osb->node_num = %d)\n", node_num,
-		       osb->node_num);
+	LOG_ENTRY_ARGS("(node_num=%d, osb->node_num = %d)\n",
+		       node_num, osb->node_num);
 
-	if (!osb || (node_num >= osb->max_nodes)) {
-		LOG_ERROR_STATUS (status = -EINVAL);
-		goto done;
-	}
+	printk("ocfs2_recover_node: checking node %d\n", node_num);
 
-	status = ocfs_wait_on_mount(osb);
-	if (status < 0) {
-		if (status == -EBUSY)
-			status = 0;
-		goto done;
-	}
-	journal = osb->journal;
+	/* Should not ever be called to recover ourselves -- in that
+	 * case we should've called ocfs_journal_load instead. */
+	if (osb->node_num == node_num)
+		BUG();
 
-	/* Grab the local recovery resource to ensure no other thread
-	 * comes in from this node for recovery */
-	down(&(osb->recovery_lock));
-	recovery_lock = 1;
-	if (osb->disable_recovery) {
-		LOG_TRACE_STR("Shutting down so skipping reovery.");
+	ocfs2_update_slot_info(si);
+	slot_num = ocfs2_node_num_to_slot(si, node_num);
+	if (slot_num == OCFS_INVALID_NODE_NUM) {
+		printk("ocfs2_recover_node: no slot for this node, so no "
+		       "recovery required.\n");
 		goto done;
 	}
 
+	printk("ocfs2_recover_node: node %d was using slot %d\n", node_num,
+	       slot_num);
+
+	journal = osb->journal;
+
 	/* Ok, look up the inode for our journal */
-	inode = ocfs_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, node_num);
+	inode = ocfs_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+					   slot_num);
 	if (inode == NULL) {
 		LOG_ERROR_STR("access error");
 		status = -EACCES;
@@ -1241,16 +1112,10 @@
 
 	SET_INODE_JOURNAL(inode);
 
-	/* Should not ever be called to recover ourselves -- in that
-	 * case we should've called ocfs_journal_load instead. */
-	if (osb->node_num == node_num)
-		BUG();
-
-	status = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE, FLAG_FILE_RECOVERY, 
-				    &bh, inode);
-
+	status = ocfs2_meta_lock_flags(inode, NULL, &bh, 1,
+				       OCFS2_META_LOCK_RECOVERY);
 	if (status < 0) {
-		LOG_TRACE_ARGS("status returned from acquire_lock=%d\n", 
+		LOG_TRACE_ARGS("status returned from ocfs2_meta_lock=%d\n", 
 			       status);
 		if (status != -EINTR)
 			LOG_ERROR_STR("Could not lock journal!");
@@ -1258,19 +1123,20 @@
 	}
 	got_lock = 1;
 
-	/* check if that nodes publish sector has been reset (mounted
-	 * is set false) if so, we can unlock and quit. otherwise we
-	 * should recover. */
-	if (!ocfs_publish_get_mount_state(osb, node_num)) {
+	fe = (ocfs2_dinode *) bh->b_data;
+
+	if (!(fe->id1.journal1.i_flags & OCFS2_JOURNAL_DIRTY_FL)) {
 		LOG_TRACE_ARGS("No recovery required for node %d\n", node_num);
-		status = 0;
+		printk("ocfs2_recover_node: No recovery required for node "
+		       "%d\n", node_num);
 		goto clear_node;
 	}
-	printk("ocfs2: Recovering node %d from device (%u,%u)\n", node_num, 
-	       MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+
+	printk("ocfs2: Recovering node %d from slot %d on device (%u,%u)\n",
+	       node_num, slot_num, MAJOR(osb->sb->s_dev),
+	       MINOR(osb->sb->s_dev));
 	clean_orphans = 1;
 
-	fe = (ocfs2_dinode *) bh->b_data;
 	OCFS_I(inode)->ip_clusters = fe->i_clusters;
 
 	status = ocfs_force_read_journal(inode);
@@ -1305,74 +1171,155 @@
 	if (status < 0)
 		LOG_ERROR_STATUS(status);
 
+	/* mark the node clean. */
+	fe->id1.journal1.i_flags &= ~OCFS2_JOURNAL_DIRTY_FL;
+	status = ocfs_write_block(osb, bh, inode);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+
 	if (ocfs_inc_icount(inode) < 0)
 		BUG();
 
 	/* shutdown the journal */
 	journal_destroy(k_journal);
 
+#warning "we can't complete local alloc recovery in this function!"
 	/* recover his local alloc file, AFTER recovering his journal... */
-	status = ocfs_begin_local_alloc_recovery(osb, node_num, &local_alloc);
+	status = ocfs_begin_local_alloc_recovery(osb, slot_num, &local_alloc);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
 		goto done;
 	}
 
-	/* clear the publish sector (mark it unmounted and clean) */
-	status = ocfs_reset_publish(osb, node_num);
-	if (status < 0)
-		LOG_ERROR_STATUS(status);
-
 	status = 0;
 
 clear_node:
-	ocfs_recovery_map_clear(osb, node_num);
-	ocfs_recover_oin_locks(osb, node_num);
+	ocfs2_clear_slot(si, slot_num);
+	status = ocfs2_update_disk_slots(osb, si);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
 done:
-	if (recovery_lock)
-		up(&(osb->recovery_lock));
-
 	/* drop the lock on this nodes journal */
-	if (got_lock) {
-		tmpstat = ocfs_release_lock(osb, OCFS_LKM_EXMODE, 
-					    FLAG_FILE_RECOVERY,
-					    inode);
-		if (tmpstat < 0)
-			LOG_ERROR_STATUS(tmpstat);
-	}
+	if (got_lock)
+		ocfs2_meta_unlock(inode, 1);
 
 	if (inode)
 		iput(inode);
 
 	if (bh)
 		brelse(bh);
-
+#if 0
 	if (local_alloc && !status) {
 		tmpstat = ocfs_complete_local_alloc_recovery(osb, local_alloc);
 		if (tmpstat < 0)
 			LOG_ERROR_STATUS(tmpstat);
 	}
-
+#endif
 	if (local_alloc)
 		kfree(local_alloc);
-
+#if 0
 	if (clean_orphans && !status) {
 		tmpstat = ocfs_recover_orphans(osb);
 		if (tmpstat < 0)
 			LOG_ERROR_STATUS(tmpstat);
 	}
+#endif
 
-	atomic_dec(&osb->num_recovery_threads);
-
 	LOG_EXIT_STATUS(status);
 	return(status);
 }
 
+/* Test node liveness by trylocking his journal. If we get the lock,
+ * we drop it here. Return 0 if we got the lock, -EAGAIN if node is
+ * still alive (we couldn't get the lock) and < 0 on error. */
+static int ocfs2_trylock_journal(ocfs_super *osb,
+				 int slot_num)
+{
+	int status, flags;
+	struct inode *inode = NULL;
+
+	inode = ocfs_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+					   slot_num);
+	if (inode == NULL) {
+		LOG_ERROR_STR("access error");
+		status = -EACCES;
+		goto bail;
+	}
+	if (is_bad_inode (inode)) {
+		LOG_ERROR_STR("access error (bad inode)");
+		iput (inode);
+		inode = NULL;
+		status = -EACCES;
+		goto bail;
+	}
+	SET_INODE_JOURNAL(inode);
+
+	flags = OCFS2_META_LOCK_RECOVERY|OCFS2_META_LOCK_NOQUEUE;
+	status = ocfs2_meta_lock_flags(inode, NULL, NULL, 1, flags);
+	if (status < 0) {
+		if (status != -EAGAIN && status != -EINTR)
+			LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	ocfs2_meta_unlock(inode, 1);
+bail:
+	if (inode)
+		iput(inode);
+
+	return status;
+}
+
+/* Call this underneath ocfs2_super_lock. It also assumes that the
+ * slot info struct has been updated from disk. */
+int ocfs2_mark_dead_nodes(ocfs_super *osb)
+{
+	int status, i, node_num;
+	ocfs2_slot_info *si = osb->slot_info;
+
+	/* This is called with the super block cluster lock, so we
+	 * know that the slot map can't change underneath us. */
+
+	spin_lock(&si->si_lock);
+	for(i = 0; i < si->si_num_slots; i++) {
+		node_num = si->si_global_node_nums[i];
+		if (i == osb->slot_num)
+			continue;
+		if (node_num == OCFS_INVALID_NODE_NUM)
+			continue;
+		if (ocfs_node_map_test_bit(osb, &osb->recovery_map, node_num))
+			continue;
+		spin_unlock(&si->si_lock);
+
+		/* Ok, we have a slot occupied by another node which
+		 * is not in the recovery map. We trylock his journal
+		 * file here to test if he's alive. */
+		status = ocfs2_trylock_journal(osb, i);
+		if (!status) {
+			/* Since we're called from mount, we know that
+			 * the recovery thread can't race us on
+			 * setting / checking the recovery bits. */
+			ocfs_recovery_thread(osb, node_num);
+		} else if ((status < 0) && (status != -EAGAIN)) {
+			if (status != -EINTR)
+				LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+
+		spin_lock(&si->si_lock);
+	}
+	spin_unlock(&si->si_lock);
+
+	status = 0;
+bail:
+	LOG_EXIT_STATUS(status);
+	return status;
+}
+
 int ocfs_recover_orphans(ocfs_super *osb)
 {
 	int status = 0;
 	int have_disk_lock = 0;
-	int tmpstat;
 	struct inode *inode = NULL;
 	struct inode *iter;
 	struct inode *orphan_dir_inode = NULL;
@@ -1381,8 +1328,6 @@
 	struct ocfs2_dir_entry *de;
 	struct super_block *sb = osb->sb;
 
-	down(&osb->orphan_recovery_lock);
-
 	orphan_dir_inode = ocfs_get_system_file_inode(osb, 
 						      ORPHAN_DIR_SYSTEM_INODE, 
 						      -1);
@@ -1393,7 +1338,7 @@
 	}
 
 	down(&orphan_dir_inode->i_sem);
-	status = ocfs_acquire_lock_ro(osb, orphan_dir_inode);
+	status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0);
 	if (status < 0) {
 		up(&orphan_dir_inode->i_sem);
 		LOG_ERROR_STATUS(status);
@@ -1467,12 +1412,8 @@
 	}
 	up(&orphan_dir_inode->i_sem);
 
-	status = ocfs_release_lock_ro(osb, orphan_dir_inode);
+	ocfs2_meta_unlock(orphan_dir_inode, 0);
 	have_disk_lock = 0;
-	if (status < 0) {
-		LOG_ERROR_STATUS(status);
-		goto bail;
-	}
 
 	iput(orphan_dir_inode);
 	orphan_dir_inode = NULL;
@@ -1484,14 +1425,9 @@
 	}
 
 bail:
-	up(&osb->orphan_recovery_lock);
+	if (have_disk_lock)
+		ocfs2_meta_unlock(orphan_dir_inode, 0);
 
-	if (have_disk_lock) {
-		tmpstat = ocfs_release_lock_ro(osb, orphan_dir_inode);
-		if (tmpstat < 0)
-			LOG_ERROR_STATUS(tmpstat);
-	}
-
 	if (orphan_dir_inode)
 		iput(orphan_dir_inode);
 
@@ -1520,57 +1456,40 @@
 	goto retry;
 }
 
-/*
- * ocfs_reset_publish()
- *
- *
- * called by: old_ocfs_recover_node()
- *
- * NOTE: This function is unused. I keep it here because it may be
- * useful in the future. --Mark (Sept. 22, 2003)
- */
-static int ocfs_reset_publish (ocfs_super * osb, __u64 node_num)
+static int ocfs2_commit_thread_sleep(ocfs_super *osb)
 {
-	int status = 0;
-	ocfs_publish *publish = NULL;
-	struct buffer_head *publish_bh = NULL;
+	int status;
+	signed long timeout = OCFS_CHECKPOINT_INTERVAL;
+	DECLARE_WAITQUEUE(wait, current);
 
-	LOG_ENTRY_ARGS("(0x%p, %llu)\n", osb, node_num);
+	if (atomic_read(&osb->needs_checkpoint))
+		return 0;
 
-	/* take a lock on the publish sector */
-	down (&(osb->publish_lock));
+	status = 0;
+	add_wait_queue(&osb->checkpoint_event, &wait);
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);
 
-	/* Read the publish sector */
-	status = ocfs_read_block(osb, (osb->publish_blkno + node_num),
-				 &publish_bh, 0, NULL);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto finally;
-	}
-	
-	publish = (ocfs_publish *) publish_bh->b_data;
+		if (atomic_read(&osb->needs_checkpoint))
+			break;
 
-	publish->dirty = 0;
-	publish->mounted = 0;
-
-	/* Write the publish sector */
-	status = ocfs_write_block(osb, publish_bh, NULL);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto finally;
+		if (!signal_pending(current)) {
+			timeout = schedule_timeout(timeout);
+			if (!timeout) {
+				status = -ETIMEDOUT;
+				break;
+			}
+			continue;
+		}
+		status = -EINTR;
+		break;
 	}
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(&osb->checkpoint_event, &wait);
 
-finally:
-	/* unlock it */
-	up (&(osb->publish_lock));
+	return status;
+}
 
-	if (publish_bh)
-		brelse(publish_bh);
-
-	LOG_EXIT_STATUS (status);
-	return (status);
-}				/* ocfs_reset_publish */
-
 int ocfs_commit_thread(void *arg)
 {
 	int status = 0, misses = 0;
@@ -1587,15 +1506,12 @@
 
 	misses = 0;
 	while (1) {
-		status = ocfs_wait (osb->flush_event,
-			    atomic_read (&osb->flush_event_woken), 
-				    OCFS_CHECKPOINT_INTERVAL);
-		
-		atomic_set (&osb->flush_event_woken, 0);
+		status = ocfs2_commit_thread_sleep(osb);
+		atomic_set (&osb->needs_checkpoint, 0);
 
 		switch (status) {
 			case -ETIMEDOUT:
-				LOG_TRACE_STR("FLUSH_EVENT: timed out");
+				LOG_TRACE_STR("timed out");
 				break;
 			case -EINTR:
 				LOG_ERROR_STR("Commit thread got a signal!");
@@ -1607,10 +1523,10 @@
 				}
 				break;
 			case 0:
-				LOG_TRACE_STR("FLUSH_EVENT: woken!!!");
+				LOG_TRACE_STR("woken\n");
 				break;
 			default:
-				LOG_TRACE_STR("FLUSH_EVENT: ??????");
+				LOG_ERROR_STR("invalid status!\n");
 				break;
 		}
 
@@ -1627,15 +1543,12 @@
 			/* we can trust num_trans here because we're
 			 * in shutdown and nobody other than ourselves
 			 * should be able to start more. */
-			if ((atomic_read(&journal->num_trans) == 0)
-			    && (atomic_read(&journal->num_cmt_locks) == 0))
+			if (atomic_read(&journal->num_trans) == 0)
 				break;
 #ifdef VERBOSE_COMMIT_THREAD
-			printk("(%u) commit_thread: %u transactions, %u locks"
-			       "pending on shutdown\n", 
-			       current->pid, 
-			       atomic_read(&journal->num_trans),
-			       atomic_read(&journal->num_cmt_locks));
+			printk("(%u) commit_thread: %u transactions pending "
+			       "on shutdown\n", 
+			       current->pid, atomic_read(&journal->num_trans));
 #endif
 			goto skip_sleep;
 		}
@@ -1644,4 +1557,3 @@
 	complete (&(commit->c_complete));
 	return 0;
 }
-

Modified: trunk/src/localalloc.c
===================================================================
--- trunk/src/localalloc.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/localalloc.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -38,7 +38,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
-#include "dlm.h"
+#include "dlmglue.h"
 #include "localalloc.h"
 #include "suballoc.h"
 #include "sysfile.h"
@@ -54,11 +54,11 @@
 
 static inline int ocfs_local_alloc_window_bits(ocfs_super *osb);
 
-static __u32 ocfs_local_alloc_count_bits(ocfs2_dinode *alloc);
+static u32 ocfs_local_alloc_count_bits(ocfs2_dinode *alloc);
 
 static int ocfs_local_alloc_find_clear_bits(ocfs_super *osb,
 				      ocfs2_dinode *alloc,
-				      __u32 numbits);
+				      u32 numbits);
 
 static void ocfs_clear_local_alloc(ocfs2_dinode *alloc);
 
@@ -137,7 +137,7 @@
 	int status = 0;
 	ocfs2_dinode *alloc = NULL;
 	struct buffer_head *alloc_bh = NULL;
-	__u32 num_used;
+	u32 num_used;
 	struct inode *inode = NULL;
 
 	LOG_ENTRY();
@@ -148,7 +148,7 @@
 
 	/* read the alloc off disk */
 	inode = ocfs_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE, 
-					   osb->node_num);
+					   osb->slot_num);
 	if (!inode) {
 		LOG_ERROR_STATUS(status=-EINVAL);
 		goto bail;
@@ -237,7 +237,7 @@
 	local_alloc_inode = 
 		ocfs_get_system_file_inode(osb, 
 					   LOCAL_ALLOC_SYSTEM_INODE,
-					   osb->node_num);
+					   osb->slot_num);
 	if (!local_alloc_inode) {
 		status = -ENOENT;
 		LOG_ERROR_STATUS(status);
@@ -272,15 +272,12 @@
 	}
 
 	ocfs_handle_add_inode(handle, main_bm_inode);
-	status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 
-				   0, &main_bm_bh, main_bm_inode);
+	status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
 	if (status < 0) {
 		if (status != -EINTR)
 			LOG_ERROR_STATUS (status);
 		goto bail;
 	}
-	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
-			     0, main_bm_inode);
 
 	/* WINDOW_MOVE_CREDITS is a bit heavy... */
 	handle = ocfs_start_trans(osb, handle, OCFS_WINDOW_MOVE_CREDITS);
@@ -352,7 +349,7 @@
  * caller to process with ocfs_complete_local_alloc_recovery
  */
 int ocfs_begin_local_alloc_recovery(ocfs_super *osb, 
-				    int node_num, 
+				    int slot_num, 
 				    ocfs2_dinode **alloc_copy)
 {
 	int status = 0;
@@ -360,13 +357,13 @@
 	struct inode *inode = NULL;
 	ocfs2_dinode *alloc;
 
-	LOG_ENTRY_ARGS("(node_num = %d)\n", node_num);
+	LOG_ENTRY_ARGS("(slot_num = %d)\n", slot_num);
 
 	*alloc_copy = NULL;
 
 	inode = ocfs_get_system_file_inode(osb, 
 					   LOCAL_ALLOC_SYSTEM_INODE, 
-					   node_num);
+					   slot_num);
 	if (!inode) {
 		LOG_ERROR_STATUS(status=-EINVAL);
 		goto bail;
@@ -442,15 +439,12 @@
 	}
 
 	ocfs_handle_add_inode(handle, main_bm_inode);
-	status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 
-				   0, &main_bm_bh, main_bm_inode);
+	status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
 	if (status < 0) {
 		if (status != -EINTR)
 			LOG_ERROR_STATUS (status);
 		goto bail;
 	}
-	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
-			     0, main_bm_inode);
 
 	handle = ocfs_start_trans(osb, handle, OCFS_WINDOW_MOVE_CREDITS);
 	if (!handle) {
@@ -508,7 +502,7 @@
 	local_alloc_inode = 
 		ocfs_get_system_file_inode(osb, 
 					   LOCAL_ALLOC_SYSTEM_INODE,
-					   osb->node_num);
+					   osb->slot_num);
 	if (!local_alloc_inode) {
 		status = -ENOENT;
 		LOG_ERROR_STATUS(status);
@@ -627,18 +621,17 @@
 /*
  * ocfs_local_alloc_count_bits
  */
-static __u32 ocfs_local_alloc_count_bits(ocfs2_dinode *alloc)
+static u32 ocfs_local_alloc_count_bits(ocfs2_dinode *alloc)
 {
 	int i;
-	__u8 *buffer;
-	__u32 count = 0;
+	u8 *buffer;
+	u32 count = 0;
 
 	LOG_ENTRY();
 
 	buffer = LOCAL_ALLOC(alloc)->la_bitmap;
-	for (i = 0; i < LOCAL_ALLOC(alloc)->la_size; i++) {
+	for (i = 0; i < LOCAL_ALLOC(alloc)->la_size; i++)
 		count += hweight8(buffer[i]);
-	}
 
 	LOG_EXIT_ULONG ((unsigned long)count);
 	return(count);
@@ -649,7 +642,7 @@
  */
 static int ocfs_local_alloc_find_clear_bits(ocfs_super *osb,
 					    ocfs2_dinode *alloc,
-					    __u32 numbits)
+					    u32 numbits)
 {
 	int numfound, bitoff, left, startoff, lastzero;
 	void *bitmap = NULL;

Deleted: trunk/src/lockres.c
===================================================================
--- trunk/src/lockres.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/lockres.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -1,128 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * lockres.c
- *
- * lock resource handling
- *
- * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include "ocfs_compat.h"
-
-#include <linux/fs.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-
-#include "ocfs_log.h"
-#include "ocfs.h"
-
-#include "dlm.h"
-#include "lockres.h"
-#include "util.h"
-#include "vote.h"
-
-#include "buffer_head_io.h"
-
-/* Tracing */
-#define OCFS_DEBUG_CONTEXT      OCFS_DEBUG_CONTEXT_LOCKRES
-
-
-/*
- * ocfs_update_lockres()
- *
- * @osb: ocfs super block for the volume
- * @fe: corresponding file entry
- *
- * the lockres is refreshed from the disk.
- *
- * Returns 0 if success, < 0 if error.
- */
-int ocfs_update_lockres(ocfs_super *osb, struct buffer_head *bh,
-			struct inode *inode, int reread)
-{
-	int status = 0;
-	ocfs2_dinode *fe;
-	int flags;
-	int drop_bh = 0;
-	ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
-
-	LOG_ENTRY_ARGS("(0x%p, %llu, 0x%p, 0x%p)\n", osb,
-		       OCFS_I(inode)->ip_blkno, lockres, bh);
-
-	/* hey, you can't do that!  ;) */
-	if ((!bh) && !reread)
-		BUG();
-
-	if (!bh) 
-		drop_bh = 1;
-
-	/* Behavior for process_vote: if you don't pass a buffer, then
-	 * we'll only read if you're not he master. */
-	if ((bh == NULL) && (lockres->master_node_num == osb->node_num))
-		goto out;
-
-	if (reread) {
-		flags = lockres->master_node_num == osb->node_num ? 
-		    	OCFS_BH_CACHED : 0;
-		status = ocfs_read_block(osb, OCFS_I(inode)->ip_blkno, &bh, 
-					 flags, inode);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto out;
-		}
-	}
-
-	fe = (ocfs2_dinode *) bh->b_data;
-	if (!IS_VALID_FILE_ENTRY(fe))
-		BUG();
-
-	lockres->lock_type = DISK_LOCK(fe)->dl_level;
-	lockres->master_node_num = DISK_LOCK(fe)->dl_master;
-
-out:
-	if (bh && drop_bh)
-		brelse(bh);
-
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_update_lockres */
-
-
-/*
- * ocfs_init_lockres()
- *
- */
-void ocfs_init_lockres (ocfs_super * osb, struct inode *inode)
-{
-	ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
-
-	LOG_ENTRY_ARGS ("(0x%p, 0x%p)\n", osb, lockres);
-
-	lockres->master_node_num = OCFS_INVALID_NODE_NUM;
-	lockres->lock_holders = 0;
-	lockres->readonly_state = 0;
-	lockres->uncommitted_holders = 0;
-	lockres->lock_type = OCFS_LKM_NLMODE;
-	init_rwsem(&lockres->lock);
-	ocfs_node_map_init(osb, &lockres->readonly_map);
-
-	LOG_EXIT ();
-	return;
-}				/* ocfs_init_lockres */

Deleted: trunk/src/lockres.h
===================================================================
--- trunk/src/lockres.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/lockres.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -1,118 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * lockres.h
- *
- * Function prototypes
- *
- * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef OCFS2_LOCKRES_H
-#define OCFS2_LOCKRES_H
-
-#include "util.h"
-
-/* lockres.c */
-
-
-/*
- * ocfs_acquire_lockres_write_timeout()
- *
- * @lockres: lockres to acquire
- * @timeout: timeout in ms, 0 == no timeout
- */
-static inline int ocfs_acquire_lockres_write_timeout (struct inode *inode, __u32 timeout)
-{
-	unsigned long jif = jiffies + (timeout * HZ / 1000);
-	ocfs_lock_res * lockres = GET_INODE_LOCKRES(inode);
-
-	while(1) {
-		if (down_write_trylock(&lockres->lock))
-			return 0;
-
-		if (jif < jiffies)
-			return -ETIMEDOUT;
-
-		ocfs_sleep (OCFS_NM_HEARTBEAT_TIME / 10);
-	}
-
-	return 0;
-}
-
-/*
- * ocfs_acquire_lockres_write()
- */
-static inline int ocfs_acquire_lockres_write (struct inode *inode)
-{
-	ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
-	down_write(&lockres->lock);
-	return 0;
-}
-
-/*
- * ocfs_acquire_lockres_read_timeout()
- *
- * @lockres: lockres to acquire
- * @timeout: timeout in ms, 0 == no timeout
- */
-static inline int ocfs_acquire_lockres_read_timeout (struct inode *inode, __u32 timeout)
-{
-	ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
-	unsigned long jif = jiffies + (timeout * HZ / 1000);
-	
-	while(1) {
-		if (down_read_trylock(&lockres->lock))
-			return 0;
-
-		if (jif < jiffies)
-			return -ETIMEDOUT;
-
-		ocfs_sleep (OCFS_NM_HEARTBEAT_TIME / 10);
-	}
-
-	return 0;
-}
-
-/*
- * ocfs_acquire_lockres_read()
- */
-static inline int ocfs_acquire_lockres_read (struct inode *inode)
-{
-	ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
-	down_read(&lockres->lock);
-	return 0;
-}
-
-static inline void ocfs_release_lockres_write(struct inode *inode)
-{
-	ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
-	up_write(&lockres->lock);
-}
-static inline void ocfs_release_lockres_read(struct inode *inode)
-{
-	ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
-	up_read(&lockres->lock);
-}
-
-void ocfs_init_lockres(ocfs_super *osb, struct inode *inode);
-int ocfs_update_lockres(ocfs_super *osb, struct buffer_head *bh, 
-			struct inode *inode, int reread);
-
-
-#endif /* OCFS2_LOCKRES_H */

Modified: trunk/src/namei.c
===================================================================
--- trunk/src/namei.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/namei.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -50,12 +50,11 @@
 #include "alloc.h"
 #include "dcache.h"
 #include "dir.h"
-#include "dlm.h"
+#include "dlmglue.h"
 #include "extent_map.h"
 #include "file.h"
 #include "sysfile.h"
 #include "inode.h"
-#include "lockres.h"
 #include "namei.h"
 #include "suballoc.h"
 #include "util.h"
@@ -101,10 +100,8 @@
 
 static int ocfs_double_lock(ocfs_super *osb,
 			    ocfs_journal_handle *handle,
-			    __u32 type1, __u32 flags1, 
 			    struct buffer_head **bh1,
 		     	    struct inode *inode1,
-			    __u32 type2, __u32 flags2, 
 			    struct buffer_head **bh2,
 		     	    struct inode *inode2);
 
@@ -292,11 +289,6 @@
 
 	/* get our super block */
 	osb = OCFS_SB(dir->i_sb);
-	if (osb->osb_flags & OCFS_OSB_FLAGS_SHUTDOWN) {
-		LOG_ERROR_STR ("Volume has been shutdown");
-		status = -EACCES;
-		goto leave;
-	}
 
 	if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
 		printk("inode %llu has i_nlink of %u\n",
@@ -323,15 +315,12 @@
 		goto leave;
 	}
 
-	/* lock the parent directory */
-	status = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE, 0, 
-				    &parent_fe_bh, dir);
+	status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
 	if (status < 0) {
 		if (status != -EINTR)
 			LOG_ERROR_STATUS (status);
 		goto leave;
 	}
-	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0, dir);
 
 	dirfe = (ocfs2_dinode *) parent_fe_bh->b_data;
 	if (!dirfe->i_links_count) {
@@ -392,12 +381,13 @@
 		BUG();
 	}
 
-	file_off = fe->i_blkno << dir->i_sb->s_blocksize_bits;
 	ocfs_inode_set_new(osb, inode);
+	status = ocfs2_create_new_inode_locks(inode);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
 
-	ocfs_init_lockres(osb, inode);
+	file_off = fe->i_blkno << dir->i_sb->s_blocksize_bits;
 
-	status = ocfs_update_lockres(osb, new_fe_bh, inode, 0);
 	if (S_ISDIR (mode)) {
 		status = ocfs_fill_new_dir(osb, handle, dir, inode, 
 					   new_fe_bh, data_ac);
@@ -530,7 +520,7 @@
 	fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
 	fe->i_blkno = fe_blkno;
 	fe->i_suballoc_bit = suballoc_bit;
-	fe->i_suballoc_node = osb->node_num;
+	fe->i_suballoc_node = osb->slot_num;
 	fe->i_uid = current->fsuid;
 	if (dir->i_mode & S_ISGID) {
 		fe->i_gid = dir->i_gid;
@@ -553,8 +543,6 @@
 	fe->i_last_eb_blk = 0;
 	strcpy (fe->i_signature, OCFS2_INODE_SIGNATURE);
 	fe->i_flags |= OCFS2_VALID_FL;
-	DISK_LOCK(fe)->dl_master = osb->node_num;
-	DISK_LOCK(fe)->dl_level = OCFS_LKM_EXMODE;
 	fe->i_atime = fe->i_ctime = fe->i_mtime = OCFS_CURRENT_TIME;
 	fe->i_dtime = 0;
 
@@ -662,15 +650,12 @@
 		goto bail;
 	}
 
-	/* lock the parent directory */
-	err = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE, 0, 
-				    &parent_fe_bh, dir);
+	err = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
 	if (err < 0) {
 		if (err != -EINTR)
 			LOG_ERROR_STATUS (err);
 		goto bail;
 	}
-	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0, dir);
 
 	err = ocfs_prepare_dir_for_insert(osb, dir, parent_fe_bh, 
 					     dentry->d_name.name, 
@@ -680,14 +665,12 @@
 		goto bail;
 	}
 
-	err = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE, 0, &fe_bh, inode);
+	err = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
 	if (err < 0) {
 		if (err != -EINTR)
 			LOG_ERROR_STATUS (err);
 		goto bail;
 	}
-	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, FLAG_FILE_UPDATE_OIN, 
-			     inode);
 
 	fe = (ocfs2_dinode *) fe_bh->b_data;
 	if (fe->i_links_count >= OCFS2_LINK_MAX) {
@@ -791,13 +774,11 @@
 		goto leave;
 	}
 
-	status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 0, 
-				   &parent_node_bh, dir);
+	status = ocfs2_meta_lock(dir, handle, &parent_node_bh, 1);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
 		goto leave;
 	}
-	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0, dir);
 
 	status = ocfs_find_files_on_disk(osb, dentry->d_name.name,
 					 dentry->d_name.len, &blkno,
@@ -811,16 +792,20 @@
 	if (blkno != OCFS_I(inode)->ip_blkno)
 		BUG();
 
-	status = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE,
-				    FLAG_RELEASE_DENTRY, &fe_bh, inode);
+	status = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
 	if (status < 0) {
 		if (status != -EINTR)
 			LOG_ERROR_STATUS (status);
 		goto leave;
 	}
-	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, FLAG_RELEASE_DENTRY, 
-			     inode);
 
+	status = ocfs2_request_unlink_vote(inode);
+	if (status < 0) {
+		/* This vote should succeed under all normal circumstances. */
+		LOG_ERROR_STATUS(status);
+		goto leave;
+	}
+
 	if (S_ISDIR (inode->i_mode)) {
 	       	if (!ocfs_empty_dir(inode)) {
 			status = -ENOTEMPTY;
@@ -942,16 +927,14 @@
  */
 static int ocfs_double_lock(ocfs_super *osb,
 			    ocfs_journal_handle *handle,
-			    __u32 type1, __u32 flags1, 
 			    struct buffer_head **bh1,
 			    struct inode *inode1,
-			    __u32 type2, __u32 flags2, 
 			    struct buffer_head **bh2,
 			    struct inode *inode2)
 {
-	int status = 0;
-	__u64 tmpid, id1, id2;
-	__u32 tmptype, tmpflags;
+	int status;
+	ocfs_inode_private *oip1 = OCFS_I(inode1);
+	ocfs_inode_private *oip2 = OCFS_I(inode2);
 	struct buffer_head **tmpbh;
 	struct inode *tmpinode;
 
@@ -961,31 +944,16 @@
 
 	OCFS_ASSERT(handle);
 
-	id1 = OCFS_I(inode1)->ip_blkno;
-	id2 = OCFS_I(inode2)->ip_blkno;
-
 	if (*bh1)
 		*bh1 = NULL;
 	if (*bh2)
 		*bh2 = NULL;
 
 	/* we always want to lock the one with the lower lockid first. */
-	if (id1 != id2) {
-		if (id1 < id2) {
+	if (oip1->ip_blkno != oip2->ip_blkno) {
+		if (oip1->ip_blkno < oip2->ip_blkno) {
 			/* switch id1 and id2 around */
 			LOG_TRACE_STR("switching them around...");
-			tmpid = id2;
-			id2 = id1;
-			id1 = tmpid;
-
-			tmptype = type2;
-			type2 = type1;
-			type1 = tmptype;
-
-			tmpflags = flags2;
-			flags2 = flags1;
-			flags1 = tmpflags;
-
 			tmpbh = bh2;
 			bh2 = bh1;
 			bh1 = tmpbh;
@@ -995,21 +963,18 @@
 			inode1 = tmpinode;
 		}
 		/* lock id2 */
-		status = ocfs_acquire_lock(osb, type2, flags2, bh2, inode2);
+		status = ocfs2_meta_lock(inode2, handle, bh2, 1);
 		if (status < 0) {
 			LOG_ERROR_STATUS (status);
 			goto bail;
 		}
-		ocfs_handle_add_lock(handle, type2, flags2, inode2);
 	}
 	/* lock id1 */
-	status = ocfs_acquire_lock(osb, type1, flags1, 
-				   bh1, inode1);
+	status = ocfs2_meta_lock(inode1, handle, bh1, 1);
 	if (status < 0) {
 		LOG_ERROR_STATUS (status);
 		goto bail;
 	}
-	ocfs_handle_add_lock(handle, type1, flags1, inode1);
 bail:
 	LOG_EXIT_STATUS(status);
 	return(status);
@@ -1045,7 +1010,6 @@
 	struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
 						    // this is the 1st dirent bh
 	nlink_t old_dir_nlink = old_dir->i_nlink, new_dir_nlink = new_dir->i_nlink;
-	struct buffer_head *lockbh = NULL;
 
 	LOG_SET_CONTEXT(RENAME);
 
@@ -1077,11 +1041,7 @@
 
 	/* if old and new are the same, this'll just do one lock. */
 	status = ocfs_double_lock(osb, handle, 
-				  OCFS_LKM_EXMODE, 
-				  0,
 				  &old_dir_bh, old_dir,
-				  OCFS_LKM_EXMODE, 
-				  0,
 				  &new_dir_bh, new_dir);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
@@ -1105,20 +1065,17 @@
 		/* Directories actually require metadata updates to
 		 * the directory info so we can't get away with not
 		 * doing node locking on it. */
-		status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE,
-					   FLAG_RELEASE_DENTRY|FLAG_FILE_RENAME, 
-					   &lockbh, old_inode);
-		if (lockbh) {
-			brelse(lockbh);
-			lockbh = NULL;
+		status = ocfs2_meta_lock(old_inode, handle, NULL, 1);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
 		}
+
+		status = ocfs2_request_rename_vote(old_inode);
 		if (status < 0) {
 			LOG_ERROR_STATUS(status);
 			goto bail;
 		}
-		ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
-				     FLAG_RELEASE_DENTRY|FLAG_FILE_RENAME,
-				     old_inode);
 
 		status = -EIO;
 		old_inode_de_bh = ocfs_bread (old_inode, 0, &status, 0);
@@ -1136,7 +1093,7 @@
 	} else {
 		/* Ah, the simple case - we're a file so just send a
 		 * message. */
-		status = ocfs_notify_on_rename(osb, old_inode);
+		status = ocfs2_request_rename_vote(old_inode);
 		if (status < 0) {
 			LOG_ERROR_STATUS(status);
 			goto bail;
@@ -1187,16 +1144,18 @@
 		if (newfe_blkno != OCFS_I(new_inode)->ip_blkno)
 			BUG();
 
-		status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 
-					   FLAG_RELEASE_DENTRY, &newfe_bh, 
-					   new_inode);
+		status = ocfs2_meta_lock(new_inode, handle, &newfe_bh, 1);
 		if (status < 0) {
 			LOG_ERROR_STATUS(status);
 			goto bail;
 		}
-		ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
-				     FLAG_RELEASE_DENTRY, new_inode);
 
+		status = ocfs2_request_unlink_vote(new_inode);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+
 		newfe = (ocfs2_dinode *) newfe_bh->b_data;
 
 		LOG_TRACE_ARGS("aha rename over existing... new_de=%p "
@@ -1426,7 +1385,7 @@
 		goto bail;
 	}
 
-	bhs = ocfs_malloc(sizeof(struct buffer_head *) * blocks);
+	bhs = kmalloc(sizeof(struct buffer_head *) * blocks, GFP_KERNEL);
 	if (!bhs) {
 		status = -ENOMEM;
 		LOG_ERROR_STATUS(status);
@@ -1560,14 +1519,12 @@
 	}
 
 	/* lock the parent directory */
-	status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 0, 
-				   &parent_fe_bh, dir);
+	status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
 	if (status < 0) {
 		if (status != -EINTR)
 			LOG_ERROR_STATUS (status);
 		goto bail;
 	}
-	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0, dir);
 
 	dirfe = (ocfs2_dinode *) parent_fe_bh->b_data;
 	if (!dirfe->i_links_count) {
@@ -1622,8 +1579,7 @@
 	}
 
 	ocfs_inode_set_new(osb, inode);
-	ocfs_init_lockres(osb, inode);
-	status = ocfs_update_lockres(osb, new_fe_bh, inode, 0);
+	status = ocfs2_create_new_inode_locks(inode);
 	if (status < 0)
 		LOG_ERROR_STATUS(status);
 
@@ -2054,14 +2010,11 @@
 	}
 
 	ocfs_handle_add_inode(handle, orphan_dir_inode);
-	status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 0,
-				   &orphan_dir_bh, orphan_dir_inode);
+	status = ocfs2_meta_lock(orphan_dir_inode, handle, &orphan_dir_bh, 1);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
 		goto leave;
 	}
-	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0, 
-			     orphan_dir_inode);
 
 	status = ocfs_prepare_dir_for_insert(osb, orphan_dir_inode, 
 					     orphan_dir_bh, name, namelen, 

Deleted: trunk/src/nm.c
===================================================================
--- trunk/src/nm.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/nm.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -1,1150 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * nm.c
- *
- * net and disk process vote, nm thread, etc.
- *
- * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include "ocfs_compat.h"
-
-#include <linux/fs.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-#include <linux/bitops.h>
-#include <linux/net.h>
-
-#include "ocfs_log.h"
-#include "ocfs.h"
-#include "ocfs2.h"
-
-#include "alloc.h"
-#include "dlm.h"
-#include "extent_map.h"
-#include "file.h"
-#include "heartbeat.h"
-#include "inode.h"
-#include "lockres.h"
-#include "nm.h"
-#include "util.h"
-#include "vote.h"
-
-#include "ocfs_journal.h"
-#include "buffer_head_io.h"
-
-/* Tracing */
-#define OCFS_DEBUG_CONTEXT      OCFS_DEBUG_CONTEXT_NM
-
-/* for lack of a better name... protects inode_private->ip_num_extends. */
-static spinlock_t oin_num_ext_lock = SPIN_LOCK_UNLOCKED;
-struct semaphore recovery_list_sem;
-
-static inline int need_write_lock(ocfs_super *osb, ocfs_lock_res *lockres, __u32 flags);
-static inline int get_process_vote_action(ocfs_super * osb, ocfs_lock_res *lockres, __u32 node_num, 
-					  __u32 flags, int *master_alive, int *write_lock, 
-					  int *change_master, struct inode *inode);
-
-static int ocfs_process_vote_pre_change_master(int vote_type, int flags, int *vote_response, struct inode *inode);
-static int ocfs_process_vote_change_master(ocfs_super *osb, int *vote_response, int *status, struct inode *inode, ocfs_lock_res *lockres, __s16 node_num, __u64 lock_id);
-static int ocfs_process_vote_post_change_master(ocfs_super *osb, int vote_type, int flags, int *vote_response, struct inode *inode, ocfs_lock_res *lockres, int *status, __s16 node_num, int *inc_seq);
-static int ocfs_lock_busy(ocfs_super *osb, struct inode *inode, ocfs_lock_res *lockres);
-
-static int _ocfs_drop_readonly_cache_lock(void *arg);
-
-typedef struct _ocfs_ro_cache_drop_ctxt
-{
-	ocfs_super *osb;
-	ocfs_lock_res *lockres;
-	struct inode *inode;
-	int yield;
-} ocfs_ro_cache_drop_ctxt;
-
-static void ocfs_mark_inode_for_extend(ocfs_super *osb, struct inode *inode,
-				       __u32 node_num);
-static void ocfs_clear_inode_for_extend(ocfs_super *osb, struct inode *inode,
-					__u32 node_num, u32 num_rel);
-
-static int ocfs_process_inode_delete(struct inode *inode);
-static void ocfs_commit_inode_delete(struct inode *inode);
-
-static const char *process_vote_strings[] = {
-	"INVALID_REQUEST",      // reply with a NO vote
-	"UPDATE_OIN_INODE",     // update both oin and inode
-	"DELETE_ACQUIRE",// delete or rename request
-	"CHANGE_MASTER",        // request to change master to requestor
-	"NOT_MASTER",           // I am not master, retry
-	"REMASTER_THIS",        // remaster lock to me
-	"REMASTER_REQUESTOR",   // remaster lock to requestor
-	"DROP_READONLY",        // RO cachelock needs to convert to RW
-	"READONLY",
-	"RELEASE_DENTRY",
-	"TRUNCATE_PAGES"
-};
-
-/*
- * ocfs_recv_thread()
- *
- */
-int ocfs_recv_thread (void *unused)
-{
-	int status = 0;
-	ocfs_recv_ctxt *recv_ctxt = NULL;
-
-	LOG_ENTRY ();
-
-#define LISTENER_PROCESS_NAME	"ocfs2lsnr"
-	ocfs_daemonize (LISTENER_PROCESS_NAME, strlen(LISTENER_PROCESS_NAME), 
-			1);
-
-	OcfsIpcCtxt.task = current;
-
-	while (1) {
-		recv_ctxt = ocfs_malloc (sizeof (ocfs_recv_ctxt));
-		if (recv_ctxt == NULL) {
-			LOG_ERROR_STATUS (-ENOMEM);
-			goto bail;
-		}
-
-		memset (recv_ctxt, 0, sizeof (ocfs_recv_ctxt));
-		recv_ctxt->msg_len = OCFS_MAX_DLM_PKT_SIZE;
-
-		status = ocfs_recv_udp_msg (recv_ctxt);
-		if (status < 0) {
-			kfree(recv_ctxt);
-			if (status != -EBADF) {
-				LOG_ERROR_STATUS (status);
-			} else {
-				/* Thread is being killed. */
-				goto finally;
-			}
-		}
-	}
-
-finally:
-	/* Flush all scheduled tasks */
-	flush_scheduled_work();
-
-	if (OcfsIpcCtxt.send_sock) {
-		sock_release (OcfsIpcCtxt.send_sock);
-		OcfsIpcCtxt.send_sock = NULL;
-	}
-
-	if (OcfsIpcCtxt.recv_sock) {
-		sock_release (OcfsIpcCtxt.recv_sock);
-		OcfsIpcCtxt.recv_sock = NULL;
-	}
-
-	OcfsIpcCtxt.task = NULL;
-
-	/* signal main thread of ipcdlm's exit */
-	complete (&(OcfsIpcCtxt.complete));
-
-bail:
-	LOG_EXIT ();
-	return 0;
-}				/* ocfs_recv_thread */
-
-// gets a best guess (based on dirty read of lockres)
-// of whether down_read or down_write should be used on lockres
-// NOTE: always RECHECK after getting the lock and follow what
-// get_process_vote_action says
-static inline int need_write_lock(ocfs_super *osb, ocfs_lock_res *lockres, __u32 flags)
-{
-	// always need write access to lockres if not master
-	if (lockres->master_node_num != osb->node_num)
-		return 1;
-	// usually need write access for these so just get it
-	if (flags & (FLAG_CHANGE_MASTER|FLAG_DROP_READONLY|FLAG_READONLY))
-		return 1;
-	// nothing else will need it, assuming it didnt just change under us
-	return 0;
-}
-
-static inline int get_process_vote_action(ocfs_super * osb, ocfs_lock_res *lockres, __u32 node_num, 
-					  __u32 flags, int *master_alive, int *write_lock, 
-					  int *change_master, struct inode *inode)
-{
-	int vote_type = INVALID_REQUEST;
-	int my_node_wins = 0;
-	int this_node_master = 0;
-	__u64 lockid = 0;
-	ocfs_vote_obj_lookup_data data;
-
-	LOG_ENTRY_ARGS("(node_num=%d, flags=%08x)\n", node_num, flags);
-
-	OCFS_ASSERT(inode);
-	OCFS_ASSERT(lockres);
-
-	lockid = OCFS_I(inode)->ip_blkno;
-
-	*change_master = 0;
-	*write_lock = 0;
-	this_node_master = (lockres->master_node_num == osb->node_num);
-	*master_alive = (lockres->master_node_num != OCFS_INVALID_NODE_NUM) &&
-			ocfs_node_is_alive(&osb->publ_map, lockres->master_node_num);
-
-	// if an outstanding vote request is found on this lockid
-	// and this node number is higher, this node wins
-	data.func = ocfs_lookup_obj_by_lockid;
-	data.u.s.lock_id = lockid;
-	data.ret = NULL;
-	if (ocfs_lookup_vote_request_obj(osb, &data) == 0)
-		my_node_wins = (node_num < osb->node_num);
-
-	/* NOTE: FLAG_CHANGE_MASTER may be combined with
-	 * other flags and result in a process_vote action
-	 * other than CHANGE_MASTER.  the change_master
-	 * value returned here is independent of this action */
-	if (this_node_master && flags & FLAG_CHANGE_MASTER) {
-		*write_lock = 1;
-		*change_master = 1;
-	}
-	
-	// if this node is not master, we will need to update the lockres
-	if (!this_node_master)
-		*write_lock = 1;
-
-	if (flags & (FLAG_RELEASE_DENTRY | FLAG_FILE_RENAME)) {
-		vote_type = RELEASE_DENTRY;
-		goto done;
-	}
-
-	if (flags & FLAG_DROP_READONLY) {
-		vote_type = DROP_READONLY;
-		*write_lock = 1;
-		goto done;
-	} else if (flags & FLAG_READONLY) {
-		if (this_node_master && lockres->lock_type == OCFS_LKM_EXMODE) {
-			vote_type = READONLY;
-			*write_lock = 1;
-		} else 
-			vote_type = INVALID_REQUEST;
-		goto done;
-	}
-
-	if (flags & FLAG_FILE_DELETE) {
-		if (flags & FLAG_RELEASE_LOCK)
-			vote_type = INVALID_REQUEST;
-		else if (flags & FLAG_ACQUIRE_LOCK)
-			vote_type = DELETE_ACQUIRE;
-		else
-			vote_type = INVALID_REQUEST;
-	} else if (flags & FLAG_FILE_UPDATE_OIN) {
-		if ((flags & FLAG_FILE_TRUNCATE) &&
-		    (flags & FLAG_ACQUIRE_LOCK))
-			vote_type = TRUNCATE_PAGES;
-		else
-			vote_type = UPDATE_OIN_INODE;
-	} else if (flags & FLAG_TRUNCATE_PAGES) {
-		vote_type = TRUNCATE_PAGES;
-	} else if (this_node_master) {
-		if (flags & FLAG_CHANGE_MASTER)
-			vote_type = CHANGE_MASTER;
-		else {
-			LOG_TRACE_STR("(INVALID_REQUEST) am master, but no more types");
-			vote_type = INVALID_REQUEST;
-		}
-	} else {
-		if (*master_alive)
-			vote_type = NOT_MASTER;
-		else if (my_node_wins)
-			vote_type = REMASTER_THIS;
-		else
-			vote_type = REMASTER_REQUESTOR;
-	}
-	
-done:
-	LOG_EXIT_STATUS(vote_type);
-	return vote_type;
-}
-
-/* this function requires that callers to it be serialized (isn't
- * really a problem as vote_sem does that for us. */
-static void ocfs_mark_inode_for_extend(ocfs_super *osb, struct inode *inode,
-				       __u32 node_num)
-{
-	spin_lock(&oin_num_ext_lock);
-
-	if (OCFS_I(inode)->ip_num_extends < 0)
-		BUG();
-
-	/* this isn't the 1st extend against the inode, so just inc
-	 * the counter. */
-	if (OCFS_I(inode)->ip_num_extends > 0) {
-		OCFS_I(inode)->ip_num_extends++;
-
-	printk("ocfs_mark_inode_for_extend: inode %llu, num = %d\n",
-	       OCFS_I(inode)->ip_blkno, OCFS_I(inode)->ip_num_extends);
-
-		spin_unlock(&oin_num_ext_lock);
-		return;
-	}
-
-	/* ok, we're going to have to take the extend sem. We can't do
-	 * this holding ip_node_extend_sem so we drop it and recheck after
-	 * we've got it. */
-	spin_unlock(&oin_num_ext_lock);
-
-	/* take the extend_sem on behalf of
-	 * this other node. It won't be
-	 * released until he does his last
-	 * release broadcast. This has the
-	 * effect of locking out
-	 * ocfs2_extent_map lookups
-	 * inode. */
-	down_write(&OCFS_I(inode)->ip_node_extend_sem);
-
-	atomic_inc(&inode->i_count);
-
-	/* Ok, we've still got it open. Put this guy on the recovery
-	 * list in case the extending node dies. */
-	down(&recovery_list_sem);
-	spin_lock(&oin_num_ext_lock);
-
-	if (OCFS_I(inode)->ip_num_extends < 0)
-		BUG();
-
-	OCFS_I(inode)->ip_num_extends++;
-	list_add_tail(&OCFS_I(inode)->ip_recovery_list, 
-		      &osb->lock_recovery_lists[node_num]);
-
-	LOG_TRACE_PROCESS_VOTE("inode %llu, num = %d\n",
-	       OCFS_I(inode)->ip_blkno, OCFS_I(inode)->ip_num_extends);
-	
-	spin_unlock(&oin_num_ext_lock);
-	up(&recovery_list_sem);
-
-	return;
-}
-
-static void ocfs_clear_inode_for_extend(ocfs_super *osb, struct inode *inode,
-					__u32 node_num, u32 num_rel)
-{
-	int dec = 0;
-
-	down(&recovery_list_sem);
-	spin_lock(&oin_num_ext_lock);
-
-	if ((OCFS_I(inode)->ip_num_extends - (s32) num_rel) < 0) {
-		/* We don't force to zero here in order to cover up a
-		 * bug, but rather because it's perfectly valid for us
-		 * to get a release with a count > what we've had if
-		 * we mount after the acquires have been sent. */
-
-		LOG_TRACE_PROCESS_VOTE("inode %llu, num_rel of "
-		       "%d would result in negative count (ip_num_extends "
-		       "= %d)\n", 
-		       OCFS_I(inode)->ip_blkno, num_rel,
-		       OCFS_I(inode)->ip_num_extends);
-		OCFS_I(inode)->ip_num_extends = 0;
-	} else {
-		OCFS_I(inode)->ip_num_extends -= num_rel;
-	}
-
-	LOG_TRACE_PROCESS_VOTE("inode %llu, num = %d\n",
-	       OCFS_I(inode)->ip_blkno, OCFS_I(inode)->ip_num_extends);
-
-	if (!OCFS_I(inode)->ip_num_extends) {
-		list_del(&OCFS_I(inode)->ip_recovery_list);
-		INIT_LIST_HEAD(&OCFS_I(inode)->ip_recovery_list);
-
-		up_write(&OCFS_I(inode)->ip_node_extend_sem);
-
-		dec = 1;
-	}
-
-	spin_unlock(&oin_num_ext_lock);
-	up(&recovery_list_sem);
-
-	/* we want iputs to happen outside of as many locks as possible. */
-	if (dec)
-		iput(inode);
-
-	return;
-}
-
-
-static int ocfs_process_inode_delete(struct inode *inode)
-{
-	int status;
-
-	LOG_TRACE_ARGS("DELETE vote on inode %lu, read "
-		       "lnk_cnt = %u\n", inode->i_ino, 
-		       inode->i_nlink);
-
-	/* force this as ours may be out of date. */
-	inode->i_nlink = 0;
-
-	spin_lock(&OCFS_I(inode)->ip_lock);
-	/* vote no if the file is still open. */
-	if (OCFS_I(inode)->ip_open_cnt > 0) {
-		LOG_TRACE_PROCESS_VOTE("open count = %u\n", 
-		       OCFS_I(inode)->ip_open_cnt);
-		spin_unlock(&OCFS_I(inode)->ip_lock);
-		status = 0;
-		goto done;
-	}
-	spin_unlock(&OCFS_I(inode)->ip_lock);
-
-	/* vote no if someone's extending it. */
-	spin_lock(&oin_num_ext_lock);
-	if (OCFS_I(inode)->ip_num_extends) {
-		spin_unlock(&oin_num_ext_lock);
-		LOG_TRACE_PROCESS_VOTE("extends pending\n");
-		status = 0;
-		goto done;
-	}
-	spin_unlock(&oin_num_ext_lock);
-
-	/* directories are a bit ugly... What if someone is sitting in
-	 * it? We want to make sure the inode is removed completely as
-	 * a result of the iput in process_vote. */
-	if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) {
-		LOG_TRACE_PROCESS_VOTE("i_count = %u\n", 
-		       atomic_read(&inode->i_count));
-		status = 0;
-		goto done;
-	}
-
-	status = 1;
-done:
-	return(status);
-}
-
-static void ocfs_commit_inode_delete(struct inode *inode)
-{
-	SET_INODE_DELETED(inode);
-	/* if we vote yes, then we set the SKIP_DELETE
-	 * flag on the inode so we don't try to delete
-	 * it in delete_inode ourselves. */
-	OCFS_SET_FLAG(OCFS_I(inode)->ip_flags, 
-		      OCFS_INODE_SKIP_DELETE);
-
-	d_prune_aliases (inode);
-	sync_mapping_buffers(inode->i_mapping);
-	ocfs_truncate_inode_pages(inode, 0);
-	ocfs2_extent_map_drop(inode, 0);
-}
-
-
-/*
- * ocfs_process_vote()
- *
- * @osb:
- * @publish:
- * @node_num: node asking for the vote
- *
- */
-int ocfs_process_vote (ocfs_super * osb, ocfs_dlm_msg *dlm_msg)
-{
-	int status = 0;
-	int tmpstat = 0;
-	ocfs_lock_res *lockres = NULL;
-	__u32 flags, num_ident;
-	__u16 num_nodes;
-	int vote_type = INVALID_REQUEST, vote_response = 0;
-	struct inode *inode = NULL;
-	int master_alive = 1, change_master = 0, write_lock = 0;
-	int inc_inode_seq = 0;
-	int change_master_succeeded = 0;
-	__s16 node_num = dlm_msg->src_node;
-	__u64 lock_id, seq_num;
-	ocfs_dlm_req_master *req_master = NULL;
-	int lockres_lock_held = NO_LOCK;
-
-	LOG_ENTRY_ARGS ("(0x%p, 0x%p)\n", osb, dlm_msg);
-
-	down(&osb->vote_sem);
-
-	num_nodes = osb->max_nodes;
-	
-	req_master = (ocfs_dlm_req_master *)dlm_msg->msg_buf;
-	flags = req_master->flags;
-	lock_id = req_master->lock_id;
-	seq_num = req_master->lock_seq_num;
-	num_ident = req_master->num_ident;
-
-	if (!num_ident) {
-		printk("flags = 0x%x, lock_id = %llu, node_num = %u\n",
-		       flags, lock_id, node_num);
-		BUG();
-	}
-
-	LOG_TRACE_ARGS ("node=%u, id=%llu, seq=%llu\n", node_num,
-			lock_id, seq_num);
-
-	/* if we timeout on any of the locks, we want to send a retry
-	 * instead of letting the other guy's network timeout. */
-	vote_response = FLAG_VOTE_UPDATE_RETRY;
-
-	if (flags & FLAG_TRUNCATE_PAGES) {
-		inode = ocfs_ilookup(osb, lock_id);
-		if(!inode) {
-			vote_type = TRUNCATE_PAGES;
-			goto got_vote_type;
-		}
-	} else {
-		inode = ocfs_iget(osb, lock_id);
-	}
-
-	if (!inode) {
-		status = -ENOENT;
-		LOG_ERROR_ARGS("Could not find inode: lock_id = %llu, "
-			       "node=%u, seq=%llu, flags=0x%x\n",
-			       lock_id, node_num, seq_num, flags);
-		LOG_ERROR_STATUS(status);
-		goto vote;
-	}
-
-	/* ahh, so you find yourself asking "what the
-	 * heck is this?"
-	 * Please see the note in ocfs_delete_inode. */
-	osb->voting_ino = inode->i_ino;
-
-	lockres = GET_INODE_LOCKRES(inode);
-
-	// take a good guess...
-	// at worst, we will take 2 passes through
-	write_lock = need_write_lock(osb, lockres, flags);
-
-retake_lock:
-	OCFS_ASSERT(lockres_lock_held == NO_LOCK);
-	if (write_lock)
-		status = ocfs_acquire_lockres_write_timeout (inode, (OCFS_NM_HEARTBEAT_TIME/2));
-	else
-		status = ocfs_acquire_lockres_read_timeout (inode, (OCFS_NM_HEARTBEAT_TIME/2));
-
-	if (status < 0) {
-		LOG_TRACE_ARGS("Timedout locking lockres for id: %llu\n",
-			       OCFS_I(inode)->ip_blkno);
-		goto vote;
-	} else
-		lockres_lock_held = (write_lock ? WRITE_LOCK : READ_LOCK);
-
-	// find out everything now that a lock is held
-	vote_type = get_process_vote_action(osb, lockres, node_num, flags, 
-					    &master_alive, &write_lock, 
-					    &change_master, inode);
-
-	// bummer. we got the wrong lock. get the write lock and start over.
-	if (write_lock && lockres_lock_held == READ_LOCK) {
-		ocfs_release_lockres_read(inode);
-		lockres_lock_held = NO_LOCK;
-		goto retake_lock;
-	}
-
-	if (lockres->master_node_num != osb->node_num) {
-		/* since we pass a NULL bh, this'll only do a read if
-	 	* we're not the master. */
-		OCFS_ASSERT(lockres_lock_held == WRITE_LOCK);
-		status = ocfs_update_lockres (osb, NULL, inode, 1);
-
-		if (status < 0) {
-			if (status != -ETIMEDOUT)
-				LOG_ERROR_STATUS (status);
-			goto leave;
-		}
-	}
-
-got_vote_type:
-	
-	LOG_TRACE_PROCESS_VOTE("type: %s, lockid: %llu, action: (%u) %s, num_ident: %u, "
-	       "alive: %d, write: %d, change: %d, held: %d\n", 
-	       flags & FLAG_RELEASE_LOCK ? "RELEASE" : 
-	       (flags & FLAG_ACQUIRE_LOCK ? "ACQUIRE" : "MODIFY"), lock_id,
- 	       vote_type, process_vote_strings[vote_type], num_ident,
-	       master_alive, write_lock, change_master, lockres_lock_held);
-	
-	if (vote_type == INVALID_REQUEST)
-		printk("Invalid request! flags = 0x%x master=%d, readonly=%s\n", 
-		       flags, lockres->master_node_num, 
-		       test_bit(LOCK_STATE_READONLY, &lockres->readonly_state) ? "yes" : "no");
-
-	/* get_process_vote_action will only allow CHANGE_MASTER on a CACHE lock 
-	 * held by this node.  the CHANGE_MASTER path needs to check the readonly 
-	 * map to see if any nodes need to be updated. */
-
-	vote_response = FLAG_VOTE_NODE;
-
-	if (ocfs_process_vote_pre_change_master(vote_type, flags, &vote_response, inode))
-		goto vote;
-
-	if (change_master) {
-		tmpstat = ocfs_process_vote_change_master(osb, &vote_response, &status, 
-							  inode, lockres, node_num, lock_id);
-		if (tmpstat < 0)
-			goto leave;
-		else if (tmpstat == 1)
-			goto vote;
-		change_master_succeeded = 1;
-		inc_inode_seq = 1;
-	}
-
-	tmpstat = ocfs_process_vote_post_change_master(osb, vote_type, flags, &vote_response, inode, 
-					     lockres, &status, node_num, &inc_inode_seq);
-
-	/* if we made it this far, and change_master, then it had better be voting yes */
-	if (change_master && vote_response != FLAG_VOTE_NODE)
-		BUG();
-
-	if (inode && (flags & (FLAG_FILE_EXTEND|FLAG_FILE_TRUNCATE)) && 
-	    ((flags & FLAG_ACQUIRE_LOCK && vote_response==FLAG_VOTE_NODE) ||
-	    (flags & FLAG_RELEASE_LOCK))) {
-		LOG_TRACE_ARGS("responding YES to %s %s request, inode=%p, node=%u\n", flags & FLAG_FILE_EXTEND ?
-			      "extend" : "truncate", flags & FLAG_RELEASE_LOCK ? 
-			      "release" : "acquire", inode, node_num);
-
-		if (flags & FLAG_ACQUIRE_LOCK)
-			ocfs_mark_inode_for_extend(osb, inode, node_num);
-		else if (flags & FLAG_RELEASE_LOCK)
-			ocfs_clear_inode_for_extend(osb, inode, node_num, 
-						    num_ident);
-		else {
-			printk("uhoh, bad vote flags! 0x%x\n", flags);
-			BUG();
-		}
-	}
-
-vote:
-	status = ocfs_send_vote_reply(osb, dlm_msg, vote_response);
-
-	LOG_TRACE_PROCESS_VOTE("vote: lockid=%llu, node=%d, seqnum=%llu, response=%d\n",
-			       lock_id, node_num, seq_num, vote_response);
-		       
-	if (status < 0)
-		LOG_ERROR_STATUS (status);
-	else {
-		ocfs_compute_dlm_stats (0, vote_response,
-					&(OcfsGlobalCtxt.net_reply_stats));
-		ocfs_compute_dlm_stats (0, vote_response,
-				       	&(osb->net_reply_stats));
-	}
-
-leave:
-	if (lockres_lock_held == READ_LOCK)
-		ocfs_release_lockres_read (inode);
-	else if (lockres_lock_held == WRITE_LOCK)
-		ocfs_release_lockres_write (inode);
-	lockres_lock_held = NO_LOCK;
-
-	if (!inode)
-		goto no_inode_leave;
-
-	if (inc_inode_seq) {
-		ocfs_inc_inode_seq(osb, inode);
-		sync_mapping_buffers(inode->i_mapping);
-	}
-	iput(inode);
-
-no_inode_leave:
-	osb->voting_ino = 0;
-
-	up(&osb->vote_sem);
-
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_process_vote */
-
-
-/* some lock requests need to be processed before a possible
- * change master. Beware however that the change_master might
- * very well send a no vote, so you can't do things here that
- * cannot be rolled back. */
-
-/* Returns: 1 if process_vote should vote immediately, 0 otherwise */
-
-static int ocfs_process_vote_pre_change_master(int vote_type, int flags, int *vote_response, struct inode *inode)
-{
-	if (vote_type == DELETE_ACQUIRE) {
-		LOG_TRACE_STR("DELETE_ACQUIRE (part one)");
-		if (!ocfs_process_inode_delete(inode)) {
-			*vote_response = FLAG_VOTE_OIN_ALREADY_INUSE;
-			return 1;
-		}
-		*vote_response = FLAG_VOTE_NODE;
-		return 0;
-	} 
-	if (vote_type == TRUNCATE_PAGES) {
-		LOG_TRACE_STR("TRUNCATE_PAGES");
-		*vote_response = FLAG_VOTE_NODE;
-		if (inode) {
-			if (ocfs_sync_inode(inode) < 0) {
-				LOG_ERROR_ARGS("sync inode failed for inode %lu!\n", inode->i_ino);
-				BUG();
-			}
-			ocfs_truncate_inode_pages(inode, 0);
-			spin_lock(&OCFS_I(inode)->ip_lock);
-
-			/* truncate may send this */
-			if (flags & FLAG_FILE_UPDATE_OIN)
-				atomic_set(&OCFS_I(inode)->ip_needs_verification, 1);
-			spin_unlock(&OCFS_I(inode)->ip_lock);
-
-			/* Do we need this? */
-			ocfs2_extent_map_drop(inode, 0);
-		}
-		return 0;
-	}
-		
-	*vote_response = 0;
-	if (vote_type == INVALID_REQUEST) {
-		/* we catch INVALID_REQUEST up here now as we
-		 * don't want to do a change_master on a
-		 * messed up vote... */
-		LOG_TRACE_STR("INVALID_REQUEST");
-		return 1;
-	}
-	return 0;
-}
-
-
-
-
-static int ocfs_lock_busy(ocfs_super *osb, struct inode *inode, ocfs_lock_res *lockres)
-{
-	/* requestor will need to retry if anyone is using the lockres */
-	if (lockres->lock_holders > 0) {
-		LOG_TRACE_PROCESS_VOTE("Lock id (%llu) has %u holders\n",
-		       OCFS_I(inode)->ip_blkno, lockres->lock_holders);
-		// kick the commit thread
-		atomic_set(&osb->flush_event_woken, 1);
-		wake_up(&osb->flush_event);
-
-		return 1;
-	}
-	return 0;
-}
-
-
-/* Returns: <0 if an I/O error occurred, 
- *           1 if process_vote should vote immediately, 
- *           0 if change master succeeded */
-
-static int ocfs_process_vote_change_master(ocfs_super *osb, int *vote_response, int *status, struct inode *inode, 
-				ocfs_lock_res *lockres, __s16 node_num, __u64 lock_id)
-{
-	struct buffer_head *fe_bh = NULL;
-
-	/* lockres is held with down_write throughout this call */ 
-	
-	LOG_TRACE_STR("CHANGE_MASTER");
-	LOG_TRACE_PROCESS_VOTE("doing CHANGE_MASTER for this request\n");
-
-	if (ocfs_lock_busy(osb, inode, lockres)) {
-		*vote_response = FLAG_VOTE_UPDATE_RETRY;
-		*status = 0;
-		return 1;
-	}
-
-	/* this is currently a readonly EX lock.
-	 * need to communicate to all the nodes in the 
-	 * map that lock will be changing to RW before we
-	 * continue.  RETRY this request while we spawn 
-	 * off a thread to collect up the communication */
-	if (!ocfs_node_map_is_empty(&lockres->readonly_map)) {
-		// assumption: node asking for vote has already dropped readonly
-		ocfs_node_map_clear_bit(&lockres->readonly_map, node_num);
-		// should not be in there, but...
-		ocfs_node_map_clear_bit(&lockres->readonly_map, osb->node_num);
-		if (!ocfs_node_map_is_empty(&lockres->readonly_map)) {
-			OCFS_ASSERT(test_bit(LOCK_STATE_READONLY, &lockres->readonly_state) &&
-				    lockres->master_node_num == osb->node_num);
-			OCFS_ASSERT(inode);
-			*status = ocfs_drop_readonly_cache_lock(osb, inode, 1);
-			if (*status < 0)
-				LOG_ERROR_STATUS(*status);
-			LOG_TRACE_PROCESS_VOTE("node map not empty on RO drop request\n");
-			*vote_response = FLAG_VOTE_UPDATE_RETRY;
-			// did not change master, send response
-			return 1;
-		}
-		// noone left in map, so continue
-		clear_bit(LOCK_STATE_READONLY, &lockres->readonly_state);
-	}
-
-	sync_mapping_buffers(inode->i_mapping);
-
-#warning do we need a truncate_inode_pages here?
-	if (lockres->master_node_num != osb->node_num) {
-		printk("are we giving away a lock we don't own!?! "
-		       "inode %llu\n", OCFS_I(inode)->ip_blkno);
-		BUG();
-	}
-
-	*status = ocfs_read_block(osb, lock_id, &fe_bh, OCFS_BH_CACHED, inode);
-	if (*status < 0) {
-		LOG_ERROR_STATUS ((*status));
-		return *status;
-	}
-	lockres->master_node_num = node_num;
-	lockres->lock_type = OCFS_LKM_NLMODE;
-	ocfs_update_disk_lock(osb, fe_bh, inode);
-	brelse(fe_bh);
-	*vote_response = FLAG_VOTE_NODE;
-	*status = 0;
-
-	// master successfully changed
-	return 0;
-}
-
-
-
-/* Returns:  1 if process_vote should vote immediately, 
- *           0 on success */
-
-/* we can't have any of these cases failing if the change master already succeeded */
-static int ocfs_process_vote_post_change_master(ocfs_super *osb, int vote_type, int flags, int *vote_response, struct inode *inode, ocfs_lock_res *lockres, int *status, __s16 node_num, int *inc_seq)
-{
-	switch (vote_type) {
-		case TRUNCATE_PAGES:
-		case CHANGE_MASTER:
-			/* we dealt with this all above. */
-			break;
-
-		case UPDATE_OIN_INODE:
-			LOG_TRACE_STR("UPDATE_OIN_INODE");
-			atomic_set(&OCFS_I(inode)->ip_needs_verification, 1);
-			*vote_response = FLAG_VOTE_OIN_UPDATED;
-			break;
-
-		case RELEASE_DENTRY:
-			OCFS_ASSERT(inode);
-
-			/* we always vote yes on this one. */
-			*vote_response = FLAG_VOTE_NODE;
-
-			/* do nothing in the release case... hmm,
-			 * perhaps we should just do a verify_update
-			 * or something in case the guy aborted... */
-			if (flags & FLAG_RELEASE_LOCK)
-				break;
-
-			d_prune_aliases (inode);
-
-			/* for rename, we don't drop link counts */
-			if (!(flags & FLAG_FILE_RENAME)) {
-				if (S_ISDIR(inode->i_mode))
-					inode->i_nlink = 0;
-				else
-					inode->i_nlink--;
-			}
-
-			LOG_TRACE_ARGS("pruned dentries for inode %lu, nlink = %u\n", 
-				       inode->i_ino, inode->i_nlink);
-			break;
-
-		case DELETE_ACQUIRE:
-			LOG_TRACE_STR("DELETE_ACQUIRE (part two)");
-			/* If we got this far, then we assume we've
-			 * done the 1st part of the DELETE_ACQUIRE
-			 * case and we just have to commit it. */
-			if (*vote_response != FLAG_VOTE_NODE)
-				BUG();
-
-			ocfs_commit_inode_delete(inode);
-			break;
-
-		case READONLY:
-			LOG_TRACE_STR("READONLY");
-			// WRITELOCK
-			OCFS_ASSERT(!(test_bit(LOCK_STATE_READONLY, &lockres->readonly_state)) ||
-				    lockres->master_node_num == osb->node_num);
-
-			if (ocfs_lock_busy(osb, inode, lockres)) {
-				*vote_response = FLAG_VOTE_UPDATE_RETRY;
-				*status = 0;
-				return 1;
-			}
-
-			// if the requestor just wants to do readonly, we 
-			// drop our buffers, so switch to readonly and done
-			sync_mapping_buffers(inode->i_mapping);
-
-			ocfs_node_map_set_bit(&lockres->readonly_map, node_num);
-			set_bit(LOCK_STATE_READONLY, &lockres->readonly_state);
-			*vote_response = FLAG_VOTE_NODE;
-			*status = 0;
-			break;
-
-		case DROP_READONLY:
-			/* TODO: may need locking in here to lock out 
-			 * the actual IO that a readdir may have in 
-			 * progress, if it's possible to have a corrupt 
-			 * readdir.  for now, skip it.
-			 * NOTE: can't just take io_sem because lock order
-			 * needs to be io_sem->lockres... would have to 
-			 * drop lockres, take io_sem, take lockres, then 
-			 * recheck all the conditions to see if still 
-			 * appropriate, then do the work and drop both.
-			 * seems like a lot of work.  almost as many lines
-			 * of code as there are lines of comments right here.
-			 */
-
-			/* this path should always succeed on the vote *
-			 * even in the error case.  do nothing for error. */	
-			
-			// WRITELOCK
-			if (lockres->master_node_num != node_num ||
-			    lockres->lock_type != OCFS_LKM_EXMODE ||
-			    !ocfs_node_map_is_empty(&lockres->readonly_map))
-				LOG_ERROR_ARGS("(drop-ro) master=%d node_num=%d locktype=%d readonly=%s\n",
-				       lockres->master_node_num, node_num, lockres->lock_type, 
-				       test_bit(LOCK_STATE_READONLY, &lockres->readonly_state) ? "yes" : "no");
-			else
-				clear_bit(LOCK_STATE_READONLY, &lockres->readonly_state);
-
-			*status = 0;
-			*vote_response = FLAG_VOTE_NODE;
-			*inc_seq = 1;
-			break;
-			
-		case NOT_MASTER:
-			LOG_TRACE_STR("NOT_MASTER");
-			*vote_response = FLAG_VOTE_UPDATE_RETRY;
-			if (flags & (FLAG_FILE_EXTEND|FLAG_FILE_TRUNCATE) &&
-			    lockres->master_node_num == node_num) {
-				LOG_TRACE_STR("owner is requesting extend/truncate");
-				*vote_response = FLAG_VOTE_NODE;
-			}
-			break;
-
-		case REMASTER_THIS:
-			LOG_TRACE_STR("REMASTER_THIS");
-			*vote_response = FLAG_VOTE_UPDATE_RETRY;
-			break;
-
-		case REMASTER_REQUESTOR:
-			LOG_TRACE_STR("REMASTER_REQUESTOR");
-			*vote_response = FLAG_VOTE_NODE;
-			break;
-
-		case INVALID_REQUEST:
-		default:
-			LOG_TRACE_STR("INVALID_REQUEST");
-			*vote_response = 0;
-			break;
-	}
-	return 0;
-}
-
-
-/* inode is definitely non NULL */
-void ocfs_inc_inode_seq(ocfs_super *osb, struct inode *inode)
-{
-	atomic_t *seq = GET_INODE_CLEAN_SEQ(inode);
-
-	LOG_TRACE_ARGS("incrementing inode seq... current is %d\n", 
-		       atomic_read(seq));
-	
-	/* wrap to ONE after 13 bits, will need a spinlock */
-	spin_lock (&osb->clean_buffer_lock);
-	if ((atomic_read(&osb->clean_buffer_seq)+1) % STATE_BIT_MAX == 0)
-		atomic_set(&osb->clean_buffer_seq, 1);
-	else
-		atomic_inc(&osb->clean_buffer_seq);
-	spin_unlock (&osb->clean_buffer_lock);
-
-	/* doesn't matter if this another process */
-	/* has already incremented the global seq */
-	atomic_set(seq, atomic_read(&osb->clean_buffer_seq));
-	
-	LOG_TRACE_ARGS("done incrementing inode seq... new is %d\n", 
-		       atomic_read(seq));
-}
-
-
-void ocfs_recover_oin_locks(ocfs_super *osb, __u32 node_num)
-{
-	struct list_head *iter, *temp;
-	struct inode *inode;
-	ocfs_inode_private *i;
-
-	LOG_ENTRY_ARGS("(node_num = %u)\n", node_num);
-
-start:
-	down(&recovery_list_sem);
-	list_for_each_safe (iter, temp, &osb->lock_recovery_lists[node_num]) {
-		i = list_entry (iter, ocfs_inode_private, ip_recovery_list);
-
-		inode = i->ip_inode;
-		spin_lock(&oin_num_ext_lock);
-		
-		if (OCFS_I(inode)->ip_num_extends) {
-			OCFS_I(inode)->ip_num_extends = 0;
-			list_del(&OCFS_I(inode)->ip_recovery_list);
-			INIT_LIST_HEAD(&OCFS_I(inode)->ip_recovery_list);
-			up_write(&OCFS_I(inode)->ip_node_extend_sem);
-
-			spin_unlock(&oin_num_ext_lock);
-			up (&recovery_list_sem);
-			iput(inode);
-			goto start;
-		} else
-			LOG_ERROR_STR("oin is in recovery list, but has zero extend counter value!");
-
-		spin_unlock(&oin_num_ext_lock);
-	}
-
-	up (&recovery_list_sem);
-
-	LOG_EXIT();
-}
-
-static int _ocfs_drop_readonly_cache_lock_thread(void *arg);
-
-/* inode is definitely non NULL */
-int ocfs_drop_readonly_cache_lock(ocfs_super *osb, struct inode *inode, int yield)
-{
-	ocfs_ro_cache_drop_ctxt *arg;
-	int status = 0;
-	ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
-					
-	if (test_bit(LOCK_STATE_READONLY_DROPPING, &lockres->readonly_state)) {
-		// if coming from process_vote, go about our merry way
-		if (yield)
-			return 0;
-		// if coming from acquire_lock, we are holding the
-		// lockres and the rodrop thread needs it.
-		// return -EAGAIN to drop and try again.
-		return -EAGAIN;
-	}
-
-	arg = kmalloc(sizeof(ocfs_ro_cache_drop_ctxt), GFP_KERNEL);
-	if (arg == NULL) 
-		return -ENOMEM;
-
-	atomic_inc(&inode->i_count);
-	arg->osb = osb;
-	arg->lockres = lockres;
-	arg->inode = inode;
-	arg->yield = yield;
-
-	if (yield)
-		kernel_thread(_ocfs_drop_readonly_cache_lock_thread, arg,
-		      CLONE_VM | CLONE_FS | CLONE_FILES);
-	else 
-		status = _ocfs_drop_readonly_cache_lock(arg);
-		
-	return status;
-}
-
-static int _ocfs_drop_readonly_cache_lock(void *arg)
-{
-	ocfs_ro_cache_drop_ctxt *ctxt = arg;
-	ocfs_super *osb = ctxt->osb;
-	ocfs_lock_res *lockres = ctxt->lockres;
-	struct inode *inode = ctxt->inode;
-	int status = 0;
-	int yield = ctxt->yield;
-
-	/* this will wait until process_vote gets to the release */
-	if (yield)
-		ocfs_acquire_lockres_write(inode);
-	/* check these under the lock */	
-	if (!(test_bit(LOCK_STATE_READONLY, &lockres->readonly_state)) ||
-	    lockres->master_node_num != osb->node_num ||
-	    lockres->lock_type != OCFS_LKM_EXMODE) {
-		LOG_ERROR_ARGS("inode %llu: bad RO lockres!  this=%d, readonly=%s, master=%d, locktype=%u\n", OCFS_I(inode)->ip_blkno,
-			       osb->node_num, 
-			       test_bit(LOCK_STATE_READONLY, &lockres->readonly_state) ? "yes" : "no", 
-			       lockres->master_node_num, lockres->lock_type);
-		status = -EINVAL;
-		goto leave;
-	}
-
-	if (test_bit(LOCK_STATE_READONLY_DROPPING, &lockres->readonly_state)) {
-		status = 0;
-		goto leave;
-	}
-
-	set_bit(LOCK_STATE_READONLY_DROPPING, &lockres->readonly_state);
-	/* remove this node */
-	ocfs_node_map_clear_bit(&lockres->readonly_map, osb->node_num);
-	
-	status = 0;
-	while (!ocfs_node_map_is_empty(&lockres->readonly_map)) {
-		/* remove all dead nodes */
-		ocfs_node_map_and(&lockres->readonly_map, &osb->publ_map);
-		status = new_lock_function(osb, OCFS_LKM_EXMODE, FLAG_DROP_READONLY, 
-					   NULL, inode);
-		if (status == -EAGAIN) {
-			status = 0;
-			if (yield) {
-				/* from nm thread, give some time to waiters */
-				ocfs_release_lockres_write(inode);
-				ocfs_sleep(50);
-				ocfs_acquire_lockres_write(inode);
-			}
-			continue;
-		}
-		if (status < 0)
-			LOG_ERROR_STATUS (status);
-		break;
-	}
-			
-	if (ocfs_node_map_is_empty(&lockres->readonly_map) &&	
-	    test_bit(LOCK_STATE_READONLY, &lockres->readonly_state) &&
-	    lockres->master_node_num == osb->node_num)
-		clear_bit(LOCK_STATE_READONLY, &lockres->readonly_state);
-
-	clear_bit(LOCK_STATE_READONLY_DROPPING, &lockres->readonly_state);
-
-leave:
-	if (yield)
-		ocfs_release_lockres_write(inode); // ocfs_process_vote ocfs_acquire_lock
-
-	if (inode)
-		iput(inode);
-	kfree(arg);
-
-	return status;
-}
-
-static int _ocfs_drop_readonly_cache_lock_thread(void *arg)
-{
-	int status = 0;
-	siginfo_t info;
-
-#define OCFS_DROP_RO_THREAD_NAME   "ocfs2dropro"
-
-	ocfs_daemonize (OCFS_DROP_RO_THREAD_NAME, 
-			strlen(OCFS_DROP_RO_THREAD_NAME),
-			0);
-	status = _ocfs_drop_readonly_cache_lock(arg);
-
-	/* ignore the actual signal */
-	if (signal_pending(current)) {
-		dequeue_signal_lock(current, &current->blocked, &info);
-	}
-
-	/* Flush all scheduled tasks */
-	flush_scheduled_work();
-
-#warning   need a way to wait on all of these threads on dismount
-/*
- * The way to do this is to create a wait queue on the osb.  When one of
- * these guys start, you bump a counter.  When it ends, it decrements
- * the counter and wake_up()s the wait queue.  The counter probably can
- * be protected by a spinlock on the OSB.  The dismount handling just
- * waits on that wait queue until readonly_threads == 0.
- */
-	return status;
-}

Deleted: trunk/src/nm.h
===================================================================
--- trunk/src/nm.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/nm.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -1,36 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * nm.h
- *
- * Function prototypes
- *
- * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef OCFS2_NM_H
-#define OCFS2_NM_H
-
-int ocfs_drop_readonly_cache_lock(ocfs_super *osb, struct inode *inode,
-				  int yield);
-void ocfs_inc_inode_seq(ocfs_super *osb, struct inode *inode);
-int ocfs_process_vote (ocfs_super * osb, ocfs_dlm_msg *dlm_msg);
-int ocfs_recv_thread(void *unused);
-void ocfs_recover_oin_locks(ocfs_super *osb, __u32 node_num);
-
-#endif /* OCFS2_NM_H */

Modified: trunk/src/ocfs.h
===================================================================
--- trunk/src/ocfs.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/ocfs.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -42,37 +42,12 @@
 # include <linux/tqueue.h>
 #endif
 
-enum
-{
-	OCFS_VOTE_REQUEST = 1,
-	OCFS_VOTE_REPLY,
-	OCFS_INFO_DISMOUNT
-};
+#include <cluster/util.h>
+#include <cluster/dlmcommon.h>
+#include <cluster/nodemanager.h>
+#include <cluster/tcp.h>
+#include <cluster/dlmmod.h>
 
-enum {
-	DISK_VOTE,
-	COMM_VOTE
-};
-
-enum {
-	INVALID_REQUEST,      // reply with a NO vote
-	UPDATE_OIN_INODE,     // update both oin and inode
-	DELETE_ACQUIRE,// delete or rename acquire request
-	DELETE_RELEASE,// delete or rename release request
-	CHANGE_MASTER,        // request to change master to requestor
-	NOT_MASTER,           // I am not master, retry
-	REMASTER_THIS,        // remaster lock to me
-	REMASTER_REQUESTOR,   // remaster lock to requestor
-	DROP_READONLY,	      // RO cachelock needs to convert to RW
-	READONLY,	      // a RW or RO cachelock, requesting RO
-	RELEASE_DENTRY,
-	TRUNCATE_PAGES		// truncate page caches of a file
-};
-
-#define  OCFS_MAX_DLM_PKT_SIZE			256
-#define  OCFS_DLM_MAX_MSG_SIZE			256
-#define  OCFS_DLM_MSG_MAGIC			0x79677083
-
 /* convenience macro */
 
 #define OCFS_ASSERT(x)             do { if (!(x)) BUG(); } while (0)
@@ -88,134 +63,14 @@
 #define OCFS_CURRENT_TIME               ocfs_get_seconds(CURRENT_TIME)
 #define OCFS_SET_INODE_TIME(i, x, y)    (ocfs_get_seconds(i->x) = (y))
 
-
-#define  MISS_COUNT_WARNING        20
-#define  MISS_COUNT_EMERGENCY      40
-#define  MISS_COUNT_NODE_DEAD      60
-
-/*
-** The following flag values reflect the operation to be performed
-**   by ocfs_create_modify_file
-*/
-// FILEFLAG MASK
-#define  FLAG_ACQUIRE_LOCK        0x00000001
-#define  FLAG_RELEASE_LOCK        0x00000002
-#define  FLAG_FILE_EXTEND         0x00000004
-#define  FLAG_FILE_DELETE         0x00000008
-#define  FLAG_FILE_RENAME         0x00000010
-#define  FLAG_FILE_RECOVERY       0x00000020
-#define  FLAG_FILE_UPDATE_OIN     0x00000040
-#define  FLAG_RELEASE_DENTRY      0x00000080
-#define  FLAG_CHANGE_MASTER       0x00000100
-#define  FLAG_DIR                 0x00000200
-#define  FLAG_REMASTER            0x00000400
-#define  FLAG_FAST_PATH_LOCK      0x00000800
-#define  FLAG_TRUNCATE_PAGES      0x00001000
-#define  FLAG_FILE_TRUNCATE       0x00002000
-#define  FLAG_DROP_READONLY       0x00004000
-#define  FLAG_READONLY            0x00008000
-#define  FLAG_FILE_UNUSED01       0x00010000
-#define  FLAG_FILE_UNUSED02       0x00020000
-#define  FLAG_FILE_UNUSED03       0x00040000
-#define  FLAG_FILE_UNUSED04       0x00080000
-#define  FLAG_FILE_UNUSED05       0x00100000
-#define  FLAG_FILE_UNUSED06       0x00200000
-#define  FLAG_FILE_UNUSED07       0x00400000
-#define  FLAG_FILE_UNUSED08       0x00800000
-#define  FLAG_FILE_UNUSED09       0x01000000
-#define  FLAG_FILE_UNUSED10       0x02000000
-#define  FLAG_FILE_UNUSED11       0x04000000
-#define  FLAG_FILE_UNUSED12       0x08000000
-#define  FLAG_FILE_UNUSED13       0x10000000
-#define  FLAG_FILE_UNUSED14       0x20000000
-#define  FLAG_FILE_UNUSED15       0x40000000
-#define  FLAG_FILE_UNUSED16       0x80000000
-
 #define  OCFS_MAX_OSB_ID             65536
 
-
-#define  HEARTBEAT_METHOD_DISK       (1)
-#define  HEARTBEAT_METHOD_IPC        (2)
-
-
-enum
-{
-	LEFT_NO_OVERLAP,
-	LEFT_ADJACENT,
-	LEFT_OVERLAP,
-	FULLY_CONTAINED,
-	FULLY_CONTAINING,
-	RIGHT_OVERLAP,
-	RIGHT_ADJACENT,
-	RIGHT_NO_OVERLAP
-};
-
-
-/*
-** Extents Defines
-*/
-
-typedef enum _ocfs_ext_flag {
-	LOCAL_EXT = 1,
-	NONLOCAL_EXT = 2
-} ocfs_ext_flag;
-
-/* The following are standard DLM lock types, of which we currently
- * only use a couple. */
-#define OCFS_LKM_NLMODE      (0)               /* null lock */
-#define OCFS_LKM_CRMODE      (1)               /* concurrent read */
-#define OCFS_LKM_CWMODE      (2)               /* concurrent write */
-#define OCFS_LKM_PRMODE      (3)               /* protected read */
-#define OCFS_LKM_PWMODE      (4)               /* protected write */
-#define OCFS_LKM_EXMODE      (5)               /* exclusive */
-
 #define  OCFS_INVALID_NODE_NUM         -1
 
-/* lockres->lock_state bits */
-enum {
-	LOCK_STATE_READONLY,
-	LOCK_STATE_READONLY_DROPPING,
-	LOCK_STATE_BLOCK_EXCLUSIVE,
-	LOCK_STATE_BLOCK_READONLY
-};
-
-enum {
-	NO_LOCK=0,
-	READ_LOCK,
-	WRITE_LOCK
-};
-
-
-
-/* osb->osb_flags flags */
-#define  OCFS_OSB_FLAGS_BEING_DISMOUNTED  (0x00000004)
-#define  OCFS_OSB_FLAGS_SHUTDOWN          (0x00000008)
-#define  OCFS_OSB_FLAGS_INITIALIZED       (0x00000020)
-
 /* OcfsGlobalCtxt.flags flags */
 #define  OCFS_FLAG_GLBL_CTXT_RESOURCE_INITIALIZED (0x00000001)
 #define  OCFS_FLAG_MEM_LISTS_INITIALIZED          (0x00000002)
-#define  OCFS_FLAG_SHUTDOWN_VOL_THREAD            (0x00000004)
 
-/*
-** Information on Publish sector of each node
-*/
-#define  DISK_HBEAT_COMM_ON           20	/* in the order of 5 secs */
-#define  DISK_HBEAT_NO_COMM           4		/* in the order of 1 sec */
-#define  DISK_HBEAT_INVALID           0		/* in the order of 100ms */
-
-
-/*
-** Information on Vote sector of each node
-*/
-// VOTEFLAG MASK
-#define  FLAG_VOTE_NODE               0x1
-#define  FLAG_VOTE_OIN_UPDATED        0x2
-#define  FLAG_VOTE_OIN_ALREADY_INUSE  0x4
-#define  FLAG_VOTE_UPDATE_RETRY       0x8
-#define  FLAG_VOTE_FILE_DEL           0x10
-
-
 #define SHUTDOWN_SIGS   (sigmask(SIGKILL) | sigmask(SIGHUP) | \
 			 sigmask(SIGINT) | sigmask(SIGQUIT))
 
@@ -223,21 +78,12 @@
 
 #define OCFS_LINUX_MAX_FILE_SIZE   9223372036854775807LL
 
-#define OCFS_VOLCFG_LOCK_ITERATE	(HZ/10)	/* in jiffies */
-#define OCFS_VOLCFG_LOCK_TIME		1000    /* in ms */
-#define OCFS_VOLCFG_HDR_SECTORS		2	/* in sectors */
-#define OCFS_VOLCFG_NEWCFG_SECTORS	4	/* in sectors */
-
-#define OCFS_NM_HEARTBEAT_TIME		500	/* in ms */
-#define OCFS_HEARTBEAT_INIT             10      /* number of NM iterations to stabilize the publish map */
-	
 #ifndef O_DIRECT
 #warning this depends on the architecture!
 #define O_DIRECT        040000
 #endif
 
 
-/* sm - ocfs 1.0 fails to set fe->sig for dirs */
 #define  IS_VALID_FILE_ENTRY(ptr)     \
 	(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
 #define  IS_VALID_EXTENT_BLOCK(ptr)  \
@@ -246,27 +92,9 @@
 	(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
 
 /*
-** Macros
-*/
-#define  OCFS_SET_FLAG(flag, value)    ((flag) |= (value))
-#define  OCFS_CLEAR_FLAG(flag, value)  ((flag) &= ~(value))
-
-
-/*
 ** Structures...
 */
 
-
-static inline void *ocfs_malloc(size_t size)
-{
-	void *p = kmalloc(size, GFP_KERNEL);
-	if (p)
-		memset(p, 0, size);
-	return p;
-}
-
-
-
 #define  OCFS_NAME              "OCFS2"
 
 /* ioctl commands */
@@ -276,28 +104,10 @@
 #define OCFS_SB(sb)	    ((ocfs_super *)OCFS_GENERIC_SB_MEMBER(sb))
 #define OCFS2_SB(sb)	    ((ocfs_super *)OCFS_GENERIC_SB_MEMBER(sb))
 
-#define OCFS_IPC_DEFAULT_PORT   7001
-
-		
-#define OCFS_IPC_DLM_VERSION    0x0201
-
-
-/* =========================================================== */
-
 /* This totally sucks that we have to include these here
  * FIXME: Make them seperately includable. */
 #include "ocfs2_fs.h"
-#include "ocfs2_disk_dlm.h"
 
-typedef struct _BARF_BARF_BARF
-{
-	char node_name[MAX_NODE_NAME_LENGTH];
-	ocfs_guid guid;
-	ocfs_ipc_config_info ipc_config;
-}
-BARF_BARF_BARF;
-
-
 typedef struct _ocfs_super ocfs_super;
 typedef struct _ocfs_lock_res ocfs_lock_res;
 
@@ -310,17 +120,6 @@
 	unsigned long map[BITS_TO_LONGS(OCFS_NODE_MAP_MAX_NODES)];
 } ocfs_node_map;
 
-struct _ocfs_lock_res
-{
-	__s16 master_node_num;	/* Master Node */
-	__u32 lock_holders;
-	__u32 uncommitted_holders;
-	__u8 lock_type;
-	struct rw_semaphore lock;
-	unsigned long readonly_state;
-	ocfs_node_map readonly_map;
-};
-
 struct _ocfs_journal_handle;
 
 /* I hate our includes */
@@ -329,6 +128,65 @@
 	struct rb_root	em_extents;
 };
 
+enum ocfs2_ast_action {
+	OCFS2_AST_INVALID = 0,
+	OCFS2_AST_ATTACH,
+	OCFS2_AST_CONVERT,
+	OCFS2_AST_DOWNCONVERT,
+};
+
+/* actions for an unlockast function to take. */
+enum ocfs2_unlock_action {
+	OCFS2_UNLOCK_INVALID = 0,
+	OCFS2_UNLOCK_CANCEL_CONVERT,
+	OCFS2_UNLOCK_DROP_LOCK,
+};
+
+enum ocfs2_lock_type {
+	OCFS_TYPE_META = 0,
+	OCFS_TYPE_DATA,
+	OCFS_TYPE_SUPER,
+	OCFS_NUM_LOCK_TYPES
+};
+
+/* ocfs2_lock_res->l_flags flags. */
+#define OCFS2_LOCK_ATTACHED      (0x00000001) /* have we initialized
+					       * the lvb */
+#define OCFS2_LOCK_BUSY          (0x00000002) /* we are currently in
+					       * dlm_lock */
+#define OCFS2_LOCK_BLOCKED       (0x00000004) /* blocked waiting to
+					       * downconvert*/
+#define OCFS2_LOCK_LOCAL         (0x00000008) /* newly created inode */
+#define OCFS2_LOCK_NEEDS_REFRESH (0x00000010)
+#define OCFS2_LOCK_REFRESHING    (0x00000020)
+
+struct ocfs2_lock_res_ops;
+
+typedef struct _ocfs2_lock_res {
+	void                    *l_priv;
+	struct ocfs2_lock_res_ops *l_ops;
+	spinlock_t               l_lock;
+
+	struct list_head         l_blocked_list;
+
+	enum ocfs2_lock_type     l_type;
+	int                      l_flags;
+	char                    *l_name;
+	int                      l_level;
+	unsigned int             l_ro_holders;
+	unsigned int             l_ex_holders;
+	dlm_lockstatus           l_lksb;
+	u32                      l_local_seq;
+
+	/* used from AST/BAST funcs. */
+	enum ocfs2_ast_action    l_action;
+	enum ocfs2_unlock_action l_unlock_action;
+	int                      l_requested;
+	int                      l_blocking;
+
+	wait_queue_head_t l_event;
+} ocfs2_lock_res;
+
 /* OCFS2 Inode Private Data */
 typedef struct _ocfs_inode_private
 {
@@ -338,19 +196,19 @@
 
 	u64             ip_blkno;
 
+	ocfs2_lock_res  ip_meta_lockres;
+	ocfs2_lock_res  ip_data_lockres;
+
 	/* protects allocation changes on this inode. */
 	struct rw_semaphore  ip_alloc_sem;
 
 	/* These fields are protected by ip_lock */
 	spinlock_t	  ip_lock;
-	__u32             ip_open_cnt;
+	u32             ip_open_cnt;
 	u32		ip_clusters;
 	u64		ip_mmu_private;
-	__u32             ip_open_flags;
 	struct ocfs2_extent_map ip_map;
 
-	atomic_t          ip_needs_verification;
-
 	struct semaphore  ip_io_sem;
 
 	/* Used by the journalling code to attach an inode to a
@@ -360,40 +218,21 @@
 	struct list_head            ip_handle_list;
 	struct _ocfs_journal_handle *ip_handle;
 
-	/* ip_node_extend_sem locks out extends on behalf of other nodes. */
-	struct rw_semaphore  ip_node_extend_sem;
-
-	struct list_head  ip_recovery_list;/* protected by recovery_list_sem */
-	__s32             ip_num_extends; /* protected by oin_num_ext_lock */
-
 	atomic_t          ip_clean_buffer_seq;
-	__u32             ip_flags; /* see below */
+	u32             ip_flags; /* see below */
 
 	/* protected by recovery_lock. */
 	struct inode      *ip_next_orphan;
 
-	ocfs_lock_res     ip_lockres;
-	__u32 		  ip_dir_start_lookup;
+	u32 		  ip_dir_start_lookup;
 
-	/* ip_pending_locks and ip_j_inode are protected by the
-	 * journals cmt_lock.  
-	 * ip_pending_locks: disk locks for this inode which have to be
-	 * released once  their transaction checkpoints
- 	 * ip_j_inode: list_head for journal->committing_inodes. */
-	struct list_head  ip_pending_locks;
-	struct list_head  ip_j_inode;
-
-	/* protected by trans_inc_lock, which transaction were we
-	 * created on? Zero if none. */
+	/* next two are protected by trans_inc_lock */
+	/* which transaction were we created on? Zero if none. */
 	unsigned long               ip_created_trans;
+	/* last transaction we were a part of. */
+	unsigned long               ip_last_trans;
 } ocfs_inode_private;
 
-/* Eventually, the 'flags' and 'open_flags' fields need to be
- * merged. */
-/* open flags */
-#define  OCFS_OIN_OPEN_FOR_DIRECTIO              (0x00000001)
-#define  OCFS_IN_FIRST_OPEN                      (0x00000002)
-
 /* 'flags' flags. */
 /* has this inode been deleted, either from this node or from another node. */
 #define OCFS_INODE_DELETED          0x00000001
@@ -405,6 +244,9 @@
 #define OCFS_INODE_SYSTEM_FILE      0x00000008
 /* are we going to let another node deal with deletion of this inode? */
 #define OCFS_INODE_SKIP_DELETE      0x00000010
+#define OCFS_INODE_IN_REFRESH       0x00000020
+#define OCFS_INODE_BITMAP           0x00000040
+#define OCFS_INODE_OPEN_DIRECT      0x00000080
 
 #define OCFS_I(i)        ((ocfs_inode_private *)(i->u.generic_ip))
 
@@ -430,14 +272,6 @@
 }
 ocfs_vol_state;
 
-typedef struct _ocfs_vol_node_map
-{
-	__u64 time;
-	__u32 miss_cnt;
-	atomic_t dismount;
-}
-ocfs_vol_node_map;
-
 typedef struct _ocfs_commit_task
 {
 	struct completion c_complete;
@@ -469,6 +303,7 @@
 } ocfs_alloc_stats;
 
 struct _ocfs_journal;
+struct _ocfs2_slot_info;
 
 /*
  * ocfs_super
@@ -477,34 +312,27 @@
  */
 struct _ocfs_super
 {
-	struct semaphore osb_res; /* resource to protect the ocfs_super */
 	struct list_head osb_next;	/* list of ocfs_super(s) */
 	__u32 osb_id;		/* id used by the proc interface */
-	struct completion dlm_complete;
-	struct task_struct *dlm_task;
 	ocfs_commit_task *commit;
-	__u32 osb_flags;
-	ocfs_node_map publ_map;
 	struct super_block *sb;
 	struct inode *root_inode;
 	struct inode *sys_root_inode;
 	struct inode *system_inodes[NUM_SYSTEM_INODES];
-	
+
+	struct _ocfs2_slot_info *slot_info;
+
+	spinlock_t node_map_lock;
+	ocfs_node_map mounted_map;
+	ocfs_node_map recovery_map;
+	ocfs_node_map umount_map;
+
 	/* new */
 	u32 num_clusters;
 	u64 root_blkno;
 	u64 system_dir_blkno;
 	u64 bitmap_blkno;
 	u32 bitmap_cpg;
-	u64 publish_blkno;
-	u32 publish_blocks;
-	u64 vote_blkno;
-	u32 vote_blocks;
-	u64 autoconfig_blkno;
-	u32 autoconfig_blocks;
-	u64 new_autoconfig_blkno;
-	u32 new_autoconfig_blocks;
-	u32 total_autoconfig_blocks;
 	u8 *uuid;
 	u8 *vol_label;
 	u64 first_cluster_group_blkno;
@@ -517,73 +345,69 @@
 	spinlock_t s_next_gen_lock;
 	u32 s_next_generation;
 
-	ocfs_vol_node_map *vol_node_map;
-	struct semaphore cfg_lock;
-	BARF_BARF_BARF **node_cfg_info;
-	__u64 cfg_seq_num;
-	int cfg_initialized;
 	u16 max_nodes;
-	u16 num_cfg_nodes;
 	u16 num_nodes;
 	s16 node_num;
+	s16 slot_num;
 	int reclaim_id;		/* reclaim the original node number*/
-	__u32 hbt;
 	int s_sectsize_bits;
 	int s_clustersize;
 	int s_clustersize_bits;
-	int needs_flush;
 	struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */
 
 	atomic_t vol_state;
-	struct semaphore orphan_recovery_lock;
 	struct semaphore recovery_lock;
-	spinlock_t recovery_map_lock;
-	ocfs_node_map recovery_map;
+	int recovery_launched;
 	int disable_recovery;
-	atomic_t num_recovery_threads;
-	struct timer_list lock_timer;
-	atomic_t lock_stop;
-	wait_queue_head_t lock_event;
-	atomic_t lock_event_woken;
-	struct semaphore comm_lock;	/* protects ocfs_comm_process_vote_reply */
-	atomic_t nm_init;
-	wait_queue_head_t nm_init_event;
-	__u32 prealloc_lock;
-	struct buffer_head **autoconfig_bhs;
-	struct semaphore publish_lock;  /* protects r/w to publish sector */
-	atomic_t node_req_vote;         /* set when node's vote req pending */
-	int publish_dirty;
-	wait_queue_head_t flush_event;
-	atomic_t flush_event_woken;
+	wait_queue_head_t checkpoint_event;
+	atomic_t needs_checkpoint;
 	struct _ocfs_journal *journal;
 	atomic_t clean_buffer_seq;
 	spinlock_t clean_buffer_lock;
-	struct list_head *lock_recovery_lists;
-	__u64 *last_publ_seq_num;
+
 	int have_local_alloc;
 	struct buffer_head *local_alloc_bh;
-	__u8 check_mounted; /* tell nm to check mounted flag, protected by publish_lock*/
+
+	/* Next two fields are for local node slot recovery during
+	 * mount. */
+	int dirty;
+	ocfs2_dinode *local_alloc_copy;
+
 	ocfs_dlm_stats net_reqst_stats;	/* stats of netdlm vote requests */
 	ocfs_dlm_stats net_reply_stats;	/* stats of netdlm vote reponses */
 	ocfs_alloc_stats alloc_stats;
 	char dev_str[20];		/* "major,minor" of the device */
-	struct semaphore vote_sem; /* protects calls to ocfs_process_vote */
-	struct list_head vote_obj_queue;
-	spinlock_t vote_obj_queue_lock;
-	unsigned long voting_ino; /* only safe from the process_vote pid */
-	wait_queue_head_t open_event;
-};
 
-typedef struct _ocfs_comm_info
-{
-	u16 ip_version;			/* IP version in NBO */
-	u16 ip_port;			/* IP port in NBO */
-	union {
-		__u32 ip_addr4;		/* IPv4 address in NBO */
-		__u32 ip_addr6[4];	/* IPv6 address in NBO */
-	} addr_u;
-} ocfs_comm_info;
+	char *group_name;
+	struct inode *group_inode;
+	dlm_ctxt *dlm;
+	ocfs2_lock_res super_lockres;
 
+	wait_queue_head_t recovery_event;
+
+	spinlock_t vote_task_lock;
+	struct task_struct *vote_task;
+	wait_queue_head_t vote_event;
+	atomic_t wake_vote_task;
+	int vote_exit;
+
+	struct list_head blocked_lock_list;
+	unsigned long blocked_lock_count;
+
+	struct list_head vote_list;
+	int vote_count;
+
+	struct completion vote_event_complete;
+	struct completion vote_event_init;
+
+	u32 net_key;
+	char *net_vote_buf;
+	char *net_response_buf;
+	spinlock_t net_response_lock;
+	unsigned int net_response_ids;
+	struct list_head net_response_list;
+};
+
 typedef struct _ocfs_global_ctxt
 {
 	struct semaphore global_res;
@@ -591,11 +415,8 @@
 	kmem_cache_t *inode_cache;
 	kmem_cache_t *lock_cache;
 	__u32 flags;
-	__s16 pref_node_num;		/* preferred... osb has the real one */
-	ocfs_guid guid;			/* uniquely identifies a node */
 	char *node_name;		/* human readable node identification */
 	char *cluster_name;		/* unused */
-	ocfs_comm_info comm_info;	/* ip address, etc for listener */
 	int comm_info_read;		/* ipc info loaded from config file */
 	spinlock_t comm_seq_lock;	/* protects comm_seq_num */
 	__u64 comm_seq_num;		/* local node seq num used in ipcdlm */
@@ -605,160 +426,11 @@
 }
 ocfs_global_ctxt;
 
-typedef struct _ocfs_ipc_ctxt
-{
-	__u32 dlm_msg_size;
-	__u16 version;
-	int init;
-	struct socket *send_sock;
-	struct socket *recv_sock;
-	struct completion complete;
-	struct task_struct *task;
-}
-ocfs_ipc_ctxt;
-
-
-extern ocfs_ipc_ctxt OcfsIpcCtxt;
-
-typedef struct _ocfs_ipc_dlm_config
-{
-	__u16 version;
-	__u32 msg_size;
-	__u32 num_recv_threads;
-}
-ocfs_ipc_dlm_config;
-
 /*
 ** Globals ...
 */
 extern ocfs_global_ctxt OcfsGlobalCtxt;
 
-
-/*
- * DLM network stuff
- */
-typedef struct _ocfs_dlm_msg_hdr
-{
-	__u64 lock_id;
-	__u64 lock_seq_num;
-	__u32 flags;
-	__u8 odmh_pad[4];
-	__u32 num_ident; /* number of identical messages, always >= 1 */
-} ocfs_dlm_msg_hdr;
-
-typedef ocfs_dlm_msg_hdr ocfs_dlm_req_master;
-
-typedef struct _ocfs_dlm_reply_master
-{
-	ocfs_dlm_msg_hdr h;
-	__u32 status;
-}
-ocfs_dlm_reply_master;
-
-typedef struct _ocfs_dlm_msg
-{
-	__u32 magic;
-	__u32 msg_len;
-	__u8 vol_id[MAX_VOL_ID_LENGTH];
-	__s16 src_node;
-	__s16 dst_node;
-	__u32 msg_type;
-	__u32 check_sum;
-	__u8 msg_buf[0];
-} ocfs_dlm_msg;
-
-typedef struct _ocfs_vote_obj
-{
-	struct list_head list;
-	wait_queue_head_t voted_event;
-	atomic_t voted_event_woken;
-	atomic_t refcount;
-	spinlock_t lock;
-	__u32 vote_state;
-	__u32 req_lock_type;
-	int vote_status;
-	ocfs_node_map req_vote_map;
-	ocfs_node_map got_vote_map;
-	//ocfs_node_map tmp_openmap;
-	__u64 seq_num;
-	pid_t pid;
-	ocfs_dlm_msg m;
-} ocfs_vote_obj;
-
-enum {
-	VOTE_OBJ_STATE_UNSENT,
-	VOTE_OBJ_STATE_SENT,
-	VOTE_OBJ_STATE_PARTIAL_REPLY,
-	VOTE_OBJ_STATE_FULL_REPLY,
-	VOTE_OBJ_STATE_DESTROYING
-};
-
-	
-
-typedef struct _ocfs_vote_obj_lookup_data ocfs_vote_obj_lookup_data;
-
-struct _ocfs_vote_obj_lookup_data
-{
-	union {
-		struct {
-			__u64 seq_num;
-			__u64 lock_id;
-		} s;
-		struct {
-			char *page;
-			int *len;
-			int max;
-		} proc;
-	} u;
-	int (*func) (ocfs_vote_obj *obj, struct _ocfs_vote_obj_lookup_data *data);
-	ocfs_vote_obj **ret;
-};
-
-
-
-
-typedef struct _ocfs_recv_ctxt
-{
-	__s32 msg_len;
-	__u8 msg[OCFS_MAX_DLM_PKT_SIZE];
-	int status;
-	struct work_struct ipc_wq;
-}
-ocfs_recv_ctxt;
-
-typedef struct _ocfs_cfg_task
-{
-	struct work_struct cfg_wq;
-	ocfs_super *osb;
-	__u64 lock_off;
-	__u8 *buffer;
-	struct buffer_head *bh;
-}
-ocfs_cfg_task;
-
-typedef enum _ocfs_volcfg_op
-{
-	OCFS_VOLCFG_ADD,
-	OCFS_VOLCFG_UPD
-}
-ocfs_volcfg_op;
-
-typedef struct _ocfs_vote_request_ctxt
-{
-	__s16 node_num;
-	int status;
-	ocfs_dlm_msg *dlm_msg;
-} ocfs_vote_request_ctxt;
-
-typedef struct _ocfs_vote_reply_ctxt
-{
-	int reply_method;
-	int *status;
-	ocfs_node_map *got_vote_map;
-	__u32 flags;
-	ocfs_dlm_reply_master *reply;
-} ocfs_vote_reply_ctxt;
-
 struct ocfs_ioc
 {
 	char name[255];		/* "OCFS" */
@@ -767,114 +439,11 @@
 	char nodename[255];	/* node name */
 };
 
-/* timeout structure taken from Ben's aio.c */
-typedef struct _ocfs_timeout {
-	struct timer_list	timer;
-	int			timed_out;
-	wait_queue_head_t	wait;
-} ocfs_timeout;
-
 #define NAMEI_RA_CHUNKS  2
 #define NAMEI_RA_BLOCKS  4
 #define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
 #define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
 
-#define __ocfs_wait(wq, condition, timeo, ret)			\
-do {								\
-	ocfs_timeout __to;					\
-								\
-	DECLARE_WAITQUEUE(__wait, current);			\
-	DECLARE_WAITQUEUE(__to_wait, current);			\
-								\
-	ocfs_init_timeout(&__to);				\
-								\
-	if (timeo) {						\
-		ocfs_set_timeout(&__to, timeo);			\
-		if (__to.timed_out) {				\
-			ocfs_clear_timeout(&__to);		\
-		}						\
-	}							\
-								\
-	add_wait_queue(&wq, &__wait);				\
-	add_wait_queue(&__to.wait, &__to_wait);			\
-	do {							\
-		ret = 0;					\
-		set_current_state(TASK_INTERRUPTIBLE);		\
-		if (condition)					\
-			break;					\
-		ret = -ETIMEDOUT;				\
-		if (__to.timed_out)				\
-			break;					\
-		schedule();					\
-		if (signal_pending(current)) {			\
-			ret = -EINTR;				\
-			break;					\
-		}						\
-	} while (1);						\
-								\
-	set_current_state(TASK_RUNNING);			\
-	remove_wait_queue(&wq, &__wait);			\
-	remove_wait_queue(&__to.wait, &__to_wait);		\
-								\
-	if (timeo)						\
-		ocfs_clear_timeout(&__to);			\
-								\
-} while(0)
-
-#define __ocfs_wait_uninterruptible(wq, condition, timeo, ret)	\
-do {								\
-	ocfs_timeout __to;					\
-								\
-	DECLARE_WAITQUEUE(__wait, current);			\
-	DECLARE_WAITQUEUE(__to_wait, current);			\
-								\
-	ocfs_init_timeout(&__to);				\
-								\
-	if (timeo) {						\
-		ocfs_set_timeout(&__to, timeo);			\
-		if (__to.timed_out) {				\
-			ocfs_clear_timeout(&__to);		\
-		}						\
-	}							\
-								\
-	add_wait_queue(&wq, &__wait);				\
-	add_wait_queue(&__to.wait, &__to_wait);			\
-	do {							\
-		ret = 0;					\
-		set_current_state(TASK_UNINTERRUPTIBLE);	\
-		if (condition)					\
-			break;					\
-		ret = -ETIMEDOUT;				\
-		if (__to.timed_out)				\
-			break;					\
-		schedule();					\
-	} while (1);						\
-								\
-	set_current_state(TASK_RUNNING);			\
-	remove_wait_queue(&wq, &__wait);			\
-	remove_wait_queue(&__to.wait, &__to_wait);		\
-								\
-	if (timeo)						\
-		ocfs_clear_timeout(&__to);			\
-								\
-} while(0)
-
-#define ocfs_wait(wq, condition, timeout)			\
-({								\
-	int __ret = 0;						\
-	if (!(condition))					\
-		__ocfs_wait(wq, condition, timeout, __ret);	\
-	__ret;							\
-})
-
-#define ocfs_wait_uninterruptible(wq, condition, timeout)		      \
-({									      \
-	int __ret = 0;							      \
-	if (!(condition))						      \
-		__ocfs_wait_uninterruptible(wq, condition, timeout, __ret);   \
-	__ret;								      \
-})
-
 static inline unsigned long ino_from_blkno(struct super_block *sb,
 					   u64 blkno)
 {
@@ -900,15 +469,6 @@
 	return -EINVAL;
 }
 
-static inline int ocfs_is_local_cache_lock(ocfs_super *osb, struct inode *inode)
-{
-	ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
-	if (lockres->lock_type == OCFS_LKM_EXMODE &&
-	    lockres->master_node_num == osb->node_num)
-		return 1;
-	return 0;
-}
-
 typedef struct _ocfs_journal_handle ocfs_journal_handle;
 
 #endif /* !OCFS_H */

Modified: trunk/src/ocfs1_fs_compat.h
===================================================================
--- trunk/src/ocfs1_fs_compat.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/ocfs1_fs_compat.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -33,6 +33,7 @@
 #define MAX_VOL_ID_LENGTH_V1               16
 #define MAX_VOL_LABEL_LEN_V1               64
 #define MAX_CLUSTER_NAME_LEN_V1            64
+#define MAX_NODE_NAME_LENGTH	32
 
 #define OCFS1_MAJOR_VERSION              (2)
 #define OCFS1_MINOR_VERSION              (0)

Modified: trunk/src/ocfs2.h
===================================================================
--- trunk/src/ocfs2.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/ocfs2.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -26,6 +26,8 @@
 #ifndef OCFS2_H
 #define OCFS2_H
 
+#define OCFS2_MAX_NODE_NAME_LENGTH	65
+
 static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb,
 					   u32 clusters)
 {

Deleted: trunk/src/ocfs2_disk_dlm.h
===================================================================
--- trunk/src/ocfs2_disk_dlm.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/ocfs2_disk_dlm.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -1,130 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ocfs2_disk_dlm.h
- *
- * On-disk structures involved in disk publish/vote for OCFS2.
- *
- * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License, version 2,  as published by the Free Software Foundation.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef _OCFS2_DISK_DLM_H
-#define _OCFS2_DISK_DLM_H
-
-/*
- * On-disk IPC configuration for an OCFS2 node.
- */
-typedef struct _ocfs_ipc_config_info
-{
-/*00*/	__u16 ip_version;		/* IP version in NBO */
-	__u16 ip_port;			/* IP port in NBO */
-	__u32 ip_reserved1;
-	__u64 ip_reserved2;
-/*10*/	union {
-		__u32 ip_addr4;		/* IPv4 address in NBO */
-		__u32 ip_addr6[4];	/* IPv6 address in NBO */
-	} addr_u;
-/*20*/
-} ocfs_ipc_config_info;
-
-/*
- * On-disk structure representing a Global Unique ID for an OCFS2 node.
- *
- * The GUID has two parts.  The host_id is a generally-randomly-unique
- * hex-as-ascii string of 20 characters (10 bytes).  The mad_id field
- * is, unsurprisingly, the MAC address of the network card that the
- * IPC mechanism will be using (the address in
- * ocfs_ipc_config_info.addr_u).  This should (ha-ha) provide a unique
- * identifier for a node in the OCFS2 cluster.  It has the added
- * benefit of detecting when a node has changed network cards
- * (host_id is the same, mac_id has changed) or when an identical
- * mac address is on a different mode (the converse).
- */
-typedef union _ocfs_guid
-{
-/*00*/	struct
-	{
-		char host_id[OCFS2_GUID_HOSTID_LEN];
-		char mac_id[OCFS2_GUID_MACID_LEN];
-	} id;
-	__u8 guid[OCFS2_GUID_LEN];
-/*20*/
-} ocfs_guid;
-
-/*
- * On-disk configuration information for an OCFS2 node.  A node
- * populates its own info for other nodes to read and use.
- */
-typedef struct _ocfs_node_config_info
-{
-/*00*/	ocfs2_disk_lock disk_lock;		/* Lock on the info */
-/*30*/	ocfs_guid guid;				/* GUID */
-/*50*/	ocfs_ipc_config_info ipc_config;	/* IPC info */
-/*70*/	__u8 node_name[MAX_NODE_NAME_LENGTH+1]; /* Name */
-/*91*/	__u8 name_pad[7];			/* Pad to align (UGH) */
-/*98*/
-} ocfs_node_config_info;
-
-/*
- * On-disk ... for OCFS2.  FIXME this description.
- */
-typedef struct _ocfs_node_config_hdr
-{
-/*00*/	ocfs2_disk_lock disk_lock;
-/*30*/	__u8 signature[OCFS2_NODE_CONFIG_SIGN_LEN];
-	__u32 version;
-	__u16 num_nodes;
-	__u16 reserved1;
-/*40*/	__u32 last_node;
-	__u32 onch_pad;
-	__u64 cfg_seq_num;
-/*50*/	
-} ocfs_node_config_hdr;
-
-/*
- * On-disk lock / state change request for OCFS2.
- */
-typedef struct _ocfs_publish
-{
-/*00*/	__u64 time;		/* Time of publish */
-	__s32 vote_UNUSED;
-	__u32 dirty;		/* Is the node in a clean state */
-/*10*/	__u32 vote_type_UNUSED;	/* Type required */
-	__u32 mounted;		/* Does the publisher have it mounted */
-/*18*/	__u32 vote_map_UNUSED[8];	/* Who needs to vote */
-/*38*/	__u64 reserved1;
-/*50*/	__u64 publ_seq_num_UNUSED;	/* Sequence for vote */
-	__u64 lock_id_UNUSED;		/* Lock vote is requested for */
-	/* last seq num used in comm voting */
-/*60*/	__u64 comm_seq_num;
-	__u32 num_ident;
-/*72*/
-} ocfs_publish;
-
-typedef struct _ocfs_vote
-{
-/*00*/	__u8 type_UNUSED;	/* Vote type */
-	__u8 node_UNUSED;	/* Node voting */
-	__u8 reserved1[30];	/* used to be vote[32] */
-/*20*/	__u64 vote_seq_num_UNUSED;	/* Vote sequence */
-	__u64 lock_id_UNUSED;	/* Lock being voted on */
-/*30*/	__u8 open_handle_UNUSED;/* Does the voter have it open */
-	__u8 ov_pad[7];
-/*38*/	
-} ocfs_vote;
-
-#endif  /* _OCFS2_DISK_DLM_H */

Modified: trunk/src/ocfs2_fs.h
===================================================================
--- trunk/src/ocfs2_fs.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/ocfs2_fs.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -96,6 +96,11 @@
 #define OCFS2_DLM_FL		(0x00000200)	/* DLM area */
 #define OCFS2_CHAIN_FL		(0x00000400)	/* Chain allocator */
 
+/*
+ * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
+ */
+#define OCFS2_JOURNAL_DIRTY_FL	(0x00000001)	/* Journal needs recovery */
+
 /* Limit of space in ocfs2_dir_entry */
 #define OCFS2_MAX_FILENAME_LENGTH       255
 
@@ -115,8 +120,9 @@
 enum {
 	BAD_BLOCK_SYSTEM_INODE = 0,
 	GLOBAL_INODE_ALLOC_SYSTEM_INODE,
+	SLOT_MAP_SYSTEM_INODE,
+#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
 	DLM_SYSTEM_INODE,
-#define OCFS2_FIRST_ONLINE_SYSTEM_INODE DLM_SYSTEM_INODE
 	GLOBAL_BITMAP_SYSTEM_INODE,
 	ORPHAN_DIR_SYSTEM_INODE,
 #define OCFS2_LAST_GLOBAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE
@@ -134,6 +140,7 @@
 	[GLOBAL_INODE_ALLOC_SYSTEM_INODE] 	"global_inode_alloc",
 
 	/* These are used by the running filesystem */
+	[SLOT_MAP_SYSTEM_INODE]			"slot_map",
 	[DLM_SYSTEM_INODE]			"dlm",
 	[GLOBAL_BITMAP_SYSTEM_INODE]		"global_bitmap",
 	[ORPHAN_DIR_SYSTEM_INODE]		"orphan_dir",
@@ -191,7 +198,6 @@
  * Convenience casts
  */
 #define OCFS2_RAW_SB(dinode)	(&((dinode)->id2.i_super))
-#define DISK_LOCK(dinode)	(&((dinode)->i_disk_lock))
 #define LOCAL_ALLOC(dinode)	(&((dinode)->id2.i_lab))
 
 /* TODO: change these?  */
@@ -200,8 +206,6 @@
 #define OCFS2_NODE_CONFIG_VER		2
 #define OCFS2_NODE_MIN_SUPPORTED_VER	2
 
-#define MAX_NODE_NAME_LENGTH	32
-
 #define OCFS2_GUID_HOSTID_LEN	20
 #define OCFS2_GUID_MACID_LEN	12
 #define OCFS2_GUID_LEN		(OCFS2_GUID_HOSTID_LEN + OCFS2_GUID_MACID_LEN)
@@ -280,17 +284,6 @@
 } ocfs2_extent_block;
 
 /*
- * On disk lock structure for OCFS2
- */
-typedef struct _ocfs2_disk_lock
-{
-/*00*/	__s16 dl_master;	/* Node number of current master */
-	__u8 dl_level;		/* Lock level */
-	__u8 dl_reserved1;
-/*04*/
-} ocfs2_disk_lock;
-
-/*
  * On disk superblock for OCFS2
  * Note that it is contained inside an ocfs2_dinode, so all offsets
  * are relative to the start of ocfs2_dinode.id2.
@@ -349,7 +342,7 @@
 					   belongs to */
 	__u16 i_suballoc_bit;		/* Bit offset in suballocater
 					   block group */
-/*10*/	ocfs2_disk_lock i_disk_lock;	/* Lock structure */
+	__u32 i_reserved0;
 /*14*/	__u32 i_clusters;		/* Cluster count */
 /*18*/	__u32 i_uid;			/* Owner UID */
 	__u32 i_gid;			/* Owning GID */
@@ -365,8 +358,8 @@
 	__u64 i_last_eb_blk;		/* Pointer to last extent
 					   block */
 /*60*/	__u32 i_fs_generation;		/* Generation per fs-instance */
-	__u32 i_reserved0;		/* Generation per fs-instance */
-/*68*/	__u64 i_reserved1[10];
+	__u32 i_reserved1;		/* Generation per fs-instance */
+/*68*/	__u64 i_reserved2[10];
 /*B8*/	union {
 		__u64 i_pad1;		/* Generic way to refer to this
 					   64bit union */
@@ -379,6 +372,11 @@
 			__u32 i_total;	/* Total bits (clusters)
 					   available */
 		} bitmap1;
+		struct {		/* Info for journal system
+					   inodes */
+			__u32 i_flags;	/* Mounted, version, etc.    */
+			__u32 i_j_pad;
+		} journal1;
 	} id1;				/* Inode type dependant 1 */
 /*C0*/	union {
 		ocfs2_super_block i_super;

Modified: trunk/src/ocfs_journal.h
===================================================================
--- trunk/src/ocfs_journal.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/ocfs_journal.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -30,7 +30,7 @@
 #include <linux/jbd.h>
 
 #define OCFS_JOURNAL_CURRENT_VERSION	1
-#define OCFS_CHECKPOINT_INTERVAL        8000
+#define OCFS_CHECKPOINT_INTERVAL        (8 * HZ)
 
 enum ocfs_journal_state {
 	OCFS_JOURNAL_FREE = 0,
@@ -59,7 +59,6 @@
 					       * which we usually run
 					       * from (recovery,
 					       * etc)                     */
-	__u32                     node_num;   /* Whose journal are we?    */
 	struct buffer_head        *lockbh;    /* Journal disk lock, used 
 						 to access file entry	  */
 	atomic_t                  num_trans;  /* Number of transactions 
@@ -67,58 +66,51 @@
 	unsigned long             trans_id;
 	/* locking order: trans_lock -> cmt_lock */
 	spinlock_t                cmt_lock;   /* protects the committed list */
-	atomic_t                  num_cmt_locks; /* number of delayed
-						  * locks */
-	atomic_t                  num_chkpt_locks;
 	struct rw_semaphore       trans_barrier;
-
-	struct list_head          committing_inodes;   /* list of all
-						        * inodes that
-						        * have committed
-						        * and are
-						        * awaiting a
-						        * checkpoint. Protected
-						        * by cmt_lock. */
-	struct list_head          checkpointing_locks; /* locks
-							* pending release
-							* after a checkpoint
-							* -- this variable
-							* is unlocked as
-							* commit_thread is
-							* the only guy who
-							* looks at it! */
 };
 
 extern spinlock_t trans_inc_lock;
 
 /* wrap trans_id so we never have it equal to zero. */
-static inline void ocfs_inc_trans_id(ocfs_journal *j)
+static inline unsigned long ocfs_inc_trans_id(ocfs_journal *j)
 {
+	unsigned long old_id;
 	spin_lock(&trans_inc_lock);
-	j->trans_id++;
+	old_id = j->trans_id++;
 	if (!j->trans_id)
 		j->trans_id = 1;
 	spin_unlock(&trans_inc_lock);
+	return old_id;
 }
 
-static inline int ocfs_trans_checkpointed(ocfs_journal *j, 
-					  unsigned long trans_id)
+static inline void ocfs_set_inode_lock_trans(ocfs_journal *journal,
+					     struct inode *inode)
 {
+	spin_lock(&trans_inc_lock);
+	OCFS_I(inode)->ip_last_trans = journal->trans_id;
+	spin_unlock(&trans_inc_lock);
+}
+
+/* Used to figure out whether it's safe to drop a metadata lock on an
+ * inode. Returns true if all the inodes changes have been
+ * checkpointed to disk. You should be holding the spinlock on the
+ * metadata lock while calling this to be sure that nobody can take
+ * the lock and put it on another transaction. */
+static inline int ocfs_inode_fully_checkpointed(struct inode *inode)
+{
 	int ret;
+	ocfs_journal *journal = OCFS2_SB(inode->i_sb)->journal;
+
 	spin_lock(&trans_inc_lock);
-	ret = time_after(trans_id, j->trans_id);
+	ret = time_after(journal->trans_id, OCFS_I(inode)->ip_last_trans);
 	spin_unlock(&trans_inc_lock);
 	return ret;
 }
 
-/* convenience function to check if an inode has been checkpointed
- * yet. Replaces ocfs_journal_new_file_search. Will do you a favor and
- * set created_trans = 0 when you've been checkpointed. 
- * returns '1' if the inode hasn't been checkpointed yet.
- *
- */
-static inline int ocfs_inode_is_new(ocfs_super *osb, 
-				    struct inode *inode)
+/* convenience function to check if an inode is still new (has never
+ * hit disk) Will do you a favor and set created_trans = 0 when you've
+ * been checkpointed.  returns '1' if the inode is still new. */
+static inline int ocfs_inode_is_new(struct inode *inode)
 {
 	int ret;
 
@@ -126,10 +118,10 @@
 	 * mkfs. This helps us early during mount, before we have the
 	 * journal open and trans_id could be junk. */
 	if (OCFS_I(inode)->ip_flags & OCFS_INODE_SYSTEM_FILE)
-		return(0);
+		return 0;
 	spin_lock(&trans_inc_lock);
-	ret = !(time_after(osb->journal->trans_id, 
-			 OCFS_I(inode)->ip_created_trans));
+	ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->trans_id, 
+			   OCFS_I(inode)->ip_created_trans));
 	if (!ret)
 		OCFS_I(inode)->ip_created_trans = 0;
 	spin_unlock(&trans_inc_lock);
@@ -146,20 +138,8 @@
 
 typedef struct _ocfs_journal_lock ocfs_journal_lock;
 struct _ocfs_journal_lock {
-	/* release_lock arguments. */
-	__u32 type;
-	__u32 flags;
-	struct inode *inode;
-	unsigned int num_ident;
-
-	/* used by commit_cache */
-	unsigned int drop_holders;
-	/* lock_list: we are either on 
-	 *  - handle->locks: if still running
-	 *  - inode->ip_pending_locks: if waiting for checkpoint
-	 *  - journal->checkpointing_locks: awaiting release after checkpoint
-	 */
-	struct list_head lock_list;
+	struct inode     *jl_inode;
+	struct list_head  jl_lock_list;
 };
 
 struct _ocfs_journal_handle {
@@ -198,7 +178,7 @@
 
 /*
  *  Journal Control:
- *  Initialize, Load, Shutdown, Wipe, Create a journal.
+ *  Initialize, Load, Shutdown, Wipe a journal.
  *  
  *  ocfs_journal_init     - Initialize journal structures in the OSB.
  *  ocfs_journal_load     - Load the given journal off disk. Replay it if
@@ -208,12 +188,21 @@
  *  ocfs_journal_wipe     - Wipe transactions from a journal. Optionally 
  *                          zero out each block.
  *  ocfs_recovery_thread  - Perform recovery on a node. osb is our own osb.
+ *  ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat
+ *                          event on.
+ *  ocfs_start_checkpoint - Kick the commit thread to do a checkpoint.
  */
-int    ocfs_journal_init(struct _ocfs_super *osb);
+int    ocfs_journal_init(struct _ocfs_super *osb, int *dirty);
 void   ocfs_journal_shutdown(struct _ocfs_super *osb);
 int    ocfs_journal_wipe(ocfs_journal *journal, int full);
 int    ocfs_journal_load(ocfs_journal *journal);
 void   ocfs_recovery_thread(struct _ocfs_super *osb, int node_num);
+int    ocfs2_mark_dead_nodes(ocfs_super *osb);
+static inline void ocfs_start_checkpoint(struct _ocfs_super *osb)
+{
+	atomic_set(&osb->needs_checkpoint, 1);
+	wake_up(&osb->checkpoint_event);
+}
 
 /*
  *  Transaction Handling:
@@ -292,10 +281,8 @@
  */
 int                  ocfs_journal_dirty(ocfs_journal_handle *handle, 
 					struct buffer_head *bh);
-void                 ocfs_handle_add_lock(ocfs_journal_handle *handle, 
-					  __u32 type,
-					  __u32 flags, 
-					  struct inode *inode);
+int                  ocfs_handle_add_lock(ocfs_journal_handle *handle,
+			                  struct inode *inode);
 /*
  * Use this to protect from other processes reading buffer state while
  * it's in flight.

Modified: trunk/src/ocfs_log.h
===================================================================
--- trunk/src/ocfs_log.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/ocfs_log.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -100,29 +100,29 @@
 #define OCFS_DEBUG_CONTEXT_ALLOC       0x00000001	/* alloc.c    */
 #define OCFS_DEBUG_CONTEXT_DIR         0x00000002	/* dir.c      */
 #define OCFS_DEBUG_CONTEXT_EXTMAP      0x00000004	/* extmap.c   */
-#define OCFS_DEBUG_CONTEXT_HEARTBEAT   0x00000008	/* hearbeat.c */
+#define OCFS_DEBUG_CONTEXT_SLOTMAP     0x00000008	/* slotmap.c  */
 #define OCFS_DEBUG_CONTEXT_IOCTL       0x00000010	/* ioctl.c    */
-#define OCFS_DEBUG_CONTEXT_NM          0x00000020	/* nm.c       */
+#define OCFS_DEBUG_CONTEXT_VOTE        0x00000020	/* vote.c     */
 #define OCFS_DEBUG_CONTEXT_PROC        0x00000040	/* proc.c     */
 #define OCFS_DEBUG_CONTEXT_SYMLINK     0x00000080	/* symlink.c  */
 #define OCFS_DEBUG_CONTEXT_BITMAP      0x00000100	/* bitmap.c   */
 #define OCFS_DEBUG_CONTEXT_FILE        0x00000200	/* file.c     */
 #define OCFS_DEBUG_CONTEXT_INODE       0x00000400	/* inode.c    */
 #define OCFS_DEBUG_CONTEXT_JOURNAL     0x00000800	/* journal.c  */
-#define OCFS_DEBUG_CONTEXT_CHAINALLOC  0x00001000	/*            */
-#define OCFS_DEBUG_CONTEXT_LOCALALLOC  0x00002000	/*            */
+#define OCFS_DEBUG_CONTEXT_CHAINALLOC  0x00001000	/* chainalloc */
+#define OCFS_DEBUG_CONTEXT_LOCALALLOC  0x00002000	/* localalloc */
 #define OCFS_DEBUG_CONTEXT_SYSFILE     0x00004000	/* sysfile.c  */
 #define OCFS_DEBUG_CONTEXT_VOLCFG      0x00008000	/* volcfg.c   */
 #define OCFS_DEBUG_CONTEXT_DCACHE      0x00010000	/* dcache.c   */
-#define OCFS_DEBUG_CONTEXT_DLM         0x00020000	/* dlm.c      */
+#define OCFS_DEBUG_CONTEXT_DLMGLUE     0x00020000	/* dlmglue.c  */
 #define OCFS_DEBUG_CONTEXT_HASH        0x00040000	/* hash.c     */
 #define OCFS_DEBUG_CONTEXT_IO          0x00080000	/* io.c       */
 #define OCFS_DEBUG_CONTEXT_NAMEI       0x00100000	/* namei.c    */
 #define OCFS_DEBUG_CONTEXT_OSB         0x00200000	/* osb.c      */
 #define OCFS_DEBUG_CONTEXT_SUPER       0x00400000	/* super.c    */
 #define OCFS_DEBUG_CONTEXT_UTIL        0x00800000	/* util.c     */
-#define OCFS_DEBUG_CONTEXT_VOTE        0x01000000	/* vote.c     */
-#define OCFS_DEBUG_CONTEXT_LOCKRES     0x02000000	/* lockres.c  */
+#define OCFS_DEBUG_CONTEXT_UNUSED3     0x01000000	/*            */
+#define OCFS_DEBUG_CONTEXT_UNUSED4     0x02000000	/*            */
 
 
 #ifdef OCFS_DBG_TIMING

Modified: trunk/src/proc.c
===================================================================
--- trunk/src/proc.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/proc.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -38,8 +38,8 @@
 #include "ocfs2.h"
 
 #include "proc.h"
-#include "vote.h"
 #include "alloc.h"
+#include "heartbeat.h"
 #include "ocfs_journal.h"
 
 
@@ -48,18 +48,16 @@
 
 #define OCFS2_PROC_BASENAME	"fs/ocfs2"
 
-static int ocfs_proc_globalctxt(char *page, char **start, off_t off, int count, int *eof, void *data);
 static int ocfs_proc_dlm_stats(char *page, char **start, off_t off, int count, int *eof, void *data);
 static int ocfs_proc_version (char *page, char **start, off_t off, int count, int *eof, void *data);
 static int ocfs_proc_nodenum (char *page, char **start, off_t off, int count, int *eof, void *data);
+static int ocfs_proc_slotnum (char *page, char **start, off_t off, int count, int *eof, void *data);
 static int ocfs_proc_nodename (char *page, char **start, off_t off, int count, int *eof, void *data);
 static int ocfs_proc_mountpoint (char *page, char **start, off_t off, int count, int *eof, void *data);
 static int ocfs_proc_statistics (char *page, char **start, off_t off, int count, int *eof, void *data);
 static int ocfs_proc_device (char *page, char **start, off_t off, int count, int *eof, void *data);
 static int ocfs_proc_nodes (char *page, char **start, off_t off, int count, int *eof, void *data);
-static int ocfs_proc_net_vote_obj (char *page, char **start, off_t off, int count, int *eof, void *data);
 static int ocfs_proc_alloc_stat(char *page, char **start, off_t off, int count, int *eof, void *data);
-static int ocfs_proc_guid (char *page, char **start, off_t off, int count, int *eof, void *data);
 static int ocfs_proc_label (char *page, char **start, off_t off, int count, int *eof, void *data);
 
 typedef struct _ocfs_proc_list
@@ -72,20 +70,18 @@
 ocfs_proc_list top_dir[] = {
 	{ "version", NULL, ocfs_proc_version },
 	{ "nodename", NULL, ocfs_proc_nodename },
-	{ "globalctxt", NULL, ocfs_proc_globalctxt },
 	{ "lockstat", NULL, ocfs_proc_dlm_stats },
 	{ NULL } };
 
 ocfs_proc_list sub_dir[] = {
 	{ "nodenum", NULL, ocfs_proc_nodenum },
 	{ "mountpoint", NULL, ocfs_proc_mountpoint },
+	{ "slotnum", NULL, ocfs_proc_slotnum },
 	{ "statistics", NULL, ocfs_proc_statistics },
 	{ "lockstat", NULL, ocfs_proc_dlm_stats },
 	{ "device", NULL, ocfs_proc_device },
 	{ "nodes", NULL, ocfs_proc_nodes },
-	{ "sent-votes", NULL, ocfs_proc_net_vote_obj },
 	{ "allocstat", NULL, ocfs_proc_alloc_stat },
-	{ "guid", NULL, ocfs_proc_guid },
 	{ "label", NULL, ocfs_proc_label },
 	{ NULL } };
 
@@ -159,35 +155,7 @@
 	return len;
 }				/* ocfs_proc_calc_metrics */
 
-
 /*
- * ocfs_proc_globalctxt()
- *
- */
-static int ocfs_proc_globalctxt(char *page, char **start, off_t off,
-				int count, int *eof, void *data)
-{
-	int len = 0;
-	int ret;
-
-	LOG_ENTRY ();
-
-	len += sprintf(page + len, "ip addr/port : 0x%08u/%u\n",
-		       ntohl(OcfsGlobalCtxt.comm_info.addr_u.ip_addr4),
-		       ntohs(OcfsGlobalCtxt.comm_info.ip_port));
-	len += sprintf(page + len, "guid         : ");
-	strncat(page + len, OcfsGlobalCtxt.guid.guid, OCFS2_GUID_LEN);
-	len += OCFS2_GUID_LEN;
-	strncat(page + len, "\n", 1);
-	len++;
-
-	ret = ocfs_proc_calc_metrics(page, start, off, count, eof, len);
-
-	LOG_EXIT_INT (ret);
-	return ret;
-}				/* ocfs_proc_version */
-
-/*
  * ocfs_proc_dlm_stats()
  *
  */
@@ -331,6 +299,29 @@
 }				/* ocfs_proc_nodenum */
 
 /*
+ * ocfs_proc_slotnum()
+ *
+ */
+static int ocfs_proc_slotnum (char *page, char **start, off_t off,
+			      int count, int *eof, void *data)
+{
+	int len;
+	int ret;
+	ocfs_super *osb;
+
+	LOG_ENTRY ();
+
+	osb = data;
+	sprintf (page, "%d\n", osb->slot_num);
+	len = strlen (page);
+
+	ret = ocfs_proc_calc_metrics (page, start, off, count, eof, len);
+
+	LOG_EXIT_INT (ret);
+	return ret;
+}				/* ocfs_proc_slotnum */
+
+/*
  * ocfs_proc_nodename()
  *
  */
@@ -431,50 +422,25 @@
 				 int count, int *eof, void *data)
 {
 	int len;
-	char *pubmap = NULL;
 	ocfs_super *osb;
-	int ret = 0, i;
-	char *ptr;
+	int ret = 0;
 
 	LOG_ENTRY ();
 
 	osb = data;
 
-	pubmap = ocfs_malloc (100);
-	if (!pubmap) {
-		LOG_ERROR_STATUS (-ENOMEM);
-		goto bail;
-	}
-
-	ptr = pubmap;
-	for (i = 0; i < osb->max_nodes; i++) {
-		if (ocfs_node_map_test_bit(&osb->publ_map, i))
-			ptr += sprintf (ptr, "%d ", i);
-	}
-	if (pubmap != ptr)
-		*(ptr - 1) = '\0';
-
 #define PROC_STATS                             \
-  "Publish map              : %s\n"		\
   "Number of nodes          : %u\n"		\
   "Cluster size             : %d\n"		\
   "Volume size              : %llu\n"		\
-  "Open Transactions:       : %u\n"		\
-  "Delayed Locks            : %u\n"		\
-  "Checkpointing Locks      : %u\n"
+  "Open Transactions:       : %u\n"		
 
-	len = sprintf (page, PROC_STATS, pubmap,
-		       osb->num_nodes, osb->s_clustersize, 
+	len = sprintf (page, PROC_STATS, osb->num_nodes, osb->s_clustersize, 
 		       ocfs2_clusters_to_bytes(osb->sb, osb->num_clusters),
-		       atomic_read(&osb->journal->num_trans),
-		       atomic_read(&osb->journal->num_cmt_locks),
-		       atomic_read(&osb->journal->num_chkpt_locks));
+		       atomic_read(&osb->journal->num_trans));
 
 	ret = ocfs_proc_calc_metrics (page, start, off, count, eof, len);
 
-bail:
-	if (pubmap)
-		kfree(pubmap);
 	LOG_EXIT_INT (ret);
 	return ret;
 }				/* ocfs_proc_statistics */
@@ -512,7 +478,6 @@
 	int i;
 	int ret;
 	ocfs_super *osb;
-	BARF_BARF_BARF *node;
 	char mount;
 
 	LOG_ENTRY ();
@@ -520,23 +485,10 @@
 	osb = data;
 
 	if (osb) {
-		down (&(osb->cfg_lock));
 		for (i = 0; i < osb->max_nodes; i++) {
-			node = osb->node_cfg_info[i];
-			if (!node)
-				continue;
-			mount = ocfs_node_map_test_bit(&osb->publ_map, i) ? 'M' : ' ';
-			len += sprintf (page + len,
-				       	"%2d %c %-32s 0x%08u %-6u ",
-				       	i, mount, node->node_name,
-		 			ntohl(node->ipc_config.addr_u.ip_addr4),
-				       	ntohs(node->ipc_config.ip_port));
-			strncat(page + len, node->guid.guid,
-		       		OCFS2_GUID_LEN);
-			len += OCFS2_GUID_LEN;
-			len += sprintf (page + len, "\n");
+			mount = ocfs_node_map_test_bit(osb, &osb->mounted_map, i) ? 'M' : ' ';
+			len += sprintf(page + len, "%2d %c\n", i, mount);
 		}
-		up (&(osb->cfg_lock));
 	}
 
 	ret = ocfs_proc_calc_metrics (page, start, off, count, eof, len);
@@ -546,63 +498,6 @@
 }				/* ocfs_proc_nodes */
 
 /*
- * ocfs_proc_net_votes()
- *
- */
-static int ocfs_proc_net_vote_obj (char *page, char **start, off_t off,
-			    int count, int *eof, void *data)
-{
-	int len = 0, ret;
-	ocfs_super *osb;
-	ocfs_vote_obj_lookup_data d;  // 24 bytes
-
-	LOG_ENTRY ();
-
-	osb = data;
-
-	d.func = ocfs_lookup_obj_for_proc;
-	d.ret = NULL;
-	d.u.proc.page = page;
-	d.u.proc.len = &len;
-	d.u.proc.max = 4096;
-	ret = ocfs_lookup_vote_request_obj (osb, &d);
-	ret = ocfs_proc_calc_metrics (page, start, off, count, eof, len);
-
-	LOG_EXIT_INT (ret);
-	return ret;
-}				/* ocfs_proc_net_vote_obj */
-
-/*
- * ocfs_proc_guid()
- *
- */
-static int ocfs_proc_guid (char *page, char **start, off_t off,
-			   int count, int *eof, void *data)
-{
-	int len;
-	int ret;
-	ocfs_super *osb;
-	char *p;
-	int i;
-
-	LOG_ENTRY ();
-
-	osb = (ocfs_super *) data;
-
-	for (i = 0, p = page; i < MAX_VOL_ID_LENGTH; i++, p += 2)
-		sprintf(p, "%02X", osb->uuid[i]);
-	*p = '\n'; ++p; *p = '\0';
-
-	len = strlen (page);
-
-	ret = ocfs_proc_calc_metrics (page, start, off, count, eof, len);
-
-	LOG_EXIT_INT (ret);
-	return ret;
-}				/* ocfs_proc_guid */
-
-
-/*
  * ocfs_proc_label()
  *
  */

Added: trunk/src/slot_map.c
===================================================================
--- trunk/src/slot_map.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/slot_map.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,288 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * slot_map.c
+ *
+ *
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ocfs_compat.h"
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/smp_lock.h>
+
+#include "ocfs_log.h"
+#include "ocfs.h"
+#include "ocfs2.h"
+
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "slot_map.h"
+#include "sysfile.h"
+
+#include "buffer_head_io.h"
+
+#define OCFS_DEBUG_CONTEXT  OCFS_DEBUG_CONTEXT_SLOTMAP
+
+static s16 __ocfs2_node_num_to_slot(ocfs2_slot_info *si,
+				    s16 global);
+static void __ocfs2_fill_slot(ocfs2_slot_info *si,
+			      s16 slot_num,
+			      s16 node_num);
+
+/* Use the slot information we've collected to create a map of mounted
+ * nodes. Should be holding an EX on super block. assumes slot info is
+ * up to date. Note that we call this *after* we find a slot, so our
+ * own node should be set in the map too... */
+void ocfs2_populate_mounted_map(ocfs_super *osb)
+{
+	int i;
+	ocfs2_slot_info *si = osb->slot_info;
+
+	spin_lock(&si->si_lock);
+
+	/* every slot holding a valid node number corresponds to a
+	 * mounted node; record each in osb->mounted_map */
+	for (i = 0; i < si->si_size; i++)
+		if (si->si_global_node_nums[i] != OCFS_INVALID_NODE_NUM)
+			ocfs_node_map_set_bit(osb, &osb->mounted_map,
+					      si->si_global_node_nums[i]);
+
+	spin_unlock(&si->si_lock);
+}
+
+/* post the slot information on disk into our slot_info struct. */
+void ocfs2_update_slot_info(ocfs2_slot_info *si)
+{
+	int i;
+	s16 *disk_info;
+
+	/* we don't read the slot block here as ocfs2_super_lock
+	 * should've made sure we have the most recent copy. */
+	spin_lock(&si->si_lock);
+	disk_info = (s16 *) si->si_bh->b_data;
+
+	/* on-disk slot entries are little-endian s16s; convert each
+	 * to host order as we copy into the in-memory map */
+	for (i = 0; i < si->si_size; i++)
+		si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
+
+	spin_unlock(&si->si_lock);
+}
+
+/* Copy our in-memory slot info into its destination bh and write it
+ * out to disk. */
+int ocfs2_update_disk_slots(ocfs_super *osb,
+			    ocfs2_slot_info *si)
+{
+	int status, i;
+	s16 *disk_info = (s16 *) si->si_bh->b_data;
+
+	/* copy the in-memory map into the bh under the lock;
+	 * conversion back to little-endian mirrors
+	 * ocfs2_update_slot_info() */
+	spin_lock(&si->si_lock);
+	for (i = 0; i < si->si_size; i++)
+		disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
+	spin_unlock(&si->si_lock);
+
+	/* the actual block write happens outside the spinlock */
+	status = ocfs_write_block(osb, si->si_bh, si->si_inode);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+
+	return status;
+}
+
+/* try to find global node in the slot info. Returns
+ * OCFS_INVALID_NODE_NUM if nothing is found. */
+/* Caller must hold si->si_lock; ocfs2_node_num_to_slot() is the
+ * locked wrapper. Also used with global == OCFS_INVALID_NODE_NUM to
+ * find the first empty slot. */
+static s16 __ocfs2_node_num_to_slot(ocfs2_slot_info *si,
+				    s16 global)
+{
+	int i;
+	s16 ret = OCFS_INVALID_NODE_NUM;
+
+	for(i = 0; i < si->si_num_slots; i++) {
+		if (global == si->si_global_node_nums[i]) {
+			ret = (s16) i;
+			break;
+		}
+	}
+	return ret;
+}
+
+/* Locked wrapper: map a global node number to its slot, or
+ * OCFS_INVALID_NODE_NUM if the node holds no slot. */
+s16 ocfs2_node_num_to_slot(ocfs2_slot_info *si,
+			   s16 global)
+{
+	s16 ret;
+
+	spin_lock(&si->si_lock);
+	ret = __ocfs2_node_num_to_slot(si, global);
+	spin_unlock(&si->si_lock);
+	return ret;
+}
+
+/* Assign node_num to slot_num in the in-memory map only; callers are
+ * responsible for holding si->si_lock and for writing the map to disk
+ * afterwards. node_num == OCFS_INVALID_NODE_NUM empties the slot. */
+static void __ocfs2_fill_slot(ocfs2_slot_info *si,
+			      s16 slot_num,
+			      s16 node_num)
+{
+	OCFS_ASSERT(slot_num != OCFS_INVALID_NODE_NUM);
+	OCFS_ASSERT(slot_num < si->si_num_slots);
+	OCFS_ASSERT((node_num == OCFS_INVALID_NODE_NUM) || 
+		    (node_num < OCFS2_MAX_NODES));
+
+	si->si_global_node_nums[slot_num] = node_num;
+}
+
+/* Empty slot_num in the in-memory map. Note this does not write the
+ * change to disk -- see ocfs2_update_disk_slots(). */
+void ocfs2_clear_slot(ocfs2_slot_info *si,
+		      s16 slot_num)
+{
+	spin_lock(&si->si_lock);
+	__ocfs2_fill_slot(si, slot_num, OCFS_INVALID_NODE_NUM);
+	spin_unlock(&si->si_lock);
+}
+
+/* Allocate and initialize the in-memory slot map: grab the slot_map
+ * system inode, read its first block into si->si_bh and hang the
+ * result off osb->slot_info. Returns 0 on success, negative error
+ * otherwise; on failure everything acquired here (si, the inode
+ * reference, the buffer_head) is released before returning. */
+int ocfs2_init_slot_info(ocfs_super *osb)
+{
+	int status, i;
+	u64 blkno;
+	struct inode *inode = NULL;
+	struct buffer_head *bh = NULL;
+	ocfs2_slot_info *si;
+
+	si = kmalloc(sizeof(ocfs2_slot_info), GFP_KERNEL);
+	if (!si) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	memset(si, 0, sizeof(ocfs2_slot_info));
+	spin_lock_init(&si->si_lock);
+	si->si_num_slots = osb->max_nodes;
+	si->si_size = OCFS2_MAX_NODES;
+
+	for(i = 0; i < si->si_num_slots; i++)
+		si->si_global_node_nums[i] = OCFS_INVALID_NODE_NUM;
+
+	inode = ocfs_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, -1);
+	if (!inode) {
+		LOG_ERROR_STATUS(status = -EINVAL);
+		goto bail;
+	}
+
+	status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = ocfs_read_block(osb, blkno, &bh, 0, inode);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	/* ownership of inode and bh transfers to si here; from now on
+	 * ocfs2_free_slot_info() releases them. */
+	si->si_inode = inode;
+	si->si_bh = bh;
+	osb->slot_info = si;
+bail:
+	if (status < 0) {
+		/* si may not own inode/bh yet (transfer happens only
+		 * on full success), so drop them explicitly to avoid
+		 * leaking the inode reference and buffer_head. No
+		 * double free: si->si_inode/si_bh are still NULL on
+		 * every error path. */
+		if (bh)
+			brelse(bh);
+		if (inode)
+			iput(inode);
+		if (si)
+			ocfs2_free_slot_info(si);
+	}
+
+	return status;
+}
+
+/* Release everything a slot_info owns: the system file inode
+ * reference, the slot block's buffer_head, and the structure itself.
+ * The inode/bh may legitimately be NULL if init failed part way. */
+void ocfs2_free_slot_info(ocfs2_slot_info *si)
+{
+	if (si->si_inode)
+		iput(si->si_inode);
+	if (si->si_bh)
+		brelse(si->si_bh);
+	kfree(si);
+}
+
+/* Claim a slot for this node: reuse an existing slot that already
+ * maps our node number, otherwise take the first free one, then write
+ * the updated map to disk. Returns 0 on success, -EINVAL when the
+ * volume has no free slots. */
+int ocfs2_find_slot(ocfs_super *osb)
+{
+	int status;
+	s16 slot;
+	ocfs2_slot_info *si;
+
+	LOG_ENTRY();
+
+	si = osb->slot_info;
+
+	/* refresh the in-memory map from the slot block first */
+	ocfs2_update_slot_info(si);
+
+	spin_lock(&si->si_lock);
+	/* search for ourselves first and take the slot if it already
+	 * exists. Perhaps we need to mark this in a variable for our
+	 * own journal recovery? Possibly not, though we certainly
+	 * need to warn the user */
+	slot = __ocfs2_node_num_to_slot(si, osb->node_num);
+	if (slot == OCFS_INVALID_NODE_NUM) {
+		/* if no slot yet, then just take 1st available
+		 * one. */
+		slot = __ocfs2_node_num_to_slot(si, OCFS_INVALID_NODE_NUM);
+		if (slot == OCFS_INVALID_NODE_NUM) {
+			spin_unlock(&si->si_lock);
+			printk("ocfs2: no free slots available!\n");
+			status = -EINVAL;
+			goto bail;
+		}
+	} else
+		printk("ocfs2: slot %d is already allocated to this node!\n",
+		       slot);
+
+	__ocfs2_fill_slot(si, slot, osb->node_num);
+	osb->slot_num = slot;
+	spin_unlock(&si->si_lock);
+
+	printk("ocfs2: taking node slot %d\n", osb->slot_num);
+
+	/* persist the claimed slot so other nodes can see it */
+	status = ocfs2_update_disk_slots(osb, si);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+
+bail:
+	LOG_EXIT_STATUS(status);
+	return status;
+}
+
+/* Give up this node's slot: mark it invalid in the in-memory map,
+ * write the map back to disk, then tear down osb->slot_info. Called
+ * during unmount; a failed disk update is logged but teardown still
+ * proceeds. */
+void ocfs2_put_slot(ocfs_super *osb)
+{
+	int status;
+	ocfs2_slot_info *si = osb->slot_info;
+
+	if (!si)
+		return;
+
+	ocfs2_update_slot_info(si);
+
+	spin_lock(&si->si_lock);
+	__ocfs2_fill_slot(si, osb->slot_num, OCFS_INVALID_NODE_NUM);
+	osb->slot_num = OCFS_INVALID_NODE_NUM;
+	spin_unlock(&si->si_lock);
+
+	status = ocfs2_update_disk_slots(osb, si);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+
+	osb->slot_info = NULL;
+	/* free via ocfs2_free_slot_info() rather than a bare kfree()
+	 * so the inode reference and buffer_head taken at init time
+	 * are released too (a bare kfree leaks both). */
+	ocfs2_free_slot_info(si);
+}
+

Added: trunk/src/slot_map.h
===================================================================
--- trunk/src/slot_map.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/slot_map.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,57 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * slot_map.h
+ *
+ * In-memory slot map structure and function prototypes for the ocfs2
+ * node slot map.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+
+#ifndef SLOTMAP_H
+#define SLOTMAP_H
+
+/* In-memory copy of the on-disk slot map: which global node number
+ * occupies each slot (OCFS_INVALID_NODE_NUM when empty). */
+typedef struct _ocfs2_slot_info {
+	spinlock_t si_lock;		/* protects si_global_node_nums */
+
+       	struct inode *si_inode;		/* slot_map system file inode */
+	struct buffer_head *si_bh;	/* first block of the slot map */
+	unsigned int si_num_slots;	/* slots in use (osb->max_nodes) */
+	unsigned int si_size;		/* allocated entries (OCFS2_MAX_NODES) */
+	s16 si_global_node_nums[OCFS2_MAX_NODES];
+} ocfs2_slot_info;
+
+/* allocate/teardown the map for a mount */
+int ocfs2_init_slot_info(ocfs_super *osb);
+void ocfs2_free_slot_info(ocfs2_slot_info *si);
+
+/* claim / release this node's slot */
+int ocfs2_find_slot(ocfs_super *osb);
+void ocfs2_put_slot(ocfs_super *osb);
+
+/* sync the in-memory map from / to the slot block */
+void ocfs2_update_slot_info(ocfs2_slot_info *si);
+int ocfs2_update_disk_slots(ocfs_super *osb,
+			    ocfs2_slot_info *si);
+
+s16 ocfs2_node_num_to_slot(ocfs2_slot_info *si,
+			   s16 global);
+void ocfs2_clear_slot(ocfs2_slot_info *si,
+		      s16 slot_num);
+
+void ocfs2_populate_mounted_map(ocfs_super *osb);
+
+#endif

Modified: trunk/src/suballoc.c
===================================================================
--- trunk/src/suballoc.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/suballoc.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -36,7 +36,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
-#include "dlm.h"
+#include "dlmglue.h"
 #include "localalloc.h"
 #include "util.h"
 #include "suballoc.h"
@@ -425,15 +425,12 @@
 	OCFS_ASSERT(!(handle->flags & OCFS_HANDLE_STARTED));
 
 	ocfs_handle_add_inode(handle, alloc_inode);
-	status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 
-				   0, &bh, alloc_inode);
+	status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1);
 	if (status < 0) {
 		if (status != -EINTR)
 			LOG_ERROR_STATUS (status);
 		goto bail;
 	}
-	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
-			     0, alloc_inode);
 
 	fe = (ocfs2_dinode *) bh->b_data;
 	OCFS_ASSERT_RO(IS_VALID_FILE_ENTRY(fe));
@@ -494,7 +491,7 @@
 #ifndef OCFS_USE_ALL_METADATA_SUBALLOCATORS
 	alloc_inode = ocfs_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
 #else
-	alloc_inode = ocfs_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, osb->node_num);
+	alloc_inode = ocfs_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, osb->slot_num);
 #endif
 	if (!alloc_inode) {
 		status = -ENOMEM;
@@ -543,7 +540,7 @@
 	(*ac)->ac_handle = handle;
 	(*ac)->ac_which = OCFS_AC_USE_INODE;
 
-	alloc_inode = ocfs_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, osb->node_num);
+	alloc_inode = ocfs_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, osb->slot_num);
 	if (!alloc_inode) {
 		status = -ENOMEM;
 		LOG_ERROR_STATUS(status);

Modified: trunk/src/super.c
===================================================================
--- trunk/src/super.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/super.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -42,6 +42,10 @@
 #include <linux/socket.h>
 #include <linux/inet.h>
 
+#include <cluster/util.h>
+#include <cluster/dlmcommon.h>
+#include <cluster/nodemanager.h>
+
 #include "ocfs_log.h"
 #include "ocfs.h"
 #include "ocfs2.h"
@@ -50,18 +54,18 @@
 #include "ocfs1_fs_compat.h"
 
 #include "alloc.h"
+#include "dlmglue.h"
 #include "extent_map.h"
 #include "heartbeat.h"
 #include "inode.h"
 #include "journal.h"
 #include "localalloc.h"
-#include "nm.h"
 #include "proc.h"
+#include "slot_map.h"
 #include "super.h"
 #include "sysfile.h"
 #include "util.h"
 #include "ver.h"
-#include "volcfg.h"
 #include "vote.h"
 
 #include "ocfs_journal.h"
@@ -75,19 +79,10 @@
 ocfs_global_ctxt OcfsGlobalCtxt;
 spinlock_t osb_id_lock;
 __u32 osb_id;             /* Keeps track of next available OSB Id */
-spinlock_t mount_cnt_lock;
-__u32 mount_cnt;          /* Number of volumes currently mounted */
 
-char *node_name = NULL;
-__s32 node_number = OCFS_INVALID_NODE_NUM;
 __u32 debug_context = 0;
 __u32 debug_level = 0;
 __u32 debug_exclude = 0;
-char *ip_address = NULL;
-__u32 ip_port_v2 = 0;
-char *guid = NULL;
-__u32 cs = 0;
-char *ocfs_hostname;
 
 #ifdef EVIL_TRACE
 __u64 debug_mask = 0;
@@ -128,45 +123,26 @@
 //MODULE_DESCRIPTION("Oracle Clustered FileSystem");
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
-module_param (node_name, charp, 0);
-module_param (node_number, int, 0);
 module_param (debug_context, uint, 0);
 module_param (debug_level, uint, 0);
 module_param (debug_exclude, uint, 0);
-module_param (ip_address, charp, 0);
-module_param (ip_port_v2, uint, 0);
-module_param (guid, charp, 0);
-module_param (cs, uint, 0);
 #else /* 2.6.x kernel */
-MODULE_PARM (node_name, "s");
-MODULE_PARM_DESC(node_name, "Name of this machine in the cluster");
-MODULE_PARM (node_number, "i");
-MODULE_PARM_DESC(node_number, "Slot number for this machine within volume");
 MODULE_PARM (debug_context, "i");
 MODULE_PARM_DESC(debug_context, "Debug context");
 MODULE_PARM (debug_level, "i");
 MODULE_PARM_DESC(debug_level, "Debug level");
 MODULE_PARM (debug_exclude, "i");
 MODULE_PARM_DESC(debug_exclude, "Process ID to exclude from tracing");
-MODULE_PARM (ip_address, "s");
-MODULE_PARM_DESC(ip_address, "IP address for the network dlm on this node");
-MODULE_PARM (ip_port_v2, "i");
-MODULE_PARM_DESC(ip_port_v2, "Port number for the network dlm on this node");
-MODULE_PARM (guid, "s");
-MODULE_PARM_DESC(guid, "GUID for this machine");
-MODULE_PARM (cs, "i");
-MODULE_PARM_DESC(cs, "Checksum");
 #endif /* Linux 2.4 stuff */
 
-extern struct semaphore recovery_list_sem;
 
-static int ocfs_parse_options (char *options, __u32 * uid, __u32 * gid, int * reclaim_id);
+static int ocfs_parse_options (char *options, __u32 * uid, __u32 * gid, int * reclaim_id, char **group_name);
 static int __init ocfs_driver_entry (void);
 static void __exit ocfs_driver_exit (void);
 static void ocfs_put_super (struct super_block *sb);
-static int ocfs_mount_volume (struct super_block *sb, int reclaim_id, struct inode *root);
-static int ocfs_dismount_volume (struct super_block *sb);
-static int ocfs_read_params(void);
+static int ocfs_mount_volume (struct super_block *sb, int reclaim_id,
+			      char **group_name, struct inode *root);
+static void ocfs_dismount_volume(struct super_block *sb);
 static int ocfs_initialize_mem_lists (void);
 static void ocfs_free_mem_lists (void);
 static void ocfs_delete_osb (ocfs_super * osb);
@@ -182,10 +158,8 @@
 static int ocfs_init_global_system_inodes(ocfs_super *osb);
 static int ocfs_init_local_system_inodes(ocfs_super *osb);
 static int ocfs_release_system_inodes(ocfs_super *osb);
-static int ocfs_publish_set_unmounted(ocfs_super *osb, int node_num);
-static int ocfs_publish_set_mounted(ocfs_super *osb, int node_num);
-static int ocfs_publish_toggle_mounted(ocfs_super *osb, int node_num,
-				       int value);
+static int ocfs2_fill_local_node_info(ocfs_super *osb, char **group_name);
+static int ocfs2_complete_mount_recovery(ocfs_super *osb);
 static int ocfs_check_volume(ocfs_super * osb);
 static int ocfs_verify_volume(ocfs2_dinode *di, struct buffer_head *bh,
 			      __u32 sectsize);
@@ -269,7 +243,7 @@
 
 	for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
 	     i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
-		new = ocfs_get_system_file_inode(osb, i, osb->node_num);
+		new = ocfs_get_system_file_inode(osb, i, osb->slot_num);
 		if (!new) {
 			ocfs_release_system_inodes(osb);
 			LOG_ERROR_STATUS(status = -EINVAL);
@@ -298,7 +272,7 @@
 	LOG_ENTRY();
 
 	for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; i < NUM_SYSTEM_INODES ; i++) {
-		new = ocfs_get_system_file_inode(osb, i, osb->node_num);
+		new = ocfs_get_system_file_inode(osb, i, osb->slot_num);
 		if (!new) {
 			ocfs_release_system_inodes(osb);
 			LOG_ERROR_STATUS(status = -EINVAL);
@@ -349,11 +323,12 @@
 	__u32 uid = current->fsuid;
 	__u32 gid = current->fsgid;
 	int reclaim_id;
+	char *group_name = NULL;
 	ocfs_super *osb = NULL;
 
 	LOG_ENTRY_ARGS ("%p, %p, %i", sb, data, silent);
 
-	if (ocfs_parse_options (data, &uid, &gid, &reclaim_id) != 0) {
+	if (ocfs_parse_options (data, &uid, &gid, &reclaim_id, &group_name) != 0) {
 		status = -EINVAL;
 		LOG_ERROR_STR ("ocfs_read_super: bad mount option");
 		goto read_super_error;
@@ -366,7 +341,7 @@
 	/* this is needed to support O_LARGE_FILE */
 	sb->s_maxbytes = OCFS_LINUX_MAX_FILE_SIZE;
 
-	status = ocfs_mount_volume (sb, reclaim_id, NULL);
+	status = ocfs_mount_volume (sb, reclaim_id, &group_name, NULL);
 	if (status < 0)
 		goto read_super_error;
 
@@ -394,11 +369,23 @@
 
 	sb->s_root = root;
 
-	printk ("ocfs2: Mounting device (%u,%u) on %s (node %d)\n",
+	printk ("ocfs2: Mounting device (%u,%u) on %s (node %d, slot %d)\n",
 		MAJOR(sb->s_dev), MINOR(sb->s_dev),
-		osb->node_cfg_info[osb->node_num]->node_name, osb->node_num);
+		OcfsGlobalCtxt.node_name, osb->node_num, osb->slot_num);
 
 	atomic_set(&osb->vol_state, VOLUME_MOUNTED);
+
+	if (osb->dirty) {
+		/* This must happen *after* setting the volume to
+		 * MOUNTED as we may sleep on any recovery threads. */
+		status = ocfs2_complete_mount_recovery(osb);
+		if (status < 0)
+			LOG_EXIT_STATUS(status);
+	}
+
+	if (group_name)
+		kfree(group_name);
+
 	LOG_EXIT_STATUS(status);
 	return status;		
 
@@ -411,6 +398,9 @@
 	if (inode)
 		iput (inode);
 
+	if (group_name)
+		kfree(group_name);
+
 	LOG_EXIT_STATUS(status);
 	return status;
 }
@@ -447,11 +437,12 @@
  *
  * e.g., gid=9999,uid=9999,[no]cache,reclaimid
  */
-static int ocfs_parse_options (char *options, __u32 * uid, __u32 * gid, int * reclaim_id)
+static int ocfs_parse_options (char *options, __u32 * uid, __u32 * gid, int * reclaim_id, char **group_name)
 {
 	char *c;
 	char *value;
 	int ret = 1;
+	int size;
 
 	LOG_ENTRY ();
 	
@@ -492,6 +483,24 @@
 			}
 		} else if (!strcmp (c, "reclaimid")) {
 			*reclaim_id = 1;
+		} else if (!strcmp(c, "group")) {
+			if (!value || !*value) {
+				LOG_ERROR_STR
+					("group option requires an argument");
+				goto bail;
+			}
+			LOG_TRACE_ARGS("group name passed = %s\n", value);
+
+			size = strlen(value) + 1;
+			*group_name = kmalloc(size, GFP_KERNEL);
+			if (!(*group_name)) {
+				LOG_ERROR_STATUS(-ENOMEM);
+				goto bail;
+			}
+			memset(*group_name, 0, size);
+			printk("ocfs2: group name passed = %s, size = %d\n",
+			       value, size);
+			strcpy(*group_name, value);
 		} else {
 			LOG_ERROR_ARGS ("Invalid mount option: %s", c);
 			goto bail;
@@ -519,16 +528,7 @@
 
 	if (init_ocfs2_extent_maps())
 		return -ENOMEM;
-	
-	ocfs_hostname = kmalloc(strlen(system_utsname.nodename) + 1, GFP_KERNEL);
-	if (ocfs_hostname == NULL) {
-		status = -EINVAL;
-		goto leave;
-	}
 
-	strcpy(ocfs_hostname, system_utsname.nodename);
-	printk("ocfs2: hostname is %s\n", ocfs_hostname);
-	
 	ocfs_table_header = register_sysctl_table(ocfs_root_table, 0);
 	if (!ocfs_table_header) {
 		LOG_ERROR_STATUS(status = -ENOMEM);
@@ -536,18 +536,28 @@
 	}
 
 	memset (&OcfsGlobalCtxt, 0, sizeof (ocfs_global_ctxt));
-	memset (&OcfsIpcCtxt, 0, sizeof (ocfs_ipc_ctxt));
 
 	INIT_LIST_HEAD (&(OcfsGlobalCtxt.osb_next));
 	INIT_LIST_HEAD (&(OcfsGlobalCtxt.osb_next));
 
-	/* Read remaining insmod params */
-	if ((status = ocfs_read_params ()) < 0)
-	    goto leave;
+	/* Ok, just use utsname for now. Eventually we need to
+	 * get this from the node config subsystem. */
+	OcfsGlobalCtxt.node_name = kmalloc(OCFS2_MAX_NODE_NAME_LENGTH,
+					   GFP_KERNEL);
+	if (!OcfsGlobalCtxt.node_name) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto leave;
+	}
+	memset(OcfsGlobalCtxt.node_name, 0, OCFS2_MAX_NODE_NAME_LENGTH);
+	strncpy(OcfsGlobalCtxt.node_name, system_utsname.nodename,
+		OCFS2_MAX_NODE_NAME_LENGTH - 1);
 
+	printk("ocfs2: node name is %s\n", OcfsGlobalCtxt.node_name);
+
 	/* Initialize the global data resource */
 	init_MUTEX (&(OcfsGlobalCtxt.global_res));
-	OCFS_SET_FLAG (OcfsGlobalCtxt.flags, OCFS_FLAG_GLBL_CTXT_RESOURCE_INITIALIZED);
+	OcfsGlobalCtxt.flags |= OCFS_FLAG_GLBL_CTXT_RESOURCE_INITIALIZED;
 
 	/* Initialize the memory slabs for oin and file entry */
 	status = ocfs_initialize_mem_lists ();
@@ -561,17 +571,11 @@
 	osb_id = 0;
 	spin_unlock (&osb_id_lock);
 
-	spin_lock_init (&mount_cnt_lock);
-	spin_lock (&mount_cnt_lock);
-	mount_cnt = 0;
-	spin_unlock (&mount_cnt_lock);
-
 	spin_lock_init (&OcfsGlobalCtxt.comm_seq_lock);
 	spin_lock (&OcfsGlobalCtxt.comm_seq_lock);
 	OcfsGlobalCtxt.comm_seq_num = 0;
 	spin_unlock (&OcfsGlobalCtxt.comm_seq_lock);
 	
-	init_MUTEX (&recovery_list_sem);
 	/* Initialize the proc interface */
 	ocfs_proc_init ();
 
@@ -583,8 +587,7 @@
 
 		/* Delete the global context resource */
 		if (OcfsGlobalCtxt.flags & OCFS_FLAG_GLBL_CTXT_RESOURCE_INITIALIZED)
-			OCFS_CLEAR_FLAG (OcfsGlobalCtxt.flags,
-				       OCFS_FLAG_GLBL_CTXT_RESOURCE_INITIALIZED);
+			OcfsGlobalCtxt.flags &= ~OCFS_FLAG_GLBL_CTXT_RESOURCE_INITIALIZED;
 
 		if (ocfs_table_header)
 			unregister_sysctl_table(ocfs_table_header);
@@ -601,79 +604,6 @@
 }				/* ocfs_driver_entry */
 
 /*
- * ocfs_read_params()
- *
- * Read insmod params
- */
-static int ocfs_read_params(void)
-{
-	int status = 0;
-	__u32 check_sum = 0;
-	int i;
-
-	/* Read remaining insmod params */
-	if (node_number != OCFS_INVALID_NODE_NUM) {
-		// this will be validated later
-		OcfsGlobalCtxt.pref_node_num = node_number;
-		LOG_TRACE_ARGS("Preferred node number: %d\n", node_number);
-	}
-
-	if (ip_port_v2 == 0)
-		OcfsGlobalCtxt.comm_info.ip_port =
-			htons(OCFS_IPC_DEFAULT_PORT);
-	else if (ip_port_v2 & 0xFFFF0000) {
-		status = -EINVAL;
-		LOG_ERROR_STR("'ip_port_v2' is too large'");
-	}
-	else
-		OcfsGlobalCtxt.comm_info.ip_port =
-			htons((u16)ip_port_v2);
-	LOG_TRACE_ARGS("IP port: %d\n",
-		       ntohs(OcfsGlobalCtxt.comm_info.ip_port));
-
-	if (node_name && strlen(node_name) < MAX_NODE_NAME_LENGTH) {
-		OcfsGlobalCtxt.node_name = node_name;
-		LOG_TRACE_ARGS ("Node name: %s\n", OcfsGlobalCtxt.node_name);
-	} else {
-		status = -EINVAL;
-		LOG_ERROR_STR ("'node_name' not set or too long");
-	}
-
-#define MAX_IPv4_ADDR_STR_LEN	15 /* 4x '255' + 3x '.' */
-	if (ip_address && strlen (ip_address) <= MAX_IPv4_ADDR_STR_LEN) {
-		OcfsGlobalCtxt.comm_info.addr_u.ip_addr4 =
-			in_aton(ip_address);
-		LOG_TRACE_ARGS ("IP address: %s\n", ip_address);
-	} else {
-		status = -EINVAL;
-		LOG_ERROR_STR ("'ip_address' not set or too long");
-	}
-
-	if (guid && strlen (guid) == OCFS2_GUID_LEN) {
-		memcpy(&OcfsGlobalCtxt.guid.guid, guid, OCFS2_GUID_LEN);
-		LOG_TRACE_ARGS ("Node guid: %s\n", guid);
-	} else {
-		status = -EINVAL;
-		LOG_ERROR_STR ("'guid' not set correctly");
-	}
-
-	if (status == 0) {
-		for (i = 0; i < OCFS2_GUID_LEN; ++i)
-			check_sum += (__u32) guid[i];
-		if (cs != check_sum) {
-			status = -EINVAL;
-			LOG_ERROR_STR ("load module using load_ocfs2");
-		}
-	}
-
-	/* hardcoding... not used yet */
-	OcfsGlobalCtxt.comm_info.ip_version = htons(4);
-
-	return status;
-}				/* ocfs_read_params */
-
-
-/*
  * ocfs_driver_exit()
  *
  * Called on rmmod
@@ -688,7 +618,6 @@
 
 	/* Signal DLM thread to exit */
 	down (&(OcfsGlobalCtxt.global_res));
-	OCFS_SET_FLAG (OcfsGlobalCtxt.flags, OCFS_FLAG_SHUTDOWN_VOL_THREAD);
 
 	if (OcfsGlobalCtxt.flags & OCFS_FLAG_MEM_LISTS_INITIALIZED)
 		ocfs_free_mem_lists ();
@@ -702,6 +631,9 @@
 
 	exit_ocfs2_extent_maps();
 
+	if (OcfsGlobalCtxt.node_name)
+		kfree(OcfsGlobalCtxt.node_name);
+
 	printk("Unloaded OCFS Driver module\n");
 	LOG_EXIT ();
 	return;
@@ -812,7 +744,7 @@
 		sizeof(ocfs_journal_lock), 0, SLAB_NO_REAP | SLAB_HWCACHE_ALIGN,
 		NULL, NULL);
 	
-	OCFS_SET_FLAG (OcfsGlobalCtxt.flags, OCFS_FLAG_MEM_LISTS_INITIALIZED);
+	OcfsGlobalCtxt.flags |= OCFS_FLAG_MEM_LISTS_INITIALIZED;
 
 	return 0;
 }				/* ocfs_initialize_mem_lists */
@@ -825,7 +757,7 @@
 {
 	kmem_cache_destroy (OcfsGlobalCtxt.inode_cache);
 	kmem_cache_destroy (OcfsGlobalCtxt.lock_cache);
-	OCFS_CLEAR_FLAG (OcfsGlobalCtxt.flags, OCFS_FLAG_MEM_LISTS_INITIALIZED);
+	OcfsGlobalCtxt.flags &= ~OCFS_FLAG_MEM_LISTS_INITIALIZED;
 }				/* ocfs_free_mem_lists */
 
 static int ocfs2_sb_probe(struct super_block *sb,
@@ -926,18 +858,58 @@
 	return 0;
 }
 
+/* Determine this node's cluster group and node number. The group name
+ * comes from the "group=" mount option when given (ownership of the
+ * string transfers to osb); otherwise it is synthesized as the hex
+ * encoding of the volume uuid. */
+static int ocfs2_fill_local_node_info(ocfs_super *osb, char **group_name)
+{
+	int status, i;
+	struct inode *group = NULL;
+	char *p;
 
+	if (group_name) {
+		osb->group_name = *group_name;
+		*group_name = NULL;
+	} else {
+		osb->group_name = kmalloc(NM_MAX_NAME_LEN + 1, GFP_KERNEL);
+		if (!osb->group_name) {
+			status = -ENOMEM;
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+		memset(osb->group_name, 0, NM_MAX_NAME_LEN + 1);
+		/* write the hex digits into group_name, not back over
+		 * the uuid we are reading from -- walking p through
+		 * osb->uuid would clobber the source bytes and overrun
+		 * the uuid buffer (each byte expands to two chars).
+		 * NOTE(review): if osb->uuid is plain (signed) char,
+		 * the %02X of a negative byte prints FFFFFFxx --
+		 * confirm uuid is unsigned. */
+		for (i = 0, p = osb->group_name; i < MAX_VOL_ID_LENGTH; i++, p += 2)
+			sprintf(p, "%02X", osb->uuid[i]);
+	}
+
+	group = nm_get_group_by_name(osb->group_name);
+	if (!group) {
+		printk("ocfs2: could not join group \"%s\"\n",
+		       osb->group_name);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	osb->group_inode = group;
+	osb->node_num = nm_this_node(group);
+
+	printk("ocfs2: I am node %d, a member of group %s\n", osb->node_num,
+	       osb->group_name);
+
+	status = 0;
+bail:
+
+	return status;
+}
+
 /*
  * ocfs_mount_volume()
  *
  */
-static int ocfs_mount_volume (struct super_block *sb, int reclaim_id, struct inode *root)
+static int ocfs_mount_volume (struct super_block *sb, int reclaim_id,
+			      char **group_name, struct inode *root)
 {
-	int status;
-	ocfs_super *osb;
-	int child_pid;
+	int status, sector_size;
+	int unlock_super = 0;
+	ocfs_super *osb = NULL;
 	struct buffer_head *bh = NULL;
-	int sector_size;
 
 	LOG_ENTRY ();
 
@@ -948,7 +920,7 @@
 		goto leave;
 	}
 
-	if ((osb = ocfs_malloc (sizeof (ocfs_super))) == NULL) {
+	if ((osb = kmalloc (sizeof(ocfs_super), GFP_KERNEL)) == NULL) {
 		LOG_ERROR_STATUS (status = -ENOMEM);
 		goto leave;
 	}
@@ -972,97 +944,79 @@
 		goto leave;
 	}
 
-	down(&(osb->osb_res));
+	status = ocfs2_fill_local_node_info(osb, group_name);
+	if (status < 0) {
+		LOG_ERROR_STATUS (status);
+		goto leave;
+	}
 
-	/* Launch the NM thread for the mounted volume */
-	osb->dlm_task = NULL;
-	child_pid = kernel_thread (ocfs_heartbeat_thread, osb,
-				   CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
-	if (child_pid < 0) {
-		LOG_ERROR_ARGS ("unable to launch ocfs2nm thread, error=%d",
-				child_pid);
-		up (&(osb->osb_res));
-		status = child_pid;
+	status = ocfs2_register_hb_callbacks(osb);
+	if (status < 0) {
+		LOG_ERROR_STATUS (status);
 		goto leave;
-	} else {
-		init_completion (&osb->dlm_complete);
 	}
 
-	up (&(osb->osb_res));
+	status = ocfs2_dlm_init(osb);
+	if (status < 0) {
+		LOG_ERROR_STATUS (status);
+		goto leave;
+	}
 
-	/* Add proc entry for this volume */
-	ocfs_proc_add_volume (osb);
+	/* requires vote_thread to be running. */
+	status = ocfs2_register_net_handlers(osb);
+	if (status < 0) {
+		LOG_ERROR_STATUS (status);
+		goto leave;
+	}
 
-	/* GlobalMountCount */
-	spin_lock (&mount_cnt_lock);
-	mount_cnt++;
-	if (mount_cnt == 1) {
-		OcfsIpcCtxt.dlm_msg_size = OCFS_DLM_MAX_MSG_SIZE;
-		OcfsIpcCtxt.version = OCFS_IPC_DLM_VERSION;
-		/* start the listener thread */
-		status = ocfs_init_udp_sock(&OcfsIpcCtxt.send_sock,
-					    &OcfsIpcCtxt.recv_sock);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto leave;
-		}
-		OcfsIpcCtxt.task = NULL;
-		child_pid = kernel_thread (ocfs_recv_thread, NULL,
-					    CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
-		if (child_pid >= 0) {
-			init_completion (&(OcfsIpcCtxt.complete));
-		} else {
-			status = child_pid;
-			LOG_ERROR_ARGS ("unable to launch ocfs2lsnr thread, error=%d", child_pid);
-			goto leave;
-		}
+	status = ocfs2_super_lock(osb, 1);
+	if (status < 0) {
+		LOG_ERROR_STATUS (status);
+		goto leave;
 	}
-	spin_unlock (&mount_cnt_lock);
+	unlock_super = 1;
 
-	/* wait for nm thread to be init */
-	ocfs_wait (osb->nm_init_event, (atomic_read (&osb->nm_init) >= OCFS_HEARTBEAT_INIT ), 0);
+	/* This will load up the node map and add ourselves to it. */
+	status = ocfs2_find_slot(osb);
+	if (status < 0) {
+		LOG_ERROR_STATUS (status);
+		goto leave;
+	}
 
-	down(&(osb->osb_res));
-	down (&(osb->publish_lock));
-	ocfs_nm_heart_beat (osb, HEARTBEAT_METHOD_DISK, 1);
-	up (&(osb->publish_lock));
+	ocfs2_populate_mounted_map(osb);
 
-	ocfs_node_map_set_bit(&osb->publ_map, osb->node_num);
-	up (&(osb->osb_res));
+	/* load all node-local system inodes */
+	status = ocfs_init_local_system_inodes(osb);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto leave;
+	}
 
+	/* Add proc entry for this volume */
+	ocfs_proc_add_volume (osb);
+
 	/* Read the publish sector for this node and cleanup dirent being */
 	/* modified when we crashed. */
 	LOG_TRACE_STR ("ocfs_check_volume...");
-	down(&(osb->osb_res));
 	status = ocfs_check_volume (osb);
 	if (status < 0) {
-		up(&(osb->osb_res));
 		LOG_ERROR_STATUS (status);
 		goto leave;
 	}
 
-	/* Launch the commit thread */
-	osb->commit = ocfs_malloc(sizeof(ocfs_commit_task));
-	if (osb->commit == NULL) {
-		LOG_ERROR_STATUS(status = -ENOMEM);
-		up (&(osb->osb_res));
-		goto leave;
-	}
-	memset(osb->commit, 0, sizeof(ocfs_commit_task));
-	child_pid = kernel_thread (ocfs_commit_thread, osb,
-				   CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
-	if (child_pid < 0) {
-		LOG_ERROR_ARGS ("unable to launch ocfs2commit thread, error=%d",
-				child_pid);
-		up (&(osb->osb_res));
-		status = child_pid;
-		goto leave;
-	} else {
-		init_completion (&osb->commit->c_complete);
-	}
-	up (&(osb->osb_res));
+	/* This should be sent *after* we recovered our journal as it
+	 * will cause other nodes to unmark us as needing
+	 * recovery. However, we need to send it *before* dropping the
+	 * super block lock as otherwise their recovery threads might
+	 * try to clean us up while we're live! */
+	status = ocfs2_request_mount_vote(osb);
+	if (status < 0)
+		LOG_ERROR_STATUS (status);
 
 leave:
+	if (unlock_super)
+		ocfs2_super_unlock(osb, 1);
+
 	if (bh != NULL)
 		brelse(bh);
 	LOG_EXIT_STATUS (status);
@@ -1074,232 +1028,67 @@
  * ocfs_dismount_volume()
  *
  */
-static int ocfs_dismount_volume (struct super_block *sb)
+static void ocfs_dismount_volume (struct super_block *sb)
 {
-	int status;
-	int AcquiredOSB = 0;
+	int tmp;
 	ocfs_super *osb = NULL;
-	int i;
 
 	LOG_ENTRY_ARGS ("(0x%p)\n", sb);
 
-	if (sb == NULL) {
-		LOG_ERROR_STATUS (status = -EINVAL);
-		goto leave;
-	}
-
+	OCFS_ASSERT(sb);
 	osb = OCFS_SB(sb);
+	OCFS_ASSERT(osb);
 
-	if (osb == NULL) {
-		LOG_ERROR_STATUS (status = -EINVAL);
-		goto leave;
-	}
+	ocfs_shutdown_local_alloc(osb);
 
 	/* disable any new recovery threads and wait for any currently
 	 * running ones to exit. Do this before setting the vol_state. */
 	down(&osb->recovery_lock);
 	osb->disable_recovery = 1;
-	up(&osb->recovery_lock);
-	while (atomic_read(&osb->num_recovery_threads)) {
+	while (osb->recovery_launched) {
+		up(&osb->recovery_lock);
 		LOG_TRACE_STR("Waiting on a recovery thread to complete.");
 		schedule();
+		down(&osb->recovery_lock);
 	}
+	up(&osb->recovery_lock);
 
-	down(&(osb->osb_res));
-	AcquiredOSB = 1;
-
-	ocfs_shutdown_local_alloc(osb);
 	ocfs_journal_shutdown(osb);
 
-	/* unset the mounted flag -- we're done with the journal and
-	 * the local alloc bitmap */
-	status = ocfs_publish_set_unmounted(osb, osb->node_num);
-	if (status < 0)
-		LOG_ERROR_STR("Could not set mounted flag!");
-
 	ocfs_sync_blockdev(sb);
 
 	/* Remove the proc element for this volume */
 	ocfs_proc_remove_volume (osb);
 
-	/* Dismount */
-	OCFS_SET_FLAG (osb->osb_flags, OCFS_OSB_FLAGS_BEING_DISMOUNTED);
-
-	/* Wait for this volume's NM thread to exit */
-	if (osb->dlm_task) {
-		LOG_TRACE_STR ("Waiting for ocfs2nm to exit....");
-		send_sig (SIGINT, osb->dlm_task, 0);
-		wait_for_completion (&(osb->dlm_complete));
-		osb->dlm_task = NULL;
+	tmp = ocfs2_super_lock(osb, 1);
+	if (tmp < 0) {
+		LOG_ERROR_STATUS(tmp);
+		return;
 	}
 
-	/* send dismount msg to all */
-	status = ocfs_send_dismount_msg (osb);
-	if (status < 0)
-		LOG_ERROR_STATUS (status);
+	tmp = ocfs2_request_umount_vote(osb);
+	if (tmp < 0)
+		LOG_ERROR_STATUS(tmp);
 
-	/* decrement mount count */
-	spin_lock (&mount_cnt_lock);
-	mount_cnt--;
-	if (mount_cnt == 0) {
-		/* Shutdown ocfslsnr */
-		if (OcfsIpcCtxt.task) {
-			LOG_TRACE_STR ("Waiting for ocfs2lsnr to exit....");
-			send_sig (SIGINT, OcfsIpcCtxt.task, 0);
-			wait_for_completion (&(OcfsIpcCtxt.complete));
-			OcfsIpcCtxt.task = NULL;
-		}
-	}
-	spin_unlock (&mount_cnt_lock);
+	ocfs2_put_slot(osb);
 
+	ocfs2_dlm_shutdown(osb);
+
+	ocfs2_clear_hb_callbacks(osb);
+
 	atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
-	if (AcquiredOSB) {
-		up (&(osb->osb_res));
-		AcquiredOSB = 0;
-	}
 
 	printk ("ocfs2: Unmounting device (%u,%u) on %s (node %d)\n",
 		MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev),
-		osb->node_cfg_info[osb->node_num]->node_name, osb->node_num);
+		OcfsGlobalCtxt.node_name, osb->node_num);
 
-	/* Free all nodecfgs */
-	for (i = 0; i < osb->max_nodes; ++i) {
-		BARF_BARF_BARF *p;
-
-		p = osb->node_cfg_info[i];
-		if (p)
-    			kfree(p);
-	}
-
 	ocfs_release_system_inodes(osb);
 
 	ocfs_delete_osb (osb);
 	kfree(osb);
 	sb->s_dev = 0;
-
-leave:
-	if (AcquiredOSB) {
-		up (&(osb->osb_res));
-		AcquiredOSB = 0;
-	}
-
-	LOG_EXIT_STATUS (status);
-	return status;
 }				/* ocfs_dismount_volume */
 
-
-/* true if mounted, false otherwise */
-int ocfs_publish_get_mount_state(ocfs_super *osb, int node_num) 
-{
-	int status;
-	ocfs_publish *publish;
-	struct buffer_head *publish_bh = NULL;
-	int retval = 0;
-	int flags = 0;
-
-	LOG_ENTRY();
-
-	/* read it in */
-	/* we may be called during mount in which case our publish
-	 * sector might be dirty. */
-	if (node_num == osb->node_num)
-		flags = OCFS_BH_CACHED;
-	status = ocfs_read_block(osb, (osb->publish_blkno + node_num), 
-				 &publish_bh, flags, NULL);
-	if (status < 0) {
-		brelse(publish_bh);
-		LOG_ERROR_STR("Could not read publish sector, mounted value"
-			      " may be incorrect!");
-		LOG_ERROR_STATUS (status);
-		goto done;
-	}
-	publish = (ocfs_publish *) publish_bh->b_data;
-
-	retval = publish->mounted;
-
-	brelse(publish_bh);
-done:
-	LOG_EXIT_STATUS(retval);
-	return(retval);
-}
-
-static int ocfs_publish_toggle_mounted(ocfs_super *osb, int node_num, int value) 
-{
-	int status;
-	ocfs_publish *publish;
-	struct buffer_head * publish_bh = NULL;
-
-	LOG_ENTRY_ARGS("(node_num=%d, value=%d)\n", node_num, value);
-
-	/* read it in */
-	status = ocfs_read_block(osb, (osb->publish_blkno + node_num),
-				 &publish_bh, 0, NULL);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto done;
-	}
-	publish = (ocfs_publish *) publish_bh->b_data;
-
-	/* change it */
-	publish->mounted = value;
-
-	/* write it back out */
-	status = ocfs_write_block(osb, publish_bh, NULL);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto done;
-	}
-
-done:
-	if (publish_bh)
-		brelse(publish_bh);
-
-	LOG_EXIT_STATUS(status);
-
-	return(status);
-}
-
-/* set the 'mounted' bit in the publish sector */
-static int ocfs_publish_set_mounted(ocfs_super *osb, int node_num) 
-{
-	int retval;
-
-	LOG_ENTRY_ARGS("(node_num=%d)\n", node_num);
-
-	down (&(osb->publish_lock));
-
-	retval = ocfs_publish_toggle_mounted(osb, node_num, 1);
-	if (osb->node_num == node_num)
-		osb->check_mounted = 1;
-
-	up (&(osb->publish_lock));
-
-	LOG_EXIT_STATUS(retval);
-
-	return(retval);
-}
-
-/* unset the 'mounted' bit in the publish sector */
-static int ocfs_publish_set_unmounted(ocfs_super *osb, int node_num) 
-{
-	int retval;
-
-	LOG_ENTRY_ARGS("(node_num=%d)\n", node_num);
-
-	down (&(osb->publish_lock));
-
-	if (osb->node_num == node_num)
-		osb->check_mounted = 0;
-
-	retval = ocfs_publish_toggle_mounted(osb, node_num, 0);
-
-	up (&(osb->publish_lock));
-
-	LOG_EXIT_STATUS(retval);
-
-	return(retval);
-}
-
 /*
  * ocfs_initialize_osb()
  *
@@ -1307,14 +1096,10 @@
 static int ocfs_initialize_osb(ocfs_super *osb, struct buffer_head *bh)
 {
 	int status = 0;
-	ocfs_publish *publish = NULL;
-	u64 p_blkno;
-	struct buffer_head *publish_bh = NULL;  /* our own publish sector */
-	struct buffer_head **publish_bhs = NULL; /* all the publish sectors */
-	struct buffer_head *bitmap_bh = NULL;
 	int i;
 	ocfs2_dinode *di = NULL;
 	struct inode *inode = NULL;
+	struct buffer_head *bitmap_bh = NULL;
 
 	LOG_ENTRY ();
 
@@ -1323,16 +1108,15 @@
 	if (!osb->vol_label) {
 		LOG_ERROR_STR("unable to alloc vol label");
 		status = -ENOMEM;
-		goto done_nojournal;
+		goto bail;
 	}
 	osb->uuid = kmalloc(MAX_VOL_ID_LENGTH, GFP_KERNEL);
 	if (!osb->uuid) {
 		LOG_ERROR_STR("unable to alloc uuid");
 		status = -ENOMEM;
-		goto done_nojournal;
+		goto bail;
 	}
 
-
 	/* this needs to be done before most other initializations */
 	di = (ocfs2_dinode *) bh->b_data;
 	osb->max_nodes = le32_to_cpu(di->id2.i_super.s_max_nodes);
@@ -1340,7 +1124,7 @@
 		LOG_ERROR_ARGS("Invalid number of nodes (%u)\n",
 			       osb->max_nodes);
 		status = -EINVAL;
-		goto done_nojournal;
+		goto bail;
 	}
 	printk("max_nodes for this device: %u\n", osb->max_nodes);
 
@@ -1354,16 +1138,23 @@
 	if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) {
 		LOG_ERROR_ARGS("couldn't mount because of unsupported "
 			       "optional features (%x).\n", i);
-		goto done_nojournal;
+		goto bail;
 	}
 	if (!(osb->sb->s_flags & MS_RDONLY) &&
 	    (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) {
 		LOG_ERROR_ARGS("couldn't mount RDWR because of "
 			       "unsupported optional features (%x).\n",
 			      i);
-		goto done_nojournal;
+		goto bail;
 	}
 
+	init_waitqueue_head(&osb->recovery_event);
+	atomic_set(&osb->wake_vote_task, 0);
+	spin_lock_init(&osb->vote_task_lock);
+	init_waitqueue_head(&osb->vote_event);
+	INIT_LIST_HEAD(&osb->blocked_lock_list);
+	osb->blocked_lock_count = 0;
+	INIT_LIST_HEAD(&osb->vote_list);
 	spin_lock_init(&osb->s_next_gen_lock);
 	get_random_bytes(&osb->s_next_generation, sizeof(u32));
 
@@ -1380,93 +1171,33 @@
 	if (!osb->journal) {
 		LOG_ERROR_STR("unable to alloc journal");
 		status = -ENOMEM;
-		goto done_nojournal;
+		goto bail;
 	}
 	memset(osb->journal, 0, sizeof(ocfs_journal));
 
-	publish_bhs = kmalloc(sizeof(struct buffer_head *) * osb->max_nodes, GFP_KERNEL);
-	if (publish_bhs == NULL) {
-		LOG_ERROR_STATUS(status = -ENOMEM);
-		goto finally;
-	}
-	memset(publish_bhs, 0, sizeof(struct buffer_head *) * osb->max_nodes);
+	ocfs2_init_node_maps(osb);
 
-	osb->vol_node_map = kmalloc(sizeof(ocfs_vol_node_map) * osb->max_nodes, GFP_KERNEL);
-	if (!osb->vol_node_map) {
-		LOG_ERROR_STATUS(status = -ENOMEM);
-		goto bail;
-	}
-	memset(osb->vol_node_map, 0, sizeof(ocfs_vol_node_map) * osb->max_nodes);
-
-	osb->lock_recovery_lists = kmalloc(sizeof(struct list_head) * osb->max_nodes, GFP_KERNEL);
-	if (!osb->lock_recovery_lists) {
-		LOG_ERROR_STATUS(status = -ENOMEM);
-		goto bail;
-	}
-	memset(osb->lock_recovery_lists, 0, sizeof(struct list_head) * osb->max_nodes);
-
-	osb->last_publ_seq_num = kmalloc(sizeof(__u64) * osb->max_nodes, GFP_KERNEL);
-	if (!osb->last_publ_seq_num) {
-		LOG_ERROR_STATUS(status = -ENOMEM);
-		goto bail;
-	}
-	memset(osb->last_publ_seq_num, 0, sizeof(__u64) * osb->max_nodes);
-	
-	osb->node_cfg_info = kmalloc(sizeof(BARF_BARF_BARF *) * osb->max_nodes, GFP_KERNEL);
-	if (!osb->node_cfg_info) {
-		LOG_ERROR_STATUS(status = -ENOMEM);
-		goto bail;
-	}
-	memset(osb->node_cfg_info, 0, sizeof(BARF_BARF_BARF *) * osb->max_nodes);
-
-	ocfs_node_map_init(osb, &osb->publ_map);
-
-		
-	OCFS_CLEAR_FLAG (osb->osb_flags, OCFS_OSB_FLAGS_SHUTDOWN);
-
 	INIT_LIST_HEAD (&(osb->osb_next));
 
 	snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
 		 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
 
-	init_MUTEX (&(osb->osb_res));
 	init_MUTEX (&(osb->recovery_lock));
-	init_MUTEX (&(osb->orphan_recovery_lock));
-	init_MUTEX (&(osb->comm_lock));
-	init_MUTEX (&(osb->cfg_lock));
-	init_MUTEX (&(osb->vote_sem));
 
-	spin_lock_init(&osb->recovery_map_lock);
-	ocfs_node_map_init(osb, &osb->recovery_map);
-
-	osb->needs_flush = 0;
 	osb->disable_recovery = 0;
+	osb->recovery_launched = 0;
 
-	init_MUTEX (&(osb->publish_lock));
-	atomic_set (&osb->node_req_vote, 0);
-
-	atomic_set (&osb->num_recovery_threads, 0);
-
-	init_waitqueue_head (&osb->nm_init_event);
-	atomic_set (&osb->nm_init, 0);
-
-	osb->publish_dirty = 0;
-	init_waitqueue_head (&osb->flush_event);
-	atomic_set (&osb->flush_event_woken, 0);
+	init_waitqueue_head (&osb->checkpoint_event);
+	atomic_set (&osb->needs_checkpoint, 0);
 	atomic_set (&osb->clean_buffer_seq, 1);
 	spin_lock_init (&osb->clean_buffer_lock);
-	spin_lock_init (&osb->vote_obj_queue_lock);
 
-	INIT_LIST_HEAD (&(osb->vote_obj_queue));
-	for (i=0; i<osb->max_nodes; i++) {
-		INIT_LIST_HEAD(&(osb->lock_recovery_lists[i]));
-	}
 	osb->node_num = OCFS_INVALID_NODE_NUM;
+	osb->slot_num = OCFS_INVALID_NODE_NUM;
 
 	osb->have_local_alloc = 0;
 	osb->local_alloc_bh = NULL;
 
-	init_waitqueue_head (&osb->open_event);
 	/* not using any of these sb fields yet */
 #if 0
 di->i_ctime = cpu_to_le64(format_time); // use this as s_wtime (write time)
@@ -1533,63 +1264,16 @@
 		goto bail;
 	}
 
-	status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &p_blkno,
-					     NULL);
-	if (status < 0) {
-		LOG_ERROR_STATUS(status);
-		goto bail;
-	}
-
-	// i_size must be at least
-	// (2 + osb->max_nodes + 4) + osb->max_nodes + osb->max_nodes
-	if (inode->i_size >> osb->sb->s_blocksize_bits < 
-	    (OCFS_VOLCFG_HDR_SECTORS + osb->max_nodes) + // autoconfig
-	    OCFS_VOLCFG_NEWCFG_SECTORS + // new autoconfig
-	    osb->max_nodes + // publish
-	    osb->max_nodes ) { // vote
+	if (inode->i_size >> osb->sb->s_blocksize_bits < OCFS2_MAX_NODES) {
 		LOG_ERROR_ARGS("dlm area size incorrect: "
 			       "found=%llu, need=%u\n", 
 			       inode->i_size,
-			       (OCFS_VOLCFG_HDR_SECTORS + 
-				OCFS_VOLCFG_NEWCFG_SECTORS +
-			       (osb->max_nodes*3)) << 
-			       osb->sb->s_blocksize_bits);
+			       OCFS2_MAX_NODES << osb->sb->s_blocksize_bits);
 		status = -EINVAL;
 		goto bail;
 
 	}
-	osb->autoconfig_blkno = p_blkno;
-	osb->autoconfig_blocks = OCFS_VOLCFG_HDR_SECTORS + osb->max_nodes;
-	
-	osb->new_autoconfig_blkno = osb->autoconfig_blkno + osb->autoconfig_blocks;
-	osb->new_autoconfig_blocks = OCFS_VOLCFG_NEWCFG_SECTORS;
-	osb->total_autoconfig_blocks = OCFS_VOLCFG_NEWCFG_SECTORS + osb->max_nodes;
-	
-	osb->publish_blkno = osb->new_autoconfig_blkno + osb->new_autoconfig_blocks;
-	osb->publish_blocks = osb->max_nodes;
-	
-	osb->vote_blkno = osb->publish_blkno + osb->publish_blocks;
-	osb->vote_blocks = osb->max_nodes;
-
-	printk("autoconfig: blkno=%llu, blocks=%u newblkno=%llu newblocks=%u\n", 
-	       osb->autoconfig_blkno, osb->autoconfig_blocks, 
-	       osb->new_autoconfig_blkno, osb->new_autoconfig_blocks);
-	printk("publish: blkno=%llu, blocks=%u\n", osb->publish_blkno, 
-	       osb->publish_blocks);
-	printk("vote: blkno=%llu, blocks=%u\n", osb->vote_blkno, osb->vote_blocks);
-
-	osb->autoconfig_bhs = ocfs_malloc (osb->total_autoconfig_blocks
-				    * sizeof(struct buffer_head *));
-	if (!osb->autoconfig_bhs) {
-		LOG_ERROR_STATUS (status = -ENOMEM);
-		goto bail;
-	}
-	memset(osb->autoconfig_bhs, 0, 
-	       osb->total_autoconfig_blocks * sizeof(struct buffer_head *));
-
 	iput(inode);
-
-
 	
 	/* 
 	 * global bitmap 
@@ -1599,7 +1283,7 @@
 		LOG_ERROR_STATUS(status = -EINVAL);
 		goto bail;
 	}
-
+	
 	osb->bitmap_blkno = OCFS_I(inode)->ip_blkno;
 
 	status = ocfs_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0, inode);
@@ -1608,6 +1292,7 @@
 		LOG_ERROR_STATUS(status);
 		goto bail;
 	}
+
 	di = (ocfs2_dinode *) bitmap_bh->b_data;
 	osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
 	osb->num_clusters = le32_to_cpu(di->id1.bitmap1.i_total);
@@ -1615,75 +1300,18 @@
 	printk("cluster bitmap inode: %llu, clusters per group: %u\n",
 	       osb->bitmap_blkno, osb->bitmap_cpg);
 
-	osb->prealloc_lock = 0;
-
-
-	status = ocfs_get_config (osb);
+	status = ocfs2_init_slot_info(osb);
 	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto finally;
-	}
-
-	/* Read the Publish Sector of local Node */
-	status = ocfs_read_block(osb, (osb->publish_blkno + osb->node_num),
-				 &publish_bh, 0, NULL);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto finally;
-	}
-	publish = (ocfs_publish *) publish_bh->b_data;
-
-	/* 
-	 * FIXME: This really ought to be something exported by the
-	 * identical code in heartbeat.c
-	 */
-	publish->time = jiffies;
-	/* Disallow 0 */
-	if (!publish->time)
-    		publish->time = 1;
-
-	publish = NULL;
-
-	status = ocfs_write_block(osb, publish_bh, NULL);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto finally;
-	}
-	brelse(publish_bh);
-
-	/*  Read disk for all Publish Sectors  */
-	status = ocfs_read_blocks(osb, osb->publish_blkno, osb->max_nodes,
-				  publish_bhs, 0, NULL);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto finally;
-	}
-
-	ocfs_update_publish_map(osb, publish_bhs, 1);
-
-	for(i = 0; i < osb->max_nodes; i++)
-		osb->last_publ_seq_num[i] = (__u64) (-1);
-
-
-	/* load all node-local system inodes */
-	status = ocfs_init_local_system_inodes(osb);
-	if (status < 0) {
 		LOG_ERROR_STATUS(status);
 		goto bail;
 	}
 
-	/* We might need to add a variable in Global List of osb to */
-	/* delay any creation, if any other node is already creating a file */
-
 	/*  Link this osb onto the global linked list of all osb structures. */
 	/*  The Global Link List is mainted for the whole driver . */
 	down (&(OcfsGlobalCtxt.global_res));
 	list_add_tail (&(osb->osb_next), &(OcfsGlobalCtxt.osb_next));
 	up (&(OcfsGlobalCtxt.global_res));
 
-	/*  Mark the fact that this osb structure is initialized. */
-	OCFS_SET_FLAG (osb->osb_flags, OCFS_OSB_FLAGS_INITIALIZED);
-
 	spin_lock (&osb_id_lock);
 	osb->osb_id = osb_id;
 	if (osb_id < OCFS_MAX_OSB_ID)
@@ -1696,35 +1324,7 @@
 	}
 	spin_unlock (&osb_id_lock);
 
-
-	/* skip the frees which happen on error only */
-	goto finally;
-
 bail:
-	if (osb->autoconfig_bhs)
-		kfree(osb->autoconfig_bhs);
-	if (osb->vol_node_map)
-		kfree(osb->vol_node_map);
-	if (osb->lock_recovery_lists)
-		kfree(osb->lock_recovery_lists);
-	if (osb->last_publ_seq_num)
-		kfree(osb->last_publ_seq_num);
-	if (osb->node_cfg_info)
-		kfree(osb->node_cfg_info);
-finally:
-	if (publish) {
-		if (publish_bh) {
-			brelse(publish_bh);
-		}
-	}
-	if (publish_bhs[0]) {
-		int i;
-		for(i = 0; i < osb->max_nodes; i++)
-			if (publish_bhs[i])
-				brelse(publish_bhs[i]);
-	}
-
-done_nojournal:
 	LOG_EXIT_STATUS (status);
 	return status;
 }				/* ocfs_initialize_osb */
@@ -1754,7 +1354,6 @@
 			   OCFS2_MAJOR_REV_LEVEL ||
 			   le16_to_cpu(di->id2.i_super.s_minor_rev_level) !=
 			   OCFS2_MINOR_REV_LEVEL) {
-#warning dont know what is appropriate on minor rev difference
 			LOG_ERROR_ARGS("found superblock with bad version: "
 				       "found %u.%u, should be %u.%u\n",
 				       le16_to_cpu(di->id2.i_super.s_major_rev_level),
@@ -1789,6 +1388,35 @@
 	return status;
 }				/* ocfs_verify_volume */
 
+/* This part of local node recovery needs to happen after we've
+ * discovered all other nodes that need recovery and we've recovered
+ * them. */
+static int ocfs2_complete_mount_recovery(ocfs_super *osb)
+{
+	int status = 0;
+	ocfs2_dinode *local_alloc = osb->local_alloc_copy;
+
+	osb->local_alloc_copy = NULL;
+
+	if (osb->dirty) {
+		status = ocfs_complete_local_alloc_recovery(osb, local_alloc);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto finally;
+		}
+
+		status = ocfs_recover_orphans(osb);
+		if (status < 0)
+			LOG_ERROR_STATUS(status);
+	}
+	osb->dirty = 0;
+
+finally:
+	if (local_alloc)
+		kfree(local_alloc);
+	return status;
+}
+
 /*
  * ocfs_check_volume()
  *
@@ -1796,32 +1424,15 @@
 static int ocfs_check_volume (ocfs_super * osb)
 {
 	int status = 0;
-	ocfs_publish *publish = NULL;
-	int node_num = osb->node_num;
-	struct buffer_head * publish_bh = NULL;
-	int mounted;
+	int dirty;
 	ocfs2_dinode *local_alloc = NULL; /* only used if we
 					   * recover
 					   * ourselves. */
 
 	LOG_ENTRY ();
 
-	/* Read the node's publish sector */
-	status = ocfs_read_block(osb, (osb->publish_blkno + osb->node_num),
-				 &publish_bh, 0, NULL);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto finally;
-	}
-	
-	publish = (ocfs_publish *) publish_bh->b_data;
-	/* we copy this out of the publish sector and then unlock
-	 * the bh as other functions will need to modify it. */
-	mounted = publish->mounted;
-	publish = NULL;
-
 	/* Init our journal object. */
-	status = ocfs_journal_init(osb);
+	status = ocfs_journal_init(osb, &dirty);
 	if (status < 0) {
 		LOG_ERROR_STR("Could not initialize journal!");
 		goto finally;
@@ -1830,7 +1441,7 @@
 	/* If the journal was unmounted cleanly then we don't want to
 	 * recover anything. Otherwise, journal_load will do that
 	 * dirty work for us :) */
-	if (!mounted) {		
+	if (!dirty) {		
 		status = ocfs_journal_wipe(osb->journal, 0);
 		if (status < 0) {
 			LOG_ERROR_STATUS(status);
@@ -1844,10 +1455,10 @@
 	/* will play back anything left in the journal. */
 	ocfs_journal_load(osb->journal);
 
-	if (mounted) {
+	if (dirty) {
 		/* recover my local alloc if we didn't unmount cleanly. */
 		status = ocfs_begin_local_alloc_recovery(osb,
-							 node_num,
+							 osb->slot_num,
 							 &local_alloc);
 		if (status < 0) {
 			LOG_ERROR_STATUS(status);
@@ -1857,36 +1468,33 @@
 		 * ourselves as mounted. */
 	}
 
-	/* 'mounted' flag in publish sector should not be set until
-	 * after we successfully load the journal. */
-	status = ocfs_publish_set_mounted(osb, osb->node_num);
-	if (status < 0)
-		LOG_ERROR_STR("Could not set mounted flag!");
 	LOG_TRACE_STR("Journal loaded.");
 
 	status = ocfs_load_local_alloc(osb);
-	if (status < 0)
+	if (status < 0) {
 		LOG_ERROR_STATUS(status);
+		goto finally;
+	}
 
-	if (mounted) {
-		status = ocfs_complete_local_alloc_recovery(osb, local_alloc);
-		if (status < 0) {
-			LOG_ERROR_STATUS(status);
-			goto finally;
-		}
-
-		status = ocfs_recover_orphans(osb);
-		if (status < 0)
-			LOG_ERROR_STATUS(status);
+	if (dirty) {
+		/* Recovery will be completed after we've mounted the
+		 * rest of the volume. */
+		osb->dirty = 1;
+		osb->local_alloc_copy = local_alloc;
+		local_alloc = NULL;
 	}
 
+	/* Go through each journal: trylock it, and if we get the
+	 * lock and it's marked as dirty, set the bit in the recovery
+	 * map and launch a recovery thread for it. */
+	status = ocfs2_mark_dead_nodes(osb);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+
 finally:
 	if (local_alloc)
 		kfree(local_alloc);
 
-	if (publish_bh)
-		brelse(publish_bh);
-
 	LOG_EXIT_STATUS (status);
 	return status;
 }				/* ocfs_check_volume */
@@ -1902,7 +1510,6 @@
  */
 static void ocfs_delete_osb (ocfs_super * osb)
 {
-	int i;
 	LOG_ENTRY ();
 
 	/* This function assumes that the caller has the main osb resource */
@@ -1914,31 +1521,22 @@
 		list_del (&(osb->osb_next));
 	up (&(OcfsGlobalCtxt.global_res));
 
-	for (i=0; i<osb->max_nodes; i++)
-		ocfs_recover_oin_locks(osb, i);
+	if (osb->slot_info)
+		ocfs2_free_slot_info(osb->slot_info);
 
-	for(i = 0; i < osb->total_autoconfig_blocks; i++)
-		if (osb->autoconfig_bhs[i])
-			brelse(osb->autoconfig_bhs[i]);
+	if (osb->group_inode)
+		iput(osb->group_inode);
 
-	if (osb->autoconfig_bhs)
-		kfree(osb->autoconfig_bhs);
-	if (osb->vol_node_map)
-		kfree(osb->vol_node_map);
-	if (osb->lock_recovery_lists)
-		kfree(osb->lock_recovery_lists);
-	if (osb->last_publ_seq_num)
-		kfree(osb->last_publ_seq_num);
-	if (osb->node_cfg_info)
-		kfree(osb->node_cfg_info);
-
 	/* FIXME
 	 * This belongs in journal shutdown, but because we have to
 	 * allocate osb->journal at the start of ocfs_initalize_osb(),
 	 * we free it here.
 	 */
 	kfree(osb->journal);
-
+	if (osb->group_name)
+		kfree(osb->group_name);
+	if (osb->local_alloc_copy)
+		kfree(osb->local_alloc_copy);
 	memset (osb, 0, sizeof (ocfs_super));
 
 	LOG_EXIT ();

Modified: trunk/src/sysfile.c
===================================================================
--- trunk/src/sysfile.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/sysfile.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -46,10 +46,10 @@
 /* Tracing */
 #define OCFS_DEBUG_CONTEXT    OCFS_DEBUG_CONTEXT_SYSFILE
 
-static struct inode * _ocfs_get_system_file_inode(ocfs_super *osb, int type, __u32 node);
+static struct inode * _ocfs_get_system_file_inode(ocfs_super *osb, int type, __u32 slot);
 
 static inline int is_global_system_inode(int type);
-static inline int is_in_system_inode_array(ocfs_super *osb, int type, __u32 node);
+static inline int is_in_system_inode_array(ocfs_super *osb, int type, __u32 slot);
 
 static inline int is_global_system_inode(int type)
 {
@@ -57,19 +57,19 @@
 		type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE);
 }
 
-static inline int is_in_system_inode_array(ocfs_super *osb, int type, __u32 node)
+static inline int is_in_system_inode_array(ocfs_super *osb, int type, __u32 slot)
 {
-	return (node == osb->node_num || is_global_system_inode(type));
+	return (slot == osb->slot_num || is_global_system_inode(type));
 }
 
 struct inode *ocfs_get_system_file_inode(ocfs_super *osb, int type,
-					 __u32 node)
+					 __u32 slot)
 {
 	struct inode *inode = NULL;
 	struct inode **arr = NULL;
 
 	/* avoid the lookup if cached in local system file array */
-	if (is_in_system_inode_array(osb, type, node))
+	if (is_in_system_inode_array(osb, type, slot))
 		arr = &(osb->system_inodes[type]);
 
 	if (arr && ((inode = *arr) != NULL)) {
@@ -82,7 +82,7 @@
 	}
 	
 	/* this gets one ref thru iget */
-	inode = _ocfs_get_system_file_inode(osb, type, node);
+	inode = _ocfs_get_system_file_inode(osb, type, slot);
 
 	/* add one more if putting into array for first time */
 	if (arr && inode) {
@@ -93,7 +93,7 @@
 	return inode;
 }
 
-static struct inode * _ocfs_get_system_file_inode(ocfs_super *osb, int type, __u32 node)
+static struct inode * _ocfs_get_system_file_inode(ocfs_super *osb, int type, __u32 slot)
 {
 	char namebuf[40];
 	struct inode *inode = NULL;
@@ -104,7 +104,7 @@
 
 	ocfs2_sprintf_system_inode_name(namebuf,
 					sizeof(namebuf),
-					type, node);
+					type, slot);
 	
 	status = ocfs_find_files_on_disk(osb, namebuf, strlen(namebuf),
 					 &blkno, osb->sys_root_inode, 

Modified: trunk/src/sysfile.h
===================================================================
--- trunk/src/sysfile.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/sysfile.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -26,6 +26,6 @@
 #ifndef OCFS2_SYSFILE_H
 #define OCFS2_SYSFILE_H
 
-struct inode * ocfs_get_system_file_inode(ocfs_super *osb, int type, __u32 node);
+struct inode * ocfs_get_system_file_inode(ocfs_super *osb, int type, __u32 slot);
 
 #endif /* OCFS2_SYSFILE_H */

Modified: trunk/src/util.c
===================================================================
--- trunk/src/util.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/util.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -38,8 +38,6 @@
 /* Tracing */
 #define OCFS_DEBUG_CONTEXT  OCFS_DEBUG_CONTEXT_UTIL
 
-static void ocfs_timeout_func(unsigned long data);
-
 /* block all but 'mask' sigs, optionally saving off our previous
  * signal state. */
 void ocfs_block_sigs(sigset_t *oldsigs, unsigned long mask)
@@ -129,31 +127,6 @@
 }				/* ocfs_daemonize */
 #endif
 
-/*
- * ocfs_sleep()
- *
- * The interval time is in milliseconds
- *
- * This function needs to be removed.
- * Instead call schedule_timeout() directly and handle signals.
- */
-int ocfs_sleep (__u32 ms)
-{
-	__u32 numJiffies;
-
-	LOG_ENTRY ();
-
-	/* 10ms = 1 jiffy, minimum resolution is one jiffy */
-	numJiffies = ms * HZ / 1000;
-	numJiffies = (numJiffies < 1) ? 1 : numJiffies;
-
-	set_current_state (TASK_INTERRUPTIBLE);
-	numJiffies = schedule_timeout (numJiffies);
-
-	LOG_EXIT ();
-	return 0;
-}				/* ocfs_sleep */
-
 /* prefetch has been declared to allow to build in debug mode */
 #ifdef DEBUG
 #ifndef ARCH_HAS_PREFETCH
@@ -163,46 +136,6 @@
 #endif
 #endif
 
-
-static void ocfs_timeout_func(unsigned long data)
-{
-	ocfs_timeout *to = (ocfs_timeout *)data; 
-
-	to->timed_out = 1;
-	wake_up(&to->wait);
-}
-
-void ocfs_init_timeout(ocfs_timeout *to)
-{
-	init_timer(&to->timer);
-	to->timer.data = (unsigned long)to;
-	to->timer.function = ocfs_timeout_func;
-	to->timed_out = 0;
-	init_waitqueue_head(&to->wait);
-}
-
-void ocfs_set_timeout(ocfs_timeout *to, __u32 timeout)
-{
-	__u32 how_long;
-
-	if (!timeout) {
-		to->timed_out = 1;
-		return ;
-	}
-
-	how_long = (timeout * HZ / 1000);
-	if (how_long < 1)
-		how_long = 1;
-
-	to->timer.expires = jiffies + how_long;
-	add_timer(&to->timer);
-}
-
-void ocfs_clear_timeout(ocfs_timeout *to)
-{
-	del_timer_sync(&to->timer);
-}
-
 void ocfs_truncate_inode_pages(struct inode *inode, loff_t off)
 {
 	LOG_TRACE_ARGS("truncating pages for inode %llu (%p) from offset %llu\n", 
@@ -275,46 +208,5 @@
 
 }
 
-int __ocfs_wait_atomic_eq(wait_queue_head_t *wq, atomic_t *var, int val, int ms)
-{
-	int ret;
-	ocfs_timeout timeout;
-	DECLARE_WAITQUEUE(wait, current);
-	DECLARE_WAITQUEUE(to_wait, current);
 
-	ocfs_init_timeout(&timeout);
 
-	if (ms) {
-		ocfs_set_timeout(&timeout, ms);
-		if (timeout.timed_out) {
-			ocfs_clear_timeout(&timeout);
-		}
-	}
-	add_wait_queue(wq, &wait);
-	add_wait_queue(&timeout.wait, &to_wait);
-	do { 
-		ret = 0;
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (atomic_read(var)==val)
-			break;
-		ret = -ETIMEDOUT;
-		if (timeout.timed_out)
-			break;
-		schedule();
-		if (signal_pending(current)) {
-			ret = -EINTR;
-			break;
-		}
-	} while (1);
-	
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(wq, &wait);
-	remove_wait_queue(&timeout.wait, &to_wait);
-
-	if (ms)
-		ocfs_clear_timeout(&timeout);
-
-	return ret;
-}
-
-

Modified: trunk/src/util.h
===================================================================
--- trunk/src/util.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/util.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -26,26 +26,11 @@
 #ifndef OCFS2_UTIL_H
 #define OCFS2_UTIL_H
 
-void ocfs_clear_timeout(ocfs_timeout *to);
 void ocfs_daemonize(char *name, int len, int shutdown_sigs);
-void ocfs_init_timeout(ocfs_timeout *to);
-void ocfs_set_timeout(ocfs_timeout *to, __u32 timeout);
 void ocfs_show_stack(unsigned long *esp);
 void ocfs_show_trace(unsigned long *stack);
-int ocfs_sleep(__u32 ms);
 void ocfs_truncate_inode_pages(struct inode *inode, loff_t off);
-int __ocfs_wait_atomic_eq(wait_queue_head_t *wq, atomic_t *var, int val, int ms);
 void ocfs_block_sigs(sigset_t *oldsigs, unsigned long mask);
 void ocfs_unblock_sigs(sigset_t newsig);
 
-/* exits when var == val, or on timeout */
-static inline int ocfs_wait_atomic_eq(wait_queue_head_t *wq, atomic_t *var, int val, int timeout)
-{
-	int ret = 0;
-	if (atomic_read(var) != val)
-		ret = __ocfs_wait_atomic_eq(wq, var, val, timeout);
-	return ret;
-}
-
-
 #endif /* OCFS2_UTIL_H */

Deleted: trunk/src/volcfg.c
===================================================================
--- trunk/src/volcfg.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/volcfg.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -1,970 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * volcfg.c
- *
- * Auto configuration, namely, node number.
- *
- * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include "ocfs_compat.h"
-
-#include <linux/fs.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-
-#include "ocfs_log.h"
-#include "ocfs.h"
-
-#include "util.h"
-#include "volcfg.h"
-
-#include "buffer_head_io.h"
-
-/* Tracing */
-#define  OCFS_DEBUG_CONTEXT  OCFS_DEBUG_CONTEXT_VOLCFG
-
-static void ocfs_worker (void *arg);
-static void ocfs_assert_lock_owned (unsigned long arg);
-static int ocfs_add_to_disk_config (ocfs_super * osb, __s16 pref_node_num, ocfs_node_config_info * new_disk_node);
-static int ocfs_write_volcfg_header (ocfs_super * osb, ocfs_volcfg_op op);
-static int ocfs_update_disk_config (ocfs_super * osb, __u32 node_num, ocfs_node_config_info * disk);
-static int ocfs_release_disk_lock (ocfs_super * osb, __u64 lock_off);
-static int ocfs_add_node_to_config (ocfs_super * osb);
-static int ocfs_has_node_config_changed (ocfs_super * osb);
-static int ocfs_refresh_node_config (ocfs_super * osb);
-static void ocfs_show_all_node_cfgs (ocfs_super * osb);
-static int ocfs_disknode_to_node (BARF_BARF_BARF ** node, ocfs_node_config_info * disk);
-static void ocfs_volcfg_gblctxt_to_node(BARF_BARF_BARF *node);
-static void ocfs_volcfg_gblctxt_to_disknode(ocfs_node_config_info *disk);
-
-/*
- * ocfs_worker()
- *
- * This function reiterates the lock on the disk from this node once
- * it has obtained it.
- */
-static void ocfs_worker (void *arg)
-{
-	__u32 length;
-	int status;
-	ocfs_super *osb;
-	__u64 offset;
-	ocfs_cfg_task *cfg_task;
-	struct buffer_head *bh;
-
-	LOG_ENTRY ();
-
-	cfg_task = arg;
-
-	/* Obtain the volume for which we need to reiterate the lock */
-	osb = cfg_task->osb;
-	bh = cfg_task->bh;
-	length = osb->sb->s_blocksize;
-	offset = cfg_task->lock_off;
-
-	/* Write the sector back */
-	status = ocfs_write_block(osb, bh, NULL);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		/* deliberate no exit jump here */
-	}
-
-	if (atomic_read (&osb->lock_stop)) {
-		LOG_TRACE_ARGS ("Last Lock written : %lu\n", jiffies);
-		atomic_set (&osb->lock_event_woken, 1);
-		brelse(bh);
-		wake_up (&osb->lock_event);
-	} else {
-		LOG_TRACE_ARGS ("Lock written : %lu\n", jiffies);
-		mod_timer (&osb->lock_timer, jiffies + OCFS_VOLCFG_LOCK_ITERATE);
-	}
-
-	LOG_EXIT ();
-	return;
-}				/* ocfs_worker */
-
-/*
- * ocfs_assert_lock_owned()
- *
- * Routine called by a timer to reiterate the disk lock.
- */
-static void ocfs_assert_lock_owned (unsigned long arg)
-{
-	ocfs_cfg_task *cfg_task;
-
-	LOG_ENTRY ();
-
-	cfg_task = (ocfs_cfg_task *) arg;
-
-	/* initialize the task and submit it */
-	INIT_WORK(&cfg_task->cfg_wq, ocfs_worker, cfg_task);
-	schedule_work(&cfg_task->cfg_wq);
-
-	LOG_EXIT ();
-	return ;
-}				/* ocfs_assert_lock_owned */
-
-/*
- * ocfs_add_to_disk_config()
- *
- */
-static int ocfs_add_to_disk_config (ocfs_super * osb, __s16 pref_node_num, ocfs_node_config_info * new_disk_node)
-{
-	int status = 0;
-	int i;
-	ocfs_node_config_info *disk_node = NULL;
-	__s16 node_num;
-	struct buffer_head **cfg_bhs = NULL;
-
-	LOG_ENTRY ();
-
-	cfg_bhs = kmalloc(sizeof(struct buffer_head *) * osb->max_nodes, GFP_KERNEL);
-	if (cfg_bhs == NULL) {
-		LOG_ERROR_STATUS(status = -ENOMEM);
-		goto finally;
-	}
-	memset(cfg_bhs, 0, osb->max_nodes * sizeof(struct buffer_head *));
-
-	/* Read the nodecfg info for all nodes from disk */
-	status = ocfs_read_blocks(osb,
-				  (osb->autoconfig_blkno + OCFS_VOLCFG_HDR_SECTORS),
-				  (osb->autoconfig_blocks - OCFS_VOLCFG_HDR_SECTORS),
-				  cfg_bhs, 0, NULL);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto finally;
-	}
-
-	/* Check if preferred node num is available */
-	node_num = OCFS_INVALID_NODE_NUM;
-	if (pref_node_num < osb->max_nodes) {
-		disk_node = (ocfs_node_config_info *) cfg_bhs[pref_node_num]->b_data;
-		if (disk_node->node_name[0] == '\0')
-			node_num = pref_node_num;
-	}
-
-	/* if not, find the first available empty slot */
-	if (node_num == OCFS_INVALID_NODE_NUM) {
-		for (node_num = 0; node_num < osb->max_nodes; node_num++) {
-			disk_node = (ocfs_node_config_info *) cfg_bhs[node_num]->b_data;
-			if (disk_node->node_name[0] == '\0')
-				break;
-		}
-	}
-
-	/* If no free slots, error out */
-	if (node_num >= osb->max_nodes) {
-		LOG_ERROR_STR ("Unable to allocate node number as no slots " \
-			       "are available");
-		status = -ENOSPC;
-		goto finally;
-	}
-
-	/* Copy the new nodecfg into the memory buffer */
-	memcpy (cfg_bhs[node_num]->b_data, new_disk_node, osb->sb->s_blocksize);
-
-	/* Write the new node details on disk */
-	status = ocfs_write_block(osb, cfg_bhs[node_num], NULL);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto finally;
-	}
-
-	/* Update the nodecfg hdr on disk */
-	status = ocfs_write_volcfg_header (osb, OCFS_VOLCFG_ADD);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto finally;
-	}
-
-finally:
-	for (i = 0; i < osb->max_nodes; i++)
-		if (cfg_bhs[i])
-			brelse(cfg_bhs[i]);
-	if (cfg_bhs)
-		kfree(cfg_bhs);
-
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_add_to_disk_config */
-
-/*
- * ocfs_write_volcfg_header()
- *
- */
-static int ocfs_write_volcfg_header (ocfs_super * osb, ocfs_volcfg_op op)
-{
-	int status = 0;
-	ocfs_node_config_hdr *hdr, *hdr_copy;
-	struct buffer_head *node_cfg_bhs[2];
-	
-	LOG_ENTRY ();
-
-	node_cfg_bhs[0] = node_cfg_bhs[1] = NULL;
-	/* Read the nodecfg header */
-	status = ocfs_read_block(osb, osb->autoconfig_blkno, &node_cfg_bhs[0], 
-				 0, NULL);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto bail;
-	}
-
-	status = ocfs_read_block(osb, (osb->new_autoconfig_blkno + 1),
-				 &node_cfg_bhs[1], 0, NULL);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto bail;
-	}
-
-	hdr = (ocfs_node_config_hdr *) node_cfg_bhs[0]->b_data;
-	hdr_copy = (ocfs_node_config_hdr *) node_cfg_bhs[1]->b_data;
-
-	if (op == OCFS_VOLCFG_ADD)
-		hdr->num_nodes++;
-
-	/* Increment the seq# to trigger other nodes to re-read node cfg */
-	hdr->cfg_seq_num++;
-
-	memcpy(hdr_copy, hdr, osb->sb->s_blocksize);
-	/* Write the nodecfg header */
-	/* Write the nodecfg hdr into the second sector of newcfg also. */
-	/* We do so so that we can read the nodecfg hdr easily when we */
-	/* read the publish sector, for e.g. in ocfs_nm_thread() */
-	status = ocfs_write_blocks(osb, node_cfg_bhs, 2, NULL);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto bail;
-	}
-	brelse(node_cfg_bhs[0]);
-	brelse(node_cfg_bhs[1]);
-	node_cfg_bhs[0] = node_cfg_bhs[1] = NULL;
-
-bail:
-	if (node_cfg_bhs[0])
-		brelse(node_cfg_bhs[0]);
-	if (node_cfg_bhs[1])
-		brelse(node_cfg_bhs[1]);
-	
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_write_volcfg_header */
-
-/*
- * ocfs_config_with_disk_lock()
- *
- * This function tries to obtain the lock on the disk for the volume
- * specified. The logic for obtaining a disk lock is as follows :
- *
- * Read the volcfg lock sector. If it is not locked, lock it by stamping
- * ones node number. Read the same sector after OCFS_VOLCFG_LOCK_TIME.
- * If the contents have not been modified, the lock is ours. Retain the
- * lock by reiterating the lock write operation every OCFS_VOLCFG_ITERATE_TIME.
- *
- * If the volcfg lock sector is owned by someone else, wait for
- * OCFS_VOLCFG_LOCK_TIME and read the lock sector again. If the lock sector
- * is owned by the same node as before attempt to break the lock as the
- * node may have died. If however, the lock sector is now owned by someone
- * else, wait for OCFS_VOLCFG_LOCK_TIME before repeating the entire exercise
- * again.
- *
- * Returns 0 if success, < 0 if error.
- */
-static int ocfs_config_with_disk_lock (ocfs_super * osb, __u64 lock_off, __u8 * cfg_buf, __s16 node_num, ocfs_volcfg_op op)
-{
-	int status = 0;
-	char *lock_buf;
-	int tried_acq = 0;
-	int break_lock = 0;
-	ocfs2_disk_lock *disk_lock;
-	ocfs_cfg_task *cfg_task;
-	__s16 lock_node_num = OCFS_INVALID_NODE_NUM;
-	struct buffer_head *bh = NULL;
-	int i;
-
-	LOG_ENTRY ();
-
-	cfg_task = ocfs_malloc (sizeof (ocfs_cfg_task));
-	if (cfg_task == NULL)
-	{
-		LOG_ERROR_STATUS (status = -ENOMEM);
-		goto finito;
-	}
-
-	/* initialize cfg_task with info reqd to reiterate the volcfg lock */
-	cfg_task->osb = osb;
-	cfg_task->lock_off = lock_off;
-
-	/* Initialize the kernel timer */
-	init_timer(&osb->lock_timer);
-	osb->lock_timer.function = ocfs_assert_lock_owned;
-	osb->lock_timer.expires = 0;
-	osb->lock_timer.data = (unsigned long) cfg_task;
-
-	init_waitqueue_head (&osb->lock_event);
-	atomic_set (&osb->lock_event_woken, 0);
-	atomic_set (&osb->lock_stop, 0);
-
-	status = ocfs_read_block(osb, lock_off >> osb->sb->s_blocksize_bits, 
-				 &bh, 0, NULL);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto finito;
-	}
-	cfg_task->bh = bh;
-
-	for (i = 0; i < 50; i++) {
-		/* Read the volcfg lock sector */
-		status = ocfs_read_block(osb, 
-					 lock_off >> osb->sb->s_blocksize_bits,
-					 &bh, 0, NULL);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto finito;
-		}
-
-		disk_lock = (ocfs2_disk_lock *) bh->b_data;
-		lock_node_num = disk_lock->dl_master;
-
-		if (disk_lock->dl_level == 0 || break_lock) {
-			if (disk_lock->dl_level != 0)
-				LOG_TRACE_STR ("Try to break node config lock");
-			else
-				LOG_TRACE_STR ("Lock node config");
-
-			/* Attempt to lock volcfg */
-			memcpy(disk_lock, cfg_buf, osb->sb->s_blocksize);
-
-			disk_lock->dl_master = osb->node_num;
-			disk_lock->dl_level = 1;
-			memcpy(cfg_buf, disk_lock, osb->sb->s_blocksize);
-		
-			/* Write into volcfg lock sector... */
-			status = ocfs_write_block(osb, bh, NULL);
-			if (status < 0) {
-				LOG_ERROR_STATUS (status);
-				goto finito;
-			}
-			tried_acq = 1;
-		}
-
-		ocfs_sleep (OCFS_VOLCFG_LOCK_TIME);
-
-		/* Read the volcfg lock sector again... */
-		status = ocfs_read_block(osb, 
-					 lock_off >> osb->sb->s_blocksize_bits,
-					 &bh, 0, NULL);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto finito;
-		}
-
-		lock_buf = bh->b_data;
-
-		/* If we tried to acquire and we still own it we take it... */
-		if ((tried_acq) && (memcmp (lock_buf, cfg_buf, osb->sb->s_blocksize) == 0)) {
-			memcpy (lock_buf, cfg_buf, osb->sb->s_blocksize);
-
-			/* Set timer to reiterate lock every few jiffies */
-			LOG_TRACE_ARGS ("Start Timer: %lu\n", jiffies);
-			osb->lock_timer.expires = jiffies +
-						  OCFS_VOLCFG_LOCK_ITERATE;
-			/* we get_bh here because we brelse later in
-			 * this function, and so does the timer routine. */
-			get_bh(bh);
-			add_timer(&osb->lock_timer);
-
-			/* Write the config info into the disk */
-			disk_lock = (ocfs2_disk_lock *)cfg_buf;
-			disk_lock->dl_master = OCFS_INVALID_NODE_NUM;
-			disk_lock->dl_level = 0;
-
-			if (op == OCFS_VOLCFG_ADD)
-				status = ocfs_add_to_disk_config (osb, node_num,
-					(ocfs_node_config_info *) cfg_buf);
-			else if (op == OCFS_VOLCFG_UPD)
-				status = ocfs_update_disk_config(osb, node_num,
-				       (ocfs_node_config_info *) cfg_buf);
-			else
-				status = -EINVAL;
-			if (status < 0) {
-				LOG_ERROR_STATUS (status);
-				goto finito;
-			}
-			break;
-		} else {
-			disk_lock = (ocfs2_disk_lock *)lock_buf;
-			if (disk_lock->dl_master == lock_node_num)
-				break_lock = 1;
-			else {
-				LOG_TRACE_ARGS ("Node config locked by node: %d\n",
-					disk_lock->dl_master);
-				ocfs_sleep (OCFS_VOLCFG_LOCK_TIME);
-			}
-		}
-
-	}
-	if (i >= 50)
-		status = -EINVAL;
-
-finito:
-	ocfs_release_disk_lock (osb, lock_off);
-
-	if (cfg_task)
-		kfree(cfg_task);
-	if (bh)
-		brelse(bh);
-
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_config_with_disk_lock */
-
-/*
- * ocfs_release_disk_lock()
- *
- * This function Cancels the timer to reiterate we own the disk lock and
- * then frees it by writing the sector for the disk lock.
- *
- * Returns 0 if success, < 0 if error.
- */
-static int ocfs_release_disk_lock (ocfs_super * osb, __u64 lock_off)
-{
-	int status = 0;
-	struct buffer_head *bh;
-	struct super_block *sb;
-	__u64 blocknum;
-
-	LOG_ENTRY ();
-
-	sb = osb->sb;
-
-	blocknum = lock_off >> sb->s_blocksize_bits;
-	bh = sb_getblk(sb, blocknum);
-	if (bh == NULL) {
-		LOG_ERROR_STATUS (status = -EIO);
-		goto finally;
-	}
-
-	/* reset lock... */
-	memset (bh->b_data, 0, osb->sb->s_blocksize);
-
-	/* Release the lock */
-	status = ocfs_write_block(osb, bh, NULL);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto finally;
-	}
-
-	/* Cancel the timer so that we don't reiterate the lock anymore */
-	LOG_TRACE_STR ("Waiting for osb->lock_event");
-	atomic_set (&osb->lock_stop, 1);
-	ocfs_wait (osb->lock_event, atomic_read (&osb->lock_event_woken), 0);
-	atomic_set (&osb->lock_event_woken, 0);
-	del_timer_sync(&osb->lock_timer);
-
-	/* reset lock... */
-	memset (bh->b_data, 0, osb->sb->s_blocksize);
-
-	/* Release the lock */
-	status = ocfs_write_block(osb, bh, NULL);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto finally;
-	}
-
-finally:
-	if (bh)
-		brelse(bh);
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_release_disk_lock */
-
-/*
- * ocfs_add_node_to_config()
- *
- */
-static int ocfs_add_node_to_config (ocfs_super * osb)
-{
-	int status;
-	ocfs_node_config_info *disk;
-	void *buffer;
-	__u64 offset;
-
-	LOG_ENTRY ();
-
-	buffer = ocfs_malloc (osb->sb->s_blocksize);
-	if (buffer == NULL) {
-		LOG_ERROR_STATUS (status = -ENOMEM);
-		goto bail;
-	}
-	memset (buffer, 0, osb->sb->s_blocksize);
-
-	disk = buffer;
-
-	/* populate the disknodecfg info from global context */
-	ocfs_volcfg_gblctxt_to_disknode (disk);
-
-	/* Write this nodes config onto disk */
-	offset = (osb->new_autoconfig_blkno << osb->sb->s_blocksize_bits);
-	status = ocfs_config_with_disk_lock (osb, offset, (__u8 *) disk,
-					     OcfsGlobalCtxt.pref_node_num,
-					     OCFS_VOLCFG_ADD);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto bail;
-	}
-
-	status = ocfs_chk_update_config (osb);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto bail;
-	}
-
-bail:
-	if (buffer)
-		kfree(buffer);
-
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_add_node_to_config */
-
-/*
- * ocfs_disknode_to_node()
- *
- */
-static int ocfs_disknode_to_node (BARF_BARF_BARF ** node, ocfs_node_config_info * disk)
-{
-	int status = 0;
-
-	LOG_ENTRY ();
-
-	if (*node == NULL) {
-		if ((*node = (BARF_BARF_BARF *)
-		     ocfs_malloc (sizeof (BARF_BARF_BARF))) == NULL) {
-			LOG_ERROR_STATUS (status = -ENOMEM);
-			goto bail;
-		}
-		memset (*node, 0, sizeof (BARF_BARF_BARF));
-	}
-
-	strncpy ((*node)->node_name, disk->node_name, MAX_NODE_NAME_LENGTH);
-
-	memcpy((*node)->guid.guid, disk->guid.guid, OCFS2_GUID_LEN);
-
-	(*node)->ipc_config.ip_port = disk->ipc_config.ip_port;
-	(*node)->ipc_config.addr_u.ip_addr4 =
-		disk->ipc_config.addr_u.ip_addr4;
-	(*node)->ipc_config.ip_version = disk->ipc_config.ip_version;
-
-bail:
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_disknode_to_node */
-
-/*
- * ocfs_update_disk_config()
- *
- */
-static int ocfs_update_disk_config (ocfs_super * osb, __u32 node_num, ocfs_node_config_info * disk)
-{
-	int status = 0;
-	__u32 blocknum;
-	struct buffer_head *bh = NULL;
-	struct super_block *sb = NULL;
-
-	LOG_ENTRY ();
-
-	sb = osb->sb;
-	/* Write the node details */
-	blocknum = osb->autoconfig_blkno + OCFS_VOLCFG_HDR_SECTORS + node_num;
-
-	bh = sb_getblk(sb, blocknum);
-	if (bh == NULL) {
-		status = -EIO;
-		LOG_ERROR_STATUS(status);
-		goto finally;
-	}
-
-	memcpy(bh->b_data, disk, osb->sb->s_blocksize);
-
-	status = ocfs_write_block(osb, bh, NULL);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto finally;
-	}
-
-	status = ocfs_write_volcfg_header (osb, OCFS_VOLCFG_UPD);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto finally;
-	}
-
-finally:
-	if (bh)
-		brelse(bh);
-
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_update_disk_config */
-
-/*
- * ocfs_volcfg_gblctxt_to_disknode()
- *
- */
-static void ocfs_volcfg_gblctxt_to_disknode(ocfs_node_config_info *disk)
-{
-	ocfs_ipc_config_info *ipc;
-	ocfs_comm_info *g_ipc;
-
-	LOG_ENTRY ();
-
-	ipc = &(disk->ipc_config);
-	g_ipc = &(OcfsGlobalCtxt.comm_info);
-
-	if (OcfsGlobalCtxt.node_name)
-		strncpy (disk->node_name, OcfsGlobalCtxt.node_name,
-			 MAX_NODE_NAME_LENGTH);
-
-	memcpy(disk->guid.guid, OcfsGlobalCtxt.guid.guid,
-	       OCFS2_GUID_LEN);
-
-	ipc->ip_port = g_ipc->ip_port;
-	ipc->ip_version = g_ipc->ip_version;
-	ipc->addr_u.ip_addr4 = g_ipc->addr_u.ip_addr4;
-
-	LOG_EXIT ();
-	return ;
-}				/* ocfs_volcfg_gblctxt_to_disknode */
-
-/*
- * ocfs_volcfg_gblctxt_to_node()
- *
- */
-static void ocfs_volcfg_gblctxt_to_node(BARF_BARF_BARF *node)
-{
-	ocfs_ipc_config_info *ipc;
-	ocfs_comm_info *g_ipc;
-
-	LOG_ENTRY ();
-
-	ipc = &(node->ipc_config);
-	g_ipc = &(OcfsGlobalCtxt.comm_info);
-
-	if (OcfsGlobalCtxt.node_name)
-		strncpy (node->node_name, OcfsGlobalCtxt.node_name,
-			 MAX_NODE_NAME_LENGTH);
-
-	memcpy(node->guid.guid, OcfsGlobalCtxt.guid.guid,
-	       OCFS2_GUID_LEN);
-
-	ipc->ip_port = g_ipc->ip_port;
-	ipc->ip_version = g_ipc->ip_version;
-	ipc->addr_u.ip_addr4 = g_ipc->addr_u.ip_addr4;
-
-	LOG_EXIT ();
-	return ;
-}				/* ocfs_volcfg_gblctxt_to_node */
-
-/*
- * ocfs_chk_update_config()
- *
- */
-int ocfs_chk_update_config (ocfs_super * osb)
-{
-	int status = 0;
-	ocfs_node_config_hdr *hdr = NULL;
-	ocfs_node_config_info *disk = NULL;
-	__s32 i;
-	struct buffer_head **cfg_bhs = NULL;
-
-	LOG_ENTRY ();
-
-	/* Read in the config on the disk */
-	cfg_bhs = ocfs_malloc(osb->autoconfig_blocks *
-			      sizeof(*cfg_bhs));
-	if (cfg_bhs == NULL) {
-		status = -ENOMEM;
-		LOG_ERROR_STATUS(status);
-		goto finally;
-	}
-	memset(cfg_bhs, 0, osb->autoconfig_blocks * sizeof(*cfg_bhs));
-
-	status = ocfs_read_blocks(osb, osb->autoconfig_blkno,
-				  (osb->autoconfig_blocks), cfg_bhs, 0, 
-				  NULL);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto finally;
-	}
-
-	/* 1st block in buffer is the NodeCfgHdr */
-	hdr = (ocfs_node_config_hdr *) cfg_bhs[0]->b_data;
-
-	if (strncmp (hdr->signature, OCFS2_NODE_CONFIG_HDR_SIGN,
-		     OCFS2_NODE_CONFIG_SIGN_LEN)) {
-		LOG_ERROR_STR ("Invalid node config signature");
-		status = -EINVAL;
-		goto finally;
-	}
-
-	if  (hdr->version < OCFS2_NODE_MIN_SUPPORTED_VER ||
-	     hdr->version > OCFS2_NODE_CONFIG_VER) {
-		LOG_ERROR_ARGS ("Node config version mismatch, (%d) < minimum" \
-				" (%d) or > current (%d)", hdr->version,
-				OCFS2_NODE_MIN_SUPPORTED_VER, OCFS2_NODE_CONFIG_VER);
-		status = -EINVAL;
-		goto finally;
-	}
-
-	/* Exit if nodecfg on disk has remained unchanged... */
-	if ((osb->cfg_initialized) && (osb->cfg_seq_num == hdr->cfg_seq_num) &&
-	    (osb->num_cfg_nodes == hdr->num_nodes))
-		goto finally;
-
-	/* ... else refresh nodecfg in memory */
-
-	/* Read the nodecfg for all possible nodes as there may be holes */
-	/* i.e., node numbers need not be dolled out in sequence */
-	for (i = 0; i < osb->max_nodes; i++) {
-		int which;
-		which = i + OCFS_VOLCFG_HDR_SECTORS;
-		disk = (ocfs_node_config_info *) cfg_bhs[which]->b_data;
-
-		if (disk->node_name[0] == '\0')
-			continue;
-
-		status = ocfs_disknode_to_node (&osb->node_cfg_info[i], disk);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto finally;
-		}
-
-		/* If nodenum is set, goto next node */
-		if (osb->node_num != OCFS_INVALID_NODE_NUM)
-			continue;
-
-		/*
-		 * If node num is not set, set it if guid matches.
-		 * If guid does not match and the hostid also does not
-		 * match, goto next slot.
-		 * However if the guid does not natch but the hostid
-		 * matches, it means that the user re-ran ocfs_uid_gen
-		 * with the -r option to reclaim its node number. In
-		 * this case, allow the reclaim only if the user mounts
-		 * the volume with the reclaimid option. Else, error.
-		 */
-		if (!memcmp(&OcfsGlobalCtxt.guid.guid, disk->guid.guid,
-			    OCFS2_GUID_LEN)) {
-			osb->node_num = i;
-			continue;
-		}
-
-		/* If the hostid does not match, goto next... */
-		if (memcmp(&OcfsGlobalCtxt.guid.id.host_id,
-			   disk->guid.id.host_id,
-			   OCFS2_GUID_HOSTID_LEN))
-			continue;
-
-		/* ...else allow node to reclaim the number if reclaimid set */
-		if (osb->reclaim_id) {
-			osb->node_num = i;
-			/* Write this node's cfg with the new guid on disk */
-			status = ocfs_refresh_node_config (osb);
-			if (status < 0) {
-				LOG_ERROR_STATUS(status);
-				goto finally;
-			}
-		}
-		else {
-			LOG_ERROR_STR("Re-mount volume with the reclaimid " \
-				      "option to reclaim the node number");
-			status = -EBUSY;
-			goto finally;
-		}
-	}
-
-	osb->cfg_initialized = 1;
-	osb->cfg_seq_num = hdr->cfg_seq_num;
-	osb->num_cfg_nodes = hdr->num_nodes;
-	LOG_TRACE_ARGS ("Num of configured nodes (%u)\n", osb->num_cfg_nodes);
-	IF_TRACE(ocfs_show_all_node_cfgs (osb));
-
-finally:
-	if (cfg_bhs) {
-		for (i = 0; i < osb->autoconfig_blocks; i++)
-			if (cfg_bhs[i])
-				brelse(cfg_bhs[i]);
-		kfree(cfg_bhs);
-	}
-
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_chk_update_config */
-
-/*
- * ocfs_get_config()
- *
- */
-int ocfs_get_config (ocfs_super * osb)
-{
-	int status = 0;
-
-	LOG_ENTRY ();
-
-	/* Update our config info for this volume from the disk */
-	status = ocfs_chk_update_config (osb);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto bail;
-	}
-
-	if (osb->node_num == OCFS_INVALID_NODE_NUM) {
-		if (osb->reclaim_id) {
-			LOG_ERROR_STR ("unable to reclaim id");
-			status = -EINVAL;
-			goto bail;
-		}
-		status = ocfs_add_node_to_config (osb);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto bail;
-		}
-	} else {
-		if (ocfs_has_node_config_changed (osb)) {
-			status = ocfs_refresh_node_config (osb);
-			if (status < 0) {
-				LOG_ERROR_STATUS (status);
-				goto bail;
-			}
-		}
-	}
-
-	LOG_TRACE_ARGS ("Node Num: %d\n", osb->node_num);
-
-bail:
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_get_config */
-
-/*
- * ocfs_has_node_config_changed()
- *
- */
-static int ocfs_has_node_config_changed (ocfs_super * osb)
-{
-	BARF_BARF_BARF *node;
-	ocfs_ipc_config_info *ipc;
-	ocfs_comm_info *g_ipc;
-	int chg = 0;
-
-	LOG_ENTRY ();
-
-	node = osb->node_cfg_info[osb->node_num];
-	ipc = &(node->ipc_config);
-	g_ipc = &(OcfsGlobalCtxt.comm_info);
-
-	if (OcfsGlobalCtxt.node_name &&
-	    strncmp (node->node_name, OcfsGlobalCtxt.node_name,
-		     MAX_NODE_NAME_LENGTH))
-		chg = 1;
-
-	if (!chg && (ipc->ip_version != g_ipc->ip_version))
-		chg = 1;
-
-	if (!chg && (ipc->ip_port != g_ipc->ip_port))
-		chg = 1;
-
-	if (!chg && (ipc->addr_u.ip_addr4 != g_ipc->addr_u.ip_addr4))
-		chg = 1;
-
-	LOG_EXIT_INT (chg);
-	return chg;
-}				/* ocfs_has_node_config_changed */
-
-/*
- * ocfs_refresh_node_config()
- *
- */
-static int ocfs_refresh_node_config (ocfs_super * osb)
-{
-	BARF_BARF_BARF *node;
-	ocfs_node_config_info *disk;
-	__u64 offset;
-	__u8 *buffer;
-	int status;
-
-	LOG_ENTRY ();
-
-	buffer = ocfs_malloc (osb->sb->s_blocksize);
-	if (buffer == NULL) {
-		LOG_ERROR_STATUS (status = -ENOMEM);
-		goto bail;
-	}
-
-	memset (buffer, 0, osb->sb->s_blocksize);
-	disk = (ocfs_node_config_info *) buffer;
-
-	/* populate the nodecfg info in disk from global context */
-	ocfs_volcfg_gblctxt_to_disknode (disk);
-
-	/* populate the nodecfg info in mem from global context */
-	node = osb->node_cfg_info[osb->node_num];
-	ocfs_volcfg_gblctxt_to_node (node);
-
-	/* Update the nodecfg on disk with the new info */
-	offset = (osb->new_autoconfig_blkno << osb->sb->s_blocksize_bits);
-	status = ocfs_config_with_disk_lock (osb, offset, (__u8 *) disk,
-					     osb->node_num, OCFS_VOLCFG_UPD);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto bail;
-	}
-
-bail:
-	if (buffer)
-		kfree(buffer);
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_refresh_node_config */
-
-/*
- * ocfs_show_all_node_cfgs()
- *
- */
-static void ocfs_show_all_node_cfgs (ocfs_super * osb)
-{
-	BARF_BARF_BARF *node;
-	__u32 i;
-
-	for (i = 0; i < osb->max_nodes; i++) {
-		node = osb->node_cfg_info[i];
-
-		if (!node || node->node_name[0] == '\0')
-			continue;
-
-		LOG_TRACE_ARGS ("Node (%u) is (%s)\n", i,
-				node->node_name);
-		LOG_TRACE_ARGS ("ip=0x%08u, port=%d\n",
-				ntohl(node->ipc_config.addr_u.ip_addr4),
-				ntohs(node->ipc_config.ip_port));
-	}
-
-	return;
-}				/* ocfs_show_all_node_cfgs */

Deleted: trunk/src/volcfg.h
===================================================================
--- trunk/src/volcfg.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/volcfg.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -1,32 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * volcfg.h
- *
- * Function prototypes
- *
- * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef OCFS2_VOLCFG_H
-#define OCFS2_VOLCFG_H
-
-int ocfs_chk_update_config(ocfs_super *osb);
-int ocfs_get_config(ocfs_super *osb);
-
-#endif /* OCFS2_VOLCFG_H */

Modified: trunk/src/vote.c
===================================================================
--- trunk/src/vote.c	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/vote.c	2004-12-06 21:45:32 UTC (rev 1693)
@@ -3,9 +3,9 @@
  *
  * vote.c
  *
- * netdlm listener, receive, verify and send messages
+ * description here
  *
- * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
@@ -25,1125 +25,831 @@
 
 #include "ocfs_compat.h"
 
-#include <linux/fs.h>
 #include <linux/types.h>
 #include <linux/slab.h>
-#include <linux/inet.h>
-#include <linux/net.h>
-#include <linux/in.h>
+#include <linux/highmem.h>
+#include <linux/smp_lock.h>
 
-#include <asm/uaccess.h>
+#include <cluster/util.h>
+#include <cluster/dlmcommon.h>
+#include <cluster/heartbeat.h>
+#include <cluster/nodemanager.h>
+#include <cluster/tcp.h>
+#include <cluster/dlmmod.h>
 
 #include "ocfs_log.h"
 #include "ocfs.h"
+#include "ocfs2.h"
 
-#include "dlm.h"
-#include "nm.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "slot_map.h"
 #include "util.h"
 #include "vote.h"
 
-/* Tracing */
-#define  OCFS_DEBUG_CONTEXT  OCFS_DEBUG_CONTEXT_VOTE
+#include "ocfs_journal.h"
+#include "buffer_head_io.h"
 
+#define OCFS_DEBUG_CONTEXT    OCFS_DEBUG_CONTEXT_VOTE
 
+#define OCFS2_MESSAGE_TYPE_VOTE     (0x1)
+#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
+typedef struct _ocfs2_msg_hdr
+{
+	u32 h_response_id; /* used to lookup message handle on sending
+			    * node. */
+	u32 h_request;
+	u64 h_blkno;
+	u32 h_generation;
+	u32 h_node_num;    /* node sending this particular message. */
+} ocfs2_msg_hdr;
 
-ocfs_ipc_ctxt OcfsIpcCtxt;
+typedef struct _ocfs2_vote_msg
+{
+	ocfs2_msg_hdr v_hdr;
+	/* may put stuff in here... */
+} ocfs2_vote_msg;
 
-static const char vote_state_str[] = { 'U', 'S', 'P', 'F', 'D' };
+typedef struct _ocfs2_response_msg
+{
+	ocfs2_msg_hdr r_hdr;
+	s32 r_response; /* this maps to '0' or a -value in errno.h */
+} ocfs2_response_msg;
 
-static ocfs_vote_obj * ocfs_alloc_vote_obj (ocfs_super *osb, int bytes, __u32 reqlock, ocfs_node_map * votemap);
-static void ocfs_dlm_recv_msg (void *val);
-static int ocfs_check_ipc_msg (__u8 * msg, __u32 msg_len);
-static int ocfs_comm_process_vote_reply (ocfs_super * osb, ocfs_dlm_msg * dlm_msg);
-static int ocfs_comm_process_msg (__u8 * msg);
-static void ocfs_init_dlm_msg (ocfs_super * osb, ocfs_dlm_msg * dlm_msg, __u32 msg_len, __u32 type);
+typedef struct _ocfs2_vote_work {
+	struct list_head   w_list;
+	ocfs2_vote_msg w_msg;
+} ocfs2_vote_work;
 
-static int ocfs_send_bcast (ocfs_super * osb, ocfs_node_map *votemap, ocfs_dlm_msg * dlm_msg);
-static int ocfs_node_map_stringify(ocfs_node_map *map, char **str);
+enum ocfs2_vote_request {
+	OCFS2_VOTE_REQ_INVALID = 0,
+	OCFS2_VOTE_REQ_DELETE,
+	OCFS2_VOTE_REQ_UNLINK,
+	OCFS2_VOTE_REQ_RENAME,
+	OCFS2_VOTE_REQ_MOUNT,
+	OCFS2_VOTE_REQ_UMOUNT
+};
 
-static void ocfs_put_vote_obj(ocfs_vote_obj *obj)
+typedef struct _ocfs2_net_wait_ctxt {
+	struct list_head   n_list;
+	u32                n_response_id;
+	wait_queue_head_t  n_event;
+	ocfs_node_map      n_node_map;
+	int                n_response; /* an aggregate response. 0 if
+					* all nodes are go, < 0 on any
+					* negative response from any
+					* node or network error. */
+} ocfs2_net_wait_ctxt;
+
+static void ocfs2_vote_thread_do_work(ocfs_super *osb);
+static void ocfs2_process_vote(ocfs_super *osb,
+			       ocfs2_vote_msg *msg);
+static int ocfs2_do_request_vote(ocfs_super *osb,
+				 u64 blkno,
+				 unsigned int generation,
+				 enum ocfs2_vote_request type);
+
+static void ocfs2_process_mount_request(ocfs_super *osb,
+					unsigned int node_num)
 {
-	if (atomic_dec_and_test(&obj->refcount))
-		kfree(obj);
+	printk("MOUNT vote from node %u\n", node_num);
+	/* The other node only sends us this message when it holds an EX
+	 * on the superblock, so our recovery threads (if having been
+	 * launched) are waiting on it.*/
+	ocfs_recovery_map_clear(osb, node_num);
+	ocfs_node_map_set_bit(osb, &osb->mounted_map, node_num);
 }
 
-static void ocfs_get_vote_obj(ocfs_vote_obj *obj)
+static void ocfs2_process_umount_request(ocfs_super *osb,
+					 unsigned int node_num)
 {
-	atomic_inc(&obj->refcount);
+	printk("UMOUNT vote from node %u\n", node_num);
+	ocfs_node_map_clear_bit(osb, &osb->mounted_map, node_num);
+	ocfs_node_map_set_bit(osb, &osb->umount_map, node_num);
 }
 
-
-/*
- * ocfs_recv_udp_msg()
- *
- */
-int ocfs_recv_udp_msg (ocfs_recv_ctxt * recv_ctxt)
+static int ocfs2_process_delete_request(struct inode *inode)
 {
-	int status = -ENETUNREACH, error;
-	mm_segment_t oldfs;
-	struct sockaddr_in sin;
-	struct iovec iov = { 
-		.iov_len = recv_ctxt->msg_len, 
-		.iov_base = recv_ctxt->msg 
-	};
-	struct msghdr msg = { 
-		.msg_control = NULL, 
-		.msg_controllen = 0, 
-		.msg_iovlen = 1, 
-		.msg_iov = &iov, 
-		.msg_name = (struct sockaddr *) &sin, 
-		.msg_namelen = sizeof (sin),
-       		.msg_flags = 0 
-	};
+	int response = -EBUSY;
 
-	LOG_ENTRY ();
+	LOG_TRACE_ARGS("DELETE vote on inode %lu, read "
+		       "lnk_cnt = %u\n", inode->i_ino, 
+		       inode->i_nlink);
 
-	/* Initialize the workitem with our worker routine and Q it. */
-	INIT_WORK (&recv_ctxt->ipc_wq, ocfs_dlm_recv_msg, recv_ctxt);
+	/* force this as ours may be out of date. */
+	inode->i_nlink = 0;
 
-	memset (&sin, 0, sizeof (sin));
-	oldfs = get_fs ();
-	set_fs (get_ds ());
-	error = sock_recvmsg (OcfsIpcCtxt.recv_sock, &msg, recv_ctxt->msg_len, msg.msg_flags);
-	set_fs (oldfs);
+	spin_lock(&OCFS_I(inode)->ip_lock);
+	/* vote no if the file is still open. */
+	if (OCFS_I(inode)->ip_open_cnt > 0) {
+		LOG_TRACE_PROCESS_VOTE("open count = %u\n", 
+		       OCFS_I(inode)->ip_open_cnt);
+		spin_unlock(&OCFS_I(inode)->ip_lock);
+		goto done;
+	}
+	spin_unlock(&OCFS_I(inode)->ip_lock);
 
-	if (error < 0) {
-		if (error == -ERESTARTSYS) {
-			status = -EBADF;
-			LOG_TRACE_STR ("Shutting down ocfs2lsnr");
-		} else {
-			status = -ENETUNREACH;
-			LOG_ERROR_ARGS ("unable to recvmsg, error=%d", error);
-			LOG_ERROR_STATUS (status);
-		}
-		goto bail;
-	} else if (msg.msg_namelen) {
-		recv_ctxt->msg_len = iov.iov_len;
-		status = 0;
+	/* directories are a bit ugly... What if someone is sitting in
+	 * it? We want to make sure the inode is removed completely as
+	 * a result of the iput in process_vote. */
+	if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) {
+		LOG_TRACE_PROCESS_VOTE("i_count = %u\n", 
+		       atomic_read(&inode->i_count));
+		goto done;
 	}
 
-	LOG_TRACE_ARGS ("Received packet from: %d.%d.%d.%d\n",
-			NIPQUAD (sin.sin_addr.s_addr));
+	/* If we get here, then we're voting 'yes', so commit the
+	 * delete on our side. */
+	response = 0;
 
-	if (status == 0)
-		schedule_work(&recv_ctxt->ipc_wq);
+	spin_lock(&OCFS_I(inode)->ip_lock);
+	SET_INODE_DELETED(inode);
+	/* We set the SKIP_DELETE flag on the inode so we don't try to
+	 * delete it in delete_inode ourselves. */
+	OCFS_I(inode)->ip_flags |=  OCFS_INODE_SKIP_DELETE;
+	spin_unlock(&OCFS_I(inode)->ip_lock);
 
-bail:
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_recv_udp_msg */
+	d_prune_aliases (inode);
 
+	/* TODO: How much of this is really necessary? */
+	sync_mapping_buffers(inode->i_mapping);
+	ocfs_truncate_inode_pages(inode, 0);
+	ocfs2_extent_map_trunc(inode, 0);
 
-static inline int ocfs2_comm_ip_version_to_family(u16 ip_version)
-{
-	switch (ntohs(ip_version)) {
-		case 4:
-			return PF_INET;
-		case 6:
-			return PF_INET6;
-		default:
-			BUG();
-	}
-
-	return 4;
+done:
+	return response;
 }
 
-/*
- * ocfs_send_bcast()
- *
- */
-static int ocfs_send_bcast (ocfs_super * osb, ocfs_node_map *votemap, ocfs_dlm_msg * dlm_msg)
+static void ocfs2_process_dentry_request(struct inode *inode,
+					 int rename)
 {
-	int status = 0, error;
-	__s16 num;
-	BARF_BARF_BARF *node;
-	struct sockaddr_in sin;
-	mm_segment_t oldfs;
+	d_prune_aliases (inode);
 
-	LOG_ENTRY ();
-
-	oldfs = get_fs ();
-	for (num=0; num<osb->max_nodes; num++) {
-		if (num == osb->node_num)
-			continue;
-
-		if (!ocfs_node_map_test_bit(votemap, num))
-			continue;
-
-		node = osb->node_cfg_info[num];
-		if (!node)
-			continue;
-
-		LOG_TRACE_ARGS("Sending msg to node=%u, name=%s\n",
-       			       num, node->node_name);
-		memset (&sin, 0, sizeof (sin));
-		sin.sin_family = ocfs2_comm_ip_version_to_family(node->ipc_config.ip_version);
-		sin.sin_addr.s_addr = node->ipc_config.addr_u.ip_addr4;
-		sin.sin_port = node->ipc_config.ip_port;
-		
-		LOG_TRACE_ARGS("about to send to 0x%08u:%u\n",
-			       ntohl(node->ipc_config.addr_u.ip_addr4),
-			       ntohs(node->ipc_config.ip_port));
-		
-		status = -ENETUNREACH;
-		if (OcfsIpcCtxt.send_sock) {
-			struct iovec iov = {
-				.iov_base = dlm_msg,
-				.iov_len = dlm_msg->msg_len
-			};
-			struct msghdr msg = {
-				.msg_iov = &iov,
-				.msg_iovlen = 1,
-				.msg_control = NULL,
-				.msg_controllen = 0,
-				.msg_name = (struct sockaddr *) &sin,
-				.msg_namelen = sizeof (sin),
-				.msg_flags = 0
-			};
-			
-			status = 0;	
-			set_fs (get_ds ());
-			error = sock_sendmsg (OcfsIpcCtxt.send_sock, &msg, dlm_msg->msg_len);
-			set_fs (oldfs);
-		
-			if (error < 0) {
-				LOG_ERROR_ARGS ("unable to sendmsg, error=%d", error);
-				status = -ENETUNREACH;
-			} 
-		}
-		if (status < 0)
-			LOG_ERROR_STATUS (status);
+	/* for rename, we don't drop link counts */
+	if (!rename) {
+		if (S_ISDIR(inode->i_mode))
+			inode->i_nlink = 0;
+		else
+			inode->i_nlink--;
 	}
+}
 
-	status = 0;
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_send_bcast */
-
-/*
- * ocfs_init_udp_sock()
- *
- */
-int ocfs_init_udp_sock(struct socket **send_sock,
-		       struct socket **recv_sock)
+static void ocfs2_process_vote(ocfs_super *osb,
+			       ocfs2_vote_msg *msg)
 {
-	struct sockaddr_in sin;
-	int status;
-	ocfs_comm_info *comm;
+	int net_status, vote_response;
+	int rename = 0;
+	unsigned int node_num, generation;
+	u64 blkno;
+	enum ocfs2_vote_request request;
+	struct inode *inode = NULL;
+	struct inode *remote_node;
+	ocfs2_msg_hdr *hdr = &msg->v_hdr;
+	ocfs2_response_msg response;
 
-	LOG_ENTRY ();
+	/* decode the network mumbo jumbo into local variables. */
+	request = ntohl(hdr->h_request);
+	blkno = be64_to_cpu(hdr->h_blkno);
+	generation = ntohl(hdr->h_generation);
+	node_num = ntohl(hdr->h_node_num);
 
-	comm = &(OcfsGlobalCtxt.comm_info);
+	printk("ocfs2: processing vote: request = %u, blkno = %llu, "
+	       "generation = %u, node_num = %u\n", request, blkno, generation,
+	       node_num);
 
-	/* Create Send Socket */
-	status = sock_create(ocfs2_comm_ip_version_to_family(comm->ip_version),
-			     SOCK_DGRAM, IPPROTO_UDP,
-			     send_sock);
-	if (status < 0) {
-		LOG_ERROR_ARGS ("unable to create socket, error=%d", status);
-		goto bail;
+	vote_response = 0;
+	switch (request) {
+	case OCFS2_VOTE_REQ_UMOUNT:
+		ocfs2_process_umount_request(osb, node_num);
+		goto respond;
+	case OCFS2_VOTE_REQ_MOUNT:
+		ocfs2_process_mount_request(osb, node_num);
+		goto respond;
+	default:
+		/* avoids a gcc warning */
+		break;
 	}
 
-	/* Bind Send Socket */
-	memset(&sin, 0, sizeof (sin));
-	sin.sin_family = ocfs2_comm_ip_version_to_family(comm->ip_version);
-	sin.sin_addr.s_addr = htonl (INADDR_ANY);
-	sin.sin_port = htons(0);
+	/* We cannot process the remaining message types before we're
+	 * fully mounted. It's perfectly safe however to send a 'yes'
+	 * response as we can't possibly have any of the state they're
+	 * asking us to modify yet. */
+	if (atomic_read(&osb->vol_state) == VOLUME_INIT)
+		goto respond;
 
-	status = (*send_sock)->ops->bind(*send_sock,
-					 (struct sockaddr *)&sin,
-					 sizeof(sin));
-	if (status < 0) {
-		LOG_ERROR_ARGS ("unable to bind socket, error=%d", status);
-		goto bail;
-	}
+	vote_response = -EINVAL;
+	/* If we get here, then the request is against an inode. */
+	inode = ocfs_ilookup(osb, blkno);
+	if (!inode)
+		goto respond;
 
-	/* Create Receive Socket */
-	status = sock_create(ocfs2_comm_ip_version_to_family(comm->ip_version),
-			     SOCK_DGRAM, IPPROTO_UDP,
-			     recv_sock);
-	if (status < 0) {
-		LOG_ERROR_ARGS ("unable to create socket, error=%d", status);
-		goto bail;
+	OCFS_ASSERT(inode->i_generation == generation);
+
+	switch (request) {
+	case OCFS2_VOTE_REQ_DELETE:
+		vote_response = ocfs2_process_delete_request(inode);
+		break;
+	case OCFS2_VOTE_REQ_RENAME:
+		rename = 1;
+		/* fall through */
+	case OCFS2_VOTE_REQ_UNLINK:
+		ocfs2_process_dentry_request(inode, rename);
+		break;
+	default:
+		printk("ocfs2_process_vote: node %u, invalid request: %u\n",
+		       node_num, request);
 	}
 
+respond:
+	/* Response structure is small so we just put it on the stack
+	 * and stuff it inline. */
+	memset(&response, 0, sizeof(ocfs2_response_msg));
+	response.r_hdr.h_response_id = hdr->h_response_id;
+	response.r_hdr.h_blkno = hdr->h_blkno;
+	response.r_hdr.h_generation = hdr->h_generation;
+	response.r_hdr.h_node_num = htonl(osb->node_num);
+	response.r_response = htonl(vote_response);
 
-	/* Bind Receive Socket */
-	memset(&sin, 0, sizeof(sin));
-	sin.sin_family = ocfs2_comm_ip_version_to_family(comm->ip_version);
-	sin.sin_addr.s_addr = htonl(INADDR_ANY);
-	sin.sin_port = comm->ip_port;
-
-	status = (*recv_sock)->ops->bind(*recv_sock,
-					 (struct sockaddr *)&sin,
-					 sizeof(sin));
-	if (status < 0) {
-		LOG_ERROR_ARGS ("unable to bind socket, error=%d", status);
-		goto bail;
+	remote_node = nm_get_node_by_num(node_num);
+	if (!remote_node) {
+		LOG_ERROR_ARGS("Couldn't get inode for node %u!\n", node_num);
+	} else {
+		net_status = net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
+					      osb->net_key,
+					      &response,
+					      sizeof(ocfs2_response_msg),
+					      remote_node,
+					      NULL);
+		if (net_status < 0)
+			LOG_ERROR_ARGS("message to node %u fails with error "
+				       "%d!\n", node_num, net_status);
+		iput(remote_node);
 	}
 
-bail:
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_init_udp_sock */
+	if (inode)
+		iput(inode);
+}
 
-
-/*
- * ocfs_send_vote_reply()
- *
- */
-int ocfs_send_vote_reply (ocfs_super * osb, ocfs_dlm_msg * dlm_msg, __u32 vote_status)
+static void ocfs2_vote_thread_do_work(ocfs_super *osb)
 {
-	ocfs_dlm_req_master *req_master;
-	ocfs_dlm_reply_master *reply_master;
-	ocfs_dlm_msg *send_dlm_msg;
-	ocfs_vote_obj *obj;
-	int status = 0;
-	__u8 *buf;
-	__u32 msg_len, obj_len;
-	ocfs_node_map vote_map;
+	unsigned long processed;
+	ocfs2_lock_res *lockres;
+	ocfs2_vote_work *work;
 
-	LOG_ENTRY ();
+	spin_lock(&osb->vote_task_lock);
+	processed = osb->blocked_lock_count;
+	while (processed) {
+		OCFS_ASSERT(!list_empty(&osb->blocked_lock_list));
 
-	ocfs_node_map_init(osb, &vote_map);
+		lockres = list_entry(osb->blocked_lock_list.next,
+				     ocfs2_lock_res, l_blocked_list);
+		list_del_init(&lockres->l_blocked_list);
+		osb->blocked_lock_count--;
+		spin_unlock(&osb->vote_task_lock);
 
-	req_master = (ocfs_dlm_req_master *) dlm_msg->msg_buf;
+		OCFS_ASSERT(processed);
+		processed--;
 
-	msg_len = sizeof (ocfs_dlm_msg) + sizeof (ocfs_dlm_reply_master);
-	obj_len = sizeof (ocfs_vote_obj) + sizeof (ocfs_dlm_reply_master);
+		ocfs2_process_blocked_lock(osb, lockres);
 
-	obj = ocfs_alloc_vote_obj (osb, obj_len, 0, NULL);
-	if (obj == NULL) {
-		LOG_ERROR_STATUS (status = -ENOMEM);
-		goto finally;
+		spin_lock(&osb->vote_task_lock);
 	}
-	buf = (__u8 *)&(obj->m);
-	send_dlm_msg = (ocfs_dlm_msg *)buf;
-	reply_master = (ocfs_dlm_reply_master *) send_dlm_msg->msg_buf;
-	
-	ocfs_init_dlm_msg (osb, send_dlm_msg, msg_len, OCFS_VOTE_REPLY);
 
-	reply_master->h.lock_id = req_master->lock_id;
-	reply_master->status = vote_status;
-	reply_master->h.lock_seq_num = req_master->lock_seq_num;
-	reply_master->h.flags = req_master->flags;
+	while (osb->vote_count) {
+		OCFS_ASSERT(!list_empty(&osb->vote_list));
+		work = list_entry(osb->vote_list.next,
+				  ocfs2_vote_work, w_list);
+		list_del(&work->w_list);
+		osb->vote_count--;
+		spin_unlock(&osb->vote_task_lock);
 
-	ocfs_node_map_set_bit(&vote_map, dlm_msg->src_node);
-	ocfs_node_map_set(&obj->req_vote_map, &vote_map);
+		ocfs2_process_vote(osb, &work->w_msg);
+		kfree(work);
 
-	spin_lock(&osb->vote_obj_queue_lock);
-	list_add_tail(&obj->list, &osb->vote_obj_queue);
-	spin_unlock(&osb->vote_obj_queue_lock);
+		spin_lock(&osb->vote_task_lock);
+	}
+	spin_unlock(&osb->vote_task_lock);
+}
 
-	ocfs_send_bcast (osb, &vote_map, send_dlm_msg);
-	spin_lock (&obj->lock);
-	obj->vote_state = VOTE_OBJ_STATE_SENT;
-	spin_unlock (&obj->lock);
+static inline int ocfs2_vote_thread_has_work(ocfs_super *osb)
+{
+	if (list_empty(&osb->blocked_lock_list) &&
+	    list_empty(&osb->vote_list))
+		return 0;
 
-	// silly ;-)
-	spin_lock (&obj->lock);
-	obj->vote_state = VOTE_OBJ_STATE_DESTROYING;
-	spin_unlock (&obj->lock);
+	return 1;
+}
 
-	spin_lock(&osb->vote_obj_queue_lock);
-	list_del(&obj->list);
-	spin_unlock(&osb->vote_obj_queue_lock);
+int ocfs2_vote_thread(void *arg)
+{
+	int status = 0;
+	ocfs_super *osb = arg;
+	char proc[16];
 
-finally:
-	ocfs_put_vote_obj (obj);
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_send_vote_reply */
+	sprintf (proc, "ocfs2vote-%d", osb->osb_id);
+	ocfs_daemonize (proc, strlen(proc), 0);
 
+	spin_lock(&osb->vote_task_lock);
+	osb->vote_task = current;
+	init_completion (&osb->vote_event_complete);
 
-/*
- * ocfs_check_ipc_msg()
- *
- */
-int ocfs_check_ipc_msg (__u8 * msg, __u32 msg_len)
-{
-	int ret = 0;
-	ocfs_dlm_msg *dlm_msg;
+	complete(&osb->vote_event_init);
 
-	LOG_ENTRY ();
+	while (1) {
+		if (osb->vote_exit) {
+			if (!ocfs2_vote_thread_has_work(osb))
+				break;
+			/* don't want to sleep if we're supposed to quit. */
+			atomic_set(&osb->wake_vote_task, 1);
+		}
+		spin_unlock(&osb->vote_task_lock);
 
-	dlm_msg = (ocfs_dlm_msg *) msg;
+		wait_event_interruptible(osb->vote_event,
+					 atomic_read(&osb->wake_vote_task));
 
-	if (dlm_msg == NULL) {
-		LOG_TRACE_STR("Null netdlm message");
-		goto bail;
+		atomic_set(&osb->wake_vote_task, 0);
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+		printk("(%u) vote_thread: awoken\n", current->pid);
+#endif
+		ocfs2_vote_thread_do_work(osb);
+		spin_lock(&osb->vote_task_lock);
 	}
 
-	if (msg_len < sizeof(ocfs_dlm_msg)) {
-		LOG_TRACE_STR("Netdlm message too short");
-		goto bail;
-	}
+	osb->vote_task = NULL;
+	spin_unlock(&osb->vote_task_lock);
 
-	/* Compute and Compare the checksum */
-	if (dlm_msg->magic != OCFS_DLM_MSG_MAGIC) {
-		LOG_TRACE_ARGS ("Magic number mismatch in netdlm message: "
-				"0x%08x != 0x%08x\n",
-				dlm_msg->magic, OCFS_DLM_MSG_MAGIC);
-		goto bail;
-	}
+	complete(&osb->vote_event_complete);
 
+	return status;
+}
 
-	ret = 1;
-
-bail:
-	LOG_EXIT_INT (ret);
-	return ret;
-}				/* ocfs_check_ipc_msg */
-
-
-int ocfs_lookup_obj_for_proc (ocfs_vote_obj *obj, ocfs_vote_obj_lookup_data *data)
+static ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(ocfs_super *osb,
+						    unsigned int response_id)
 {
-	int status = -ENOENT;
-	ocfs_dlm_msg *dlm_msg = NULL;
-	ocfs_dlm_msg_hdr *request = NULL;
-	ocfs_dlm_reply_master *reply = NULL;
-	int *len = data->u.proc.len;
-	int max = data->u.proc.max - *len;
-	char *p = data->u.proc.page + *len;
-	int ret = 0;
-	char *reqstr=NULL, *gotstr=NULL;
+	ocfs2_net_wait_ctxt *w;
 
-	/* just run thru everything to populate /proc */
-	/* return -ENOENT to keep going */
-	dlm_msg = &(obj->m);
-
-	switch (dlm_msg->msg_type) {
-		case OCFS_VOTE_REQUEST:
-			request = (ocfs_dlm_msg_hdr *) dlm_msg->msg_buf;
-
-			if (ocfs_node_map_stringify(&obj->req_vote_map, &reqstr) < 0)
-				break;
-			if (ocfs_node_map_stringify(&obj->got_vote_map, &gotstr) < 0)
-				break;
-			ret = snprintf(p, max, "REQST: %d %c %3d %s %21llu %21llu %08x | %s\n",
-				obj->pid,
-				vote_state_str[obj->vote_state],
-				obj->vote_status,
-				reqstr,
-				request->lock_id,
-				request->lock_seq_num, 
-				request->flags, gotstr);
-			break;
-		case OCFS_VOTE_REPLY:
-			reply = (ocfs_dlm_reply_master *) dlm_msg->msg_buf;
-			if (ocfs_node_map_stringify(&obj->req_vote_map, &reqstr) < 0)
-				break;
-			ret = snprintf(p, max, "REPLY: %d %c %3d %s %21llu %21llu %08x | %3d\n",
-				obj->pid,
-				vote_state_str[obj->vote_state],
-				obj->vote_status,
-				reqstr,
-				reply->h.lock_id,
-				reply->h.lock_seq_num,
-				reply->h.flags, 
-				reply->status);
-
-			break;
-		case OCFS_INFO_DISMOUNT:
-			ret = snprintf(p, max, "UNMNT: %d\n", obj->pid);
-			break;
-		default:
-			ret = snprintf(p, max, "BAD!!: %d\n", obj->pid);
-			break;
+	w = kmalloc(sizeof(*w), GFP_KERNEL);
+	if (!w) {
+		LOG_ERROR_STATUS(-ENOMEM);
+		goto bail;
 	}
-	(*len) += ret;
-	p[max-1] = '\0';
+	memset(w, 0, sizeof(*w));
 
-	if (reqstr)
-		kfree(reqstr);
-	if (gotstr)
-		kfree(gotstr);
-	return status;
+	INIT_LIST_HEAD(&w->n_list);
+	init_waitqueue_head(&w->n_event);
+	ocfs_node_map_init(osb, &w->n_node_map);
+	w->n_response_id = response_id;
+bail:
+	return w;
 }
 
-
-int ocfs_lookup_obj_by_lockid (ocfs_vote_obj *obj, ocfs_vote_obj_lookup_data *data)
+static unsigned int ocfs2_new_response_id(ocfs_super *osb)
 {
-	int status = 0;
-	ocfs_dlm_msg *dlm_msg = NULL;
-	ocfs_dlm_msg_hdr *req = NULL;
+	unsigned int ret;
 
-	dlm_msg = &(obj->m);
-	req = (ocfs_dlm_msg_hdr *) dlm_msg->msg_buf;
-	if (dlm_msg->msg_type != OCFS_VOTE_REQUEST ||
-	    obj->vote_state == VOTE_OBJ_STATE_DESTROYING ||
-	    req->lock_id != data->u.s.lock_id) {
-		status = -ENOENT;
-	}
-	return status;
+	spin_lock(&osb->net_response_lock);
+	ret = ++osb->net_response_ids;
+	spin_unlock(&osb->net_response_lock);
+
+	return ret;
 }
 
-static int ocfs_lookup_obj_by_seq (ocfs_vote_obj *obj, ocfs_vote_obj_lookup_data *data)
+static void ocfs2_dequeue_net_wait_ctxt(ocfs_super *osb,
+					ocfs2_net_wait_ctxt *w)
 {
-	int status = -ENOENT;
-	ocfs_dlm_msg *dlm_msg = NULL;
-	ocfs_dlm_msg_hdr *req = NULL;
+	spin_lock(&osb->net_response_lock);
+	list_del(&w->n_list);
+	spin_unlock(&osb->net_response_lock);
+}
 
-	if (obj->seq_num == data->u.s.seq_num) {
-		status = 0;
-		dlm_msg = &(obj->m);
-		req = (ocfs_dlm_msg_hdr *) dlm_msg->msg_buf;
-		// error if there is a non-request with a matching seqnum, or
-		// a vote object that is in too early or too late a state, or
-		// a vote object with the right seqnum but wrong lockid
-		if (dlm_msg->msg_type != OCFS_VOTE_REQUEST ||
-		    obj->vote_state == VOTE_OBJ_STATE_DESTROYING ||
-		    obj->vote_state == VOTE_OBJ_STATE_UNSENT ||
-		    req->lock_id != data->u.s.lock_id) {
-			LOG_ERROR_ARGS("bad message: vote_state=%d type=%d "
-				       "lockid=%llu expected=%llu\n",
-				      obj->vote_state, dlm_msg->msg_type,
-				      req->lock_id, data->u.s.lock_id);
-			status = -EINVAL;
-		}
-	}
-	return status;
+static void ocfs2_queue_net_wait_ctxt(ocfs_super *osb,
+				      ocfs2_net_wait_ctxt *w)
+{
+	spin_lock(&osb->net_response_lock);
+	list_add_tail(&w->n_list,
+		      &osb->net_response_list);
+	spin_unlock(&osb->net_response_lock);
 }
 
-/*
- * returns an ocfs_vote_obj with a ref on it or NULL
- */
-int ocfs_lookup_vote_request_obj (ocfs_super *osb, ocfs_vote_obj_lookup_data *data)
+#define OCFS2_RESPONSE_WAIT_JIFFIES (30 * HZ)
+static int ocfs2_wait_on_vote_responses(ocfs_super *osb,
+					ocfs2_net_wait_ctxt *w)
 {
-	int status = -ENOENT;
-	struct list_head *iter;
-	ocfs_vote_obj *obj = NULL;
+	int status = 0;
+	signed long timeout = OCFS2_RESPONSE_WAIT_JIFFIES;
+	DECLARE_WAITQUEUE(wait, current);
 
-	spin_lock(&osb->vote_obj_queue_lock);
+	add_wait_queue(&w->n_event, &wait);
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);
 
-	list_for_each (iter, &osb->vote_obj_queue) {
-		obj = list_entry (iter, ocfs_vote_obj, list);
-		ocfs_get_vote_obj (obj);
-		spin_lock(&obj->lock);
-		status = data->func(obj, data);
-		spin_unlock(&obj->lock);
-		if (status < 0) {
-			ocfs_put_vote_obj (obj);
-			obj = NULL;
-		}
-		if (status != -ENOENT)
+		if (ocfs_node_map_is_empty(osb, &w->n_node_map))
 			break;
-		obj = NULL;
+
+		if (!signal_pending(current)) {
+			timeout = schedule_timeout(timeout);
+			if (!timeout) {
+				status = -ETIMEDOUT;
+				break;
+			}
+			continue;
+		}
+		status = -ERESTARTSYS;
+		break;
 	}
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(&w->n_event, &wait);
 
-	spin_unlock(&osb->vote_obj_queue_lock);
-
-	// return the obj, or drop the ref
-	if (data->ret)
-		*(data->ret) = obj;
-	else if (obj)
-		ocfs_put_vote_obj (obj);
 	return status;
 }
 
-
-/*
- * ocfs_comm_process_vote_reply()
- *
- */
-int ocfs_comm_process_vote_reply (ocfs_super * osb, ocfs_dlm_msg * dlm_msg)
+static int ocfs2_broadcast_vote(ocfs_super *osb,
+				ocfs2_vote_msg *request,
+				unsigned int response_id)
 {
-	int status = 0;
-	ocfs_dlm_reply_master *reply;
-	ocfs_dlm_msg_hdr *reply_msg;
-	ocfs_vote_reply_ctxt ctxt;
-	ocfs_vote_obj *obj = NULL;
-	ocfs_vote_obj_lookup_data data;
+	int status, i, remote_err;
+	ocfs2_net_wait_ctxt *w = NULL;
+	struct inode *remote_node;
 
-	LOG_ENTRY ();
-
-	down (&(osb->comm_lock));
-
-	reply = (ocfs_dlm_reply_master *) dlm_msg->msg_buf;
-	reply_msg = &(reply->h);
-
-	/* find the original request object for this reply */
-	data.u.s.seq_num = reply_msg->lock_seq_num;
-	data.u.s.lock_id = reply_msg->lock_id;
-	data.func = ocfs_lookup_obj_by_seq;
-	data.ret = &obj;
-	status = ocfs_lookup_vote_request_obj (osb, &data);
-	if (status < 0 || obj==NULL) {
-		LOG_ERROR_STATUS (status);
+	w = ocfs2_new_net_wait_ctxt(osb, response_id);
+	if (!w) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
 		goto bail;
 	}
 
-	spin_lock(&obj->lock);
-	if (obj->vote_state != VOTE_OBJ_STATE_SENT &&
-	    obj->vote_state != VOTE_OBJ_STATE_PARTIAL_REPLY) {
-		LOG_ERROR_ARGS("bad vote reply state=%d, node=%u, lockid=%llu, seq=%llu, vote=%d\n",
-			       obj->vote_state, dlm_msg->src_node,
-			       reply_msg->lock_id,
-			       reply_msg->lock_seq_num, reply->status);
-		status = -EINVAL;
-		goto unlock;
-	}
+	/* we're pretty much ready to go at this point, and this fills
+	 * in n_response which we need anyway... */
+	ocfs2_queue_net_wait_ctxt(osb, w);
 
-	LOG_TRACE_ARGS("node=%u, lockid=%llu, seq=%llu, vote=%d\n",
-		       dlm_msg->src_node, reply_msg->lock_id,
-		       reply_msg->lock_seq_num, reply->status);
-	
-	ctxt.got_vote_map = &(obj->got_vote_map);
-	ctxt.status = &(obj->vote_status);
-	ctxt.flags = reply_msg->flags;
-	ctxt.reply = reply;
+	i = ocfs_node_map_iterate(osb, &osb->mounted_map, 0);
+	while (i != OCFS_INVALID_NODE_NUM) {
+		if (i != osb->node_num) {
+			ocfs_node_map_set_bit(osb, &w->n_node_map, i);
 
-	ocfs_process_one_vote_reply(osb, &ctxt, dlm_msg->src_node);
+			remote_node = nm_get_node_by_num(i);
+			if (!remote_node) {
+				status = -EINVAL;
+				goto bail;
+			}
 
-	if (ocfs_node_map_is_equal(&obj->got_vote_map, &obj->req_vote_map))
-		obj->vote_state = VOTE_OBJ_STATE_FULL_REPLY;
-	else 
-		obj->vote_state = VOTE_OBJ_STATE_PARTIAL_REPLY;
-	
-unlock:
-	// wake if complete or error
-	if (obj->vote_status < 0 || status < 0 ||
-	    obj->vote_state == VOTE_OBJ_STATE_FULL_REPLY) {
-		atomic_set (&obj->voted_event_woken, 1);
-		wake_up (&obj->voted_event);
+			remote_err = 0;
+			status = net_send_message(OCFS2_MESSAGE_TYPE_VOTE,
+						  osb->net_key,
+						  request,
+						  sizeof(*request),
+						  remote_node,
+						  &remote_err);
+			iput(remote_node);
+			if (status == -ETIMEDOUT) {
+				printk("ocfs2: remote node %d timed out!\n",
+				       i);
+				status = -EAGAIN;
+				goto bail;
+			}
+			if (remote_err < 0) {
+				status = remote_err;
+				printk("ocfs2: remote error %d on node %d!\n",
+				       remote_err, i);
+				goto bail;
+			}
+			if (status < 0) {
+				LOG_ERROR_STATUS(status);
+				goto bail;
+			}
+		}
+		i = ocfs_node_map_iterate(osb, &osb->mounted_map, i);
 	}
-	spin_unlock(&obj->lock);
-	ocfs_put_vote_obj (obj);
 
-bail:
-	up (&(osb->comm_lock));
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_comm_process_vote_reply */
+	status = ocfs2_wait_on_vote_responses(osb, w);
+	if (status < 0) {
+		if (status != -EINTR)
+			LOG_ERROR_STATUS(status);
+		goto bail;
+	}
 
-/*
- * ocfs_dlm_recv_msg()
- *
- */
-void ocfs_dlm_recv_msg (void *val)
-{
-	ocfs_recv_ctxt *recv_ctxt;
-	__u8 *dlm_packet;
-
-	LOG_ENTRY ();
-
-	recv_ctxt = (ocfs_recv_ctxt *) val;
-	dlm_packet = (__u8 *) recv_ctxt->msg;
-
-	if (recv_ctxt->status >= 0) {
-		if (ocfs_check_ipc_msg (dlm_packet, recv_ctxt->msg_len))
-			ocfs_comm_process_msg (dlm_packet);
+	ocfs2_dequeue_net_wait_ctxt(osb, w);
+	status = w->n_response;
+bail:
+	if (w) {
+		ocfs2_dequeue_net_wait_ctxt(osb, w);
+		kfree(w);
 	}
 
-	kfree(recv_ctxt);
+	return status;
+}
 
-	LOG_EXIT ();
-	return;
-}				/* ocfs_dlm_recv_msg */
-
-/*
- * ocfs_comm_process_msg()
- *
- */
-int ocfs_comm_process_msg (__u8 * msg)
+static int ocfs2_do_request_vote(ocfs_super *osb,
+				 u64 blkno,
+				 unsigned int generation,
+				 enum ocfs2_vote_request type)
 {
-	int status = 0;
-	ocfs_super *osb = NULL;
-	ocfs_dlm_msg *dlm_msg;
-	ocfs_dlm_req_master *req_master;
-	struct list_head *iter_osb, *temp_iter;
-	__s16 src_node;
+	int status;
+	unsigned int response_id;
+	ocfs2_vote_msg *request = NULL;
+	ocfs2_msg_hdr *hdr;
 
-	LOG_ENTRY ();
+	OCFS_ASSERT(type == OCFS2_VOTE_REQ_DELETE ||
+		    type == OCFS2_VOTE_REQ_UNLINK ||
+		    type == OCFS2_VOTE_REQ_RENAME);
 
-	dlm_msg = (ocfs_dlm_msg *) msg;
-
-	down (&(OcfsGlobalCtxt.global_res));
-	list_for_each_safe (iter_osb, temp_iter, &(OcfsGlobalCtxt.osb_next)) {
-		osb = list_entry (iter_osb, ocfs_super, osb_next);
-		if (!memcmp (osb->uuid, dlm_msg->vol_id,
-			     MAX_VOL_ID_LENGTH))
-			break;
-		osb = NULL;
-	}
-	up (&(OcfsGlobalCtxt.global_res));
-	
-	if (osb == NULL) {
-		LOG_ERROR_STR("Ignoring netdlm message with invalid volume id");
+	request = kmalloc(sizeof(*request), GFP_KERNEL);
+	if (!request) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
 		goto bail;
 	}
+	memset(request, 0, sizeof(*request));
+	hdr = &request->v_hdr;
 
-	if (dlm_msg->src_node >= osb->max_nodes) {
-		LOG_ERROR_ARGS ("Invalid source node in netdlm message: %d\n",
-				dlm_msg->src_node);
-		goto bail;
-	}
+	response_id = ocfs2_new_response_id(osb);
 
-	if (!ocfs_node_map_test_bit(&osb->publ_map, dlm_msg->src_node)) {
-		LOG_TRACE_STR("Ignoring netdlm message from dead node");
+	hdr->h_response_id = htonl(response_id);
+	hdr->h_request = htonl(type);
+	hdr->h_blkno = cpu_to_be64(blkno);
+	hdr->h_generation = htonl(generation);
+	hdr->h_node_num = htonl((unsigned int) osb->node_num);
+
+	status = ocfs2_broadcast_vote(osb, request, response_id);
+	if (status < 0) {
+		if (status != -EINTR)
+			LOG_ERROR_STATUS(status);
 		goto bail;
 	}
 
-	switch (dlm_msg->msg_type) {
-	case OCFS_VOTE_REQUEST:
-		status = ocfs_process_vote (osb, dlm_msg);
-		break;
-
-	case OCFS_VOTE_REPLY:
-		ocfs_comm_process_vote_reply (osb, dlm_msg);
-		break;
-
-	case OCFS_INFO_DISMOUNT:
-		src_node = dlm_msg->src_node;
-		req_master = (ocfs_dlm_req_master *) dlm_msg->msg_buf;
-		printk ("ocfs2: Received dismount message for device (%u,%u) "
-			"from %s (node %d)\n", MAJOR(osb->sb->s_dev),
-			MINOR(osb->sb->s_dev), osb->node_cfg_info[src_node]->node_name,
-			src_node);
-		atomic_set (&(osb->vol_node_map[src_node].dismount), 1);
-		break;
-
-	default:
-		break;
-	}
-
 bail:
-	LOG_EXIT_STATUS (status);
+	if (request)
+		kfree(request);
+
 	return status;
-}				/* ocfs_comm_process_msg */
+}
 
-
-
-/*
- * ocfs_send_dismount_msg()
- *
- */
-int ocfs_send_dismount_msg (ocfs_super * osb)
+static int ocfs2_request_vote(struct inode *inode,
+			      enum ocfs2_vote_request type)
 {
-	int status = 0;
-	ocfs_dlm_msg *dlm_msg = NULL;
-	ocfs_dlm_msg_hdr *req;
-	ocfs_vote_obj *obj;
-	__u32 msg_len, obj_len;
-	ocfs_node_map map;
+	int status;
+	ocfs_super *osb = OCFS2_SB(inode->i_sb);
 
-	LOG_ENTRY_ARGS ("(osb=0x%p)\n", osb);
+	if (ocfs_inode_is_new(inode))
+		return 0;
 
-	ocfs_node_map_dup(osb, &map, &osb->publ_map);
-	ocfs_node_map_clear_bit(&map, osb->node_num);
+	status = -EAGAIN;
+	while (status == -EAGAIN) {
+		if (signal_pending(current))
+			return -EINTR;
 
-	msg_len = sizeof (ocfs_dlm_msg) + sizeof (ocfs_dlm_req_master);
-	obj_len = sizeof (ocfs_vote_obj) + sizeof (ocfs_dlm_req_master);
+		status = ocfs2_super_lock(osb, 0);
+		if (status < 0) {
+			if (status != -EINTR)
+				LOG_ERROR_STATUS(status);
+			break;
+		}
 
-	obj = ocfs_alloc_vote_obj (osb, obj_len, 0, NULL);
-	if (obj == NULL) {
-		LOG_ERROR_STATUS (status = -ENOMEM);
-		goto finally;
-	}
-	dlm_msg = &(obj->m);
-	req = (ocfs_dlm_msg_hdr *) dlm_msg->msg_buf;
-	ocfs_init_dlm_msg (osb, dlm_msg, msg_len, OCFS_INFO_DISMOUNT);
-	req->lock_id = 0;
-	req->flags = 0;
-	req->lock_seq_num = 0;
+		status = 0;
+		if (!ocfs_node_map_is_only(osb, &osb->mounted_map,
+					   osb->node_num))
+			status = ocfs2_do_request_vote(osb, 
+						       OCFS_I(inode)->ip_blkno,
+						       inode->i_generation,
+						       type);
 
-	spin_lock(&osb->vote_obj_queue_lock);
-	list_add_tail(&obj->list, &osb->vote_obj_queue);
-	spin_unlock(&osb->vote_obj_queue_lock);
-
-	ocfs_send_bcast (osb, &map, dlm_msg);
-	spin_lock (&obj->lock);
-	obj->vote_state = VOTE_OBJ_STATE_SENT;
-	spin_unlock (&obj->lock);
-
-	// silly ;-)
-	spin_lock (&obj->lock);
-	obj->vote_state = VOTE_OBJ_STATE_DESTROYING;
-	spin_unlock (&obj->lock);
-
-	spin_lock(&osb->vote_obj_queue_lock);
-	list_del(&obj->list);
-	spin_unlock(&osb->vote_obj_queue_lock);
-
-finally:
-	ocfs_put_vote_obj (obj);
-	LOG_EXIT_STATUS (status);
+		ocfs2_super_unlock(osb, 0);
+	}
 	return status;
-}				/* ocfs_send_dismount_msg */
-	
-/*
- * ocfs_init_dlm_msg()
- *
- */
-static void ocfs_init_dlm_msg (ocfs_super * osb, ocfs_dlm_msg * dlm_msg, __u32 msg_len, __u32 type)
+}
+
+int ocfs2_request_delete_vote(struct inode *inode)
 {
-	LOG_ENTRY ();
+	return ocfs2_request_vote(inode, OCFS2_VOTE_REQ_DELETE);
+}
 
-	dlm_msg->magic = OCFS_DLM_MSG_MAGIC;
-	dlm_msg->msg_len = msg_len;
-	dlm_msg->src_node = osb->node_num;
-	dlm_msg->msg_type = type;
-	memcpy (dlm_msg->vol_id, osb->uuid, MAX_VOL_ID_LENGTH);
-
-	LOG_EXIT ();
-	return;
-}				/* ocfs_init_dlm_msg */
-
-
-static ocfs_vote_obj * ocfs_alloc_vote_obj (ocfs_super *osb, int bytes, __u32 reqlock, ocfs_node_map *votemap)
+int ocfs2_request_unlink_vote(struct inode *inode)
 {
-	ocfs_vote_obj *obj = NULL;
+	return ocfs2_request_vote(inode, OCFS2_VOTE_REQ_UNLINK);
+}
 
-	obj = ocfs_malloc (bytes);
-	if (obj == NULL)
-		return NULL;
-
-	memset(obj, 0, bytes);
-	obj->vote_state = VOTE_OBJ_STATE_UNSENT;
-	spin_lock_init (&obj->lock);
-	atomic_set(&obj->refcount, 1);
-	atomic_set(&obj->voted_event_woken, 0);
-	init_waitqueue_head (&obj->voted_event);
-	INIT_LIST_HEAD (&obj->list);
-
-	if (votemap)
-		ocfs_node_map_dup(osb, &obj->req_vote_map, votemap);
-	else
-		ocfs_node_map_init(osb, &obj->req_vote_map);
-	ocfs_node_map_init(osb, &obj->got_vote_map);
-
-	obj->seq_num = 0ULL;
-	obj->req_lock_type = reqlock;
-	obj->vote_status = 0;
-	obj->pid = current->pid;
-	
-	return obj;
+int ocfs2_request_rename_vote(struct inode *inode)
+{
+	return ocfs2_request_vote(inode, OCFS2_VOTE_REQ_RENAME);
 }
 
-#define OCFS_DLM_NET_TIMEOUT   (30000)   // 30 seconds
-
-/*
- * ocfs_send_dlm_request_msg()
- * inode is definitely non NULL
- */
-int ocfs_send_dlm_request_msg (ocfs_super * osb, __u64 lock_id, __u32 lock_type, __u32 flags, ocfs_node_map *votemap, struct inode *inode, __u32 num_ident, int *vote_status)
+int ocfs2_request_mount_vote(ocfs_super *osb)
 {
-	int status = 0;
-	ocfs_dlm_msg *dlm_msg = NULL;
-	ocfs_dlm_msg_hdr *req;
-	ocfs_vote_obj *obj;
-	__u32 msg_len, obj_len;
+	int status;
 
-	LOG_ENTRY_ARGS ("(osb=0x%p, id=%llu, ty=%u, fl=%u)\n",
-			osb, lock_id, lock_type, flags);
+	status = -EAGAIN;
+	while (status == -EAGAIN) {
+		if (signal_pending(current))
+			return -EINTR;
 
-	msg_len = sizeof (ocfs_dlm_msg) + sizeof (ocfs_dlm_req_master);
-	obj_len = sizeof (ocfs_vote_obj) + sizeof (ocfs_dlm_req_master);
+		if (ocfs_node_map_is_only(osb, &osb->mounted_map,
+					  osb->node_num))
+			return 0;
 
-	obj = ocfs_alloc_vote_obj (osb, obj_len, lock_type, votemap);
-	if (obj == NULL) {
-		LOG_ERROR_STATUS (status = -ENOMEM);
-		goto finally;
+		status = ocfs2_do_request_vote(osb, 0ULL, 0,
+					       OCFS2_VOTE_REQ_MOUNT);
 	}
-	dlm_msg = &(obj->m);
-	req = (ocfs_dlm_msg_hdr *) dlm_msg->msg_buf;
-	ocfs_init_dlm_msg (osb, dlm_msg, msg_len, OCFS_VOTE_REQUEST);
-
-	spin_lock (&OcfsGlobalCtxt.comm_seq_lock);
-	req->lock_seq_num = ++OcfsGlobalCtxt.comm_seq_num;
-	obj->seq_num = req->lock_seq_num;
-	spin_unlock (&OcfsGlobalCtxt.comm_seq_lock);
-
-	req->lock_id = lock_id;
-	req->flags = flags;
-	req->num_ident = num_ident;
-
-#ifdef VERBOSE_LOCKING_TRACE
-	printk("ocfs_send_dlm_request_msg: inode=%p, lockid = %llu\n",
-	       inode, lock_id);
-#endif
-
-	spin_lock(&osb->vote_obj_queue_lock);
-	list_add_tail(&obj->list, &osb->vote_obj_queue);
-	spin_unlock(&osb->vote_obj_queue_lock);
-
-	ocfs_send_bcast (osb, votemap, dlm_msg);
-	spin_lock (&obj->lock);
-	obj->vote_state = VOTE_OBJ_STATE_SENT;
-	spin_unlock (&obj->lock);
-	status = ocfs_wait_uninterruptible(obj->voted_event,
-			    atomic_read (&obj->voted_event_woken), 
-			    OCFS_DLM_NET_TIMEOUT);
-
-	spin_lock (&obj->lock);
-	if (obj->vote_status >= 0 && obj->vote_state == VOTE_OBJ_STATE_FULL_REPLY) {
-		LOG_TRACE_ARGS ("OK vote, lockid=%llu\n", lock_id);
-	} else if ((obj->vote_status != -EAGAIN && obj->vote_status != -EBUSY) || obj->vote_state != VOTE_OBJ_STATE_FULL_REPLY) {
-#warning "should we even be erroring here at all!"
-		LOG_ERROR_ARGS("inode %llu, vote_status=%d, vote_state=%d, "
-			       "lockid=%llu, flags = 0x%x, asked type = %u "
-			       "master = %d, state = 0x%lx, type = %u\n",
-			       OCFS_I(inode)->ip_blkno, obj->vote_status, 
-			       obj->vote_state, lock_id, flags, lock_type, 
-			       GET_INODE_LOCKRES(inode)->master_node_num, 
-			       GET_INODE_LOCKRES(inode)->readonly_state, 
-			       GET_INODE_LOCKRES(inode)->lock_type);
-	}
-	*vote_status = obj->vote_status;
-	obj->vote_state = VOTE_OBJ_STATE_DESTROYING;
-	ocfs_node_map_clear_bits(votemap, &obj->got_vote_map);
-	spin_unlock (&obj->lock);
-
-	spin_lock(&osb->vote_obj_queue_lock);
-	list_del(&obj->list);
-	spin_unlock(&osb->vote_obj_queue_lock);
-	
-
-	ocfs_compute_dlm_stats (status, *vote_status,
-			       	&(OcfsGlobalCtxt.net_reqst_stats));
-
-	ocfs_compute_dlm_stats (status, *vote_status,
-			       	&(osb->net_reqst_stats));
-finally:
-	ocfs_put_vote_obj (obj);
-	LOG_EXIT_STATUS (status);
 	return status;
-}				/* ocfs_send_dlm_request_msg */
+}
 
-
-void ocfs_process_one_vote_reply(ocfs_super *osb, ocfs_vote_reply_ctxt *ctxt, __u32 node_num)
+int ocfs2_request_umount_vote(ocfs_super *osb)
 {
 	int status;
-	int reply_status;
 
-	reply_status = ctxt->reply->status;
+	status = -EAGAIN;
+	while (status == -EAGAIN) {
+		if (signal_pending(current))
+			return -EINTR;
 
-	status = 0;
+		if (ocfs_node_map_is_only(osb, &osb->mounted_map,
+					  osb->node_num))
+			return 0;
 
-	switch (reply_status) {
-		case FLAG_VOTE_NODE:
-			ocfs_node_map_set_bit(ctxt->got_vote_map, node_num);
-			break;
-		case FLAG_VOTE_OIN_ALREADY_INUSE:
-			ocfs_node_map_set_bit(ctxt->got_vote_map, node_num);
-			status = -EINVAL;
-			if (ctxt->flags & FLAG_FILE_DELETE)
-				status = -EBUSY;
-			break;
-		case FLAG_VOTE_OIN_UPDATED:
-			status = 0;
-			ocfs_node_map_set_bit(ctxt->got_vote_map, node_num);
-			break;
-		case FLAG_VOTE_UPDATE_RETRY:
-			ocfs_node_map_set_bit(ctxt->got_vote_map, node_num);
-			status = -EAGAIN;
-			break;
-		case FLAG_VOTE_FILE_DEL:
-#warning "don't we need to set the node map bit here?"
-			status = -ENOENT;
-			break;
+		status = ocfs2_do_request_vote(osb, 0ULL, 0,
+					       OCFS2_VOTE_REQ_UMOUNT);
 	}
-	*(ctxt->status) = status;
+	return status;
 }
 
-/* special case -1 for now
- * TODO: should *really* make sure the calling func never passes -1!!  */
-void ocfs_node_map_init(ocfs_super *osb, ocfs_node_map *map)
+/* TODO: This should eventually be a hash table! */
+static ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(ocfs_super *osb,
+							u32 response_id)
 {
-	map->num_nodes = osb->max_nodes;
-	memset(map->map, 0, BITS_TO_LONGS(OCFS_NODE_MAP_MAX_NODES) * 
-	       sizeof(unsigned long));
-}
+	struct list_head *p;
+	ocfs2_net_wait_ctxt *w = NULL;
 
-void ocfs_node_map_set_bit(ocfs_node_map *map, int bit)
-{
-	if (bit==-1)
-		return;
-	OCFS_ASSERT(bit < map->num_nodes);
-	set_bit(bit, map->map);
-}
-
-void ocfs_node_map_clear_bit(ocfs_node_map *map, int bit)
-{
-	if (bit==-1)
-		return;
-	OCFS_ASSERT(bit < map->num_nodes);
-	clear_bit(bit, map->map);
-}
-
-// clear all the bits in "target" which are set in "mask"
-void ocfs_node_map_clear_bits(ocfs_node_map *target, ocfs_node_map *mask)
-{
-	int bit, prev=0;
-	while (1) {
-		bit = find_next_bit (mask->map, mask->num_nodes, prev);
-		if (bit >= mask->num_nodes)
+	list_for_each(p, &osb->net_response_list) {
+		w = list_entry(p, ocfs2_net_wait_ctxt, n_list);
+		if (response_id == w->n_response_id)
 			break;
-		ocfs_node_map_clear_bit(target, bit);
-		prev = bit+1;
+		w = NULL;
 	}
-}
 
-// set all the bits in "target" which are set in "mask"
-void ocfs_node_map_set_bits(ocfs_node_map *target, ocfs_node_map *mask)
-{
-	int bit, prev=0;
-	while (1) {
-		bit = find_next_bit (mask->map, mask->num_nodes, prev);
-		if (bit >= mask->num_nodes)
-			break;
-		ocfs_node_map_set_bit(target, bit);
-		prev = bit+1;
-	}
+	return w;
 }
 
-int ocfs_node_map_test_bit(ocfs_node_map *map, int bit)
+static int ocfs2_handle_response_message(net_msg *msg,
+					 u32 len,
+					 void *data)
 {
-	if (bit >= map->num_nodes) {
-		LOG_ERROR_ARGS("bit=%d map->num_nodes=%d\n", bit, map->num_nodes);
-		BUG();
-	}
-	return test_bit(bit, map->map);
-}
+	unsigned int response_id, node_num;
+	int response_status;
+	ocfs_super *osb = data;
+	ocfs2_response_msg *resp;
+	ocfs2_net_wait_ctxt * w;
 
-static int ocfs_node_map_stringify(ocfs_node_map *map, char **str)
-{
-	int i, n;
-	char *s;
+	resp = (ocfs2_response_msg *) msg->buf;
 
-	OCFS_ASSERT(map->num_nodes > 0);
+	response_id = ntohl(resp->r_hdr.h_response_id);
+	node_num = ntohl(resp->r_hdr.h_node_num);
+	response_status = ntohl(resp->r_response);
 
-	*str = kmalloc( strlen("123 ") * map->num_nodes, GFP_KERNEL);
-	if (!(*str))
-		return -ENOMEM;
+	printk("received response message:\n");
+	printk("h_response_id = %u\n", response_id);
+	printk("h_request = %u\n", ntohl(resp->r_hdr.h_request));
+	printk("h_blkno = %llu\n", be64_to_cpu(resp->r_hdr.h_blkno));
+	printk("h_generation = %u\n", ntohl(resp->r_hdr.h_generation));
+	printk("h_node_num = %u\n", node_num);
+	printk("r_response = %d\n", response_status);
 
-	memset(*str, 0, strlen("123 ") * map->num_nodes);
+	spin_lock(&osb->net_response_lock);
+	w = __ocfs2_find_net_wait_ctxt(osb, response_id);
+	if (!w) {
+		printk("request not found!\n");
+		goto bail;
+	}
 
-	s = *str;	
-	for (i=0; i<map->num_nodes; i++) {
-		if (ocfs_node_map_test_bit(map, i)) {
-			n = sprintf(s, "%3d ", i);
-			if (n != strlen("123 ")) {
-				kfree(*str);
-				return -ENOMEM;
-			}
-			s += n;
-		}
+	if (response_status && (!w->n_response)) {
+		/* we only really need one negative response so don't
+		 * set it twice. */
+		w->n_response = response_status;
 	}
+
+	ocfs_node_map_clear_bit(osb, &w->n_node_map, node_num);
+	if (ocfs_node_map_is_empty(osb, &w->n_node_map))
+		wake_up(&w->n_event);
+bail:
+	spin_unlock(&osb->net_response_lock);
+
 	return 0;
 }
 
-int ocfs_node_map_is_empty(ocfs_node_map *map)
+static int ocfs2_handle_vote_message(net_msg *msg,
+					u32 len,
+					void *data)
 {
-	int bit;
-	OCFS_ASSERT(map->num_nodes > 0);
-	bit = find_next_bit(map->map, map->num_nodes, 0);
-	if (bit < map->num_nodes)
-		return 0;
-	return 1;
-}
+	int status;
+	ocfs_super *osb = data;
+	ocfs2_vote_work *work;
 
-int ocfs_node_map_is_equal(ocfs_node_map *map1, ocfs_node_map *map2)
-{
-	int num_longs, i; 
-
-	OCFS_ASSERT(map1->num_nodes == map2->num_nodes);
-	OCFS_ASSERT(map1->num_nodes > 0);
-	
-	num_longs = BITS_TO_LONGS(map1->num_nodes);
-	for (i=0; i<num_longs; i++) {
-		if (map1->map[i] != map2->map[i])
-			return 0;
+	work = kmalloc(sizeof(ocfs2_vote_work), GFP_KERNEL);
+	if (!work) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
 	}
-	return 1;
-}
 
+	INIT_LIST_HEAD(&work->w_list);
+	memcpy(&work->w_msg, msg->buf, sizeof(ocfs2_vote_msg));
 
-void ocfs_node_map_and(ocfs_node_map *target, ocfs_node_map *mask)
-{
-	int num_longs, i; 
+	printk("scheduling vote request:\n");
+	printk("h_response_id = %u\n", ntohl(work->w_msg.v_hdr.h_response_id));
+	printk("h_request = %u\n", ntohl(work->w_msg.v_hdr.h_request));
+	printk("h_blkno = %llu\n", be64_to_cpu(work->w_msg.v_hdr.h_blkno));
+	printk("h_generation = %u\n", ntohl(work->w_msg.v_hdr.h_generation));
+	printk("h_node_num = %u\n", ntohl(work->w_msg.v_hdr.h_node_num));
 
-	OCFS_ASSERT(target->num_nodes == mask->num_nodes);
-	OCFS_ASSERT(target->num_nodes > 0);
-	
-	num_longs = BITS_TO_LONGS(target->num_nodes);
-	for (i=0; i<num_longs; i++)
-		target->map[i] &= mask->map[i];
-}
+	spin_lock(&osb->vote_task_lock);
+	list_add_tail(&work->w_list, &osb->vote_list);
+	osb->vote_count++;
+	spin_unlock(&osb->vote_task_lock);
 
-void ocfs_node_map_set(ocfs_node_map *target, ocfs_node_map *from)
-{
-	int num_longs, i; 
+	ocfs2_kick_vote_thread(osb);
 
-	OCFS_ASSERT(target->num_nodes == from->num_nodes);
-	OCFS_ASSERT(target->num_nodes > 0);
-
-	num_longs = BITS_TO_LONGS(target->num_nodes);
-	for (i=0; i<num_longs; i++)
-		target->map[i] = from->map[i];
+	status = 0;
+bail:
+	return status;
 }
 
-
-void ocfs_node_map_dup(ocfs_super *osb, ocfs_node_map *target, ocfs_node_map *from)
+int ocfs2_register_net_handlers(ocfs_super *osb)
 {
-	OCFS_ASSERT(from->num_nodes > 0);
-	ocfs_node_map_init(osb, target);
-	ocfs_node_map_set(target, from);
-}
+	int status;
+	int i = MAX_VOL_ID_LENGTH - sizeof(osb->net_key);
 
-/* returns 1 if bit is the only bit set in target, 0 otherwise */
-int ocfs_node_map_is_only(ocfs_super *osb, ocfs_node_map *target, int bit)
-{
-	ocfs_node_map temp;
-	int ret;
+	memcpy(&osb->net_key, &osb->uuid[i], sizeof(osb->net_key));
+	osb->net_response_buf = osb->net_vote_buf = NULL;
+	osb->net_response_ids = 0;
+	spin_lock_init(&osb->net_response_lock);
+	INIT_LIST_HEAD(&osb->net_response_list);
 
-	ocfs_node_map_dup(osb, &temp, target);
-	ocfs_node_map_clear_bit(&temp, bit);
-	ret = ocfs_node_map_is_empty(&temp);
-	return ret;
-}
+	osb->net_response_buf = kmalloc(sizeof(ocfs2_response_msg),
+					GFP_KERNEL);
+	if (!osb->net_response_buf) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
 
-/*
-**  All structures have a type, and a size associated with it.
-**  The type serves to identify the structure. The size is used for
-**  consistency checking ...
-*/
-void ocfs_publish_map_set(ocfs_node_map *pubmap, int num)
-{
-	ocfs_node_map_set_bit(pubmap, num);
-}
+	osb->net_vote_buf = kmalloc(sizeof(ocfs2_vote_msg),
+				    GFP_KERNEL);
+	if (!osb->net_vote_buf) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
 
-void ocfs_publish_map_clear(ocfs_node_map *pubmap, int num)
-{
-	ocfs_node_map_clear_bit(pubmap, num);
-}
+	status = net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE,
+				      osb->net_key,
+				      0,
+				      sizeof(ocfs2_response_msg),
+				      ocfs2_handle_response_message,
+				      osb,
+				      osb->net_response_buf);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
 
-/* update the recovery map here */
-void ocfs_recovery_map_set(ocfs_super *osb, int num)
-{
-	spin_lock(&osb->recovery_map_lock);
-	ocfs_node_map_set_bit(&osb->recovery_map, num);
-	spin_unlock(&osb->recovery_map_lock);
+	status = net_register_handler(OCFS2_MESSAGE_TYPE_VOTE,
+				      osb->net_key,
+				      0,
+				      sizeof(ocfs2_vote_msg),
+				      ocfs2_handle_vote_message,
+				      osb,
+				      osb->net_vote_buf);
+	if (status < 0) {
+		/* TODO: net_unregister here! */
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+bail:
+	if (status < 0) {
+		if (osb->net_response_buf)
+			kfree(osb->net_response_buf);
+		if (osb->net_vote_buf)
+			kfree(osb->net_vote_buf);
+		osb->net_response_buf = osb->net_vote_buf = NULL;
+		/* 0 indicates we never registered anything */
+		osb->net_key = 0;
+	}
+	return status;
 }
 
-void ocfs_recovery_map_clear(ocfs_super *osb, int num)
+void ocfs2_unregister_net_handlers(ocfs_super *osb)
 {
-	spin_lock(&osb->recovery_map_lock);
-	ocfs_node_map_clear_bit(&osb->recovery_map, num);
-	spin_unlock(&osb->recovery_map_lock);
-}
+	if (!osb->net_key)
+		return;
 
-int ocfs_node_is_recovering(ocfs_super *osb, int num)
-{	
-	if (num == -1)
-		return 0;
-	return ocfs_node_map_test_bit(&osb->recovery_map, num);
-}
+	/* TODO: net_unregister here! */
+	/* TODO: net_unregister here! */
 
-int ocfs_node_is_alive(ocfs_node_map *pubmap, int index)
-{
-	if (index == -1)
-		return 0;
-	return ocfs_node_map_test_bit(pubmap, index);
-}	
+	if (!list_empty(&osb->net_response_list))
+		printk("ocfs2: net response list not empty!\n");
 
+	kfree(osb->net_response_buf);
+	kfree(osb->net_vote_buf);
+}

Modified: trunk/src/vote.h
===================================================================
--- trunk/src/vote.h	2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/vote.h	2004-12-06 21:45:32 UTC (rev 1693)
@@ -3,7 +3,7 @@
  *
  * vote.h
  *
- * Function prototypes
+ * Cluster vote request/response handling and net message handlers
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
  *
@@ -23,53 +23,23 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#ifndef OCFS2_VOTE_H
-#define OCFS2_VOTE_H
 
-int ocfs_init_udp_sock(struct socket **send_sock,
-		       struct socket **recv_sock);
-int ocfs_lookup_obj_for_proc(ocfs_vote_obj *obj,
-			     ocfs_vote_obj_lookup_data *data);
-int ocfs_lookup_obj_by_lockid(ocfs_vote_obj *obj,
-			      ocfs_vote_obj_lookup_data *data);
-int ocfs_lookup_vote_request_obj(ocfs_super *osb,
-				 ocfs_vote_obj_lookup_data *data);
-void ocfs_process_one_vote_reply(ocfs_super *osb,
-				 ocfs_vote_reply_ctxt *ctxt,
-				 __u32 node_num);
-int ocfs_recv_udp_msg(ocfs_recv_ctxt *recv_ctxt);
-int ocfs_send_dismount_msg(ocfs_super *osb);
-int ocfs_send_dlm_request_msg (ocfs_super * osb, __u64 lock_id, 
-			       __u32 lock_type, __u32 flags, 
-			       ocfs_node_map *votemap, 
-			       struct inode *inode, __u32 num_ident, 
-			       int *vote_status);
-int ocfs_send_vote_reply(ocfs_super *osb, ocfs_dlm_msg *dlm_msg,
-			 __u32 vote_status);
-int ocfs_lookup_vote_request_obj (ocfs_super *osb, 
-				  ocfs_vote_obj_lookup_data *data);
+#ifndef VOTE_H
+#define VOTE_H
 
-void ocfs_node_map_init(ocfs_super *osb, ocfs_node_map *map);
-void ocfs_node_map_set_bit(ocfs_node_map *map, int bit);
-void ocfs_node_map_clear_bit(ocfs_node_map *map, int bit);
-// clear all the bits in "target" which are set in "mask"
-void ocfs_node_map_clear_bits(ocfs_node_map *target, ocfs_node_map *mask);
-// set all the bits in "target" which are set in "mask"
-void ocfs_node_map_set_bits(ocfs_node_map *target, ocfs_node_map *mask);
-int ocfs_node_map_test_bit(ocfs_node_map *map, int bit);
-int ocfs_node_map_is_empty(ocfs_node_map *map);
-int ocfs_node_map_is_equal(ocfs_node_map *map1, ocfs_node_map *map2);
-void ocfs_node_map_and(ocfs_node_map *target, ocfs_node_map *mask);
-void ocfs_node_map_set(ocfs_node_map *target, ocfs_node_map *from);
-void ocfs_node_map_dup(ocfs_super *osb, ocfs_node_map *target, ocfs_node_map *from);
-/* returns 1 if bit is the only bit set in target, 0 otherwise */
-int ocfs_node_map_is_only(ocfs_super *osb, ocfs_node_map *target, int bit);
+int ocfs2_vote_thread(void *arg);
+static inline void ocfs2_kick_vote_thread(ocfs_super *osb)
+{
+	atomic_set(&osb->wake_vote_task, 1);
+	wake_up(&osb->vote_event);
+}
 
-int ocfs_node_is_recovering(ocfs_super *osb, int num);
-int ocfs_node_is_alive(ocfs_node_map *pubmap, int index);
-void ocfs_publish_map_set(ocfs_node_map *pubmap, int num);
-void ocfs_publish_map_clear(ocfs_node_map *pubmap, int num);
-void ocfs_recovery_map_set(ocfs_super *osb, int num);
-void ocfs_recovery_map_clear(ocfs_super *osb, int num);
+int ocfs2_request_delete_vote(struct inode *inode);
+int ocfs2_request_unlink_vote(struct inode *inode);
+int ocfs2_request_rename_vote(struct inode *inode);
+int ocfs2_request_mount_vote(ocfs_super *osb);
+int ocfs2_request_umount_vote(ocfs_super *osb);
+int ocfs2_register_net_handlers(ocfs_super *osb);
+void ocfs2_unregister_net_handlers(ocfs_super *osb);
 
-#endif /* OCFS2_VOTE_H */
+#endif



More information about the Ocfs2-commits mailing list