[Ocfs2-commits] manish commits r1692 - in branches/dlm-glue: . cluster src
svn-commits at oss.oracle.com
Fri Dec 3 20:54:02 CST 2004
Author: manish
Date: 2004-12-03 20:54:01 -0600 (Fri, 03 Dec 2004)
New Revision: 1692
Added:
branches/dlm-glue/cluster/
branches/dlm-glue/cluster/Makefile
branches/dlm-glue/cluster/compat_libfs.c
branches/dlm-glue/cluster/compat_libfs.h
branches/dlm-glue/cluster/dlm_compat.h
branches/dlm-glue/cluster/dlmcommon.h
branches/dlm-glue/cluster/dlmmaster.c
branches/dlm-glue/cluster/dlmmod.c
branches/dlm-glue/cluster/dlmmod.h
branches/dlm-glue/cluster/dlmrecovery.c
branches/dlm-glue/cluster/dlmthread.c
branches/dlm-glue/cluster/heartbeat.c
branches/dlm-glue/cluster/heartbeat.h
branches/dlm-glue/cluster/nodemanager.c
branches/dlm-glue/cluster/nodemanager.h
branches/dlm-glue/cluster/tcp.c
branches/dlm-glue/cluster/tcp.h
branches/dlm-glue/cluster/test.c
branches/dlm-glue/cluster/util.c
branches/dlm-glue/cluster/util.h
branches/dlm-glue/cluster/warning_hack.h
Modified:
branches/dlm-glue/Config.make.in
branches/dlm-glue/Makefile
branches/dlm-glue/configure.in
branches/dlm-glue/src/Makefile
branches/dlm-glue/src/dlmglue.c
branches/dlm-glue/src/heartbeat.c
branches/dlm-glue/src/ocfs.h
branches/dlm-glue/src/super.c
branches/dlm-glue/src/vote.c
Log:
Landed cluster support: node manager, heartbeat, tcp messaging, and the DLM, plus the glue to hook them into the fs build.
Modified: branches/dlm-glue/Config.make.in
===================================================================
--- branches/dlm-glue/Config.make.in 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/Config.make.in 2004-12-04 02:54:01 UTC (rev 1692)
@@ -54,8 +54,6 @@
GCCINC = @GCCINC@
endif
-CLUSTERINC = @CLUSTERINC@
-
HAVE_NPTL = @HAVE_NPTL@
COMPAT_SAFE_WRITE = @COMPAT_SAFE_WRITE@
Modified: branches/dlm-glue/Makefile
===================================================================
--- branches/dlm-glue/Makefile 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/Makefile 2004-12-04 02:54:01 UTC (rev 1692)
@@ -2,7 +2,7 @@
include $(TOPDIR)/Preamble.make
-SUBDIRS = src docs patches vendor
+SUBDIRS = cluster src docs patches vendor
DIST_FILES = \
COPYING \
Added: branches/dlm-glue/cluster/Makefile
===================================================================
--- branches/dlm-glue/cluster/Makefile 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/Makefile 2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,226 @@
+# See if we are being included by the 2.6 kernel build system.
+ifeq ($(KERNELRELEASE),)
+# Normal build that is being called locally
+TOPDIR = ..
+
+include $(TOPDIR)/Preamble.make
+
+else # ifeq ($(KERNELRELEASE),)
+# We are being included by the 2.6.x kernel build system
+
+# Global parameter so we know where our stuff is
+CLUSTER_SRC_DIR := $(M)
+
+include $(CLUSTER_SRC_DIR)/../Config.make
+endif
+
+#-*******************************************************
+# Now do stuff which is global for 2.4.x and 2.6.x builds
+
+#ifdef OCFS_DEBUG
+OPTS += -g
+#endif
+
+#ifdef OCFS_DEBUG
+GLOBAL_DEFINES += -DDEBUG
+#endif
+
+ifdef OCFS_TRACE
+GLOBAL_DEFINES += -DTRACE
+endif
+
+ifdef HAVE_NPTL
+GLOBAL_DEFINES += -DHAVE_NPTL
+endif
+
+CFILES = \
+ compat_libfs.c \
+ dlmmaster.c \
+ dlmmod.c \
+ dlmrecovery.c \
+ dlmthread.c \
+ heartbeat.c \
+ nodemanager.c \
+ tcp.c \
+ util.c \
+ test.c
+
+HFILES = \
+ compat_libfs.h \
+ dlm_compat.h \
+ dlmcommon.h \
+ dlmmod.h \
+ heartbeat.h \
+ nodemanager.h \
+ tcp.h \
+ util.h \
+ warning_hack.h
+
+CLEAN_RULES = clean-cluster
+
+OBJS = $(subst .c,.o,$(CFILES))
+
+# End of stuff which is global for 2.4.x and 2.6.x kernels
+#-********************************************************
+
+# See if we are being included by the 2.6 kernel build system.
+ifeq ($(KERNELRELEASE),)
+# Normal build that is being called locally
+# Preliminary 2.6.x kernel support. See if we are building for the 2.6.x
+# kernel
+ifndef KERNEL_26
+# Building for a 2.4.x kernel
+
+WARNINGS = -Wall -Wstrict-prototypes
+
+ifneq ($(OCFS_PROCESSOR),x86_64)
+WARNINGS += -Wmissing-prototypes -Wmissing-declarations
+endif
+
+ifeq ($(KVER),vmware)
+ KERNELINC = /usr/src/linux-2.4/include
+endif
+
+ifeq ($(KVER),suse)
+ GLOBAL_DEFINES += -DSUSE
+endif
+ifeq ($(KVER),hugemem)
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_HUGEMEM=1
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_ENTERPRISE=0
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_SMP=0
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_UP=0
+endif
+ifeq ($(KVER),smp)
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_HUGEMEM=0
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_ENTERPRISE=0
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_SMP=1
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_UP=0
+endif
+ifeq ($(KVER),ent)
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_HUGEMEM=0
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_ENTERPRISE=1
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_SMP=0
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_UP=0
+endif
+ifeq ($(KVER),up)
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_HUGEMEM=0
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_ENTERPRISE=0
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_SMP=0
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_UP=1
+endif
+
+ifeq ($(OCFS_PROCESSOR),ppc64)
+ MACH_CFLAGS += -m64 -fsigned-char -fno-builtin -msoft-float -mminimal-toc
+ LDADD += -m elf64ppc
+endif
+ifeq ($(OCFS_PROCESSOR),x86_64)
+ MACH_CFLAGS += -m64 -mcmodel=kernel
+endif
+
+BASE_DEFINES = -DMODULE -DLINUX -D__KERNEL__
+DEFINES += $(BASE_DEFINES) $(GLOBAL_DEFINES)
+
+INCLUDES = -I. -I$(KERNELINC) -I$(GCCINC)
+
+CFLAGS = $(OPTS) $(MACH_CFLAGS) -pipe -nostdinc -fno-strict-aliasing \
+ -fno-common -fomit-frame-pointer $(MODVERSIONS) $(WARNINGS)
+LDADD += -nostdlib
+
+OPTIMIZE = -O2
+
+CFLAGS += $(OPTIMIZE)
+
+MODULES = ocfs2_dlm.o ocfs2_heartbeat.o ocfs2_nodemanager.o ocfs2_tcp.o
+TEST_MODULES = ocfs2_cluster_test.o
+
+INSTALL_MODULES = $(MODULES)
+
+# Make dependencies work
+$(CFILES): $(HFILES)
+$(OBJS): $(HFILES)
+
+build-cluster: $(MODULES)
+
+ocfs2_cluster_test.o: test.o util.o compat_libfs.o
+ $(CC) $(OPTS) -Wl,-r -o $@ $^ $(LDADD)
+
+ocfs2_dlm.o: dlmmod.o dlmthread.o dlmrecovery.o util.o compat_libfs.o dlmmaster.o
+ $(CC) $(OPTS) -Wl,-r -o $@ $^ $(LDADD)
+
+ocfs2_nodemanager.o: nodemanager.o util.o compat_libfs.o
+ $(CC) $(OPTS) -Wl,-r -o $@ $^ $(LDADD)
+
+ocfs2_heartbeat.o: heartbeat.o util.o compat_libfs.o
+ $(CC) $(OPTS) -Wl,-r -o $@ $^ $(LDADD)
+
+ocfs2_tcp.o: tcp.o util.o compat_libfs.o
+ $(CC) $(OPTS) -Wl,-r -o $@ $^ $(LDADD)
+
+clean-cluster:
+ rm -f *.o *.p *.s
+
+else # ifndef KERNEL_26
+# The 2.6.x kernel makefile
+
+# This Makefile has two ways through it. They are:
+# 1. We are being included by the local Makefile to do a 2.6 kernel build.
+# In this method we will call the kernel make system to build our module.
+# This will cause the kernel make system to call back into our makefile
+# (2nd way).
+
+INSTALL_MODULE = ocfs2.ko
+
+#ALL_RULES = stamp-md5 build-ocfs
+ALL_RULES = build-cluster
+
+build-ocfs:
+ $(MAKE) -C $(KERNELDIR) M=$(CURDIR) modules
+
+clean-ocfs:
+ $(MAKE) -C $(KERNELDIR) M=$(CURDIR) clean
+
+endif # ifndef KERNEL_26
+
+INSTALL_RULES = install-cluster
+
+install-cluster: $(INSTALL_MODULES)
+ $(TOPDIR)/mkinstalldirs $(DESTDIR)$(MODULEDIR)/ocfs2
+ @for file in $(INSTALL_MODULES); do \
+	  $(INSTALL_DATA) $$file $(DESTDIR)$(MODULEDIR)/ocfs2/$$file; \
+ done
+
+include $(TOPDIR)/Postamble.make
+
+else # ifeq ($(KERNELRELEASE),)
+# We are being included by the 2.6 kernel build system. So we will include the
+# 2.6.x Makefile and skip everything else.
+# The 2.6.x kernel makefile
+
+# This Makefile has two ways through it. They are:
+# 1. We are being included by the local Makefile to do a 2.6 kernel build.
+# In this method we will call the kernel make system to build our module.
+# This will cause the kernel make system to call back into our makefile
+# (2nd way).
+#
+# 2. We are being included by the kernel make system. So in this method we
+# just setup the variables that the make system wants and then the kernel
+# make system will take care of the build.
+
+# 2nd method. The kernel make system is including us. We need to setup the
+# various parameters for the kernel make system and then it will take care of
+# building us.
+
+STAMP_DIR = $(CLUSTER_SRC_DIR)
+include $(CLUSTER_SRC_DIR)/../Versioning.make
+
+EXTRA_CFLAGS += $(GLOBAL_DEFINES)
+
+CFLAGS_$(VERSION_OBJ) += $(VERDEFS)
+
+# Kernel Module file to produce
+obj-m += ocfs2.o
+
+# list of object files that are used to create our module
+ocfs2-objs := $(OBJS)
+
+endif # ifneq ($(KERNELRELEASE),)
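The comment blocks in this Makefile describe the standard two-pass kbuild arrangement for out-of-tree modules: pass 1 is the local make that re-invokes the kernel build system, pass 2 is kbuild including this file again with KERNELRELEASE set. Reduced to a minimal sketch (the module name "example" and its objects are placeholders, not part of this commit), the pattern is:

    ifeq ($(KERNELRELEASE),)
    # pass 1: called from the command line; hand off to the kernel build system
    KERNELDIR ?= /lib/modules/$(shell uname -r)/build
    default:
    	$(MAKE) -C $(KERNELDIR) M=$(CURDIR) modules
    clean:
    	$(MAKE) -C $(KERNELDIR) M=$(CURDIR) clean
    else
    # pass 2: included by kbuild; just declare the objects
    obj-m := example.o
    example-objs := a.o b.o
    endif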
Added: branches/dlm-glue/cluster/compat_libfs.c
===================================================================
--- branches/dlm-glue/cluster/compat_libfs.c 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/compat_libfs.c 2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,705 @@
+/* -----------------------------------------------------------------*/
+
+
+/*
+ * compat_libfs.c
+ * Library for filesystem writers.
+ * PLUS... transaction file stuff stolen from nfsd
+ */
+
+#include "warning_hack.h"
+
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <asm/uaccess.h>
+#include <linux/slab.h>
+
+#include "compat_libfs.h"
+
+#define kstatfs statfs
+#define __user
+
+
+int simple_statfs(struct super_block *sb, struct statfs *buf);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd);
+#else
+struct dentry *simple_lookup(struct inode *dir,struct dentry *dentry);
+#endif
+
+int simple_sync_file(struct file *file, struct dentry *dentry, int datasync);
+int dcache_dir_open(struct inode *inode, struct file *file);
+int dcache_dir_close(struct inode *inode, struct file *file);
+loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin);
+ssize_t generic_read_dir(struct file *filp, char *buf, size_t siz, loff_t *ppos);
+int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry);
+int simple_empty(struct dentry *dentry);
+int simple_unlink(struct inode *dir, struct dentry *dentry);
+int simple_rmdir(struct inode *dir, struct dentry *dentry);
+int simple_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry);
+int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files);
+
+
+
+#if 0
+int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode = dentry->d_inode;
+ generic_fillattr(inode, stat);
+ stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9);
+ return 0;
+}
+#endif
+
+int simple_statfs(struct super_block *sb, struct kstatfs *buf)
+{
+ buf->f_type = sb->s_magic;
+ buf->f_bsize = PAGE_CACHE_SIZE;
+ buf->f_namelen = NAME_MAX;
+ return 0;
+}
+
+/*
+ * Lookup the data. This is trivial - if the dentry didn't already
+ * exist, we know it is negative.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+ if (dentry->d_name.len > NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+ d_add(dentry, NULL);
+ return NULL;
+}
+#else
+struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry)
+{
+ if (dentry->d_name.len > NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+ d_add(dentry, NULL);
+ return NULL;
+}
+#endif
+
+
+struct dentry * simple_find_child(struct dentry *dentry, struct qstr *name)
+{
+ struct list_head *iter;
+ struct dentry *child = NULL;
+
+ spin_lock(&dcache_lock);
+ list_for_each(iter, &dentry->d_subdirs) {
+ child = list_entry(iter, struct dentry, d_child);
+ if (child->d_name.len == name->len &&
+ memcmp(child->d_name.name, name->name, name->len)==0)
+ break;
+ child = NULL;
+ }
+ if (child)
+ dget_locked(child);
+ spin_unlock(&dcache_lock);
+ return child;
+}
+
+
+
+int simple_sync_file(struct file * file, struct dentry *dentry, int datasync)
+{
+ return 0;
+}
+
+int dcache_dir_open(struct inode *inode, struct file *file)
+{
+ static struct qstr cursor_name = {.len = 1, .name = "."};
+
+ file->private_data = d_alloc(file->f_dentry, &cursor_name);
+
+ return file->private_data ? 0 : -ENOMEM;
+}
+
+int dcache_dir_close(struct inode *inode, struct file *file)
+{
+ dput(file->private_data);
+ return 0;
+}
+
+loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
+{
+ down(&file->f_dentry->d_inode->i_sem);
+ switch (origin) {
+ case 1:
+ offset += file->f_pos;
+ case 0:
+ if (offset >= 0)
+ break;
+ default:
+ up(&file->f_dentry->d_inode->i_sem);
+ return -EINVAL;
+ }
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ if (file->f_pos >= 2) {
+ struct list_head *p;
+ struct dentry *cursor = file->private_data;
+ loff_t n = file->f_pos - 2;
+
+ spin_lock(&dcache_lock);
+ list_del(&cursor->d_child);
+ p = file->f_dentry->d_subdirs.next;
+ while (n && p != &file->f_dentry->d_subdirs) {
+ struct dentry *next;
+ next = list_entry(p, struct dentry, d_child);
+ if (!d_unhashed(next) && next->d_inode)
+ n--;
+ p = p->next;
+ }
+ list_add_tail(&cursor->d_child, p);
+ spin_unlock(&dcache_lock);
+ }
+ }
+ up(&file->f_dentry->d_inode->i_sem);
+ return offset;
+}
+
+/* Relationship between i_mode and the DT_xxx types */
+static inline unsigned char dt_type(struct inode *inode)
+{
+ return (inode->i_mode >> 12) & 15;
+}
+
+/*
+ * Directory is locked and all positive dentries in it are safe, since
+ * for ramfs-type trees they can't go away without unlink() or rmdir(),
+ * both impossible due to the lock on directory.
+ */
+
+int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
+{
+ struct dentry *dentry = filp->f_dentry;
+ struct dentry *cursor = filp->private_data;
+ struct list_head *p, *q = &cursor->d_child;
+ ino_t ino;
+ int i = filp->f_pos;
+
+ switch (i) {
+ case 0:
+ ino = dentry->d_inode->i_ino;
+ if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
+ break;
+ filp->f_pos++;
+ i++;
+ /* fallthrough */
+ case 1:
+ ino = dentry->d_parent->d_inode->i_ino;
+ if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
+ break;
+ filp->f_pos++;
+ i++;
+ /* fallthrough */
+ default:
+ spin_lock(&dcache_lock);
+ if (filp->f_pos == 2) {
+ list_del(q);
+ list_add(q, &dentry->d_subdirs);
+ }
+ for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
+ struct dentry *next;
+ next = list_entry(p, struct dentry, d_child);
+ if (d_unhashed(next) || !next->d_inode)
+ continue;
+
+ spin_unlock(&dcache_lock);
+ if (filldir(dirent, next->d_name.name, next->d_name.len, filp->f_pos, next->d_inode->i_ino, dt_type(next->d_inode)) < 0)
+ return 0;
+ spin_lock(&dcache_lock);
+ /* next is still alive */
+ list_del(q);
+ list_add(q, p);
+ p = q;
+ filp->f_pos++;
+ }
+ spin_unlock(&dcache_lock);
+ }
+ return 0;
+}
+
+ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos)
+{
+ return -EISDIR;
+}
+
+struct file_operations simple_dir_operations = {
+ .open = dcache_dir_open,
+ .release = dcache_dir_close,
+ .llseek = dcache_dir_lseek,
+ .read = generic_read_dir,
+ .readdir = dcache_readdir,
+};
+
+struct inode_operations simple_dir_inode_operations = {
+ .lookup = simple_lookup,
+};
+
+#if 0
+/*
+ * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
+ * will never be mountable)
+ */
+struct super_block *
+get_sb_pseudo(struct file_system_type *fs_type, char *name,
+ struct super_operations *ops, unsigned long magic)
+{
+ struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
+ static struct super_operations default_ops = {.statfs = simple_statfs};
+ struct dentry *dentry;
+ struct inode *root;
+ struct qstr d_name = {.name = name, .len = strlen(name)};
+
+ if (IS_ERR(s))
+ return s;
+
+ s->s_flags = MS_NOUSER;
+ s->s_maxbytes = ~0ULL;
+ s->s_blocksize = 1024;
+ s->s_blocksize_bits = 10;
+ s->s_magic = magic;
+ s->s_op = ops ? ops : &default_ops;
+ root = new_inode(s);
+ if (!root)
+ goto Enomem;
+ root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
+ root->i_uid = root->i_gid = 0;
+ root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
+ dentry = d_alloc(NULL, &d_name);
+ if (!dentry) {
+ iput(root);
+ goto Enomem;
+ }
+ dentry->d_sb = s;
+ dentry->d_parent = dentry;
+ d_instantiate(dentry, root);
+ s->s_root = dentry;
+ s->s_flags |= MS_ACTIVE;
+ return s;
+
+Enomem:
+ up_write(&s->s_umount);
+ deactivate_super(s);
+ return ERR_PTR(-ENOMEM);
+}
+#endif
+
+int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = old_dentry->d_inode;
+
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ inode->i_nlink++;
+ atomic_inc(&inode->i_count);
+ dget(dentry);
+ d_instantiate(dentry, inode);
+ return 0;
+}
+
+static inline int simple_positive(struct dentry *dentry)
+{
+ return dentry->d_inode && !d_unhashed(dentry);
+}
+
+int simple_empty(struct dentry *dentry)
+{
+ struct dentry *child;
+ int ret = 0;
+
+ spin_lock(&dcache_lock);
+ list_for_each_entry(child, &dentry->d_subdirs, d_child)
+ if (simple_positive(child))
+ goto out;
+ ret = 1;
+out:
+ spin_unlock(&dcache_lock);
+ return ret;
+}
+
+int simple_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ inode->i_nlink--;
+ dput(dentry);
+ return 0;
+}
+
+int simple_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ if (!simple_empty(dentry))
+ return -ENOTEMPTY;
+
+ dentry->d_inode->i_nlink--;
+ simple_unlink(dir, dentry);
+ dir->i_nlink--;
+ return 0;
+}
+
+int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct inode *inode = old_dentry->d_inode;
+ int they_are_dirs = S_ISDIR(old_dentry->d_inode->i_mode);
+
+ if (!simple_empty(new_dentry))
+ return -ENOTEMPTY;
+
+ if (new_dentry->d_inode) {
+ simple_unlink(new_dir, new_dentry);
+ if (they_are_dirs)
+ old_dir->i_nlink--;
+ } else if (they_are_dirs) {
+ old_dir->i_nlink--;
+ new_dir->i_nlink++;
+ }
+
+ old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime =
+ new_dir->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+ return 0;
+}
+
+#if 0
+int simple_readpage(struct file *file, struct page *page)
+{
+ void *kaddr;
+
+ if (PageUptodate(page))
+ goto out;
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr, 0, PAGE_CACHE_SIZE);
+ kunmap_atomic(kaddr, KM_USER0);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+out:
+ unlock_page(page);
+ return 0;
+}
+
+int simple_prepare_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ if (!PageUptodate(page)) {
+ if (to - from != PAGE_CACHE_SIZE) {
+ void *kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr, 0, from);
+ memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+ SetPageUptodate(page);
+ }
+ return 0;
+}
+
+int simple_commit_write(struct file *file, struct page *page,
+ unsigned offset, unsigned to)
+{
+ struct inode *inode = page->mapping->host;
+ loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+ /*
+ * No need to use i_size_read() here, the i_size
+ * cannot change under us because we hold the i_sem.
+ */
+ if (pos > inode->i_size)
+ i_size_write(inode, pos);
+ set_page_dirty(page);
+ return 0;
+}
+#endif
+
+void d_genocide(struct dentry *root);
+
+void d_genocide(struct dentry *root)
+{
+ struct dentry *this_parent = root;
+ struct list_head *next;
+ spin_lock(&dcache_lock);
+repeat:
+ next = this_parent->d_subdirs.next;
+resume:
+ while (next != &this_parent->d_subdirs) {
+ struct list_head *tmp = next;
+ struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
+ next = tmp->next;
+ if (d_unhashed(dentry)||!dentry->d_inode)
+ continue;
+ if (!list_empty(&dentry->d_subdirs)) {
+ this_parent = dentry;
+ goto repeat;
+ }
+ atomic_dec(&dentry->d_count);
+ }
+ if (this_parent != root) {
+ next = this_parent->d_child.next;
+ atomic_dec(&this_parent->d_count);
+ this_parent = this_parent->d_parent;
+ goto resume;
+ }
+ spin_unlock(&dcache_lock);
+}
+
+static void simple_read_inode(struct inode * inode)
+{
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+}
+
+
+int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files)
+{
+ static struct super_operations s_ops = {
+ .statfs = simple_statfs,
+ .read_inode = simple_read_inode
+ };
+ struct inode *inode;
+ struct dentry *root;
+ struct dentry *dentry;
+ int i;
+
+ s->s_blocksize = PAGE_CACHE_SIZE;
+ s->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ s->s_magic = magic;
+ s->s_op = &s_ops;
+
+ inode = new_inode(s);
+ if (!inode)
+ return -ENOMEM;
+ inode->i_mode = S_IFDIR | 0755;
+ inode->i_uid = inode->i_gid = 0;
+ inode->i_blksize = PAGE_CACHE_SIZE;
+ inode->i_blocks = 0;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_op = &simple_dir_inode_operations;
+ inode->i_fop = &simple_dir_operations;
+ root = d_alloc_root(inode);
+ if (!root) {
+ iput(inode);
+ return -ENOMEM;
+ }
+ for (i = 0; !files->name || files->name[0]; i++, files++) {
+ struct qstr name;
+ if (!files->name)
+ continue;
+ name.name = files->name;
+ name.len = strlen(name.name);
+ printk("adding file %*s\n", name.len, name.name);
+ name.hash = full_name_hash(name.name, name.len);
+ dentry = d_alloc(root, &name);
+ if (!dentry)
+ goto out;
+ inode = new_inode(s);
+ if (!inode)
+ goto out;
+ inode->i_mode = S_IFREG | files->mode;
+ inode->i_uid = inode->i_gid = 0;
+ inode->i_blksize = PAGE_CACHE_SIZE;
+ inode->i_blocks = 0;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_fop = files->ops;
+ inode->i_ino = i;
+ d_add(dentry, inode);
+ }
+ s->s_root = root;
+ return 0;
+out:
+ d_genocide(root);
+ dput(root);
+ return -ENOMEM;
+}
+
+#if 0
+static spinlock_t pin_fs_lock = SPIN_LOCK_UNLOCKED;
+
+int simple_pin_fs(char *name, struct vfsmount **mount, int *count)
+{
+ struct vfsmount *mnt = NULL;
+ spin_lock(&pin_fs_lock);
+ if (unlikely(!*mount)) {
+ spin_unlock(&pin_fs_lock);
+ mnt = do_kern_mount(name, 0, name, NULL);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+ spin_lock(&pin_fs_lock);
+ if (!*mount)
+ *mount = mnt;
+ }
+ mntget(*mount);
+ ++*count;
+ spin_unlock(&pin_fs_lock);
+ mntput(mnt);
+ return 0;
+}
+
+void simple_release_fs(struct vfsmount **mount, int *count)
+{
+ struct vfsmount *mnt;
+ spin_lock(&pin_fs_lock);
+ mnt = *mount;
+ if (!--*count)
+ *mount = NULL;
+ spin_unlock(&pin_fs_lock);
+ mntput(mnt);
+}
+
+ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
+ const void *from, size_t available)
+{
+ loff_t pos = *ppos;
+ if (pos < 0)
+ return -EINVAL;
+ if (pos >= available)
+ return 0;
+ if (count > available - pos)
+ count = available - pos;
+ if (copy_to_user(to, from + pos, count))
+ return -EFAULT;
+ *ppos = pos + count;
+ return count;
+}
+
+EXPORT_SYMBOL(dcache_dir_close);
+EXPORT_SYMBOL(dcache_dir_lseek);
+EXPORT_SYMBOL(dcache_dir_open);
+EXPORT_SYMBOL(dcache_readdir);
+EXPORT_SYMBOL(generic_read_dir);
+EXPORT_SYMBOL(simple_commit_write);
+EXPORT_SYMBOL(simple_empty);
+EXPORT_SYMBOL(simple_fill_super);
+EXPORT_SYMBOL(simple_getattr);
+EXPORT_SYMBOL(simple_link);
+EXPORT_SYMBOL(simple_lookup);
+EXPORT_SYMBOL(simple_pin_fs);
+EXPORT_SYMBOL(simple_prepare_write);
+EXPORT_SYMBOL(simple_readpage);
+EXPORT_SYMBOL(simple_release_fs);
+EXPORT_SYMBOL(simple_rename);
+EXPORT_SYMBOL(simple_rmdir);
+EXPORT_SYMBOL(simple_statfs);
+EXPORT_SYMBOL(simple_sync_file);
+EXPORT_SYMBOL(simple_unlink);
+EXPORT_SYMBOL(simple_read_from_buffer);
+EXPORT_SYMBOL(get_sb_pseudo);
+#endif
+
+/* -----------------------------------------------------------------*/
+
+
+
+/* transaction file support */
+
+/*
+ * transaction based IO methods.
+ * The file expects a single write which triggers the transaction, and then
+ * possibly a read which collects the result - which is stored in a
+ * file-local buffer.
+ */
+static ssize_t TA_write(struct file *file, const char __user *buf, size_t size, loff_t *pos)
+{
+ ino_t ino = file->f_dentry->d_inode->i_ino;
+ struct argresp *ar;
+ ssize_t rv = 0;
+ struct super_block *sb = file->f_dentry->d_inode->i_sb;
+ TA_write_ops *ops = TA_GENERIC_SB_MEMBER(sb);
+ TA_write_op *write_op;
+
+	if (ino >= ops->num_ops || ops->write_op[ino] == NULL)
+		return -EINVAL;
+	printk("welcome to TA_write: num_ops=%d, op[%d]=%p, private=%p, size=%u\n",
+	       ops->num_ops, (int)ino, ops->write_op[ino], file->private_data, (unsigned int)size);
+ write_op = ops->write_op[ino];
+ if (file->private_data)
+ return -EINVAL; /* only one write allowed per open */
+ if (size > PAGE_SIZE - sizeof(struct argresp))
+ return -EFBIG;
+
+ ar = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!ar)
+ return -ENOMEM;
+ ar->size = 0;
+ down(&file->f_dentry->d_inode->i_sem);
+ if (file->private_data)
+ rv = -EINVAL;
+ else
+ file->private_data = ar;
+ up(&file->f_dentry->d_inode->i_sem);
+ if (rv) {
+ kfree(ar);
+ return rv;
+ }
+ if (copy_from_user(ar->data, buf, size))
+ return -EFAULT;
+
+ printk("now calling write_op...\n");
+ rv = write_op(file, ar->data, size);
+ printk("write_op returned %d\n", rv);
+ if (rv>0) {
+ ar->size = rv;
+ rv = size;
+ }
+ return rv;
+}
+
+
+static ssize_t TA_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
+{
+ struct argresp *ar;
+ ssize_t rv = 0;
+
+ if (file->private_data == NULL)
+ rv = TA_write(file, buf, 0, pos);
+ if (rv < 0)
+ return rv;
+
+ ar = file->private_data;
+ if (!ar)
+ return 0;
+ if (*pos >= ar->size)
+ return 0;
+ if (*pos + size > ar->size)
+ size = ar->size - *pos;
+ if (copy_to_user(buf, ar->data + *pos, size))
+ return -EFAULT;
+ *pos += size;
+ return size;
+}
+
+static int TA_open(struct inode *inode, struct file *file)
+{
+ file->private_data = NULL;
+ return 0;
+}
+
+static int TA_release(struct inode *inode, struct file *file)
+{
+ void *p = file->private_data;
+ file->private_data = NULL;
+ kfree(p);
+ return 0;
+}
+
+
+
+
+
+
+
+
+struct file_operations transaction_ops = {
+ .write = TA_write,
+ .read = TA_read,
+ .open = TA_open,
+ .release = TA_release,
+};
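The transaction_ops protocol above (a single write per open triggers the operation, later reads return the result buffered in the argresp page) would be driven from userspace roughly as in the following sketch; the file path and command string are placeholders, not part of this commit:

    /* illustrative only: open a transaction file, run one command, read the reply */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
    	char reply[256];
    	ssize_t n;
    	int fd = open("/path/to/transaction/file", O_RDWR);  /* placeholder path */

    	if (fd < 0)
    		return 1;
    	write(fd, "some command", 12);        /* the one write runs the transaction */
    	n = read(fd, reply, sizeof(reply));   /* picks up the stored result */
    	if (n > 0)
    		fwrite(reply, 1, n, stdout);
    	close(fd);
    	return 0;
    }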
Added: branches/dlm-glue/cluster/compat_libfs.h
===================================================================
--- branches/dlm-glue/cluster/compat_libfs.h 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/compat_libfs.h 2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,36 @@
+#ifndef CLUSTER_COMPAT_LIBFS_H
+#define CLUSTER_COMPAT_LIBFS_H
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#define TA_GENERIC_SB_MEMBER(sb) ((sb)->s_fs_info)
+#else
+#define TA_GENERIC_SB_MEMBER(sb) ((sb)->u.generic_sbp)
+#endif
+
+
+/* an argresp is stored in an allocated page and holds the
+ * size of the argument or response, along with its content
+ */
+struct argresp {
+ ssize_t size;
+ char data[0];
+};
+
+typedef ssize_t (TA_write_op)(struct file *, char *, size_t);
+typedef struct _TA_write_ops
+{
+ int num_ops;
+ TA_write_op *write_op[0];
+} TA_write_ops;
+
+struct tree_descr
+{
+ char *name;
+ struct file_operations *ops;
+ int mode;
+};
+
+int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files);
+struct dentry * simple_find_child(struct dentry *dentry, struct qstr *name);
+
+#endif /* CLUSTER_COMPAT_LIBFS_H */
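As a usage sketch for this header (the file name, mode, and magic value below are placeholders, not part of this commit): a pseudo-filesystem lists its files in a tree_descr array, where the array index becomes the inode number and an empty name terminates the table, then hands it to simple_fill_super():

    static struct tree_descr example_files[] = {
    	{ NULL, NULL, 0 },                      /* ino 0: unused, skipped      */
    	{ NULL, NULL, 0 },                      /* ino 1: unused, skipped      */
    	{ "command", &transaction_ops, 0600 },  /* ino 2: one transaction file */
    	{ "", NULL, 0 },                        /* empty name ends the table   */
    };

    static int example_fill_super(struct super_block *sb)
    {
    	return simple_fill_super(sb, 0x1234abcd /* placeholder magic */, example_files);
    }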
Added: branches/dlm-glue/cluster/dlm_compat.h
===================================================================
--- branches/dlm-glue/cluster/dlm_compat.h 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/dlm_compat.h 2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,119 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlm_compat.h
+ *
+ * Compatibility stuff for 2.4
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version
+ * 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_DLM_COMPAT_H
+#define CLUSTER_DLM_COMPAT_H
+
+#include <linux/version.h>
+#include <linux/types.h>
+#include <linux/kdev_t.h>
+#include <linux/sched.h>
+#include <linux/compiler.h>
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+# include <linux/locks.h>
+#else
+# include <linux/buffer_head.h>
+#endif
+
+
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+
+#ifdef __ia64__
+extern inline void prefetch(const void *x);
+extern inline void prefetchw(const void *x);
+#else
+static inline void prefetch(const void *x);
+static inline void prefetchw(const void *x);
+#endif
+extern inline int generic_fls(int x);
+extern inline int get_bitmask_order(unsigned int count);
+/* XXX Hack to avoid warning */
+struct mem_dqinfo;
+extern inline void mark_info_dirty(struct mem_dqinfo *info);
+
+
+
+
+#define flush_scheduled_work flush_scheduled_tasks
+#define work_struct tq_struct
+#define INIT_WORK(w, f, d) INIT_TQUEUE(w, f, d)
+#define schedule_work(w) schedule_task(w)
+
+#ifdef HAVE_NPTL
+static inline void dequeue_signal_lock(struct task_struct *task,
+ sigset_t *blocked, siginfo_t *info)
+{
+ spin_lock_irq(&task->sighand->siglock);
+ dequeue_signal(blocked, info);
+ spin_unlock_irq(&task->sighand->siglock);
+}
+#else
+static inline void dequeue_signal_lock(struct task_struct *task,
+ sigset_t *blocked, siginfo_t *info)
+{
+ spin_lock_irq(&task->sigmask_lock);
+ dequeue_signal(blocked, info);
+ spin_unlock_irq(&task->sigmask_lock);
+}
+#endif
+#define kstatfs statfs
+
+
+
+/*
+ * Copied right out of the 2.6.2 kernel's buffer_head.h:
+ * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
+ * and buffer_foo() functions.
+ */
+#define BUFFER_FNS(bit, name) \
+static inline void set_buffer_##name(struct buffer_head *bh) \
+{ \
+ set_bit(BH_##bit, &(bh)->b_state); \
+} \
+static inline void clear_buffer_##name(struct buffer_head *bh) \
+{ \
+ clear_bit(BH_##bit, &(bh)->b_state); \
+} \
+static inline int buffer_##name(struct buffer_head *bh) \
+{ \
+ return test_bit(BH_##bit, &(bh)->b_state); \
+}
+
+#undef buffer_uptodate
+#undef buffer_dirty
+BUFFER_FNS(Uptodate, uptodate)
+BUFFER_FNS(Dirty, dirty)
+
+#define clear_buffer_dirty mark_buffer_clean
+
+#endif /* LINUX_VERSION_CODE < 2.6 */
+
+
+#endif /* CLUSTER_DLM_COMPAT_H */
+
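For reference, one invocation of the macro above, BUFFER_FNS(Uptodate, uptodate), expands to the following three inline helpers (spelled out only to illustrate the expansion; it adds nothing beyond the macro itself):

    static inline void set_buffer_uptodate(struct buffer_head *bh)
    {
    	set_bit(BH_Uptodate, &(bh)->b_state);
    }
    static inline void clear_buffer_uptodate(struct buffer_head *bh)
    {
    	clear_bit(BH_Uptodate, &(bh)->b_state);
    }
    static inline int buffer_uptodate(struct buffer_head *bh)
    {
    	return test_bit(BH_Uptodate, &(bh)->b_state);
    }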
Added: branches/dlm-glue/cluster/dlmcommon.h
===================================================================
--- branches/dlm-glue/cluster/dlmcommon.h 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/dlmcommon.h 2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,52 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmcommon.h
+ *
+ * Common stuff
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_DLMCOMMON_H
+#define CLUSTER_DLMCOMMON_H
+
+#define DLM_ASSERT(x) ({ if (!(x)) { printk("assert failed! %s:%d\n", __FILE__, __LINE__); BUG(); } })
+
+typedef struct _nm_ctxt nm_ctxt;
+typedef struct _dlm_ctxt dlm_ctxt;
+typedef struct _heartbeat_ctxt heartbeat_ctxt;
+
+#define CLUSTER_DISK_UUID_LEN 32 // 16 byte binary == 32 char hex string
+
+typedef struct _cluster_disk
+{
+ // uuid of disk
+ char uuid[CLUSTER_DISK_UUID_LEN+1];
+ // all the rest are for heartbeat
+ kdev_t dev;
+ u32 blocksize_bits;
+ u32 num_blocks;
+ u64 start_block;
+ util_rarray slots;
+} cluster_disk;
+
+
+#endif /* CLUSTER_DLMCOMMON_H */
Added: branches/dlm-glue/cluster/dlmmaster.c
===================================================================
--- branches/dlm-glue/cluster/dlmmaster.c 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/dlmmaster.c 2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,967 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmmaster.c
+ *
+ * standalone DLM module
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+
+
+#include "heartbeat.h"
+#include "nodemanager.h"
+#include "tcp.h"
+#include "dlmmod.h"
+
+
+
+spinlock_t dlm_master_lock = SPIN_LOCK_UNLOCKED;
+LIST_HEAD(dlm_master_list);
+
+
+static int dlm_init_mle(dlm_master_list_entry *mle, int type, dlm_ctxt *dlm,
+ dlm_lock_resource *res, struct qstr *name, int locked);
+
+static int dlm_init_mle(dlm_master_list_entry *mle, int type, dlm_ctxt *dlm,
+ dlm_lock_resource *res, struct qstr *name, int locked)
+{
+ int ret = 0;
+
+ mle->dlm = dlm;
+ mle->type = type;
+ INIT_LIST_HEAD(&mle->list);
+ memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
+ spin_lock_init(&mle->spinlock);
+ init_waitqueue_head(&mle->wq);
+ atomic_set(&mle->woken, 0);
+ atomic_set(&mle->refcnt, 1);
+ memset(mle->response_map, 0, sizeof(mle->response_map));
+ mle->master = NM_MAX_NODES;
+ mle->error = 0;
+
+ if (mle->type == DLM_MLE_MASTER)
+ mle->u.res = res;
+ else
+ strncpy(mle->u.name.name, name->name, name->len);
+
+ if (!locked)
+ spin_lock(&dlm->spinlock);
+
+ /* copy off the node_map and register hb callbacks on our copy */
+ memcpy(mle->node_map, dlm->node_map, sizeof(mle->node_map));
+ memcpy(mle->vote_map, dlm->node_map, sizeof(mle->vote_map));
+ clear_bit(dlm->group_index, mle->vote_map);
+ clear_bit(dlm->group_index, mle->node_map);
+
+#warning cannot do this here cuz this kmallocs and we are under a spinlock dammit
+ if (hb_register_callback(HB_NODE_DOWN_CB, dlm_mle_node_down, mle, DLM_HB_NODE_DOWN_PRI+1) ||
+ hb_register_callback(HB_NODE_UP_CB, dlm_mle_node_up, mle, DLM_HB_NODE_UP_PRI+1)) {
+ ret = -EINVAL;
+ }
+
+ if (!locked)
+ spin_unlock(&dlm->spinlock);
+
+ return ret;
+}
+
+
+
+
+/////////////////////////////////////////////////
+//
+// TODO: change these comments to reflect reality
+//
+// master_request(target=me)
+// wait for all responses
+// if maybe_map is 0 there are no others in progress
+// assert_master(me)
+// else (maybe_map has some nodes in it)
+// (nodes in maybe_map had better be < my node num)
+// wait for assert_master
+// endif
+//
+//
+// receive:
+// master_request(target):
+// if i own it, return YES
+// if i dont know anything about it, return NO
+// if i have it in progress
+// if my node number is lower
+// return MAYBE
+// else
+// if target < lowest_so_far, lowest_so_far=target
+// return NO
+//
+// assert_master(master):
+// if i own it, BUG()!!!
+// if i have it, but owner!=master, BUG()!!!
+// if i dont know anything about it, ignore
+// if i have it in progress
+// if lowest_so_far != master
+// BUG()!!!
+// else
+// set the owner, DONE
+//
+/////////////////////////////////////////////////
+
+
+/* remove from list and free */
+void dlm_put_mle(dlm_master_list_entry *mle)
+{
+ if (atomic_dec_and_lock(&mle->refcnt, &dlm_master_lock)) {
+ list_del(&mle->list);
+ spin_unlock(&dlm_master_lock);
+ hb_unregister_callback(HB_NODE_DOWN_CB, dlm_mle_node_down, mle);
+ hb_unregister_callback(HB_NODE_UP_CB, dlm_mle_node_up, mle);
+ kfree(mle);
+ }
+}
+
+
+
+
+/*
+ * lookup a lock resource by name.
+ * may already exist in the hashtable.
+ *
+ * if not, allocate enough for the lockres and for
+ * the temporary structure used in doing the mastering.
+ *
+ * also, do a lookup in the dlm_master_list to see
+ * if another node has begun mastering the same lock.
+ * if so, there should be a block entry in there
+ * for this name, and we should *not* attempt to master
+ * the lock here. need to wait around for that node
+ * to assert_master (or die).
+ *
+ */
+dlm_lock_resource * dlm_get_lock_resource(dlm_ctxt *dlm, struct qstr *lockname, int flags)
+{
+ dlm_lock_resource *tmpres=NULL, *res=NULL;
+ struct list_head *bucket;
+ dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
+ struct list_head *iter;
+ int blocked = 0;
+ int map_changed = 0, restart = 0, assert = 0;
+ int ret, start, bit;
+
+ bucket = &(dlm->resources[lockname->hash & DLM_HASH_MASK]);
+
+ /* pre-allocate a dlm_lock_resource and master stuff */
+ mle = kmalloc(sizeof(dlm_master_list_entry), GFP_KERNEL);
+ res = kmalloc(sizeof(dlm_lock_resource), GFP_KERNEL);
+ if (!mle || !res) {
+ printk("could not allocate memory for new lock resource!\n");
+ if (mle)
+ kfree(mle);
+ if (res)
+ kfree(res);
+ return NULL;
+ }
+
+ /* check for pre-existing lock */
+ spin_lock(&dlm->spinlock);
+ tmpres = __dlm_lookup_lock(dlm, lockname);
+ if (tmpres) {
+ spin_unlock(&dlm->spinlock);
+ /* TODO: return error, or return the lockres ?!? */
+ kfree(res);
+ kfree(mle);
+ /* waits for any outstanding work to finish
+ * will hold tmpres->spinlock on exit */
+ dlm_wait_on_lockres(tmpres);
+ return tmpres;
+ }
+
+ dlm_init_lockres(res, lockname);
+
+ if (flags & LKM_LOCAL) {
+ /* caller knows it's safe to assume it's not mastered elsewhere
+ * DONE! return right away */
+ list_add_tail(&res->list, bucket);
+ res->owner = dlm->group_index;
+ res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+
+ /* return with res->spinlock held */
+
+ /* lock ordering note: this lockres will not be
+ * visible until i release dlm->spinlock, so it
+ * is ok to release dlm->spinlock out of order here */
+ spin_lock(&res->spinlock);
+
+ spin_unlock(&dlm->spinlock);
+ return res;
+ }
+
+ /* look in master list to see if another node has started mastering this */
+ spin_lock(&dlm_master_lock);
+ list_for_each(iter, &dlm_master_list) {
+ tmpmle = list_entry(iter, dlm_master_list_entry, list);
+ if (!dlm_mle_equal(dlm, tmpmle, lockname))
+ continue;
+
+ if (tmpmle->type == DLM_MLE_MASTER) {
+ printk("impossible! master entry for nonexistent lock!\n");
+ BUG();
+ }
+ dlm_get_mle(tmpmle);
+ blocked = 1;
+ // found a block! must wait for lock to be mastered by another node
+ break;
+ }
+
+ if (!blocked) {
+ /* go ahead and try to master lock on this node */
+ if (dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 1)) {
+ printk("bug! failed to register hb callbacks\n");
+ BUG();
+ }
+ list_add(&mle->list, &dlm_master_list);
+ }
+ spin_unlock(&dlm_master_lock);
+
+ /* at this point there is either a DLM_MLE_BLOCK or a DLM_MLE_MASTER
+ * on the master list, so it's safe to add the lockres to the hashtable.
+ * anyone who finds the lock will still have to wait on the IN_PROGRESS.
+ * also, any new nodes that try to join at this point will have to wait
+ * until my dlm_master_lock list is empty, so they cannot possibly
+ * do any master requests yet... TODO
+ * ?? should i have a special type of mle just for joining nodes ??
+ * ?? could allow them to come in and put their mle on the list and sleep ?? */
+
+ /* finally add the lockres to its hash bucket */
+ list_add_tail(&res->list, bucket);
+ spin_unlock(&dlm->spinlock);
+
+ if (blocked) {
+ /* must wait for lock to be mastered elsewhere */
+ kfree(mle);
+ mle = tmpmle;
+ goto wait;
+ }
+
+ ret = -EINVAL;
+ start = 0;
+ while (1) {
+ bit = find_next_bit (mle->vote_map, NM_MAX_NODES, start);
+ if (bit >= NM_MAX_NODES) {
+ printk("no more nodes\n");
+ break;
+ }
+
+ ret = dlm_do_master_request(mle, bit);
+ if (ret < 0) {
+ // TODO
+ //printk("dlm_do_master_request returned %d!\n", ret);
+ }
+ if (mle->master != NM_MAX_NODES) {
+ // found a master!
+ break;
+ }
+ start = bit+1;
+ }
+
+wait:
+ while (1) {
+ spin_lock(&res->spinlock);
+ if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+ // another node has become the owner
+ spin_unlock(&res->spinlock);
+ break;
+ }
+ spin_unlock(&res->spinlock);
+
+ spin_lock(&mle->spinlock);
+ if (mle->master != NM_MAX_NODES) {
+ u16 m = mle->master;
+ // printk("node %u is the master!\n", m);
+ spin_unlock(&mle->spinlock);
+
+ spin_lock(&res->spinlock);
+ res->owner = m;
+ spin_unlock(&res->spinlock);
+ break;
+ }
+ restart = 0;
+ map_changed = (memcmp(mle->vote_map, mle->node_map, sizeof(mle->vote_map)) != 0);
+ if (memcmp(mle->vote_map, mle->response_map, sizeof(mle->vote_map)) == 0) {
+ // printk("every node has responded...\n");
+ if (map_changed) {
+ printk("eek! got all original nodes, but nodemap changed while collecting responses\n");
+ restart = 1;
+ }
+
+ if (mle->error) {
+ printk("ugh. some node hit an error (-ENOMEM). try the whole thing again\n");
+ mle->error = 0;
+ /* TODO: treat this just like the dead node case below,
+ * cleanup and start over, but keep the error node around */
+ restart = 1;
+ }
+
+ if ((bit = find_next_bit (mle->maybe_map, NM_MAX_NODES, 0)) >= NM_MAX_NODES) {
+ /* no other nodes are in-progress */
+ /* those nodes should all be locking out this lockid until I assert */
+ /* they should have put a dummy entry on dlm_master_list */
+ /* need to assert myself as the master */
+
+ // printk("I am the only node in-progress! asserting myself as master\n");
+ assert = 1;
+ } else {
+ /* other nodes are in-progress */
+ if (map_changed && !test_bit(bit, mle->node_map)) {
+ /* TODO: need to copy the node_map into the vote_map, zero
+ * everything out and start over */
+ printk("need to handle this case! winning node %u just died!\n", bit);
+ restart = 1;
+ }
+
+ if (bit > dlm->group_index) {
+ // printk("next in-progress node (%u) is higher than me (%u)\n",
+ // bit, dlm->group_index);
+
+ /* nodes not in-progress should be locking out this lockid until I assert */
+ /* in-progress nodes should match me up with their lowest maybe_map bit */
+ /* need to assert myself as the master */
+
+ // printk("I am the lowest node! asserting myself as master\n");
+ assert = 1;
+ } else {
+ /* need to sit around and wait for assert */
+ /* my lowest maybe_map bit should be the one to assert */
+ /* just fall through and sleep. should be woken by the handler */
+
+ // printk("sleeping while waiting for %u to assert himself as master\n", bit);
+ }
+ }
+ } else {
+ if (map_changed) {
+ /* TODO: need to handle this */
+ printk("eek! nodemap changed while collecting responses\n");
+ restart = 1;
+ }
+ // printk("still waiting for all nodes to respond...\n");
+ }
+
+ if (restart && assert)
+ assert = 0;
+
+ /* make sure to tell any other nodes that i am mastering this */
+ if (assert)
+ mle->master = dlm->group_index;
+
+ spin_unlock(&mle->spinlock);
+
+ if (assert) {
+ ret = dlm_do_assert_master(mle);
+ // printk("assert returned %d!\n", ret);
+ if (ret == 0) {
+ spin_lock(&res->spinlock);
+ res->owner = dlm->group_index;
+ spin_unlock(&res->spinlock);
+ // printk("wooo! i am the owner. phew!\n");
+ break;
+ } else
+ restart = 1;
+ }
+ if (restart) {
+ printk("something happened such that the master process needs to be restarted!\n");
+ /* TODO: clear it all out and start over */
+ }
+
+ atomic_set(&mle->woken, 0);
+ ret = util_wait_atomic_eq(&mle->wq, &mle->woken, 1, 5000);
+ }
+ dlm_put_mle(mle);
+
+ spin_lock(&res->spinlock);
+ res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+ wake_up(&res->wq);
+
+ /* exits holding res->spinlock */
+ return res;
+}
+
+
+
+
+/*
+ * locks that can be taken here:
+ * dlm->spinlock
+ * res->spinlock
+ * mle->spinlock
+ * dlm_master_list
+ *
+ * if possible, TRIM THIS DOWN!!!
+ */
+int dlm_master_request_handler(net_msg *msg, u32 len, void *data)
+{
+ u8 response = DLM_MASTER_RESP_MAYBE;
+ dlm_ctxt *dlm = data;
+ dlm_lock_resource *res;
+ dlm_master_request *request = (dlm_master_request *) msg->buf;
+ dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
+ struct qstr lockname = { .name=request->name, .len=request->namelen };
+ int found;
+ struct list_head *iter;
+
+ lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+way_up_top:
+ spin_lock(&dlm->spinlock);
+ res = __dlm_lookup_lock(dlm, &lockname);
+ if (res) {
+ spin_unlock(&dlm->spinlock);
+
+ /* take care of the easy cases up front */
+ spin_lock(&res->spinlock);
+ if (res->owner == dlm->group_index) {
+ spin_unlock(&res->spinlock);
+ // printk("this node is the master\n");
+ response = DLM_MASTER_RESP_YES;
+ if (mle)
+ kfree(mle);
+ goto send_response;
+ } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+ spin_unlock(&res->spinlock);
+ // printk("node %u is the master\n", res->owner);
+ response = DLM_MASTER_RESP_NO;
+ if (mle)
+ kfree(mle);
+ goto send_response;
+ }
+
+ /* ok, there is no owner. either this node is
+ * being blocked, or it is actively trying to
+ * master this lock. */
+ if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
+ printk("bug! lock with no owner should be in-progress!\n");
+ BUG();
+ }
+
+ // printk("lockres is in progress...\n");
+ found = 0;
+ spin_lock(&dlm_master_lock);
+ list_for_each(iter, &dlm_master_list) {
+ tmpmle = list_entry(iter, dlm_master_list_entry, list);
+ if (!dlm_mle_equal(dlm, tmpmle, &lockname))
+ continue;
+
+ dlm_get_mle(tmpmle);
+ spin_lock(&tmpmle->spinlock);
+ if (tmpmle->type == DLM_MLE_BLOCK) {
+ // printk("this node is waiting for lockres to be mastered\n");
+ response = DLM_MASTER_RESP_NO;
+ } else {
+ // printk("this node is attempting to master lockres\n");
+ response = DLM_MASTER_RESP_MAYBE;
+ }
+ set_bit(request->node_idx, tmpmle->maybe_map);
+ spin_unlock(&tmpmle->spinlock);
+
+ spin_unlock(&dlm_master_lock);
+ spin_unlock(&res->spinlock);
+
+ dlm_put_mle(tmpmle);
+ if (mle)
+ kfree(mle);
+ goto send_response;
+ }
+ spin_unlock(&dlm_master_lock);
+ spin_unlock(&res->spinlock);
+ printk("bug bug bug!!! no mle found for this lock!\n");
+ BUG();
+ }
+
+ /*
+ * lockres doesn't exist on this node
+ * if there is an MLE_BLOCK, return NO
+ * if there is an MLE_MASTER, return MAYBE
+ * otherwise, add an MLE_BLOCK, return NO
+ */
+ found = 0;
+ spin_lock(&dlm_master_lock);
+ list_for_each(iter, &dlm_master_list) {
+ tmpmle = list_entry(iter, dlm_master_list_entry, list);
+ if (!dlm_mle_equal(dlm, tmpmle, &lockname))
+ continue;
+ dlm_get_mle(tmpmle);
+ found = 1;
+ break;
+ }
+
+ if (!found) {
+ /* this lockid has never been seen on this node yet */
+ // printk("no mle found\n");
+ if (!mle) {
+ spin_unlock(&dlm_master_lock);
+ spin_unlock(&dlm->spinlock);
+
+ mle = kmalloc(sizeof(dlm_master_list_entry) + lockname.len, GFP_KERNEL);
+ if (!mle) {
+ // bad bad bad... this sucks.
+ response = DLM_MASTER_RESP_ERROR;
+ goto send_response;
+ }
+ if (dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, &lockname, 0)) {
+ printk("eeek!\n");
+ response = DLM_MASTER_RESP_ERROR;
+ dlm_put_mle(mle);
+ goto send_response;
+ }
+ goto way_up_top;
+ }
+
+ // printk("this is second time thru, already allocated, add the block.\n");
+ set_bit(request->node_idx, mle->maybe_map);
+ list_add(&mle->list, &dlm_master_list);
+ response = DLM_MASTER_RESP_NO;
+ } else {
+ // printk("mle was found\n");
+ spin_lock(&tmpmle->spinlock);
+ if (tmpmle->type == DLM_MLE_BLOCK)
+ response = DLM_MASTER_RESP_NO;
+ else
+ response = DLM_MASTER_RESP_MAYBE;
+ set_bit(request->node_idx, tmpmle->maybe_map);
+ spin_unlock(&tmpmle->spinlock);
+ dlm_put_mle(tmpmle);
+ }
+ spin_unlock(&dlm_master_lock);
+ spin_unlock(&dlm->spinlock);
+
+send_response:
+ //ret = dlm_do_master_request_resp(dlm, &lockname, response, request->node_idx);
+ //printk("response returned %d\n", ret);
+
+ // printk("sending response %d to other node\n", response);
+ return response;
+}
+
+/* NOTE: when doing node recovery, run the dlm_master_list looking for the dead node in
+ * any maybe_map... clear that bit, and if now empty, clear the whole thing */
+
+/*
+ * locks that can be taken here:
+ * mle->spinlock
+ * dlm_master_list
+ *
+ */
+int dlm_master_request_resp_handler(net_msg *msg, u32 len, void *data)
+{
+ dlm_ctxt *dlm = data;
+ dlm_master_list_entry *mle = NULL;
+ dlm_master_request_resp *resp = (dlm_master_request_resp *) msg->buf;
+ int found = 0, wake = 0;
+ struct list_head *iter;
+ struct qstr lockname = { .name=resp->name, .len=resp->namelen };
+
+
+ lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+ spin_lock(&dlm_master_lock);
+ list_for_each(iter, &dlm_master_list) {
+ mle = list_entry(iter, dlm_master_list_entry, list);
+ if (!dlm_mle_equal(dlm, mle, &lockname)) {
+ mle = NULL;
+ continue;
+ }
+
+ dlm_get_mle(mle);
+ if (mle->type == DLM_MLE_BLOCK) {
+ printk("eek! cannot get a response for a block!\n");
+ break;
+ }
+ found = 1;
+ wake = 0;
+ spin_lock(&mle->spinlock);
+ switch (resp->response) {
+ case DLM_MASTER_RESP_YES:
+ set_bit(resp->node_idx, mle->response_map);
+ // printk("woot! node %u is the master!\n", resp->node_idx);
+ mle->master = resp->node_idx;
+ wake = 1;
+ break;
+ case DLM_MASTER_RESP_NO:
+ // printk("node %u is not the master, not in-progress\n", resp->node_idx);
+ set_bit(resp->node_idx, mle->response_map);
+ if (memcmp(mle->response_map, mle->vote_map, sizeof(mle->vote_map))==0)
+ wake = 1;
+ break;
+ case DLM_MASTER_RESP_MAYBE:
+ // printk("node %u is not the master, but IS in-progress\n", resp->node_idx);
+ set_bit(resp->node_idx, mle->response_map);
+ set_bit(resp->node_idx, mle->maybe_map);
+ if (memcmp(mle->response_map, mle->vote_map, sizeof(mle->vote_map))==0)
+ wake = 1;
+ break;
+ case DLM_MASTER_RESP_ERROR:
+ printk("node %u hit an -ENOMEM! try this whole thing again\n", resp->node_idx);
+ mle->error = 1;
+ wake = 1;
+ break;
+ default:
+ printk("bad response! %u\n", resp->response);
+ break;
+ }
+ if (wake) {
+ atomic_set(&mle->woken, 1);
+ wake_up(&mle->wq);
+ }
+ spin_unlock(&mle->spinlock);
+ break;
+ }
+ spin_unlock(&dlm_master_lock);
+
+ if (found)
+ dlm_put_mle(mle);
+ else
+ printk("hrrm... got a master resp but found no matching request\n");
+ return 0;
+}
+
+/*
+ * locks that can be taken here:
+ * dlm->spinlock
+ * res->spinlock
+ * mle->spinlock
+ * dlm_master_list
+ *
+ * if possible, TRIM THIS DOWN!!!
+ */
+int dlm_assert_master_handler(net_msg *msg, u32 len, void *data)
+{
+ dlm_ctxt *dlm = data;
+ dlm_master_list_entry *mle = NULL;
+ dlm_assert_master *assert = (dlm_assert_master *)msg->buf;
+ dlm_lock_resource *res;
+ int bit;
+ struct list_head *iter;
+ struct qstr lockname = { .name=assert->name, .len=assert->namelen };
+
+ lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+ spin_lock(&dlm->spinlock);
+
+ /* find the MLE */
+ spin_lock(&dlm_master_lock);
+ list_for_each(iter, &dlm_master_list) {
+ mle = list_entry(iter, dlm_master_list_entry, list);
+ if (dlm_mle_equal(dlm, mle, &lockname)) {
+ dlm_get_mle(mle);
+ break;
+ }
+ mle = NULL;
+ }
+ if (!mle) {
+ printk("EEEEEEK! just got an assert_master from %u, but no MLE for it!\n",
+ assert->node_idx);
+ spin_unlock(&dlm_master_lock);
+ goto check_lockres;
+ }
+ if ((bit = find_next_bit (mle->maybe_map, NM_MAX_NODES, 0)) >= NM_MAX_NODES) {
+ printk("EEK! no bits set in the maybe_map, but %u is asserting!\n",
+ assert->node_idx);
+ BUG();
+ } else if (bit != assert->node_idx) {
+ /* TODO: is this ok? */
+ printk("EEK! expected %u to be the master, but %u is asserting!\n",
+ bit, assert->node_idx);
+ BUG();
+ }
+ spin_unlock(&dlm_master_lock);
+
+ /* ok everything checks out with the MLE
+ * now check to see if there is a lockres */
+check_lockres:
+ res = __dlm_lookup_lock(dlm, &lockname);
+ if (res) {
+ spin_lock(&res->spinlock);
+ if (!mle) {
+ if (res->owner != assert->node_idx) {
+ printk("EEEEeeEEeeEEEK! assert_master from %u, but current owner is %u!\n",
+ assert->node_idx, res->owner);
+ BUG();
+ }
+ } else {
+ if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+ printk("EEEEEEEEEEEEEEEEEK!!! got assert_master from node %u, but %u is the owner!\n",
+ assert->node_idx, res->owner);
+ printk("goodnite!\n");
+ BUG();
+ }
+ if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
+ printk("bug! got assert from %u, but lock with no owner should be in-progress!\n",
+ assert->node_idx);
+ BUG();
+ }
+ }
+ spin_unlock(&res->spinlock);
+ }
+ spin_unlock(&dlm->spinlock);
+
+ // printk("woo! got an assert_master from node %u!\n", assert->node_idx);
+ if (mle) {
+ spin_lock(&mle->spinlock);
+ mle->master = assert->node_idx;
+ atomic_set(&mle->woken, 1);
+ wake_up(&mle->wq);
+ spin_unlock(&mle->spinlock);
+
+ /* if this is the last put, it will be removed from the list */
+ dlm_put_mle(mle);
+ }
+ return 0;
+}
+
+
+int dlm_do_master_request(dlm_master_list_entry *mle, int to)
+{
+ struct inode *inode = NULL;
+ dlm_ctxt *dlm = mle->dlm;
+ dlm_master_request request;
+ int ret, response=0;
+
+ memset(&request, 0, sizeof(request));
+ request.node_idx = dlm->group_index;
+ if (mle->type == DLM_MLE_BLOCK) {
+ request.namelen = mle->u.name.len;
+ strncpy(request.name, mle->u.name.name, request.namelen);
+ } else {
+ request.namelen = mle->u.res->lockname.len;
+ strncpy(request.name, mle->u.res->lockname.name, request.namelen);
+ }
+
+ ret = -EINVAL;
+ inode = nm_get_group_node_by_index(dlm->group, to);
+ if (inode) {
+ ret = net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, sizeof(request), inode, &response);
+ iput(inode);
+ if (ret >= 0) {
+ spin_lock(&mle->spinlock);
+ switch (response) {
+ case DLM_MASTER_RESP_YES:
+ set_bit(to, mle->response_map);
+ // printk("woot! node %u is the master!\n", to);
+ mle->master = to;
+ break;
+ case DLM_MASTER_RESP_NO:
+ // printk("node %u is not the master, not in-progress\n", to);
+ set_bit(to, mle->response_map);
+ break;
+ case DLM_MASTER_RESP_MAYBE:
+ // printk("node %u is not the master, but IS in-progress\n", to);
+ set_bit(to, mle->response_map);
+ set_bit(to, mle->maybe_map);
+ break;
+ case DLM_MASTER_RESP_ERROR:
+ printk("node %u hit an -ENOMEM! try this whole thing again\n", to);
+ mle->error = 1;
+ break;
+ default:
+ printk("bad response! %u\n", response);
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock(&mle->spinlock);
+ } else {
+ printk("net_send_message returned %d!\n", ret);
+ }
+ } else {
+ printk("nm_get_group_node_by_index failed to find inode for node %d!\n", to);
+ }
+ return ret;
+}
+
+int dlm_do_master_request_resp(dlm_ctxt *dlm, struct qstr *name, int response, int to)
+{
+ struct inode *inode = NULL;
+ dlm_master_request_resp resp;
+ int ret;
+
+ memset(&resp, 0, sizeof(resp));
+ resp.node_idx = dlm->group_index;
+ resp.response = response;
+ resp.namelen = name->len;
+ strncpy(resp.name, name->name, name->len);
+
+ inode = nm_get_group_node_by_index(dlm->group, to);
+ if (!inode)
+ return -EINVAL;
+ ret = net_send_message(DLM_MASTER_REQUEST_RESP_MSG, dlm->key, &resp, sizeof(resp), inode, NULL);
+ iput(inode);
+ return ret;
+}
+
+/*
+ * NOTE: this can be used for debugging
+ * can periodically run all locks owned by this node
+ * and re-assert across the cluster...
+ */
+int dlm_do_assert_master(dlm_master_list_entry *mle)
+{
+ struct inode *inode = NULL;
+ dlm_ctxt *dlm = mle->dlm;
+ dlm_assert_master assert;
+ int to, start = 0, ret = 0, tmpret;
+
+ while (1) {
+ to = find_next_bit (mle->vote_map, NM_MAX_NODES, start);
+ if (to >= NM_MAX_NODES) {
+ // printk("no more nodes\n");
+ break;
+ }
+ // printk("sending assert master to %d\n", to);
+
+ memset(&assert, 0, sizeof(assert));
+ assert.node_idx = dlm->group_index;
+ if (mle->type == DLM_MLE_BLOCK) {
+ assert.namelen = mle->u.name.len;
+ strncpy(assert.name, mle->u.name.name, assert.namelen);
+ } else {
+ assert.namelen = mle->u.res->lockname.len;
+ strncpy(assert.name, mle->u.res->lockname.name, assert.namelen);
+ }
+
+ inode = nm_get_group_node_by_index(dlm->group, to);
+ if (!inode) {
+ tmpret = -EINVAL;
+ printk("could not get nm info for node %d! need to retry this whole thing\n", to);
+ ret = tmpret;
+ break;
+ }
+ tmpret = net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, &assert, sizeof(assert), inode, NULL);
+ iput(inode);
+
+ if (tmpret < 0) {
+ // TODO
+ // printk("assert_master returned %d!\n", tmpret);
+ ret = tmpret;
+ break;
+ }
+ start = to+1;
+ }
+
+ return ret;
+}
+
+
+
+
+
+
+void dlm_mle_node_down(struct inode *group, struct inode *node, int idx, void *data)
+{
+ //int ret;
+ //struct inode *node = ptr2;
+
+ dlm_master_list_entry *mle;
+ dlm_ctxt *dlm;
+
+ mle = data;
+ if (!mle) {
+ printk("eek! NULL mle!\n");
+ return;
+ }
+ if (!mle->dlm) {
+ printk("eek! NULL dlm\n");
+ return;
+ }
+ dlm = mle->dlm;
+ if (dlm->group != group)
+ return;
+
+ spin_lock(&mle->spinlock);
+
+ if (!test_bit(idx, mle->node_map))
+ printk("node %u already removed from nodemap!\n", idx);
+ else
+ clear_bit(idx, mle->node_map);
+
+#if 0
+ if (test_bit(idx, mle->recovery_map))
+ printk("node %u already added to recovery map!\n", idx);
+ else
+ set_bit(idx, mle->recovery_map);
+#endif
+ spin_unlock(&mle->spinlock);
+}
+
+void dlm_mle_node_up(struct inode *group, struct inode *node, int idx, void *data)
+{
+ //struct inode *node = ptr2;
+ dlm_master_list_entry *mle;
+ dlm_ctxt *dlm;
+
+ mle = data;
+ if (!mle) {
+ printk("eek! NULL mle!\n");
+ return;
+ }
+ if (!mle->dlm) {
+ printk("eek! NULL dlm\n");
+ return;
+ }
+ dlm = mle->dlm;
+ if (dlm->group != group)
+ return;
+
+ spin_lock(&mle->spinlock);
+
+#if 0
+ if (test_bit(idx, mle->recovery_map))
+ printk("BUG!!! node up message on node in recovery (%u)!!!\n", idx);
+ else
+#endif
+ {
+ if (test_bit(idx, mle->node_map))
+ printk("node %u already in node map!!!\n", idx);
+ else
+ set_bit(idx, mle->node_map);
+ }
+
+ spin_unlock(&mle->spinlock);
+}
Added: branches/dlm-glue/cluster/dlmmod.c
===================================================================
--- branches/dlm-glue/cluster/dlmmod.c 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/dlmmod.c 2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,1652 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmmod.c
+ *
+ * standalone DLM module
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+
+
+#include "heartbeat.h"
+#include "nodemanager.h"
+#include "tcp.h"
+#include "dlmmod.h"
+
+MODULE_LICENSE ("GPL");
+MODULE_AUTHOR("Oracle Corporation");
+//MODULE_DESCRIPTION("Oracle DLM");
+
+
+/*
+ *
+ * spinlock lock ordering: if multiple locks are needed, always obey this ordering:
+ * dlm_domain_lock -> dlm_ctxt -> dlm_lock_resource -> dlm_lock
+ *
+ */
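+/*
+ * A minimal sketch of the nesting described above, using the variable names
+ * used elsewhere in this file (dlm, res, lock); illustration only:
+ *
+ *   spin_lock(&dlm_domain_lock);
+ *   spin_lock(&dlm->spinlock);
+ *   spin_lock(&res->spinlock);
+ *   spin_lock(&lock->spinlock);
+ *   ...
+ *   spin_unlock(&lock->spinlock);
+ *   spin_unlock(&res->spinlock);
+ *   spin_unlock(&dlm->spinlock);
+ *   spin_unlock(&dlm_domain_lock);
+ *
+ * Taking any pair in the opposite order risks an ABBA deadlock.
+ */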
+
+
+static int __init dlm_driver_entry (void);
+static int dlm_read_params(void);
+static void __exit dlm_driver_exit (void);
+
+
+
+LIST_HEAD(dlm_domains);
+spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED;
+u16 dlm_global_index = NM_MAX_NODES;
+static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED;
+static u64 dlm_next_cookie = 1;
+
+dlm_status dlm_send_remote_convert_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type);
+dlm_status dlm_send_remote_lock_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags);
+int dlm_lock_owner_broadcast(dlm_ctxt *dlm, dlm_lock_resource *res);
+static dlm_ctxt * __dlm_lookup_domain(char *domain);
+int dlm_send_proxy_ast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int type, int blocked_type);
+
+void dlm_wait_on_lockres(dlm_lock_resource *res);
+void __dlm_wait_on_lockres(dlm_lock_resource *res);
+
+
+/* ----------------------------------------------------------------- */
+
+extern spinlock_t dlm_master_lock;
+extern struct list_head dlm_master_list;
+
+typedef struct _dlm_create_lock
+{
+ u16 node_idx;
+ s8 requested_type;
+ u8 namelen;
+ u8 name[NM_MAX_NAME_LEN];
+ u64 cookie;
+} dlm_create_lock;
+
+typedef struct _dlm_convert_lock
+{
+ u16 node_idx;
+ s8 requested_type;
+ u8 namelen;
+ u8 name[NM_MAX_NAME_LEN];
+ u64 cookie;
+} dlm_convert_lock;
+
+typedef struct _dlm_unlock_lock
+{
+ u32 flags;
+ u16 node_idx;
+ u8 namelen;
+ u8 name[NM_MAX_NAME_LEN];
+ u64 cookie;
+} dlm_unlock_lock;
+
+typedef struct _dlm_proxy_ast
+{
+ u16 node_idx;
+ u8 type;
+ u8 blocked_type;
+ u8 namelen;
+ u8 name[NM_MAX_NAME_LEN];
+ u64 cookie;
+} dlm_proxy_ast;
+
+int dlm_create_lock_handler(net_msg *msg, u32 len, void *data);
+int dlm_convert_lock_handler(net_msg *msg, u32 len, void *data);
+int dlm_proxy_ast_handler(net_msg *msg, u32 len, void *data);
+
+int dlm_unlock_lock_handler(net_msg *msg, u32 len, void *data);
+dlm_status dlm_send_remote_unlock_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, dlm_lockstatus *lksb, int flags);
+
+/* ----------------------------------------------------------------- */
+
+
+
+
+/*
+ * dlm_driver_entry()
+ *
+ * Driver entry point. Called on insmod.
+ */
+static int __init dlm_driver_entry (void)
+{
+ int status;
+
+
+ printk("Loaded dlm Driver module\n");
+ status = dlm_read_params();
+ if (status < 0)
+ return -1;
+
+ dlm_global_index = nm_this_node(NULL);
+ if (dlm_global_index == NM_MAX_NODES)
+ return -1;
+
+ return 0;
+} /* dlm_driver_entry */
+
+/*
+ * dlm_read_params()
+ *
+ * Read insmod params
+ */
+static int dlm_read_params(void)
+{
+ int status = 0;
+ return status;
+} /* dlm_read_params */
+
+
+/*
+ * dlm_driver_exit()
+ *
+ * Called on rmmod
+ */
+static void __exit dlm_driver_exit (void)
+{
+ printk("Unloaded dlm Driver module\n");
+ return;
+} /* dlm_driver_exit */
+
+
+dlm_status dlmlock(dlm_ctxt *dlm, int mode, dlm_lockstatus *lksb, int flags, char *name,
+ dlm_astlockfunc_t *ast, void *data, dlm_bastlockfunc_t *bast)
+{
+ dlm_status status;
+ dlm_lock_resource *res;
+ dlm_lock *lock = NULL;
+ char *buf = NULL;
+ int convert = 0, recovery = 0;
+ struct qstr q;
+
+ if (!lksb)
+ return DLM_BADARGS;
+
+ status = DLM_BADPARAM;
+ if (mode != LKM_EXMODE && mode != LKM_PRMODE && mode != LKM_NLMODE)
+ goto error_status;
+
+ if (flags & ~LKM_VALID_FLAGS)
+ goto error_status;
+
+ convert = (flags & LKM_CONVERT);
+ recovery = (flags & LKM_RECOVERY);
+
+ if (recovery && (!dlm_is_recovery_lock(name, strlen(name)) ||
+ convert) ) {
+ goto error_status;
+ }
+
+
+ if (convert) {
+ /* if converting, must pass in a valid dlm_lock */
+ if (!lksb->lockid || !lksb->lockid->lockres)
+ goto error_status;
+ lock = lksb->lockid;
+
+ /* XXX: for ocfs2 purposes, the ast/bast/astdata/lksb are essentially
+ * static after the original lock call. convert requests will check
+ * to ensure that everything is the same and pass DLM_BADARGS if not.
+ * this means that DLM_DENIED_NOASTS will never be returned.
+ */
+#warning differs from spec here!
+
+ if (lock->lksb != lksb || lock->ast != ast ||
+ lock->bast != bast || lock->astdata != data) {
+ status = DLM_BADARGS;
+ printk("ERROR new args: lksb=%p, ast=%p, bast=%p, astdata=%p\n",
+ lksb, ast, bast, data);
+ printk(" orig args: lksb=%p, ast=%p, bast=%p, astdata=%p\n",
+ lock->lksb, lock->ast, lock->bast, lock->astdata);
+ goto error_status;
+ }
+ res = lock->lockres;
+
+ down_read(&dlm->recovery_sem);
+ spin_lock(&res->spinlock);
+ if (flags & LKM_LOCAL) {
+ printk("strange LOCAL convert request!\n");
+ if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+ spin_unlock(&res->spinlock);
+ status = DLM_BADPARAM;
+ goto up_error;
+ }
+ res->owner = dlm->group_index;
+ printk("set owner to this node. you SURE thats what you want!?\n");
+ }
+ status = do_dlmconvert(dlm, res, lock, flags, mode);
+ } else {
+ status = DLM_BADARGS;
+ if (!name)
+ goto error;
+
+ status = DLM_IVBUFLEN;
+ q.len = strlen(name);
+ if (q.len > DLM_LOCKID_NAME_MAX)
+ goto error;
+
+ status = DLM_SYSERR;
+ buf = kmalloc(q.len+1, GFP_KERNEL);
+ if (!buf)
+ goto error;
+
+ memcpy(buf, name, q.len);
+ buf[q.len] = 0;
+ q.name = buf;
+ q.hash = full_name_hash(q.name, q.len);
+
+ if (!recovery)
+ down_read(&dlm->recovery_sem);
+{
+ union {
+ u64 q;
+ u32 hilo[2];
+ } u1, u2;
+ rdtsc(u1.hilo[0], u1.hilo[1]);
+ res = dlm_get_lock_resource(dlm, &q, flags);
+ rdtsc(u2.hilo[0], u2.hilo[1]);
+ printk("dlm_get_lock_resource took %llu cycles\n", u2.q-u1.q);
+}
+ if (!res) {
+ status = DLM_IVLOCKID;
+ goto up_error;
+ }
+ status = do_dlmlock(dlm, res, lksb, flags, mode, ast, bast, data);
+ if (status != DLM_NORMAL)
+ goto up_error;
+ }
+
+ /* TODO: lvb */
+ if (!recovery)
+ up_read(&dlm->recovery_sem);
+ return status;
+
+up_error:
+ if (!recovery)
+ up_read(&dlm->recovery_sem);
+error:
+ if (buf)
+ kfree(buf);
+ lksb->lockid = NULL;
+
+error_status:
+ // this is kind of unnecessary
+ lksb->status = status;
+ return status;
+}
+
+dlm_status do_dlmlock(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lockstatus *lksb, int flags,
+ int type, dlm_astlockfunc_t *ast, dlm_bastlockfunc_t *bast, void *data)
+{
+ dlm_lock *tmplock;
+ dlm_status status;
+ u8 *c;
+
+ dlmprintk("type=%d\n", type);
+
+ status = DLM_SYSERR;
+ tmplock = kmalloc(sizeof(dlm_lock), GFP_KERNEL);
+ if (!tmplock)
+ goto error;
+
+ memset(tmplock, 0, sizeof(dlm_lock));
+ INIT_LIST_HEAD(&tmplock->list);
+ INIT_LIST_HEAD(&tmplock->ast_list);
+ spin_lock_init(&tmplock->spinlock);
+ tmplock->lockres = res;
+ tmplock->type = type;
+ tmplock->convert_type = LKM_IVMODE;
+ tmplock->highest_blocked = LKM_IVMODE;
+ tmplock->node = dlm->group_index;
+ tmplock->ast = ast;
+ tmplock->bast = bast;
+ tmplock->astdata = data;
+ tmplock->lksb = lksb;
+
+ lksb->lockid = tmplock;
+
+ c = (u8 *)(&tmplock->cookie);
+
+ spin_lock(&dlm_cookie_lock);
+ tmplock->cookie = dlm_next_cookie;
+ dlm_next_cookie++;
+ if (dlm_next_cookie & 0xff00000000000000ull) {
+ printk("eek! this node's cookie will now wrap!\n");
+ dlm_next_cookie = 1;
+ }
+ c[7] = (u8)(tmplock->node & 0x00ff);
+ spin_unlock(&dlm_cookie_lock);
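+ /* note: the counter above supplies the low seven bytes of the cookie;
+ * c[7] (the most significant byte on a little-endian layout, which is
+ * what the c[7] indexing assumes) is stamped with the low byte of the
+ * owning node index, and the wrap check guards that top byte. */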
+
+ if (res->owner == dlm->group_index)
+ status = dlmlock_local(dlm, res, tmplock, flags);
+ else
+ status = dlmlock_remote(dlm, res, tmplock, flags);
+error:
+ if (status != DLM_NORMAL) {
+ if (tmplock)
+ kfree(tmplock);
+ lksb->lockid = NULL;
+ }
+ return status;
+}
+
+
+
+
+/* must be already holding lockres->spinlock */
+dlm_status dlmlock_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags)
+{
+ struct list_head *iter;
+ dlm_lock *tmplock;
+ int got_it = 0;
+
+ BUG_ON(!lock);
+ BUG_ON(!res);
+ BUG_ON(!dlm);
+
+ if (lock->node == dlm->group_index) {
+ BUG_ON(!lock->lksb);
+ }
+
+ dlmprintk("type=%d\n", lock->type);
+
+ list_for_each(iter, &res->granted) {
+ tmplock = list_entry(iter, dlm_lock, list);
+ if (!dlm_lock_compatible(tmplock->type, lock->type)) {
+ list_add_tail(&lock->list, &res->blocked);
+ goto done;
+ }
+ }
+
+ list_for_each(iter, &res->converting) {
+ tmplock = list_entry(iter, dlm_lock, list);
+ if (!dlm_lock_compatible(tmplock->type, lock->type)) {
+ list_add_tail(&lock->list, &res->blocked);
+ goto done;
+ }
+ }
+
+ /* got it right away */
+
+ /* if it is a remote request, proxy
+ * handler will set the lksb status */
+ if (lock->node == dlm->group_index)
+ lock->lksb->status = DLM_NORMAL;
+
+ list_add_tail(&lock->list, &res->granted);
+
+ if (dlm_do_ast(dlm, res, lock) < 0)
+ printk("eek\n");
+ got_it = 1;
+
+done:
+ spin_unlock(&res->spinlock);
+ dlm_kick_thread(dlm, res);
+ if (!got_it && (flags & LKM_NOQUEUE)) {
+ return DLM_NOTQUEUED;
+ }
+ return DLM_NORMAL;
+}
+
+/* must be already holding lockres->spinlock */
+dlm_status dlmlock_remote(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags)
+{
+ dlm_status status = DLM_DENIED;
+
+ dlmprintk("type=%d\n", lock->type);
+
+ if (res->state & DLM_LOCK_RES_RECOVERING) {
+ status = DLM_RECOVERING;
+ goto bail;
+ }
+
+ /* will exit this call with spinlock held */
+ __dlm_wait_on_lockres(res);
+ res->state |= DLM_LOCK_RES_IN_PROGRESS;
+ /* add lock to local (secondary) queue */
+ list_add_tail(&lock->list, &res->blocked);
+ spin_unlock(&res->spinlock);
+
+ /* spec seems to say that you will get DLM_NORMAL when the lock
+ * has been queued, meaning we need to wait for a reply here. */
+ status = dlm_send_remote_lock_request(dlm, res, lock, flags);
+
+ spin_lock(&res->spinlock);
+ res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+ if (status != DLM_NORMAL) {
+ /* remove from local queue if it failed */
+ list_del(&lock->list);
+ }
+bail:
+ spin_unlock(&res->spinlock);
+ return status;
+}
+
+
+/* must be already holding lockres->spinlock */
+dlm_status do_dlmconvert(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type)
+{
+ dlm_status status;
+
+{
+ union {
+ u64 q;
+ u32 hilo[2];
+ } u1, u2;
+ rdtsc(u1.hilo[0], u1.hilo[1]);
+
+ if (res->owner == dlm->group_index)
+ status = dlmconvert_local(dlm, res, lock, flags, type);
+ else
+ status = dlmconvert_remote(dlm, res, lock, flags, type);
+
+ rdtsc(u2.hilo[0], u2.hilo[1]);
+ printk("dlmconvert took %llu cycles\n", u2.q-u1.q);
+}
+ return status;
+}
+
+/* must be already holding lockres->spinlock */
+dlm_status dlmconvert_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type)
+{
+ dlm_status status = DLM_NORMAL;
+ struct list_head *iter;
+ dlm_lock *tmplock=NULL;
+ int remote_in_place = 0;
+
+ dlmprintk("type=%d, convert_type=%d, new convert_type=%d\n", lock->type, lock->convert_type, type);
+
+ spin_lock(&lock->spinlock);
+
+ /* already converting? */
+ if (lock->convert_type != LKM_IVMODE) {
+ printk("attempted to convert a lock with a lock conversion pending\n");
+ spin_unlock(&lock->spinlock);
+ spin_unlock(&res->spinlock);
+ return DLM_DENIED;
+ }
+
+ /* must be on grant queue to convert */
+ if (!dlm_lock_on_list(&res->granted, lock)) {
+ printk("attempted to convert a lock not on grant queue\n");
+ spin_unlock(&lock->spinlock);
+ spin_unlock(&res->spinlock);
+ return DLM_DENIED;
+ }
+
+
+ /* in-place downconvert? */
+ if (type <= lock->type)
+ goto grant;
+
+ /* upconvert from here on */
+ status = DLM_NORMAL;
+ list_for_each(iter, &res->granted) {
+ tmplock = list_entry(iter, dlm_lock, list);
+ if (tmplock == lock)
+ continue;
+ if (!dlm_lock_compatible(tmplock->type, type))
+ goto switch_queues;
+ }
+
+ list_for_each(iter, &res->converting) {
+ tmplock = list_entry(iter, dlm_lock, list);
+ if (!dlm_lock_compatible(tmplock->type, type))
+ goto switch_queues;
+ /* existing conversion requests take precedence */
+ if (!dlm_lock_compatible(tmplock->convert_type, type))
+ goto switch_queues;
+ }
+
+ /* fall thru to grant */
+
+grant:
+ if (lock->node != dlm->group_index) {
+ dlmprintk0("no in-place convert for nonlocal locks :( see if this helps...\n");
+ remote_in_place = 1;
+ goto switch_queues;
+ }
+
+ /* immediately grant the new lock type */
+ //printk("doing in-place %sconvert from %d to %d\n",
+ // type > lock->type ? "up" : "down", lock->type, type);
+ lock->type = type;
+ status = DLM_NORMAL;
+
+ /* if it is a remote request, proxy
+ * handler will set the lksb status */
+ if (lock->node == dlm->group_index)
+ lock->lksb->status = DLM_NORMAL;
+
+ if (dlm_do_ast(dlm, res, lock) < 0)
+ printk("eek\n");
+
+ spin_unlock(&lock->spinlock);
+ spin_unlock(&res->spinlock);
+
+ /* if successful, kick the queue runner */
+ if (status == DLM_NORMAL) {
+ dlm_kick_thread(dlm, res);
+ }
+
+ return status;
+
+switch_queues:
+ if (flags & LKM_NOQUEUE) {
+ spin_unlock(&lock->spinlock);
+ spin_unlock(&res->spinlock);
+ return DLM_NOTQUEUED;
+ }
+
+ lock->convert_type = type;
+ list_del(&lock->list);
+ /* make sure the remote in-place convert gets handled right away */
+ if (remote_in_place)
+ list_add(&lock->list, &res->converting);
+ else
+ list_add_tail(&lock->list, &res->converting);
+
+ spin_unlock(&lock->spinlock);
+ spin_unlock(&res->spinlock);
+
+ dlm_kick_thread(dlm, res);
+ return status;
+}
+
+/* must be already holding lockres->spinlock */
+dlm_status dlmconvert_remote(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type)
+{
+ dlm_status status = DLM_DENIED;
+
+ dlmprintk("type=%d, convert_type=%d\n", lock->type, lock->convert_type);
+
+ if (res->state & DLM_LOCK_RES_RECOVERING) {
+ status = DLM_RECOVERING;
+ goto bail;
+ }
+ /* will exit this call with spinlock held */
+ __dlm_wait_on_lockres(res);
+
+ res->state |= DLM_LOCK_RES_IN_PROGRESS;
+
+ /* move lock to local convert queue */
+ list_del(&lock->list);
+ list_add_tail(&lock->list, &res->converting);
+ if (lock->convert_type != LKM_IVMODE) {
+ printk("error! converting a remote lock that is already converting!\n");
+ /* TODO: return correct error */
+ BUG();
+ }
+ lock->convert_type = type;
+ spin_unlock(&res->spinlock);
+
+ /* spec seems to say that you will get DLM_NORMAL when the lock
+ * has been queued, meaning we need to wait for a reply here. */
+ status = dlm_send_remote_convert_request(dlm, res, lock, flags, type);
+
+ spin_lock(&res->spinlock);
+ res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+
+ /* if it failed, move it back to granted queue */
+ if (status != DLM_NORMAL) {
+ list_del(&lock->list);
+ list_add_tail(&lock->list, &res->granted);
+ lock->convert_type = LKM_IVMODE;
+ }
+bail:
+ spin_unlock(&res->spinlock);
+ return status;
+}
+
+
+
+/* there seems to be no point in doing this async
+ * since (even for the remote case) there is really
+ * no work to queue up... so just do it and fire the
+ * unlockast by hand when done... */
+dlm_status dlmunlock(dlm_ctxt *dlm, dlm_lockstatus *lksb, int flags, dlm_astunlockfunc_t *unlockast, void *data)
+{
+ dlm_status status;
+ dlm_lock_resource *res;
+ dlm_lock *lock = NULL;
+ int call_ast = 0;
+
+ if (!lksb)
+ return DLM_BADARGS;
+
+ if (flags & ~(LKM_CANCEL | LKM_VALBLK | LKM_INVVALBLK))
+ return DLM_BADPARAM;
+
+ if ((flags & (LKM_VALBLK | LKM_CANCEL)) == (LKM_VALBLK | LKM_CANCEL)) {
+ printk("VALBLK given with CANCEL: ignoring VALBLK\n");
+ flags &= ~LKM_VALBLK;
+ }
+
+ if (!lksb->lockid || !lksb->lockid->lockres)
+ return DLM_BADPARAM;
+
+ lock = lksb->lockid;
+ res = lock->lockres;
+
+ status = dlmunlock_local(dlm, res, lock, lksb, flags, &call_ast);
+ if (call_ast)
+ (*unlockast)(data, lksb->status);
+ return status;
+}
+
+
+dlm_status dlmunlock_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, dlm_lockstatus *lksb, int flags, int *call_ast)
+{
+ dlm_status status;
+ int free_lock = 0, remote_ready = 0;
+ int local = 0, remove = 0, regrant = 0;
+
+ /* according to spec and opendlm code
+ * flags & LKM_CANCEL != 0: must be converting or blocked
+ * flags & LKM_CANCEL == 0: must be granted
+ * iow, to unlock a converting lock, you must first LKM_CANCEL
+ * the convert, then call the unlock again with no LKM_CANCEL
+ */
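+ /* a hypothetical caller sequence for the rule above, sketch only:
+ *
+ *   dlmunlock(dlm, lksb, LKM_CANCEL, unlockast, data);   cancel the convert
+ *   dlmunlock(dlm, lksb, 0, unlockast, data);            then really unlock
+ */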
+ *call_ast = 0;
+
+recheck:
+ spin_lock(&res->spinlock);
+ spin_lock(&lock->spinlock);
+
+ local = (res->owner == dlm->group_index);
+
+ if (flags & LKM_CANCEL) {
+ /* cancel request */
+ if (dlm_lock_on_list(&res->blocked, lock)) {
+ /* cancel this outright */
+ lksb->status = DLM_NORMAL;
+ status = DLM_NORMAL;
+ free_lock = 1;
+ *call_ast = 1;
+ remove = 1;
+ regrant = 0;
+ } else if (dlm_lock_on_list(&res->converting, lock)) {
+ /* cancel the request, put back on granted */
+ lksb->status = DLM_NORMAL;
+ status = DLM_NORMAL;
+ free_lock = 0;
+ *call_ast = 1;
+ remove = 1;
+ regrant = 1;
+ } else if (dlm_lock_on_list(&res->granted, lock)) {
+ /* too late, already granted. DLM_CANCELGRANT */
+ lksb->status = DLM_CANCELGRANT;
+ status = DLM_NORMAL;
+ free_lock = 0;
+ *call_ast = 1;
+ remove = 0;
+ regrant = 0;
+ } else {
+ /* err. um. eek! */
+ printk("lock to cancel is not on any list! bug!\n");
+ lksb->status = DLM_IVLOCKID;
+ status = DLM_IVLOCKID;
+ free_lock = 0;
+ *call_ast = 0;
+ remove = 0;
+ regrant = 0;
+ }
+ } else {
+ /* unlock request */
+ if (!dlm_lock_on_list(&res->granted, lock)) {
+ lksb->status = DLM_DENIED;
+ status = DLM_DENIED;
+ free_lock = 0;
+ *call_ast = 0;
+ remove = 0;
+ regrant = 0;
+ } else {
+ /* unlock granted lock */
+ lksb->status = DLM_NORMAL;
+ status = DLM_NORMAL;
+ free_lock = 1;
+ *call_ast = 1;
+ remove = 1;
+ regrant = 0;
+ }
+ }
+
+ if (!local) {
+ /* safe since nothing can change on this
+ * secondary queue without lockres lock */
+ spin_unlock(&lock->spinlock);
+
+ /* if there was an outstanding change on the
+ * lockres, conditions could have changed */
+ if (!remote_ready &&
+ res->state & DLM_LOCK_RES_IN_PROGRESS) {
+ __dlm_wait_on_lockres(res);
+ res->state |= DLM_LOCK_RES_IN_PROGRESS;
+ remote_ready = 1;
+ spin_unlock(&res->spinlock);
+ goto recheck;
+ }
+
+ if (res->state & DLM_LOCK_RES_RECOVERING) {
+ /* !!!!! */
+ spin_unlock(&res->spinlock);
+ return DLM_RECOVERING;
+ } else {
+ spin_unlock(&res->spinlock);
+ status = dlm_send_remote_unlock_request(dlm, res, lock, lksb, flags);
+ spin_lock(&res->spinlock);
+ res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+ }
+ spin_lock(&lock->spinlock);
+ }
+
+ if (remove)
+ list_del(&lock->list);
+ if (regrant)
+ list_add_tail(&lock->list, &res->granted);
+
+ spin_unlock(&lock->spinlock);
+ spin_unlock(&res->spinlock);
+
+ if (free_lock) {
+#warning this must change to proper refcounting
+ /* TODO: refcounting... tho for now this will work because
+ * the middle layer is keeping track of everything */
+ kfree(lock);
+ lksb->lockid = NULL;
+ }
+ return status;
+}
+
+
+dlm_status dlm_send_remote_unlock_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, dlm_lockstatus *lksb, int flags)
+{
+ struct inode *inode = NULL;
+ dlm_unlock_lock unlock;
+ int tmpret;
+ dlm_status ret;
+ int status = 0;
+
+ dlmprintk0("\n");
+
+ memset(&unlock, 0, sizeof(unlock));
+ unlock.node_idx = dlm->group_index;
+ unlock.flags = flags;
+ unlock.cookie = lock->cookie;
+ unlock.namelen = res->lockname.len;
+ strncpy(unlock.name, res->lockname.name, unlock.namelen);
+
+ ret = DLM_NOLOCKMGR;
+ lksb->status = DLM_NOLOCKMGR;
+ inode = nm_get_group_node_by_index(dlm->group, res->owner);
+ if (inode) {
+ tmpret = net_send_message(DLM_UNLOCK_LOCK_MSG, dlm->key, &unlock, sizeof(unlock), inode, &status);
+ if (tmpret >= 0) {
+ // successfully sent and received
+ if (status == DLM_CANCELGRANT)
+ ret = DLM_NORMAL;
+ else
+ ret = status;
+ lksb->status = status;
+ } else {
+ printk("error occurred in net_send_message: %d\n", tmpret);
+ ret = dlm_err_to_dlm_status(tmpret);
+ lksb->status = ret;
+ }
+ iput(inode);
+ }
+
+ return ret;
+}
+
+int dlm_unlock_lock_handler(net_msg *msg, u32 len, void *data)
+{
+ dlm_ctxt *dlm = data;
+ dlm_unlock_lock *unlock = (dlm_unlock_lock *)msg->buf;
+ dlm_lock_resource *res;
+ struct list_head *iter, *queue;
+ dlm_lock *lock;
+ dlm_status status = DLM_NORMAL;
+ int found = 0;
+ dlm_lockstatus lksb;
+ int ignore;
+ struct qstr lockname = { .name=unlock->name, .len=unlock->namelen };
+
+ dlmprintk0("\n");
+
+ lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+ status = DLM_IVLOCKID;
+ res = dlm_lookup_lock(dlm, &lockname);
+ if (res) {
+ spin_lock(&res->spinlock);
+ queue = &res->granted;
+again:
+ list_for_each(iter, queue) {
+ lock = list_entry(iter, dlm_lock, list);
+ if (lock->cookie == unlock->cookie &&
+ lock->node == unlock->node_idx) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ if (queue == &res->granted) {
+ queue = &res->converting;
+ goto again;
+ } else if (queue == &res->converting) {
+ queue = &res->blocked;
+ goto again;
+ }
+ }
+ spin_unlock(&res->spinlock);
+ if (found) {
+ /* unlockast only called on originating node;
+ * dlmunlock_local takes res->spinlock itself,
+ * so call it only after dropping our hold on it */
+ status = dlmunlock_local(dlm, res, lock, &lksb, unlock->flags, &ignore);
+ }
+ }
+ if (!found)
+ printk("failed to find lock to unlock! cookie=%llu\n", unlock->cookie);
+ else
+ status = lksb.status;
+
+ return status;
+}
+
+
+
+
+
+static dlm_ctxt * __dlm_lookup_domain(char *domain)
+{
+ dlm_ctxt *tmp = NULL;
+ struct list_head *iter;
+
+ list_for_each(iter, &dlm_domains) {
+ tmp = list_entry (iter, dlm_ctxt, list);
+ if (strncmp(tmp->name, domain, NM_MAX_NAME_LEN)==0)
+ break;
+ tmp = NULL;
+ }
+
+ return tmp;
+}
+
+dlm_ctxt * dlm_lookup_domain(char *domain)
+{
+ dlm_ctxt *tmp = NULL;
+ spin_lock(&dlm_domain_lock);
+ tmp = __dlm_lookup_domain(domain);
+ spin_unlock(&dlm_domain_lock);
+ return tmp;
+}
+
+dlm_lock_resource * __dlm_lookup_lock(dlm_ctxt *dlm, struct qstr *lockname)
+{
+ struct list_head *iter;
+ dlm_lock_resource *tmpres=NULL;
+ struct list_head *bucket;
+
+ bucket = &(dlm->resources[lockname->hash & DLM_HASH_MASK]);
+
+ /* check for pre-existing lock */
+ list_for_each(iter, bucket) {
+ tmpres = list_entry(iter, dlm_lock_resource, list);
+ if (tmpres->lockname.len == lockname->len &&
+ strncmp(tmpres->lockname.name, lockname->name, lockname->len) == 0)
+ break;
+ tmpres = NULL;
+ }
+ return tmpres;
+}
+
+dlm_lock_resource * dlm_lookup_lock(dlm_ctxt *dlm, struct qstr *lockname)
+{
+ dlm_lock_resource *res;
+ spin_lock(&dlm->spinlock);
+ res = __dlm_lookup_lock(dlm, lockname);
+ spin_unlock(&dlm->spinlock);
+ return res;
+}
+
+
+
+/*
+ * dlm_register_domain: one-time setup per "domain"
+ */
+dlm_ctxt * dlm_register_domain(char *domain, char *group_name, u32 key)
+{
+ dlm_ctxt *tmp = NULL, *dlm = NULL;
+ struct inode *group = NULL;
+ int tmpret, i;
+ char *netbuf;
+
+ if (strlen(domain) > NM_MAX_NAME_LEN) {
+ printk("domain name length too long\n");
+ goto leave;
+ }
+
+ group = nm_get_group_by_name(group_name);
+ if (!group) {
+ printk("no nm group %s for domain %s!\n", group_name, domain);
+ goto leave;
+ }
+
+ /*
+ * TODO: should i do some type of dlm-group-join business here?
+ * I need to have new nodes communicate with other dlm nodes to
+ * wait until their master lists are empty before allowing me to
+ * join. does this belong here? or in hb?
+ * seems like stuff that heartbeat shouldn't care about, cuz we
+ * would actually be preventing a node that is "UP" from being
+ * part of the dlm group.
+ */
+ dlm = dlm_lookup_domain(domain);
+ if (dlm) {
+ /* found a pre-existing domain */
+ goto leave;
+ }
+
+ dlm = kmalloc(sizeof(dlm_ctxt), GFP_KERNEL);
+ if (dlm == NULL) {
+ printk("could not allocate dlm_ctxt\n");
+ goto leave;
+ }
+ memset(dlm, 0, sizeof(dlm_ctxt));
+ dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL);
+ if (dlm->name == NULL) {
+ kfree(dlm);
+ dlm = NULL;
+ printk("could not allocate dlm domain name\n");
+ goto leave;
+ }
+ dlm->net_buf = (char *) __get_free_page(GFP_KERNEL);
+ if (!dlm->net_buf) {
+ kfree(dlm->name);
+ kfree(dlm);
+ dlm = NULL;
+ printk("could not allocate dlm network temporary buffer\n");
+ goto leave;
+ }
+ dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL);
+ if (!dlm->resources) {
+ free_page((unsigned long)dlm->net_buf);
+ kfree(dlm->name);
+ kfree(dlm);
+ dlm = NULL;
+ printk("could not allocate dlm hash\n");
+ goto leave;
+ }
+ memset(dlm->resources, 0, PAGE_SIZE);
+
+ for (i=0; i<DLM_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&dlm->resources[i]);
+
+ strcpy(dlm->name, domain);
+ spin_lock_init(&dlm->spinlock);
+ INIT_LIST_HEAD(&dlm->list);
+ INIT_LIST_HEAD(&dlm->dirty_list);
+ INIT_LIST_HEAD(&dlm->reco.resources);
+ INIT_LIST_HEAD(&dlm->reco.received);
+ util_thread_info_init(&dlm->thread);
+ util_thread_info_init(&dlm->reco.thread);
+ init_rwsem(&dlm->recovery_sem);
+ dlm->group = group;
+ dlm->group_index = nm_this_node(group);
+ dlm->key = key;
+ dlm->reco.new_master = NM_INVALID_SLOT_NUM;
+ dlm->reco.dead_node = NM_INVALID_SLOT_NUM;
+ dlm->reco.sending_node = NM_INVALID_SLOT_NUM;
+ dlm->reco.next_seq = 0;
+
+ spin_lock(&dlm_domain_lock);
+ tmp = __dlm_lookup_domain(domain);
+ if (tmp) {
+ spin_unlock(&dlm_domain_lock);
+ /* found a pre-existing domain */
+ free_page((unsigned long)dlm->net_buf);
+ free_page((unsigned long)dlm->resources);
+ kfree(dlm->name);
+ kfree(dlm);
+ dlm = NULL;
+ goto leave;
+ }
+
+ /* add the new domain */
+ list_add_tail(&dlm->list, &dlm_domains);
+ spin_unlock(&dlm_domain_lock);
+
+ tmpret = hb_register_callback(HB_NODE_DOWN_CB, dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
+ if (tmpret)
+ goto error;
+ tmpret = hb_register_callback(HB_NODE_UP_CB, dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
+ if (tmpret)
+ goto error;
+
+ /* TODO: need to use hb_fill_node_map to fill a temporary votemap
+ * then communicate with each of these nodes that I want to come up
+ * FOR THIS DLM. there may be many nodes in this group heartbeating
+ * but they may not care about this particular dlm instance. once
+ * everyone has come back with a response that i have been added or
+ * that they are not a member I can put together the REAL node map
+ * for this dlm in dlm->node_map */
+ /* TODO: I guess we can fill this here as a superset of possible nodes
+ * so that the hb_callbacks above have something to work on in the meantime
+ * then trim out the nodes that are not part of this dlm once we know */
+ /* TODO: I may need to register a special net handler on insmod of dlm.o
+ * with a key of 0 so that I can respond to requests even if I am not
+ * part of a dlm group. this would still leave a gap in time between the
+ * start of heartbeating and the insmod dlm.o, unless I change the module
+ * loading stuff in clusterbo to include dlm.o (which would work fine) */
+#warning WRONG WRONG WRONG
+ tmpret = hb_fill_node_map(group, dlm->node_map, NM_MAX_NODES);
+ if (tmpret)
+ goto error;
+
+
+#if 0
+ tmpret = net_register_handler("reco-request",
+ DLM_NET_RECOVERY_REQUEST_MSG_TYPE,
+ key, sizeof(dlm_reco_request),
+ dlm_recovery_request_handler, dlm);
+ if (tmpret)
+ goto error;
+ tmpret = net_register_handler("reco-lock-arr-req",
+ DLM_NET_RECOVERY_LOCK_ARR_REQ_MSG_TYPE,
+ key, sizeof(dlm_reco_lock_arr_req),
+ dlm_recovery_lock_arr_req_handler, dlm);
+ if (tmpret)
+ goto error;
+ tmpret = net_register_handler("reco-response",
+ DLM_NET_RECOVERY_RESPONSE_MSG_TYPE,
+ key, sizeof(dlm_reco_response),
+ dlm_recovery_response_handler, dlm);
+ if (tmpret)
+ goto error;
+#endif
+
+ netbuf = dlm->net_buf;
+ tmpret = net_register_handler(DLM_MASTER_REQUEST_RESP_MSG, key, 0,
+ sizeof(dlm_master_request_resp),
+ dlm_master_request_resp_handler,
+ dlm, netbuf);
+ if (tmpret)
+ goto error;
+
+ netbuf += L1_CACHE_ALIGN(sizeof(dlm_master_request_resp));
+
+ tmpret = net_register_handler(DLM_MASTER_REQUEST_MSG, key, 0,
+ sizeof(dlm_master_request),
+ dlm_master_request_handler,
+ dlm, netbuf);
+
+ if (tmpret)
+ goto error;
+ netbuf += L1_CACHE_ALIGN(sizeof(dlm_master_request));
+
+ tmpret = net_register_handler(DLM_ASSERT_MASTER_MSG, key, 0,
+ sizeof(dlm_assert_master),
+ dlm_assert_master_handler,
+ dlm, netbuf);
+ if (tmpret)
+ goto error;
+ netbuf += L1_CACHE_ALIGN(sizeof(dlm_assert_master));
+ tmpret = net_register_handler(DLM_CREATE_LOCK_MSG, key, 0,
+ sizeof(dlm_create_lock),
+ dlm_create_lock_handler,
+ dlm, netbuf);
+ if (tmpret)
+ goto error;
+ netbuf += L1_CACHE_ALIGN(sizeof(dlm_create_lock));
+ tmpret = net_register_handler(DLM_CONVERT_LOCK_MSG, key, 0,
+ sizeof(dlm_convert_lock),
+ dlm_convert_lock_handler,
+ dlm, netbuf);
+ if (tmpret)
+ goto error;
+ netbuf += L1_CACHE_ALIGN(sizeof(dlm_convert_lock));
+
+ tmpret = net_register_handler(DLM_UNLOCK_LOCK_MSG, key, 0,
+ sizeof(dlm_unlock_lock),
+ dlm_unlock_lock_handler,
+ dlm, netbuf);
+ if (tmpret)
+ goto error;
+ netbuf += L1_CACHE_ALIGN(sizeof(dlm_unlock_lock));
+
+ tmpret = net_register_handler(DLM_PROXY_AST_MSG, key, 0,
+ sizeof(dlm_proxy_ast),
+ dlm_proxy_ast_handler,
+ dlm, netbuf);
+ if (tmpret)
+ goto error;
+ netbuf += L1_CACHE_ALIGN(sizeof(dlm_proxy_ast));
+// printk("netbuf=%p net_buf=%p diff=%d\n", netbuf, dlm->net_buf, ((char *)netbuf - (char *)dlm->net_buf)); // currently 768
+
+ tmpret = dlm_launch_thread(dlm);
+ if (tmpret == 0)
+ goto leave;
+
+error:
+ hb_unregister_callback(HB_NODE_UP_CB, dlm_hb_node_up_cb, dlm);
+ hb_unregister_callback(HB_NODE_DOWN_CB, dlm_hb_node_down_cb, dlm);
+ spin_lock(&dlm_domain_lock);
+ list_del(&dlm->list);
+ spin_unlock(&dlm_domain_lock);
+ free_page((unsigned long)dlm->net_buf);
+ free_page((unsigned long)dlm->resources);
+ kfree(dlm->name);
+ kfree(dlm);
+ dlm = NULL;
+
+leave:
+ if (!dlm && group)
+ iput(group);
+ return dlm;
+}
+
+void dlm_unregister_domain(dlm_ctxt *dlm)
+{
+ // fill me in please
+}
+
+void dlm_init_lockres(dlm_lock_resource *res, struct qstr *lockname)
+{
+ memset(res, 0, sizeof(dlm_lock_resource));
+ res->lockname.name = lockname->name;
+ res->lockname.len = lockname->len;
+ res->lockname.hash = lockname->hash;
+ init_waitqueue_head(&res->wq);
+ spin_lock_init(&res->spinlock);
+ INIT_LIST_HEAD(&res->list);
+ INIT_LIST_HEAD(&res->granted);
+ INIT_LIST_HEAD(&res->converting);
+ INIT_LIST_HEAD(&res->blocked);
+ INIT_LIST_HEAD(&res->dirty);
+ INIT_LIST_HEAD(&res->recovering);
+
+ res->owner = DLM_LOCK_RES_OWNER_UNKNOWN;
+ res->state |= DLM_LOCK_RES_IN_PROGRESS;
+}
+
+
+
+
+/* will exit holding res->spinlock, but may drop in function */
+void dlm_wait_on_lockres(dlm_lock_resource *res)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue(&res->wq, &wait);
+repeat:
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_lock(&res->spinlock);
+ if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
+ spin_unlock(&res->spinlock);
+ schedule();
+ goto repeat;
+ }
+ remove_wait_queue(&res->wq, &wait);
+ current->state = TASK_RUNNING;
+}
+
+/* will exit holding res->spinlock, but may drop in function */
+void __dlm_wait_on_lockres(dlm_lock_resource *res)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue(&res->wq, &wait);
+repeat:
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
+ spin_unlock(&res->spinlock);
+ schedule();
+ spin_lock(&res->spinlock);
+ goto repeat;
+ }
+ remove_wait_queue(&res->wq, &wait);
+ current->state = TASK_RUNNING;
+}
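+/* note on the pair above: dlm_wait_on_lockres() acquires res->spinlock
+ * itself, while __dlm_wait_on_lockres() expects the caller to already hold
+ * it; both return with the spinlock held once DLM_LOCK_RES_IN_PROGRESS has
+ * cleared. */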
+
+
+
+int dlm_do_ast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock)
+{
+ dlm_astlockfunc_t *fn = lock->ast;
+
+ dlmprintk0("\n");
+
+ if (lock->node != dlm->group_index) {
+ return dlm_send_proxy_ast(dlm, res, lock, DLM_AST, 0);
+ }
+ if (!fn) {
+ printk("eek! lock has no ast %*s! cookie=%llu\n",
+ res->lockname.len, res->lockname.name, lock->cookie);
+ return -EINVAL;
+ }
+ (*fn)(lock->astdata);
+ return 0;
+}
+
+
+int dlm_do_bast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int blocked_type)
+{
+ dlm_bastlockfunc_t *fn = lock->bast;
+
+ dlmprintk0("\n");
+
+ if (lock->node != dlm->group_index) {
+ return dlm_send_proxy_ast(dlm, res, lock, DLM_BAST, blocked_type);
+ }
+
+ if (!fn) {
+ printk("eek! lock has no bast %*s! cookie=%llu\n",
+ res->lockname.len, res->lockname.name, lock->cookie);
+ return -EINVAL;
+ }
+ (*fn)(lock->astdata, blocked_type);
+ return 0;
+}
+
+int dlm_send_proxy_ast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int type, int blocked_type)
+{
+ int ret = 0;
+ dlm_proxy_ast past;
+ struct inode *inode = NULL;
+
+ dlmprintk("to=%u, type=%d, blocked_type=%d\n", lock->node, type, blocked_type);
+
+ past.node_idx = dlm->group_index;
+ past.type = type;
+ past.blocked_type = blocked_type;
+ past.namelen = res->lockname.len;
+ strncpy(past.name, res->lockname.name, past.namelen);
+ past.cookie = lock->cookie;
+
+ ret = -EINVAL;
+ inode = nm_get_group_node_by_index(dlm->group, lock->node);
+ if (inode) {
+ ret = net_send_message(DLM_PROXY_AST_MSG, dlm->key, &past, sizeof(past), inode, NULL);
+ iput(inode);
+ }
+ if (ret < 0) {
+ printk("(%d) dlm_send_proxy_ast: returning %d\n", current->pid, ret);
+ }
+ return ret;
+}
+
+int dlm_proxy_ast_handler(net_msg *msg, u32 len, void *data)
+{
+ int status;
+ dlm_ctxt *dlm = data;
+ dlm_lock_resource *res;
+ dlm_lock *lock = NULL;
+ dlm_proxy_ast *past = (dlm_proxy_ast *) msg->buf;
+ struct qstr lockname = { .name=past->name, .len=past->namelen };
+ struct list_head *iter, *head=NULL;
+ u64 cookie = past->cookie;
+
+ lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+ dlmprintk("type=%d, blocked_type=%d\n", past->type, past->blocked_type);
+
+ if (past->type != DLM_AST &&
+ past->type != DLM_BAST) {
+ printk("Eeeek unknown ast type! %d, cookie=%llu, name=%*s\n",
+ past->type, cookie, lockname.len, lockname.name);
+ return 0;
+ }
+
+ res = dlm_lookup_lock(dlm, &lockname);
+ if (!res) {
+ printk("eek! got %sast for unknown lockres! cookie=%llu, name=%*s, namelen=%d\n",
+ past->type == DLM_AST ? "" : "b", cookie, lockname.len, lockname.name, lockname.len);
+ return 0;
+ }
+
+ if (!dlm_is_recovery_lock(past->name, past->namelen))
+ down_read(&dlm->recovery_sem);
+ spin_lock(&res->spinlock);
+
+ /* try convert queue for both ast/bast */
+ head = &res->converting;
+ lock = NULL;
+ list_for_each(iter, head) {
+ lock = list_entry (iter, dlm_lock, list);
+ if (lock->cookie == cookie)
+ goto do_ast;
+ }
+
+ /* if not on convert, try blocked for ast, granted for bast */
+ if (past->type == DLM_AST)
+ head = &res->blocked;
+ else
+ head = &res->granted;
+
+ list_for_each(iter, head) {
+ lock = list_entry (iter, dlm_lock, list);
+ if (lock->cookie == cookie)
+ goto do_ast;
+ }
+
+ printk("eek! got %sast for unknown lock! cookie=%llu, name=%*s, namelen=%d\n",
+ past->type == DLM_AST ? "" : "b", cookie, lockname.len, lockname.name, lockname.len);
+ spin_unlock(&res->spinlock);
+ if (!dlm_is_recovery_lock(past->name, past->namelen))
+ up_read(&dlm->recovery_sem);
+ return 0;
+
+do_ast:
+ if (past->type == DLM_AST) {
+ list_del(&lock->list);
+ list_add_tail(&lock->list, &res->granted);
+ dlmprintk("ast: adding to granted list... type=%d, convert_type=%d\n",
+ lock->type, lock->convert_type);
+ if (lock->convert_type != LKM_IVMODE) {
+ lock->type = lock->convert_type;
+ lock->convert_type = LKM_IVMODE;
+ } else {
+ // should already be there....
+ }
+
+ lock->lksb->status = DLM_NORMAL;
+
+ status = dlm_do_ast(dlm, res, lock);
+ dlmprintk("ast done: now... type=%d, convert_type=%d\n",
+ lock->type, lock->convert_type);
+ } else {
+ dlmprintk("bast: before... type=%d, convert_type=%d\n",
+ lock->type, lock->convert_type);
+ status = dlm_do_bast(dlm, res, lock, past->blocked_type);
+ dlmprintk("bast: after... type=%d, convert_type=%d\n",
+ lock->type, lock->convert_type);
+ }
+
+ if (status < 0)
+ printk("eeek: ast/bast returned %d\n", status);
+
+ spin_unlock(&res->spinlock);
+ if (!dlm_is_recovery_lock(past->name, past->namelen))
+ up_read(&dlm->recovery_sem);
+ return 0;
+}
+
+
+
+
+
+
+
+/*
+ * message handlers should just return status.
+ * this will get send back to the calling node if it
+ * requested a status return.
+ */
+
+
+/* remote lock creation */
+dlm_status dlm_send_remote_lock_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags)
+{
+ struct inode *inode = NULL;
+ dlm_create_lock create;
+ int tmpret, status = 0;
+ dlm_status ret;
+
+ dlmprintk0("\n");
+
+ memset(&create, 0, sizeof(create));
+ create.node_idx = dlm->group_index;
+ create.requested_type = lock->type;
+ create.cookie = lock->cookie;
+ create.namelen = res->lockname.len;
+ strncpy(create.name, res->lockname.name, create.namelen);
+
+ ret = DLM_NOLOCKMGR;
+ inode = nm_get_group_node_by_index(dlm->group, res->owner);
+ if (inode) {
+ tmpret = net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create, sizeof(create), inode, &status);
+ if (tmpret >= 0) {
+ // successfully sent and received
+ ret = status; // this is already a dlm_status
+ } else {
+ printk("error occurred in net_send_message: %d\n", tmpret);
+ ret = dlm_err_to_dlm_status(tmpret);
+ }
+ iput(inode);
+ }
+
+ return ret;
+}
+
+int dlm_create_lock_handler(net_msg *msg, u32 len, void *data)
+{
+ dlm_ctxt *dlm = data;
+ dlm_create_lock *create = (dlm_create_lock *)msg->buf;
+ dlm_lock_resource *res;
+ dlm_lock *newlock;
+ dlm_status status = DLM_NORMAL;
+ struct qstr lockname = { .name=create->name, .len=create->namelen };
+
+ dlmprintk0("\n");
+
+ lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+ newlock = kmalloc(sizeof(dlm_lock), GFP_KERNEL);
+ if (!newlock)
+ return DLM_SYSERR;
+
+ memset(newlock, 0, sizeof(dlm_lock));
+ INIT_LIST_HEAD(&newlock->list);
+ INIT_LIST_HEAD(&newlock->ast_list);
+ spin_lock_init(&newlock->spinlock);
+ newlock->type = create->requested_type;
+ newlock->convert_type = LKM_IVMODE;
+ newlock->highest_blocked = LKM_IVMODE;
+ newlock->node = create->node_idx;
+ newlock->ast = NULL;
+ newlock->bast = NULL;
+ newlock->astdata = NULL;
+ newlock->cookie = create->cookie;
+
+ status = DLM_IVLOCKID;
+ res = dlm_lookup_lock(dlm, &lockname);
+ if (res) {
+ spin_lock(&res->spinlock);
+ newlock->lockres = res;
+ /* dlmlock_local drops res->spinlock before returning */
+ status = dlmlock_local(dlm, res, newlock, 0);
+ } else {
+ /* no such lockres: free the lock we just allocated */
+ kfree(newlock);
+ }
+
+ return status;
+}
+
+/* remote lock conversion */
+dlm_status dlm_send_remote_convert_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type)
+{
+ struct inode *inode = NULL;
+ dlm_convert_lock convert;
+ int tmpret;
+ dlm_status ret;
+ int status = 0;
+
+ dlmprintk0("\n");
+
+ memset(&convert, 0, sizeof(convert));
+ convert.node_idx = dlm->group_index;
+ convert.requested_type = type;
+ convert.cookie = lock->cookie;
+ convert.namelen = res->lockname.len;
+ strncpy(convert.name, res->lockname.name, convert.namelen);
+
+ ret = DLM_NOLOCKMGR;
+ inode = nm_get_group_node_by_index(dlm->group, res->owner);
+ if (inode) {
+ tmpret = net_send_message(DLM_CONVERT_LOCK_MSG, dlm->key, &convert, sizeof(convert), inode, &status);
+ if (tmpret >= 0) {
+ // successfully sent and received
+ ret = status; // this is already a dlm_status
+ } else {
+ printk("error occurred in net_send_message: %d\n", tmpret);
+ ret = dlm_err_to_dlm_status(tmpret);
+ }
+ iput(inode);
+ }
+
+ return ret;
+}
+
+int dlm_convert_lock_handler(net_msg *msg, u32 len, void *data)
+{
+ dlm_ctxt *dlm = data;
+ dlm_convert_lock *convert = (dlm_convert_lock *)msg->buf;
+ dlm_lock_resource *res;
+ struct list_head *iter;
+ dlm_lock *lock;
+ dlm_status status = DLM_NORMAL;
+ int found = 0;
+ struct qstr lockname = { .name=convert->name, .len=convert->namelen };
+ union {
+ u64 q;
+ u32 hilo[2];
+ } u1, u2, u3, u4, u5, u6, u7;
+
+
+ dlmprintk0("\n");
+ rdtsc(u1.hilo[0], u1.hilo[1]);
+
+ lockname.hash = full_name_hash(lockname.name, lockname.len);
+ rdtsc(u2.hilo[0], u2.hilo[1]);
+
+ status = DLM_IVLOCKID;
+ res = dlm_lookup_lock(dlm, &lockname);
+ rdtsc(u3.hilo[0], u3.hilo[1]);
+ if (res) {
+ spin_lock(&res->spinlock);
+ rdtsc(u4.hilo[0], u4.hilo[1]);
+ list_for_each(iter, &res->granted) {
+ lock = list_entry(iter, dlm_lock, list);
+ if (lock->cookie == convert->cookie &&
+ lock->node == convert->node_idx) {
+ found = 1;
+ rdtsc(u5.hilo[0], u5.hilo[1]);
+ status = dlmconvert_local(dlm, res, lock, 0, convert->requested_type);
+ rdtsc(u6.hilo[0], u6.hilo[1]);
+ break;
+ }
+ }
+ /* dlmconvert_local drops res->spinlock itself when it is called */
+ if (!found)
+ spin_unlock(&res->spinlock);
+ }
+ if (!found)
+ printk("failed to find lock to convert on grant queue! cookie=%llu\n", convert->cookie);
+
+ rdtsc(u7.hilo[0], u7.hilo[1]);
+ dlmprintk("1-2:%llu 2-3:%llu 3-4:%llu 4-5:%llu 5-6:%llu 6-7:%llu\n",
+ u2.q-u1.q, u3.q-u2.q, u4.q-u3.q, u5.q-u4.q, u6.q-u5.q, u7.q-u6.q);
+ return status;
+}
+
+void dlm_dump_everything(void)
+{
+ dlm_ctxt *dlm;
+ struct list_head *iter;
+
+ printk("dumping ALL dlm state for node %s\n", system_utsname.nodename);
+ spin_lock(&dlm_domain_lock);
+ list_for_each(iter, &dlm_domains) {
+ dlm = list_entry (iter, dlm_ctxt, list);
+ dlm_dump_dlm(dlm);
+ }
+ spin_unlock(&dlm_domain_lock);
+}
+
+void dlm_dump_dlm(dlm_ctxt *dlm)
+{
+ dlm_lock_resource *res;
+ dlm_lock *lock;
+ struct list_head *iter, *iter2;
+ struct list_head *bucket;
+ int i;
+
+ printk("dlm_ctxt: %s, group=%u, key=%u\n", dlm->name, dlm->group_index, dlm->key);
+ printk("some bug here... should not have to check for this...\n");
+ if (!dlm || !dlm->name) {
+ printk("wtf... dlm=%p\n", dlm);
+ return;
+ }
+
+ spin_lock(&dlm->spinlock);
+ for (i=0; i<DLM_HASH_SIZE; i++) {
+ bucket = &(dlm->resources[i]);
+ list_for_each(iter, bucket) {
+ res = list_entry(iter, dlm_lock_resource, list);
+ printk("lockres: %*s, owner=%u, state=%u\n", res->lockname.len, res->lockname.name,
+ res->owner, res->state);
+ spin_lock(&res->spinlock);
+ printk(" granted queue: \n");
+ list_for_each(iter2, &res->granted) {
+ lock = list_entry(iter2, dlm_lock, list);
+ spin_lock(&lock->spinlock);
+ printk(" type=%d, conv=%d, node=%u, cookie=%llu\n",
+ lock->type, lock->convert_type, lock->node, lock->cookie);
+ spin_unlock(&lock->spinlock);
+ }
+ printk(" converting queue: \n");
+ list_for_each(iter2, &res->converting) {
+ lock = list_entry(iter2, dlm_lock, list);
+ spin_lock(&lock->spinlock);
+ printk(" type=%d, conv=%d, node=%u, cookie=%llu\n",
+ lock->type, lock->convert_type, lock->node, lock->cookie);
+ spin_unlock(&lock->spinlock);
+ }
+ printk(" blocked queue: \n");
+ list_for_each(iter2, &res->blocked) {
+ lock = list_entry(iter2, dlm_lock, list);
+ spin_lock(&lock->spinlock);
+ printk(" type=%d, conv=%d, node=%u, cookie=%llu\n",
+ lock->type, lock->convert_type, lock->node, lock->cookie);
+ spin_unlock(&lock->spinlock);
+ }
+ spin_unlock(&res->spinlock);
+ }
+ }
+ spin_unlock(&dlm->spinlock);
+}
+
+module_init (dlm_driver_entry);
+module_exit (dlm_driver_exit);
Added: branches/dlm-glue/cluster/dlmmod.h
===================================================================
--- branches/dlm-glue/cluster/dlmmod.h 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/dlmmod.h 2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,467 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmmod.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_DLMMOD_H
+#define CLUSTER_DLMMOD_H
+
+
+
+#if 0
+#define dlmprintk(x, arg...)
+#define dlmprintk0(x)
+#else
+#define dlmprintk(x, arg...) printk("(%d)(%s:%d) " x, current->pid, __FUNCTION__, __LINE__, ##arg)
+#define dlmprintk0(x) printk("(%d)(%s:%d) " x, current->pid, __FUNCTION__, __LINE__)
+#endif
+
+
+
+
+#define DLM_HB_NODE_DOWN_PRI (0xf000000)
+#define DLM_HB_NODE_UP_PRI (0x8000000)
+
+#define DLM_LVB_LEN 64
+#define DLM_LOCKID_NAME_MAX 32
+
+#define DLM_DOMAIN_NAME_MAX_LEN 255
+#define DLM_LOCK_RES_OWNER_UNKNOWN NM_MAX_NODES
+#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
+#define DLM_THREAD_MS 200 // flush at least every 200 ms
+
+#define DLM_HASH_BITS 7
+#define DLM_HASH_SIZE (1 << DLM_HASH_BITS)
+#define DLM_HASH_MASK (DLM_HASH_SIZE - 1)
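+/* lockres names hash into one of DLM_HASH_SIZE (128) bucket lists; lookups
+ * use the low bits of full_name_hash() (see __dlm_lookup_lock), e.g.:
+ *   bucket = &dlm->resources[lockname->hash & DLM_HASH_MASK];
+ */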
+
+typedef enum _dlm_ast_type {
+ DLM_AST = 0,
+ DLM_BAST,
+ DLM_ASTUNLOCK
+} dlm_ast_type;
+
+
+#define LKM_IVMODE (-1) /* invalid mode */
+#define LKM_NLMODE 0 /* null lock */
+#define LKM_CRMODE 1 /* concurrent read */ /* unsupported */
+#define LKM_CWMODE 2 /* concurrent write */ /* unsupported */
+#define LKM_PRMODE 3 /* protected read */
+#define LKM_PWMODE 4 /* protected write */ /* unsupported */
+#define LKM_EXMODE 5 /* exclusive */
+#define LKM_MAXMODE 5
+#define LKM_MODEMASK 0xff
+
+
+/* TODO: Flags which OCFS2 will require:
+ * - LKM_LOCAL
+ * - LKM_VALBLK
+ * - LKM_NOQUEUE
+ * - LKM_CONVERT
+ * - LKM_CANCEL */
+#define LKM_ORPHAN 0x10 /* this lock is orphanable */ /* unsupported */
+#define LKM_PARENTABLE 0x20 /* this lock was orphaned */ /* unsupported */
+#define LKM_BLOCK 0x40 /* blocking lock request */ /* unsupported */
+#define LKM_LOCAL 0x80 /* local lock request */
+#define LKM_VALBLK 0x100 /* lock value block request */
+#define LKM_NOQUEUE 0x200 /* non blocking request */
+#define LKM_CONVERT 0x400 /* conversion request */
+#define LKM_NODLCKWT 0x800 /* this lock won't deadlock */ /* unsupported */
+#define LKM_UNLOCK 0x1000 /* deallocate this lock */
+#define LKM_CANCEL 0x2000 /* cancel conversion request */
+#define LKM_DEQALL 0x4000 /* remove all locks held by proc */ /* unsupported */
+#define LKM_INVVALBLK 0x8000 /* invalidate lock value block */
+#define LKM_SYNCSTS 0x10000 /* return synchronous status if poss */ /* unsupported */
+#define LKM_TIMEOUT 0x20000 /* lock request contains timeout */ /* unsupported */
+#define LKM_SNGLDLCK 0x40000 /* request can self-deadlock */ /* unsupported */
+#define LKM_FINDLOCAL 0x80000 /* find local lock request */ /* unsupported */
+#define LKM_PROC_OWNED 0x100000 /* owned by process, not group */ /* unsupported */
+#define LKM_XID 0x200000 /* use transaction id for deadlock */ /* unsupported */
+#define LKM_XID_CONFLICT 0x400000 /* do not allow lock inheritance */ /* unsupported */
+#define LKM_FORCE 0x800000 /* force unlock flag */
+#define LKM_REVVALBLK 0x1000000 /* temporary solution: re-validate lock value block */ /* unsupported */
+
+#define LKM_RECOVERY 0x80000000 /* extension: flag for recovery lock, used to avoid recovery rwsem */
+
+#define LKM_VALID_FLAGS (LKM_VALBLK | LKM_CONVERT | LKM_UNLOCK | \
+ LKM_CANCEL | LKM_INVVALBLK | LKM_FORCE | \
+ LKM_RECOVERY | LKM_LOCAL | LKM_NOQUEUE)
+
+#define DLM_RECOVERY_LOCK_NAME "$RECOVERY"
+#define DLM_RECOVERY_LOCK_NAME_LEN 9
+
+static inline int dlm_is_recovery_lock(char *lock_name, int name_len)
+{
+ if (name_len == DLM_RECOVERY_LOCK_NAME_LEN &&
+ strncmp(lock_name, DLM_RECOVERY_LOCK_NAME, DLM_RECOVERY_LOCK_NAME_LEN)==0)
+ return 1;
+ return 0;
+}
+
+typedef enum _dlm_status {
+ DLM_NORMAL, /* request in progress */
+ DLM_GRANTED, /* request granted */
+ DLM_DENIED, /* request denied */
+ DLM_DENIED_NOLOCKS, /* request denied, out of system resources */
+ DLM_WORKING, /* async request in progress */
+ DLM_BLOCKED, /* lock request blocked */
+ DLM_BLOCKED_ORPHAN, /* lock request blocked by an orphan lock */
+ DLM_DENIED_GRACE_PERIOD, /* topological change in progress */
+ DLM_SYSERR, /* system error */
+ DLM_NOSUPPORT, /* unsupported */
+ DLM_CANCELGRANT, /* can't cancel convert: already granted */
+ DLM_IVLOCKID, /* bad lockid */
+ DLM_SYNC, /* synchronous request granted */
+ DLM_BADTYPE, /* bad resource type */
+ DLM_BADRESOURCE, /* bad resource handle */
+ DLM_MAXHANDLES, /* no more resource handles */
+ DLM_NOCLINFO, /* can't contact cluster manager */
+ DLM_NOLOCKMGR, /* can't contact lock manager */
+ DLM_NOPURGED, /* can't contact purge daemon */
+ DLM_BADARGS, /* bad api args */
+ DLM_VOID, /* no status */
+ DLM_NOTQUEUED, /* NOQUEUE was specified and request failed */
+ DLM_IVBUFLEN, /* invalid resource name length */
+ DLM_CVTUNGRANT, /* attempted to convert ungranted lock */
+ DLM_BADPARAM, /* invalid lock mode specified */
+ DLM_VALNOTVALID, /* value block has been invalidated */
+ DLM_REJECTED, /* request rejected, unrecognized client */
+ DLM_ABORT, /* blocked lock request cancelled */
+ DLM_CANCEL, /* conversion request cancelled */
+ DLM_IVRESHANDLE, /* invalid resource handle */
+ DLM_DEADLOCK, /* deadlock recovery refused this request */
+ DLM_DENIED_NOASTS, /* failed to allocate AST */
+ DLM_FORWARD, /* request must wait for primary's response */
+ DLM_TIMEOUT, /* timeout value for lock has expired */
+ DLM_IVGROUPID, /* invalid group specification */
+ DLM_VERS_CONFLICT, /* version conflicts prevent request handling */
+ DLM_BAD_DEVICE_PATH, /* Locks device does not exist or path wrong */
+ DLM_NO_DEVICE_PERMISSION, /* Client has insufficient perms for device */
+ DLM_NO_CONTROL_DEVICE, /* Cannot set options on opened device */
+ DLM_MAXSTATS, /* upper limit for return code validation */
+
+ DLM_RECOVERING /* our lame addition to allow caller to fail a lock
+ request if it is being recovered */
+} dlm_status;
+
+
+
+typedef struct _dlm_recovery_ctxt
+{
+ struct list_head resources;
+ struct list_head received; // list of dlm_reco_lock_infos received from other nodes during recovery
+ u16 new_master;
+ u16 dead_node;
+ u16 sending_node;
+ u32 next_seq;
+ util_thread_info thread;
+} dlm_recovery_ctxt;
+
+
+struct _dlm_ctxt
+{
+ struct list_head list;
+ struct list_head *resources;
+ struct list_head dirty_list;
+ spinlock_t spinlock;
+ struct rw_semaphore recovery_sem;
+ char *name;
+ char *net_buf;
+ util_thread_info thread;
+ struct inode *group;
+ u32 key;
+ u16 group_index;
+ u32 node_map[8];
+ u32 recovery_map[8];
+ dlm_recovery_ctxt reco;
+};
+
+#define DLM_LOCK_RES_UNINITED 0x00000001
+#define DLM_LOCK_RES_RECOVERING 0x00000002
+#define DLM_LOCK_RES_READY 0x00000004
+#define DLM_LOCK_RES_DIRTY 0x00000008
+#define DLM_LOCK_RES_IN_PROGRESS 0x00000010
+
+typedef struct _dlm_lock_resource
+{
+ struct list_head list;
+ struct list_head granted;
+ struct list_head converting;
+ struct list_head blocked;
+ struct list_head dirty;
+ struct list_head recovering; // dlm_recovery_ctxt.resources list
+ spinlock_t spinlock;
+ wait_queue_head_t wq;
+ u16 owner; // node which owns the lock resource, or unknown
+ u16 state;
+ struct qstr lockname;
+ char lvb[DLM_LVB_LEN];
+} dlm_lock_resource;
+
+typedef void (dlm_astlockfunc_t)(void *);
+typedef void (dlm_bastlockfunc_t)(void *, int);
+typedef void (dlm_astunlockfunc_t)(void *, dlm_status);
+
+typedef struct _dlm_lockstatus dlm_lockstatus;
+
+typedef struct _dlm_lock
+{
+ struct list_head list;
+ struct list_head ast_list;
+ dlm_lock_resource *lockres;
+ spinlock_t spinlock;
+
+ s8 type;
+ s8 convert_type;
+ s8 highest_blocked;
+ s8 reserved1;
+ u16 node;
+ u16 reserved2;
+
+ dlm_astlockfunc_t *ast; // ast and bast must be callable while holding a spinlock!
+ dlm_bastlockfunc_t *bast;
+ void *astdata;
+ u64 cookie;
+ dlm_lockstatus *lksb;
+} dlm_lock;
+
+
+struct _dlm_lockstatus {
+ dlm_status status;
+ dlm_lock *lockid;
+ char lvb[DLM_LVB_LEN];
+};
+
+enum {
+ DLM_MLE_BLOCK,
+ DLM_MLE_MASTER
+};
+
+typedef struct _dlm_lock_name
+{
+ u8 len;
+ u8 name[0]; // [DLM_LOCKID_NAME_MAX]
+} dlm_lock_name;
+
+/* good god this needs to be trimmed down */
+typedef struct _dlm_master_list_entry
+{
+ struct list_head list;
+ dlm_ctxt *dlm;
+ spinlock_t spinlock;
+ wait_queue_head_t wq;
+ atomic_t woken;
+ atomic_t refcnt;
+ u32 maybe_map[8];
+ u32 vote_map[8];
+ u32 response_map[8];
+ u32 node_map[8];
+ u16 master;
+ u8 error;
+ u8 type; // BLOCK or MASTER
+ union {
+ dlm_lock_resource *res;
+ dlm_lock_name name;
+ } u;
+} dlm_master_list_entry;
+
+void dlm_put_mle(dlm_master_list_entry *mle);
+static inline void dlm_get_mle(dlm_master_list_entry *mle)
+{
+ atomic_inc(&mle->refcnt);
+}
+
+
+#define DLM_MASTER_REQUEST_MSG 500
+#define DLM_MASTER_REQUEST_RESP_MSG 501
+#define DLM_ASSERT_MASTER_MSG 502
+#define DLM_CREATE_LOCK_MSG 503
+#define DLM_CONVERT_LOCK_MSG 504
+#define DLM_PROXY_AST_MSG 505
+#define DLM_UNLOCK_LOCK_MSG 506
+
+
+enum {
+ DLM_MASTER_RESP_NO,
+ DLM_MASTER_RESP_YES,
+ DLM_MASTER_RESP_MAYBE,
+ DLM_MASTER_RESP_ERROR
+};
+
+typedef struct _dlm_master_request
+{
+ u16 node_idx;
+ u8 namelen;
+ u8 name[NM_MAX_NAME_LEN];
+} dlm_master_request;
+
+typedef struct _dlm_master_request_resp
+{
+ u16 node_idx;
+ u8 response;
+ u8 namelen;
+ u8 name[NM_MAX_NAME_LEN];
+} dlm_master_request_resp;
+
+typedef struct _dlm_assert_master
+{
+ u16 node_idx;
+ u8 namelen;
+ u8 name[NM_MAX_NAME_LEN];
+} dlm_assert_master;
+
+
+
+
+
+void dlm_shuffle_lists(dlm_ctxt *dlm, dlm_lock_resource *res);
+void dlm_thread_run_lock_resources(dlm_ctxt *dlm);
+int dlm_thread(void *data);
+int dlm_launch_thread(dlm_ctxt *dlm);
+void dlm_complete_thread(dlm_ctxt *dlm);
+
+dlm_status dlmlock(dlm_ctxt *dlm, int mode, dlm_lockstatus *lksb, int flags, char *name,
+ dlm_astlockfunc_t *ast, void *data, dlm_bastlockfunc_t *bast);
+
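+/*
+ * Typical dlmlock() usage (illustrative sketch only; my_ast/my_bast/my_data
+ * are placeholder names, not part of this API):
+ *
+ *	static void my_ast(void *astdata);		   // lock granted/converted
+ *	static void my_bast(void *astdata, int blocked);   // asked to downconvert
+ *
+ *	status = dlmlock(dlm, LKM_EXMODE, &lksb, 0, "mylock",
+ *			 my_ast, my_data, my_bast);
+ *	// completion is signalled through the ast; lksb.status holds the dlm_status
+ */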
+
+dlm_status do_dlmlock(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lockstatus *lksb,
+ int flags, int type, dlm_astlockfunc_t *ast,
+ dlm_bastlockfunc_t *bast, void *data);
+dlm_status dlmlock_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags);
+dlm_status dlmlock_remote(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags);
+
+dlm_status do_dlmconvert(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type);
+dlm_status dlmconvert_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type);
+dlm_status dlmconvert_remote(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type);
+
+dlm_status dlmunlock(dlm_ctxt *dlm, dlm_lockstatus *lksb, int flags, dlm_astunlockfunc_t *unlockast, void *data);
+dlm_status dlmunlock_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, dlm_lockstatus *lksb, int flags, int *call_ast);
+
+dlm_ctxt * dlm_register_domain(char *domain, char *group_name, u32 key);
+void dlm_unregister_domain(dlm_ctxt *dlm);
+dlm_lock_resource * dlm_get_lock_resource(dlm_ctxt *dlm, struct qstr *lockname, int flags);
+int dlm_lock_owner_broadcast(dlm_ctxt *dlm, dlm_lock_resource *res);
+int dlm_refresh_lock_resource(dlm_ctxt *dlm, dlm_lock_resource *res);
+int dlm_do_ast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock);
+int dlm_do_bast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int blocked_type);
+u16 dlm_nm_this_node(dlm_ctxt *dlm);
+void dlm_kick_thread(dlm_ctxt *dlm, dlm_lock_resource *res);
+
+int dlm_nm_init(dlm_ctxt *dlm);
+int dlm_heartbeat_init(dlm_ctxt *dlm);
+
+dlm_lock_resource * dlm_lookup_lock(dlm_ctxt *dlm, struct qstr *lockname);
+dlm_ctxt * dlm_lookup_domain(char *domain);
+
+void dlm_hb_node_down_cb(struct inode *group, struct inode *node, int idx, void *data);
+void dlm_hb_node_up_cb(struct inode *group, struct inode *node, int idx, void *data);
+int dlm_hb_node_dead(dlm_ctxt *dlm, int node);
+int dlm_hb_node_up(dlm_ctxt *dlm, int node);
+int __dlm_hb_node_dead(dlm_ctxt *dlm, int node);
+int __dlm_hb_node_up(dlm_ctxt *dlm, int node);
+
+int dlm_lock_owner_broadcast(dlm_ctxt *dlm, dlm_lock_resource *res);
+int dlm_master_request_handler(net_msg *msg, u32 len, void *data);
+int dlm_master_request_resp_handler(net_msg *msg, u32 len, void *data);
+int dlm_assert_master_handler(net_msg *msg, u32 len, void *data);
+int dlm_do_master_request(dlm_master_list_entry *mle, int to);
+int dlm_do_master_request_resp(dlm_ctxt *dlm, struct qstr *name, int response, int to);
+int dlm_do_assert_master(dlm_master_list_entry *mle);
+void dlm_mle_node_down(struct inode *group, struct inode *node, int idx, void *data);
+void dlm_mle_node_up(struct inode *group, struct inode *node, int idx, void *data);
+dlm_lock_resource * __dlm_lookup_lock(dlm_ctxt *dlm, struct qstr *lockname);
+void dlm_init_lockres(dlm_lock_resource *res, struct qstr *lockname);
+void dlm_wait_on_lockres(dlm_lock_resource *res);
+void dlm_dump_everything(void);
+void dlm_dump_dlm(dlm_ctxt *dlm);
+
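+/*
+ * mode compatibility (1 = compatible):
+ *
+ *                request
+ *               NL  PR  EX
+ *  existing NL   1   1   1
+ *           PR   1   1   0
+ *           EX   1   0   0
+ */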
+static inline int dlm_lock_compatible(int existing, int request)
+{
+ /* NO_LOCK compatible with all */
+ if (request == LKM_NLMODE ||
+ existing == LKM_NLMODE)
+ return 1;
+
+ /* EX incompatible with all non-NO_LOCK */
+ if (request == LKM_EXMODE)
+ return 0;
+
+ /* request must be PR, which is compatible with PR */
+ if (existing == LKM_PRMODE)
+ return 1;
+
+ return 0;
+}
+
+static inline int dlm_lock_on_list(struct list_head *head, dlm_lock *lock)
+{
+ struct list_head *iter;
+ dlm_lock *tmplock;
+
+ list_for_each(iter, head) {
+ tmplock = list_entry(iter, dlm_lock, list);
+ if (tmplock == lock)
+ return 1;
+ }
+ return 0;
+}
+
+static inline int dlm_mle_equal(dlm_ctxt *dlm, dlm_master_list_entry *mle, struct qstr *lockname)
+{
+ dlm_lock_resource *res;
+
+ if (dlm != mle->dlm)
+ return 0;
+
+ if (mle->type == DLM_MLE_BLOCK) {
+ if (lockname->len != mle->u.name.len ||
+ strncmp(lockname->name, mle->u.name.name, lockname->len)!=0)
+ return 0;
+ } else {
+ res = mle->u.res;
+ if (res->lockname.hash != lockname->hash ||
+ res->lockname.len != lockname->len ||
+ strncmp(res->lockname.name, lockname->name, lockname->len)!=0)
+ return 0;
+ }
+ return 1;
+}
+
+static inline dlm_status dlm_err_to_dlm_status(int err)
+{
+ dlm_status ret;
+ if (err == -ENOMEM)
+ ret = DLM_SYSERR;
+ else if (err == -ETIMEDOUT || net_link_down(err, NULL))
+ ret = DLM_NOLOCKMGR;
+ else if (err == -EINVAL)
+ ret = DLM_BADPARAM;
+ else if (err == -ENAMETOOLONG)
+ ret = DLM_IVBUFLEN;
+ else
+ ret = DLM_BADARGS;
+ return ret;
+}
+
+#endif /* CLUSTER_DLMMOD_H */
Added: branches/dlm-glue/cluster/dlmrecovery.c
===================================================================
--- branches/dlm-glue/cluster/dlmrecovery.c 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/dlmrecovery.c 2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,705 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmrecovery.c
+ *
+ * recovery stuff
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+
+
+#include "heartbeat.h"
+#include "nodemanager.h"
+#include "tcp.h"
+#include "dlmmod.h"
+
+static void dlm_do_local_recovery_cleanup(dlm_ctxt *dlm, u16 dead_node, int locked);
+
+int dlm_recovery_thread(void *data);
+void dlm_complete_recovery_thread(dlm_ctxt *dlm);
+int dlm_launch_recovery_thread(dlm_ctxt *dlm);
+void dlm_kick_recovery_thread(dlm_ctxt *dlm);
+
+u16 dlm_pick_recovery_master(dlm_ctxt *dlm, u16 *new_dead_node);
+static int dlm_remaster_locks_local(dlm_ctxt *dlm);
+int dlm_init_recovery_area(dlm_ctxt *dlm, u16 dead_node, u16 num_nodes);
+int dlm_request_all_locks(dlm_ctxt *dlm, u16 request_from, u16 dead_node);
+void dlm_destroy_recovery_area(dlm_ctxt *dlm, u16 dead_node);
+
+#define DLM_RECOVERY_THREAD_MS 2000
+
+#if 0
+/*
+ * RECOVERY THREAD
+ */
+
+void dlm_kick_recovery_thread(dlm_ctxt *dlm)
+{
+ /* wake the recovery thread */
+ atomic_set(&dlm->reco.thread.woken, 1);
+ wake_up(&dlm->reco.thread.thread_wq);
+}
+
+/* Launch the recovery thread */
+int dlm_launch_recovery_thread(dlm_ctxt *dlm)
+{
+ printk("starting recovery thread...\n");
+ dlm->reco.thread.pid = kernel_thread (dlm_recovery_thread, dlm, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+ if (dlm->reco.thread.pid < 0) {
+ printk("unable to launch recovery thread, error=%d", dlm->reco.thread.pid);
+ return -EINVAL;
+ }
+ printk("recovery thread running...\n");
+ return 0;
+}
+
+void dlm_complete_recovery_thread(dlm_ctxt *dlm)
+{
+ printk ("waiting for recovery thread to exit....");
+ send_sig (SIGINT, dlm->reco.thread.task, 0);
+ wait_for_completion (&dlm->reco.thread.complete);
+ printk ("recovery thread exited\n");
+ dlm->reco.thread.task = NULL;
+}
+
+ /*
+ * this is lame, but here's how recovery works...
+ * 1) all recovery threads cluster wide will work on recovering
+ * ONE node at a time
+ * 2) negotiate who will take over all the locks for the dead node.
+ *    that's right... ALL the locks.
+ * 3) once a new master is chosen, everyone scans all locks
+ * and moves aside those mastered by the dead guy
+ * 4) each of these locks should be locked until recovery is done
+ * 5) the new master collects up all of the secondary lock queue info
+ * one lock at a time, forcing each node to communicate back
+ * before continuing
+ * 6) each secondary lock queue responds with the full known lock info
+ * 7) once the new master has run all its locks, it sends an ALLDONE!
+ * message to everyone
+ * 8) upon receiving this message, the secondary queue node unlocks
+ * and responds to the ALLDONE
+ * 9) once the new master gets responses from everyone, it unlocks
+ * everything and recovery for this dead node is done
+ *10) go back to 2) while there are still dead nodes
+ *
+ */
+
+
+
+int dlm_recovery_thread(void *data)
+{
+ int status, i;
+ int cnt = 0, dlm_num;
+ struct list_head *iter, *iter2, *tmpiter;
+ dlm_lock_resource *res;
+ char name[12];
+ dlm_ctxt *dlm = data;
+ u16 tmp;
+
+
+ dlm_num = nm_get_group_global_index(dlm->group);
+ sprintf(name, "dlmreco-%03u", dlm_num);
+ util_daemonize (name, strlen(name), 1);
+ dlm->reco.thread.task = current;
+
+ while (1) {
+ spin_lock(&dlm->spinlock);
+
+ /* check to see if the new master has died */
+ if (dlm->reco.new_master != NM_INVALID_SLOT_NUM &&
+ test_bit(dlm->reco.new_master, dlm->recovery_map)) {
+ printk("new master %u died while recovering %u!\n",
+ dlm->reco.new_master, dlm->reco.dead_node);
+ // unset the new_master, leave dead_node
+ dlm->reco.new_master = NM_INVALID_SLOT_NUM;
+ }
+
+ /* select a target to recover */
+ if (dlm->reco.dead_node == NM_INVALID_SLOT_NUM) {
+ dlm->reco.dead_node = find_next_bit (dlm->recovery_map, NM_MAX_NODES, 0);
+ if (dlm->reco.dead_node >= NM_MAX_NODES)
+ dlm->reco.dead_node = NM_INVALID_SLOT_NUM;
+ } else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
+ // BUG?
+ printk("dead_node %u no longer in recovery map!\n",
+ dlm->reco.dead_node);
+ dlm->reco.dead_node = NM_INVALID_SLOT_NUM;
+ }
+
+ spin_unlock(&dlm->spinlock);
+
+ if (dlm->reco.dead_node == NM_INVALID_SLOT_NUM) {
+ printk("nothing to recover! sleeping now!\n");
+ goto sleep;
+ }
+
+ /* take write barrier */
+ /* (stops the list reshuffling thread, proxy ast handling) */
+ down_write(&dlm->recovery_sem);
+
+ /* choose a new master */
+ if (dlm->reco.new_master == NM_INVALID_SLOT_NUM) {
+ u16 new_dead_node = dlm->reco.dead_node;
+ dlm->reco.new_master = dlm_pick_recovery_master(dlm, &new_dead_node);
+ if (new_dead_node != dlm->reco.dead_node) {
+ // master wants to recover a different node
+ dlm->reco.dead_node = new_dead_node;
+
+ // do local cleanup if heartbeat has not added the
+ // node to the recovery map yet
+ spin_lock(&dlm->spinlock);
+ if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
+ dlm_do_local_recovery_cleanup(dlm, dlm->reco.dead_node, 1);
+ set_bit(dlm->reco.dead_node, dlm->recovery_map);
+ clear_bit(dlm->reco.dead_node, dlm->node_map);
+ }
+ spin_unlock(&dlm->spinlock);
+ }
+ }
+
+
+ if (dlm->reco.new_master == dlm->group_index) {
+ status = dlm_remaster_locks_local(dlm);
+ if (status < 0) {
+ printk("error remastering locks for node %u!!!! retrying!\n",
+ dlm->reco.dead_node);
+ } else {
+ // success! see if any other nodes need recovery
+ spin_lock(&dlm->spinlock);
+ clear_bit(dlm->reco.dead_node, dlm->recovery_map);
+ spin_unlock(&dlm->spinlock);
+ dlm->reco.dead_node = NM_INVALID_SLOT_NUM;
+ dlm->reco.new_master = NM_INVALID_SLOT_NUM;
+ dlm->reco.sending_node = NM_INVALID_SLOT_NUM;
+ dlm->reco.next_seq = 0;
+ }
+ up_write(&dlm->recovery_sem);
+ // pick another dead node
+ continue;
+ } else {
+ // sit around until new_master is dead or done
+ // we will get signalled by the waitqueue either way
+ printk("new_master %u is recovering dead_node %u... waiting...\n",
+ dlm->reco.new_master, dlm->reco.dead_node);
+ }
+
+ up_write(&dlm->recovery_sem);
+
+sleep:
+ atomic_set(&dlm->reco.thread.woken, 0);
+ status = util_wait_atomic_eq(&dlm->reco.thread.thread_wq,
+ &dlm->reco.thread.woken,
+ 1, DLM_RECOVERY_THREAD_MS);
+ if (status == 0 || status == -ETIMEDOUT) {
+ if (atomic_read(&dlm->reco.thread.woken))
+ printk("aha!!! recovery thread woken!\n");
+ else
+ printk("timed out waiting, running again\n");
+ continue;
+ }
+ printk("recovery thread got %d while waiting\n", status);
+ break;
+ }
+
+ flush_scheduled_work();
+ complete (&dlm->reco.thread.complete);
+ printk("quitting recovery thread!!!!!!\n");
+ return 0;
+}
+
+/* +- if this node is NOT the new master... */
+/* +--- if master's dead_node is not the one we chose, do local cleanup again with proper dead_node */
+/* +--- wait for poll messages from new master: register net message handler, it will do the work */
+/* +--- check for death of new master */
+/* +--- if dead, unregister the handler, unset new_master, keep dead_node and goto "select a target" */
+/* |- on request, send header with number of packets, get response, then start blasting packets */
+/* |- retransmit any missed packets on request */
+/* |- once ALL DONE is received, run all locks again */
+/* +--- unset the RECOVERING flag */
+/* +--- set the new owner as new_master */
+/* +--- remove dead_node from recovery map */
+/* +--- unset new_master and dead_node and start all over */
+
+
+static int dlm_remaster_locks_local(dlm_ctxt *dlm)
+{
+ int num_nodes = 255, i, status = 0;
+ u32 node_map[8];
+
+
+/* +- if this node is the new master, init the temp recovery area */
+/* |- poll each live node for lock state */
+/* |- collect the data from each node until node says it's done, or dead */
+/* +--- if node died, throw away temp recovery area, keep new_master and dead_node, goto "select a target" */
+/* |- apply all temp area changes to real lock */
+/* +- send ALL DONE message to each node */
+
+
+ status = dlm_init_recovery_area(dlm, dlm->reco.dead_node, num_nodes);
+ if (status < 0)
+ return status;
+
+ spin_lock(&dlm->spinlock);
+ num_nodes = nm_get_group_max_slots(dlm->group);
+ memcpy(node_map, dlm->node_map, sizeof(node_map));
+ spin_unlock(&dlm->spinlock);
+
+ for (i=0; i<num_nodes; i++) {
+ if (test_bit(i, node_map)) {
+ spin_lock(&dlm->spinlock);
+ dlm->reco.sending_node = i;
+ dlm->reco.next_seq = 0;
+ spin_unlock(&dlm->spinlock);
+ status = dlm_request_all_locks(dlm, i, dlm->reco.dead_node);
+ if (status < 0) {
+ spin_lock(&dlm->spinlock);
+ dlm->reco.sending_node = NM_INVALID_SLOT_NUM;
+ dlm->reco.next_seq = 0;
+ spin_unlock(&dlm->spinlock);
+ dlm_destroy_recovery_area(dlm, dlm->reco.dead_node);
+ return status;
+ }
+ }
+ }
+ return status;
+}
+
+int dlm_request_all_locks(dlm_ctxt *dlm, u16 request_from, u16 dead_node)
+{
+ printk("dlm_request_all_locks: dead node is %u, sending request to %u\n",
+ dead_node, request_from);
+ // send message
+ // sleep until all received or error
+ return 0;
+}
+
+#endif
+
+#if 0
+
+int dlm_recovery_request_handler(net_msg *msg, u32 len, void *data);
+int dlm_recovery_response_handler(net_msg *msg, u32 len, void *data);
+int dlm_recovery_lock_arr_req_handler(net_msg *msg, u32 len, void *data);
+
+typedef struct _dlm_reco_lock_info
+{
+ u16 node;
+ u16 unused1;
+ u64 cookie;
+ s8 type;
+ s8 convert_type;
+ u8 list;
+ u8 lockname_len;
+ u8 lockname[DLM_LOCKID_NAME_MAX];
+} dlm_reco_lock_info;
+
+enum {
+ DLM_RECO_MASTER_REQUEST,
+ DLM_RECO_XMIT_LOCKS_REQUEST,
+ DLM_RECO_XMIT_LOCK_HDR_REQUEST,
+ DLM_RECO_XMIT_LOCK_ARR_REQUEST,
+ DLM_RECO_XMIT_COMPLETE_REQUEST,
+ DLM_RECO_ALL_DONE_REQUEST
+};
+
+enum {
+ DLM_RECO_NO_RESPONSE,
+ DLM_RECO_YES_RESPONSE
+};
+
+#define DLM_LOCKS_PER_PACKET 40
+
+typedef struct _dlm_reco_lock_arr_req
+{
+ u8 request_type;
+ u8 num_locks;
+ u16 dead_node;
+ u32 seqnum;
+ dlm_reco_lock_info lock[DLM_LOCKS_PER_PACKET];
+} dlm_reco_lock_arr_req;
+
+typedef struct _dlm_reco_request
+{
+ u8 request_type;
+ u8 unused1;
+ u16 dead_node;
+ u32 num;
+} dlm_reco_request;
+
+typedef struct _dlm_reco_response
+{
+ u8 response_type;
+ u8 unused1[7];
+} dlm_reco_response;
+
+static inline int dlm_reco_lock_info_valid(dlm_reco_lock_info *info)
+{
+ if (info->type != LKM_NLMODE &&
+ info->type != LKM_PRMODE &&
+ info->type != LKM_EXMODE)
+ return 0;
+ if (info->convert_type != LKM_NLMODE &&
+ info->convert_type != LKM_PRMODE &&
+ info->convert_type != LKM_EXMODE)
+ return 0;
+ if (info->list > 2)
+ return 0;
+ return 1;
+}
+
+static inline int dlm_check_reco_lock_arr_msg(net_msg *msg, dlm_ctxt *dlm, int *out_of_order);
+
+static inline int dlm_check_reco_lock_arr_msg(net_msg *msg, dlm_ctxt *dlm, int *out_of_order)
+{
+ int ret = -EINVAL;
+ dlm_reco_lock_arr_req *req = (dlm_reco_lock_arr_req *)msg->buf;
+
+ /* check a bunch of ugly conditions */
+ *out_of_order = 0;
+ if (req->num_locks > DLM_LOCKS_PER_PACKET) {
+ printk("num_locks too large! %u\n", req->num_locks);
+ } else if (req->seqnum != dlm->reco.next_seq) {
+ printk("expected seq %lu from node %u, got %lu\n",
+ dlm->reco.next_seq, msg->src_node,
+ req->seqnum);
+ *out_of_order = 1;
+ } else if (dlm->reco.dead_node != req->dead_node) {
+		printk("bad lock array: dead node=%u, sent=%u\n",
+		       dlm->reco.dead_node, req->dead_node);
+ } else if (dlm->reco.new_master != dlm->group_index) {
+ printk("this node is not the recovery master!\n");
+ } else if (dlm->reco.sending_node != msg->src_node ||
+ dlm->group_index == msg->dest_node) {
+ printk("eek. sending_node=%u, actual=%u, dest=%u, me=%u\n",
+ dlm->reco.sending_node, msg->src_node,
+ msg->dest_node, dlm->group_index);
+ } else
+ ret = 0;
+ return ret;
+}
+
+
+/*
+ * gawd i hate udp
+ */
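+/*
+ * handler for one array of lock info sent by a live node: validate the
+ * header, allocate a dlm_lock per entry and queue them on reco.received.
+ * (only runs on the node that is currently the recovery master)
+ */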
+int dlm_recovery_lock_arr_req_handler(net_msg *msg, u32 len, void *data)
+{
+ dlm_ctxt *dlm = data;
+ dlm_reco_lock_arr_req *req = (dlm_reco_lock_arr_req *)msg->buf;
+ dlm_lock_resource *res = NULL;
+ dlm_reco_lock_info *info;
+ dlm_lock **newlocks = NULL;
+ dlm_lock *lock = NULL;
+ int ret, i, out_of_order = 0;
+
+ // TODO: ntoh(req)
+
+ ret = 0;
+ if (req->num_locks == 0)
+ goto send_response;
+
+ /* check to see if it's worth kmallocing */
+ spin_lock(&dlm->spinlock);
+ ret = dlm_check_reco_lock_arr_msg(msg, dlm, &out_of_order);
+ spin_unlock(&dlm->spinlock);
+ if (ret < 0)
+ goto send_response;
+
+ newlocks = kmalloc(req->num_locks * sizeof(dlm_lock *), GFP_KERNEL);
+ if (!newlocks) {
+ printk("failed to alloc temp lock array!\n");
+ ret = -ENOMEM;
+ goto send_response;
+ }
+ memset(newlocks, 0, req->num_locks * sizeof(dlm_lock *));
+ for (i=0; i<req->num_locks; i++) {
+ info = &(req->lock[i]);
+ if (!dlm_reco_lock_info_valid(info)) {
+ ret = -EINVAL;
+ goto send_response;
+ }
+ lock = newlocks[i] = kmem_cache_alloc(dlm_lock_cache, GFP_KERNEL);
+ if (!newlocks[i]) {
+ ret = -ENOMEM;
+ goto send_response;
+ }
+ memset(lock, 0, sizeof(dlm_lock));
+		INIT_LIST_HEAD(&lock->list);
+		INIT_LIST_HEAD(&lock->ast_list);
+ spin_lock_init(&lock->spinlock);
+ lock->type = info->type;
+ lock->convert_type = info->convert_type;
+ lock->node = dlm->group_index;
+ //atomic_set(&lock->ast_lock, 0);
+ //atomic_set(&lock->bast_lock, 0);
+ lock->ast = NULL;
+ lock->bast = NULL;
+ lock->astdata = (void *)info->list; // cheating here...
+ lock->cookie = info->cookie;
+ }
+
+ spin_lock(&dlm->spinlock);
+ /* ok now that everything is allocated and the lock has
+ * been taken again, recheck all those stupid conditions */
+ ret = dlm_check_reco_lock_arr_msg(msg, dlm, &out_of_order);
+ if (ret < 0) {
+ spin_unlock(&dlm->spinlock);
+ goto send_response;
+ }
+ for (i=0; i<req->num_locks; i++) {
+ info = &(req->lock[i]);
+ lock = newlocks[i];
+ list_add_tail(&lock->list, &dlm->reco.received);
+ }
+ spin_unlock(&dlm->spinlock);
+
+send_response:
+ if (newlocks) {
+ if (ret < 0) {
+ for (i=0; i<req->num_locks; i++)
+ if (newlocks[i])
+					kmem_cache_free(dlm_lock_cache, newlocks[i]);
+ }
+ kfree(newlocks);
+ }
+
+ return ret;
+}
+
+/* TODO: these recovery message handlers are still stubs */
+int dlm_recovery_request_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+
+	return 0;
+}
+
+int dlm_recovery_response_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+
+	return 0;
+}
+
+
+
+
+
+static int dlm_send_reco_request(dlm_ctxt *dlm, dlm_reco_request *buf, u16 to, struct inode *node)
+{
+ int ret;
+ net_msg *msg = net_package_message(DLM_NET_RECOVERY_REQUEST_MSG_TYPE,
+ dlm->key, buf, sizeof(*buf),
+ dlm->group_index, to);
+ if (!msg)
+ return -ENOMEM;
+ ret = net_send_udp_msg (node, msg, sizeof(*buf));
+ kfree(msg);
+ return ret;
+}
+
+static int dlm_recover_domain(dlm_ctxt *dlm)
+{
+
+
+ return 0;
+}
+
+
+#endif
+
+#warning may need to change kfree to put_lock and refcounting here
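+/*
+ * walk every hash bucket: resources owned by the dead node are marked
+ * RECOVERING and moved to the recovery list; for resources this node
+ * masters, the dead node's locks are dropped from all three queues.
+ */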
+static void dlm_do_local_recovery_cleanup(dlm_ctxt *dlm, u16 dead_node, int locked)
+{
+ struct list_head *iter, *iter2, *tmpiter;
+ dlm_lock_resource *res;
+ dlm_lock *lock;
+ int i;
+ struct list_head *bucket;
+
+ if (!locked)
+ spin_lock(&dlm->spinlock);
+
+ for (i=0; i<DLM_HASH_SIZE; i++) {
+ bucket = &(dlm->resources[i]);
+ list_for_each(iter, bucket) {
+ res = list_entry (iter, dlm_lock_resource, list);
+ spin_lock(&res->spinlock);
+ if (res->owner == dead_node) {
+ res->state |= DLM_LOCK_RES_RECOVERING;
+ list_del(&res->recovering);
+ list_add_tail(&res->recovering, &dlm->reco.resources);
+ } else if (res->owner == dlm->group_index) {
+ list_for_each_safe(iter2, tmpiter, &res->granted) {
+ lock = list_entry (iter2, dlm_lock, list);
+ if (lock->node == dead_node) {
+ list_del(&lock->list);
+ kfree(lock);
+ }
+ }
+ list_for_each_safe(iter2, tmpiter, &res->converting) {
+ lock = list_entry (iter2, dlm_lock, list);
+ if (lock->node == dead_node) {
+ list_del(&lock->list);
+ kfree(lock);
+ }
+ }
+ list_for_each_safe(iter2, tmpiter, &res->blocked) {
+ lock = list_entry (iter2, dlm_lock, list);
+ if (lock->node == dead_node) {
+ list_del(&lock->list);
+ kfree(lock);
+ }
+ }
+ }
+ spin_unlock(&res->spinlock);
+ }
+ }
+
+ if (!locked)
+ spin_unlock(&dlm->spinlock);
+}
+
+
+void dlm_hb_node_down_cb(struct inode *group, struct inode *node, int idx, void *data)
+{
+ //int ret;
+ //struct inode *group = ptr1;
+ //struct inode *node = ptr2;
+ dlm_ctxt *dlm = data;
+
+ spin_lock(&dlm->spinlock);
+
+ if (!test_bit(idx, dlm->node_map))
+ printk("node %u already removed from nodemap!\n", idx);
+ else
+ clear_bit(idx, dlm->node_map);
+
+ if (test_bit(idx, dlm->recovery_map))
+ printk("node %u already added to recovery map!\n", idx);
+ else {
+ set_bit(idx, dlm->recovery_map);
+ dlm_do_local_recovery_cleanup(dlm, idx, 1);
+ }
+ spin_unlock(&dlm->spinlock);
+}
+
+void dlm_hb_node_up_cb(struct inode *group, struct inode *node, int idx, void *data)
+{
+ //struct inode *group = ptr1;
+ //struct inode *node = ptr2;
+ dlm_ctxt *dlm = data;
+
+ spin_lock(&dlm->spinlock);
+
+ if (test_bit(idx, dlm->recovery_map)) {
+ printk("BUG!!! node up message on node in recovery (%u)!!!\n", idx);
+ } else {
+ if (test_bit(idx, dlm->node_map))
+ printk("node %u already in node map!!!\n", idx);
+ else
+ set_bit(idx, dlm->node_map);
+ }
+
+ spin_unlock(&dlm->spinlock);
+}
+
+int __dlm_hb_node_dead(dlm_ctxt *dlm, int node)
+{
+ if (test_bit(node, dlm->recovery_map))
+ return 1;
+ return 0;
+}
+
+int __dlm_hb_node_up(dlm_ctxt *dlm, int node)
+{
+ if (test_bit(node, dlm->node_map))
+ return 1;
+ return 0;
+}
+
+int dlm_hb_node_dead(dlm_ctxt *dlm, int node)
+{
+ int ret;
+ spin_lock(&dlm->spinlock);
+ ret = __dlm_hb_node_dead(dlm, node);
+ spin_unlock(&dlm->spinlock);
+ return ret;
+}
+
+int dlm_hb_node_up(dlm_ctxt *dlm, int node)
+{
+ int ret;
+ spin_lock(&dlm->spinlock);
+ ret = __dlm_hb_node_up(dlm, node);
+ spin_unlock(&dlm->spinlock);
+ return ret;
+}
+
+u16 dlm_pick_recovery_master(dlm_ctxt *dlm, u16 *new_dead_node)
+{
+ u16 master = 0;
+#if 0
+ dlm_status ret;
+ dlm_lockstatus lksb;
+
+ ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
+ DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast);
+
+ if (ret == DLM_NORMAL) {
+ // I am master
+ // send message to all nodes saying that I am beginning a recovery session for node XX,
+ // then call dlmunlock???
+
+ } else if (ret == DLM_NOTQUEUED) {
+ // another node is master
+ // wait on reco.new_master != NM_INVALID_SLOT_NUM
+ }
+
+ // at this point, every node in this domain should have reco.new_master and .dead_node set, even
+ // if they have not discovered the dead node on their own
+ //
+ //
+ // atomic_set(&dlm->reco.thread.woken, 0);
+ // 232 status = util_wait_atomic_eq(&dlm->reco.thread.thread_wq,
+ // 233 &dlm->reco.thread.woken,
+ // 234 1, DLM_RECOVERY_THREAD_MS);
+ //
+#endif
+ return master;
+}
Added: branches/dlm-glue/cluster/dlmthread.c
===================================================================
--- branches/dlm-glue/cluster/dlmthread.c 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/dlmthread.c 2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,329 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmthread.c
+ *
+ * standalone DLM module
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+
+
+#include "heartbeat.h"
+#include "nodemanager.h"
+#include "tcp.h"
+#include "dlmmod.h"
+
+extern spinlock_t dlm_domain_lock;
+extern struct list_head dlm_domains;
+extern u16 dlm_global_index;
+
+#define dlm_lock_is_remote(dlm, lock) ((lock)->node != (dlm)->group_index)
+
+/*
+ * DLM THREAD
+ */
+
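+/*
+ * re-evaluate one lock resource: grant whatever on the converting and
+ * blocked queues is compatible with the currently granted locks, and
+ * send basts to the locks that are in the way.
+ */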
+void dlm_shuffle_lists(dlm_ctxt *dlm, dlm_lock_resource *res)
+{
+ dlm_lock *lock, *target;
+ struct list_head *iter, *tmpiter;
+ LIST_HEAD(bast_list);
+ struct list_head *head;
+ s8 hi;
+
+ spin_lock(&res->spinlock);
+
+#if 0
+ {
+ int g=0, c=0, b=0;
+ list_for_each(iter, &res->granted) {
+ g++;
+ }
+ list_for_each(iter, &res->converting) {
+ c++;
+ }
+ list_for_each(iter, &res->blocked) {
+ b++;
+ }
+ printk("(%d) granted: %d, converting: %d, blocked: %d\n", current->pid, g, c, b);
+ }
+#endif
+
+converting:
+ if (list_empty(&res->converting))
+ goto blocked;
+ target = list_entry(res->converting.next, dlm_lock, list);
+ if (target->convert_type == LKM_IVMODE) {
+ printk("eeek!!! converting a lock with no convert_type!!!!\n");
+ BUG();
+ }
+ head = &res->granted;
+ list_for_each(iter, head) {
+ lock = list_entry(iter, dlm_lock, list);
+ if (lock==target)
+ continue;
+ if (!dlm_lock_compatible(lock->type, target->convert_type)) {
+ if (lock->highest_blocked == LKM_IVMODE)
+ list_add(&lock->ast_list, &bast_list);
+			if (lock->highest_blocked < target->convert_type)
+				lock->highest_blocked = target->convert_type;
+ }
+ }
+ head = &res->converting;
+ list_for_each(iter, head) {
+ lock = list_entry(iter, dlm_lock, list);
+ if (lock==target)
+ continue;
+ if (!dlm_lock_compatible(lock->type, target->convert_type)) {
+ if (lock->highest_blocked == LKM_IVMODE)
+ list_add(&lock->ast_list, &bast_list);
+			if (lock->highest_blocked < target->convert_type)
+				lock->highest_blocked = target->convert_type;
+ }
+ }
+
+ /* we can convert the lock */
+ if (list_empty(&bast_list)) {
+ spin_lock(&target->spinlock);
+ DLM_ASSERT(target->highest_blocked == LKM_IVMODE);
+
+		dlmprintk("calling ast for converting lock: %.*s, have: %d, granting: %d, node: %u\n",
+			  res->lockname.len, res->lockname.name, target->type, target->convert_type, target->node);
+
+ target->type = target->convert_type;
+ target->convert_type = LKM_IVMODE;
+ list_del(&target->list);
+ list_add_tail(&target->list, &res->granted);
+
+ if (target->node == dlm->group_index) {
+ DLM_ASSERT(target->lksb);
+ DLM_ASSERT(target->lksb->status);
+
+ target->lksb->status = DLM_NORMAL;
+ } else {
+ dlmprintk0("nonlocal lock, not setting DLM_NORMAL in lksb\n");
+ }
+
+ spin_unlock(&target->spinlock);
+
+ if (dlm_do_ast(dlm, res, target) < 0)
+ printk("eek\n");
+ /* go back and check for more */
+ goto converting;
+ }
+
+blocked:
+ if (list_empty(&res->blocked)) {
+ goto basts;
+ }
+ target = list_entry(res->blocked.next, dlm_lock, list);
+
+ head = &res->granted;
+ list_for_each(iter, head) {
+ lock = list_entry(iter, dlm_lock, list);
+ if (lock==target)
+ continue;
+ if (!dlm_lock_compatible(lock->type, target->type)) {
+ if (lock->highest_blocked == LKM_IVMODE)
+ list_add(&lock->ast_list, &bast_list);
+			if (lock->highest_blocked < target->type)
+				lock->highest_blocked = target->type;
+ }
+ }
+
+ head = &res->converting;
+ list_for_each(iter, head) {
+ lock = list_entry(iter, dlm_lock, list);
+ if (lock==target)
+ continue;
+ if (!dlm_lock_compatible(lock->type, target->type)) {
+ if (lock->highest_blocked == LKM_IVMODE)
+ list_add(&lock->ast_list, &bast_list);
+			if (lock->highest_blocked < target->type)
+				lock->highest_blocked = target->type;
+ }
+ }
+
+ /* we can grant the blocked lock (only
+ * possible if converting list empty) */
+ if (list_empty(&bast_list)) {
+ spin_lock(&target->spinlock);
+ DLM_ASSERT(target->highest_blocked == LKM_IVMODE);
+
+		dlmprintk("calling ast for blocked lock: %.*s, granting: %d, node: %u\n",
+			  res->lockname.len, res->lockname.name, target->type, target->node);
+
+ // target->type is already correct
+ list_del(&target->list);
+ list_add_tail(&target->list, &res->granted);
+
+ if (target->node == dlm->group_index) {
+ DLM_ASSERT(target->lksb);
+ DLM_ASSERT(target->lksb->status);
+
+ target->lksb->status = DLM_NORMAL;
+ } else {
+ dlmprintk0("nonlocal lock, not setting DLM_NORMAL in lksb\n");
+ }
+
+ spin_unlock(&target->spinlock);
+
+ if (dlm_do_ast(dlm, res, target) < 0)
+ printk("eek\n");
+ /* go back and check for more */
+ goto converting;
+ }
+
+basts:
+ list_for_each_safe(iter, tmpiter, &bast_list) {
+ lock = list_entry(iter, dlm_lock, ast_list);
+ spin_lock(&lock->spinlock);
+ DLM_ASSERT(lock->highest_blocked > LKM_IVMODE);
+ hi = lock->highest_blocked;
+ lock->highest_blocked = LKM_IVMODE;
+ list_del(&lock->ast_list);
+ spin_unlock(&lock->spinlock);
+
+ if (dlm_do_bast(dlm, res, lock, hi) < 0)
+ printk("eeek\n");
+ }
+ spin_unlock(&res->spinlock);
+}
+
+
+/* must have NO locks when calling this */
+void dlm_kick_thread(dlm_ctxt *dlm, dlm_lock_resource *res)
+{
+ if (res) {
+ spin_lock(&dlm->spinlock);
+ spin_lock(&res->spinlock);
+ if (!(res->state & DLM_LOCK_RES_DIRTY)) {
+ list_add_tail(&res->dirty, &dlm->dirty_list);
+ res->state |= DLM_LOCK_RES_DIRTY;
+ }
+ spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
+ }
+
+ /* wake the dlm thread */
+ atomic_set(&dlm->thread.woken, 1);
+ wake_up(&dlm->thread.thread_wq);
+}
+
+/* Launch the NM thread for the mounted volume */
+int dlm_launch_thread(dlm_ctxt *dlm)
+{
+ printk("starting dlm thread...\n");
+ dlm->thread.pid = kernel_thread (dlm_thread, dlm, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+ if (dlm->thread.pid < 0) {
+ printk("unable to launch dlm thread, error=%d", dlm->thread.pid);
+ return -EINVAL;
+ }
+ printk("dlm thread running for %s...\n", dlm->name);
+ return 0;
+}
+
+void dlm_complete_thread(dlm_ctxt *dlm)
+{
+ printk ("waiting for dlm thread to exit....");
+ send_sig (SIGINT, dlm->thread.task, 0);
+ wait_for_completion (&dlm->thread.complete);
+ printk ("dlm thread exited\n");
+ dlm->thread.task = NULL;
+}
+
+
+
+
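+/*
+ * background thread: shuffles the lists of every dirty resource this
+ * node masters, then sleeps for DLM_THREAD_MS or until kicked.
+ */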
+int dlm_thread(void *data)
+{
+ int status;
+ struct list_head *iter, *tmpiter;
+ dlm_lock_resource *res;
+ dlm_ctxt *dlm = data;
+
+ util_daemonize ("dlm_thread", strlen("dlm_thread"), 1);
+ dlm->thread.task = current;
+
+ while (1) {
+ down_read(&dlm->recovery_sem);
+ spin_lock(&dlm->spinlock);
+ list_for_each_safe(iter, tmpiter, &dlm->dirty_list) {
+ res = list_entry(iter, dlm_lock_resource, dirty);
+ /* don't shuffle secondary queues */
+ if (res->owner != dlm->group_index)
+ continue;
+ dlm_shuffle_lists(dlm, res);
+ spin_lock(&res->spinlock);
+ list_del(&res->dirty);
+ res->state &= ~DLM_LOCK_RES_DIRTY;
+ spin_unlock(&res->spinlock);
+ }
+ spin_unlock(&dlm->spinlock);
+ up_read(&dlm->recovery_sem);
+
+ atomic_set(&dlm->thread.woken, 0);
+ status = util_wait_atomic_eq(&dlm->thread.thread_wq,
+ &dlm->thread.woken,
+ 1, DLM_THREAD_MS);
+
+ if (status == 0 || status == -ETIMEDOUT) {
+#if 0
+ if (atomic_read(&dlm->thread.woken))
+ printk("aha!!! dlm thread woken!\n");
+ else
+ printk("timed out waiting, running again\n");
+#endif
+ continue;
+ }
+
+ printk("DLM thread got %d while waiting\n", status);
+ break;
+ }
+
+ flush_scheduled_work();
+ complete (&dlm->thread.complete);
+ printk("quitting DLM thread!!!!!!\n");
+ return 0;
+}
Added: branches/dlm-glue/cluster/heartbeat.c
===================================================================
--- branches/dlm-glue/cluster/heartbeat.c 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/heartbeat.c 2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,869 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.c
+ *
+ * Keeps track of alive nodes in the cluster.
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+
+#include "warning_hack.h"
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/module.h>
+
+#include <linux/linkage.h>
+#include <linux/time.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/net.h>
+#include <linux/unistd.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+
+#include <asm/uaccess.h>
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+
+#include "compat_libfs.h"
+
+#ifndef __user
+#define __user
+#endif
+
+
+static void hb_teardown(void);
+static void hb_nm_group_node_add_cb(void *ptr1, void *ptr2, u16 idx);
+static void hb_nm_group_node_del_cb(void *ptr1, void *ptr2, u16 idx);
+static void hb_nm_node_add_cb(void *ptr1, void *ptr2, u16 idx);
+static void hb_nm_group_add_cb(void *ptr1, void *ptr2, u16 idx);
+static int hb_init_disk_hb_group(struct inode *group, kdev_t dev, u32 bits, u32 blocks, u64 start);
+static ssize_t write_disk(struct file *file, char *buf, size_t size);
+static void hb_do_callbacks(int type, void *ptr1, void *ptr2, int idx);
+static void hb_end_buffer_io_sync(struct buffer_head *bh, int uptodate);
+static int hb_do_node_down(struct inode *group, struct inode *node, int idx);
+static int hb_do_node_up(struct inode *group, struct inode *node, int idx);
+static int hb_do_disk_heartbeat(void *page);
+static int hb_thread(void *data);
+static void hb_complete_thread(void);
+static void hb_kick_thread(void);
+static int hb_launch_thread(void);
+static inline int hb_wait_on_callback_state(int type);
+
+
+
+/* globals */
+extern char *nm_nodename;
+static spinlock_t hb_lock = SPIN_LOCK_UNLOCKED;
+static LIST_HEAD(hb_net_groups);
+static LIST_HEAD(hb_disk_groups);
+static int hb_callback_state[HB_NUM_CB];
+struct list_head hb_callbacks[HB_NUM_CB];
+static spinlock_t hb_cb_lock = SPIN_LOCK_UNLOCKED;
+static struct task_struct *hb_task = NULL;
+static atomic_t hb_thread_woken = ATOMIC_INIT(0);
+static DECLARE_WAIT_QUEUE_HEAD(hb_thread_wait_queue);
+static struct completion hb_complete;
+static int hb_pid = -1;
+
+static wait_queue_head_t hb_cb_wq;
+static atomic_t hb_cb_ready = ATOMIC_INIT(0);
+
+
+static void hb_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+{
+ if (uptodate)
+ set_buffer_uptodate(bh);
+ else {
+ printk("eek! EIO!\n");
+ clear_buffer_uptodate(bh);
+ }
+ unlock_buffer(bh);
+}
+
+
+
+static int hb_do_node_down(struct inode *group, struct inode *node, int idx)
+{
+ int ret;
+ printk("hb_do_node_down: group=%lu, node=%lu\n", group->i_ino, node->i_ino);
+ printk("NOT removing node from group\n");
+ //ret = nm_remove_node_from_group(group, node);
+ hb_do_callbacks(HB_NODE_DOWN_CB, group, node, 0);
+ return 0;
+}
+
+static int hb_do_node_up(struct inode *group, struct inode *node, int idx)
+{
+ printk("hb_do_node_up: group=%lu, node=%lu\n", group->i_ino, node->i_ino);
+ hb_do_callbacks(HB_NODE_UP_CB, group, node, 0);
+ return 0;
+}
+
+static inline void hb_submit_bh(int rw, struct buffer_head *bh)
+{
+ printk("submit_bh: rw=%s, blocknr=%lu, mapped=%s\n",
+ rw==WRITE?"write":"read", bh->b_blocknr,
+ buffer_mapped(bh) ? "yes" : "no");
+ submit_bh(rw, bh);
+}
+
+
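+/*
+ * one heartbeat pass over every disk-heartbeat group: write our own
+ * timestamp block, read everyone else's, and compare with the last
+ * value seen.  a node whose block stops changing for 'margin' passes
+ * is declared dead; node-up/down callbacks are run at the end.
+ */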
+static int hb_do_disk_heartbeat(void *page)
+{
+ nm_group_inode_private *priv;
+ struct inode *group, *node;
+ struct list_head *iter;
+ struct buffer_head *bh;
+ hb_disk_slot *slot;
+ hb_disk_heartbeat_block *hb_block;
+ int ino, idx, ret, i;
+ struct inode **dead_nodes, **live_nodes;
+ LIST_HEAD(tmplist);
+ u64 blkno;
+ cluster_disk *disk;
+
+ // NM_MAX_NODES is 255
+ dead_nodes = page;
+ live_nodes = page + (sizeof(struct inode *) * 256);
+
+ spin_lock(&hb_lock);
+ list_splice_init(&hb_disk_groups, &tmplist);
+ spin_unlock(&hb_lock);
+
+ list_for_each(iter, &tmplist) {
+ priv = list_entry(iter, nm_group_inode_private, disk_list);
+ group = priv->inode;
+ disk = &priv->disk;
+
+ memset(page, 0, PAGE_SIZE);
+ down(&group->i_sem);
+
+ idx = 0;
+ while ((slot = nm_iterate_group_disk_slots(group, &idx))) {
+retry_submit:
+ bh = slot->bh;
+ node = slot->inode;
+
+ ino = nm_get_node_global_index(node);
+
+ if (ino == nm_this_node(group)) {
+ lock_buffer(bh);
+ if (!buffer_mapped(bh)) {
+ blkno = (unsigned long long) bh->b_blocknr;
+ unlock_buffer(bh);
+ brelse(bh);
+ slot->bh = getblk(disk->dev,
+ blkno,
+ (1 << disk->blocksize_bits));
+ goto retry_submit;
+ }
+ memset(bh->b_data, 0, bh->b_size);
+ hb_block = (hb_disk_heartbeat_block *)bh->b_data;
+ hb_block->time = CURRENT_TIME;
+ if (!hb_block->time)
+ hb_block->time = 1;
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
+ bh->b_end_io = hb_end_buffer_io_sync;
+ hb_submit_bh(WRITE, bh);
+ } else {
+ lock_buffer(bh);
+ if (!buffer_mapped(bh)) {
+ blkno = (unsigned long long) bh->b_blocknr;
+ unlock_buffer(bh);
+ brelse(bh);
+ slot->bh = getblk(disk->dev,
+ blkno,
+ (1 << disk->blocksize_bits));
+ goto retry_submit;
+ }
+ clear_buffer_uptodate(bh);
+ bh->b_end_io = hb_end_buffer_io_sync;
+ hb_submit_bh(READ, bh);
+ }
+ idx++;
+ }
+
+ idx = 0;
+ while ((slot = nm_iterate_group_disk_slots(group, &idx))) {
+ bh = slot->bh;
+ node = slot->inode;
+
+ ino = nm_get_node_global_index(node);
+
+ wait_on_buffer(bh);
+ hb_block = (hb_disk_heartbeat_block *)bh->b_data;
+ if (hb_block->time != slot->last_time) {
+ if (slot->state == HB_NODE_STATE_INIT) {
+ printk("first time for this node!\n");
+ live_nodes[ino] = node;
+ slot->state = HB_NODE_STATE_UP;
+ }
+ node->i_atime = hb_block->time;
+ slot->last_time = hb_block->time;
+ slot->margin = HB_DISK_MARGIN;
+ hb_do_callbacks(HB_NODE_RESPONDED_CB, group, node, HB_TYPE_DISK);
+ } else {
+ slot->margin--;
+ printk("node %d missed. margin=%d\n", ino, slot->margin);
+ }
+
+ if (ino != nm_this_node(group) && slot->margin <= 0) {
+ printk("node %d JUST DIED!!!!\n", ino);
+ dead_nodes[ino] = node;
+ slot->state = HB_NODE_STATE_DOWN;
+ }
+ idx++;
+ }
+
+ up(&group->i_sem);
+
+		/* Don't hold group->i_sem while doing node-up/down
+		 * callbacks.  Changes may need to be made to the group,
+		 * so i_sem will be needed... */
+ for (i=0; i<NM_MAX_NODES; i++) {
+ if (live_nodes[i])
+ ret = hb_do_node_up(group, live_nodes[i], i);
+ else if (dead_nodes[i])
+ ret = hb_do_node_down(group, dead_nodes[i], i);
+ }
+ }
+
+ spin_lock(&hb_lock);
+ list_splice(&tmplist, &hb_disk_groups);
+ spin_unlock(&hb_lock);
+ return 0;
+}
+
+
+static int hb_thread(void *data)
+{
+ int status;
+ void *page;
+
+ page = (void *) __get_free_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ util_daemonize ("hb_thread", strlen("hb_thread"), 1);
+ hb_task = current;
+
+ while (1) {
+ status = hb_do_disk_heartbeat(page);
+
+ atomic_set(&hb_thread_woken, 0);
+ status = util_wait_atomic_eq(&hb_thread_wait_queue,
+ &hb_thread_woken,
+ 1, HB_THREAD_MS);
+
+ if (status == 0 || status == -ETIMEDOUT) {
+#if 0
+ if (atomic_read(&hb_thread_woken))
+ printk("aha!!! hb thread woken!\n");
+ else
+ printk("hb thread timed out waiting, running again\n");
+#endif
+ continue;
+ }
+ printk("hb thread got %d while waiting\n", status);
+ break;
+ }
+
+ flush_scheduled_work();
+ complete (&hb_complete);
+ printk("quitting hb thread!!!!!!\n");
+ return 0;
+}
+
+
+static void hb_kick_thread(void)
+{
+ atomic_set(&hb_thread_woken, 1);
+ wake_up(&hb_thread_wait_queue);
+}
+
+/* Launch the hb thread for the mounted volume */
+static int hb_launch_thread(void)
+{
+ hb_pid = -1;
+ hb_task = NULL;
+ init_completion (&hb_complete);
+
+ printk("starting hb thread...\n");
+ hb_pid = kernel_thread (hb_thread, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+ if (hb_pid < 0) {
+ printk("unable to launch hb thread, error=%d", hb_pid);
+ return -EINVAL;
+ }
+ printk("hb thread running...\n");
+ return 0;
+}
+
+static void hb_complete_thread(void)
+{
+ printk ("waiting for hb thread to exit....");
+ send_sig (SIGINT, hb_task, 0);
+ wait_for_completion (&hb_complete);
+ printk ("hb thread exited\n");
+ hb_task = NULL;
+}
+
+
+
+
+
+
+
+static int hb_init_disk_hb_group(struct inode *group, kdev_t dev, u32 bits, u32 blocks, u64 start)
+{
+ int ret = -EINVAL;
+ cluster_disk *disk;
+ nm_group_inode_private *priv;
+
+ priv = group->u.generic_ip;
+ if (!priv)
+ goto leave;
+
+ if (priv->state == NM_GROUP_READY)
+ return 0;
+
+ /* hold an extra ref as long as hb keeps track of the group */
+ igrab(group);
+
+ disk = &priv->disk;
+ if (blocks > NM_MAX_NODES)
+ blocks = NM_MAX_NODES;
+ disk->dev = dev;
+ disk->blocksize_bits = bits;
+ disk->num_blocks = blocks;
+ disk->start_block = start;
+ util_init_rarray(&disk->slots, sizeof(hb_disk_slot));
+
+ /* start allowing group additions */
+ ret = nm_make_group_ready(group);
+
+leave:
+ if (ret < 0)
+ iput(group);
+
+ return ret;
+}
+
+
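+/*
+ * handler for writes to the "hb" filesystem's .disk file: parses an
+ * hb_op and either starts disk heartbeat for a group or returns the
+ * group's current node map in the reply buffer.
+ */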
+static ssize_t write_disk(struct file *file, char *buf, size_t size)
+{
+ hb_op *data;
+ struct inode *group = NULL;
+ struct file *filp = NULL;
+ kdev_t dev;
+ int ret, tmpret;
+ nm_group_inode_private *priv;
+ u32 tmpmap[8];
+
+ printk("write_disk\n");
+
+ if (size < sizeof(*data))
+ return -EINVAL;
+	data = (hb_op *) buf;
+	if (data->magic != HB_OP_MAGIC)
+ return -EINVAL;
+
+ switch (data->opcode)
+ {
+ case HB_OP_START_DISK_HEARTBEAT:
+ if (data->bits < 9 || data->bits > 12) {
+ ret = sprintf(buf, "%d: bad blocksize bits! %u", -EINVAL, data->bits);
+ break;
+ }
+ group = nm_get_group_by_num(data->group_num);
+ if (!group || !group->u.generic_ip) {
+ ret = sprintf(buf, "%d: bad group number! %u", -EINVAL, data->group_num);
+ break;
+ }
+ priv = group->u.generic_ip;
+ if (strncmp(priv->disk.uuid, data->disk_uuid, CLUSTER_DISK_UUID_LEN) != 0) {
+ ret = sprintf(buf, "%d: bad disk uuid!", -EINVAL);
+ break;
+ }
+ filp = fget(data->fd);
+ if (!filp) {
+ ret = sprintf(buf, "%d: bad fd!", -EINVAL);
+ break;
+ }
+ dev = filp->f_dentry->d_inode->i_rdev;
+ tmpret = hb_init_disk_hb_group(group, dev, data->bits, data->blocks, data->start);
+ if (tmpret < 0) {
+ fput(filp);
+ ret = sprintf(buf, "%d: failed to init disk heartbeat for group %u!",
+ -EINVAL, data->group_num);
+ } else {
+ ret = sprintf(buf, "0: disk heartbeat started for group %u!",
+ data->group_num);
+ }
+ break;
+
+ case HB_OP_GET_NODE_MAP:
+ group = nm_get_group_by_num(data->group_num);
+ if (!group || !group->u.generic_ip) {
+ ret = sprintf(buf, "%d: bad group number! %u", -EINVAL, data->group_num);
+ break;
+ }
+
+ if ((ret = hb_fill_node_map(group, tmpmap, sizeof(tmpmap))) == 0) {
+ ret = sprintf(buf, "0: ");
+ buf += ret;
+ memcpy(buf, tmpmap, sizeof(tmpmap));
+ ret += sizeof(tmpmap);
+ } else {
+ ret = sprintf(buf, "%d: error occurred in hb_fill_node_map", ret);
+ }
+ break;
+
+ default:
+ ret = sprintf(buf, "%d: bad opcode! %u", -EINVAL, data->opcode);
+ break;
+ }
+
+ if (group)
+ iput(group);
+
+ return ret;
+}
+
+
+extern struct file_operations transaction_ops;
+
+/*----------------------------------------------------------------------------*/
+/*
+ * populating the filesystem.
+ */
+static int hb_fill_super(struct super_block * sb, void * data, int silent)
+{
+ int ret;
+ TA_write_ops *ops;
+ static struct tree_descr hb_files[] = {
+ [HB_Disk] = {".disk", &transaction_ops, S_IWUSR},
+ /* last one */ {""}
+ };
+
+	ops = kmalloc(sizeof(TA_write_ops) + (HB_WriteOpArraySize * sizeof(TA_write_op *)), GFP_KERNEL);
+ if (!ops)
+ return -ENOMEM;
+
+	memset(ops, 0, sizeof(TA_write_ops) + (HB_WriteOpArraySize * sizeof(TA_write_op *)));
+ ops->num_ops = HB_WriteOpArraySize;
+ ops->write_op[HB_Disk] = write_disk;
+
+ printk("calling simple_fill_super...\n");
+ ret = simple_fill_super(sb, 0x5551212f, hb_files);
+ if (ret >= 0)
+ TA_GENERIC_SB_MEMBER(sb) = ops;
+ else
+ kfree(ops);
+ return ret;
+}
+
+static struct super_block *hb_read_super (struct super_block *sb, void *data, int silent)
+{
+ printk("welcome to hb_read_super!!!\n");
+ return (hb_fill_super(sb, data, silent) < 0) ? NULL : sb;
+}
+
+
+static DECLARE_FSTYPE (hb_fs_type, "hb", hb_read_super, FS_SINGLE|FS_LITTER);
+
+
+/* TODO: make callbacks all return int */
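+/*
+ * called when a node is added to a group: for disk-heartbeat groups,
+ * grab a slot for it and map the buffer_head for its heartbeat block.
+ */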
+static void hb_nm_group_node_add_cb(void *ptr1, void *ptr2, u16 idx)
+{
+ hb_disk_slot *slot;
+ struct inode *group = ptr1;
+ struct inode *node = ptr2;
+ cluster_disk *disk;
+ nm_group_inode_private *priv;
+ int ino, ret = 0;
+ u64 block;
+
+ printk("hb_nm_group_node_add_cb: group=%lu, node=%lu, idx=%u\n",
+ group->i_ino, node->i_ino, idx);
+
+ down(&group->i_sem);
+ priv = group->u.generic_ip;
+ if (!priv) {
+ printk("eek! bad group inode!\n");
+ goto leave;
+ }
+ disk = &priv->disk;
+ if (disk->uuid[0]) {
+ ret = util_resize_rarray(&disk->slots, idx+1);
+ if (ret < 0) {
+ printk("eeeeeeek!!!! failed to resize disk state data\n");
+ goto leave;
+ }
+
+ ino = nm_get_node_global_index(node);
+ if (ino > disk->num_blocks) {
+ printk("disk heartbeat area does not have enough blocks!\n");
+ goto leave;
+ }
+ block = ino + disk->start_block;
+
+ slot = util_rarray_idx_to_slot(&disk->slots, idx);
+ if (!slot) {
+ printk("eeeeeeek!!!! failed to get disk state data pointer: %d\n", idx);
+ goto leave;
+ }
+ slot->inode = igrab(node);
+ slot->last_time = 0;
+ slot->margin = HB_INITIAL_DISK_MARGIN;
+#warning needs to change for 2.6
+ slot->bh = getblk(disk->dev, (int)block, (1 << disk->blocksize_bits));
+ slot->state = HB_NODE_STATE_INIT;
+ } else {
+ printk("doing nothing for group add for non-disk heartbeat group\n");
+ }
+
+leave:
+ up(&group->i_sem);
+ return;
+}
+
+static void hb_nm_group_node_del_cb(void *ptr1, void *ptr2, u16 idx)
+{
+ hb_disk_slot *slot;
+ struct inode *group = ptr1;
+ struct inode *node = ptr2;
+ cluster_disk *disk;
+ nm_group_inode_private *priv;
+ int ret = -EINVAL;
+
+ printk("hb_nm_group_node_del_cb: group=%lu, node=%lu, idx=%u\n",
+ group->i_ino, node->i_ino, idx);
+
+ down(&group->i_sem);
+ priv = group->u.generic_ip;
+ if (!priv) {
+ printk("eek! bad group inode!\n");
+ goto leave;
+ }
+ disk = &priv->disk;
+ slot = util_rarray_idx_to_slot(&disk->slots, idx);
+ if (!slot) {
+ printk("eeeeeeek!!!! failed to get disk state data pointer: %d\n", idx);
+ goto leave;
+ }
+ if (slot->inode!=node) {
+ printk("eeeeeeek!!!! node inode changed!\n");
+ goto leave;
+ }
+ iput(node);
+ if (slot->bh) {
+ wait_on_buffer(slot->bh);
+ brelse(slot->bh);
+ }
+ memset(slot, 0, sizeof(hb_disk_slot));
+ ret = 0;
+leave:
+
+ up(&group->i_sem);
+ printk("hb_nm_group_node_del_cb done: %d\n", ret);
+ return;
+}
+
+static void hb_nm_node_add_cb(void *ptr1, void *ptr2, u16 idx)
+{
+ //struct inode *node = ptr1;
+}
+
+static void hb_nm_group_add_cb(void *ptr1, void *ptr2, u16 idx)
+{
+ struct inode *group = ptr1;
+ nm_group_inode_private *priv;
+
+ printk("hb_nm_group_add_cb: group=%lu, idx=%u\n",
+ group->i_ino, idx);
+
+ priv = group->u.generic_ip;
+ if (!priv) {
+ printk("eek! bad group inode!\n");
+ return;
+ }
+
+ spin_lock(&hb_lock);
+ list_add_tail(&priv->net_list, &hb_net_groups);
+ if (priv->disk.uuid[0]) {
+ printk("adding priv=%p inode=%p to disk group list\n", priv, group);
+ list_add_tail(&priv->disk_list, &hb_disk_groups);
+ }
+ spin_unlock(&hb_lock);
+}
+
+enum {
+ HB_CB_STATE_FROZEN = 0,
+ HB_CB_STATE_READY
+};
+
+static int __init init_hb(void)
+{
+ int retval=-1, i;
+ printk("loading heartbeat module: nodename is %s\n", nm_nodename);
+
+ if (proc_mkdir("cluster/heartbeat", 0)) {
+ // ???
+ }
+
+ //hb_net_timestamps = __get_free_page(GFP_KERNEL);
+ //if (!hb_net_timestamps)
+ // goto done;
+
+ for (i=HB_NODE_DOWN_CB; i<HB_NUM_CB; i++)
+ INIT_LIST_HEAD(&hb_callbacks[i]);
+ init_waitqueue_head(&hb_cb_wq);
+ for (i=HB_NODE_DOWN_CB; i<HB_NUM_CB; i++)
+ hb_callback_state[i] = HB_CB_STATE_READY;
+
+ if (nm_register_callback(NM_GROUP_NODE_DEL_CB, hb_nm_group_node_del_cb))
+ goto done;
+ if (nm_register_callback(NM_GROUP_NODE_ADD_CB, hb_nm_group_node_add_cb))
+ goto done;
+ if (nm_register_callback(NM_NODE_ADD_CB, hb_nm_node_add_cb))
+ goto done;
+ if (nm_register_callback(NM_GROUP_ADD_CB, hb_nm_group_add_cb))
+ goto done;
+
+ if (hb_launch_thread() < 0)
+ goto done;
+
+ retval = register_filesystem(&hb_fs_type);
+done:
+ if (retval)
+ hb_teardown();
+ return retval;
+}
+
+static void __exit exit_hb(void)
+{
+ int i;
+ spin_lock(&hb_cb_lock);
+ for (i=HB_NODE_DOWN_CB; i<HB_NUM_CB; i++) {
+ hb_wait_on_callback_state(i);
+ hb_callback_state[i] = HB_CB_STATE_FROZEN;
+ }
+ spin_unlock(&hb_cb_lock);
+
+ hb_complete_thread();
+ hb_teardown();
+ unregister_filesystem(&hb_fs_type);
+ printk("unloading heartbeat module\n");
+}
+
+static void hb_teardown(void)
+{
+ nm_unregister_callback(NM_GROUP_NODE_DEL_CB, hb_nm_group_node_del_cb);
+ nm_unregister_callback(NM_GROUP_NODE_ADD_CB, hb_nm_group_node_add_cb);
+ nm_unregister_callback(NM_NODE_ADD_CB, hb_nm_node_add_cb);
+ nm_unregister_callback(NM_GROUP_ADD_CB, hb_nm_group_add_cb);
+ remove_proc_entry("cluster/heartbeat", NULL);
+ //if (hb_net_timestamps)
+ // kfree(hb_net_timestamps);
+}
+
+module_init(init_hb)
+module_exit(exit_hb)
+
+
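+/* fill 'map' (a bitmap, 'size' bytes long) with the nodes currently seen as up */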
+int hb_fill_node_map(struct inode *group, void *map, int size)
+{
+ hb_disk_slot *slot;
+ int idx = 0;
+ nm_group_inode_private *priv;
+
+ priv = group->u.generic_ip;
+
+ memset(map, 0, size);
+ down(&group->i_sem);
+
+ if (priv->disk.uuid[0]) {
+ while ((slot = nm_iterate_group_disk_slots(group, &idx))) {
+			if (idx >= size * 8) {
+				printk("map size (%d bytes) too small for bit idx (%d)\n",
+				       size, idx);
+ up(&group->i_sem);
+ return -EINVAL;
+ }
+ if (slot->state == HB_NODE_STATE_UP)
+ set_bit(idx, map);
+ idx++;
+ }
+ } else {
+ printk("filling straight from slot bitmap for non-disk heartbeat group\n");
+ memcpy(map, priv->slot_bitmap, size);
+ }
+
+ up(&group->i_sem);
+
+ return 0;
+}
+
+
+static inline int hb_wait_on_callback_state(int type)
+{
+ while (hb_callback_state[type] == HB_CB_STATE_FROZEN) {
+ spin_unlock(&hb_cb_lock);
+ atomic_set(&hb_cb_ready, 0);
+ if (util_wait_atomic_eq(&hb_cb_wq, &hb_cb_ready, 1, 0) == -EINTR) {
+ return -EINTR;
+ }
+ spin_lock(&hb_cb_lock);
+ }
+ return 0;
+}
+
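+/* register a heartbeat callback; lower 'priority' values are called first */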
+int hb_register_callback(int type, hb_cb_func *func, void *data, int priority)
+{
+ hb_callback_func *f, *tmp;
+ struct list_head *iter;
+ int ret;
+
+ if (type < HB_NODE_DOWN_CB || type >= HB_NUM_CB)
+ return -EINVAL;
+ f = kmalloc(sizeof(hb_callback_func), GFP_KERNEL);
+ if (f == NULL)
+ return -ENOMEM;
+ memset(f, 0, sizeof(hb_callback_func));
+ f->func = func;
+ f->data = data;
+ f->priority = priority;
+
+ spin_lock(&hb_cb_lock);
+ ret = hb_wait_on_callback_state(type);
+ if (ret < 0) {
+ spin_unlock(&hb_cb_lock);
+ kfree(f);
+ return ret;
+ }
+
+ list_for_each(iter, &hb_callbacks[type]) {
+ tmp = list_entry (iter, hb_callback_func, list);
+ if (priority < tmp->priority) {
+ list_add_tail(&f->list, iter);
+ spin_unlock(&hb_cb_lock);
+ return 0;
+ }
+ }
+ list_add_tail(&f->list, &hb_callbacks[type]);
+ spin_unlock(&hb_cb_lock);
+ return 0;
+}
+
+int hb_unregister_callback(int type, hb_cb_func *func, void *data)
+{
+ struct list_head *iter, *tmpiter;
+ int ret = -EINVAL;
+ hb_callback_func *f;
+
+ if (type < HB_NODE_DOWN_CB || type >= HB_NUM_CB)
+ return -EINVAL;
+
+ spin_lock(&hb_cb_lock);
+ ret = hb_wait_on_callback_state(type);
+ if (ret < 0) {
+ spin_unlock(&hb_cb_lock);
+ return ret;
+ }
+ hb_callback_state[type] = HB_CB_STATE_FROZEN;
+ spin_unlock(&hb_cb_lock);
+
+ list_for_each_safe(iter, tmpiter, &hb_callbacks[type]) {
+ f = list_entry (iter, hb_callback_func, list);
+ if (f->func == func && f->data == data) {
+ list_del(&f->list);
+ kfree(f);
+ ret = 0;
+ break;
+ }
+ }
+
+ spin_lock(&hb_cb_lock);
+ hb_callback_state[type] = HB_CB_STATE_READY;
+ atomic_set(&hb_cb_ready, 1);
+ wake_up(&hb_cb_wq);
+ spin_unlock(&hb_cb_lock);
+ return ret;
+}
+
+
+
+static void hb_do_callbacks(int type, void *ptr1, void *ptr2, int idx)
+{
+ struct list_head *iter;
+ hb_callback_func *f;
+ int ret;
+
+ spin_lock(&hb_cb_lock);
+ ret = hb_wait_on_callback_state(type);
+ if (ret < 0) {
+ spin_unlock(&hb_cb_lock);
+ printk("missed hb callback(%d) due to EINTR!\n", type);
+ return;
+ }
+ hb_callback_state[type] = HB_CB_STATE_FROZEN;
+ spin_unlock(&hb_cb_lock);
+
+ list_for_each(iter, &hb_callbacks[type]) {
+ f = list_entry (iter, hb_callback_func, list);
+ (f->func) (ptr1, ptr2, idx, f->data);
+ }
+
+ spin_lock(&hb_cb_lock);
+ hb_callback_state[type] = HB_CB_STATE_READY;
+ atomic_set(&hb_cb_ready, 1);
+ wake_up(&hb_cb_wq);
+ spin_unlock(&hb_cb_lock);
+}
Added: branches/dlm-glue/cluster/heartbeat.h
===================================================================
--- branches/dlm-glue/cluster/heartbeat.h 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/heartbeat.h 2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,129 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_HEARTBEAT_H
+#define CLUSTER_HEARTBEAT_H
+
+
+enum {
+ HB_NODE_STATE_INIT = 0,
+ HB_NODE_STATE_DOWN,
+ HB_NODE_STATE_UP
+};
+
+struct _heartbeat_ctxt
+{
+ int dummy;
+};
+
+typedef struct _hb_disk_slot
+{
+ struct inode *inode;
+ struct buffer_head *bh;
+ struct list_head list;
+ unsigned long last_time;
+ u16 margin;
+ u16 state;
+} hb_disk_slot;
+
+
+
+#define HB_THREAD_MS 2000 // every 2 seconds
+
+
+#define HB_OP_MAGIC 0xf00d
+enum {
+ HB_OP_START_DISK_HEARTBEAT=371,
+ HB_OP_GET_NODE_MAP
+};
+
+typedef struct _hb_op
+{
+ u16 magic;
+ u16 opcode;
+ unsigned int fd;
+ char disk_uuid[CLUSTER_DISK_UUID_LEN+1];
+ u16 group_num;
+ u32 bits;
+ u32 blocks;
+ u64 start;
+} hb_op;
+
+enum {
+ HB_TYPE_DISK = 0,
+ HB_TYPE_NET
+};
+
+
+/* callback stuff */
+
+enum {
+ HB_NODE_DOWN_CB = 0,
+ HB_NODE_UP_CB,
+ HB_NODE_RESPONDED_CB, // this one is very chatty
+ HB_NUM_CB
+};
+
+typedef void (hb_cb_func)(struct inode *, struct inode *, int, void *);
+
+typedef struct _hb_callback_func
+{
+ struct list_head list;
+ hb_cb_func *func;
+ void *data;
+ int priority;
+} hb_callback_func;
+
+
+enum {
+ HB_Root = 1,
+ HB_Disk,
+ HB_WriteOpArraySize
+};
+
+typedef struct _hb_disk_heartbeat_block
+{
+ u64 time;
+} hb_disk_heartbeat_block;
+
+
+// number of initial allowed misses
+#define HB_INITIAL_DISK_MARGIN 60
+#define HB_INITIAL_NET_MARGIN 60
+
+// number of allowed misses in steady state
+#define HB_DISK_MARGIN 30
+#define HB_NET_MARGIN 30
+
+
+int hb_unregister_callback(int type, hb_cb_func *func, void *data);
+int hb_register_callback(int type, hb_cb_func *func, void *data, int priority);
+int hb_fill_node_map(struct inode *group, void *map, int size);
+
+
+
+#endif /* CLUSTER_HEARTBEAT_H */
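
The margin defines above only turn into wall-clock numbers in combination with
HB_THREAD_MS. Assuming the heartbeat thread decrements a slot's margin once per
missed 2-second pass -- which is what the "allowed misses" comments imply; the
decrement itself lives in heartbeat.c -- the defaults work out to roughly:

    HB_INITIAL_DISK_MARGIN * HB_THREAD_MS = 60 * 2000 ms = 120 s  (startup grace)
    HB_DISK_MARGIN         * HB_THREAD_MS = 30 * 2000 ms =  60 s  (steady state)

That is, a freshly added node gets about two minutes to write its first
heartbeat, and an established node is declared down after about a minute of
silence. The net margins use the same numbers.
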
Added: branches/dlm-glue/cluster/nodemanager.c
===================================================================
--- branches/dlm-glue/cluster/nodemanager.c 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/nodemanager.c 2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,1330 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * nodemanager.c
+ *
+ * totally lame static node management placeholder
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/module.h>
+
+#include <linux/linkage.h>
+#include <linux/time.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/net.h>
+#include <linux/unistd.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/pagemap.h>
+#include <linux/hash.h>
+
+#include <asm/uaccess.h>
+
+#include "tcp.h"
+#include "dlmmod.h"
+#include "nodemanager.h"
+#include "heartbeat.h"
+
+#include "compat_libfs.h"
+
+#ifndef __user
+#define __user
+#endif
+
+
+/*
+ * This nm module is similar to nfsd/nfsctl.c in that it uses
+ * transaction files (in /proc/cluster/nm) to communicate with
+ * the kernel module instead of ioctls or other means.
+ *
+ * Files involved:
+ * /proc/cluster/nm/cluster - used to create/destroy cluster, adds
+ * nodes/groups to the cluster, queries info
+ * about the cluster
+ * /proc/cluster/nm/group - adds/removes nodes from a group, queries
+ * info about a group
+ * /proc/cluster/nm/node - changes info for a node, queries info about
+ * a node
+ *
+ * This nm implementation basically allows this node to live in exactly one
+ * cluster. All "clustered" nodes that are known to this node should be
+ * added to the cluster, and all nodes should see the same list of nodes in
+ * the same order at all times. The "slot" number given to a node in this
+ * global cluster list is fixed and never changes. Groups can be dynamically
+ * created within a cluster (TODO: currently static only) and be made up of
+ * one or more nodes (listed at most once) in the global list. A node may exist
+ * in many groups. Also, a group may have an optional disk UUID which is simply
+ * stored for later use by the heartbeat service. (The heartbeat service will
+ * do disk heartbeating only for those groups with valid UUIDs.)
+ *
+ * USAGE:
+ * For our purposes, the nm service can be autoloaded by an fstab entry or manually
+ * through mount (mount -t nm none /proc/cluster/nm). Once that is done, an init
+ * script (or single executable on an initrd) should be run to create the static
+ * cluster info, possibly from a file like /etc/nm.conf or similar. We should
+ * probably create a "dlm" or "everyone" group (with NO disk heartbeating) so that
+ * the dlm service can be used with the network only. This group should contain
+ * all known nodes. After this is done, the net, hb and dlm modules can come up.
+ * The nm service is now ready for use, since groups don't need to be created till
+ * later.
+ *
+ * A group services daemon can be written (by someone!? ;-) to run at this point.
+ * Since the "dlm" group has everything it needs for full dlmming (since it uses
+ * only network), the dlm itself can be used to arbitrate for group creation,
+ * and additions/deletions from groups. Callbacks should be registered with nm by
+ * other services that care about each of these events. For instance, heartbeat should
+ * register a callback with nm for group creation, and addition and deletion from
+ * a group so that it can make any necessary changes to its heartbeating (primarily
+ * so that it can begin/end disk heartbeat for any group/node that needs it).
+ *
+ * NOTE NOTE NOTE !!!!:
+ * This is intended to be a quickie implementation. (translation: lame) I do not
+ * want to step on anyone's toes who may have implemented something wayyy better.
+ * If something out there "wins", we will plug into that instead. If nothing really
+ * takes off, we at least have a (lame) reference to work off of. However, since this
+ * implementation exists solely to make ocfs2 work, and one of the major advantages
+ * of ocfs version 1 was ease of setup, we don't want to move to something
+ * substantially more complicated than this (one conf file).
+ *
+ */
+
+
+
+/* globals */
+nm_cluster cluster;
+struct super_block *single_sb;
+char *nm_nodename;
+static spinlock_t nm_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t nm_cb_lock = SPIN_LOCK_UNLOCKED;
+struct list_head nm_callbacks[NM_NUM_CB];
+
+
+static void nm_teardown(void);
+static int nm_create_cluster(char *buf);
+static void nm_init_cluster(nm_cluster *cluster);
+int nm_create_node(char *buf, nm_op *data);
+int nm_name_cluster(char *buf, nm_op *data);
+int nm_destroy_cluster(char *buf);
+int nm_get_cluster_num_nodes(char *buf);
+int nm_get_cluster_num_groups(char *buf);
+int nm_get_node_info(char *buf, nm_op *data);
+int nm_get_group_info(char *buf, nm_op *data);
+nm_cluster *nm_get_cluster(void);
+struct inode *nm_get_group_by_name(char *node_name);
+struct inode *nm_get_node_by_name(char *node_name);
+int nm_init(dlm_ctxt *dlm);
+static void nm_do_callbacks(int type, void *ptr1, void *ptr2, u16 idx);
+
+/* support for adding files, dirs, hardlinks in /proc/cluster/nm/... */
+extern struct file_operations simple_dir_operations;
+extern struct inode_operations simple_dir_inode_operations;
+extern struct file_operations transaction_ops;
+
+static inline int nm_find_next_slot(void *bitmap, int max, int request);
+static struct dentry * nm_add_file(struct super_block *s, struct dentry *parent, struct tree_descr *file, int ino);
+static struct dentry * nm_add_link(struct super_block *s, struct dentry *parent, struct tree_descr *file, int ino);
+
+static ssize_t write_node(struct file *file, char *buf, size_t size);
+static ssize_t write_group(struct file *file, char *buf, size_t size);
+static ssize_t write_cluster(struct file *file, char *buf, size_t size);
+
+static struct inode * __nm_get_group_by_num(u16 group_num);
+static struct inode * __nm_get_node_by_num(u16 node_num);
+
+
+static u16 nm_get_group_index(struct inode *group, struct inode *inode, struct dentry **child);
+
+#define NM_HASH_BITS 7
+#define NM_HASH_SIZE (1 << NM_HASH_BITS)
+#define NM_HASH_MASK (NM_HASH_SIZE - 1)
+
+static struct list_head *nm_ip_hash = NULL;
+static spinlock_t nm_ip_hash_lock;
+
+static int nm_init_ip_hash(void);
+static void nm_destroy_ip_hash(void);
+
+
+static void nm_destroy_ip_hash(void)
+{
+ int i;
+ if (!nm_ip_hash)
+ return;
+ for (i=0; i<NM_HASH_SIZE; i++) {
+ /* TODO: cleanup */
+ }
+ free_page((unsigned long)nm_ip_hash);
+}
+
+static int nm_init_ip_hash(void)
+{
+ int i;
+
+ if ((PAGE_SIZE / sizeof(struct list_head)) < NM_HASH_SIZE) {
+ printk("eek! hash size too big for this arch!\n");
+ BUG();
+ }
+
+ nm_ip_hash = (struct list_head *) __get_free_page(GFP_KERNEL);
+ if (!nm_ip_hash)
+ return -ENOMEM;
+ for (i=0; i<NM_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&nm_ip_hash[i]);
+ spin_lock_init(&nm_ip_hash_lock);
+ return 0;
+}
+
+
+
+
+
+static inline int nm_find_next_slot(void *bitmap, int max, int request)
+{
+ int start = 0, slot_num;
+ if (request != NM_INVALID_SLOT_NUM)
+ start = request;
+ slot_num = find_next_zero_bit (bitmap, max, start);
+ if (slot_num >= max)
+ return -1;
+ if (request != NM_INVALID_SLOT_NUM && slot_num != request)
+ return -1;
+ set_bit(slot_num, bitmap);
+ return slot_num;
+}
+
+
+
+
+static struct dentry * nm_add_file(struct super_block *s, struct dentry *parent, struct tree_descr *file, int ino)
+{
+ struct qstr name;
+ struct dentry *dentry = ERR_PTR(-EINVAL);
+ struct inode *inode;
+
+ if (!file->name)
+ goto out;
+ name.name = file->name;
+ name.len = strlen(name.name);
+ printk("adding file %*s\n", name.len, name.name);
+ name.hash = full_name_hash(name.name, name.len);
+ dentry = d_alloc(parent, &name);
+ if (!dentry) {
+ dentry = ERR_PTR(-EINVAL);
+ goto out;
+ }
+ inode = new_inode(s);
+ if (!inode) {
+ dput(dentry);
+ dentry = ERR_PTR(-EINVAL);
+ goto out;
+ }
+ inode->i_mode = file->mode;
+ inode->i_uid = inode->i_gid = 0;
+ inode->i_blksize = PAGE_CACHE_SIZE;
+ inode->i_blocks = 0;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ if (file->mode & S_IFDIR) {
+ inode->i_op = &simple_dir_inode_operations;
+ inode->i_fop = &simple_dir_operations;
+ } else {
+ inode->i_fop = file->ops;
+ }
+ inode->i_ino = ino;
+ insert_inode_hash(inode);
+ d_add(dentry, inode);
+
+out:
+ return dentry;
+}
+
+
+static struct dentry * nm_add_link(struct super_block *s, struct dentry *parent, struct tree_descr *file, int ino)
+{
+ struct qstr name;
+ struct dentry *dentry = ERR_PTR(-EINVAL);
+ struct inode *inode;
+
+ if (!file->name)
+ goto out;
+ name.name = file->name;
+ name.len = strlen(name.name);
+ printk("adding link %*s\n", name.len, name.name);
+ name.hash = full_name_hash(name.name, name.len);
+ dentry = d_alloc(parent, &name);
+ if (!dentry) {
+ printk("failed to d_alloc\n");
+ dentry = ERR_PTR(-EINVAL);
+ goto out;
+ }
+ inode = iget(s, ino);
+ if (!inode) {
+ printk("failed to iget\n");
+ dput(dentry);
+ dentry = ERR_PTR(-EINVAL);
+ goto out;
+ }
+ if (!inode->u.generic_ip) {
+ printk("bad inode: %d\n", ino);
+ iput(inode);
+ dput(dentry);
+ dentry = ERR_PTR(-EINVAL);
+ goto out;
+ }
+ inode->i_nlink++;
+ d_add(dentry, inode);
+
+out:
+ return dentry;
+}
+
+
+
+
+
+/* cluster, node and group transaction files.
+ * here's where the actual work of nm takes place. */
+
+static int nm_create_cluster(char *buf)
+{
+ int ret = -EINVAL;
+
+ printk("create cluster...\n");
+
+ spin_lock(&nm_lock);
+ if (cluster.state == NM_CLUSTER_UP) {
+ ret = sprintf(buf, "%d: cluster already up\n", -EINVAL);
+ } else {
+ cluster.state = NM_CLUSTER_UP;
+ ret = sprintf(buf, "0: cluster state: UP");
+ }
+ spin_unlock(&nm_lock);
+ return ret;
+}
+
+
+
+int nm_create_group(char *buf, nm_op *data)
+{
+ struct tree_descr desc;
+ struct dentry *dentry = NULL;
+ struct inode *inode = NULL;
+ int ino, group_num;
+ int ret = -EINVAL;
+ nm_group_inode_private *g = NULL;
+
+ printk("create group...\n");
+
+ data->arg_u.gc.name[NM_MAX_NAME_LEN] = '\0';
+ inode = nm_get_group_by_name(data->arg_u.gc.name);
+ if (inode) {
+ ret = sprintf(buf, "%d: group %u (%s) already exists", -EEXIST,
+ nm_get_group_global_index(inode), data->arg_u.gc.name);
+ iput(inode);
+ return ret;
+ }
+
+ group_num = data->arg_u.gc.group_num;
+ if (group_num > NM_INVALID_SLOT_NUM)
+ goto leave;
+
+ spin_lock(&cluster.bitmap_lock);
+ group_num = nm_find_next_slot(&(cluster.group_bitmap[0]), 255, group_num);
+ spin_unlock(&cluster.bitmap_lock);
+
+ if (group_num < 0) {
+ printk("out of group slots!\n");
+ goto leave;
+ }
+
+ ino = group_num + NM_GROUP_INODE_START;
+
+ desc.name = data->arg_u.gc.name;
+ desc.ops = NULL;
+ desc.mode = S_IFDIR | 0755;
+ dentry = nm_add_file(single_sb, single_sb->s_root, &desc, ino);
+ if (IS_ERR(dentry))
+ goto leave;
+ inode = igrab(dentry->d_inode);
+ if (!inode) {
+ printk("igrab failed!\n");
+ goto leave;
+ }
+
+ g = kmalloc(sizeof(nm_group_inode_private), GFP_KERNEL);
+ if (!g)
+ goto leave;
+
+ memset(g, 0, sizeof(nm_group_inode_private));
+ memcpy(g->disk.uuid, data->arg_u.gc.disk_uuid, CLUSTER_DISK_UUID_LEN);
+ spin_lock_init(&g->bitmap_lock);
+ if (g->disk.uuid[0])
+ g->state = NM_GROUP_NOT_READY;
+ else
+ g->state = NM_GROUP_READY;
+ g->inode = inode;
+ inode->u.generic_ip = g;
+
+ ret = sprintf(buf, "0: group %u (%s) added, uuid: %s", group_num,
+ data->arg_u.gc.name, g->disk.uuid);
+ nm_do_callbacks(NM_GROUP_ADD_CB, inode, NULL, group_num);
+
+leave:
+ if (ret < 0) {
+ if (inode) {
+ if (inode->u.generic_ip)
+ kfree(inode->u.generic_ip);
+ iput(inode);
+ }
+ if (dentry)
+ dput(dentry);
+ }
+ return ret;
+}
+
+
+int nm_create_node(char *buf, nm_op *data)
+{
+ struct tree_descr desc;
+ struct dentry *dentry = NULL;
+ struct inode *inode = NULL;
+ int ino, node_num, bucket;
+ int ret = -EINVAL;
+ nm_node_inode_private *n = NULL;
+
+ printk("add cluster node ...\n");
+
+ data->arg_u.node.node_name[NM_MAX_NAME_LEN] = '\0';
+ inode = nm_get_node_by_name(data->arg_u.node.node_name);
+ if (inode) {
+ ret = sprintf(buf, "%d: node %u (%s) already exists", -EEXIST,
+ nm_get_node_global_index(inode),
+ data->arg_u.node.node_name);
+ iput(inode);
+ return ret;
+ }
+
+ node_num = data->arg_u.node.node_num;
+ if (node_num > NM_INVALID_SLOT_NUM) {
+ printk("bad node_num: %d\n", node_num);
+ goto leave;
+ }
+
+ spin_lock(&cluster.bitmap_lock);
+ node_num = nm_find_next_slot(&(cluster.node_bitmap[0]), 255, node_num);
+ spin_unlock(&cluster.bitmap_lock);
+
+ if (node_num < 0) {
+ printk("out of node slots!\n");
+ goto leave;
+ }
+
+ ino = node_num + NM_NODE_INODE_START;
+
+ desc.name = data->arg_u.node.node_name;
+ desc.ops = NULL;
+ desc.mode = S_IFREG | S_IWUSR;
+ dentry = nm_add_file(single_sb, single_sb->s_root, &desc, ino);
+ if (IS_ERR(dentry)) {
+ printk("bad dentry\n");
+ goto leave;
+ }
+ inode = igrab(dentry->d_inode);
+ if (!inode) {
+ printk("igrab failed!\n");
+ goto leave;
+ }
+
+ n = kmalloc(sizeof(nm_node_inode_private), GFP_KERNEL);
+ if (!n) {
+ printk("could not kmalloc\n");
+ goto leave;
+ }
+ memcpy(&n->node, &data->arg_u.node, sizeof(nm_node_info));
+ INIT_LIST_HEAD(&n->ip_hash);
+ n->net.sock = NULL;
+ INIT_LIST_HEAD(&n->net.list);
+ spin_lock_init(&n->net.sock_lock);
+ n->net.flags = 0;
+
+ /* hash on first ip address */
+ spin_lock(&nm_ip_hash_lock);
+ bucket = hash_long(n->node.ifaces[0].addr_u.ip_addr4, NM_HASH_BITS);
+ list_add_tail(&n->ip_hash, &nm_ip_hash[bucket]);
+ spin_unlock(&nm_ip_hash_lock);
+ printk("hashed ip %d.%d.%d.%d to bucket %d\n", NIPQUAD(n->node.ifaces[0].addr_u.ip_addr4), bucket);
+ n->inode = inode;
+ inode->u.generic_ip = n;
+
+ ret = sprintf(buf, "0: node %u (%s) added", node_num, n->node.node_name);
+ nm_do_callbacks(NM_NODE_ADD_CB, inode, NULL, node_num);
+
+leave:
+ if (ret < 0) {
+ if (inode) {
+ if (inode->u.generic_ip)
+ kfree(inode->u.generic_ip);
+ iput(inode);
+ }
+ if (dentry)
+ dput(dentry);
+ }
+ return ret;
+}
+
+int nm_make_group_ready(struct inode *group)
+{
+ nm_group_inode_private *g = group->u.generic_ip;
+ if (!g)
+ return -EINVAL;
+ g->state = NM_GROUP_READY;
+ return 0;
+}
+
+int nm_add_node_to_group(char *buf, nm_op *data)
+{
+ struct tree_descr desc;
+ struct inode *inode = NULL;
+ struct dentry *dentry = NULL, *child = NULL;
+ nm_group_inode_private *g = NULL;
+ int group_num, slot_num;
+ int ret = -EINVAL;
+ u16 ino;
+ char tmpname[6];
+
+ printk("add node to group...\n");
+
+ group_num = data->arg_u.gc.group_num;
+ ino = data->arg_u.gc.node_num;
+ slot_num = data->arg_u.gc.slot_num;
+
+ /* request a certain slot, or NM_INVALID_SLOT_NUM for any slot */
+ if (slot_num > NM_INVALID_SLOT_NUM)
+ goto leave;
+
+ if (ino >= NM_INVALID_SLOT_NUM || group_num >= NM_INVALID_SLOT_NUM)
+ goto leave;
+
+ inode = __nm_get_group_by_num(group_num);
+ if (!inode)
+ goto leave;
+ if (list_empty(&inode->i_dentry))
+ goto leave;
+ dentry = dget(list_entry(inode->i_dentry.next, struct dentry, d_alias));
+ if (!dentry)
+ goto leave;
+ g = inode->u.generic_ip;
+ if (!g)
+ goto leave;
+
+ if (g->state == NM_GROUP_NOT_READY) {
+ ret = sprintf(buf, "%d: group disk has not been discovered. cannot add nodes.", -EROFS);
+ goto leave;
+ }
+
+ spin_lock(&g->bitmap_lock);
+ slot_num = nm_find_next_slot(&(g->slot_bitmap[0]), 255, slot_num);
+ spin_unlock(&g->bitmap_lock);
+ if (slot_num < 0)
+ goto leave;
+
+ /* create hardlink to ino with name "slot_num" */
+ sprintf(tmpname, "%03u", slot_num);
+ desc.name = &(tmpname[0]);
+ desc.ops = NULL;
+ desc.mode = 0;
+ child = nm_add_link(single_sb, dentry, &desc,
+ NM_NODE_INODE_START+ino);
+ if (IS_ERR(child)) {
+ printk("error adding link for %s\n", tmpname);
+ child = NULL;
+ goto leave;
+ }
+
+ ret = sprintf(buf, "0: node %u added to group: %*s",
+ ino, dentry->d_name.len, dentry->d_name.name);
+
+ if (!igrab(child->d_inode))
+ goto leave;
+ nm_do_callbacks(NM_GROUP_NODE_ADD_CB, inode, child->d_inode, slot_num);
+ iput(child->d_inode);
+
+leave:
+ if (dentry)
+ dput(dentry);
+ if (child)
+ dput(child);
+ if (inode)
+ iput(inode);
+ return ret;
+}
+
+
+int nm_remove_node_from_group(struct inode *group, struct inode *node)
+{
+ struct dentry *child = NULL;
+ nm_group_inode_private *g = NULL;
+ int slot_num;
+ int ret = -EINVAL;
+
+ printk("remove node from group...\n");
+
+ slot_num = nm_get_group_index(group, node, &child);
+
+ if (slot_num == NM_MAX_NODES || !child)
+ goto leave;
+
+ g = group->u.generic_ip;
+ if (!g)
+ goto leave;
+
+ printk("killing the dentry now!!\n");
+ down(&group->i_zombie);
+ node->i_nlink--;
+ d_delete(child);
+ up(&group->i_zombie);
+ printk("done killing the dentry!!\n");
+
+
+ if (!igrab(node))
+ goto leave;
+ nm_do_callbacks(NM_GROUP_NODE_DEL_CB, group, node, slot_num);
+ iput(node);
+
+ spin_lock(&g->bitmap_lock);
+ clear_bit(slot_num, (void *)(&g->slot_bitmap[0]));
+ spin_unlock(&g->bitmap_lock);
+
+ ret = 0;
+
+leave:
+ if (child)
+ dput(child);
+ return ret;
+}
+
+
+
+int nm_name_cluster(char *buf, nm_op *data)
+{
+ int ret = -EINVAL;
+
+ printk("name cluster...\n");
+ spin_lock(&nm_lock);
+ if (cluster.state == NM_CLUSTER_UP) {
+ ret = sprintf(buf, "%d: cluster name could not be set. cluster already up.", -EINVAL);
+ goto leave;
+ }
+ memset(cluster.name, 0, NM_MAX_NAME_LEN+1);
+ memcpy(cluster.name, data->arg_u.name, NM_MAX_NAME_LEN);
+ ret = sprintf(buf, "0: cluster name set: %s", cluster.name);
+leave:
+ spin_unlock(&nm_lock);
+ return ret;
+}
+
+int nm_destroy_cluster(char *buf)
+{
+ int ret;
+ printk("destroy cluster...\n");
+
+ /* TODO */
+ spin_lock(&nm_lock);
+ nm_init_cluster(&cluster);
+ ret = sprintf(buf, "0: rudely destroyed cluster!!!");
+ spin_unlock(&nm_lock);
+ return ret;
+}
+
+int nm_get_cluster_num_nodes(char *buf)
+{
+ int num_nodes=0, i;
+
+ printk("get cluster num nodes...\n");
+
+ spin_lock(&cluster.bitmap_lock);
+ for (i=0; i<8; i++)
+ num_nodes += hweight32(cluster.node_bitmap[i]);
+ spin_unlock(&cluster.bitmap_lock);
+
+ return sprintf(buf, "0: %d", num_nodes);
+}
+
+int nm_get_cluster_num_groups(char *buf)
+{
+ int num_groups=0, i;
+
+ printk("get cluster num groups...\n");
+
+ spin_lock(&cluster.bitmap_lock);
+ for (i=0; i<8; i++)
+ num_groups += hweight32(cluster.group_bitmap[i]);
+ spin_unlock(&cluster.bitmap_lock);
+
+ return sprintf(buf, "0: %d", num_groups);
+}
+
+int nm_get_group_num_nodes(struct inode *group)
+{
+ int num_nodes=0, i;
+ nm_group_inode_private *g;
+
+ printk("get group num nodes...\n");
+
+ g = group->u.generic_ip;
+ if (!g)
+ return -EINVAL;
+
+ spin_lock(&g->bitmap_lock);
+ for (i=0; i<8; i++)
+ num_nodes += hweight32(g->slot_bitmap[i]);
+ spin_unlock(&g->bitmap_lock);
+
+ return num_nodes;
+}
+
+int nm_get_group_max_slots(struct inode *group)
+{
+ int last=0, i;
+ nm_group_inode_private *g;
+
+ printk("get group num nodes...\n");
+
+ g = group->u.generic_ip;
+ if (!g)
+ return -EINVAL;
+
+#warning need to change this for 64 bit
+ spin_lock(&g->bitmap_lock);
+ for (i=7; i>=0; i--) {
+ if (g->slot_bitmap[i]) {
+ last = fls(g->slot_bitmap[i]);
+ last += (i * sizeof(g->slot_bitmap[i]));
+ break;
+ }
+ }
+ spin_unlock(&g->bitmap_lock);
+
+ return last;
+}
+
+void * nm_iterate_group_disk_slots(struct inode *group, int *idx)
+{
+ nm_group_inode_private *priv;
+ int next;
+
+ if (*idx >= 255)
+ return NULL;
+ priv = group->u.generic_ip;
+ if (!priv)
+ return NULL;
+ next = find_next_bit(priv->slot_bitmap, 255, *idx);
+ if (next >= 255)
+ return NULL;
+ *idx = next;
+ return util_rarray_idx_to_slot(&priv->disk.slots, next);
+}
+
+int nm_get_node_info(char *buf, nm_op *data)
+{
+ int ret, tmpret, i;
+ nm_node_inode_private *priv;
+ nm_network_iface *n;
+ struct inode *inode = NULL;
+ struct dentry *dentry;
+ u16 node_num;
+ u16 vers;
+
+ ret = -EINVAL;
+ node_num = data->arg_u.index;
+ inode = __nm_get_node_by_num(node_num);
+ if (inode) {
+ dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
+ priv = inode->u.generic_ip;
+ ret = sprintf(buf, "0: global_index=%u\n"
+ "name=%*s\n",
+ priv->node.node_num, dentry->d_name.len,
+ dentry->d_name.name);
+ buf += ret;
+ for (i=0; i<NM_MAX_IFACES; i++) {
+ n = &priv->node.ifaces[i];
+ vers = ntohs(n->ip_version);
+ printk("ip_version=%u, vers=%u\n", n->ip_version, vers);
+ if (vers!=4 && vers!=6)
+ continue;
+ /* TODO: how to print ipv6? */
+ tmpret = sprintf(buf, "iface%d.port=%u\n"
+ "iface%d.version=%d\n"
+ "iface%d.addr=%d.%d.%d.%d\n",
+ i, ntohs(n->ip_port), i, vers, i,
+ NIPQUAD(n->addr_u.ip_addr4));
+ buf += tmpret;
+ ret += tmpret;
+ }
+ iput(inode);
+ }
+ return ret;
+}
+
+int nm_get_group_info(char *buf, nm_op *data)
+{
+ int ret, tmpret;
+ nm_group_inode_private *g = NULL;
+ struct inode *inode = NULL;
+ u16 group_num;
+ struct dentry *dentry, *child;
+
+ ret = -EINVAL;
+ group_num = data->arg_u.index;
+ inode = __nm_get_group_by_num(group_num);
+ if (inode) {
+ g = inode->u.generic_ip;
+ dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
+ ret = sprintf(buf, "0: group_num=%u\n"
+ "name=%*s\n"
+ "disk_uuid=%s\n",
+ group_num, dentry->d_name.len,
+ dentry->d_name.name, g->disk.uuid);
+ buf += ret;
+
+ spin_lock(&dcache_lock);
+ list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+ tmpret = sprintf(buf, "%*s\n", child->d_name.len,
+ child->d_name.name);
+ buf += tmpret;
+ ret += tmpret;
+ }
+ spin_unlock(&dcache_lock);
+ iput(inode);
+ }
+ return ret;
+}
+
+
+
+static ssize_t write_cluster(struct file *file, char *buf, size_t size)
+{
+ nm_op *data;
+ int ret;
+ u16 me;
+
+ printk("write_cluster\n");
+
+ if (size < sizeof(*data))
+ return -EINVAL;
+ data = (nm_op *) buf;
+ if (data->magic != NM_OP_MAGIC)
+ return -EINVAL;
+
+ switch (data->opcode) {
+ case NM_OP_CREATE_CLUSTER:
+ ret = nm_create_cluster(buf);
+ break;
+ case NM_OP_CREATE_GROUP:
+ ret = nm_create_group(buf, data);
+ break;
+ case NM_OP_NAME_CLUSTER:
+ ret = nm_name_cluster(buf, data);
+ break;
+ case NM_OP_DESTROY_CLUSTER:
+ ret = nm_destroy_cluster(buf);
+ break;
+ case NM_OP_ADD_CLUSTER_NODE:
+ ret = nm_create_node(buf, data);
+ break;
+ case NM_OP_GET_CLUSTER_NUM_NODES:
+ ret = nm_get_cluster_num_nodes(buf);
+ break;
+ case NM_OP_GET_GLOBAL_NODE_NUM:
+ ret = 0;
+ me = nm_this_node(NULL);
+ if (me >= NM_MAX_NODES)
+ ret = -EINVAL;
+ ret = sprintf(buf, "%d: %u", ret, me);
+ break;
+ default:
+ ret = sprintf(buf, "%d: bad opcode: %u", -EINVAL, data->opcode);
+ break;
+ }
+ printk("leaving!\n");
+ return ret;
+}
+
+static ssize_t write_node(struct file *file, char *buf, size_t size)
+{
+ nm_op *data;
+ int ret;
+
+ printk("write_node\n");
+
+ if (size < sizeof(*data))
+ return -EINVAL;
+ data = (nm_op *) buf;
+ if (data->magic != NM_OP_MAGIC)
+ return -EINVAL;
+
+ switch (data->opcode) {
+ case NM_OP_GET_NODE_INFO:
+ ret = nm_get_node_info(buf, data);
+ break;
+ default:
+ ret = sprintf(buf, "%d: bad opcode: %u", -EINVAL, data->opcode);
+ break;
+ }
+ printk("leaving!\n");
+ return ret;
+}
+
+static ssize_t write_group(struct file *file, char *buf, size_t size)
+{
+ nm_op *data;
+ int ret;
+
+ printk("write_group\n");
+
+ if (size < sizeof(*data))
+ return -EINVAL;
+ data = (nm_op *) buf;
+ if (data->magic != NM_OP_MAGIC)
+ return -EINVAL;
+
+ printk("opcode is %u, add_group is %u\n", data->opcode, NM_OP_ADD_GROUP_NODE);
+ switch (data->opcode) {
+ case NM_OP_GET_GROUP_INFO:
+ ret = nm_get_group_info(buf, data);
+ break;
+
+ case NM_OP_ADD_GROUP_NODE:
+ ret = nm_add_node_to_group(buf, data);
+ break;
+
+ default:
+ ret = sprintf(buf, "%d: bad opcode: %u", -EINVAL, data->opcode);
+ break;
+ }
+ printk("leaving!\n");
+ return ret;
+}
+
+
+
+static struct inode * __nm_get_group_by_num(u16 group_num)
+{
+ struct inode *inode = iget(single_sb, group_num + NM_GROUP_INODE_START);
+ if (!inode)
+ return NULL;
+ if (!inode->u.generic_ip) {
+ iput(inode);
+ return NULL;
+ }
+ return inode;
+}
+
+static struct inode * __nm_get_node_by_num(u16 node_num)
+{
+ struct inode *inode = iget(single_sb, node_num + NM_NODE_INODE_START);
+ if (!inode)
+ return NULL;
+ if (!inode->u.generic_ip) {
+ iput(inode);
+ return NULL;
+ }
+ return inode;
+}
+
+/* ipv4 only for now... */
+struct inode * nm_get_node_by_ip(u32 addr)
+{
+ int bucket;
+ struct list_head *iter;
+ nm_node_inode_private *priv;
+ struct inode *ret = NULL;
+
+ bucket = hash_long(addr, NM_HASH_BITS);
+
+ spin_lock(&nm_ip_hash_lock);
+ list_for_each(iter, &nm_ip_hash[bucket]) {
+ priv = list_entry(iter, nm_node_inode_private, ip_hash);
+ if (priv->node.ifaces[0].addr_u.ip_addr4 == addr) {
+ ret = igrab(priv->inode);
+ break;
+ }
+
+ }
+ spin_unlock(&nm_ip_hash_lock);
+ return ret;
+}
+
+
+struct inode * nm_get_group_by_num(u16 group_num)
+{
+ struct inode *inode;
+ spin_lock(&nm_lock);
+ inode = __nm_get_group_by_num(group_num);
+ spin_unlock(&nm_lock);
+ return inode;
+}
+
+nm_cluster * nm_get_cluster(void)
+{
+ return &cluster;
+}
+
+struct inode * nm_get_node_by_num(u16 node_num)
+{
+ struct inode *inode;
+ spin_lock(&nm_lock);
+ inode = __nm_get_node_by_num(node_num);
+ spin_unlock(&nm_lock);
+ return inode;
+}
+
+struct inode * nm_get_group_node_by_index(struct inode *group, u16 index)
+{
+ struct dentry *dentry = NULL, *parent;
+ struct inode *inode = NULL;
+ char tmpname[6];
+
+ if (list_empty(&group->i_dentry))
+ return NULL;
+ parent = dget(list_entry(group->i_dentry.next, struct dentry, d_alias));
+ if (!parent)
+ return NULL;
+
+ sprintf(tmpname, "%03u", index);
+ dentry = lookup_one_len(tmpname, parent, strlen(tmpname));
+ if (!IS_ERR(dentry)) {
+ inode = dentry->d_inode;
+ if (inode) {
+ inode = igrab(inode);
+ if (!inode->u.generic_ip || !S_ISREG (inode->i_mode)) {
+ printk("bad inode!\n");
+ iput(inode);
+ inode = NULL;
+ }
+ }
+ if (!inode)
+ dput(dentry);
+ }
+ dput(parent);
+ return inode;
+}
+
+
+struct inode * __nm_get_node_by_name(char *node_name, int dir)
+{
+ struct dentry *dentry = NULL;
+ struct inode *inode = NULL;
+
+ dentry = lookup_one_len(node_name, single_sb->s_root, strlen(node_name));
+ if (!IS_ERR(dentry)) {
+ inode = dentry->d_inode;
+ if (inode) {
+ inode = igrab(inode);
+ if (!inode->u.generic_ip ||
+ (dir && !S_ISDIR (inode->i_mode)) ||
+ (!dir && !S_ISREG (inode->i_mode))) {
+ printk("bad inode!\n");
+ iput(inode);
+ inode = NULL;
+ }
+ }
+ }
+ return inode;
+}
+
+
+/*
+ * if group is NULL: return the global index for this node
+ * if group is non NULL: return the index within the group of this node
+ *
+ * NOTE: currently getting the group index is slow
+ * will need to change this somehow
+ */
+u16 nm_this_node(struct inode *group)
+{
+ struct inode *inode = NULL;
+ struct dentry *child = NULL;
+ u16 node_num = NM_MAX_NODES;
+
+ inode = nm_get_node_by_name(nm_nodename);
+ if (inode && inode->u.generic_ip) {
+ if (group)
+ node_num = nm_get_group_index(group, inode, &child);
+ else
+ node_num = nm_get_node_global_index(inode);
+
+ }
+ iput(inode);
+ dput(child);
+ //printk("for group=%p, this node is %u\n", group, node_num);
+ return node_num;
+}
+
+/* slow */
+static u16 nm_get_group_index(struct inode *group, struct inode *inode, struct dentry **child)
+{
+ struct dentry *tmp = NULL, *parent = NULL;
+ u16 slot_num = NM_MAX_NODES;
+ struct list_head *iter;
+ char tmpname[6];
+ char *err;
+
+ *child = NULL;
+ parent = NULL;
+ if (list_empty(&group->i_dentry))
+ goto leave;
+ parent = dget(list_entry(group->i_dentry.next, struct dentry, d_alias));
+ if (!parent)
+ goto leave;
+
+ spin_lock(&dcache_lock);
+ list_for_each(iter, &parent->d_subdirs) {
+ tmp = list_entry(iter, struct dentry, d_child);
+ if (tmp->d_inode == inode)
+ break;
+ tmp = NULL;
+ }
+ if (tmp)
+ dget_locked(tmp);
+ spin_unlock(&dcache_lock);
+
+ if (!tmp || tmp->d_name.len > 3)
+ goto leave;
+ strncpy(tmpname, tmp->d_name.name, tmp->d_name.len);
+ tmpname[tmp->d_name.len] = '\0';
+ err=NULL;
+ slot_num = simple_strtoul(tmpname, &err, 10);
+
+ if (*err != '\0')
+ slot_num = NM_MAX_NODES; // error
+ else
+ *child = dget(tmp); // done, get extra ref for child
+
+leave:
+ dput(parent);
+ dput(tmp);
+
+ return slot_num;
+}
+
+int nm_init(dlm_ctxt *dlm)
+{
+ return 0;
+}
+
+int nm_register_callback(int type, void (*func)(void *, void *, u16))
+{
+ nm_callback_func *f;
+
+ if (type < NM_NODE_ADD_CB || type > NM_GROUP_NODE_DEL_CB)
+ return -EINVAL;
+ f = kmalloc(sizeof(nm_callback_func), GFP_KERNEL);
+ if (f == NULL)
+ return -ENOMEM;
+ memset(f, 0, sizeof(nm_callback_func));
+ f->func = func;
+ spin_lock(&nm_cb_lock);
+ list_add_tail(&f->list, &nm_callbacks[type]);
+ spin_unlock(&nm_cb_lock);
+ return 0;
+}
+
+#warning need to change nm callbacks to be like hb callbacks... no locks when calling.
+int nm_unregister_callback(int type, void (*func)(void *, void *, u16))
+{
+ struct list_head *iter, *tmpiter;
+ int ret = -EINVAL;
+ nm_callback_func *f;
+
+ if (type < NM_NODE_ADD_CB || type > NM_GROUP_NODE_DEL_CB)
+ return ret;
+
+ spin_lock(&nm_cb_lock);
+ list_for_each_safe(iter, tmpiter, &nm_callbacks[type]) {
+ f = list_entry (iter, nm_callback_func, list);
+ if (f->func == func) {
+ list_del(&f->list);
+ kfree(f);
+ ret = 0;
+ break;
+ }
+ }
+ spin_unlock(&nm_cb_lock);
+ return ret;
+}
+
+
+
+static void nm_do_callbacks(int type, void *ptr1, void *ptr2, u16 idx)
+{
+ struct list_head *iter;
+ nm_callback_func *f;
+
+ spin_lock(&nm_cb_lock);
+ list_for_each(iter, &nm_callbacks[type]) {
+ f = list_entry (iter, nm_callback_func, list);
+ (f->func) (ptr1, ptr2, idx);
+ }
+ spin_unlock(&nm_cb_lock);
+}
+
+
+static void nm_teardown(void)
+{
+ remove_proc_entry("cluster/nm", NULL);
+ remove_proc_entry("cluster", NULL);
+}
+
+static void nm_init_cluster(nm_cluster *cluster)
+{
+ int i;
+ memset(cluster, 0, sizeof(nm_cluster));
+ cluster->state = NM_CLUSTER_DOWN;
+ spin_lock_init(&cluster->bitmap_lock);
+
+ for (i=NM_NODE_ADD_CB; i<=NM_GROUP_NODE_DEL_CB; i++)
+ INIT_LIST_HEAD(&nm_callbacks[i]);
+}
+
+
+
+
+
+/*----------------------------------------------------------------------------*/
+/*
+ * populating the filesystem.
+ */
+static int nm_fill_super(struct super_block * sb, void * data, int silent)
+{
+ int ret, sz;
+ TA_write_ops *ops;
+ static struct tree_descr nm_files[] = {
+ [NM_Cluster] = {".cluster", &transaction_ops, S_IWUSR},
+ [NM_Node] = {".node", &transaction_ops, S_IWUSR},
+ [NM_Group] = {".group", &transaction_ops, S_IWUSR},
+ /* last one */ {""}
+ };
+
+ sz = sizeof(nm_files) / sizeof(struct tree_descr);
+ ops = kmalloc(sizeof(TA_write_ops) + (sz * sizeof(TA_write_op *)), GFP_KERNEL);
+ if (!ops)
+ return -ENOMEM;
+
+ memset(ops, 0, sizeof(TA_write_ops) + (sz * sizeof(TA_write_op *)));
+ ops->num_ops = sz;
+ ops->write_op[NM_Cluster] = write_cluster;
+ ops->write_op[NM_Node] = write_node;
+ ops->write_op[NM_Group] = write_group;
+
+ single_sb = NULL;
+ printk("calling simple_fill_super...\n");
+ ret = simple_fill_super(sb, 0x98675309, nm_files);
+ if (ret >= 0) {
+ TA_GENERIC_SB_MEMBER(sb) = ops;
+ single_sb = sb;
+ } else {
+ kfree(ops);
+ }
+ return ret;
+}
+
+static struct super_block *nm_read_super (struct super_block *sb, void *data, int silent)
+{
+ printk("welcome to nm_read_super!!!\n");
+ return (nm_fill_super(sb, data, silent) < 0) ? NULL : sb;
+}
+
+
+static DECLARE_FSTYPE (nm_fs_type, "nm", nm_read_super, FS_SINGLE|FS_LITTER);
+
+static int __init init_nm(void)
+{
+ int retval;
+ nm_nodename = kmalloc(strlen(system_utsname.nodename) + 1, GFP_KERNEL);
+ if (nm_nodename==NULL) {
+ printk("could not allocate a few bytes for nodename!\n");
+ return -ENOMEM;
+ }
+ strcpy(nm_nodename, system_utsname.nodename);
+ printk("loading nm module: nodename is %s\n", nm_nodename);
+
+ if (nm_init_ip_hash() < 0) {
+ printk("failed to allocate node IP hash\n");
+ return -ENOMEM;
+ }
+
+ nm_init_cluster(&cluster);
+
+ if (proc_mkdir("cluster", 0)) {
+ if (proc_mkdir("cluster/nm", 0)) {
+ }
+ }
+ printk("calling register_filesystem\n");
+ retval = register_filesystem(&nm_fs_type);
+ printk("done calling register_filesystem: ret=%d\n", retval);
+ if (retval)
+ nm_teardown();
+ return retval;
+}
+
+static void __exit exit_nm(void)
+{
+ nm_teardown();
+ unregister_filesystem(&nm_fs_type);
+ nm_destroy_ip_hash();
+ kfree(nm_nodename);
+ printk("unloading nm module\n");
+}
+
+
+
+
+MODULE_LICENSE("GPL");
+module_init(init_nm)
+module_exit(exit_nm)
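
The comment at the top of nodemanager.c expects services such as heartbeat to
register callbacks for group and node events. A rough sketch of what that looks
like from a consumer, using the nm_register_callback/nm_unregister_callback
pair above (the my_* names are invented for illustration). Per the #warning,
these callbacks -- unlike the hb ones -- are still invoked with nm_cb_lock
held, so the handler must not sleep; the two void pointers are the group and
node inodes that the nm_do_callbacks() call sites pass in (see
nm_add_node_to_group):

    static void my_group_node_add(void *group, void *node, u16 slot_num)
    {
            /* runs under nm_cb_lock -- no sleeping, no re-entering nm */
            struct inode *g = group;

            printk("nm: slot %u joined group ino %lu\n", slot_num, g->i_ino);
    }

    static int my_setup(void)
    {
            return nm_register_callback(NM_GROUP_NODE_ADD_CB, my_group_node_add);
    }

    static void my_teardown(void)
    {
            nm_unregister_callback(NM_GROUP_NODE_ADD_CB, my_group_node_add);
    }
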
Added: branches/dlm-glue/cluster/nodemanager.h
===================================================================
--- branches/dlm-glue/cluster/nodemanager.h 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/nodemanager.h 2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,252 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * nodemanager.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_NODEMANAGER_H
+#define CLUSTER_NODEMANAGER_H
+
+
+
+struct _nm_ctxt
+{
+ int dummy;
+};
+
+#define NM_MAX_IFACES 2
+#define NM_MAX_NODES 255
+#define NM_INVALID_SLOT_NUM 255
+
+/* host name, group name, cluster name all 64 bytes */
+#define NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN
+
+
+#define NM_GROUP_INODE_START 200000
+#define NM_NODE_INODE_START 100000
+
+enum {
+ NM_CLUSTER_DOWN=0,
+ NM_CLUSTER_UP
+};
+
+enum {
+ NM_GROUP_NOT_READY=0,
+ NM_GROUP_READY
+};
+
+enum {
+ NM_Root = 1,
+ NM_Cluster,
+ NM_Node,
+ NM_Group,
+};
+
+
+
+
+typedef struct _nm_network_iface
+{
+ u16 ip_port; /* for simplicity, just define exactly one port for this iface */
+ u16 ip_version;
+ union {
+ u32 ip_addr4; /* IPv4 address in NBO */
+ u32 ip_addr6[4]; /* IPv6 address in NBO */
+ } addr_u;
+} nm_network_iface;
+
+typedef struct _nm_node_info
+{
+ u16 node_num;
+ char node_name[NM_MAX_NAME_LEN+1];
+ nm_network_iface ifaces[NM_MAX_IFACES];
+} nm_node_info;
+
+
+typedef struct _nm_cluster
+{
+ char name[NM_MAX_NAME_LEN+1];
+ int state;
+ spinlock_t bitmap_lock;
+ u32 group_bitmap[8];
+ u32 node_bitmap[8];
+} nm_cluster;
+
+
+typedef struct _nm_group_inode_private
+{
+ struct inode *inode;
+ struct list_head net_list;
+ struct list_head disk_list;
+ cluster_disk disk;
+ int state;
+ spinlock_t bitmap_lock;
+ u32 slot_bitmap[8];
+} nm_group_inode_private;
+
+#ifdef __KERNEL__
+/* TODO: move this */
+#define NET_FLAG_CREATING_SOCKET 0x00000001
+typedef struct _net_inode_private
+{
+ struct socket *sock;
+ wait_queue_t sleep;
+ spinlock_t sock_lock;
+ struct list_head handlers;
+ struct list_head list;
+ int flags;
+} net_inode_private;
+
+typedef struct _nm_node_inode_private
+{
+ struct inode *inode;
+ nm_node_info node;
+ struct list_head ip_hash;
+ net_inode_private net;
+} nm_node_inode_private;
+#endif
+
+/* transaction file nm_op stuff */
+
+#define NM_OP_MAGIC 0xbeaf
+enum {
+ NM_OP_CREATE_CLUSTER=123,
+ NM_OP_DESTROY_CLUSTER,
+ NM_OP_NAME_CLUSTER,
+ NM_OP_ADD_CLUSTER_NODE,
+ NM_OP_GET_CLUSTER_NUM_NODES,
+ NM_OP_GET_NODE_INFO,
+ NM_OP_CREATE_GROUP,
+ NM_OP_GET_GROUP_INFO,
+ NM_OP_ADD_GROUP_NODE,
+ NM_OP_GET_GLOBAL_NODE_NUM
+};
+
+typedef struct _nm_group_change
+{
+ u16 group_num;
+ u16 node_num;
+ u16 slot_num;
+ char disk_uuid[CLUSTER_DISK_UUID_LEN+1];
+ char name[NM_MAX_NAME_LEN+1];
+} nm_group_change;
+
+typedef struct _nm_op
+{
+ u16 magic;
+ u16 opcode;
+ union {
+ u16 index;
+ char name[NM_MAX_NAME_LEN+1];
+ nm_node_info node;
+ nm_group_change gc;
+ } arg_u;
+} nm_op;
+
+
+/* callback stuff */
+
+enum {
+ NM_NODE_ADD_CB = 0,
+ NM_NODE_DEL_CB,
+ NM_GROUP_ADD_CB,
+ NM_GROUP_DEL_CB,
+ NM_GROUP_NODE_ADD_CB,
+ NM_GROUP_NODE_DEL_CB,
+ NM_NUM_CB
+};
+
+typedef void (nm_cb_func)(void *, void *, u16);
+
+typedef struct _nm_callback_func
+{
+ struct list_head list;
+ nm_cb_func *func;
+ //void (*func)(void *, void *, u16);
+} nm_callback_func;
+
+
+
+
+u16 nm_this_node(struct inode *group);
+int nm_init(struct _dlm_ctxt *dlm);
+nm_cluster * nm_get_cluster(void);
+int nm_register_callback(int type, void (*func)(void *, void *, u16));
+int nm_unregister_callback(int type, void (*func)(void *, void *, u16));
+int nm_get_group_num_nodes(struct inode *group);
+int nm_get_group_max_slots(struct inode *group);
+int nm_make_group_ready(struct inode *group);
+void * nm_iterate_group_disk_slots(struct inode *group, int *idx);
+int nm_remove_node_from_group(struct inode *group, struct inode *node);
+int nm_create_group(char *buf, nm_op *data);
+int nm_add_node_to_group(char *buf, nm_op *data);
+
+#ifdef __KERNEL__
+
+
+struct inode * nm_get_group_by_num(u16 group_num);
+struct inode * nm_get_node_by_num(u16 node_num);
+struct inode * __nm_get_node_by_name(char *node_name, int dir);
+struct inode * nm_get_node_by_ip(u32 addr);
+struct inode * nm_get_group_node_by_index(struct inode *group, u16 index);
+
+static inline struct inode * nm_get_node_by_name(char *node_name)
+{
+ return __nm_get_node_by_name(node_name, 0);
+}
+static inline struct inode * nm_get_group_by_name(char *group_name)
+{
+ return __nm_get_node_by_name(group_name, 1);
+}
+
+
+static inline int nm_get_node_global_index(struct inode *node)
+{
+ return (node->i_ino - NM_NODE_INODE_START);
+}
+static inline int nm_get_group_global_index(struct inode *group)
+{
+ return (group->i_ino - NM_GROUP_INODE_START);
+}
+#endif
+
+static inline int nm_valid_ino(int ino)
+{
+#if 0
+ // these should never be referred to in kernel
+ if (ino >= NM_Cluster && ino <= NM_Group)
+ return 1;
+#endif
+ if (ino >= NM_NODE_INODE_START &&
+ ino < NM_NODE_INODE_START + NM_MAX_NODES)
+ return 1;
+ if (ino >= NM_GROUP_INODE_START &&
+ ino < NM_GROUP_INODE_START + NM_MAX_NODES)
+ return 1;
+ return 0;
+}
+
+
+
+#endif /* CLUSTER_NODEMANAGER_H */
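
Putting this header together with the write_cluster() handler in nodemanager.c:
userspace drives nm by writing an nm_op into the transaction files (.cluster,
.node, .group) of the mounted nm filesystem. Below is a hedged sketch of what
an init-script helper might do to bring the cluster up. It assumes nm is
mounted at /proc/cluster/nm as described in the nodemanager.c comment, that a
userspace copy of this header (kernel-only bits trimmed, u16/u32 mapped to the
usual fixed-size equivalents) provides nm_op and the NM_OP_* values, and that
compat_libfs hands the handler's status string back on a subsequent read of
the same fd, nfsctl-style -- it is illustrative, not the actual tool:

    /* hypothetical userspace helper, not part of this commit */
    #include <stdio.h>
    #include <string.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include "nodemanager.h"    /* userspace copy of the header above */

    int main(void)
    {
            nm_op op;
            char reply[256];
            int fd, n;

            fd = open("/proc/cluster/nm/.cluster", O_RDWR);
            if (fd < 0)
                    return 1;

            memset(&op, 0, sizeof(op));
            op.magic = NM_OP_MAGIC;            /* 0xbeaf */
            op.opcode = NM_OP_CREATE_CLUSTER;
            if (write(fd, &op, sizeof(op)) < 0)
                    return 1;

            /* write_cluster() sprintf()s "0: cluster state: UP" (or a
             * negative errno string) into the transaction buffer */
            n = read(fd, reply, sizeof(reply) - 1);
            if (n > 0) {
                    reply[n] = '\0';
                    printf("%s\n", reply);
            }
            close(fd);
            return 0;
    }

Adding nodes and creating groups go through the same .cluster file
(NM_OP_ADD_CLUSTER_NODE filling arg_u.node, NM_OP_CREATE_GROUP filling
arg_u.gc); .group takes NM_OP_ADD_GROUP_NODE and NM_OP_GET_GROUP_INFO, and
.node only answers NM_OP_GET_NODE_INFO.
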
Added: branches/dlm-glue/cluster/tcp.c
===================================================================
--- branches/dlm-glue/cluster/tcp.c 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/tcp.c 2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,1614 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * tcp.c
+ *
+ * tcp network stuff
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+#include "warning_hack.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/proc_fs.h>
+#include <linux/file.h>
+
+#include <asm/uaccess.h>
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+
+/* flip to #if 0 to silence the net debug output */
+#if 1
+#define netprintk(x, arg...) printk("(%d) " x, current->pid, ##arg)
+#define netprintk0(x) printk("(%d) " x, current->pid)
+#else
+#define netprintk(x, arg...)
+#define netprintk0(x)
+#endif
+
+struct socket *recv_sock = NULL;
+static u16 ip_version, ip_port;
+static void *net_junk_buf = NULL;
+static struct inode *net_inode = NULL;
+static u16 net_node_num;
+
+char *gsd_buf = NULL;
+char *gsd_handler_buf = NULL;
+
+
+static spinlock_t net_handler_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t net_list_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t net_status_lock = SPIN_LOCK_UNLOCKED;
+static LIST_HEAD(net_handlers);
+static LIST_HEAD(net_recv_list);
+static LIST_HEAD(net_dispatch_list);
+static LIST_HEAD(net_status_list);
+
+static DECLARE_WAIT_QUEUE_HEAD(net_disp_thread_wait_queue);
+static DECLARE_WAIT_QUEUE_HEAD(net_recv_thread_wait_queue);
+static int net_recv_pid = -1;
+static struct task_struct *net_recv_task = NULL;
+static struct completion net_recv_complete;
+
+
+
+/////////////////////
+static void net_shutdown(void);
+static int net_startup(void);
+static int __init net_driver_entry (void);
+static int net_init_driver(void);
+static void __exit net_driver_exit (void);
+static void net_remove_handlers(void);
+static int net_check_message_valid(net_msg *msg, u32 len);
+static void net_dump_and_close_sock(struct socket *sock, struct inode *inode);
+static void net_dump_msg(struct socket *sock, struct inode *inode);
+static int net_recv_message_header(net_msg *hdr, struct socket *sock);
+static int net_init_tcp_recv_sock(void);
+static int net_receive_thread(void *data);
+static int net_receive(void);
+static int net_accept_tcp_connections(void);
+static void net_release_tcp_sock(void);
+static int net_dispatch_message(struct inode *inode, struct socket *sock, net_msg *hdr, net_msg_handler *hnd);
+static int net_ioctl (struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg);
+
+int gsd_message_action(gsd_message *g);
+int gsd_message_handler(net_msg *msg, u32 len, void *data);
+void gsd_teardown(void);
+int gsd_setup(void);
+
+
+//////////////////////
+
+
+
+
+/* use if already holding net_handler_lock */
+static inline void __net_get_handler(net_msg_handler *nmh)
+{
+ atomic_inc(&nmh->refcnt);
+}
+
+static inline void net_get_handler(net_msg_handler *nmh)
+{
+ spin_lock(&net_handler_lock);
+ __net_get_handler(nmh);
+ spin_unlock(&net_handler_lock);
+}
+
+
+/* use if already holding net_handler_lock */
+static inline void __net_put_handler(net_msg_handler *nmh)
+{
+ atomic_dec(&nmh->refcnt);
+ if (!atomic_read(&nmh->refcnt)) {
+ if (net_handler_in_use(nmh))
+ netprintk0("EEEEK! killing inuse handler! bugbug!\n");
+ kfree(nmh);
+ }
+}
+
+static inline void net_put_handler(net_msg_handler *nmh)
+{
+ if (atomic_dec_and_lock(&nmh->refcnt, &net_handler_lock)) {
+ if (net_handler_in_use(nmh))
+ netprintk0("EEEEK! killing inuse handler! bugbug!\n");
+ kfree(nmh);
+ spin_unlock(&net_handler_lock);
+ }
+}
+
+
+
+DECLARE_MUTEX(net_state_lock);
+u32 net_driver_state = NET_DRIVER_UNINITED;
+u32 net_num_dispatched = 0;
+
+
+/*
+ * net_driver_entry()
+ *
+ * Driver entry point. Called on insmod.
+ */
+static int __init net_driver_entry (void)
+{
+ struct proc_dir_entry *de;
+ de = proc_mkdir("cluster/net", 0);
+ if (!de)
+ return -1;
+ de->proc_fops->ioctl = net_ioctl;
+
+ netprintk0("Loaded net Driver module\n");
+ return 0;
+} /* net_driver_entry */
+
+static int net_ioctl (struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ net_ioc data;
+ gsd_ioc gsd_data;
+ int ret = 0;
+ gsd_message g;
+ int response = 0;
+ struct inode *to = NULL;
+ struct file *file = NULL;
+
+ if (_IOC_TYPE (cmd) != NET_IOC_MAGIC) {
+ ret = -ENOTTY;
+ goto exit_ioctl;
+ }
+
+ switch (cmd) {
+ case NET_IOC_ACTIVATE:
+ memset(&data, 0, sizeof(net_ioc));
+ down(&net_state_lock);
+ data.status = net_driver_state;
+ if (net_driver_state == NET_DRIVER_UNINITED) {
+ ret = net_init_driver();
+ if (ret < 0) {
+ netprintk("error trying to activate net driver: %d\n", ret);
+ data.status = NET_DRIVER_UNINITED;
+ } else {
+ netprintk0("activated net driver!\n");
+ net_driver_state = data.status = NET_DRIVER_READY;
+ }
+ }
+ up(&net_state_lock);
+
+ ret = copy_to_user ((net_ioc *) arg, &data,
+ sizeof (net_ioc));
+ break;
+ case NET_IOC_GETSTATE:
+ memset(&data, 0, sizeof(net_ioc));
+ down(&net_state_lock);
+ data.status = net_driver_state;
+ up(&net_state_lock);
+ ret = copy_to_user ((net_ioc *) arg, &data,
+ sizeof (net_ioc));
+ break;
+
+ case GSD_IOC_CREATE_GROUP:
+ memset(&gsd_data, 0, sizeof(gsd_ioc));
+ ret = copy_from_user(&gsd_data, (gsd_ioc *)arg, sizeof(gsd_ioc));
+
+ file = fget(gsd_data.fd);
+ if (!file || !file->f_dentry || !file->f_dentry->d_inode) {
+ ret = -EINVAL;
+ break;
+ }
+ to = file->f_dentry->d_inode;
+
+ g.action = GSD_ACTION_ADD_GROUP;
+ g.from = net_node_num;
+ g.namelen = gsd_data.namelen;
+ memcpy(g.name, gsd_data.name, gsd_data.namelen);
+
+ if (to == net_inode) {
+ /* create the group locally */
+ ret = gsd_message_action(&g);
+ } else {
+ /* create the group on remote node */
+ ret = net_send_message(GSD_MESSAGE, 0, &g, sizeof(g), to, &response);
+ if (ret == 0)
+ ret = response;
+ }
+
+ memset(&gsd_data, 0, sizeof(gsd_ioc));
+ gsd_data.status = ret;
+ ret = copy_to_user((gsd_ioc *)arg, &gsd_data, sizeof(gsd_ioc));
+ break;
+
+ case GSD_IOC_ADD_GROUP_NODE:
+ memset(&gsd_data, 0, sizeof(gsd_ioc));
+ ret = copy_from_user(&gsd_data, (gsd_ioc *)arg, sizeof(gsd_ioc));
+
+ file = fget(gsd_data.fd);
+ if (!file || !file->f_dentry || !file->f_dentry->d_inode) {
+ ret = -EINVAL;
+ break;
+ }
+ to = file->f_dentry->d_inode;
+
+ g.action = GSD_ACTION_ADD_GROUP_NODE;
+ g.from = net_node_num;
+ g.namelen = gsd_data.namelen;
+ memcpy(g.name, gsd_data.name, gsd_data.namelen);
+
+ if (to == net_inode) {
+ /* create the group locally */
+ ret = gsd_message_action(&g);
+ } else {
+ /* create the group on remote node */
+ ret = net_send_message(GSD_MESSAGE, 0, &g, sizeof(g), to, &response);
+ if (ret == 0)
+ ret = response;
+ }
+ memset(&gsd_data, 0, sizeof(gsd_ioc));
+ gsd_data.status = ret;
+ ret = copy_to_user((gsd_ioc *)arg, &gsd_data, sizeof(gsd_ioc));
+ break;
+ default:
+ ret = -ENOTTY;
+ break;
+ }
+
+exit_ioctl:
+
+ if (file)
+ fput(file);
+
+ return ret;
+} /* net_ioctl */
+
+static int net_init_driver(void)
+{
+ nm_node_info *info;
+ nm_node_inode_private *priv;
+
+ /* get the global node number for this node */
+ net_node_num = nm_this_node(NULL);
+ if (net_node_num >= NM_MAX_NODES) {
+ netprintk0("local nm node number not initialized!\n");
+ return -1;
+ }
+ net_inode = nm_get_node_by_num(net_node_num);
+ if (!net_inode) {
+ netprintk0("local nm node inode not initialized!\n");
+ return -1;
+ }
+ priv = (nm_node_inode_private *)net_inode->u.generic_ip;
+ if (!priv) {
+ iput(net_inode);
+ netprintk0("local nm node info not initialized!\n");
+ return -1;
+ }
+ info = &priv->node;
+ ip_version = info->ifaces[0].ip_version;
+ ip_port = info->ifaces[0].ip_port;
+
+ if (net_startup() < 0)
+ return -1;
+
+ if (gsd_setup() < 0)
+ return -1;
+
+ return 0;
+} /* net_init_driver*/
+
+
+/*
+ * net_driver_exit()
+ *
+ * Called on rmmod
+ */
+static void __exit net_driver_exit (void)
+{
+ down(&net_state_lock);
+ if (net_driver_state == NET_DRIVER_READY) {
+ netprintk0("shutting down network\n");
+ net_shutdown();
+ netprintk0("removing all net driver handlers\n");
+ net_remove_handlers();
+ gsd_teardown();
+ if (net_inode)
+ iput(net_inode);
+ net_driver_state = NET_DRIVER_UNINITED;
+ }
+ up(&net_state_lock);
+ remove_proc_entry("cluster/net", NULL);
+ netprintk0("Unloading net driver module\n");
+ return;
+} /* net_driver_exit */
+
+
+static int net_startup(void)
+{
+ net_recv_pid = -1;
+ net_recv_task = NULL;
+ init_completion (&net_recv_complete);
+
+ net_junk_buf = (void *) __get_free_page(GFP_KERNEL);
+ if (!net_junk_buf)
+ return -ENOMEM;
+
+ netprintk0("starting net receive thread...\n");
+ net_recv_pid = kernel_thread (net_receive_thread, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+ if (net_recv_pid < 0) {
+ netprintk("unable to launch net receive thread, error=%d", net_recv_pid);
+ net_shutdown();
+ return -EINVAL;
+ }
+
+ netprintk0("net thread running...\n");
+ return 0;
+}
+
+static void net_shutdown(void)
+{
+ netprintk ("waiting for net thread to exit....");
+ send_sig (SIGINT, net_recv_task, 0);
+ wait_for_completion (&net_recv_complete);
+ free_page((unsigned long)net_junk_buf);
+ netprintk ("net thread exited\n");
+}
+
+
+static int net_receive_thread(void *data)
+{
+ int status = 0;
+ DECLARE_WAITQUEUE(main_wait, current);
+
+ util_daemonize ("netrecv", strlen("netrecv"), 1);
+ net_recv_task = current;
+
+ status = net_init_tcp_recv_sock();
+ if (status >= 0 && recv_sock) {
+ add_wait_queue_exclusive(recv_sock->sk->sleep, &main_wait);
+ while (1) {
+ status = 0;
+ if (recv_sock->sk->tp_pinfo.af_tcp.accept_queue)
+ status = net_accept_tcp_connections();
+ if (!list_empty(&net_recv_list))
+ status = net_receive();
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(20*HZ);
+ current->state = TASK_RUNNING;
+
+ if (signal_pending(current)) {
+ netprintk0("net recv thread got signal!\n");
+ break;
+ }
+ }
+ remove_wait_queue(recv_sock->sk->sleep, &main_wait);
+ } else {
+ netprintk0("failed to initialize net_thread!\n");
+ }
+
+ /* Flush all scheduled tasks */
+ flush_scheduled_work();
+ net_release_tcp_sock();
+ net_recv_task = NULL;
+ complete (&net_recv_complete);
+ return 0;
+}
+
+typedef union _my_timing_t
+{
+ __u64 q;
+ __u32 lohi[2];
+} my_timing_t;
+
+
+static int net_check_message_valid(net_msg *msg, u32 len)
+{
+ return 1;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+/* for lack of a better place to do this */
+
+int gsd_setup()
+{
+ int ret;
+ gsd_buf = (char *) __get_free_page(GFP_KERNEL);
+ if (!gsd_buf)
+ return -ENOMEM;
+ /* need this stupidity until I can divorce the actual nm actions
+ * from the output they send to their user buffer */
+ gsd_handler_buf = (char *) __get_free_page(GFP_KERNEL);
+ if (!gsd_handler_buf)
+ return -ENOMEM;
+
+ ret = net_register_handler(GSD_MESSAGE, 0, 0, sizeof(gsd_message),
+ gsd_message_handler, NULL, gsd_buf);
+
+ return ret;
+}
+
+void gsd_teardown()
+{
+ free_page((unsigned long)gsd_buf);
+ free_page((unsigned long)gsd_handler_buf);
+}
+
+int gsd_message_handler(net_msg *msg, u32 len, void *data)
+{
+ return gsd_message_action((gsd_message *)msg->buf);
+}
+
+int gsd_message_action(gsd_message *g)
+{
+ int ret;
+ nm_op op;
+ int namelen = g->namelen;
+ struct inode *node=NULL, *group=NULL;
+ char name[NM_MAX_NAME_LEN+1];
+
+ if (namelen > NM_MAX_NAME_LEN)
+ return -EINVAL;
+ strncpy(name, g->name, namelen);
+ name[namelen] = '\0';
+
+ memset(&op, 0, sizeof(op));
+ switch (g->action) {
+ case GSD_ACTION_ADD_GROUP:
+ group = nm_get_group_by_name(name);
+ if (group) {
+ ret = 0;
+ break;
+ }
+ op.arg_u.gc.group_num = NM_INVALID_SLOT_NUM;
+ memcpy(op.arg_u.gc.name, name, namelen);
+ memcpy(op.arg_u.gc.disk_uuid, name, namelen);
+
+ ret = nm_create_group(gsd_handler_buf, &op);
+ if (ret >= 0)
+ ret = 0;
+ break;
+
+ case GSD_ACTION_ADD_GROUP_NODE:
+ group = nm_get_group_by_name(name);
+ if (!group) {
+ ret = -EINVAL;
+ break;
+ }
+ node = nm_get_group_node_by_index(group, g->from);
+ if (node) {
+ ret = 0;
+ if (nm_get_node_global_index(node) != g->from)
+ ret = -EINVAL;
+ break;
+ }
+ op.arg_u.gc.group_num = nm_get_group_global_index(group);
+ op.arg_u.gc.node_num = g->from;
+ op.arg_u.gc.slot_num = g->from;
+ ret = nm_add_node_to_group(gsd_handler_buf, &op);
+ if (ret >= 0)
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ if (node)
+ iput(node);
+ if (group)
+ iput(group);
+ return ret;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+
+int net_register_handler(u32 msg_type, u32 key, int flags, u32 max_len,
+ net_msg_handler_func *func, void *data, void *buf)
+{
+ net_msg_handler *nmh, *found=NULL;
+ u32 packet_len = sizeof(net_msg) + max_len;
+
+ if (packet_len < NET_MIN_MSG_LEN || packet_len > NET_MAX_MSG_LEN) {
+ netprintk("max_len for message handler out of range: %u\n",
+ max_len);
+ return -EINVAL;
+ }
+
+ /* if expecting any message payload, must pass a prealloced buffer */
+ if (!buf && max_len) {
+ netprintk("max_len > 0 (%u), but no buffer supplied!\n",
+ max_len);
+ return -EINVAL;
+ }
+
+ if (!msg_type) {
+ netprintk("no message type provided: %u, %p\n", msg_type, func);
+ return -EINVAL;
+
+ }
+ if (!func) {
+ netprintk("no message handler provided: %u, %p\n",
+ msg_type, func);
+ return -EINVAL;
+ }
+
+ nmh = kmalloc(sizeof(net_msg_handler), GFP_KERNEL);
+ if (!nmh) {
+ return -ENOMEM;
+ }
+ memset(nmh, 0, sizeof(net_msg_handler));
+ nmh->func = func;
+ nmh->data = data;
+ nmh->msg_type = msg_type;
+ nmh->max_len = max_len;
+ nmh->key = key;
+ spin_lock_init(&nmh->lock);
+ atomic_set(&nmh->refcnt, 0);
+ if (max_len == 0) {
+ nmh->buf = &nmh->hdr;
+ } else {
+ nmh->buf = buf;
+ }
+ nmh->flags = flags;
+ INIT_LIST_HEAD(&nmh->list);
+ net_get_handler(nmh);
+
+
+ /* add the new handler, checking for pre-existing */
+ spin_lock(&net_handler_lock);
+ found = net_lookup_handler(msg_type, key);
+ if (!found) {
+ list_add_tail(&nmh->list, &net_handlers);
+ } else {
+ spin_unlock(&net_handler_lock);
+ net_put_handler(found);
+ netprintk("message handler for type %u, key %u already exists!!!\n",
+ msg_type, key);
+ /* this should destroy it */
+ net_put_handler(nmh);
+ return -EEXIST;
+ }
+ spin_unlock(&net_handler_lock);
+ return 0;
+}
+
+
+
+/* net_handler_lock should be held */
+net_msg_handler * net_lookup_handler(u32 msg_type, u32 key)
+{
+ net_msg_handler *ret;
+ struct list_head *iter;
+
+ list_for_each(iter, &net_handlers) {
+ ret = list_entry(iter, net_msg_handler, list);
+ if (ret->msg_type == msg_type && ret->key == key) {
+ __net_get_handler(ret);
+ return ret;
+ }
+ }
+ return NULL;
+}
+
+
+
+net_msg * net_package_message(u32 msg_type, u32 key, void *data, u32 len)
+{
+ net_msg *ret = NULL;
+ net_msg_handler *handler = NULL;
+ u32 packet_len;
+
+ spin_lock(&net_handler_lock);
+ handler = net_lookup_handler(msg_type, key);
+ spin_unlock(&net_handler_lock);
+
+ if (!handler) {
+ netprintk("no such message type: %u/%u\n", msg_type, key);
+ return NULL;
+ }
+ if (net_handler_msg_len_ok(handler, len)) {
+ netprintk("len for message type %u incorrect: %u, should be %u\n",
+ msg_type, len, handler->max_len);
+ goto done;
+ }
+ packet_len = len + sizeof(net_msg);
+ ret = kmalloc(packet_len, GFP_KERNEL);
+ if (!ret) {
+ netprintk("failed to allocate %u bytes for message!\n", packet_len);
+ goto done;
+ }
+ memset(ret, 0, packet_len);
+ ret->magic = NET_MSG_MAGIC;
+ ret->data_len = len;
+ ret->msg_type = msg_type;
+ ret->key = key;
+ if (len > 0)
+ memcpy(&(ret->buf[0]), data, len);
+
+done:
+ if (handler)
+ net_put_handler(handler);
+ return ret;
+}
+
+/* TODO Fix */
+static void net_remove_handlers(void)
+{
+ /* TODO: make an iterator in nm for running over each global inode
+ * do I have this already? then call destroy on each. last put
+ * will do the work. doesnt matter if it's slow. this is only
+ * on shutdown... */
+}
+
+
+
+
+/*
+ * net_recv_tcp_msg()
+ *
+ */
+int net_recv_tcp_msg (struct inode *inode, struct socket *sock, void *data, u32 *packet_len)
+{
+ nm_node_inode_private *priv;
+ nm_node_info *node;
+ int status = -EINVAL, error;
+ mm_segment_t oldfs;
+ struct sockaddr_in sin;
+ struct iovec iov = {
+ .iov_len = *packet_len,
+ .iov_base = data
+ };
+ struct msghdr msg = {
+ .msg_control = NULL,
+ .msg_controllen = 0,
+ .msg_iovlen = 1,
+ .msg_iov = &iov,
+ .msg_name = (struct sockaddr *) &sin,
+ .msg_namelen = sizeof (sin),
+ .msg_flags = 0
+ };
+
+
+ priv = (nm_node_inode_private *)inode->u.generic_ip;
+ node = &priv->node;
+	if (!sock) {
+		spin_lock(&priv->net.sock_lock);
+		/* TODO: sock refcounting... i think we can get/put the sk */
+		sock = priv->net.sock;
+		spin_unlock(&priv->net.sock_lock);
+		if (!sock)
+			return -EINVAL;
+	}
+
+ memset (&sin, 0, sizeof (sin));
+ oldfs = get_fs ();
+ set_fs (get_ds ());
+ error = sock_recvmsg (sock, &msg, *packet_len, msg.msg_flags);
+ set_fs (oldfs);
+
+ status = 0;
+ if (error < 0) {
+ if (error == -ERESTARTSYS) {
+ status = -EBADF;
+ netprintk ("Shutting down\n");
+ } else {
+ status = -EINVAL;
+ netprintk ("unable to recvmsg, error=%d\n", error);
+ }
+ goto bail;
+ } else {
+ *packet_len = iov.iov_len;
+ status = 0;
+ netprintk("woot. recevied len=%d\n", *packet_len);
+ if (!net_check_message_valid(data, *packet_len)) {
+ netprintk0("eeeek bad net message!\n");
+ status = -EINVAL;
+ }
+ }
+
+ //netprintk ("Received packet from: %d.%d.%d.%d\n",
+ // NIPQUAD (sin.sin_addr.s_addr));
+
+bail:
+ return status;
+} /* net_recv_tcp_msg */
+
+
+/*
+ * net_send_tcp_msg()
+ *
+ */
+int net_send_tcp_msg (struct inode *inode, struct socket *sock, void *data, u32 packet_len)
+{
+ int status = 0, error;
+ struct sockaddr_in sin;
+ mm_segment_t oldfs;
+ nm_node_inode_private *priv;
+ nm_node_info *node;
+
+ priv = (nm_node_inode_private *)inode->u.generic_ip;
+ node = &priv->node;
+ if (!sock) {
+ spin_lock(&priv->net.sock_lock);
+ /* TODO: sock refcounting... i think we can get/put the sk */
+ sock = priv->net.sock;
+ spin_unlock(&priv->net.sock_lock);
+ }
+
+ oldfs = get_fs ();
+ netprintk("Sending msg to node=%u, name=%s\n", node->node_num, node->node_name);
+ memset (&sin, 0, sizeof (sin));
+ sin.sin_family = net_ip_version_to_family(node->ifaces[0].ip_version);
+ sin.sin_addr.s_addr = node->ifaces[0].addr_u.ip_addr4;
+ sin.sin_port = node->ifaces[0].ip_port;
+
+
+ status = -EINVAL;
+ if (sock) {
+ struct iovec iov = {
+ .iov_base = data,
+ .iov_len = packet_len
+ };
+ struct msghdr msg = {
+ .msg_iov = &iov,
+ .msg_iovlen = 1,
+ .msg_control = NULL,
+ .msg_controllen = 0,
+ .msg_name = (struct sockaddr *) &sin,
+ .msg_namelen = sizeof (sin),
+ .msg_flags = 0
+ };
+
+ status = 0;
+ set_fs (get_ds ());
+ error = sock_sendmsg (sock, &msg, packet_len);
+ set_fs (oldfs);
+
+ if (error < 0) {
+ netprintk ("unable to sendmsg, error=%d\n", error);
+ status = -EINVAL;
+ }
+ }
+ if (status < 0)
+ netprintk ("bad status: %d\n", status);
+
+	return status;
+} /* net_send_tcp_msg */
+
+static spinlock_t net_msg_num_lock = SPIN_LOCK_UNLOCKED;
+static u64 net_msg_num = 1;
+
+/*
+ * net_send_message
+ *
+ * - this is probably the function you are looking for
+ * - it will package up the message for you, verifying that
+ * the message handler is there and the length is ok,
+ * connect to the other node if there is not already a
+ * socket for it, and optionally wait on a status return
+ * from the other node
+ * - all you need prior to this call is to have inited the
+ * net stuff, to have a valid inode for the node to contact
+ * in nm, and to have registered the message handler
+ */
+int net_send_message(u32 msg_type, u32 key, void *data, u32 len, struct inode *inode, int *status)
+{
+ int ret = 0, tmpret;
+ net_msg *msg = NULL;
+ net_msg_handler *handler = NULL;
+ u32 packet_len;
+ net_status_ctxt nsc;
+ wait_queue_t sleep;
+ nm_node_inode_private *priv = NULL;
+ net_inode_private *net = NULL;
+
+ if (!inode || !inode->u.generic_ip) {
+ netprintk0("bad inode, cannot send message\n");
+ return -EINVAL;
+ }
+ priv = (nm_node_inode_private *)inode->u.generic_ip;
+ net = &priv->net;
+ spin_lock(&net->sock_lock);
+ if (!net->sock) {
+ spin_unlock(&net->sock_lock);
+ ret = net_init_tcp_sock(inode);
+ if (!(ret == 0 || ret == -EEXIST)) {
+ netprintk0("failed to create socket!");
+ return -EINVAL;
+ }
+ }
+ spin_unlock(&net->sock_lock);
+
+
+ spin_lock(&net_handler_lock);
+ handler = net_lookup_handler(msg_type, key);
+ spin_unlock(&net_handler_lock);
+
+ if (!handler) {
+ netprintk("no such message type: %u/%u\n", msg_type, key);
+ return -EINVAL;
+ }
+
+ if (net_handler_msg_len_ok(handler, len)) {
+ netprintk("len for message type %u incorrect: %u, should be %u\n",
+ msg_type, len, handler->max_len);
+ ret = -EINVAL;
+ goto done;
+ }
+ packet_len = len + sizeof(net_msg);
+ msg = kmalloc(packet_len, GFP_KERNEL);
+ if (!msg) {
+ netprintk("failed to allocate %u bytes for message!\n", packet_len);
+ ret = -ENOMEM;
+ goto done;
+ }
+ memset(msg, 0, packet_len);
+ msg->magic = NET_MSG_MAGIC;
+ msg->data_len = len;
+ msg->msg_type = msg_type;
+ msg->key = key;
+	spin_lock(&net_msg_num_lock);
+	msg->msg_num = net_msg_num;
+	if (net_msg_num == NET_MSG_NUM_MAX) {
+		printk("eek! net_msg_num wrapping to 1 now...\n");
+		net_msg_num = 1;
+	} else {
+		/* advance so each message gets a distinct number for status matching */
+		net_msg_num++;
+	}
+	spin_unlock(&net_msg_num_lock);
+ if (len > 0)
+ memcpy(&(msg->buf[0]), data, len);
+
+ /* does the caller want to wait for a simple status? */
+ if (status) {
+ msg->status = 1;
+
+ INIT_LIST_HEAD(&nsc.list);
+ init_waitqueue_head(&nsc.wq);
+ atomic_set(&nsc.woken, 0);
+ nsc.msg_num = msg->msg_num;
+ nsc.status = 0;
+ spin_lock(&net_status_lock);
+ list_add(&nsc.list, &net_status_list);
+ spin_unlock(&net_status_lock);
+
+ init_waitqueue_entry(&sleep, current);
+ spin_lock(&net->sock_lock);
+ if (!net->sock) {
+ spin_unlock(&net->sock_lock);
+ netprintk0("caller wanted status return but socket went away!\n");
+ kfree(msg);
+ return -EINVAL;
+ }
+ add_wait_queue(net->sock->sk->sleep, &sleep);
+ spin_unlock(&net->sock_lock);
+ }
+{
+ union {
+ u64 q;
+ u32 hilo[2];
+ } u1, u2;
+ rdtsc(u1.hilo[0], u1.hilo[1]);
+
+
+ ret = net_send_tcp_msg(inode, NULL, msg, packet_len);
+
+ rdtsc(u2.hilo[0], u2.hilo[1]);
+ netprintk("net_send_tcp_msg took %llu cycles\n", u2.q-u1.q);
+ if (status) {
+ if (ret >= 0) {
+ /* wait on other node's handler */
+ rdtsc(u1.hilo[0], u1.hilo[1]);
+ tmpret = util_wait_atomic_eq(&nsc.wq, &nsc.woken, 1, 0);
+ rdtsc(u2.hilo[0], u2.hilo[1]);
+ netprintk("waiting on status took %llu cycles\n", u2.q-u1.q);
+ *status = nsc.status;
+ netprintk("status return requested, status is %d\n", *status);
+			/* must take the entry off the same queue it was added to above */
+			spin_lock(&net->sock_lock);
+			if (net->sock)
+				remove_wait_queue(net->sock->sk->sleep, &sleep);
+			spin_unlock(&net->sock_lock);
+ } else {
+ netprintk("status return requested, and error returned from net_send_tcp_msg=%d\n", ret);
+ /* return bad status right away */
+ *status = ret;
+ }
+ } else if (ret < 0) {
+ netprintk("no status return requested, but error returned from net_send_tcp_msg=%d\n", ret);
+ }
+}
+
+done:
+ if (handler)
+ net_put_handler(handler);
+ if (msg)
+ kfree(msg);
+ return ret;
+}
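+
+/* Illustrative usage sketch only -- not part of the original commit.  The
+ * message type/key and payload are hypothetical; "node" stands for a valid
+ * nm node inode, and the matching handler is assumed to already be
+ * registered on the remote side:
+ *
+ *	int status = 0, err;
+ *	struct my_payload p = { ... };
+ *
+ *	err = net_send_message(MY_MSG_TYPE, MY_KEY, &p, sizeof(p),
+ *			       node, &status);
+ *	if (err < 0 || status < 0)
+ *		netprintk("send failed: err=%d, remote status=%d\n",
+ *			  err, status);
+ */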
+
+
+
+
+
+/*
+ * net_receive: receive from and dispatch all sockets with data pending
+ */
+static int net_receive(void)
+{
+ struct inode *inode;
+ struct list_head *iter, *tmpiter;
+ nm_node_inode_private *priv;
+ net_inode_private *net;
+ struct socket *sock;
+ struct sock *sk;
+ net_msg hdr;
+ net_msg_handler *hnd = NULL;
+ int err = 0;
+ int tmperr;
+ union {
+ u64 q;
+ u32 hilo[2];
+ } u1, u2, u3, u4, u5, u6;
+
+
+start_over:
+ spin_lock(&net_list_lock);
+ list_for_each_safe(iter, tmpiter, &net_recv_list) {
+ net = list_entry(iter, net_inode_private, list);
+ priv = container_of(net, nm_node_inode_private, net);
+ inode = priv->inode;
+ sock = net->sock;
+
+ if (!sock) {
+ //netprintk0("no socket yet....\n");
+ continue;
+ }
+
+ if (sock->sk->state != TCP_ESTABLISHED &&
+ sock->sk->state != TCP_CLOSE_WAIT) {
+ netprintk0("kill it and continue\n");
+ net_dump_and_close_sock(sock, inode);
+ continue;
+ }
+
+ sk = sock->sk;
+ if (skb_queue_empty(&sk->receive_queue)) {
+ //netprintk("queue empty for %lu\n", inode->i_ino);
+ continue;
+ }
+
+
+
+ list_del(&net->list);
+ spin_unlock(&net_list_lock);
+
+ memset(&hdr, 0, sizeof(net_msg));
+ err = net_recv_message_header(&hdr, sock);
+ if (err < 0) {
+ netprintk0("failed to receive message!\n");
+ goto error;
+ }
+ netprintk("received message header... magic=%u type=%u key=%u\n",
+ hdr.magic, hdr.msg_type, hdr.key);
+
+ if (hdr.magic == NET_MSG_STATUS_MAGIC) {
+rdtsc(u1.hilo[0], u1.hilo[1]);
+ net_dump_msg(sock, inode);
+ /* special type for returning message status */
+rdtsc(u2.hilo[0], u2.hilo[1]);
+ net_do_status_return(hdr.msg_num, hdr.status);
+rdtsc(u3.hilo[0], u3.hilo[1]);
+printk("status return: net_dump_msg took %llu, net_do_status_return took %llu\n", u2.q-u1.q, u3.q-u2.q);
+ err = 0;
+ goto error;
+ } else if (hdr.magic != NET_MSG_MAGIC) {
+ netprintk("bad magic: %u\n", hdr.magic);
+ goto error;
+ }
+
+ if (net_is_valid_error_type(hdr.msg_type)) {
+ /* do error handling */
+ netprintk("this is a standard error message: type=%d\n", hdr.msg_type);
+ if (hdr.msg_type == NET_ALREADY_CONNECTED) {
+ netprintk0("error: there is already a socket for this connection\n");
+ } else if (hdr.msg_type == NET_UNKNOWN_HOST) {
+ netprintk0("error: unknown host\n");
+ }
+ net_dump_msg(sock, inode);
+ err = 0;
+ goto error;
+ }
+
+ /* find a handler for it */
+ spin_lock(&net_handler_lock);
+ hnd = net_lookup_handler(hdr.msg_type, hdr.key);
+ spin_unlock(&net_handler_lock);
+
+ if (!hnd) {
+ err = -EINVAL;
+ netprintk0("no handler for message.\n");
+ goto error;
+ }
+rdtsc(u1.hilo[0], u1.hilo[1]);
+ err = net_dispatch_message(inode, sock, &hdr, hnd);
+rdtsc(u2.hilo[0], u2.hilo[1]);
+printk("net_dispatch_message took %llu\n", u2.q-u1.q);
+
+ /* if node has requested status return, do it now */
+ if (hdr.status) {
+#ifdef BIG_NET_MSG
+ u16 n = hdr.src_node;
+ hdr.src_node = hdr.dst_node;
+ hdr.dst_node = n;
+#endif
+ hdr.status = err;
+ hdr.magic = NET_MSG_STATUS_MAGIC; // twiddle the magic
+rdtsc(u3.hilo[0], u3.hilo[1]);
+ tmperr = net_send_tcp_msg(inode, sock, &hdr, sizeof(net_msg));
+rdtsc(u4.hilo[0], u4.hilo[1]);
+printk("status return (net_send_tcp_msg) took %llu\n", u4.q-u3.q);
+ } else if (err < 0) {
+ netprintk("dispatch (%u/%u) returned %d\n",
+ hdr.msg_type, hdr.key, err);
+ }
+
+
+ net_put_handler(hnd);
+
+ // re-add this socket
+ spin_lock(&net_list_lock);
+ list_add_tail(&net->list, &net_recv_list);
+ spin_unlock(&net_list_lock);
+ goto start_over;
+
+error:
+ if (err < 0) {
+ if (net_link_down(err, sock)) {
+ // do NOT re-add this socket
+ netprintk("link down! err=%d\n", err);
+ net_dump_and_close_sock(sock, inode);
+ } else {
+ netprintk("bad message... node=%lu.\n", inode->i_ino);
+ net_dump_msg(sock, inode);
+ // re-add this socket
+ spin_lock(&net_list_lock);
+ list_add_tail(&net->list, &net_recv_list);
+ spin_unlock(&net_list_lock);
+ }
+ } else {
+ // re-add this socket
+ spin_lock(&net_list_lock);
+ list_add_tail(&net->list, &net_recv_list);
+ spin_unlock(&net_list_lock);
+ }
+ goto start_over;
+ }
+ spin_unlock(&net_list_lock);
+
+ return 0;
+}
+
+
+void net_do_status_return(u64 msg_num, s32 status)
+{
+ net_status_ctxt *nsc;
+ struct list_head *iter;
+
+ spin_lock(&net_status_lock);
+ list_for_each(iter, &net_status_list) {
+ nsc = list_entry(iter, net_status_ctxt, list);
+ if (nsc->msg_num == msg_num) {
+ nsc->status = status;
+ atomic_set(&nsc->woken, 1);
+ list_del(&nsc->list);
+ spin_unlock(&net_status_lock);
+ wake_up(&nsc->wq);
+ return;
+ }
+ }
+ spin_unlock(&net_status_lock);
+}
+
+static int net_dispatch_message(struct inode *inode, struct socket *sock, net_msg *hdr, net_msg_handler *hnd)
+{
+ int ret = -EINVAL;
+	u32 len, packet_len;
+
+ len = hdr->data_len;
+ packet_len = len + sizeof(net_msg);
+
+ spin_lock(&hnd->lock);
+ if (net_handler_in_use(hnd)) {
+ netprintk0("EEEEEK! handler in use! bugbug\n");
+ spin_unlock(&hnd->lock);
+ return -EINVAL;
+ }
+ if (len > hnd->max_len) {
+ netprintk("eek! advertised message data len is too large %u (max: %u)\n",
+ len, hnd->max_len);
+ spin_unlock(&hnd->lock);
+ return -EINVAL;
+ }
+ hnd->flags |= (1 << NET_HND_IN_USE);
+ spin_unlock(&hnd->lock);
+
+ memset(hnd->buf, 0, packet_len);
+ ret = net_recv_tcp_msg(inode, sock, hnd->buf, &packet_len);
+ if (ret < 0) {
+ netprintk("net_recv_tcp_msg returned: %d\n", ret);
+ } else {
+ net_num_dispatched++;
+ ret = (hnd->func)((net_msg *)hnd->buf, packet_len, hnd->data);
+ }
+
+ spin_lock(&hnd->lock);
+ hnd->flags &= ~(1 << NET_HND_IN_USE);
+ spin_unlock(&hnd->lock);
+
+ return ret;
+}
+
+
+
+/*
+ * net_accept_tcp_connections()
+ *
+ */
+static int net_accept_tcp_connections(void)
+{
+ int error, slen;
+ struct sockaddr_in sin;
+ struct socket *sock;
+ struct inode *inode;
+
+ if (!recv_sock) {
+ netprintk0("no socket!\n");
+ return 0;
+ }
+
+ if (!recv_sock->sk->tp_pinfo.af_tcp.accept_queue) {
+ //netprintk0("no connections on the queue\n");
+ return 0;
+ }
+ error = 0;
+ while (error >= 0) {
+ sock = sock_alloc();
+ if (!sock)
+ break;
+
+ sock->type = recv_sock->type;
+ sock->ops = recv_sock->ops;
+ error = recv_sock->ops->accept(recv_sock, sock, O_NONBLOCK);
+ if (error < 0) {
+ sock_release(sock);
+ break;
+ }
+ if (sock->sk->state == TCP_CLOSE) {
+ sock_release(sock);
+ continue;
+ }
+
+ slen = sizeof(sin);
+ error = sock->ops->getname(sock, (struct sockaddr *) &sin, &slen, 1);
+ if (error < 0)
+ break;
+
+ netprintk("attempt to connect from %u.%u.%u.%u:%04x\n",
+ NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
+
+ inode = nm_get_node_by_ip(sin.sin_addr.s_addr);
+ if (inode) {
+ int exists = 1;
+ nm_node_inode_private *priv = inode->u.generic_ip;
+ net_inode_private *net = NULL;
+
+ if (priv) {
+ net = &priv->net;
+ netprintk("connect from known host: %s\n",
+ priv->node.node_name);
+ if (ntohs(sin.sin_port) >= 1024)
+ netprintk("warning: connect from unprivileged port: %u.%u.%u.%u:%d\n",
+ NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
+ spin_lock(&priv->net.sock_lock);
+ if (!priv->net.sock) {
+ netprintk("new sock, doesnt exist\n");
+ exists = 0;
+ priv->net.sock = sock;
+ if (current != net_recv_task) {
+ netprintk("net_recv_task=%p... maybe i should add THAT instead\n", net_recv_task);
+ if (net_recv_task == NULL)
+ BUG();
+ init_waitqueue_entry(&priv->net.sleep, net_recv_task);
+ } else {
+ netprintk("process %p added to waitqueue\n", current);
+ init_waitqueue_entry(&priv->net.sleep, current);
+ }
+ add_wait_queue(sock->sk->sleep, &(priv->net.sleep));
+ }
+ spin_unlock(&priv->net.sock_lock);
+
+ if (exists) {
+ netprintk0("already a socket for this connection!\n");
+ net_send_error(sock, NET_ALREADY_CONNECTED);
+ net_dump_and_close_sock(sock, inode);
+ } else {
+ spin_lock(&net_list_lock);
+ netprintk("added inode %lu to net_recv_list\n", inode->i_ino);
+ if (list_empty(&net->list))
+ list_add_tail(&net->list, &net_recv_list);
+ spin_unlock(&net_list_lock);
+ }
+ }
+
+ iput(inode);
+ } else {
+ netprintk0("connect from unknown host...\n");
+ net_send_error(sock, NET_UNKNOWN_HOST);
+ net_dump_and_close_sock(sock, inode);
+ }
+ }
+ return error;
+}
+
+
+int net_send_error(struct socket *sock, u32 err_type)
+{
+ struct msghdr msg;
+ mm_segment_t oldfs;
+ struct iovec iov;
+ int len;
+ static net_msg err;
+
+ if (!net_is_valid_error_type(err_type)) {
+ netprintk("bug! bad error type! %u\n", err_type);
+ return -EINVAL;
+ }
+ memset(&err, 0, sizeof(net_msg));
+ err.magic = NET_MSG_MAGIC;
+ err.msg_type = err_type;
+
+ msg.msg_name = 0;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = MSG_NOSIGNAL;
+ msg.msg_iov->iov_len = (__kernel_size_t)sizeof(net_msg);
+ msg.msg_iov->iov_base = (char*) &err;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ len = sock_sendmsg(sock, &msg, (size_t)(sizeof(net_msg)));
+ set_fs(oldfs);
+
+ return len;
+}
+
+
+static int net_recv_message_header(net_msg *hdr, struct socket *sock)
+{
+ int status;
+ mm_segment_t oldfs;
+ struct iovec iov = {
+ .iov_base = hdr,
+ .iov_len = sizeof(net_msg)
+ };
+ struct msghdr msg = {
+ .msg_iov = &iov,
+ .msg_iovlen = 1,
+ .msg_control = NULL,
+ .msg_controllen = 0,
+ .msg_name = 0, // (struct sockaddr *) &sin,
+ .msg_namelen = 0, // sizeof (sin),
+ .msg_flags = 0
+ };
+
+ status = 0;
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ status = sock_recvmsg(sock, &msg, sizeof(net_msg), MSG_PEEK);
+ set_fs(oldfs);
+
+ if (status < 0) {
+ if (status == -ERESTARTSYS) {
+ status = -EBADF;
+ netprintk ("Shutting down\n");
+ } else {
+			netprintk ("unable to recvmsg, error=%d\n", status);
+			status = -EINVAL;
+ }
+ }
+ // error or bytes received
+ return status;
+}
+
+static void net_dump_and_close_sock(struct socket *sock, struct inode *inode)
+{
+ nm_node_inode_private *priv = NULL;
+
+ net_dump_msg(sock, inode);
+
+ if (sock->sk) {
+ if (inode) {
+ priv = inode->u.generic_ip;
+ if (priv) {
+ spin_lock(&priv->net.sock_lock);
+ remove_wait_queue(sock->sk->sleep, &(priv->net.sleep));
+ priv->net.sock = NULL;
+ spin_unlock(&priv->net.sock_lock);
+ }
+ }
+ }
+ sock_release(sock);
+}
+
+static void net_dump_msg(struct socket *sock, struct inode *inode)
+{
+ struct msghdr msg;
+ struct iovec iov;
+ int len;
+ mm_segment_t oldfs;
+
+ if (sock->sk) {
+ len = 1;
+ while (len>0)
+ {
+ msg.msg_name = 0;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = MSG_DONTWAIT;
+ msg.msg_iov->iov_base = net_junk_buf;
+ msg.msg_iov->iov_len = (__kernel_size_t)PAGE_SIZE;
+ len = 0;
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ len = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT);
+ set_fs(oldfs);
+ }
+ }
+}
+
+
+int net_init_tcp_sock(struct inode *inode)
+{
+ nm_node_inode_private *priv;
+ nm_node_info *node;
+ net_inode_private *net = NULL;
+ struct sockaddr_in myaddr, remoteaddr;
+ int err = -EINVAL;
+ int i;
+ struct sock *sk;
+ struct socket *sock = NULL;
+
+ priv = inode->u.generic_ip;
+ if (!priv) {
+ netprintk0("bad inode\n");
+ return -EINVAL;
+ }
+ net = &priv->net;
+ node = &priv->node;
+
+ if ((err = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
+ netprintk("can't create socket: err=%d\n", err);
+ return err;
+ }
+
+ spin_lock(&net->sock_lock);
+ if (net->sock || net->flags & NET_FLAG_CREATING_SOCKET) {
+ netprintk("socket already created or creating for inode %lu\n", inode->i_ino);
+ spin_unlock(&net->sock_lock);
+ sock_release(sock);
+ return -EEXIST;
+ }
+ net->flags |= NET_FLAG_CREATING_SOCKET;
+ spin_unlock(&net->sock_lock);
+
+ memset(&myaddr, 0, sizeof(myaddr));
+ myaddr.sin_family = AF_INET;
+ myaddr.sin_port = htons(0); // any port
+ err = sock->ops->bind(sock, (struct sockaddr *) &myaddr, sizeof(myaddr));
+
+ memset (&remoteaddr, 0, sizeof (remoteaddr));
+ remoteaddr.sin_family = net_ip_version_to_family(node->ifaces[0].ip_version);
+ remoteaddr.sin_addr.s_addr = node->ifaces[0].addr_u.ip_addr4;
+ remoteaddr.sin_port = node->ifaces[0].ip_port;
+
+ //netprintk("connecting new socket: ip %d.%d.%d.%d, port %d\n", NIPQUAD(remoteaddr.sin_addr.s_addr), remoteaddr.sin_port);
+ err = sock->ops->connect(sock, (struct sockaddr *) &remoteaddr,
+ sizeof(remoteaddr), 0); /* TODO put this back! O_NONBLOCK); */
+ //netprintk("connect status %d\n", err);
+
+ if (err >= 0) {
+ spin_lock(&net->sock_lock);
+ net->sock = sock;
+ net->flags &= ~NET_FLAG_CREATING_SOCKET;
+
+ netprintk0("1) ok this node is actively trying to connect, add to waitqueue\n");
+ if (current != net_recv_task) {
+ netprintk("net_recv_task=%p... maybe i should add THAT instead\n", net_recv_task);
+ if (net_recv_task == NULL)
+ BUG();
+ init_waitqueue_entry(&net->sleep, net_recv_task);
+ } else {
+ netprintk("process %p added to waitqueue\n", current);
+ init_waitqueue_entry(&net->sleep, current);
+ }
+ add_wait_queue(sock->sk->sleep, &net->sleep);
+
+ spin_unlock(&net->sock_lock);
+ goto out;
+ }
+
+ sk = sock->sk;
+ switch (err) {
+ case -EALREADY:
+ case -EINPROGRESS:
+
+ /* TODO: awful awful awful */
+ for (i=0; i<100; i++) {
+ /* Protect against TCP socket state changes */
+ lock_sock(sk);
+ if (sk->state == TCP_ESTABLISHED) {
+ release_sock(sk);
+ netprintk0("woo! connected...\n");
+ err = 0;
+ spin_lock(&net->sock_lock);
+ net->flags &= ~NET_FLAG_CREATING_SOCKET;
+ net->sock = sock;
+
+ netprintk0("2) ok this node is actively trying to connect, add to waitqueue\n");
+ if (current != net_recv_task) {
+ netprintk("net_recv_task=%p... maybe i should add THAT instead\n", net_recv_task);
+ if (net_recv_task == NULL)
+ BUG();
+ init_waitqueue_entry(&net->sleep, net_recv_task);
+ } else {
+ netprintk("process %p added to waitqueue\n", current);
+ init_waitqueue_entry(&net->sleep, current);
+ }
+ add_wait_queue(sock->sk->sleep, &net->sleep);
+
+ spin_unlock(&net->sock_lock);
+ break;
+ } else {
+ netprintk("waiting for connection: pass %d, state %d\n", i, sk->state);
+ /* TODO */
+#if 0
+ task->tk_timeout = RPC_CONNECT_TIMEOUT;
+ /* if the socket is already closing, delay briefly */
+ if ((1<<sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV))
+ task->tk_timeout = RPC_REESTABLISH_TIMEOUT;
+ rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL);
+#endif
+ /* TODO: this is awful... change it later */
+ }
+ release_sock(sk);
+ util_sleep(100);
+ }
+ break;
+ case -ECONNREFUSED:
+ case -ECONNRESET:
+ case -ENOTCONN:
+ netprintk("conn refused, reset or not connected\n");
+ break;
+ default:
+ /* Report myriad other possible returns. If this file
+ * system is soft mounted, just error out, like Solaris. */
+ netprintk("error %d connecting to server\n", err);
+ /* TODO */
+#if 0
+ /* This will prevent anybody else from connecting */
+ rpc_delay(task, RPC_REESTABLISH_TIMEOUT);
+ task->tk_status = status;
+#endif
+ break;
+ }
+
+out:
+ if (err < 0) {
+ if (net) {
+ spin_lock(&net->sock_lock);
+ if (net->sock)
+ netprintk0("wha?! there's a socket there already!!!!\n");
+ net->flags &= ~NET_FLAG_CREATING_SOCKET;
+ spin_unlock(&net->sock_lock);
+ }
+ if (sock)
+ sock_release(sock);
+ } else {
+ /* add this inode to the receive list, if not already */
+ spin_lock(&net_list_lock);
+ if (list_empty(&net->list))
+ list_add_tail(&net->list, &net_recv_list);
+ spin_unlock(&net_list_lock);
+ }
+
+ return err;
+}
+
+
+
+/*
+ * net_init_tcp_recv_sock()
+ *
+ */
+static int net_init_tcp_recv_sock(void)
+{
+ struct sockaddr_in sin;
+ int status = -EINVAL;
+
+ /* Create Receive Socket */
+ status = sock_create(net_ip_version_to_family(ip_version),
+ SOCK_STREAM, IPPROTO_TCP,
+ &recv_sock);
+ if (status < 0) {
+ netprintk ("unable to create socket, error=%d", status);
+ goto bail;
+ }
+
+
+ /* Bind Receive Socket */
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_family = net_ip_version_to_family(ip_version);
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ sin.sin_port = ip_port;
+
+ status = recv_sock->ops->bind(recv_sock,
+ (struct sockaddr *)&sin,
+ sizeof(sin));
+	if (status < 0) {
+		netprintk ("unable to bind socket to port %d, error=%d\n",
+			   ntohs(ip_port), status);
+		goto bail;
+	}
+
+ /* !!! dunno about these... */
+ recv_sock->sk->reuse = 1;
+ status = recv_sock->ops->listen(recv_sock, 64);
+
+bail:
+ return status;
+} /* net_init_tcp_recv_sock */
+
+
+static void net_release_tcp_sock(void)
+{
+ if (recv_sock) {
+ sock_release (recv_sock);
+ recv_sock = NULL;
+ }
+}
+
+
+module_init (net_driver_entry);
+module_exit (net_driver_exit);
Added: branches/dlm-glue/cluster/tcp.h
===================================================================
--- branches/dlm-glue/cluster/tcp.h 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/tcp.h 2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,236 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * tcp.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_TCP_H
+#define CLUSTER_TCP_H
+
+#include <linux/socket.h>
+#ifdef __KERNEL__
+#include <net/sock.h>
+#else
+#include <sys/socket.h>
+#endif
+#include <linux/inet.h>
+#include <linux/in.h>
+
+#include "nodemanager.h"
+
+
+#ifdef __KERNEL__
+
+#define NET_DISP_THREAD_MS 5000 /* TODO */
+#define NET_RECV_THREAD_MS 5000 /* TODO */
+
+#ifdef BIG_NET_MSG
+#define NET_MSG_MAGIC ((u32)0xbc0ffa55)
+#define NET_MSG_STATUS_MAGIC ((u32)0xbc0ffa56)
+#define NET_MSG_NUM_MAX ((u64)0xffffffffffffffffULL)
+typedef struct _net_msg
+{
+ __u32 magic;
+ __u32 data_len;
+ __u16 src_node;
+ __u16 dst_node;
+ __u32 msg_type;
+ __u32 key;
+ __s32 status;
+ __u64 msg_num;
+ __u8 buf[0];
+} net_msg;
+#else
+
+#define NET_MSG_MAGIC ((u16)0xfa55)
+#define NET_MSG_STATUS_MAGIC ((u16)0xfa56)
+#define NET_MSG_NUM_MAX ((u32)0xffffffffUL)
+typedef struct _net_msg
+{
+ __u16 magic;
+ __u16 data_len;
+ __u16 msg_type;
+ __s16 status;
+ __u32 key;
+ __u32 msg_num;
+ __u8 buf[0];
+} net_msg;
+
+#endif
+
+typedef int (net_msg_handler_func)(net_msg *msg, u32 len, void *data);
+
+typedef struct _net_msg_handler
+{
+ struct list_head list;
+ u32 msg_type;
+ u32 key;
+ net_msg_handler_func *func;
+ void *data;
+ net_msg hdr;
+ u32 max_len;
+ void *buf;
+ spinlock_t lock;
+ atomic_t refcnt;
+ int flags;
+} net_msg_handler;
+
+typedef struct _net_status_ctxt
+{
+ struct list_head list;
+ s32 status;
+ u64 msg_num;
+ wait_queue_head_t wq;
+ atomic_t woken;
+} net_status_ctxt;
+
+void net_do_status_return(u64 msg_num, s32 status);
+
+/* no clue for these yet... */
+#define NET_MIN_MSG_LEN (0)
+#define NET_MAX_MSG_LEN (8192)
+
+
+#define NET_ALREADY_CONNECTED 2
+#define NET_UNKNOWN_HOST 3
+
+
+static inline int net_is_valid_error_type(u32 err_type)
+{
+ if (err_type == NET_ALREADY_CONNECTED ||
+ err_type == NET_UNKNOWN_HOST)
+ return 1;
+ return 0;
+}
+
+enum {
+ NET_HND_VAR_LEN = 0,
+ NET_HND_IN_USE,
+};
+
+#define net_handler_variable_len(h) ((h)->flags & (1 << NET_HND_VAR_LEN))
+#define net_handler_in_use(h) ((h)->flags & (1 << NET_HND_IN_USE))
+
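+/* note: despite the name, this returns nonzero when the length is *not*
+ * acceptable for the handler -- callers treat a nonzero return as an error */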
+static inline int net_handler_msg_len_ok(net_msg_handler *handler, u32 len)
+{
+ return (net_handler_variable_len(handler) ?
+ len > handler->max_len : len != handler->max_len);
+}
+
+
+static inline int net_ip_version_to_family(u16 ip_version)
+{
+ printk("ip_version passed: %u, host byteorder: %u\n", ip_version, ntohs(ip_version));
+ return PF_INET;
+ switch (ntohs(ip_version)) {
+ case 4:
+ return PF_INET;
+ case 6:
+ return PF_INET6;
+ default:
+ BUG();
+ }
+
+ return 4;
+}
+
+
+
+/* TODO: figure this out.... */
+static inline int net_link_down(int err, struct socket *sock)
+{
+ if (sock) {
+ if (sock->sk->state != TCP_ESTABLISHED &&
+ sock->sk->state != TCP_CLOSE_WAIT)
+ return 1;
+ }
+
+ if (err >= 0)
+ return 0;
+ switch (err) {
+ /* ????????????????????????? */
+ case -ERESTARTSYS:
+ case -EBADF:
+ /* When the server has died, an ICMP port unreachable
+ * message prompts ECONNREFUSED. */
+ case -ECONNREFUSED:
+ case -ENOTCONN:
+ case -ECONNRESET:
+ case -EPIPE:
+ return 1;
+ }
+ return 0;
+}
+
+enum {
+ NET_DRIVER_UNINITED,
+ NET_DRIVER_READY,
+};
+
+
+int net_register_handler(u32 msg_type, u32 key, int flags,
+ u32 max_len, net_msg_handler_func *func, void *data, void *buf);
+net_msg * net_package_message(u32 msg_type, u32 key, void *data, u32 len);
+int net_recv_tcp_msg (struct inode *inode, struct socket *sock, void *data, u32 *packet_len);
+int net_send_tcp_msg (struct inode *inode, struct socket *sock, void *data, u32 packet_len);
+int net_send_error(struct socket *sock, u32 err_type);
+int net_init_tcp_sock(struct inode *inode);
+int net_send_message(u32 msg_type, u32 key, void *data, u32 len, struct inode *inode, int *status);
+int net_broadcast_message(u32 msg_type, u32 key, void *data, u32 len, struct inode *group);
+net_msg_handler * net_lookup_handler(u32 msg_type, u32 key);
+
+#endif /* __KERNEL__ */
+
+typedef struct _net_ioc
+{
+ u32 status;
+} net_ioc;
+
+typedef struct _gsd_ioc
+{
+ int fd;
+ int namelen;
+ char name[NM_MAX_NAME_LEN+1];
+ int status;
+} gsd_ioc;
+
+#define NET_IOC_MAGIC 'O'
+#define NET_IOC_ACTIVATE _IOR(NET_IOC_MAGIC, 1, net_ioc)
+#define NET_IOC_GETSTATE _IOR(NET_IOC_MAGIC, 2, net_ioc)
+#define GSD_IOC_CREATE_GROUP _IOR(NET_IOC_MAGIC, 3, gsd_ioc)
+#define GSD_IOC_ADD_GROUP_NODE _IOR(NET_IOC_MAGIC, 4, gsd_ioc)
+
+#define GSD_MESSAGE 130
+#define GSD_ACTION_ADD_GROUP (0x01)
+#define GSD_ACTION_ADD_GROUP_NODE (0x02)
+
+typedef struct _gsd_message
+{
+ u16 from;
+ u8 action;
+ u8 namelen;
+ u8 name[NM_MAX_NAME_LEN];
+} gsd_message;
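+
+/* Illustrative only -- not part of the original commit.  A rough guess at
+ * how userspace would drive these ioctls; the file descriptor being ioctl'd
+ * and the exact meaning of the fd field are assumptions, not defined by
+ * this header:
+ *
+ *	gsd_ioc ioc;
+ *	memset(&ioc, 0, sizeof(ioc));
+ *	ioc.namelen = strlen("mygroup");
+ *	memcpy(ioc.name, "mygroup", ioc.namelen);
+ *	if (ioctl(gsd_fd, GSD_IOC_CREATE_GROUP, &ioc) == 0)
+ *		printf("create group status: %d\n", ioc.status);
+ */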
+
+#endif /* CLUSTER_TCP_H */
Added: branches/dlm-glue/cluster/test.c
===================================================================
--- branches/dlm-glue/cluster/test.c 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/test.c 2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,811 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * test.c
+ *
+ * test module
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <net/sock.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/proc_fs.h>
+
+#include <asm/uaccess.h>
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+#include "dlmmod.h"
+
+#include "compat_libfs.h"
+
+#define testprintk(x, arg...) printk("TEST: (%d) " x, current->pid, ##arg)
+#define testprintk0(x) printk("TEST: (%d) " x, current->pid)
+
+
+static ssize_t write_net_register(struct file *file, char *buf, size_t size);
+static ssize_t write_net_send(struct file *file, char *buf, size_t size);
+static ssize_t write_net_get_num(struct file *file, char *buf, size_t size);
+static ssize_t write_dlm_poop(struct file *file, char *buf, size_t size);
+static ssize_t write_dlm_poop2(struct file *file, char *buf, size_t size);
+static ssize_t write_dlm_poop3(struct file *file, char *buf, size_t size);
+static ssize_t write_dlm_register(struct file *file, char *buf, size_t size);
+
+enum {
+ TEST_Root = 1,
+ TEST_NetRegister,
+ TEST_NetSend,
+ TEST_NetGetNum,
+ TEST_DLMPoop,
+ TEST_DLMPoop2,
+ TEST_DLMPoop3,
+ TEST_DLMRegister
+};
+
+extern spinlock_t net_state_lock;
+extern u32 net_driver_state;
+extern struct file_operations transaction_ops;
+extern char *nm_nodename;
+extern u32 net_num_dispatched;
+
+
+static void test_teardown(void);
+
+int test_small_msg_func(net_msg *msg, u32 len, void *data);
+
+static int test_net_send(int arg);
+static int test_net_register(int arg);
+static int test_net_get_num(int arg);
+static int test_dlm_poop(int arg);
+static int test_dlm_poop2(int arg);
+static int test_dlm_poop3(int arg);
+static int test_dlm_register(int arg);
+
+
+
+int test_small_msg_func(net_msg *msg, u32 len, void *data)
+{
+ testprintk("got a message! type=%u, len=%u, data=%d\n", msg->msg_type, len, *(int *)data);
+ return 0;
+}
+
+#define TEST_MSG_TYPE1 87654321
+#define TEST_KEY1 12378534
+
+int test_data1 = 723123123;
+
+static int test_net_register(int arg)
+{
+ int ret;
+ struct inode *dest_inode;
+ u16 dest_node_num = (u16)arg;
+
+ testprintk("running test_net_register: will contact node %u\n", dest_node_num);
+
+ dest_inode = nm_get_node_by_num(dest_node_num);
+ if (!dest_inode) {
+ testprintk("eeek! failed to find node %u\n", dest_node_num);
+ return 0;
+ }
+ {
+ struct dentry *dentry = list_entry(dest_inode->i_dentry.next, struct dentry, d_alias);
+ testprintk("found node %u, name %*s\n", dest_node_num, dentry->d_name.len, dentry->d_name.name);
+ }
+
+ ret = net_register_handler(TEST_MSG_TYPE1, TEST_KEY1, 0, 0,
+ test_small_msg_func, &test_data1, NULL);
+ if (ret < 0) {
+ testprintk0("eek! register failed!\n");
+ return -1;
+ }
+ ret = net_register_handler(TEST_MSG_TYPE1, TEST_KEY1, 0, 0,
+ test_small_msg_func, &test_data1, NULL);
+ if (ret >= 0) {
+ testprintk0("eek! re-register was supposed to fail but didnt!!!\n");
+ return -1;
+ }
+ testprintk0("sweet. re-register failed like it should have.\n");
+
+ testprintk0("creating socket now...\n");
+ ret = net_init_tcp_sock(dest_inode);
+ if (ret < 0) {
+ testprintk0("failed to make socket\n");
+ return -1;
+ }
+ testprintk("net_init_tcp_sock returned %d\n", ret);
+
+ testprintk0("leaving test_net_register!\n");
+ return 0;
+}
+
+
+static int test_net_send(int arg)
+{
+ int ret;
+ struct inode *dest_inode;
+ u16 dest_node_num = (u16)arg;
+
+ testprintk("running test_net_send: will contact node %u\n", dest_node_num);
+
+ dest_inode = nm_get_node_by_num(dest_node_num);
+ if (!dest_inode) {
+ testprintk("eeek! failed to find node %u\n", dest_node_num);
+ return 0;
+ }
+ {
+ struct dentry *dentry = list_entry(dest_inode->i_dentry.next, struct dentry, d_alias);
+ testprintk("found node %u, name %*s\n", dest_node_num, dentry->d_name.len, dentry->d_name.name);
+ }
+
+ testprintk0("packaging message now\n");
+
+ {
+ testprintk0("woo! made a message packet... lets try sending it to ourself...\n");
+ testprintk0("waiting for socket to be created\n");
+ while (1) {
+ printk(".");
+ spin_lock(&net_state_lock);
+ if (net_driver_state == NET_DRIVER_READY) {
+ spin_unlock(&net_state_lock);
+ break;
+ }
+ spin_unlock(&net_state_lock);
+ util_sleep (100);
+ }
+ printk(". done... let's go!\n");
+ ret = net_send_message(TEST_MSG_TYPE1, TEST_KEY1, NULL, 0, dest_inode, NULL);
+ testprintk("sent!!!! ret=%d\n", ret);
+ }
+ testprintk0("leaving test_net_send!\n");
+ return 0;
+
+}
+
+static int test_net_get_num(int arg)
+{
+ testprintk("number of messages dispatched: %u\n", net_num_dispatched);
+ return 0;
+}
+
+void my_ast(void *data);
+void my_bast(void *data, int blocked_type);
+
+dlm_lockstatus lksb1, lksb2;
+wait_queue_head_t convert_wq;
+atomic_t convert_flag;
+
+dlm_ctxt *the_dlm = NULL;
+
+static int test_dlm_poop(int arg)
+{
+ testprintk("calling dlm_dump_dlm(%p)\n", the_dlm);
+ if (the_dlm)
+ dlm_dump_dlm(the_dlm);
+
+#if 0
+ dlm_ctxt *dlm;
+ dlm_status status;
+ void *data1 = &lksb1;
+ void *data2 = &lksb2;
+ int ret;
+
+ memset(&lksb1, 0, sizeof(dlm_lockstatus));
+	memset(&lksb2, 0, sizeof(dlm_lockstatus));
+
+ testprintk0("calling dlm_register_domain...\n");
+ dlm = dlm_register_domain("mylittledomain", "grupo2", 0x6543abcd);
+ testprintk("dlm_register_domain returned %p\n", dlm);
+
+ testprintk0("calling dlmlock...\n");
+ status = dlmlock(dlm, LKM_EXMODE, &lksb1, 0, "lock1", my_ast, data1, my_bast);
+ testprintk("dlmlock returned %d. lksb.status=%d, lock=%p\n", status, lksb1.status, lksb1.lockid);
+
+ testprintk0("calling dlmlock to do a convert...\n");
+ status = dlmlock(dlm, LKM_PRMODE, &lksb1, LKM_CONVERT, "lock1", my_ast, data1, my_bast);
+ testprintk("dlmlock returned %d\n", status);
+
+ init_waitqueue_head (&convert_wq);
+ atomic_set(&convert_flag, 0);
+
+ testprintk0("calling second dlmlock...\n");
+ status = dlmlock(dlm, LKM_EXMODE, &lksb2, 0, "lock1", my_ast, data2, my_bast);
+ testprintk("dlmlock returned %d. lksb.status=%d, lock=%p\n", status, lksb2.status, lksb2.lockid);
+
+ testprintk0("sleeping now!\n");
+ ret = util_wait_atomic_eq(&convert_wq, &convert_flag, 1, 20000);
+ testprintk("wait returned %d\n", ret);
+
+ testprintk0("calling dlmlock to do a convert the blocking lock to NL...\n");
+ status = dlmlock(dlm, LKM_NLMODE, &lksb1, LKM_CONVERT, "lock1", my_ast, data2, my_bast);
+ testprintk("dlmlock returned %d\n", status);
+
+ testprintk0("sleeping\n");
+ util_sleep(10000);
+ testprintk0("DONE!\n");
+#endif
+ return 0;
+}
+
+
+void my_ast(void *data)
+{
+ dlm_lockstatus *l = data;
+ dlm_lock *lock = l->lockid;
+ dlm_lock_resource *res = lock->lockres;
+
+ testprintk("AST!!!: lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n",
+ l, lock, res, res->lockname.len, res->lockname.name, lock->type);
+}
+
+void my_bast(void *data, int blocked_type)
+{
+ dlm_lockstatus *l = data;
+ dlm_lock *lock = l->lockid;
+ dlm_lock_resource *res = lock->lockres;
+
+ testprintk("BAST!!!: blocked=%d, lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n",
+ blocked_type, l, lock, res, res->lockname.len, res->lockname.name, lock->type);
+ atomic_set(&convert_flag, 1);
+ wake_up(&convert_wq);
+}
+
+atomic_t finish;
+
+typedef struct _poo
+{
+ struct task_struct *task;
+ dlm_ctxt *dlm;
+ dlm_lockstatus *lksb;
+ wait_queue_head_t wq;
+ atomic_t ast_flag;
+ atomic_t bast_flag;
+ struct completion complete;
+} poo;
+void my_ast2(void *data);
+void my_bast2(void *data, int blocked_type);
+int test_dlm_thread(void *data);
+atomic_t asts_fired, basts_fired;
+
+typedef union _my_timing_t
+{
+ __u64 q;
+ __u32 lohi[2];
+} my_timing_t;
+
+
+static int test_dlm_poop2(int arg)
+{
+ dlm_ctxt *dlm;
+ dlm_status status;
+ void *data1 = &lksb1;
+ void *data2 = &lksb2;
+ int ret;
+ int pid1, pid2;
+ poo *poo1, *poo2;
+ my_timing_t t1, t2, t3;
+
+ poo1 = kmalloc(sizeof(poo), GFP_KERNEL);
+testprintk("poo1=%p\n", poo1);
+ poo2 = kmalloc(sizeof(poo), GFP_KERNEL);
+testprintk("poo2=%p\n", poo2);
+
+ atomic_set(&finish, 0);
+ atomic_set(&asts_fired, 0);
+ atomic_set(&basts_fired, 0);
+
+ testprintk0("calling dlm_register_domain...\n");
+ dlm = dlm_register_domain("mylittledomain", "grupo2", 0x6543abcd);
+ testprintk("dlm_register_domain returned %p\n", dlm);
+
+ poo1->dlm = dlm;
+ poo2->dlm = dlm;
+ init_completion(&poo1->complete);
+ init_completion(&poo2->complete);
+
+ rdtsc(t1.lohi[0], t1.lohi[1]);
+ pid1 = kernel_thread (test_dlm_thread, poo1, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+ if (pid1 < 0) {
+ printk("unable to launch thread, error=%d", pid1);
+ return -EINVAL;
+ }
+ pid2 = kernel_thread (test_dlm_thread, poo2, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+ if (pid2 < 0) {
+ printk("unable to launch thread, error=%d", pid2);
+ return -EINVAL;
+ }
+ testprintk("dlm threads running for %s...\n", dlm->name);
+ testprintk("poo1->dlm=%p, ->task=%p\n", poo1->dlm, poo1->task);
+ testprintk("poo2->dlm=%p, ->task=%p\n", poo2->dlm, poo2->task);
+ //testprintk("poo1->dlm=%p, ->task=%p\n", poo1->dlm, poo1->task);
+ //testprintk("poo2->dlm=%p, ->task=%p\n", poo2->dlm, poo2->task);
+ //testprintk("sending sigint now...\n");
+ //send_sig (SIGINT, poo1->task, 0);
+ //send_sig (SIGINT, poo2->task, 0);
+ //atomic_set(&finish, 1);
+ while (1) {
+ util_sleep(30000);
+ rdtsc(t3.lohi[0], t3.lohi[1]);
+ testprintk("another 30 sec: asts=%d, basts=%d, diff=%llu\n",
+ atomic_read(&asts_fired), atomic_read(&basts_fired),
+ t3.q - t1.q);
+ if (atomic_read(&finish)==1) {
+ printk("finish set!\n");
+ break;
+ }
+ }
+ wait_for_completion (&poo1->complete);
+ wait_for_completion (&poo2->complete);
+ rdtsc(t2.lohi[0], t2.lohi[1]);
+ kfree(poo1);
+ kfree(poo2);
+ testprintk("leaving! asts=%d, basts=%d, diff=%llu\n", atomic_read(&asts_fired), atomic_read(&basts_fired),
+ t2.q - t1.q);
+ return 0;
+}
+
+
+int test_dlm_thread(void *data)
+{
+ dlm_status status;
+ int ret;
+ dlm_lockstatus *lksb;
+ poo *mypoo = data;
+ dlm_ctxt *dlm = mypoo->dlm;
+
+ testprintk("mypoo=%p, dlm=%p\n", mypoo, dlm);
+ mypoo->task = current;
+ lksb = kmalloc(sizeof(dlm_lockstatus), GFP_KERNEL);
+ memset(lksb, 0, sizeof(dlm_lockstatus));
+
+ mypoo->lksb = lksb;
+ init_waitqueue_head(&mypoo->wq);
+
+ atomic_set(&mypoo->ast_flag, 0);
+ atomic_set(&mypoo->bast_flag, 0);
+
+ testprintk("mypoo=%p, dlm=%p, task=%p\n", mypoo, dlm, mypoo->task);
+
+ testprintk("calling dlmlock(%p, %d, %p, 0, \"lock1\", %p, %p, %p) to create the lock...\n",
+ dlm, LKM_EXMODE, lksb, my_ast2, data, my_bast2);
+ status = dlmlock(dlm, LKM_EXMODE, lksb, 0, "lock1", my_ast2, data, my_bast2);
+ testprintk("dlmlock returned %d. lksb.status=%d, lock=%p\n", status, lksb->status, lksb->lockid);
+
+again:
+ ret = util_wait_atomic_eq(&mypoo->wq, &mypoo->ast_flag, 1, 0);
+ if (ret < 0) {
+ testprintk("1: waiting on ast converting to EX, ret=%d, type=%d, convtype=%d\n",
+ ret, lksb->lockid->type, lksb->lockid->convert_type);
+ if (ret == -EINTR)
+ goto leave;
+ goto again;
+ }
+ atomic_set(&mypoo->ast_flag, 0);
+
+
+
+wait_bast:
+ ret = util_wait_atomic_eq(&mypoo->wq, &mypoo->bast_flag, 1, 0);
+ if (ret < 0) {
+ testprintk("2: waiting on bast after converting to EX, ret=%d, type=%d, convtype=%d\n",
+ ret, lksb->lockid->type, lksb->lockid->convert_type);
+ if (ret == -EINTR)
+ goto leave;
+ goto wait_bast;
+ }
+ atomic_set(&mypoo->bast_flag, 0);
+
+
+
+
+ atomic_set(&mypoo->ast_flag, 0);
+
+ status = dlmlock(dlm, LKM_NLMODE, lksb, LKM_CONVERT, "lock1", my_ast2, data, my_bast2);
+
+wait_ast:
+ ret = util_wait_atomic_eq(&mypoo->wq, &mypoo->ast_flag, 1, 0);
+ if (ret < 0) {
+ testprintk("3: waiting on ast converting to NL, ret=%d, type=%d, convtype=%d\n",
+ ret, lksb->lockid->type, lksb->lockid->convert_type);
+ if (ret == -EINTR)
+ goto leave;
+ goto wait_ast;
+ }
+
+ atomic_set(&mypoo->ast_flag, 0);
+ atomic_set(&mypoo->bast_flag, 0);
+
+ status = dlmlock(dlm, LKM_EXMODE, lksb, LKM_CONVERT, "lock1", my_ast2, data, my_bast2);
+
+
+ if (atomic_read(&finish) == 0)
+ goto again;
+leave:
+
+ atomic_set(&finish, 1);
+ kfree(mypoo->lksb);
+ complete (&mypoo->complete);
+ testprintk0("exiting thread\n");
+ return 0;
+}
+
+
+void my_ast2(void *data)
+{
+ poo *mypoo = data;
+ dlm_lockstatus *l = mypoo->lksb;
+ dlm_lock *lock = l->lockid;
+ dlm_lock_resource *res = lock->lockres;
+
+ atomic_inc(&asts_fired);
+ //testprintk("AST!!!: lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n",
+ // l, lock, res, res->lockname.len, res->lockname.name, lock->type);
+ atomic_set(&mypoo->ast_flag, 1);
+ wake_up(&mypoo->wq);
+}
+
+void my_bast2(void *data, int blocked_type)
+{
+ poo *mypoo = data;
+ dlm_lockstatus *l = mypoo->lksb;
+ dlm_lock *lock = l->lockid;
+ dlm_lock_resource *res = lock->lockres;
+
+ atomic_inc(&basts_fired);
+ //testprintk("BAST!!!: blocked=%d, lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n",
+ // blocked_type, l, lock, res, res->lockname.len, res->lockname.name, lock->type);
+ atomic_set(&mypoo->bast_flag, 1);
+ wake_up(&mypoo->wq);
+}
+
+wait_queue_head_t wq3;
+atomic_t ast_flag3, bast_flag3;
+dlm_lockstatus *lksb3;
+
+void my_bast3(void *data, int blocked_type);
+void my_ast3(void *data);
+
+void my_ast3(void *data)
+{
+ dlm_lock *lock = lksb3->lockid;
+ dlm_lock_resource *res = lock->lockres;
+
+ atomic_inc(&asts_fired);
+ testprintk("AST!!!: lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n",
+ lksb3, lock, res, res->lockname.len, res->lockname.name, lock->type);
+ atomic_set(&ast_flag3, 1);
+ wake_up(&wq3);
+}
+
+void my_bast3(void *data, int blocked_type)
+{
+ dlm_lock *lock = lksb3->lockid;
+ dlm_lock_resource *res = lock->lockres;
+
+ atomic_inc(&basts_fired);
+ testprintk("BAST!!!: blocked=%d, lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n",
+ blocked_type, lksb3, lock, res, res->lockname.len, res->lockname.name, lock->type);
+ atomic_set(&bast_flag3, 1);
+ wake_up(&wq3);
+}
+
+static int test_dlm_poop3(int arg)
+{
+ dlm_ctxt *dlm;
+ dlm_status status;
+ int ret, i;
+ my_timing_t t1, t2, t3, t4;
+
+ atomic_set(&finish, 0);
+ atomic_set(&asts_fired, 0);
+ atomic_set(&basts_fired, 0);
+
+ dlm = dlm_register_domain("mylittledomain", "grupo2", 0x6543abcd);
+
+ lksb3 = kmalloc(sizeof(dlm_lockstatus), GFP_KERNEL);
+ memset(lksb3, 0, sizeof(dlm_lockstatus));
+
+ init_waitqueue_head(&wq3);
+
+ atomic_set(&ast_flag3, 0);
+ atomic_set(&bast_flag3, 0);
+
+ i = 0;
+ rdtsc(t1.lohi[0], t1.lohi[1]);
+
+ /* CREATE -> NL */
+ testprintk0("creating lock\n");
+rdtsc(t3.lohi[0], t3.lohi[1]);
+ status = dlmlock(dlm, LKM_NLMODE, lksb3, 0, "lock1", my_ast3, NULL, my_bast3);
+
+ while (1) {
+ testprintk("%d: waiting on ast\n", i);
+ ret = util_wait_atomic_eq(&wq3, &ast_flag3, 1, 0);
+ if (ret == -EINTR)
+ break;
+rdtsc(t4.lohi[0], t4.lohi[1]);
+testprintk("%d: ->NL took: %llu\n", i, t4.q - t3.q);
+ testprintk("%d: no bast for NL\n", i);
+
+ atomic_set(&ast_flag3, 0);
+ atomic_set(&bast_flag3, 0);
+
+ if (i == 10) {
+ testprintk("%d: reached 10, goodbye\n", i);
+ break;
+ }
+ dlm_dump_dlm(dlm);
+
+ /* CONVERT -> EX */
+ testprintk("%d: converting dlmlock->EX\n", i);
+rdtsc(t3.lohi[0], t3.lohi[1]);
+ status = dlmlock(dlm, LKM_EXMODE, lksb3, LKM_CONVERT, "lock1", my_ast3, NULL, my_bast3);
+
+ testprintk("%d: waiting on ast\n", i);
+ ret = util_wait_atomic_eq(&wq3, &ast_flag3, 1, 0);
+ if (ret == -EINTR)
+ break;
+rdtsc(t4.lohi[0], t4.lohi[1]);
+testprintk("%d: ->EX took: %llu\n", i, t4.q - t3.q);
+ atomic_set(&ast_flag3, 0);
+
+ testprintk("%d: waiting on bast\n", i);
+ ret = util_wait_atomic_eq(&wq3, &bast_flag3, 1, 0);
+ if (ret == -EINTR)
+ break;
+ atomic_set(&ast_flag3, 0);
+ atomic_set(&bast_flag3, 0);
+
+ /* CONVERT -> NL */
+ testprintk("%d: converting dlmlock->NL\n", i);
+rdtsc(t3.lohi[0], t3.lohi[1]);
+ status = dlmlock(dlm, LKM_NLMODE, lksb3, LKM_CONVERT, "lock1", my_ast3, NULL, my_bast3);
+
+ /* WAIT ON AST AGAIN */
+ i++;
+ }
+
+ /* DOWNCONVERT LAST TIME */
+ /* TODO: replace with dlmunlock once implemented */
+ status = dlmlock(dlm, LKM_NLMODE, lksb3, LKM_CONVERT, "lock1", my_ast3, NULL, my_bast3);
+
+ kfree(lksb3);
+
+ rdtsc(t2.lohi[0], t2.lohi[1]);
+ testprintk("leaving! asts=%d, basts=%d, diff=%llu\n", atomic_read(&asts_fired), atomic_read(&basts_fired),
+ t2.q - t1.q);
+ return 0;
+}
+
+
+static int test_dlm_register(int arg)
+{
+ dlm_ctxt *dlm;
+
+ testprintk0("calling dlm_register_domain...\n");
+ dlm = dlm_register_domain("mylittledomain", "grupo2", 0x6543abcd);
+ testprintk("dlm_register_domain returned %p\n", dlm);
+
+ the_dlm = dlm;
+ testprintk0("leaving!\n");
+ return 0;
+}
+
+
+
+
+/*
+ * module stuff
+ */
+
+
+static ssize_t write_net_register(struct file *file, char *buf, size_t size)
+{
+ int arg = 0, tmpret, ret;
+ if (size > 0)
+ arg = simple_strtoul(buf, NULL, 0);
+ printk("calling test_net_register(%d)\n", arg);
+ tmpret = test_net_register(arg);
+ ret = sprintf(buf, "test_net_register(%d) returned: %d\n", arg, tmpret);
+ return ret;
+}
+static ssize_t write_net_send(struct file *file, char *buf, size_t size)
+{
+ int arg = 0, tmpret, ret;
+ if (size > 0)
+ arg = simple_strtoul(buf, NULL, 0);
+ printk("calling test_net_send(%d)\n", arg);
+ tmpret = test_net_send(arg);
+ ret = sprintf(buf, "test_net_send(%d) returned: %d\n", arg, tmpret);
+ return ret;
+}
+static ssize_t write_net_get_num(struct file *file, char *buf, size_t size)
+{
+ int arg = 0, tmpret, ret;
+ if (size > 0)
+ arg = simple_strtoul(buf, NULL, 0);
+ printk("calling test_net_get_num(%d)\n", arg);
+ tmpret = test_net_get_num(arg);
+ ret = sprintf(buf, "test_net_get_num(%d) returned: %d\n", arg, tmpret);
+ return ret;
+}
+static ssize_t write_dlm_poop(struct file *file, char *buf, size_t size)
+{
+ int arg = 0, tmpret, ret;
+ if (size > 0)
+ arg = simple_strtoul(buf, NULL, 0);
+ printk("calling test_dlm_poop(%d)\n", arg);
+ tmpret = test_dlm_poop(arg);
+ ret = sprintf(buf, "test_dlm_poop(%d) returned: %d\n", arg, tmpret);
+ return ret;
+}
+static ssize_t write_dlm_poop2(struct file *file, char *buf, size_t size)
+{
+ int arg = 0, tmpret, ret;
+ if (size > 0)
+ arg = simple_strtoul(buf, NULL, 0);
+ printk("calling test_dlm_poop2(%d)\n", arg);
+ tmpret = test_dlm_poop2(arg);
+ ret = sprintf(buf, "test_dlm_poop2(%d) returned: %d\n", arg, tmpret);
+ return ret;
+}
+static ssize_t write_dlm_poop3(struct file *file, char *buf, size_t size)
+{
+ int arg = 0, tmpret, ret;
+ if (size > 0)
+ arg = simple_strtoul(buf, NULL, 0);
+ printk("calling test_dlm_poop3(%d)\n", arg);
+ tmpret = test_dlm_poop3(arg);
+ ret = sprintf(buf, "test_dlm_poop3(%d) returned: %d\n", arg, tmpret);
+ return ret;
+}
+
+static ssize_t write_dlm_register(struct file *file, char *buf, size_t size)
+{
+ int arg = 0, tmpret, ret;
+ if (size > 0)
+ arg = simple_strtoul(buf, NULL, 0);
+ printk("calling test_dlm_register(%d)\n", arg);
+ tmpret = test_dlm_register(arg);
+ ret = sprintf(buf, "test_dlm_register(%d) returned: %d\n", arg, tmpret);
+ return ret;
+}
+
+
+
+
+
+/*----------------------------------------------------------------------------*/
+/*
+ * populating the filesystem.
+ */
+static int test_fill_super(struct super_block * sb, void * data, int silent)
+{
+ int ret, sz;
+ TA_write_ops *ops;
+ static struct tree_descr test_files[] = {
+ [TEST_NetRegister] = {"net-register", &transaction_ops, S_IWUSR},
+ [TEST_NetSend] = {"net-send", &transaction_ops, S_IWUSR},
+ [TEST_NetGetNum] = {"net-get-num", &transaction_ops, S_IWUSR},
+ [TEST_DLMPoop] = {"dlm-poop", &transaction_ops, S_IWUSR},
+ [TEST_DLMPoop2] = {"dlm-poop2", &transaction_ops, S_IWUSR},
+ [TEST_DLMPoop3] = {"dlm-poop3", &transaction_ops, S_IWUSR},
+ [TEST_DLMRegister] = {"dlm-register", &transaction_ops, S_IWUSR},
+ /* last one */ {""}
+ };
+
+ sz = sizeof(test_files) / sizeof(struct tree_descr);
+ ops = kmalloc(sizeof(TA_write_ops) + (sz * sizeof(TA_write_op *)), GFP_KERNEL);
+ if (!ops)
+ return -ENOMEM;
+
+ memset(ops, 0, sizeof(TA_write_ops) + (sz * sizeof(TA_write_op *)));
+ ops->num_ops = sz;
+ ops->write_op[TEST_NetRegister] = write_net_register;
+ ops->write_op[TEST_NetSend] = write_net_send;
+ ops->write_op[TEST_NetGetNum] = write_net_get_num;
+ ops->write_op[TEST_DLMPoop] = write_dlm_poop;
+ ops->write_op[TEST_DLMPoop2] = write_dlm_poop2;
+ ops->write_op[TEST_DLMPoop3] = write_dlm_poop3;
+ ops->write_op[TEST_DLMRegister] = write_dlm_register;
+
+ printk("calling simple_fill_super...\n");
+ ret = simple_fill_super(sb, 0x12beAf00L, test_files);
+ if (ret >= 0) {
+ TA_GENERIC_SB_MEMBER(sb) = ops;
+ } else {
+ kfree(ops);
+ }
+ return ret;
+}
+
+static struct super_block *test_read_super (struct super_block *sb, void *data, int silent)
+{
+ printk("welcome to test_read_super!!!\n");
+ return (test_fill_super(sb, data, silent) < 0) ? NULL : sb;
+}
+
+
+static DECLARE_FSTYPE (test_fs_type, "test", test_read_super, FS_SINGLE|FS_LITTER);
+
+static int __init init_test(void)
+{
+ int retval;
+ void *ret;
+
+ printk("loading test module: nodename is %s\n", nm_nodename);
+
+ ret = proc_mkdir("cluster/test", 0);
+ printk("proc_mkdir of cluster/test returned %p\n", ret);
+
+ printk("calling register_filesystem\n");
+ retval = register_filesystem(&test_fs_type);
+ printk("done calling register_filesystem: ret=%d\n", retval);
+ if (retval) {
+ printk("oopsy that did not work\n");
+ test_teardown();
+ } else
+ printk("woot. good to go.\n");
+ return retval;
+}
+
+static void __exit exit_test(void)
+{
+ test_teardown();
+ unregister_filesystem(&test_fs_type);
+ printk("unloading test module\n");
+}
+
+static void test_teardown(void)
+{
+ printk("removing cluster/test\n");
+ remove_proc_entry("cluster/test", NULL);
+}
+
+
+
+
+
+MODULE_LICENSE("GPL");
+module_init(init_test);
+module_exit(exit_test);
Added: branches/dlm-glue/cluster/util.c
===================================================================
--- branches/dlm-glue/cluster/util.c 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/util.c 2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,349 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * util.c
+ *
+ * General purpose code
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include "util.h"
+
+static void util_timeout_func(unsigned long data);
+
+/* block all but 'mask' sigs, optionally saving off our previous
+ * signal state. */
+void util_block_sigs(sigset_t *oldsigs, unsigned long mask)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+ sigset_t tmpsig;
+
+ siginitsetinv(&tmpsig, mask);
+ sigprocmask(SIG_BLOCK, &tmpsig, oldsigs);
+#else
+#ifdef HAVE_NPTL
+	spin_lock_irq (&current->sighand->siglock);
+	if (oldsigs)
+		*oldsigs = current->blocked;
+	siginitsetinv (&current->blocked, mask);
+	recalc_sigpending ();
+	spin_unlock_irq (&current->sighand->siglock);
+#else
+	spin_lock_irq (&current->sigmask_lock);
+	if (oldsigs)
+		*oldsigs = current->blocked;
+	siginitsetinv (&current->blocked, mask);
+	recalc_sigpending (current);
+	spin_unlock_irq (&current->sigmask_lock);
+#endif
+#endif
+}
+
+void util_unblock_sigs(sigset_t newsig)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+ sigprocmask(SIG_SETMASK, &newsig, NULL);
+#else
+#ifdef HAVE_NPTL
+	spin_lock_irq (&current->sighand->siglock);
+	current->blocked = newsig;
+	recalc_sigpending ();
+	spin_unlock_irq (&current->sighand->siglock);
+#else
+	spin_lock_irq (&current->sigmask_lock);
+	current->blocked = newsig;
+	recalc_sigpending (current);
+	spin_unlock_irq (&current->sigmask_lock);
+#endif
+#endif
+}
+
+/*
+ * util_daemonize()
+ *
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+/* yes, len is unused but kept here for backwards compatibility. */
+void util_daemonize (char *name, int len, int shutdown_sigs)
+{
+ sigset_t tmpsig;
+
+ daemonize (name);
+
+ if (shutdown_sigs) {
+ /* Unblock SIGKILL, SIGSTOP, SIGHUP and SIGINT */
+ sigemptyset(&tmpsig);
+ sigaddsetmask(&tmpsig, SHUTDOWN_SIGS);
+ sigprocmask(SIG_UNBLOCK, &tmpsig, NULL);
+ }
+
+ return;
+} /* util_daemonize */
+#else
+void util_daemonize (char *name, int len, int shutdown_sigs)
+{
+ daemonize ();
+ reparent_to_init ();
+
+ if (len > 0) {
+ if (len > 15)
+ BUG();
+ strncpy (current->comm, name, len);
+ current->comm[len] = '\0';
+ }
+
+ if (shutdown_sigs)
+ util_block_sigs(NULL, SHUTDOWN_SIGS);
+ else
+ util_block_sigs(NULL, 0);
+ return;
+} /* util_daemonize */
+#endif
+
+/*
+ * util_sleep()
+ *
+ * The interval time is in milliseconds
+ *
+ * This function needs to be removed.
+ * Instead call schedule_timeout() directly and handle signals.
+ */
+int util_sleep (__u32 ms)
+{
+ __u32 numJiffies;
+
+ /* convert ms to jiffies, minimum resolution is one jiffy */
+ numJiffies = ms * HZ / 1000;
+ numJiffies = (numJiffies < 1) ? 1 : numJiffies;
+
+ set_current_state (TASK_INTERRUPTIBLE);
+ numJiffies = schedule_timeout (numJiffies);
+
+ return 0;
+} /* util_sleep */
+
+/* prefetch has been declared to allow building in debug mode */
+#ifdef DEBUG
+#ifndef ARCH_HAS_PREFETCH
+inline void prefetch (const void *x)
+{;
+}
+#endif
+#endif
+
+
+static void util_timeout_func(unsigned long data)
+{
+ util_timeout *to = (util_timeout *)data;
+
+ to->timed_out = 1;
+ wake_up(&to->wait);
+}
+
+void util_init_timeout(util_timeout *to)
+{
+ init_timer(&to->timer);
+ to->timer.data = (unsigned long)to;
+ to->timer.function = util_timeout_func;
+ to->timed_out = 0;
+ init_waitqueue_head(&to->wait);
+}
+
+void util_set_timeout(util_timeout *to, __u32 timeout)
+{
+ __u32 how_long;
+
+ if (!timeout) {
+ to->timed_out = 1;
+ return ;
+ }
+
+ how_long = (timeout * HZ / 1000);
+ if (how_long < 1)
+ how_long = 1;
+
+ to->timer.expires = jiffies + how_long;
+ add_timer(&to->timer);
+}
+
+void util_clear_timeout(util_timeout *to)
+{
+ del_timer_sync(&to->timer);
+}
+
+int __util_wait_atomic_eq(wait_queue_head_t *wq, atomic_t *var, int val, int ms)
+{
+ int ret;
+ util_timeout timeout;
+ DECLARE_WAITQUEUE(wait, current);
+ DECLARE_WAITQUEUE(to_wait, current);
+
+ util_init_timeout(&timeout);
+
+ if (ms) {
+ util_set_timeout(&timeout, ms);
+ if (timeout.timed_out) {
+ util_clear_timeout(&timeout);
+ }
+ }
+ add_wait_queue(wq, &wait);
+ add_wait_queue(&timeout.wait, &to_wait);
+ do {
+ ret = 0;
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (atomic_read(var)==val)
+ break;
+ ret = -ETIMEDOUT;
+ if (timeout.timed_out)
+ break;
+ schedule();
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+ } while (1);
+
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(wq, &wait);
+ remove_wait_queue(&timeout.wait, &to_wait);
+
+ if (ms)
+ util_clear_timeout(&timeout);
+
+ return ret;
+}
+
+/* resizable (using chained pages) array stuff */
+void util_init_rarray(util_rarray *arr, u16 elem_size)
+{
+ arr->elements = 0;
+ arr->max_elem = 0;
+ arr->elem_size = elem_size;
+ arr->page = NULL;
+}
+
+
+void * util_rarray_idx_to_slot(util_rarray *arr, int idx)
+{
+ int pgnum, pgoff;
+ util_rarray_page *pg;
+
+ if (idx >= arr->max_elem) {
+ printk("eek! asked for %d, but only %d elements\n",
+ idx, arr->max_elem);
+ return NULL;
+ }
+
+ pgnum = idx / UTIL_RARRAY_ELEM_PER_BUF(arr);
+ pgoff = idx % UTIL_RARRAY_ELEM_PER_BUF(arr);
+ pg = (util_rarray_page *)arr->page;
+ while (pgnum--) {
+ if (!pg->next) {
+ printk("eeek! no next page!\n");
+ return NULL;
+ }
+ pg = pg->next;
+ }
+ return (((char *)pg->buf) + (pgoff * arr->elem_size));
+}
+
+
+void * util_get_new_rarray_slot(util_rarray *arr, int *index)
+{
+ char *tmp;
+ util_rarray_page *newpg, *pg;
+
+ if (arr->max_elem == arr->elements) {
+ newpg = (util_rarray_page *) __get_free_page(GFP_KERNEL);
+ if (!newpg) {
+ printk("could not grow array!!!\n");
+ return NULL;
+ }
+ memset(newpg, 0, PAGE_SIZE);
+ if (arr->page) {
+ pg = (util_rarray_page *)arr->page;
+ while (pg->next)
+ pg = pg->next;
+ pg->next = newpg;
+ } else
+ arr->page = newpg;
+ arr->max_elem += UTIL_RARRAY_ELEM_PER_BUF(arr);
+ }
+
+ tmp = util_rarray_idx_to_slot(arr, arr->elements);
+ if (tmp) {
+ if (index)
+ *index = arr->elements;
+ arr->elements++;
+ }
+ return tmp;
+}
+
+
+int util_add_to_rarray(util_rarray *arr, void *new)
+{
+ void *slot;
+ int idx;
+
+ slot = util_get_new_rarray_slot(arr, &idx);
+ if (slot == NULL)
+ return -EINVAL;
+ memcpy(slot, new, arr->elem_size);
+ return idx;
+}
+
+/* resizes rarray to at least newelem elements */
+int util_resize_rarray(util_rarray *arr, int newelem)
+{
+ util_rarray_page *newpg, *pg;
+
+ printk("util_resize_rarray: newsize=%d, maxelem=%d\n", newelem, arr->max_elem);
+ while (arr->max_elem < newelem) {
+ newpg = (util_rarray_page *) __get_free_page(GFP_KERNEL);
+ if (!newpg) {
+ printk("could not grow array!!!\n");
+ return -ENOMEM;
+ }
+ memset(newpg, 0, PAGE_SIZE);
+ if (arr->page) {
+ pg = (util_rarray_page *)arr->page;
+ while (pg->next)
+ pg = pg->next;
+ pg->next = newpg;
+ } else
+ arr->page = newpg;
+ arr->max_elem += UTIL_RARRAY_ELEM_PER_BUF(arr);
+ }
+ printk("leaving util_resize_rarray: newsize=%d, maxelem=%d\n", newelem, arr->max_elem);
+
+ return 0;
+}
+
+
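A note on the resizable-array helpers added above: util_rarray keeps fixed-size
elements in a chain of whole pages (util_rarray_page), grows one page at a time
with __get_free_page(), and hands back integer slot indices. A minimal caller
sketch follows; everything named example_* is illustrative and not part of this
commit, only the util_* calls and types come from util.c/util.h:

        /* Hypothetical element type and callers; assumes "util.h" is included. */
        struct example_node {
                u16 node_num;
                u32 ip_addr;
        };

        static util_rarray example_nodes;

        static void example_nodes_init(void)
        {
                /* every slot holds one struct example_node */
                util_init_rarray(&example_nodes, sizeof(struct example_node));
        }

        static int example_add_node(struct example_node *node)
        {
                int idx;

                /* copies *node into the next free slot, chaining on another
                 * page when the array is full; returns the new slot's index,
                 * or -EINVAL if no page could be allocated */
                idx = util_add_to_rarray(&example_nodes, node);
                if (idx < 0)
                        return idx;

                /* the index maps back to the same slot later on */
                return util_rarray_idx_to_slot(&example_nodes, idx) ? idx : -EINVAL;
        }

Lookups walk the page chain linearly, so this is suited to small tables rather
than anything indexed in a hot path.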
Added: branches/dlm-glue/cluster/util.h
===================================================================
--- branches/dlm-glue/cluster/util.h 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/util.h 2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,109 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * util.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_UTIL_H
+#define CLUSTER_UTIL_H
+
+#ifdef __KERNEL__
+#define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGHUP) | \
+ sigmask(SIGINT) | sigmask(SIGQUIT))
+
+/* timeout structure taken from Ben's aio.c */
+typedef struct _util_timeout {
+ struct timer_list timer;
+ int timed_out;
+ wait_queue_head_t wait;
+} util_timeout;
+
+void util_clear_timeout(util_timeout *to);
+void util_daemonize(char *name, int len, int shutdown_sigs);
+void util_init_timeout(util_timeout *to);
+void util_set_timeout(util_timeout *to, __u32 timeout);
+void util_show_stack(unsigned long *esp);
+void util_show_trace(unsigned long *stack);
+int util_sleep(__u32 ms);
+int __util_wait_atomic_eq(wait_queue_head_t *wq, atomic_t *var, int val, int ms);
+void util_block_sigs(sigset_t *oldsigs, unsigned long mask);
+void util_unblock_sigs(sigset_t newsig);
+
+/* exits when var == val, on timeout, or on signal */
+static inline int util_wait_atomic_eq(wait_queue_head_t *wq, atomic_t *var, int val, int timeout)
+{
+ int ret = 0;
+ if (atomic_read(var) != val)
+ ret = __util_wait_atomic_eq(wq, var, val, timeout);
+ return ret;
+}
+
+#endif /* __KERNEL__ */
+
+/* resizable array */
+typedef struct _util_rarray
+{
+ void *page;
+ u16 elements;
+ u16 max_elem;
+ u16 elem_size;
+ u16 reserved1;
+} util_rarray;
+
+#define UTIL_RARRAY_PAGE_BUF_SIZE (PAGE_SIZE - offsetof(util_rarray_page, buf))
+#define UTIL_RARRAY_ELEM_PER_BUF(r) ((UTIL_RARRAY_PAGE_BUF_SIZE) / (r)->elem_size)
+typedef struct _util_rarray_page
+{
+ void *next;
+ char buf[0];
+} util_rarray_page;
+
+void util_init_rarray(util_rarray *arr, u16 elem_size);
+void * util_get_new_rarray_slot(util_rarray *arr, int *index);
+int util_add_to_rarray(util_rarray *arr, void *new);
+void * util_rarray_idx_to_slot(util_rarray *arr, int idx);
+int util_resize_rarray(util_rarray *arr, int newelem);
+
+#ifdef __KERNEL__
+typedef struct _util_thread_info
+{
+ wait_queue_head_t thread_wq;
+ atomic_t woken;
+ struct task_struct *task;
+ struct completion complete;
+ int pid;
+} util_thread_info;
+
+
+static inline void util_thread_info_init(util_thread_info *info)
+{
+ init_waitqueue_head(&info->thread_wq);
+ atomic_set(&info->woken, 0);
+ info->task = NULL;
+ info->pid = -1;
+ init_completion(&info->complete);
+}
+#endif /* __KERNEL__ */
+
+#endif /* CLUSTER_UTIL_H */
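The util_thread_info structure and util_wait_atomic_eq() above are meant to be
used together when a caller spawns a kernel thread and then waits for it to
announce itself. A rough sketch of that pattern, assuming the thread is started
with kernel_thread(); everything named example_* is illustrative and not part
of this commit, and the usual sched/completion headers plus "util.h" are
assumed to be included:

        static util_thread_info example_thread;

        static int example_worker(void *unused)
        {
                util_daemonize("example_worker", strlen("example_worker"), 1);
                example_thread.task = current;

                /* tell the starter we are up and running */
                atomic_set(&example_thread.woken, 1);
                wake_up(&example_thread.thread_wq);

                /* ... real work would go here ... */

                complete(&example_thread.complete);
                return 0;
        }

        static int example_start_worker(void)
        {
                util_thread_info_init(&example_thread);

                example_thread.pid = kernel_thread(example_worker, NULL,
                                                   CLONE_FS | CLONE_FILES);
                if (example_thread.pid < 0)
                        return example_thread.pid;

                /* 0 once woken == 1; -ETIMEDOUT after 5 seconds; -EINTR on signal */
                return util_wait_atomic_eq(&example_thread.thread_wq,
                                           &example_thread.woken, 1, 5000);
        }

Callers have to be prepared for all three return values: 0 when the atomic
reaches the expected value, -ETIMEDOUT if the millisecond timeout fires first,
and -EINTR if a signal is delivered while waiting.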
Added: branches/dlm-glue/cluster/warning_hack.h
===================================================================
--- branches/dlm-glue/cluster/warning_hack.h 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/cluster/warning_hack.h 2004-12-04 02:54:01 UTC (rev 1692)
@@ -0,0 +1,40 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * warning_hack.h
+ *
+ * just to get rid of stupid warnings
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef WARNING_HACK_H
+#define WARNING_HACK_H
+
+struct mem_dqinfo;
+struct request;
+
+extern __inline__ int generic_fls(int x);
+extern __inline__ int get_bitmask_order(unsigned int count);
+extern inline void mark_info_dirty(struct mem_dqinfo *info);
+extern inline int rq_data_dir(struct request *rq);
+
+
+#endif /* WARNING_HACK_H */
Modified: branches/dlm-glue/configure.in
===================================================================
--- branches/dlm-glue/configure.in 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/configure.in 2004-12-04 02:54:01 UTC (rev 1692)
@@ -82,19 +82,6 @@
AC_MSG_ERROR(GCC is required)
fi
-AC_MSG_CHECKING(for cluster support headers)
-AC_ARG_WITH(cluster-support, [ --with-cluster-support=dir Path to the cluster support headers [[none]]], clusterinc="$withval", clusterinc="not found")
-AC_MSG_RESULT($clusterinc)
-
-CLUSTERINC=
-if test -f "$clusterinc/dlmcommon.h"; then
- CLUSTERINC=$clusterinc
-else
- AC_MSG_ERROR([Cluster support headers not found, please use --with-cluster-support=/path/to/headers])
-fi
-
-AC_SUBST(CLUSTERINC)
-
AC_MSG_CHECKING(for debugging)
AC_ARG_ENABLE(debug, [ --enable-debug=[yes/no] Turn on debugging [default=yes]],,enable_debug=yes)
OCFS_DEBUG=
Modified: branches/dlm-glue/src/Makefile
===================================================================
--- branches/dlm-glue/src/Makefile 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/src/Makefile 2004-12-04 02:54:01 UTC (rev 1692)
@@ -188,7 +188,7 @@
BASE_DEFINES = -DMODULE -DLINUX -D__KERNEL__
DEFINES += $(BASE_DEFINES) $(GLOBAL_DEFINES)
-INCLUDES = -I. -I$(KERNELINC) -I$(GCCINC) -I$(CLUSTERINC)
+INCLUDES = -I. -I$(TOPDIR) -I$(KERNELINC) -I$(GCCINC)
CFLAGS = $(OPTS) $(MACH_CFLAGS) -pipe -nostdinc -fno-strict-aliasing \
-fno-common -fomit-frame-pointer $(MODVERSIONS) $(WARNINGS)
@@ -237,8 +237,8 @@
INSTALL_RULES = install-ocfs
install-ocfs: $(INSTALL_MODULE)
- $(TOPDIR)/mkinstalldirs $(DESTDIR)$(MODULEDIR)
- $(INSTALL_DATA) $< $(DESTDIR)$(MODULEDIR)/$<
+ $(TOPDIR)/mkinstalldirs $(DESTDIR)$(MODULEDIR)/ocfs2
+ $(INSTALL_DATA) $< $(DESTDIR)$(MODULEDIR)/ocfs2/$<
include $(TOPDIR)/Postamble.make
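The INCLUDES change above is what the header rewrites below rely on: instead of
pointing the build at an external CLUSTERINC directory (the configure option
removed earlier in this commit), the fs sources now pick the cluster headers up
straight out of the tree via -I$(TOPDIR), and the module installs into an
ocfs2/ subdirectory of MODULEDIR. In practice the includes in src/*.c change
shape like this (both forms are visible in the hunks that follow):

        /* before: headers located via --with-cluster-support / $(CLUSTERINC) */
        #include <dlmcommon.h>

        /* after: resolved against the source tree root through -I$(TOPDIR) */
        #include <cluster/dlmcommon.h>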
Modified: branches/dlm-glue/src/dlmglue.c
===================================================================
--- branches/dlm-glue/src/dlmglue.c 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/src/dlmglue.c 2004-12-04 02:54:01 UTC (rev 1692)
@@ -30,12 +30,12 @@
#include <linux/highmem.h>
#include <linux/smp_lock.h>
-#include <dlmutil.h>
-#include <dlmcommon.h>
-#include <dlmhb.h>
-#include <dlmnm.h>
-#include <dlmtcp.h>
-#include <dlmmod.h>
+#include <cluster/util.h>
+#include <cluster/dlmcommon.h>
+#include <cluster/heartbeat.h>
+#include <cluster/nodemanager.h>
+#include <cluster/tcp.h>
+#include <cluster/dlmmod.h>
#include "ocfs_log.h"
#include "ocfs.h"
Modified: branches/dlm-glue/src/heartbeat.c
===================================================================
--- branches/dlm-glue/src/heartbeat.c 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/src/heartbeat.c 2004-12-04 02:54:01 UTC (rev 1692)
@@ -31,9 +31,9 @@
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <dlmutil.h>
-#include <dlmcommon.h>
-#include <dlmhb.h>
+#include <cluster/util.h>
+#include <cluster/dlmcommon.h>
+#include <cluster/heartbeat.h>
#include "ocfs_log.h"
#include "ocfs.h"
Modified: branches/dlm-glue/src/ocfs.h
===================================================================
--- branches/dlm-glue/src/ocfs.h 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/src/ocfs.h 2004-12-04 02:54:01 UTC (rev 1692)
@@ -42,11 +42,11 @@
# include <linux/tqueue.h>
#endif
-#include <dlmutil.h>
-#include <dlmcommon.h>
-#include <dlmnm.h>
-#include <dlmtcp.h>
-#include <dlmmod.h>
+#include <cluster/util.h>
+#include <cluster/dlmcommon.h>
+#include <cluster/nodemanager.h>
+#include <cluster/tcp.h>
+#include <cluster/dlmmod.h>
/* convenience macro */
Modified: branches/dlm-glue/src/super.c
===================================================================
--- branches/dlm-glue/src/super.c 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/src/super.c 2004-12-04 02:54:01 UTC (rev 1692)
@@ -42,9 +42,9 @@
#include <linux/socket.h>
#include <linux/inet.h>
-#include <dlmutil.h>
-#include <dlmcommon.h>
-#include <dlmnm.h>
+#include <cluster/util.h>
+#include <cluster/dlmcommon.h>
+#include <cluster/nodemanager.h>
#include "ocfs_log.h"
#include "ocfs.h"
Modified: branches/dlm-glue/src/vote.c
===================================================================
--- branches/dlm-glue/src/vote.c 2004-12-04 01:18:01 UTC (rev 1691)
+++ branches/dlm-glue/src/vote.c 2004-12-04 02:54:01 UTC (rev 1692)
@@ -30,12 +30,12 @@
#include <linux/highmem.h>
#include <linux/smp_lock.h>
-#include <dlmutil.h>
-#include <dlmcommon.h>
-#include <dlmhb.h>
-#include <dlmnm.h>
-#include <dlmtcp.h>
-#include <dlmmod.h>
+#include <cluster/util.h>
+#include <cluster/dlmcommon.h>
+#include <cluster/heartbeat.h>
+#include <cluster/nodemanager.h>
+#include <cluster/tcp.h>
+#include <cluster/dlmmod.h>
#include "ocfs_log.h"
#include "ocfs.h"