[Ocfs2-commits] mfasheh commits r1693 - in trunk: . cluster src
svn-commits at oss.oracle.com
svn-commits at oss.oracle.com
Mon Dec 6 15:45:34 CST 2004
Author: mfasheh
Date: 2004-12-06 15:45:32 -0600 (Mon, 06 Dec 2004)
New Revision: 1693
Added:
trunk/cluster/
trunk/cluster/Makefile
trunk/cluster/compat_libfs.c
trunk/cluster/compat_libfs.h
trunk/cluster/dlm_compat.h
trunk/cluster/dlmcommon.h
trunk/cluster/dlmmaster.c
trunk/cluster/dlmmod.c
trunk/cluster/dlmmod.h
trunk/cluster/dlmrecovery.c
trunk/cluster/dlmthread.c
trunk/cluster/heartbeat.c
trunk/cluster/heartbeat.h
trunk/cluster/nodemanager.c
trunk/cluster/nodemanager.h
trunk/cluster/tcp.c
trunk/cluster/tcp.h
trunk/cluster/test.c
trunk/cluster/util.c
trunk/cluster/util.h
trunk/cluster/warning_hack.h
trunk/src/dlmglue.c
trunk/src/dlmglue.h
trunk/src/slot_map.c
trunk/src/slot_map.h
Removed:
trunk/src/dlm.c
trunk/src/dlm.h
trunk/src/lockres.c
trunk/src/lockres.h
trunk/src/nm.c
trunk/src/nm.h
trunk/src/ocfs2_disk_dlm.h
trunk/src/volcfg.c
trunk/src/volcfg.h
Modified:
trunk/Makefile
trunk/src/Makefile
trunk/src/alloc.c
trunk/src/aops.c
trunk/src/dcache.c
trunk/src/dir.c
trunk/src/file.c
trunk/src/heartbeat.c
trunk/src/heartbeat.h
trunk/src/inode.c
trunk/src/inode.h
trunk/src/journal.c
trunk/src/localalloc.c
trunk/src/namei.c
trunk/src/ocfs.h
trunk/src/ocfs1_fs_compat.h
trunk/src/ocfs2.h
trunk/src/ocfs2_fs.h
trunk/src/ocfs_journal.h
trunk/src/ocfs_log.h
trunk/src/proc.c
trunk/src/suballoc.c
trunk/src/super.c
trunk/src/sysfile.c
trunk/src/sysfile.h
trunk/src/util.c
trunk/src/util.h
trunk/src/vote.c
trunk/src/vote.h
Log:
* merge the dlm-glue branch back to trunk.
Modified: trunk/Makefile
===================================================================
--- trunk/Makefile 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/Makefile 2004-12-06 21:45:32 UTC (rev 1693)
@@ -2,7 +2,7 @@
include $(TOPDIR)/Preamble.make
-SUBDIRS = src docs patches vendor
+SUBDIRS = cluster src docs patches vendor
DIST_FILES = \
COPYING \
Added: trunk/cluster/Makefile
===================================================================
--- trunk/cluster/Makefile 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/Makefile 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,226 @@
+# See if we are being included by the 2.6 kernel build system.
+ifeq ($(KERNELRELEASE),)
+# Normal build that is being called locally
+TOPDIR = ..
+
+include $(TOPDIR)/Preamble.make
+
+else # ifeq ($(KERNELRELEASE),)
+# We are being included by the 2.6.x kernel build system
+
+# Global parameter so we know where our stuff is
+CLUSTER_SRC_DIR := $(M)
+
+include $(CLUSTER_SRC_DIR)/../Config.make
+endif
+
+#-*******************************************************
+# Now do stuff which is global for 2.4.x and 2.6.x builds
+
+#ifdef OCFS_DEBUG
+OPTS += -g
+#endif
+
+#ifdef OCFS_DEBUG
+GLOBAL_DEFINES += -DDEBUG
+#endif
+
+ifdef OCFS_TRACE
+GLOBAL_DEFINES += -DTRACE
+endif
+
+ifdef HAVE_NPTL
+GLOBAL_DEFINES += -DHAVE_NPTL
+endif
+
+CFILES = \
+ compat_libfs.c \
+ dlmmaster.c \
+ dlmmod.c \
+ dlmrecovery.c \
+ dlmthread.c \
+ heartbeat.c \
+ nodemanager.c \
+ tcp.c \
+ util.c \
+ test.c
+
+HFILES = \
+ compat_libfs.h \
+ dlm_compat.h \
+ dlmcommon.h \
+ dlmmod.h \
+ heartbeat.h \
+ nodemanager.h \
+ tcp.h \
+ util.h \
+ warning_hack.h
+
+CLEAN_RULES = clean-cluster
+
+OBJS = $(subst .c,.o,$(CFILES))
+
+# End of stuff which is global for 2.4.x and 2.6.x kernels
+#-********************************************************
+
+# See if we are being included by the 2.6 kernel build system.
+ifeq ($(KERNELRELEASE),)
+# Normal build that is being called locally
+# Preliminary 2.6.x kernel support. See if we are building for the 2.6.x
+# kernel
+ifndef KERNEL_26
+# Building for a 2.4.x kernel
+
+WARNINGS = -Wall -Wstrict-prototypes
+
+ifneq ($(OCFS_PROCESSOR),x86_64)
+WARNINGS += -Wmissing-prototypes -Wmissing-declarations
+endif
+
+ifeq ($(KVER),vmware)
+ KERNELINC = /usr/src/linux-2.4/include
+endif
+
+ifeq ($(KVER),suse)
+ GLOBAL_DEFINES += -DSUSE
+endif
+ifeq ($(KVER),hugemem)
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_HUGEMEM=1
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_ENTERPRISE=0
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_SMP=0
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_UP=0
+endif
+ifeq ($(KVER),smp)
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_HUGEMEM=0
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_ENTERPRISE=0
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_SMP=1
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_UP=0
+endif
+ifeq ($(KVER),ent)
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_HUGEMEM=0
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_ENTERPRISE=1
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_SMP=0
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_UP=0
+endif
+ifeq ($(KVER),up)
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_HUGEMEM=0
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_ENTERPRISE=0
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_SMP=0
+ GLOBAL_DEFINES += -D__BOOT_KERNEL_UP=1
+endif
+
+ifeq ($(OCFS_PROCESSOR),ppc64)
+ MACH_CFLAGS += -m64 -fsigned-char -fno-builtin -msoft-float -mminimal-toc
+ LDADD += -m elf64ppc
+endif
+ifeq ($(OCFS_PROCESSOR),x86_64)
+ MACH_CFLAGS += -m64 -mcmodel=kernel
+endif
+
+BASE_DEFINES = -DMODULE -DLINUX -D__KERNEL__
+DEFINES += $(BASE_DEFINES) $(GLOBAL_DEFINES)
+
+INCLUDES = -I. -I$(KERNELINC) -I$(GCCINC)
+
+CFLAGS = $(OPTS) $(MACH_CFLAGS) -pipe -nostdinc -fno-strict-aliasing \
+ -fno-common -fomit-frame-pointer $(MODVERSIONS) $(WARNINGS)
+LDADD = -nostdlib
+
+OPTIMIZE = -O2
+
+CFLAGS += $(OPTIMIZE)
+
+MODULES = ocfs2_dlm.o ocfs2_heartbeat.o ocfs2_nodemanager.o ocfs2_tcp.o
+TEST_MODULES = ocfs2_cluster_test.o
+
+INSTALL_MODULES = $(MODULES)
+
+# Make dependencies work
+$(CFILES): $(HFILES)
+$(OBJS): $(HFILES)
+
+build-cluster: $(MODULES)
+
+ocfs2_cluster_test.o: test.o util.o compat_libfs.o
+ $(CC) $(OPTS) -Wl,-r -o $@ $^ $(LDADD)
+
+ocfs2_dlm.o: dlmmod.o dlmthread.o dlmrecovery.o util.o compat_libfs.o dlmmaster.o
+ $(CC) $(OPTS) -Wl,-r -o $@ $^ $(LDADD)
+
+ocfs2_nodemanager.o: nodemanager.o util.o compat_libfs.o
+ $(CC) $(OPTS) -Wl,-r -o $@ $^ $(LDADD)
+
+ocfs2_heartbeat.o: heartbeat.o util.o compat_libfs.o
+ $(CC) $(OPTS) -Wl,-r -o $@ $^ $(LDADD)
+
+ocfs2_tcp.o: tcp.o util.o compat_libfs.o
+ $(CC) $(OPTS) -Wl,-r -o $@ $^ $(LDADD)
+
+clean-cluster:
+ rm -f *.o *.p *.s
+
+else # ifndef KERNEL_26
+# The 2.6.x kernel makefile
+
+# This Makefile has two ways through it. They are:
+# 1. We are being included by the local Makefile to do a 2.6 kernel build.
+# In this method we will call the kernel make system to build our module.
+# This will cause the kernel make system to call back into our makefile
+# (2nd way).
+
+INSTALL_MODULE = ocfs2.ko
+
+#ALL_RULES = stamp-md5 build-ocfs
+ALL_RULES = build-cluster
+
+build-ocfs:
+ $(MAKE) -C $(KERNELDIR) M=$(CURDIR) modules
+
+clean-ocfs:
+ $(MAKE) -C $(KERNELDIR) M=$(CURDIR) clean
+
+endif # ifndef KERNEL_26
+
+INSTALL_RULES = install-cluster
+
+install-cluster: $(INSTALL_MODULES)
+ $(TOPDIR)/mkinstalldirs $(DESTDIR)$(MODULEDIR)/ocfs2
+ @for file in $(INSTALL_MODULES); do \
+ $(INSTALL_DATA) $$file $(DESTDIR)$(MODULEDIR)/ocfs2/$$file \
+ done
+
+include $(TOPDIR)/Postamble.make
+
+else # ifeq ($(KERNELRELEASE),)
+# We are being included by the 2.6 kernel build system. So we will include the
+# 2.6.x Makefile and skip everything else.
+# The 2.6.x kernel makefile
+
+# This Makefile has two ways through it. They are:
+# 1. We are being included by the local Makefile to do a 2.6 kernel build.
+# In this method we will call the kernel make system to build our module.
+# This will cause the kernel make system to call back into our makefile
+# (2nd way).
+#
+# 2. We are being included by the kernel make system. So in this method we
+# just setup the variables that the make system wants and then the kernel
+# make system will take care of the build.
+
+# 2nd method. The kernel make system is including us. We need to setup the
+# various parameters for the kernel make system and then it will take care of
+# building us.
+
+STAMP_DIR = $(OCFS_SRC_DIR)
+include $(OCFS_SRC_DIR)/../Versioning.make
+
+EXTRA_CFLAGS += $(GLOBAL_DEFINES)
+
+CFLAGS_$(VERSION_OBJ) += $(VERDEFS)
+
+# Kernel Module file to produce
+obj-m += ocfs2.o
+
+# list of object files that are used to create our module
+ocfs2-objs := $(OBJS)
+
+endif # ifneq ($(KERNELRELEASE),)
Added: trunk/cluster/compat_libfs.c
===================================================================
--- trunk/cluster/compat_libfs.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/compat_libfs.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,705 @@
+/* -----------------------------------------------------------------*/
+
+
+/*
+ * compat_libfs.c
+ * Library for filesystems writers.
+ * PLUS... transaction file stuff stolen from nfsd
+ */
+
+#include "warning_hack.h"
+
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <asm/uaccess.h>
+#include <linux/slab.h>
+
+#include "compat_libfs.h"
+
+#define kstatfs statfs
+#define __user
+
+
+int simple_statfs(struct super_block *sb, struct statfs *buf);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd);
+#else
+struct dentry *simple_lookup(struct inode *dir,struct dentry *dentry);
+#endif
+
+int simple_sync_file(struct file *file, struct dentry *dentry, int datasync);
+int dcache_dir_open(struct inode *inode, struct file *file);
+int dcache_dir_close(struct inode *inode, struct file *file);
+loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin);
+ssize_t generic_read_dir(struct file *filp, char *buf, size_t siz, loff_t *ppos);
+int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry);
+int simple_empty(struct dentry *dentry);
+int simple_unlink(struct inode *dir, struct dentry *dentry);
+int simple_rmdir(struct inode *dir, struct dentry *dentry);
+int simple_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry);
+int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files);
+
+
+
+#if 0
+int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode = dentry->d_inode;
+ generic_fillattr(inode, stat);
+ stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9);
+ return 0;
+}
+#endif
+
+int simple_statfs(struct super_block *sb, struct kstatfs *buf)
+{
+ buf->f_type = sb->s_magic;
+ buf->f_bsize = PAGE_CACHE_SIZE;
+ buf->f_namelen = NAME_MAX;
+ return 0;
+}
+
+/*
+ * Lookup the data. This is trivial - if the dentry didn't already
+ * exist, we know it is negative.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+ if (dentry->d_name.len > NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+ d_add(dentry, NULL);
+ return NULL;
+}
+#else
+struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry)
+{
+ if (dentry->d_name.len > NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+ d_add(dentry, NULL);
+ return NULL;
+}
+#endif
+
+
+struct dentry * simple_find_child(struct dentry *dentry, struct qstr *name)
+{
+ struct list_head *iter;
+ struct dentry *child = NULL;
+
+ spin_lock(&dcache_lock);
+ list_for_each(iter, &dentry->d_subdirs) {
+ child = list_entry(iter, struct dentry, d_child);
+ if (child->d_name.len == name->len &&
+ memcmp(child->d_name.name, name->name, name->len)==0)
+ break;
+ child = NULL;
+ }
+ if (child)
+ dget_locked(child);
+ spin_unlock(&dcache_lock);
+ return child;
+}
+
+
+
+int simple_sync_file(struct file * file, struct dentry *dentry, int datasync)
+{
+ return 0;
+}
+
+int dcache_dir_open(struct inode *inode, struct file *file)
+{
+ static struct qstr cursor_name = {.len = 1, .name = "."};
+
+ file->private_data = d_alloc(file->f_dentry, &cursor_name);
+
+ return file->private_data ? 0 : -ENOMEM;
+}
+
+int dcache_dir_close(struct inode *inode, struct file *file)
+{
+ dput(file->private_data);
+ return 0;
+}
+
+loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
+{
+ down(&file->f_dentry->d_inode->i_sem);
+ switch (origin) {
+ case 1:
+ offset += file->f_pos;
+ case 0:
+ if (offset >= 0)
+ break;
+ default:
+ up(&file->f_dentry->d_inode->i_sem);
+ return -EINVAL;
+ }
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ if (file->f_pos >= 2) {
+ struct list_head *p;
+ struct dentry *cursor = file->private_data;
+ loff_t n = file->f_pos - 2;
+
+ spin_lock(&dcache_lock);
+ list_del(&cursor->d_child);
+ p = file->f_dentry->d_subdirs.next;
+ while (n && p != &file->f_dentry->d_subdirs) {
+ struct dentry *next;
+ next = list_entry(p, struct dentry, d_child);
+ if (!d_unhashed(next) && next->d_inode)
+ n--;
+ p = p->next;
+ }
+ list_add_tail(&cursor->d_child, p);
+ spin_unlock(&dcache_lock);
+ }
+ }
+ up(&file->f_dentry->d_inode->i_sem);
+ return offset;
+}
+
+/* Relationship between i_mode and the DT_xxx types */
+static inline unsigned char dt_type(struct inode *inode)
+{
+ return (inode->i_mode >> 12) & 15;
+}
+
+/*
+ * Directory is locked and all positive dentries in it are safe, since
+ * for ramfs-type trees they can't go away without unlink() or rmdir(),
+ * both impossible due to the lock on directory.
+ */
+
+int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
+{
+ struct dentry *dentry = filp->f_dentry;
+ struct dentry *cursor = filp->private_data;
+ struct list_head *p, *q = &cursor->d_child;
+ ino_t ino;
+ int i = filp->f_pos;
+
+ switch (i) {
+ case 0:
+ ino = dentry->d_inode->i_ino;
+ if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
+ break;
+ filp->f_pos++;
+ i++;
+ /* fallthrough */
+ case 1:
+ ino = dentry->d_parent->d_inode->i_ino;
+ if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
+ break;
+ filp->f_pos++;
+ i++;
+ /* fallthrough */
+ default:
+ spin_lock(&dcache_lock);
+ if (filp->f_pos == 2) {
+ list_del(q);
+ list_add(q, &dentry->d_subdirs);
+ }
+ for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
+ struct dentry *next;
+ next = list_entry(p, struct dentry, d_child);
+ if (d_unhashed(next) || !next->d_inode)
+ continue;
+
+ spin_unlock(&dcache_lock);
+ if (filldir(dirent, next->d_name.name, next->d_name.len, filp->f_pos, next->d_inode->i_ino, dt_type(next->d_inode)) < 0)
+ return 0;
+ spin_lock(&dcache_lock);
+ /* next is still alive */
+ list_del(q);
+ list_add(q, p);
+ p = q;
+ filp->f_pos++;
+ }
+ spin_unlock(&dcache_lock);
+ }
+ return 0;
+}
+
+ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos)
+{
+ return -EISDIR;
+}
+
+struct file_operations simple_dir_operations = {
+ .open = dcache_dir_open,
+ .release = dcache_dir_close,
+ .llseek = dcache_dir_lseek,
+ .read = generic_read_dir,
+ .readdir = dcache_readdir,
+};
+
+struct inode_operations simple_dir_inode_operations = {
+ .lookup = simple_lookup,
+};
+
+#if 0
+/*
+ * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
+ * will never be mountable)
+ */
+struct super_block *
+get_sb_pseudo(struct file_system_type *fs_type, char *name,
+ struct super_operations *ops, unsigned long magic)
+{
+ struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
+ static struct super_operations default_ops = {.statfs = simple_statfs};
+ struct dentry *dentry;
+ struct inode *root;
+ struct qstr d_name = {.name = name, .len = strlen(name)};
+
+ if (IS_ERR(s))
+ return s;
+
+ s->s_flags = MS_NOUSER;
+ s->s_maxbytes = ~0ULL;
+ s->s_blocksize = 1024;
+ s->s_blocksize_bits = 10;
+ s->s_magic = magic;
+ s->s_op = ops ? ops : &default_ops;
+ root = new_inode(s);
+ if (!root)
+ goto Enomem;
+ root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
+ root->i_uid = root->i_gid = 0;
+ root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
+ dentry = d_alloc(NULL, &d_name);
+ if (!dentry) {
+ iput(root);
+ goto Enomem;
+ }
+ dentry->d_sb = s;
+ dentry->d_parent = dentry;
+ d_instantiate(dentry, root);
+ s->s_root = dentry;
+ s->s_flags |= MS_ACTIVE;
+ return s;
+
+Enomem:
+ up_write(&s->s_umount);
+ deactivate_super(s);
+ return ERR_PTR(-ENOMEM);
+}
+#endif
+
+int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = old_dentry->d_inode;
+
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ inode->i_nlink++;
+ atomic_inc(&inode->i_count);
+ dget(dentry);
+ d_instantiate(dentry, inode);
+ return 0;
+}
+
+static inline int simple_positive(struct dentry *dentry)
+{
+ return dentry->d_inode && !d_unhashed(dentry);
+}
+
+int simple_empty(struct dentry *dentry)
+{
+ struct dentry *child;
+ int ret = 0;
+
+ spin_lock(&dcache_lock);
+ list_for_each_entry(child, &dentry->d_subdirs, d_child)
+ if (simple_positive(child))
+ goto out;
+ ret = 1;
+out:
+ spin_unlock(&dcache_lock);
+ return ret;
+}
+
+int simple_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ inode->i_nlink--;
+ dput(dentry);
+ return 0;
+}
+
+int simple_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ if (!simple_empty(dentry))
+ return -ENOTEMPTY;
+
+ dentry->d_inode->i_nlink--;
+ simple_unlink(dir, dentry);
+ dir->i_nlink--;
+ return 0;
+}
+
+int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct inode *inode = old_dentry->d_inode;
+ int they_are_dirs = S_ISDIR(old_dentry->d_inode->i_mode);
+
+ if (!simple_empty(new_dentry))
+ return -ENOTEMPTY;
+
+ if (new_dentry->d_inode) {
+ simple_unlink(new_dir, new_dentry);
+ if (they_are_dirs)
+ old_dir->i_nlink--;
+ } else if (they_are_dirs) {
+ old_dir->i_nlink--;
+ new_dir->i_nlink++;
+ }
+
+ old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime =
+ new_dir->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+ return 0;
+}
+
+#if 0
+int simple_readpage(struct file *file, struct page *page)
+{
+ void *kaddr;
+
+ if (PageUptodate(page))
+ goto out;
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr, 0, PAGE_CACHE_SIZE);
+ kunmap_atomic(kaddr, KM_USER0);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+out:
+ unlock_page(page);
+ return 0;
+}
+
+int simple_prepare_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ if (!PageUptodate(page)) {
+ if (to - from != PAGE_CACHE_SIZE) {
+ void *kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr, 0, from);
+ memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+ SetPageUptodate(page);
+ }
+ return 0;
+}
+
+int simple_commit_write(struct file *file, struct page *page,
+ unsigned offset, unsigned to)
+{
+ struct inode *inode = page->mapping->host;
+ loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+ /*
+ * No need to use i_size_read() here, the i_size
+ * cannot change under us because we hold the i_sem.
+ */
+ if (pos > inode->i_size)
+ i_size_write(inode, pos);
+ set_page_dirty(page);
+ return 0;
+}
+#endif
+
+void d_genocide(struct dentry *root);
+
+void d_genocide(struct dentry *root)
+{
+ struct dentry *this_parent = root;
+ struct list_head *next;
+ spin_lock(&dcache_lock);
+repeat:
+ next = this_parent->d_subdirs.next;
+resume:
+ while (next != &this_parent->d_subdirs) {
+ struct list_head *tmp = next;
+ struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
+ next = tmp->next;
+ if (d_unhashed(dentry)||!dentry->d_inode)
+ continue;
+ if (!list_empty(&dentry->d_subdirs)) {
+ this_parent = dentry;
+ goto repeat;
+ }
+ atomic_dec(&dentry->d_count);
+ }
+ if (this_parent != root) {
+ next = this_parent->d_child.next;
+ atomic_dec(&this_parent->d_count);
+ this_parent = this_parent->d_parent;
+ goto resume;
+ }
+ spin_unlock(&dcache_lock);
+}
+
+static void simple_read_inode(struct inode * inode)
+{
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+}
+
+
+int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files)
+{
+ static struct super_operations s_ops = {
+ .statfs = simple_statfs,
+ .read_inode = simple_read_inode
+ };
+ struct inode *inode;
+ struct dentry *root;
+ struct dentry *dentry;
+ int i;
+
+ s->s_blocksize = PAGE_CACHE_SIZE;
+ s->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ s->s_magic = magic;
+ s->s_op = &s_ops;
+
+ inode = new_inode(s);
+ if (!inode)
+ return -ENOMEM;
+ inode->i_mode = S_IFDIR | 0755;
+ inode->i_uid = inode->i_gid = 0;
+ inode->i_blksize = PAGE_CACHE_SIZE;
+ inode->i_blocks = 0;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_op = &simple_dir_inode_operations;
+ inode->i_fop = &simple_dir_operations;
+ root = d_alloc_root(inode);
+ if (!root) {
+ iput(inode);
+ return -ENOMEM;
+ }
+ for (i = 0; !files->name || files->name[0]; i++, files++) {
+ struct qstr name;
+ if (!files->name)
+ continue;
+ name.name = files->name;
+ name.len = strlen(name.name);
+ printk("adding file %*s\n", name.len, name.name);
+ name.hash = full_name_hash(name.name, name.len);
+ dentry = d_alloc(root, &name);
+ if (!dentry)
+ goto out;
+ inode = new_inode(s);
+ if (!inode)
+ goto out;
+ inode->i_mode = S_IFREG | files->mode;
+ inode->i_uid = inode->i_gid = 0;
+ inode->i_blksize = PAGE_CACHE_SIZE;
+ inode->i_blocks = 0;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_fop = files->ops;
+ inode->i_ino = i;
+ d_add(dentry, inode);
+ }
+ s->s_root = root;
+ return 0;
+out:
+ d_genocide(root);
+ dput(root);
+ return -ENOMEM;
+}
+
+#if 0
+static spinlock_t pin_fs_lock = SPIN_LOCK_UNLOCKED;
+
+int simple_pin_fs(char *name, struct vfsmount **mount, int *count)
+{
+ struct vfsmount *mnt = NULL;
+ spin_lock(&pin_fs_lock);
+ if (unlikely(!*mount)) {
+ spin_unlock(&pin_fs_lock);
+ mnt = do_kern_mount(name, 0, name, NULL);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+ spin_lock(&pin_fs_lock);
+ if (!*mount)
+ *mount = mnt;
+ }
+ mntget(*mount);
+ ++*count;
+ spin_unlock(&pin_fs_lock);
+ mntput(mnt);
+ return 0;
+}
+
+void simple_release_fs(struct vfsmount **mount, int *count)
+{
+ struct vfsmount *mnt;
+ spin_lock(&pin_fs_lock);
+ mnt = *mount;
+ if (!--*count)
+ *mount = NULL;
+ spin_unlock(&pin_fs_lock);
+ mntput(mnt);
+}
+
+ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
+ const void *from, size_t available)
+{
+ loff_t pos = *ppos;
+ if (pos < 0)
+ return -EINVAL;
+ if (pos >= available)
+ return 0;
+ if (count > available - pos)
+ count = available - pos;
+ if (copy_to_user(to, from + pos, count))
+ return -EFAULT;
+ *ppos = pos + count;
+ return count;
+}
+
+EXPORT_SYMBOL(dcache_dir_close);
+EXPORT_SYMBOL(dcache_dir_lseek);
+EXPORT_SYMBOL(dcache_dir_open);
+EXPORT_SYMBOL(dcache_readdir);
+EXPORT_SYMBOL(generic_read_dir);
+EXPORT_SYMBOL(simple_commit_write);
+EXPORT_SYMBOL(simple_empty);
+EXPORT_SYMBOL(simple_fill_super);
+EXPORT_SYMBOL(simple_getattr);
+EXPORT_SYMBOL(simple_link);
+EXPORT_SYMBOL(simple_lookup);
+EXPORT_SYMBOL(simple_pin_fs);
+EXPORT_SYMBOL(simple_prepare_write);
+EXPORT_SYMBOL(simple_readpage);
+EXPORT_SYMBOL(simple_release_fs);
+EXPORT_SYMBOL(simple_rename);
+EXPORT_SYMBOL(simple_rmdir);
+EXPORT_SYMBOL(simple_statfs);
+EXPORT_SYMBOL(simple_sync_file);
+EXPORT_SYMBOL(simple_unlink);
+EXPORT_SYMBOL(simple_read_from_buffer);
+EXPORT_SYMBOL(get_sb_pseudo);
+#endif
+
+/* -----------------------------------------------------------------*/
+
+
+
+/* transaction file support */
+
+/*
+ * transaction based IO methods.
+ * The file expects a single write which triggers the transaction, and then
+ * possibly a read which collects the result - which is stored in a
+ * file-local buffer.
+ */
+static ssize_t TA_write(struct file *file, const char __user *buf, size_t size, loff_t *pos)
+{
+ ino_t ino = file->f_dentry->d_inode->i_ino;
+ struct argresp *ar;
+ ssize_t rv = 0;
+ struct super_block *sb = file->f_dentry->d_inode->i_sb;
+ TA_write_ops *ops = TA_GENERIC_SB_MEMBER(sb);
+ TA_write_op *write_op;
+
+ printk("welcome to TA_write: num_ops=%d, op[%d]=%p, private=%p, size=%u\n",
+ ops->num_ops, (int)ino, ops->write_op[ino], file->private_data, size);
+ if (ino >= ops->num_ops || ops->write_op[ino] == NULL)
+ return -EINVAL;
+ write_op = ops->write_op[ino];
+ if (file->private_data)
+ return -EINVAL; /* only one write allowed per open */
+ if (size > PAGE_SIZE - sizeof(struct argresp))
+ return -EFBIG;
+
+ ar = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!ar)
+ return -ENOMEM;
+ ar->size = 0;
+ down(&file->f_dentry->d_inode->i_sem);
+ if (file->private_data)
+ rv = -EINVAL;
+ else
+ file->private_data = ar;
+ up(&file->f_dentry->d_inode->i_sem);
+ if (rv) {
+ kfree(ar);
+ return rv;
+ }
+ if (copy_from_user(ar->data, buf, size))
+ return -EFAULT;
+
+ printk("now calling write_op...\n");
+ rv = write_op(file, ar->data, size);
+ printk("write_op returned %d\n", rv);
+ if (rv>0) {
+ ar->size = rv;
+ rv = size;
+ }
+ return rv;
+}
+
+
+static ssize_t TA_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
+{
+ struct argresp *ar;
+ ssize_t rv = 0;
+
+ if (file->private_data == NULL)
+ rv = TA_write(file, buf, 0, pos);
+ if (rv < 0)
+ return rv;
+
+ ar = file->private_data;
+ if (!ar)
+ return 0;
+ if (*pos >= ar->size)
+ return 0;
+ if (*pos + size > ar->size)
+ size = ar->size - *pos;
+ if (copy_to_user(buf, ar->data + *pos, size))
+ return -EFAULT;
+ *pos += size;
+ return size;
+}
+
+static int TA_open(struct inode *inode, struct file *file)
+{
+ file->private_data = NULL;
+ return 0;
+}
+
+static int TA_release(struct inode *inode, struct file *file)
+{
+ void *p = file->private_data;
+ file->private_data = NULL;
+ kfree(p);
+ return 0;
+}
+
+
+
+
+
+
+
+
+struct file_operations transaction_ops = {
+ .write = TA_write,
+ .read = TA_read,
+ .open = TA_open,
+ .release = TA_release,
+};
Added: trunk/cluster/compat_libfs.h
===================================================================
--- trunk/cluster/compat_libfs.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/compat_libfs.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,36 @@
+#ifndef CLUSTER_COMPAT_LIBFS_H
+#define CLUSTER_COMPAT_LIBFS_H
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#define TA_GENERIC_SB_MEMBER(sb) ((sb)->s_fs_info)
+#else
+#define TA_GENERIC_SB_MEMBER(sb) ((sb)->u.generic_sbp)
+#endif
+
+
+/* an argresp is stored in an allocated page and holds the
+ * size of the argument or response, along with its content
+ */
+struct argresp {
+ ssize_t size;
+ char data[0];
+};
+
+typedef ssize_t (TA_write_op)(struct file *, char *, size_t);
+typedef struct _TA_write_ops
+{
+ int num_ops;
+ TA_write_op *write_op[0];
+} TA_write_ops;
+
+struct tree_descr
+{
+ char *name;
+ struct file_operations *ops;
+ int mode;
+};
+
+int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files);
+struct dentry * simple_find_child(struct dentry *dentry, struct qstr *name);
+
+#endif /* CLUSTER_COMPAT_LIBFS_H */
Added: trunk/cluster/dlm_compat.h
===================================================================
--- trunk/cluster/dlm_compat.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/dlm_compat.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,119 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlm_compat.h
+ *
+ * Compatibility stuff for 2.4
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version
+ * 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_DLM_COMPAT_H
+#define CLUSTER_DLM_COMPAT_H
+
+#include <linux/version.h>
+#include <linux/types.h>
+#include <linux/kdev_t.h>
+#include <linux/sched.h>
+#include <linux/compiler.h>
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+# include <linux/locks.h>
+#else
+# include <linux/buffer_head.h>
+#endif
+
+
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+
+#ifdef __ia64__
+extern inline void prefetch(const void *x);
+extern inline void prefetchw(const void *x);
+#else
+static inline void prefetch(const void *x);
+static inline void prefetchw(const void *x);
+#endif
+extern inline int generic_fls(int x);
+extern inline int get_bitmask_order(unsigned int count);
+/* XXX Hack to avoid warning */
+struct mem_dqinfo;
+extern inline void mark_info_dirty(struct mem_dqinfo *info);
+
+
+
+
+#define flush_scheduled_work flush_scheduled_tasks
+#define work_struct tq_struct
+#define INIT_WORK(w, f, d) INIT_TQUEUE(w, f, d)
+#define schedule_work(w) schedule_task(w)
+
+#ifdef HAVE_NPTL
+static inline void dequeue_signal_lock(struct task_struct *task,
+ sigset_t *blocked, siginfo_t *info)
+{
+ spin_lock_irq(&task->sighand->siglock);
+ dequeue_signal(blocked, info);
+ spin_unlock_irq(&task->sighand->siglock);
+}
+#else
+static inline void dequeue_signal_lock(struct task_struct *task,
+ sigset_t *blocked, siginfo_t *info)
+{
+ spin_lock_irq(&task->sigmask_lock);
+ dequeue_signal(blocked, info);
+ spin_unlock_irq(&task->sigmask_lock);
+}
+#endif
+#define kstatfs statfs
+
+
+
+/*
+ * Copied right out of the 2.6.2 kernel's buffer_head.h:
+ * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
+ * and buffer_foo() functions.
+ */
+#define BUFFER_FNS(bit, name) \
+static inline void set_buffer_##name(struct buffer_head *bh) \
+{ \
+ set_bit(BH_##bit, &(bh)->b_state); \
+} \
+static inline void clear_buffer_##name(struct buffer_head *bh) \
+{ \
+ clear_bit(BH_##bit, &(bh)->b_state); \
+} \
+static inline int buffer_##name(struct buffer_head *bh) \
+{ \
+ return test_bit(BH_##bit, &(bh)->b_state); \
+}
+
+#undef buffer_uptodate
+#undef buffer_dirty
+BUFFER_FNS(Uptodate, uptodate)
+BUFFER_FNS(Dirty, dirty)
+
+#define clear_buffer_dirty mark_buffer_clean
+
+#endif /* LINUX_VERSION_CODE < 2.6 */
+
+
+#endif /* CLUSTER_DLM_COMPAT_H */
+
Added: trunk/cluster/dlmcommon.h
===================================================================
--- trunk/cluster/dlmcommon.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/dlmcommon.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,52 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmcommon.h
+ *
+ * Common stuff
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_DLMCOMMON_H
+#define CLUSTER_DLMCOMMON_H
+
+#define DLM_ASSERT(x) ({ if (!(x)) { printk("assert failed! %s:%d\n", __FILE__, __LINE__); BUG(); } })
+
+typedef struct _nm_ctxt nm_ctxt;
+typedef struct _dlm_ctxt dlm_ctxt;
+typedef struct _heartbeat_ctxt heartbeat_ctxt;
+
+#define CLUSTER_DISK_UUID_LEN 32 // 16 byte binary == 32 char hex string
+
+typedef struct _cluster_disk
+{
+ // uuid of disk
+ char uuid[CLUSTER_DISK_UUID_LEN+1];
+ // all the rest are for heartbeat
+ kdev_t dev;
+ u32 blocksize_bits;
+ u32 num_blocks;
+ u64 start_block;
+ util_rarray slots;
+} cluster_disk;
+
+
+#endif /* CLUSTER_DLMCOMMON_H */
Added: trunk/cluster/dlmmaster.c
===================================================================
--- trunk/cluster/dlmmaster.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/dlmmaster.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,967 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmmaster.c
+ *
+ * standalone DLM module
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+
+
+#include "heartbeat.h"
+#include "nodemanager.h"
+#include "tcp.h"
+#include "dlmmod.h"
+
+
+
+/* Protects dlm_master_list; also the lock taken by dlm_put_mle() via
+ * atomic_dec_and_lock() when an MLE refcount drops to zero. */
+spinlock_t dlm_master_lock = SPIN_LOCK_UNLOCKED;
+/* Global list of in-flight master lookups (one MLE per lock name),
+ * shared by all dlm contexts. */
+LIST_HEAD(dlm_master_list);
+
+
+static int dlm_init_mle(dlm_master_list_entry *mle, int type, dlm_ctxt *dlm,
+			dlm_lock_resource *res, struct qstr *name, int locked);
+
+/* Initialize a pre-allocated master list entry (MLE).
+ *
+ * type is DLM_MLE_MASTER (entry tracks an existing lockres, `res`) or
+ * DLM_MLE_BLOCK (entry tracks a lock name only, `name`, while another
+ * node masters it).  `locked` nonzero means the caller already holds
+ * dlm->spinlock.  Snapshots the dlm node map into the MLE's node/vote
+ * maps (minus this node) and registers per-MLE heartbeat callbacks.
+ * Returns 0 on success, -EINVAL if callback registration fails. */
+static int dlm_init_mle(dlm_master_list_entry *mle, int type, dlm_ctxt *dlm,
+			dlm_lock_resource *res, struct qstr *name, int locked)
+{
+	int ret = 0;
+
+	mle->dlm = dlm;
+	mle->type = type;
+	INIT_LIST_HEAD(&mle->list);
+	memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
+	spin_lock_init(&mle->spinlock);
+	init_waitqueue_head(&mle->wq);
+	atomic_set(&mle->woken, 0);
+	atomic_set(&mle->refcnt, 1);
+	memset(mle->response_map, 0, sizeof(mle->response_map));
+	mle->master = NM_MAX_NODES;
+	mle->error = 0;
+
+	if (mle->type == DLM_MLE_MASTER) {
+		mle->u.res = res;
+	} else {
+		/* FIX: u.name.len was never set here, but readers such as
+		 * dlm_do_master_request() and dlm_do_assert_master() use it
+		 * to bound their copies of the name. */
+		mle->u.name.len = name->len;
+		strncpy(mle->u.name.name, name->name, name->len);
+	}
+
+	if (!locked)
+		spin_lock(&dlm->spinlock);
+
+	/* copy off the node_map and register hb callbacks on our copy */
+	memcpy(mle->node_map, dlm->node_map, sizeof(mle->node_map));
+	memcpy(mle->vote_map, dlm->node_map, sizeof(mle->vote_map));
+	clear_bit(dlm->group_index, mle->vote_map);
+	clear_bit(dlm->group_index, mle->node_map);
+
+#warning cannot do this here cuz this kmallocs and we are under a spinlock dammit
+	if (hb_register_callback(HB_NODE_DOWN_CB, dlm_mle_node_down, mle, DLM_HB_NODE_DOWN_PRI+1) ||
+	    hb_register_callback(HB_NODE_UP_CB, dlm_mle_node_up, mle, DLM_HB_NODE_UP_PRI+1)) {
+		ret = -EINVAL;
+	}
+
+	if (!locked)
+		spin_unlock(&dlm->spinlock);
+
+	return ret;
+}
+
+
+
+
+/////////////////////////////////////////////////
+//
+// TODO: change these comments to reflect reality
+//
+// master_request(target=me)
+// wait for all responses
+// if maybe_map is 0 there are no others in progress
+// assert_master(me)
+// else (maybe_map has some nodes in it)
+// (nodes in maybe_map had better be < my node num)
+// wait for assert_master
+// endif
+//
+//
+// receive:
+// master_request(target):
+// if i own it, return YES
+// if i dont know anything about it, return NO
+// if i have it in progress
+// if my node number is lower
+// return MAYBE
+// else
+// if target < lowest_so_far, lowest_so_far=target
+// return NO
+//
+// assert_master(master):
+// if i own it, BUG()!!!
+// if i have it, but owner!=master, BUG()!!!
+// if i dont know anything about it, ignore
+// if i have it in progress
+// if lowest_so_far != master
+// BUG()!!!
+// else
+// set the owner, DONE
+//
+/////////////////////////////////////////////////
+
+
+/* Drop a reference on an MLE.  On the final put, remove it from
+ * dlm_master_list, unregister its heartbeat callbacks, and free it.
+ * atomic_dec_and_lock() only acquires dlm_master_lock when the count
+ * actually reaches zero, so the drop and the list removal are atomic
+ * with respect to concurrent list walkers. */
+void dlm_put_mle(dlm_master_list_entry *mle)
+{
+	if (atomic_dec_and_lock(&mle->refcnt, &dlm_master_lock)) {
+		list_del(&mle->list);
+		spin_unlock(&dlm_master_lock);
+		hb_unregister_callback(HB_NODE_DOWN_CB, dlm_mle_node_down, mle);
+		hb_unregister_callback(HB_NODE_UP_CB, dlm_mle_node_up, mle);
+		kfree(mle);
+	}
+}
+
+
+
+
+/*
+ * lookup a lock resource by name.
+ * may already exist in the hashtable.
+ *
+ * if not, allocate enough for the lockres and for
+ * the temporary structure used in doing the mastering.
+ *
+ * also, do a lookup in the dlm_master_list to see
+ * if another node has begun mastering the same lock.
+ * if so, there should be a block entry in there
+ * for this name, and we should *not* attempt to master
+ * the lock here. need to wait around for that node
+ * to assert_master (or die).
+ *
+ */
+/* Look up -- or create and master -- the lock resource for `lockname`.
+ * See the protocol sketch in the comment block above.  On return the
+ * lockres has a known owner; all paths except the early LKM_LOCAL and
+ * pre-existing-lockres returns exit holding res->spinlock with
+ * DLM_LOCK_RES_IN_PROGRESS cleared.  Returns NULL only on allocation
+ * failure. */
+dlm_lock_resource * dlm_get_lock_resource(dlm_ctxt *dlm, struct qstr *lockname, int flags)
+{
+	dlm_lock_resource *tmpres=NULL, *res=NULL;
+	struct list_head *bucket;
+	dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
+	struct list_head *iter;
+	int blocked = 0;
+	int map_changed = 0, restart = 0, assert = 0;
+	int ret, start, bit;
+
+	bucket = &(dlm->resources[lockname->hash & DLM_HASH_MASK]);
+
+	/* pre-allocate a dlm_lock_resource and master stuff */
+	mle = kmalloc(sizeof(dlm_master_list_entry), GFP_KERNEL);
+	res = kmalloc(sizeof(dlm_lock_resource), GFP_KERNEL);
+	if (!mle || !res) {
+		printk("could not allocate memory for new lock resource!\n");
+		if (mle)
+			kfree(mle);
+		if (res)
+			kfree(res);
+		return NULL;
+	}
+
+	/* check for pre-existing lock */
+	spin_lock(&dlm->spinlock);
+	tmpres = __dlm_lookup_lock(dlm, lockname);
+	if (tmpres) {
+		spin_unlock(&dlm->spinlock);
+		/* TODO: return error, or return the lockres ?!? */
+		kfree(res);
+		kfree(mle);
+		/* waits for any outstanding work to finish
+		 * will hold tmpres->spinlock on exit */
+		dlm_wait_on_lockres(tmpres);
+		return tmpres;
+	}
+
+	dlm_init_lockres(res, lockname);
+
+	if (flags & LKM_LOCAL) {
+		/* caller knows it's safe to assume it's not mastered elsewhere
+		 * DONE! return right away */
+		list_add_tail(&res->list, bucket);
+		res->owner = dlm->group_index;
+		res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+
+		/* FIX: the preallocated mle was leaked on this early-return
+		 * path.  It was never initialized (no hb callbacks), so a
+		 * plain kfree is correct. */
+		kfree(mle);
+
+		/* return with res->spinlock held */
+
+		/* lock ordering note: this lockres will not be
+		 * visible until i release dlm->spinlock, so it
+		 * is ok to release dlm->spinlock out of order here */
+		spin_lock(&res->spinlock);
+
+		spin_unlock(&dlm->spinlock);
+		return res;
+	}
+
+	/* look in master list to see if another node has started mastering this */
+	spin_lock(&dlm_master_lock);
+	list_for_each(iter, &dlm_master_list) {
+		tmpmle = list_entry(iter, dlm_master_list_entry, list);
+		if (!dlm_mle_equal(dlm, tmpmle, lockname))
+			continue;
+
+		if (tmpmle->type == DLM_MLE_MASTER) {
+			printk("impossible! master entry for nonexistent lock!\n");
+			BUG();
+		}
+		dlm_get_mle(tmpmle);
+		blocked = 1;
+		// found a block! must wait for lock to be mastered by another node
+		break;
+	}
+
+	if (!blocked) {
+		/* go ahead and try to master lock on this node */
+		if (dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 1)) {
+			printk("bug! failed to register hb callbacks\n");
+			BUG();
+		}
+		list_add(&mle->list, &dlm_master_list);
+	}
+	spin_unlock(&dlm_master_lock);
+
+	/* at this point there is either a DLM_MLE_BLOCK or a DLM_MLE_MASTER
+	 * on the master list, so it's safe to add the lockres to the hashtable.
+	 * anyone who finds the lock will still have to wait on the IN_PROGRESS.
+	 * also, any new nodes that try to join at this point will have to wait
+	 * until my dlm_master_lock list is empty, so they cannot possibly
+	 * do any master requests yet... TODO
+	 * ?? should i have a special type of mle just for joining nodes ??
+	 * ?? could allow them to come in and put their mle on the list and sleep ?? */
+
+	/* finally add the lockres to its hash bucket */
+	list_add_tail(&res->list, bucket);
+	spin_unlock(&dlm->spinlock);
+
+	if (blocked) {
+		/* must wait for lock to be mastered elsewhere.
+		 * the preallocated mle was never initialized (no hb
+		 * callbacks registered), so kfree is the right release. */
+		kfree(mle);
+		mle = tmpmle;
+		goto wait;
+	}
+
+	/* poll every other up node; answers arrive either synchronously
+	 * (dlm_do_master_request folds them in) or via the resp handler */
+	ret = -EINVAL;
+	start = 0;
+	while (1) {
+		bit = find_next_bit (mle->vote_map, NM_MAX_NODES, start);
+		if (bit >= NM_MAX_NODES) {
+			printk("no more nodes\n");
+			break;
+		}
+
+		ret = dlm_do_master_request(mle, bit);
+		if (ret < 0) {
+			// TODO
+			//printk("dlm_do_master_request returned %d!\n", ret);
+		}
+		if (mle->master != NM_MAX_NODES) {
+			// found a master!
+			break;
+		}
+		start = bit+1;
+	}
+
+wait:
+	/* loop until a master is known: either someone asserts (handler
+	 * sets mle->master / res->owner) or we win and assert ourselves */
+	while (1) {
+		spin_lock(&res->spinlock);
+		if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			// another node has become the owner
+			spin_unlock(&res->spinlock);
+			break;
+		}
+		spin_unlock(&res->spinlock);
+
+		spin_lock(&mle->spinlock);
+		if (mle->master != NM_MAX_NODES) {
+			u16 m = mle->master;
+			// printk("node %u is the master!\n", m);
+			spin_unlock(&mle->spinlock);
+
+			spin_lock(&res->spinlock);
+			res->owner = m;
+			spin_unlock(&res->spinlock);
+			break;
+		}
+		restart = 0;
+		/* map_changed: cluster membership shifted since we took our
+		 * vote_map snapshot; first memcmp: everyone has responded */
+		map_changed = (memcmp(mle->vote_map, mle->node_map, sizeof(mle->vote_map)) != 0);
+		if (memcmp(mle->vote_map, mle->response_map, sizeof(mle->vote_map)) == 0) {
+			// printk("every node has responded...\n");
+			if (map_changed) {
+				printk("eek! got all original nodes, but nodemap changed while collecting responses\n");
+				restart = 1;
+			}
+
+			if (mle->error) {
+				printk("ugh. some node hit an error (-ENOMEM). try the whole thing again\n");
+				mle->error = 0;
+				/* TODO: treat this just like the dead node case below,
+				 * cleanup and start over, but keep the error node around */
+				restart = 1;
+			}
+
+			if ((bit = find_next_bit (mle->maybe_map, NM_MAX_NODES, 0)) >= NM_MAX_NODES) {
+				/* no other nodes are in-progress */
+				/* those nodes should all be locking out this lockid until I assert */
+				/* they should have put a dummy entry on dlm_master_list */
+				/* need to assert myself as the master */
+
+				// printk("I am the only node in-progress! asserting myself as master\n");
+				assert = 1;
+			} else {
+				/* other nodes are in-progress */
+				if (map_changed && !test_bit(bit, mle->node_map)) {
+					/* TODO: need to copy the node_map into the vote_map, zero
+					 * everything out and start over */
+					printk("need to handle this case! winning node %u just died!\n", bit);
+					restart = 1;
+				}
+
+				if (bit > dlm->group_index) {
+					// printk("next in-progress node (%u) is higher than me (%u)\n",
+					//        bit, dlm->group_index);
+
+					/* nodes not in-progress should be locking out this lockid until I assert */
+					/* in-progress nodes should match me up with their lowest maybe_map bit */
+					/* need to assert myself as the master */
+
+					// printk("I am the lowest node! asserting myself as master\n");
+					assert = 1;
+				} else {
+					/* need to sit around and wait for assert */
+					/* my lowest maybe_map bit should be the one to assert */
+					/* just fall through and sleep. should be woken by the handler */
+
+					// printk("sleeping while waiting for %u to assert himself as master\n", bit);
+				}
+			}
+		} else {
+			if (map_changed) {
+				/* TODO: need to handle this */
+				printk("eek! nodemap changed while collecting responses\n");
+				restart = 1;
+			}
+			// printk("still waiting for all nodes to respond...\n");
+		}
+
+		/* a restart condition trumps any decision to assert */
+		if (restart && assert)
+			assert = 0;
+
+		/* make sure to tell any other nodes that i am mastering this */
+		if (assert)
+			mle->master = dlm->group_index;
+
+		spin_unlock(&mle->spinlock);
+
+		if (assert) {
+			ret = dlm_do_assert_master(mle);
+			// printk("assert returned %d!\n", ret);
+			if (ret == 0) {
+				spin_lock(&res->spinlock);
+				res->owner = dlm->group_index;
+				spin_unlock(&res->spinlock);
+				// printk("wooo! i am the owner. phew!\n");
+				break;
+			} else
+				restart = 1;
+		}
+		if (restart) {
+			printk("something happened such that the master process needs to be restarted!\n");
+			/* TODO: clear it all out and start over */
+		}
+
+		/* sleep up to 5s, or until a handler wakes us via mle->wq */
+		atomic_set(&mle->woken, 0);
+		ret = util_wait_atomic_eq(&mle->wq, &mle->woken, 1, 5000);
+	}
+	/* drop our (possibly final) reference on the mle */
+	dlm_put_mle(mle);
+
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	wake_up(&res->wq);
+
+	/* exits holding res->spinlock */
+	return res;
+}
+
+
+
+
+/*
+ * locks that can be taken here:
+ * dlm->spinlock
+ * res->spinlock
+ * mle->spinlock
+ * dlm_master_list
+ *
+ * if possible, TRIM THIS DOWN!!!
+ */
+/* Network handler for an incoming DLM_MASTER_REQUEST_MSG: node
+ * request->node_idx asks who masters the named lock.  Returns one of
+ * the DLM_MASTER_RESP_* codes as the message response:
+ *   YES   - this node is the master
+ *   NO    - another known node masters it, or this node is merely blocked
+ *   MAYBE - this node is itself in the middle of mastering it
+ *   ERROR - local allocation failure; sender should retry
+ * May allocate and enqueue a DLM_MLE_BLOCK entry so this node blocks
+ * on the lockid until the eventual winner asserts mastery. */
+int dlm_master_request_handler(net_msg *msg, u32 len, void *data)
+{
+	u8 response = DLM_MASTER_RESP_MAYBE;
+	dlm_ctxt *dlm = data;
+	dlm_lock_resource *res;
+	dlm_master_request *request = (dlm_master_request *) msg->buf;
+	dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
+	struct qstr lockname = { .name=request->name, .len=request->namelen };
+	int found;
+	struct list_head *iter;
+
+	lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+way_up_top:
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lock(dlm, &lockname);
+	if (res) {
+		spin_unlock(&dlm->spinlock);
+
+		/* take care of the easy cases up front */
+		spin_lock(&res->spinlock);
+		if (res->owner == dlm->group_index) {
+			spin_unlock(&res->spinlock);
+			// printk("this node is the master\n");
+			response = DLM_MASTER_RESP_YES;
+			/* FIX: mle is non-NULL only on a second pass, after
+			 * dlm_init_mle() registered heartbeat callbacks for
+			 * it.  A bare kfree() left those callbacks pointing
+			 * at freed memory; dlm_put_mle() unregisters first. */
+			if (mle)
+				dlm_put_mle(mle);
+			goto send_response;
+		} else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			spin_unlock(&res->spinlock);
+			// printk("node %u is the master\n", res->owner);
+			response = DLM_MASTER_RESP_NO;
+			if (mle)
+				dlm_put_mle(mle);	/* FIX: was kfree, see above */
+			goto send_response;
+		}
+
+		/* ok, there is no owner. either this node is
+		 * being blocked, or it is actively trying to
+		 * master this lock. */
+		if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
+			printk("bug! lock with no owner should be in-progress!\n");
+			BUG();
+		}
+
+		// printk("lockres is in progress...\n");
+		found = 0;
+		spin_lock(&dlm_master_lock);
+		list_for_each(iter, &dlm_master_list) {
+			tmpmle = list_entry(iter, dlm_master_list_entry, list);
+			if (!dlm_mle_equal(dlm, tmpmle, &lockname))
+				continue;
+
+			dlm_get_mle(tmpmle);
+			spin_lock(&tmpmle->spinlock);
+			if (tmpmle->type == DLM_MLE_BLOCK) {
+				// printk("this node is waiting for lockres to be mastered\n");
+				response = DLM_MASTER_RESP_NO;
+			} else {
+				// printk("this node is attempting to master lockres\n");
+				response = DLM_MASTER_RESP_MAYBE;
+			}
+			/* remember that the requester is in-progress on this lock */
+			set_bit(request->node_idx, tmpmle->maybe_map);
+			spin_unlock(&tmpmle->spinlock);
+
+			spin_unlock(&dlm_master_lock);
+			spin_unlock(&res->spinlock);
+
+			dlm_put_mle(tmpmle);
+			if (mle)
+				dlm_put_mle(mle);	/* FIX: was kfree, see above */
+			goto send_response;
+		}
+		spin_unlock(&dlm_master_lock);
+		spin_unlock(&res->spinlock);
+		printk("bug bug bug!!! no mle found for this lock!\n");
+		BUG();
+	}
+
+	/*
+	 * lockres doesn't exist on this node
+	 * if there is an MLE_BLOCK, return NO
+	 * if there is an MLE_MASTER, return MAYBE
+	 * otherwise, add an MLE_BLOCK, return NO
+	 */
+	found = 0;
+	spin_lock(&dlm_master_lock);
+	list_for_each(iter, &dlm_master_list) {
+		tmpmle = list_entry(iter, dlm_master_list_entry, list);
+		if (!dlm_mle_equal(dlm, tmpmle, &lockname))
+			continue;
+		dlm_get_mle(tmpmle);
+		found = 1;
+		break;
+	}
+
+	if (!found) {
+		/* this lockid has never been seen on this node yet */
+		// printk("no mle found\n");
+		if (!mle) {
+			/* drop both locks to allocate, then retry from the
+			 * top: the world may change while we sleep */
+			spin_unlock(&dlm_master_lock);
+			spin_unlock(&dlm->spinlock);
+
+			mle = kmalloc(sizeof(dlm_master_list_entry) + lockname.len, GFP_KERNEL);
+			if (!mle) {
+				// bad bad bad... this sucks.
+				response = DLM_MASTER_RESP_ERROR;
+				goto send_response;
+			}
+			if (dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, &lockname, 0)) {
+				printk("eeek!\n");
+				response = DLM_MASTER_RESP_ERROR;
+				dlm_put_mle(mle);
+				goto send_response;
+			}
+			goto way_up_top;
+		}
+
+		// printk("this is second time thru, already allocated, add the block.\n");
+		set_bit(request->node_idx, mle->maybe_map);
+		list_add(&mle->list, &dlm_master_list);
+		mle = NULL;	/* FIX: reference now owned by the list */
+		response = DLM_MASTER_RESP_NO;
+	} else {
+		// printk("mle was found\n");
+		spin_lock(&tmpmle->spinlock);
+		if (tmpmle->type == DLM_MLE_BLOCK)
+			response = DLM_MASTER_RESP_NO;
+		else
+			response = DLM_MASTER_RESP_MAYBE;
+		set_bit(request->node_idx, tmpmle->maybe_map);
+		spin_unlock(&tmpmle->spinlock);
+		dlm_put_mle(tmpmle);
+	}
+	spin_unlock(&dlm_master_lock);
+	spin_unlock(&dlm->spinlock);
+
+	/* FIX: if the second pass found an MLE that another thread added
+	 * while we were allocating ours, the allocated mle used to leak
+	 * (with its hb callbacks still registered).  Put it here, outside
+	 * dlm_master_lock, since the final put re-takes that lock. */
+	if (mle)
+		dlm_put_mle(mle);
+
+send_response:
+	//ret = dlm_do_master_request_resp(dlm, &lockname, response, request->node_idx);
+	//printk("response returned %d\n", ret);
+
+	// printk("sending response %d to other node\n", response);
+	return response;
+}
+
+/* NOTE: when doing node recovery, run the dlm_master_list looking for the dead node in
+ * any maybe_map... clear that bit, and if now empty, clear the whole thing */
+
+/*
+ * locks that can be taken here:
+ * mle->spinlock
+ * dlm_master_list
+ *
+ */
+/* Network handler for DLM_MASTER_REQUEST_RESP_MSG: one node's answer
+ * to an earlier master request from this node.  Folds the response
+ * into the matching in-progress MLE's response/maybe maps and wakes
+ * the waiter in dlm_get_lock_resource() once a master is known, every
+ * polled node has answered, or a remote -ENOMEM forces a restart. */
+int dlm_master_request_resp_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	dlm_master_list_entry *mle = NULL;
+	dlm_master_request_resp *resp = (dlm_master_request_resp *) msg->buf;
+	int found = 0, wake = 0;
+	struct list_head *iter;
+	struct qstr lockname = { .name=resp->name, .len=resp->namelen };
+
+
+	lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+	spin_lock(&dlm_master_lock);
+	list_for_each(iter, &dlm_master_list) {
+		mle = list_entry(iter, dlm_master_list_entry, list);
+		if (!dlm_mle_equal(dlm, mle, &lockname)) {
+			mle = NULL;
+			continue;
+		}
+
+		dlm_get_mle(mle);
+		/* FIX: mark found before the type check below; the old code
+		 * broke out of the loop on a DLM_MLE_BLOCK with found still
+		 * 0, leaking the reference just taken by dlm_get_mle(). */
+		found = 1;
+		if (mle->type == DLM_MLE_BLOCK) {
+			printk("eek! cannot get a response for a block!\n");
+			break;
+		}
+		wake = 0;
+		spin_lock(&mle->spinlock);
+		switch (resp->response) {
+			case DLM_MASTER_RESP_YES:
+				set_bit(resp->node_idx, mle->response_map);
+				// printk("woot! node %u is the master!\n", resp->node_idx);
+				mle->master = resp->node_idx;
+				wake = 1;
+				break;
+			case DLM_MASTER_RESP_NO:
+				// printk("node %u is not the master, not in-progress\n", resp->node_idx);
+				set_bit(resp->node_idx, mle->response_map);
+				if (memcmp(mle->response_map, mle->vote_map, sizeof(mle->vote_map))==0)
+					wake = 1;
+				break;
+			case DLM_MASTER_RESP_MAYBE:
+				// printk("node %u is not the master, but IS in-progress\n", resp->node_idx);
+				set_bit(resp->node_idx, mle->response_map);
+				set_bit(resp->node_idx, mle->maybe_map);
+				if (memcmp(mle->response_map, mle->vote_map, sizeof(mle->vote_map))==0)
+					wake = 1;
+				break;
+			case DLM_MASTER_RESP_ERROR:
+				printk("node %u hit an -ENOMEM! try this whole thing again\n", resp->node_idx);
+				mle->error = 1;
+				wake = 1;
+				break;
+			default:
+				printk("bad response! %u\n", resp->response);
+				break;
+		}
+		if (wake) {
+			/* wake the waiter sleeping in dlm_get_lock_resource() */
+			atomic_set(&mle->woken, 1);
+			wake_up(&mle->wq);
+		}
+		spin_unlock(&mle->spinlock);
+		break;
+	}
+	spin_unlock(&dlm_master_lock);
+
+	if (found)
+		dlm_put_mle(mle);
+	else
+		printk("hrrm... got a master resp but found no matching request\n");
+	return 0;
+}
+
+/*
+ * Network handler for an incoming assert_master: assert->node_idx
+ * claims it now masters the named lock.  Sanity-checks the claim
+ * against the local MLE (the asserter must be the lowest node set in
+ * our maybe_map) and against any local lockres owner, then records
+ * the master in the MLE and wakes the local waiter.
+ *
+ * locks that can be taken here:
+ * dlm->spinlock
+ * res->spinlock
+ * mle->spinlock
+ * dlm_master_list
+ *
+ * if possible, TRIM THIS DOWN!!!
+ */
+int dlm_assert_master_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	dlm_master_list_entry *mle = NULL;
+	dlm_assert_master *assert = (dlm_assert_master *)msg->buf;
+	dlm_lock_resource *res;
+	int bit;
+	struct list_head *iter;
+	struct qstr lockname = { .name=assert->name, .len=assert->namelen };
+
+	lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+	spin_lock(&dlm->spinlock);
+
+	/* find the MLE */
+	spin_lock(&dlm_master_lock);
+	list_for_each(iter, &dlm_master_list) {
+		mle = list_entry(iter, dlm_master_list_entry, list);
+		if (dlm_mle_equal(dlm, mle, &lockname)) {
+			dlm_get_mle(mle);
+			break;
+		}
+		mle = NULL;
+	}
+	if (!mle) {
+		/* NOTE(review): with no MLE (and, below, no lockres) the
+		 * assert is effectively ignored -- confirm that is intended */
+		printk("EEEEEEK! just got an assert_master from %u, but no MLE for it!\n",
+		       assert->node_idx);
+		spin_unlock(&dlm_master_lock);
+		goto check_lockres;
+	}
+	/* the asserting node must be the lowest node in our maybe_map */
+	if ((bit = find_next_bit (mle->maybe_map, NM_MAX_NODES, 0)) >= NM_MAX_NODES) {
+		printk("EEK! no bits set in the maybe_map, but %u is asserting!\n",
+		       assert->node_idx);
+		BUG();
+	} else if (bit != assert->node_idx) {
+		/* TODO: is this ok? */
+		printk("EEK! expected %u to be the master, but %u is asserting!\n",
+		       bit, assert->node_idx);
+		BUG();
+	}
+	spin_unlock(&dlm_master_lock);
+
+	/* ok everything checks out with the MLE
+	 * now check to see if there is a lockres */
+check_lockres:
+	res = __dlm_lookup_lock(dlm, &lockname);
+	if (res) {
+		spin_lock(&res->spinlock);
+		if (!mle) {
+			/* no MLE: the locally recorded owner must already
+			 * match the asserting node */
+			if (res->owner != assert->node_idx) {
+				printk("EEEEeeEEeeEEEK! assert_master from %u, but current owner is %u!\n",
+				       assert->node_idx, res->owner);
+				BUG();
+			}
+		} else {
+			/* MLE exists: the lockres must still be unowned and
+			 * marked in-progress */
+			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+				printk("EEEEEEEEEEEEEEEEEK!!! got assert_master from node %u, but %u is the owner!\n",
+				       assert->node_idx, res->owner);
+				printk("goodnite!\n");
+				BUG();
+			}
+			if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
+				printk("bug! got assert from %u, but lock with no owner should be in-progress!\n",
+				       assert->node_idx);
+				BUG();
+			}
+		}
+		spin_unlock(&res->spinlock);
+	}
+	spin_unlock(&dlm->spinlock);
+
+	// printk("woo! got an assert_master from node %u!\n", assert->node_idx);
+	if (mle) {
+		/* record the master and wake the waiter in
+		 * dlm_get_lock_resource() */
+		spin_lock(&mle->spinlock);
+		mle->master = assert->node_idx;
+		atomic_set(&mle->woken, 1);
+		wake_up(&mle->wq);
+		spin_unlock(&mle->spinlock);
+
+		/* if this is the last put, it will be removed from the list */
+		dlm_put_mle(mle);
+	}
+	return 0;
+}
+
+
+/* Send a DLM_MASTER_REQUEST_MSG for this MLE's lock name to node `to`
+ * and fold the synchronous status response into the MLE's maps (the
+ * same folding the async resp handler performs).  Returns the
+ * net_send_message() result, or -EINVAL if the target node's inode
+ * cannot be found or the response code is unrecognized. */
+int dlm_do_master_request(dlm_master_list_entry *mle, int to)
+{
+	struct inode *inode = NULL;
+	dlm_ctxt *dlm = mle->dlm;
+	dlm_master_request request;
+	int ret, response=0;
+
+	/* request is zeroed first, so the strncpy'd name stays NUL-padded */
+	memset(&request, 0, sizeof(request));
+	request.node_idx = dlm->group_index;
+	if (mle->type == DLM_MLE_BLOCK) {
+		request.namelen = mle->u.name.len;
+		strncpy(request.name, mle->u.name.name, request.namelen);
+	} else {
+		request.namelen = mle->u.res->lockname.len;
+		strncpy(request.name, mle->u.res->lockname.name, request.namelen);
+	}
+
+	ret = -EINVAL;
+	inode = nm_get_group_node_by_index(dlm->group, to);
+	if (inode) {
+		/* synchronous send; the remote handler's return value comes
+		 * back in `response` */
+		ret = net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, sizeof(request), inode, &response);
+		iput(inode);
+		if (ret >= 0) {
+			spin_lock(&mle->spinlock);
+			switch (response) {
+				case DLM_MASTER_RESP_YES:
+					set_bit(to, mle->response_map);
+					// printk("woot! node %u is the master!\n", to);
+					mle->master = to;
+					break;
+				case DLM_MASTER_RESP_NO:
+					// printk("node %u is not the master, not in-progress\n", to);
+					set_bit(to, mle->response_map);
+					break;
+				case DLM_MASTER_RESP_MAYBE:
+					// printk("node %u is not the master, but IS in-progress\n", to);
+					set_bit(to, mle->response_map);
+					set_bit(to, mle->maybe_map);
+					break;
+				case DLM_MASTER_RESP_ERROR:
+					printk("node %u hit an -ENOMEM! try this whole thing again\n", to);
+					mle->error = 1;
+					break;
+				default:
+					printk("bad response! %u\n", response);
+					ret = -EINVAL;
+					break;
+			}
+			spin_unlock(&mle->spinlock);
+		} else {
+			printk("net_send_message returned %d!\n", ret);
+		}
+	} else {
+		printk("nm_get_group_node_by_index failed to find inode for node %d!\n", to);
+	}
+	return ret;
+}
+
+/* Send our DLM_MASTER_RESP_* answer for `name` back to node `to`.
+ * NOTE(review): the only visible caller is commented out in
+ * dlm_master_request_handler (the status is returned inline instead);
+ * confirm whether this path is still needed. */
+int dlm_do_master_request_resp(dlm_ctxt *dlm, struct qstr *name, int response, int to)
+{
+	struct inode *inode = NULL;
+	dlm_master_request_resp resp;
+	int ret;
+
+	/* resp is zeroed, so the strncpy'd name stays NUL-padded */
+	memset(&resp, 0, sizeof(resp));
+	resp.node_idx = dlm->group_index;
+	resp.response = response;
+	resp.namelen = name->len;
+	strncpy(resp.name, name->name, name->len);
+
+	inode = nm_get_group_node_by_index(dlm->group, to);
+	if (!inode)
+		return -EINVAL;
+	ret = net_send_message(DLM_MASTER_REQUEST_RESP_MSG, dlm->key, &resp, sizeof(resp), inode, NULL);
+	iput(inode);
+	return ret;
+}
+
+/*
+ * Broadcast an assert_master for this MLE's lock to every node in its
+ * vote_map, telling them this node is now the master.  Stops and
+ * returns the first error; 0 if all sends succeed.
+ *
+ * NOTE: this can be used for debugging
+ * can periodically run all locks owned by this node
+ * and re-assert across the cluster...
+ */
+int dlm_do_assert_master(dlm_master_list_entry *mle)
+{
+	struct inode *inode = NULL;
+	dlm_ctxt *dlm = mle->dlm;
+	dlm_assert_master assert;
+	int to, start = 0, ret = 0, tmpret;
+
+	while (1) {
+		to = find_next_bit (mle->vote_map, NM_MAX_NODES, start);
+		if (to >= NM_MAX_NODES) {
+			// printk("no more nodes\n");
+			break;
+		}
+		// printk("sending assert master to %d\n", to);
+
+		memset(&assert, 0, sizeof(assert));
+		assert.node_idx = dlm->group_index;
+		if (mle->type == DLM_MLE_BLOCK) {
+			assert.namelen = mle->u.name.len;
+			strncpy(assert.name, mle->u.name.name, assert.namelen);
+		} else {
+			assert.namelen = mle->u.res->lockname.len;
+			strncpy(assert.name, mle->u.res->lockname.name, assert.namelen);
+		}
+
+		inode = nm_get_group_node_by_index(dlm->group, to);
+		if (!inode) {
+			tmpret = -EINVAL;
+			printk("could not get nm info for node %d! need to retry this whole thing\n", to);
+			ret = tmpret;
+			break;
+		}
+		/* FIX: this was DLM_MASTER_REQUEST_MSG, which dispatched the
+		 * assert payload to the remote dlm_master_request_handler
+		 * instead of dlm_assert_master_handler, so remote waiters
+		 * were never woken.  (Assumes DLM_ASSERT_MASTER_MSG is the
+		 * key registered for the assert handler -- see dlmmod.h.) */
+		tmpret = net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, &assert, sizeof(assert), inode, NULL);
+		iput(inode);
+
+		if (tmpret < 0) {
+			// TODO
+			// printk("assert_master returned %d!\n", tmpret);
+			ret = tmpret;
+			break;
+		}
+		start = to+1;
+	}
+
+	return ret;
+}
+
+
+
+
+
+
+/* Heartbeat callback: node `idx` in `group` went down.
+ * Registered per-MLE by dlm_init_mle(); clears the dead node from the
+ * MLE's node_map so the voting loop in dlm_get_lock_resource() can
+ * detect the membership change. */
+void dlm_mle_node_down(struct inode *group, struct inode *node, int idx, void *data)
+{
+	//int ret;
+	//struct inode *node = ptr2;
+
+	dlm_master_list_entry *mle;
+	dlm_ctxt *dlm;
+
+	mle = data;
+	if (!mle) {
+		printk("eek! NULL mle!\n");
+		return;
+	}
+	if (!mle->dlm) {
+		printk("eek! NULL dlm\n");
+		return;
+	}
+	dlm = mle->dlm;
+	/* ignore events for groups other than this mle's dlm group */
+	if (dlm->group != group)
+		return;
+
+	spin_lock(&mle->spinlock);
+
+	if (!test_bit(idx, mle->node_map))
+		printk("node %u already removed from nodemap!\n", idx);
+	else
+		clear_bit(idx, mle->node_map);
+
+#if 0
+	if (test_bit(idx, mle->recovery_map))
+		printk("node %u already added to recovery map!\n", idx);
+	else
+		set_bit(idx, mle->recovery_map);
+#endif
+	spin_unlock(&mle->spinlock);
+}
+
+/* Heartbeat callback: node `idx` in `group` came up.
+ * Registered per-MLE by dlm_init_mle(); adds the node to the MLE's
+ * node_map so the voting loop in dlm_get_lock_resource() can detect
+ * the membership change. */
+void dlm_mle_node_up(struct inode *group, struct inode *node, int idx, void *data)
+{
+	//struct inode *node = ptr2;
+	dlm_master_list_entry *mle;
+	dlm_ctxt *dlm;
+
+	mle = data;
+	if (!mle) {
+		printk("eek! NULL mle!\n");
+		return;
+	}
+	if (!mle->dlm) {
+		printk("eek! NULL dlm\n");
+		return;
+	}
+	dlm = mle->dlm;
+	/* ignore events for groups other than this mle's dlm group */
+	if (dlm->group != group)
+		return;
+
+	spin_lock(&mle->spinlock);
+
+#if 0
+	if (test_bit(idx, mle->recovery_map))
+		printk("BUG!!! node up message on node in recovery (%u)!!!\n", idx);
+	else
+#endif
+	{
+		if (test_bit(idx, mle->node_map))
+			printk("node %u already in node map!!!\n", idx);
+		else
+			set_bit(idx, mle->node_map);
+	}
+
+	spin_unlock(&mle->spinlock);
+}
Added: trunk/cluster/dlmmod.c
===================================================================
--- trunk/cluster/dlmmod.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/dlmmod.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,1652 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmmod.c
+ *
+ * standalone DLM module
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+
+
+#include "heartbeat.h"
+#include "nodemanager.h"
+#include "tcp.h"
+#include "dlmmod.h"
+
+MODULE_LICENSE ("GPL");
+MODULE_AUTHOR("Oracle Corporation");
+//MODULE_DESCRIPTION("Oracle DLM");
+
+
+/*
+ *
+ * spinlock lock ordering: if multiple locks are needed, always obey this ordering:
+ * dlm_domain_lock -> dlm_ctxt -> dlm_lock_resource -> dlm_lock
+ *
+ */
+
+
+static int __init dlm_driver_entry (void);
+static int dlm_read_params(void);
+static void __exit dlm_driver_exit (void);
+
+
+
+/* all active dlm domains, protected by dlm_domain_lock */
+LIST_HEAD(dlm_domains);
+spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED;
+/* this node's global node number, filled in at module load */
+u16 dlm_global_index = NM_MAX_NODES;
+/* protects dlm_next_cookie, the per-node lock cookie counter */
+static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED;
+static u64 dlm_next_cookie = 1;
+
+dlm_status dlm_send_remote_convert_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type);
+dlm_status dlm_send_remote_lock_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags);
+int dlm_lock_owner_broadcast(dlm_ctxt *dlm, dlm_lock_resource *res);
+static dlm_ctxt * __dlm_lookup_domain(char *domain);
+int dlm_send_proxy_ast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int type, int blocked_type);
+
+void dlm_wait_on_lockres(dlm_lock_resource *res);
+void __dlm_wait_on_lockres(dlm_lock_resource *res);
+
+
+/* ----------------------------------------------------------------- */
+
+extern spinlock_t dlm_master_lock;
+extern struct list_head dlm_master_list;
+
+/* Wire formats for the lock-traffic network messages below.  These
+ * structs are sent as-is via net_send_message; no byte-order
+ * conversion is visible in this file, so all nodes are presumably
+ * assumed to share endianness -- TODO confirm. */
+
+/* new-lock request sent to the resource master */
+typedef struct _dlm_create_lock
+{
+	u16 node_idx;
+	s8 requested_type;
+	u8 namelen;
+	u8 name[NM_MAX_NAME_LEN];
+	u64 cookie;
+} dlm_create_lock;
+
+/* convert request sent to the resource master */
+typedef struct _dlm_convert_lock
+{
+	u16 node_idx;
+	s8 requested_type;
+	u8 namelen;
+	u8 name[NM_MAX_NAME_LEN];
+	u64 cookie;
+} dlm_convert_lock;
+
+/* unlock/cancel request sent to the resource master */
+typedef struct _dlm_unlock_lock
+{
+	u32 flags;
+	u16 node_idx;
+	u8 namelen;
+	u8 name[NM_MAX_NAME_LEN];
+	u64 cookie;
+} dlm_unlock_lock;
+
+/* ast/bast delivery from the master back to the lock's owner node */
+typedef struct _dlm_proxy_ast
+{
+	u16 node_idx;
+	u8 type;
+	u8 blocked_type;
+	u8 namelen;
+	u8 name[NM_MAX_NAME_LEN];
+	u64 cookie;
+} dlm_proxy_ast;
+
+int dlm_create_lock_handler(net_msg *msg, u32 len, void *data);
+int dlm_convert_lock_handler(net_msg *msg, u32 len, void *data);
+int dlm_proxy_ast_handler(net_msg *msg, u32 len, void *data);
+
+int dlm_unlock_lock_handler(net_msg *msg, u32 len, void *data);
+dlm_status dlm_send_remote_unlock_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, dlm_lockstatus *lksb, int flags);
+
+/* ----------------------------------------------------------------- */
+
+
+
+
+/*
+ * dlm_driver_entry()
+ *
+ * Driver entry point.  Called on insmod.  Reads module parameters and
+ * caches this node's global node number.  Returns 0 on success or a
+ * negative errno so the module loader reports a meaningful failure
+ * (the old code returned a bare -1 in both failure cases).
+ */
+static int __init dlm_driver_entry (void)
+{
+	int status;
+
+
+	printk("Loaded dlm Driver module\n");
+	status = dlm_read_params();
+	if (status < 0)
+		return status;	/* propagate the real error */
+
+	dlm_global_index = nm_this_node(NULL);
+	if (dlm_global_index == NM_MAX_NODES)
+		return -ENODEV;	/* node manager doesn't know this node */
+
+	return 0;
+} /* dlm_driver_entry */
+
+/*
+ * dlm_read_params()
+ *
+ * Read insmod params.  Currently a placeholder: no module parameters
+ * are defined yet, so this always succeeds.
+ */
+static int dlm_read_params(void)
+{
+	int status = 0;
+	return status;
+} /* dlm_read_params */
+
+
+/*
+ * dlm_driver_exit()
+ *
+ * Called on rmmod.  Nothing is torn down here; domains are expected
+ * to be unregistered by their users before unload.
+ */
+static void __exit dlm_driver_exit (void)
+{
+	printk("Unloaded dlm Driver module\n");
+	return;
+} /* dlm_driver_exit */
+
+
+/* Single public entry point for lock and convert requests.
+ * - convert (LKM_CONVERT set): lksb->lockid must already point at a
+ *   valid dlm_lock whose ast/bast/astdata/lksb match the originals.
+ * - new lock: a name must be given; the lock resource is looked up or
+ *   mastered via dlm_get_lock_resource.
+ * Takes dlm->recovery_sem shared unless this is the reserved recovery
+ * lock (LKM_RECOVERY).  On failure lksb->status is set to the returned
+ * dlm_status. */
+dlm_status dlmlock(dlm_ctxt *dlm, int mode, dlm_lockstatus *lksb, int flags, char *name,
+		dlm_astlockfunc_t *ast, void *data, dlm_bastlockfunc_t *bast)
+{
+	dlm_status status;
+	dlm_lock_resource *res;
+	dlm_lock *lock = NULL;
+	char *buf = NULL;
+	int convert = 0, recovery = 0;
+	struct qstr q;
+
+	if (!lksb)
+		return DLM_BADARGS;
+
+	status = DLM_BADPARAM;
+	if (mode != LKM_EXMODE && mode != LKM_PRMODE && mode != LKM_NLMODE)
+		goto error_status;
+
+	if (flags & ~LKM_VALID_FLAGS)
+		goto error_status;
+
+	convert = (flags & LKM_CONVERT);
+	recovery = (flags & LKM_RECOVERY);
+
+	/* LKM_RECOVERY is only legal on the reserved recovery lock name,
+	 * and a recovery request can never be a convert */
+	if (recovery && (!dlm_is_recovery_lock(name, strlen(name)) ||
+	     convert) ) {
+		goto error_status;
+	}
+
+
+	if (convert) {
+		/* if converting, must pass in a valid dlm_lock */
+		if (!lksb->lockid || !lksb->lockid->lockres)
+			goto error_status;
+		lock = lksb->lockid;
+
+		/* XXX: for ocfs2 purposes, the ast/bast/astdata/lksb are essentially
+		 * static after the original lock call. convert requests will check
+		 * to ensure that everything is the same and pass DLM_BADARGS if not.
+		 * this means that DLM_DENIED_NOASTS will never be returned.
+		 */
+#warning differs from spec here!
+
+		if (lock->lksb != lksb || lock->ast != ast ||
+		    lock->bast != bast || lock->astdata != data) {
+			status = DLM_BADARGS;
+			printk("ERROR new args: lksb=%p, ast=%p, bast=%p, astdata=%p\n",
+			       lksb, ast, bast, data);
+			printk("      orig args: lksb=%p, ast=%p, bast=%p, astdata=%p\n",
+			       lock->lksb, lock->ast, lock->bast, lock->astdata);
+			goto error_status;
+		}
+		res = lock->lockres;
+
+		down_read(&dlm->recovery_sem);
+		spin_lock(&res->spinlock);
+		if (flags & LKM_LOCAL) {
+			printk("strange LOCAL convert request!\n");
+			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+				spin_unlock(&res->spinlock);
+				status = DLM_BADPARAM;
+				goto up_error;
+			}
+			res->owner = dlm->group_index;
+			printk("set owner to this node. you SURE thats what you want!?\n");
+		}
+		/* do_dlmconvert releases res->spinlock */
+		status = do_dlmconvert(dlm, res, lock, flags, mode);
+	} else {
+		status = DLM_BADARGS;
+		if (!name)
+			goto error;
+
+		status = DLM_IVBUFLEN;
+		q.len = strlen(name);
+		if (q.len > DLM_LOCKID_NAME_MAX)
+			goto error;
+
+		status = DLM_SYSERR;
+		buf = kmalloc(q.len+1, GFP_KERNEL);
+		if (!buf)
+			goto error;
+
+		memcpy(buf, name, q.len);
+		buf[q.len] = 0;
+		q.name = buf;
+		q.hash = full_name_hash(q.name, q.len);
+
+		if (!recovery)
+			down_read(&dlm->recovery_sem);
+/* debug-only rdtsc cycle counting around the resource lookup */
+{
+	union {
+		u64 q;
+		u32 hilo[2];
+	} u1, u2;
+	rdtsc(u1.hilo[0], u1.hilo[1]);
+	res = dlm_get_lock_resource(dlm, &q, flags);
+	rdtsc(u2.hilo[0], u2.hilo[1]);
+	printk("dlm_get_lock_resource took %llu cycles\n", u2.q-u1.q);
+}
+		if (!res) {
+			status = DLM_IVLOCKID;
+			goto up_error;
+		}
+		status = do_dlmlock(dlm, res, lksb, flags, mode, ast, bast, data);
+		if (status != DLM_NORMAL)
+			goto up_error;
+	}
+
+	/* TODO: lvb */
+	if (!recovery)
+		up_read(&dlm->recovery_sem);
+	return status;
+
+up_error:
+	if (!recovery)
+		up_read(&dlm->recovery_sem);
+error:
+	if (buf)
+		kfree(buf);
+	/* NOTE(review): a failed convert also falls through here and clears
+	 * lksb->lockid even though the original lock still exists and stays
+	 * queued -- the caller loses its handle; confirm this is intended */
+	lksb->lockid = NULL;
+
+error_status:
+	// this is kind of unnecessary
+	lksb->status = status;
+	return status;
+}
+
+/* Allocate and initialize a new dlm_lock for a fresh lock request,
+ * stamp it with a node-unique cookie (low 7 bytes = per-node counter,
+ * top byte = owning node number), attach it to lksb, and hand it to
+ * the local or remote grant path depending on who masters res.
+ * On any failure the lock is freed and lksb->lockid cleared. */
+dlm_status do_dlmlock(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lockstatus *lksb, int flags,
+		      int type, dlm_astlockfunc_t *ast, dlm_bastlockfunc_t *bast, void *data)
+{
+	dlm_lock *tmplock;
+	dlm_status status;
+	u8 *c;
+
+	dlmprintk("type=%d\n", type);
+
+	status = DLM_SYSERR;
+	tmplock = kmalloc(sizeof(dlm_lock), GFP_KERNEL);
+	if (!tmplock)
+		goto error;
+
+	memset(tmplock, 0, sizeof(dlm_lock));
+	INIT_LIST_HEAD(&tmplock->list);
+	INIT_LIST_HEAD(&tmplock->ast_list);
+	spin_lock_init(&tmplock->spinlock);
+	tmplock->lockres = res;
+	tmplock->type = type;
+	tmplock->convert_type = LKM_IVMODE;
+	tmplock->highest_blocked = LKM_IVMODE;
+	tmplock->node = dlm->group_index;
+	tmplock->ast = ast;
+	tmplock->bast = bast;
+	tmplock->astdata = data;
+	tmplock->lksb = lksb;
+
+	lksb->lockid = tmplock;
+
+	c = (u8 *)(&tmplock->cookie);
+
+	spin_lock(&dlm_cookie_lock);
+	tmplock->cookie = dlm_next_cookie;
+	dlm_next_cookie++;
+	/* counter must stay below the node-number byte */
+	if (dlm_next_cookie & 0xff00000000000000ull) {
+		printk("eek! this node's cookie will now wrap!\n");
+		dlm_next_cookie = 1;
+	}
+	/* top byte of the cookie carries the originating node number */
+	c[7] = (u8)(tmplock->node & 0x00ff);
+	spin_unlock(&dlm_cookie_lock);
+
+	if (res->owner == dlm->group_index)
+		status = dlmlock_local(dlm, res, tmplock, flags);
+	else
+		status = dlmlock_remote(dlm, res, tmplock, flags);
+error:
+	if (status != DLM_NORMAL) {
+		if (tmplock)
+			kfree(tmplock);
+		lksb->lockid = NULL;
+	}
+	return status;
+}
+
+
+
+
+/* must be already holding lockres->spinlock.
+ * Grant or queue a new lock on a locally-mastered resource: if any
+ * holder on the granted or converting queue is incompatible the lock
+ * goes on the blocked queue, otherwise it is granted immediately and
+ * the AST fired.  Drops res->spinlock before returning.
+ * Fix vs. original: with LKM_NOQUEUE a blocked lock was left on
+ * res->blocked while DLM_NOTQUEUED was returned -- the caller
+ * (do_dlmlock) frees the lock on non-NORMAL status, leaving a freed
+ * lock on the queue.  It is now unlinked first. */
+dlm_status dlmlock_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags)
+{
+	struct list_head *iter;
+	dlm_lock *tmplock;
+	int got_it = 0;
+
+	BUG_ON(!lock);
+	BUG_ON(!res);
+	BUG_ON(!dlm);
+
+	/* only locally-originated locks carry a local lksb */
+	if (lock->node == dlm->group_index) {
+		BUG_ON(!lock->lksb);
+	}
+
+	dlmprintk("type=%d\n", lock->type);
+
+	list_for_each(iter, &res->granted) {
+		tmplock = list_entry(iter, dlm_lock, list);
+		if (!dlm_lock_compatible(tmplock->type, lock->type)) {
+			list_add_tail(&lock->list, &res->blocked);
+			goto done;
+		}
+	}
+
+	list_for_each(iter, &res->converting) {
+		tmplock = list_entry(iter, dlm_lock, list);
+		if (!dlm_lock_compatible(tmplock->type, lock->type)) {
+			list_add_tail(&lock->list, &res->blocked);
+			goto done;
+		}
+	}
+
+	/* got it right away */
+
+	/* if it is a remote request, proxy
+	 * handler will set the lksb status */
+	if (lock->node == dlm->group_index)
+		lock->lksb->status = DLM_NORMAL;
+
+	list_add_tail(&lock->list, &res->granted);
+
+	if (dlm_do_ast(dlm, res, lock) < 0)
+		printk("eek\n");
+	got_it = 1;
+
+done:
+	if (!got_it && (flags & LKM_NOQUEUE)) {
+		/* caller frees the lock on non-NORMAL status, so it must
+		 * not remain on the blocked queue (use-after-free) */
+		list_del(&lock->list);
+		spin_unlock(&res->spinlock);
+		dlm_kick_thread(dlm, res);
+		return DLM_NOTQUEUED;
+	}
+	spin_unlock(&res->spinlock);
+	dlm_kick_thread(dlm, res);
+	return DLM_NORMAL;
+}
+
+/* must be already holding lockres->spinlock.
+ * Queue a new lock on a remotely-mastered resource: park the lock on
+ * the local blocked queue, send the create-lock message to the master,
+ * and unlink the lock again if the request failed.  Drops and retakes
+ * res->spinlock around the network call; returns with it released. */
+dlm_status dlmlock_remote(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags)
+{
+	dlm_status status = DLM_DENIED;
+
+	dlmprintk("type=%d\n", lock->type);
+
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		status = DLM_RECOVERING;
+		goto bail;
+	}
+
+	/* will exit this call with spinlock held */
+	__dlm_wait_on_lockres(res);
+	res->state |= DLM_LOCK_RES_IN_PROGRESS;
+	/* add lock to local (secondary) queue */
+	list_add_tail(&lock->list, &res->blocked);
+	spin_unlock(&res->spinlock);
+
+	/* spec seems to say that you will get DLM_NORMAL when the lock
+	 * has been queued, meaning we need to wait for a reply here. */
+	status = dlm_send_remote_lock_request(dlm, res, lock, flags);
+
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	if (status != DLM_NORMAL) {
+		/* remove from local queue if it failed */
+		list_del(&lock->list);
+	}
+bail:
+	spin_unlock(&res->spinlock);
+	return status;
+}
+
+
+/* must be already holding lockres->spinlock.
+ * Dispatch a convert to the local or remote path depending on who
+ * masters the resource; both callees release the spinlock.  The rdtsc
+ * scope is temporary cycle-count instrumentation (debug only). */
+dlm_status do_dlmconvert(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type)
+{
+	dlm_status status;
+
+{
+	union {
+		u64 q;
+		u32 hilo[2];
+	} u1, u2;
+	rdtsc(u1.hilo[0], u1.hilo[1]);
+
+	if (res->owner == dlm->group_index)
+		status = dlmconvert_local(dlm, res, lock, flags, type);
+	else
+		status = dlmconvert_remote(dlm, res, lock, flags, type);
+
+	rdtsc(u2.hilo[0], u2.hilo[1]);
+	printk("dlmconvert took %llu cycles\n", u2.q-u1.q);
+}
+	return status;
+}
+
+/* must be already holding lockres->spinlock.
+ * Convert a lock on a locally-mastered resource.  A downconvert (or
+ * same mode) is granted in place; an upconvert is granted in place
+ * only if it is compatible with every other granted and converting
+ * lock, otherwise the lock moves to the converting queue (unless
+ * LKM_NOQUEUE, which fails with DLM_NOTQUEUED).  Nonlocal locks are
+ * never converted in place; they are pushed to the front of the
+ * converting queue instead.  Releases both lock->spinlock and
+ * res->spinlock before returning. */
+dlm_status dlmconvert_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type)
+{
+	dlm_status status = DLM_NORMAL;
+	struct list_head *iter;
+	dlm_lock *tmplock=NULL;
+	int remote_in_place = 0;
+
+	dlmprintk("type=%d, convert_type=%d, new convert_type=%d\n", lock->type, lock->convert_type, type);
+
+	spin_lock(&lock->spinlock);
+
+	/* already converting? */
+	if (lock->convert_type != LKM_IVMODE) {
+		printk("attempted to convert a lock with a lock conversion pending\n");
+		spin_unlock(&lock->spinlock);
+		spin_unlock(&res->spinlock);
+		return DLM_DENIED;
+	}
+
+	/* must be on grant queue to convert */
+	if (!dlm_lock_on_list(&res->granted, lock)) {
+		printk("attempted to convert a lock not on grant queue\n");
+		spin_unlock(&lock->spinlock);
+		spin_unlock(&res->spinlock);
+		return DLM_DENIED;
+	}
+
+
+	/* in-place downconvert? */
+	if (type <= lock->type)
+		goto grant;
+
+	/* upconvert from here on */
+	status = DLM_NORMAL;
+	list_for_each(iter, &res->granted) {
+		tmplock = list_entry(iter, dlm_lock, list);
+		if (tmplock == lock)
+			continue;
+		if (!dlm_lock_compatible(tmplock->type, type))
+			goto switch_queues;
+	}
+
+	list_for_each(iter, &res->converting) {
+		tmplock = list_entry(iter, dlm_lock, list);
+		if (!dlm_lock_compatible(tmplock->type, type))
+			goto switch_queues;
+		/* existing conversion requests take precedence */
+		if (!dlm_lock_compatible(tmplock->convert_type, type))
+			goto switch_queues;
+	}
+
+	/* fall thru to grant */
+
+grant:
+	if (lock->node != dlm->group_index) {
+		dlmprintk0("no in-place convert for nonlocal locks :( see if this helps...\n");
+		remote_in_place = 1;
+		goto switch_queues;
+	}
+
+	/* immediately grant the new lock type */
+	//printk("doing in-place %sconvert from %d to %d\n",
+	//       type > lock->type ? "up" : "down", lock->type, type);
+	lock->type = type;
+	status = DLM_NORMAL;
+
+	/* if it is a remote request, proxy
+	 * handler will set the lksb status */
+	if (lock->node == dlm->group_index)
+		lock->lksb->status = DLM_NORMAL;
+
+	if (dlm_do_ast(dlm, res, lock) < 0)
+		printk("eek\n");
+
+	spin_unlock(&lock->spinlock);
+	spin_unlock(&res->spinlock);
+
+	/* if successful, kick the queue runner */
+	if (status == DLM_NORMAL) {
+		dlm_kick_thread(dlm, res);
+	}
+
+	return status;
+
+switch_queues:
+	if (flags & LKM_NOQUEUE) {
+		spin_unlock(&lock->spinlock);
+		spin_unlock(&res->spinlock);
+		return DLM_NOTQUEUED;
+	}
+
+	lock->convert_type = type;
+	list_del(&lock->list);
+	/* make sure the remote in-place convert gets handled right away */
+	if (remote_in_place)
+		list_add(&lock->list, &res->converting);
+	else
+		list_add_tail(&lock->list, &res->converting);
+
+	spin_unlock(&lock->spinlock);
+	spin_unlock(&res->spinlock);
+
+	dlm_kick_thread(dlm, res);
+	return status;
+}
+
+/* must be already holding lockres->spinlock.
+ * Convert a lock on a remotely-mastered resource: move the lock to the
+ * local converting queue, send the convert message to the master, and
+ * put it back on granted (clearing convert_type) if the request
+ * failed.  Drops and retakes res->spinlock around the network call;
+ * returns with it released. */
+dlm_status dlmconvert_remote(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type)
+{
+	dlm_status status = DLM_DENIED;
+
+	dlmprintk("type=%d, convert_type=%d\n", lock->type, lock->convert_type);
+
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		status = DLM_RECOVERING;
+		goto bail;
+	}
+	/* will exit this call with spinlock held */
+	__dlm_wait_on_lockres(res);
+
+	res->state |= DLM_LOCK_RES_IN_PROGRESS;
+
+	/* move lock to local convert queue */
+	list_del(&lock->list);
+	list_add_tail(&lock->list, &res->converting);
+	if (lock->convert_type != LKM_IVMODE) {
+		printk("error! converting a remote lock that is already converting!\n");
+		/* TODO: return correct error */
+		BUG();
+	}
+	lock->convert_type = type;
+	spin_unlock(&res->spinlock);
+
+	/* spec seems to say that you will get DLM_NORMAL when the lock
+	 * has been queued, meaning we need to wait for a reply here. */
+	status = dlm_send_remote_convert_request(dlm, res, lock, flags, type);
+
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+
+	/* if it failed, move it back to granted queue */
+	if (status != DLM_NORMAL) {
+		list_del(&lock->list);
+		list_add_tail(&lock->list, &res->granted);
+		lock->convert_type = LKM_IVMODE;
+	}
+bail:
+	spin_unlock(&res->spinlock);
+	return status;
+}
+
+
+
+/* there seems to be no point in doing this async
+ * since (even for the remote case) there is really
+ * no work to queue up... so just do it and fire the
+ * unlockast by hand when done... */
+/* Public unlock/cancel entry point.  Validates lksb and flags
+ * (LKM_VALBLK is dropped when combined with LKM_CANCEL), then runs the
+ * synchronous unlock and invokes the unlockast directly if requested
+ * by dlmunlock_local. */
+dlm_status dlmunlock(dlm_ctxt *dlm, dlm_lockstatus *lksb, int flags, dlm_astunlockfunc_t *unlockast, void *data)
+{
+	dlm_status status;
+	dlm_lock_resource *res;
+	dlm_lock *lock = NULL;
+	int call_ast = 0;
+
+	if (!lksb)
+		return DLM_BADARGS;
+
+	if (flags & ~(LKM_CANCEL | LKM_VALBLK | LKM_INVVALBLK))
+		return DLM_BADPARAM;
+
+	if ((flags & (LKM_VALBLK | LKM_CANCEL)) == (LKM_VALBLK | LKM_CANCEL)) {
+		printk("VALBLK given with CANCEL: ignoring VALBLK\n");
+		flags &= ~LKM_VALBLK;
+	}
+
+	if (!lksb->lockid || !lksb->lockid->lockres)
+		return DLM_BADPARAM;
+
+	lock = lksb->lockid;
+	res = lock->lockres;
+
+	status = dlmunlock_local(dlm, res, lock, lksb, flags, &call_ast);
+	if (call_ast)
+		(*unlockast)(data, lksb->status);
+	return status;
+}
+
+
+/* Core unlock/cancel state machine, used for both locally-originated
+ * and proxied requests.  Decides, based on which queue the lock sits
+ * on and whether LKM_CANCEL is set, whether to free the lock, call the
+ * unlockast, unlink it, and/or regrant it; for remotely-mastered
+ * resources the unlock message is sent to the master first and the
+ * decision re-checked if the lockres was busy.  Returns with both
+ * spinlocks released. */
+dlm_status dlmunlock_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, dlm_lockstatus *lksb, int flags, int *call_ast)
+{
+	dlm_status status;
+	int free_lock = 0, remote_ready = 0;
+	int local = 0, remove = 0, regrant = 0;
+
+	/* according to spec and opendlm code
+	 * flags & LKM_CANCEL != 0: must be converting or blocked
+	 * flags & LKM_CANCEL == 0: must be granted
+	 * iow, to unlock a converting lock, you must first LKM_CANCEL
+	 * the convert, then call the unlock again with no LKM_CANCEL
+	 */
+	*call_ast = 0;
+
+recheck:
+	spin_lock(&res->spinlock);
+	spin_lock(&lock->spinlock);
+
+	local = (res->owner == dlm->group_index);
+
+	if (flags & LKM_CANCEL) {
+		/* cancel request */
+		if (dlm_lock_on_list(&res->blocked, lock)) {
+			/* cancel this outright */
+			lksb->status = DLM_NORMAL;
+			status = DLM_NORMAL;
+			free_lock = 1;
+			*call_ast = 1;
+			remove = 1;
+			regrant = 0;
+		} else if (dlm_lock_on_list(&res->converting, lock)) {
+			/* cancel the request, put back on granted */
+			lksb->status = DLM_NORMAL;
+			status = DLM_NORMAL;
+			free_lock = 0;
+			*call_ast = 1;
+			remove = 1;
+			regrant = 1;
+		} else if (dlm_lock_on_list(&res->granted, lock)) {
+			/* too late, already granted. DLM_CANCELGRANT */
+			lksb->status = DLM_CANCELGRANT;
+			status = DLM_NORMAL;
+			free_lock = 0;
+			*call_ast = 1;
+			remove = 0;
+			regrant = 0;
+		} else {
+			/* err. um. eek! */
+			printk("lock to cancel is not on any list! bug!\n");
+			lksb->status = DLM_IVLOCKID;
+			status = DLM_IVLOCKID;
+			free_lock = 0;
+			*call_ast = 0;
+			remove = 0;
+			regrant = 0;
+		}
+	} else {
+		/* unlock request */
+		if (!dlm_lock_on_list(&res->granted, lock)) {
+			lksb->status = DLM_DENIED;
+			status = DLM_DENIED;
+			free_lock = 0;
+			*call_ast = 0;
+			remove = 0;
+			regrant = 0;
+		} else {
+			/* unlock granted lock */
+			lksb->status = DLM_NORMAL;
+			status = DLM_NORMAL;
+			free_lock = 1;
+			*call_ast = 1;
+			remove = 1;
+			regrant = 0;
+		}
+	}
+
+	if (!local) {
+		/* safe since nothing can change on this
+		 * secondary queue without lockres lock */
+		spin_unlock(&lock->spinlock);
+
+		/* if there was an outstanding change on the
+		 * lockres, conditions could have changed */
+		if (!remote_ready &&
+		    res->state & DLM_LOCK_RES_IN_PROGRESS) {
+			__dlm_wait_on_lockres(res);
+			res->state |= DLM_LOCK_RES_IN_PROGRESS;
+			remote_ready = 1;
+			spin_unlock(&res->spinlock);
+			goto recheck;
+		}
+
+		if (res->state & DLM_LOCK_RES_RECOVERING) {
+			/* !!!!! */
+			spin_unlock(&res->spinlock);
+			return DLM_RECOVERING;
+		} else {
+			spin_unlock(&res->spinlock);
+			status = dlm_send_remote_unlock_request(dlm, res, lock, lksb, flags);
+			spin_lock(&res->spinlock);
+			res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+		}
+		spin_lock(&lock->spinlock);
+	}
+
+	if (remove)
+		list_del(&lock->list);
+	if (regrant)
+		list_add_tail(&lock->list, &res->granted);
+
+	spin_unlock(&lock->spinlock);
+	spin_unlock(&res->spinlock);
+
+	if (free_lock) {
+#warning this must change to proper refcounting
+		/* TODO: refcounting... tho for now this will work because
+		 * the middle layer is keeping track of everything */
+		kfree(lock);
+		lksb->lockid = NULL;
+	}
+	return status;
+}
+
+
+/* Send an unlock/cancel message for this lock to the resource master
+ * and map the master's reply into a dlm_status; a DLM_CANCELGRANT
+ * reply is reported to the caller as DLM_NORMAL while lksb->status
+ * keeps the raw value.  Returns DLM_NOLOCKMGR if the master node
+ * cannot be resolved. */
+dlm_status dlm_send_remote_unlock_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, dlm_lockstatus *lksb, int flags)
+{
+	struct inode *inode = NULL;
+	dlm_unlock_lock unlock;
+	int tmpret;
+	dlm_status ret;
+	int status = 0;
+
+	dlmprintk0("\n");
+
+	memset(&unlock, 0, sizeof(unlock));
+	unlock.node_idx = dlm->group_index;
+	unlock.flags = flags;
+	unlock.cookie = lock->cookie;
+	unlock.namelen = res->lockname.len;
+	strncpy(unlock.name, res->lockname.name, unlock.namelen);
+
+	ret = DLM_NOLOCKMGR;
+	lksb->status = DLM_NOLOCKMGR;
+	inode = nm_get_group_node_by_index(dlm->group, res->owner);
+	if (inode) {
+		tmpret = net_send_message(DLM_UNLOCK_LOCK_MSG, dlm->key, &unlock, sizeof(unlock), inode, &status);
+		if (tmpret >= 0) {
+			// successfully sent and received
+			if (status == DLM_CANCELGRANT)
+				ret = DLM_NORMAL;
+			else
+				ret = status;
+			lksb->status = status;
+		} else {
+			printk("error occurred in net_send_message: %d\n", tmpret);
+			ret = dlm_err_to_dlm_status(tmpret);
+			lksb->status = ret;
+		}
+		iput(inode);
+	}
+
+	return ret;
+}
+
+/* Network handler for DLM_UNLOCK_LOCK_MSG on the master node.  Looks
+ * up the resource by name, scans granted -> converting -> blocked for
+ * the (cookie, node) pair, and performs the unlock locally; the
+ * resulting lksb.status is returned to the requesting node as the
+ * message status.  The unlockast is only ever run on the originating
+ * node, hence the ignored call_ast flag.
+ * NOTE(review): dlmunlock_local() takes res->spinlock itself, but it
+ * is called here while this handler already holds res->spinlock --
+ * looks like a self-deadlock on a non-recursive spinlock; confirm. */
+int dlm_unlock_lock_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	dlm_unlock_lock *unlock = (dlm_unlock_lock *)msg->buf;
+	dlm_lock_resource *res;
+	struct list_head *iter, *queue;
+	dlm_lock *lock;
+	dlm_status status = DLM_NORMAL;
+	int found = 0;
+	dlm_lockstatus lksb;
+	int ignore;
+	struct qstr lockname = { .name=unlock->name, .len=unlock->namelen };
+
+	dlmprintk0("\n");
+
+	lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+	status = DLM_IVLOCKID;
+	res = dlm_lookup_lock(dlm, &lockname);
+	if (res) {
+		spin_lock(&res->spinlock);
+		queue = &res->granted;
+again:
+		list_for_each(iter, queue) {
+			lock = list_entry(iter, dlm_lock, list);
+			if (lock->cookie == unlock->cookie &&
+			    lock->node == unlock->node_idx) {
+				found = 1;
+				/* unlockast only called on originating node */
+				status = dlmunlock_local(dlm, res, lock, &lksb, unlock->flags, &ignore);
+				break;
+			}
+		}
+		/* walk the three queues in order: granted, converting, blocked */
+		if (queue == &res->granted) {
+			queue = &res->converting;
+			goto again;
+		} else if (queue == &res->converting) {
+			queue = &res->blocked;
+			goto again;
+		}
+		spin_unlock(&res->spinlock);
+	}
+	if (!found)
+		printk("failed to find lock to unlock! cookie=%llu\n", unlock->cookie);
+	else
+		status = lksb.status;
+
+	return status;
+}
+
+
+
+
+
+/* Find a registered domain by name.  Caller must hold
+ * dlm_domain_lock.  Returns NULL if no domain matches. */
+static dlm_ctxt * __dlm_lookup_domain(char *domain)
+{
+	dlm_ctxt *tmp = NULL;
+	struct list_head *iter;
+
+	list_for_each(iter, &dlm_domains) {
+		tmp = list_entry (iter, dlm_ctxt, list);
+		if (strncmp(tmp->name, domain, NM_MAX_NAME_LEN)==0)
+			break;
+		tmp = NULL;
+	}
+
+	return tmp;
+}
+
+/* Locking wrapper around __dlm_lookup_domain. */
+dlm_ctxt * dlm_lookup_domain(char *domain)
+{
+	dlm_ctxt *tmp = NULL;
+	spin_lock(&dlm_domain_lock);
+	tmp = __dlm_lookup_domain(domain);
+	spin_unlock(&dlm_domain_lock);
+	return tmp;
+}
+
+/* Find a lock resource in the domain's hash table by name.  Caller
+ * must hold dlm->spinlock.  Returns NULL when not found. */
+dlm_lock_resource * __dlm_lookup_lock(dlm_ctxt *dlm, struct qstr *lockname)
+{
+	struct list_head *iter;
+	dlm_lock_resource *tmpres=NULL;
+	struct list_head *bucket;
+
+	bucket = &(dlm->resources[lockname->hash & DLM_HASH_MASK]);
+
+	/* check for pre-existing lock */
+	list_for_each(iter, bucket) {
+		tmpres = list_entry(iter, dlm_lock_resource, list);
+		if (tmpres->lockname.len == lockname->len &&
+		    strncmp(tmpres->lockname.name, lockname->name, lockname->len) == 0)
+			break;
+		tmpres = NULL;
+	}
+	return tmpres;
+}
+
+/* Locking wrapper around __dlm_lookup_lock. */
+dlm_lock_resource * dlm_lookup_lock(dlm_ctxt *dlm, struct qstr *lockname)
+{
+	dlm_lock_resource *res;
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lock(dlm, lockname);
+	spin_unlock(&dlm->spinlock);
+	return res;
+}
+
+
+
+/*
+ * dlm_register_domain: one-time setup per "domain"
+ *
+ * Looks up (or creates) the dlm context for "domain", attaches it to
+ * the nm group "group_name", registers heartbeat callbacks and all
+ * network message handlers, and launches the dlm worker thread.
+ * Returns the dlm_ctxt on success, NULL on failure.  A successfully
+ * created ctxt owns one reference on the group inode; every other
+ * path releases the reference taken here.
+ *
+ * Fixes vs. original: the pre-existing-domain return leaked the group
+ * inode reference, and the lost-the-race duplicate path leaked the
+ * net_buf and resources pages.
+ */
+dlm_ctxt * dlm_register_domain(char *domain, char *group_name, u32 key)
+{
+	dlm_ctxt *tmp = NULL, *dlm = NULL;
+	struct inode *group = NULL;
+	int tmpret, i;
+	char *netbuf;
+
+	if (strlen(domain) > NM_MAX_NAME_LEN) {
+		printk("domain name length too long\n");
+		goto leave;
+	}
+
+	group = nm_get_group_by_name(group_name);
+	if (!group) {
+		printk("no nm group %s for domain %s!\n", group_name, domain);
+		goto leave;
+	}
+
+	/*
+	 * TODO: should i do some type of dlm-group-join business here?
+	 * I need to have new nodes communicate with other dlm nodes to
+	 * wait until their master lists are empty before allowing me to
+	 * join.  does this belong here?  or in hb?
+	 * seems like stuff that heartbeat shouldn't care about, cuz we
+	 * would actually be preventing a node that is "UP" from being
+	 * part of the dlm group.
+	 */
+	dlm = dlm_lookup_domain(domain);
+	if (dlm) {
+		/* found a pre-existing domain; it already holds its own
+		 * group reference, so drop the one taken above */
+		iput(group);
+		goto leave;
+	}
+
+	dlm = kmalloc(sizeof(dlm_ctxt), GFP_KERNEL);
+	if (dlm == NULL) {
+		printk("could not allocate dlm_ctxt\n");
+		goto leave;
+	}
+	memset(dlm, 0, sizeof(dlm_ctxt));
+	dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL);
+	if (dlm->name == NULL) {
+		kfree(dlm);
+		dlm = NULL;
+		printk("could not allocate dlm domain name\n");
+		goto leave;
+	}
+	dlm->net_buf = (char *) __get_free_page(GFP_KERNEL);
+	if (!dlm->net_buf) {
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+		printk("could not allocate dlm network temporary buffer\n");
+		goto leave;
+	}
+	dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL);
+	if (!dlm->resources) {
+		kfree(dlm->name);
+		kfree(dlm);
+		free_page((unsigned long)dlm->net_buf);
+		dlm = NULL;
+		printk("could not allocate dlm hash\n");
+		goto leave;
+	}
+	memset(dlm->resources, 0, PAGE_SIZE);
+
+	for (i=0; i<DLM_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&dlm->resources[i]);
+
+	strcpy(dlm->name, domain);
+	spin_lock_init(&dlm->spinlock);
+	INIT_LIST_HEAD(&dlm->list);
+	INIT_LIST_HEAD(&dlm->dirty_list);
+	INIT_LIST_HEAD(&dlm->reco.resources);
+	INIT_LIST_HEAD(&dlm->reco.received);
+	util_thread_info_init(&dlm->thread);
+	util_thread_info_init(&dlm->reco.thread);
+	init_rwsem(&dlm->recovery_sem);
+	dlm->group = group;
+	dlm->group_index = nm_this_node(group);
+	dlm->key = key;
+	dlm->reco.new_master = NM_INVALID_SLOT_NUM;
+	dlm->reco.dead_node = NM_INVALID_SLOT_NUM;
+	dlm->reco.sending_node = NM_INVALID_SLOT_NUM;
+	dlm->reco.next_seq = 0;
+
+	spin_lock(&dlm_domain_lock);
+	tmp = __dlm_lookup_domain(domain);
+	if (tmp) {
+		spin_unlock(&dlm_domain_lock);
+		/* found a pre-existing domain: lost the setup race, so
+		 * release everything allocated above (the two pages were
+		 * previously leaked here) */
+		free_page((unsigned long)dlm->net_buf);
+		free_page((unsigned long)dlm->resources);
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+		goto leave;
+	}
+
+	/* add the new domain */
+	list_add_tail(&dlm->list, &dlm_domains);
+	spin_unlock(&dlm_domain_lock);
+
+	tmpret = hb_register_callback(HB_NODE_DOWN_CB, dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
+	if (tmpret)
+		goto error;
+	tmpret = hb_register_callback(HB_NODE_UP_CB, dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
+	if (tmpret)
+		goto error;
+
+	/* TODO: need to use hb_fill_node_map to fill a temporary votemap
+	 * then communicate with each of these nodes that I want to come up
+	 * FOR THIS DLM.  there may be many nodes in this group heartbeating
+	 * but they may not care about this particular dlm instance.  once
+	 * everyone has come back with a response that i have been added or
+	 * that they are not a member I can put together the REAL node map
+	 * for this dlm in dlm->node_map */
+	/* TODO: I guess we can fill this here as a superset of possible nodes
+	 * so that the hb_callbacks above have something to work on in the meantime
+	 * then trim out the nodes that are not part of this dlm once we know */
+	/* TODO: I may need to register a special net handler on insmod of dlm.o
+	 * with a key of 0 so that I can respond to requests even if I am not
+	 * part of a dlm group.  this would still leave a gap in time between the
+	 * start of heartbeating and the insmod dlm.o, unless I change the module
+	 * loading stuff in clusterbo to include dlm.o (which would work fine) */
+#warning WRONG WRONG WRONG
+	tmpret = hb_fill_node_map(group, dlm->node_map, NM_MAX_NODES);
+	if (tmpret)
+		goto error;
+
+
+#if 0
+	tmpret = net_register_handler("reco-request",
+				      DLM_NET_RECOVERY_REQUEST_MSG_TYPE,
+				      key, sizeof(dlm_reco_request),
+				      dlm_recovery_request_handler, dlm);
+	if (tmpret)
+		goto error;
+	tmpret = net_register_handler("reco-lock-arr-req",
+				      DLM_NET_RECOVERY_LOCK_ARR_REQ_MSG_TYPE,
+				      key, sizeof(dlm_reco_lock_arr_req),
+				      dlm_recovery_lock_arr_req_handler, dlm);
+	if (tmpret)
+		goto error;
+	tmpret = net_register_handler("reco-response",
+				      DLM_NET_RECOVERY_RESPONSE_MSG_TYPE,
+				      key, sizeof(dlm_reco_response),
+				      dlm_recovery_response_handler, dlm);
+	if (tmpret)
+		goto error;
+#endif
+
+	/* each handler gets its own cache-aligned slice of net_buf */
+	netbuf = dlm->net_buf;
+	tmpret = net_register_handler(DLM_MASTER_REQUEST_RESP_MSG, key, 0,
+				      sizeof(dlm_master_request_resp),
+				      dlm_master_request_resp_handler,
+				      dlm, netbuf);
+	if (tmpret)
+		goto error;
+
+	netbuf += L1_CACHE_ALIGN(sizeof(dlm_master_request_resp));
+
+	tmpret = net_register_handler(DLM_MASTER_REQUEST_MSG, key, 0,
+				      sizeof(dlm_master_request),
+				      dlm_master_request_handler,
+				      dlm, netbuf);
+
+	if (tmpret)
+		goto error;
+	netbuf += L1_CACHE_ALIGN(sizeof(dlm_master_request));
+
+	tmpret = net_register_handler(DLM_ASSERT_MASTER_MSG, key, 0,
+				      sizeof(dlm_assert_master),
+				      dlm_assert_master_handler,
+				      dlm, netbuf);
+	if (tmpret)
+		goto error;
+	netbuf += L1_CACHE_ALIGN(sizeof(dlm_assert_master));
+	tmpret = net_register_handler(DLM_CREATE_LOCK_MSG, key, 0,
+				      sizeof(dlm_create_lock),
+				      dlm_create_lock_handler,
+				      dlm, netbuf);
+	if (tmpret)
+		goto error;
+	netbuf += L1_CACHE_ALIGN(sizeof(dlm_create_lock));
+	tmpret = net_register_handler(DLM_CONVERT_LOCK_MSG, key, 0,
+				      sizeof(dlm_convert_lock),
+				      dlm_convert_lock_handler,
+				      dlm, netbuf);
+	if (tmpret)
+		goto error;
+	netbuf += L1_CACHE_ALIGN(sizeof(dlm_convert_lock));
+
+	tmpret = net_register_handler(DLM_UNLOCK_LOCK_MSG, key, 0,
+				      sizeof(dlm_unlock_lock),
+				      dlm_unlock_lock_handler,
+				      dlm, netbuf);
+	if (tmpret)
+		goto error;
+	netbuf += L1_CACHE_ALIGN(sizeof(dlm_unlock_lock));
+
+	tmpret = net_register_handler(DLM_PROXY_AST_MSG, key, 0,
+				      sizeof(dlm_proxy_ast),
+				      dlm_proxy_ast_handler,
+				      dlm, netbuf);
+	if (tmpret)
+		goto error;
+	netbuf += L1_CACHE_ALIGN(sizeof(dlm_proxy_ast));
+// printk("netbuf=%p net_buf=%p diff=%d\n", netbuf, dlm->net_buf, ((char *)netbuf - (char *)dlm->net_buf));  // currently 768
+
+	tmpret = dlm_launch_thread(dlm);
+	if (tmpret == 0)
+		goto leave;
+
+error:
+	hb_unregister_callback(HB_NODE_UP_CB, dlm_hb_node_up_cb, dlm);
+	hb_unregister_callback(HB_NODE_DOWN_CB, dlm_hb_node_down_cb, dlm);
+	spin_lock(&dlm_domain_lock);
+	list_del(&dlm->list);
+	spin_unlock(&dlm_domain_lock);
+	free_page((unsigned long)dlm->net_buf);
+	free_page((unsigned long)dlm->resources);
+	kfree(dlm->name);
+	kfree(dlm);
+	dlm = NULL;
+
+leave:
+	if (!dlm && group)
+		iput(group);
+	return dlm;
+}
+
+/* TODO: not yet implemented -- nothing registered by
+ * dlm_register_domain is torn down or freed here. */
+void dlm_unregister_domain(dlm_ctxt *dlm)
+{
+	// fill me in please
+}
+
+/* Zero and initialize a lock resource: name fields copied from
+ * lockname (name pointer is shared, not duplicated), all queues
+ * emptied, owner unknown, and IN_PROGRESS set so waiters block until
+ * mastery is resolved. */
+void dlm_init_lockres(dlm_lock_resource *res, struct qstr *lockname)
+{
+	memset(res, 0, sizeof(dlm_lock_resource));
+	res->lockname.name = lockname->name;
+	res->lockname.len = lockname->len;
+	res->lockname.hash = lockname->hash;
+	init_waitqueue_head(&res->wq);
+	spin_lock_init(&res->spinlock);
+	INIT_LIST_HEAD(&res->list);
+	INIT_LIST_HEAD(&res->granted);
+	INIT_LIST_HEAD(&res->converting);
+	INIT_LIST_HEAD(&res->blocked);
+	INIT_LIST_HEAD(&res->dirty);
+	INIT_LIST_HEAD(&res->recovering);
+
+	res->owner = DLM_LOCK_RES_OWNER_UNKNOWN;
+	res->state |= DLM_LOCK_RES_IN_PROGRESS;
+}
+
+
+
+
+/* will exit holding res->spinlock, but may drop in function */
+/* Sleep (uninterruptibly) until DLM_LOCK_RES_IN_PROGRESS clears.
+ * Caller must NOT hold res->spinlock on entry; it is taken here and
+ * held on return. */
+void dlm_wait_on_lockres(dlm_lock_resource *res)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue(&res->wq, &wait);
+repeat:
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
+		spin_unlock(&res->spinlock);
+		schedule();
+		goto repeat;
+	}
+	remove_wait_queue(&res->wq, &wait);
+	current->state = TASK_RUNNING;
+}
+
+/* will exit holding res->spinlock, but may drop in function */
+/* Variant of dlm_wait_on_lockres() for callers that already hold
+ * res->spinlock on entry (double-underscore = locked-caller
+ * convention).  The lock is dropped only around schedule() and
+ * retaken before each retest, so it is held again on return. */
+void __dlm_wait_on_lockres(dlm_lock_resource *res)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue(&res->wq, &wait);
+repeat:
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
+		spin_unlock(&res->spinlock);
+		schedule();
+		spin_lock(&res->spinlock);
+		goto repeat;
+	}
+	remove_wait_queue(&res->wq, &wait);
+	current->state = TASK_RUNNING;
+}
+
+
+
+/* Deliver the AST for a lock.  A lock owned by a remote node has the
+ * AST proxied over the network to that node; a local lock gets its
+ * registered callback invoked directly.  Returns 0 on success or a
+ * negative error. */
+int dlm_do_ast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock)
+{
+	dlm_astlockfunc_t *func = lock->ast;
+
+	dlmprintk0("\n");
+
+	/* not ours: forward to the owning node */
+	if (lock->node != dlm->group_index)
+		return dlm_send_proxy_ast(dlm, res, lock, DLM_AST, 0);
+
+	if (func == NULL) {
+		printk("eek! lock has no ast %*s! cookie=%llu\n",
+		       res->lockname.len, res->lockname.name, lock->cookie);
+		return -EINVAL;
+	}
+
+	func(lock->astdata);
+	return 0;
+}
+
+
+/* Deliver a blocking AST for a lock, announcing the mode it is
+ * blocking.  Remote locks have the BAST proxied to the owning node;
+ * local locks get the registered callback.  Returns 0 on success or a
+ * negative error. */
+int dlm_do_bast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int blocked_type)
+{
+	dlm_bastlockfunc_t *func = lock->bast;
+
+	dlmprintk0("\n");
+
+	/* not ours: forward to the owning node */
+	if (lock->node != dlm->group_index)
+		return dlm_send_proxy_ast(dlm, res, lock, DLM_BAST, blocked_type);
+
+	if (func == NULL) {
+		printk("eek! lock has no bast %*s! cookie=%llu\n",
+		       res->lockname.len, res->lockname.name, lock->cookie);
+		return -EINVAL;
+	}
+
+	func(lock->astdata, blocked_type);
+	return 0;
+}
+
+/* Ship an AST or BAST for a remotely owned lock to its owner node.
+ * Builds a dlm_proxy_ast message and sends it synchronously.  Returns
+ * 0 on success, a negative network error, or -EINVAL if the target
+ * node's inode cannot be resolved.
+ *
+ * FIX: zero the message before filling it in.  The struct was
+ * previously sent with uninitialized stack bytes -- everything in
+ * past.name beyond namelen, plus any struct padding -- leaking kernel
+ * stack contents onto the wire.  The create and convert request paths
+ * already memset their messages; this makes the AST path consistent. */
+int dlm_send_proxy_ast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int type, int blocked_type)
+{
+	int ret = 0;
+	dlm_proxy_ast past;
+	struct inode *inode = NULL;
+
+	dlmprintk("to=%u, type=%d, blocked_type=%d\n", lock->node, type, blocked_type);
+
+	memset(&past, 0, sizeof(past));
+	past.node_idx = dlm->group_index;
+	past.type = type;
+	past.blocked_type = blocked_type;
+	past.namelen = res->lockname.len;
+	strncpy(past.name, res->lockname.name, past.namelen);
+	past.cookie = lock->cookie;
+
+	ret = -EINVAL;
+	inode = nm_get_group_node_by_index(dlm->group, lock->node);
+	if (inode) {
+		ret = net_send_message(DLM_PROXY_AST_MSG, dlm->key, &past, sizeof(past), inode, NULL);
+		iput(inode);
+	}
+	if (ret < 0) {
+		printk("(%d) dlm_send_proxy_ast: returning %d\n", current->pid, ret);
+	}
+	return ret;
+}
+
+/* Network handler for DLM_PROXY_AST_MSG: an AST or BAST delivered to
+ * us by the node mastering the lockres.  Finds our local dlm_lock by
+ * cookie -- converting queue first for both kinds, then blocked for
+ * ASTs / granted for BASTs -- moves a granted lock to the granted
+ * queue and commits any pending convert, then invokes the user
+ * callback via dlm_do_ast()/dlm_do_bast().  Always returns 0; there is
+ * no useful status to send back for an unsolicited ast.
+ * NOTE(review): lock->lksb is dereferenced unconditionally on the AST
+ * path -- confirm every proxied lock has an lksb attached. */
+int dlm_proxy_ast_handler(net_msg *msg, u32 len, void *data)
+{
+	int status;
+	dlm_ctxt *dlm = data;
+	dlm_lock_resource *res;
+	dlm_lock *lock = NULL;
+	dlm_proxy_ast *past = (dlm_proxy_ast *) msg->buf;
+	struct qstr lockname = { .name=past->name, .len=past->namelen };
+	struct list_head *iter, *head=NULL;
+	u64 cookie = past->cookie;
+
+	lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+	dlmprintk("type=%d, blocked_type=%d\n", past->type, past->blocked_type);
+
+	/* reject anything other than the two ast kinds we proxy */
+	if (past->type != DLM_AST &&
+	    past->type != DLM_BAST) {
+		printk("Eeeek unknown ast type! %d, cookie=%llu, name=%*s\n",
+		       past->type, cookie, lockname.len, lockname.name);
+		return 0;
+	}
+
+	res = dlm_lookup_lock(dlm, &lockname);
+	if (!res) {
+		printk("eek! got %sast for unknown lockres! cookie=%llu, name=%*s, namelen=%d\n",
+		       past->type == DLM_AST ? "" : "b", cookie, lockname.len, lockname.name, lockname.len);
+		return 0;
+	}
+
+	/* the special $RECOVERY lock skips the recovery rwsem so it can
+	 * make progress during recovery itself (see LKM_RECOVERY) */
+	if (!dlm_is_recovery_lock(past->name, past->namelen))
+		down_read(&dlm->recovery_sem);
+	spin_lock(&res->spinlock);
+
+	/* try convert queue for both ast/bast */
+	head = &res->converting;
+	lock = NULL;
+	list_for_each(iter, head) {
+		lock = list_entry (iter, dlm_lock, list);
+		if (lock->cookie == cookie)
+			goto do_ast;
+	}
+
+	/* if not on convert, try blocked for ast, granted for bast */
+	if (past->type == DLM_AST)
+		head = &res->blocked;
+	else
+		head = &res->granted;
+
+	list_for_each(iter, head) {
+		lock = list_entry (iter, dlm_lock, list);
+		if (lock->cookie == cookie)
+			goto do_ast;
+	}
+
+	/* lock not found on any eligible queue: log and bail */
+	printk("eek! got %sast for unknown lock! cookie=%llu, name=%*s, namelen=%d\n",
+	       past->type == DLM_AST ? "" : "b", cookie, lockname.len, lockname.name, lockname.len);
+	spin_unlock(&res->spinlock);
+	if (!dlm_is_recovery_lock(past->name, past->namelen))
+		up_read(&dlm->recovery_sem);
+	return 0;
+
+do_ast:
+	if (past->type == DLM_AST) {
+		/* grant: move to the granted queue and commit any
+		 * pending conversion to its target mode */
+		list_del(&lock->list);
+		list_add_tail(&lock->list, &res->granted);
+		dlmprintk("ast: adding to granted list... type=%d, convert_type=%d\n",
+			  lock->type, lock->convert_type);
+		if (lock->convert_type != LKM_IVMODE) {
+			lock->type = lock->convert_type;
+			lock->convert_type = LKM_IVMODE;
+		} else {
+			// should already be there....
+		}
+
+		lock->lksb->status = DLM_NORMAL;
+
+		status = dlm_do_ast(dlm, res, lock);
+		dlmprintk("ast done: now... type=%d, convert_type=%d\n",
+			  lock->type, lock->convert_type);
+	} else {
+		dlmprintk("bast: before... type=%d, convert_type=%d\n",
+			  lock->type, lock->convert_type);
+		status = dlm_do_bast(dlm, res, lock, past->blocked_type);
+		dlmprintk("bast: after... type=%d, convert_type=%d\n",
+			  lock->type, lock->convert_type);
+	}
+
+	if (status < 0)
+		printk("eeek: ast/bast returned %d\n", status);
+
+	spin_unlock(&res->spinlock);
+	if (!dlm_is_recovery_lock(past->name, past->namelen))
+		up_read(&dlm->recovery_sem);
+	return 0;
+}
+
+
+
+
+
+
+
+/*
+ * message handlers should just return status.
+ * this will get send back to the calling node if it
+ * requested a status return.
+ */
+
+
+/* remote lock creation */
+/* Ask the lockres master (res->owner) to create a lock on our behalf.
+ * Returns the dlm_status reported by the master, a status derived
+ * from the network error, or DLM_NOLOCKMGR if the master node's inode
+ * cannot be resolved. */
+dlm_status dlm_send_remote_lock_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags)
+{
+	dlm_create_lock create;
+	struct inode *inode;
+	dlm_status ret = DLM_NOLOCKMGR;
+	int status = 0;
+	int err;
+
+	dlmprintk0("\n");
+
+	/* build the create-lock request */
+	memset(&create, 0, sizeof(create));
+	create.node_idx = dlm->group_index;
+	create.requested_type = lock->type;
+	create.cookie = lock->cookie;
+	create.namelen = res->lockname.len;
+	strncpy(create.name, res->lockname.name, create.namelen);
+
+	inode = nm_get_group_node_by_index(dlm->group, res->owner);
+	if (inode != NULL) {
+		err = net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create, sizeof(create), inode, &status);
+		if (err < 0) {
+			printk("error occurred in net_send_message: %d\n", err);
+			ret = dlm_err_to_dlm_status(err);
+		} else {
+			/* sent and acked: status carries the master's dlm_status */
+			ret = status;
+		}
+		iput(inode);
+	}
+
+	return ret;
+}
+
+/* Network handler for DLM_CREATE_LOCK_MSG: a remote node asks us (the
+ * lockres master) to create a lock for it.  Allocates and initializes
+ * the lock -- ast/bast stay NULL since notifications are proxied back
+ * to the owner -- then hands it to dlmlock_local().  Returns the
+ * dlm_status sent back to the requestor (DLM_SYSERR on allocation
+ * failure, DLM_IVLOCKID if the named lockres does not exist here).
+ *
+ * FIX: newlock was leaked whenever the lockres lookup failed (the
+ * DLM_IVLOCKID path returned without freeing it).  Free it there.
+ * NOTE(review): if dlmlock_local() fails, ownership of newlock is
+ * assumed to remain with that path -- confirm its error semantics. */
+int dlm_create_lock_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	dlm_create_lock *create = (dlm_create_lock *)msg->buf;
+	dlm_lock_resource *res;
+	dlm_lock *newlock;
+	dlm_status status = DLM_NORMAL;
+	struct qstr lockname = { .name=create->name, .len=create->namelen };
+
+	dlmprintk0("\n");
+
+	lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+	newlock = kmalloc(sizeof(dlm_lock), GFP_KERNEL);
+	if (!newlock)
+		return DLM_SYSERR;
+
+	memset(newlock, 0, sizeof(dlm_lock));
+	INIT_LIST_HEAD(&newlock->list);
+	INIT_LIST_HEAD(&newlock->ast_list);
+	spin_lock_init(&newlock->spinlock);
+	newlock->type = create->requested_type;
+	newlock->convert_type = LKM_IVMODE;
+	newlock->highest_blocked = LKM_IVMODE;
+	newlock->node = create->node_idx;
+	newlock->ast = NULL;
+	newlock->bast = NULL;
+	newlock->astdata = NULL;
+	newlock->cookie = create->cookie;
+
+	status = DLM_IVLOCKID;
+	res = dlm_lookup_lock(dlm, &lockname);
+	if (res) {
+		spin_lock(&res->spinlock);
+		newlock->lockres = res;
+		status = dlmlock_local(dlm, res, newlock, 0);
+		spin_unlock(&res->spinlock);
+	} else {
+		/* no such lockres: don't leak the lock we just built */
+		kfree(newlock);
+	}
+
+	return status;
+}
+
+/* remote lock conversion */
+/* Ask the lockres master to convert one of our locks to a new mode.
+ * Returns the master's dlm_status, a status derived from the network
+ * error, or DLM_NOLOCKMGR if the master node's inode cannot be
+ * resolved. */
+dlm_status dlm_send_remote_convert_request(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type)
+{
+	dlm_convert_lock convert;
+	struct inode *inode;
+	dlm_status ret = DLM_NOLOCKMGR;
+	int status = 0;
+	int err;
+
+	dlmprintk0("\n");
+
+	/* build the convert request */
+	memset(&convert, 0, sizeof(convert));
+	convert.node_idx = dlm->group_index;
+	convert.requested_type = type;
+	convert.cookie = lock->cookie;
+	convert.namelen = res->lockname.len;
+	strncpy(convert.name, res->lockname.name, convert.namelen);
+
+	inode = nm_get_group_node_by_index(dlm->group, res->owner);
+	if (inode != NULL) {
+		err = net_send_message(DLM_CONVERT_LOCK_MSG, dlm->key, &convert, sizeof(convert), inode, &status);
+		if (err < 0) {
+			printk("error occurred in net_send_message: %d\n", err);
+			ret = dlm_err_to_dlm_status(err);
+		} else {
+			/* sent and acked: status holds the master's dlm_status */
+			ret = status;
+		}
+		iput(inode);
+	}
+
+	return ret;
+}
+
+/* Network handler for DLM_CONVERT_LOCK_MSG: the master side of a
+ * remote conversion.  Looks the lockres up by name, finds the
+ * requestor's lock on the granted queue by (cookie, node) -- cookies
+ * alone are only unique per node -- and performs the conversion
+ * locally.  Returns the resulting dlm_status (DLM_IVLOCKID if the
+ * resource or lock cannot be found).
+ *
+ * FIX: removed the rdtsc()-based timing instrumentation.  It was
+ * x86-only inline asm and pure debug scaffolding, and the final
+ * timing dlmprintk read u4/u5/u6 uninitialized (undefined behavior)
+ * whenever the lockres or the lock was not found. */
+int dlm_convert_lock_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	dlm_convert_lock *convert = (dlm_convert_lock *)msg->buf;
+	dlm_lock_resource *res;
+	struct list_head *iter;
+	dlm_lock *lock;
+	dlm_status status;
+	int found = 0;
+	struct qstr lockname = { .name=convert->name, .len=convert->namelen };
+
+	dlmprintk0("\n");
+
+	lockname.hash = full_name_hash(lockname.name, lockname.len);
+
+	status = DLM_IVLOCKID;
+	res = dlm_lookup_lock(dlm, &lockname);
+	if (res) {
+		spin_lock(&res->spinlock);
+		list_for_each(iter, &res->granted) {
+			lock = list_entry(iter, dlm_lock, list);
+			/* match on cookie AND node: cookies are per-node */
+			if (lock->cookie == convert->cookie &&
+			    lock->node == convert->node_idx) {
+				found = 1;
+				status = dlmconvert_local(dlm, res, lock, 0, convert->requested_type);
+				break;
+			}
+		}
+		spin_unlock(&res->spinlock);
+	}
+	if (!found)
+		printk("failed to find lock to convert on grant queue! cookie=%llu\n", convert->cookie);
+
+	return status;
+}
+
+/* Walk the global domain list under dlm_domain_lock and dump the
+ * complete state of every registered dlm domain on this node. */
+void dlm_dump_everything(void)
+{
+	struct list_head *pos;
+	dlm_ctxt *ctxt;
+
+	printk("dumping ALL dlm state for node %s\n", system_utsname.nodename);
+
+	spin_lock(&dlm_domain_lock);
+	list_for_each(pos, &dlm_domains) {
+		ctxt = list_entry(pos, dlm_ctxt, list);
+		dlm_dump_dlm(ctxt);
+	}
+	spin_unlock(&dlm_domain_lock);
+}
+
+/* Dump one domain: every lockres in the hash table together with its
+ * granted, converting and blocked queues.
+ *
+ * FIX: the NULL check on dlm/dlm->name used to come *after* a printk
+ * that dereferenced both (the "some bug here" comment in the original
+ * acknowledged this), so a NULL dlm would oops before the check could
+ * fire.  Validate first, then print. */
+void dlm_dump_dlm(dlm_ctxt *dlm)
+{
+	dlm_lock_resource *res;
+	dlm_lock *lock;
+	struct list_head *iter, *iter2;
+	struct list_head *bucket;
+	int i;
+
+	if (!dlm || !dlm->name) {
+		printk("wtf... dlm=%p\n", dlm);
+		return;
+	}
+	printk("dlm_ctxt: %s, group=%u, key=%u\n", dlm->name, dlm->group_index, dlm->key);
+
+	spin_lock(&dlm->spinlock);
+	for (i=0; i<DLM_HASH_SIZE; i++) {
+		bucket = &(dlm->resources[i]);
+		list_for_each(iter, bucket) {
+			res = list_entry(iter, dlm_lock_resource, list);
+			printk("lockres: %*s, owner=%u, state=%u\n", res->lockname.len, res->lockname.name,
+			       res->owner, res->state);
+			spin_lock(&res->spinlock);
+			printk("  granted queue: \n");
+			list_for_each(iter2, &res->granted) {
+				lock = list_entry(iter2, dlm_lock, list);
+				spin_lock(&lock->spinlock);
+				printk("    type=%d, conv=%d, node=%u, cookie=%llu\n",
+				       lock->type, lock->convert_type, lock->node, lock->cookie);
+				spin_unlock(&lock->spinlock);
+			}
+			printk("  converting queue: \n");
+			list_for_each(iter2, &res->converting) {
+				lock = list_entry(iter2, dlm_lock, list);
+				spin_lock(&lock->spinlock);
+				printk("    type=%d, conv=%d, node=%u, cookie=%llu\n",
+				       lock->type, lock->convert_type, lock->node, lock->cookie);
+				spin_unlock(&lock->spinlock);
+			}
+			printk("  blocked queue: \n");
+			list_for_each(iter2, &res->blocked) {
+				lock = list_entry(iter2, dlm_lock, list);
+				spin_lock(&lock->spinlock);
+				printk("    type=%d, conv=%d, node=%u, cookie=%llu\n",
+				       lock->type, lock->convert_type, lock->node, lock->cookie);
+				spin_unlock(&lock->spinlock);
+			}
+			spin_unlock(&res->spinlock);
+		}
+	}
+	spin_unlock(&dlm->spinlock);
+}
+
+module_init (dlm_driver_entry);
+module_exit (dlm_driver_exit);
Added: trunk/cluster/dlmmod.h
===================================================================
--- trunk/cluster/dlmmod.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/dlmmod.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,467 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmmod.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_DLMMOD_H
+#define CLUSTER_DLMMOD_H
+
+
+
+#if 0
+#define dlmprintk(x, arg...)
+#define dlmprintk0(x)
+#else
+#define dlmprintk(x, arg...) printk("(%d)(%s:%d) " x, current->pid, __FUNCTION__, __LINE__, ##arg)
+#define dlmprintk0(x) printk("(%d)(%s:%d) " x, current->pid, __FUNCTION__, __LINE__)
+#endif
+
+
+
+
+#define DLM_HB_NODE_DOWN_PRI (0xf000000)
+#define DLM_HB_NODE_UP_PRI (0x8000000)
+
+#define DLM_LVB_LEN 64
+#define DLM_LOCKID_NAME_MAX 32
+
+#define DLM_DOMAIN_NAME_MAX_LEN 255
+#define DLM_LOCK_RES_OWNER_UNKNOWN NM_MAX_NODES
+#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
+#define DLM_THREAD_MS 200 // flush at least every 200 ms
+
+#define DLM_HASH_BITS 7
+#define DLM_HASH_SIZE (1 << DLM_HASH_BITS)
+#define DLM_HASH_MASK (DLM_HASH_SIZE - 1)
+
+typedef enum _dlm_ast_type {
+ DLM_AST = 0,
+ DLM_BAST,
+ DLM_ASTUNLOCK
+} dlm_ast_type;
+
+
+#define LKM_IVMODE (-1) /* invalid mode */
+#define LKM_NLMODE 0 /* null lock */
+#define LKM_CRMODE 1 /* concurrent read */ /* unsupported */
+#define LKM_CWMODE 2 /* concurrent write */ /* unsupported */
+#define LKM_PRMODE 3 /* protected read */
+#define LKM_PWMODE 4 /* protected write */ /* unsupported */
+#define LKM_EXMODE 5 /* exclusive */
+#define LKM_MAXMODE 5
+#define LKM_MODEMASK 0xff
+
+
+/* TODO: Flags which OCFS2 will require:
+ * - LKM_LOCAL
+ * - LKM_VALBLK
+ * - LKM_NOQUEUE
+ * - LKM_CONVERT
+ * - LKM_CANCEL */
+#define LKM_ORPHAN 0x10 /* this lock is orphanable */ /* unsupported */
+#define LKM_PARENTABLE 0x20 /* this lock was orphaned */ /* unsupported */
+#define LKM_BLOCK 0x40 /* blocking lock request */ /* unsupported */
+#define LKM_LOCAL 0x80 /* local lock request */
+#define LKM_VALBLK 0x100 /* lock value block request */
+#define LKM_NOQUEUE 0x200 /* non blocking request */
+#define LKM_CONVERT 0x400 /* conversion request */
+#define LKM_NODLCKWT 0x800 /* this lock wont deadlock */ /* unsupported */
+#define LKM_UNLOCK 0x1000 /* deallocate this lock */
+#define LKM_CANCEL 0x2000 /* cancel conversion request */
+#define LKM_DEQALL 0x4000 /* remove all locks held by proc */ /* unsupported */
+#define LKM_INVVALBLK 0x8000 /* invalidate lock value block */
+#define LKM_SYNCSTS 0x10000 /* return synchronous status if poss */ /* unsupported */
+#define LKM_TIMEOUT 0x20000 /* lock request contains timeout */ /* unsupported */
+#define LKM_SNGLDLCK 0x40000 /* request can self-deadlock */ /* unsupported */
+#define LKM_FINDLOCAL 0x80000 /* find local lock request */ /* unsupported */
+#define LKM_PROC_OWNED 0x100000 /* owned by process, not group */ /* unsupported */
+#define LKM_XID 0x200000 /* use transaction id for deadlock */ /* unsupported */
+#define LKM_XID_CONFLICT 0x400000 /* do not allow lock inheritance */ /* unsupported */
+#define LKM_FORCE 0x800000 /* force unlock flag */
+#define LKM_REVVALBLK 0x1000000 /* temporary solution: re-validate lock value block */ /* unsupported */
+
+#define LKM_RECOVERY 0x80000000 /* extension: flag for recovery lock, used to avoid recovery rwsem */
+
+#define LKM_VALID_FLAGS (LKM_VALBLK | LKM_CONVERT | LKM_UNLOCK | \
+ LKM_CANCEL | LKM_INVVALBLK | LKM_FORCE | \
+ LKM_RECOVERY | LKM_LOCAL | LKM_NOQUEUE)
+
+#define DLM_RECOVERY_LOCK_NAME "$RECOVERY"
+#define DLM_RECOVERY_LOCK_NAME_LEN 9
+
+/* Nonzero iff the name is the special "$RECOVERY" lock, which must
+ * bypass the recovery rwsem (see LKM_RECOVERY). */
+static inline int dlm_is_recovery_lock(char *lock_name, int name_len)
+{
+	return (name_len == DLM_RECOVERY_LOCK_NAME_LEN &&
+		strncmp(lock_name, DLM_RECOVERY_LOCK_NAME,
+			DLM_RECOVERY_LOCK_NAME_LEN) == 0);
+}
+
+typedef enum _dlm_status {
+ DLM_NORMAL, /* request in progress */
+ DLM_GRANTED, /* request granted */
+ DLM_DENIED, /* request denied */
+ DLM_DENIED_NOLOCKS, /* request denied, out of system resources */
+ DLM_WORKING, /* async request in progress */
+ DLM_BLOCKED, /* lock request blocked */
+ DLM_BLOCKED_ORPHAN, /* lock request blocked by a orphan lock*/
+ DLM_DENIED_GRACE_PERIOD, /* topological change in progress */
+ DLM_SYSERR, /* system error */
+ DLM_NOSUPPORT, /* unsupported */
+ DLM_CANCELGRANT, /* can't cancel convert: already granted */
+ DLM_IVLOCKID, /* bad lockid */
+ DLM_SYNC, /* synchronous request granted */
+ DLM_BADTYPE, /* bad resource type */
+ DLM_BADRESOURCE, /* bad resource handle */
+ DLM_MAXHANDLES, /* no more resource handles */
+ DLM_NOCLINFO, /* can't contact cluster manager */
+ DLM_NOLOCKMGR, /* can't contact lock manager */
+ DLM_NOPURGED, /* can't contact purge daemon */
+ DLM_BADARGS, /* bad api args */
+ DLM_VOID, /* no status */
+ DLM_NOTQUEUED, /* NOQUEUE was specified and request failed */
+ DLM_IVBUFLEN, /* invalid resource name length */
+ DLM_CVTUNGRANT, /* attempted to convert ungranted lock */
+ DLM_BADPARAM, /* invalid lock mode specified */
+ DLM_VALNOTVALID, /* value block has been invalidated */
+ DLM_REJECTED, /* request rejected, unrecognized client */
+ DLM_ABORT, /* blocked lock request cancelled */
+ DLM_CANCEL, /* conversion request cancelled */
+ DLM_IVRESHANDLE, /* invalid resource handle */
+ DLM_DEADLOCK, /* deadlock recovery refused this request */
+ DLM_DENIED_NOASTS, /* failed to allocate AST */
+ DLM_FORWARD, /* request must wait for primary's response */
+ DLM_TIMEOUT, /* timeout value for lock has expired */
+ DLM_IVGROUPID, /* invalid group specification */
+ DLM_VERS_CONFLICT, /* version conflicts prevent request handling */
+ DLM_BAD_DEVICE_PATH, /* Locks device does not exist or path wrong */
+ DLM_NO_DEVICE_PERMISSION, /* Client has insufficient pers for device */
+ DLM_NO_CONTROL_DEVICE, /* Cannot set options on opened device */
+ DLM_MAXSTATS, /* upper limit for return code validation */
+
+ DLM_RECOVERING /* our lame addition to allow caller to fail a lock
+ request if it is being recovered */
+} dlm_status;
+
+
+
+/* Per-domain recovery state, driven by the recovery thread. */
+typedef struct _dlm_recovery_ctxt
+{
+	struct list_head resources;	// lockreses under recovery, linked via dlm_lock_resource.recovering
+	struct list_head received; // list of dlm_reco_lock_infos received from other nodes during recovery
+	u16 new_master;		// presumably the node taking over mastery -- TODO confirm
+	u16 dead_node;		// presumably the node whose locks are being recovered -- TODO confirm
+	u16 sending_node;
+	u32 next_seq;
+	util_thread_info thread;	// recovery thread handle
+} dlm_recovery_ctxt;
+
+
+/* One DLM domain: the full state of a named lock space on this node. */
+struct _dlm_ctxt
+{
+	struct list_head list;		// linkage on the global dlm_domains list
+	struct list_head *resources;	// page of DLM_HASH_SIZE hash bucket heads (freed with free_page)
+	struct list_head dirty_list;	// presumably lockreses awaiting the dlm thread -- confirm
+	spinlock_t spinlock;		// held around resource-hash walks (see dlm_dump_dlm)
+	struct rw_semaphore recovery_sem;	// read-held by normal ops; skipped for the $RECOVERY lock
+	char *name;		// domain name (kmalloc'd, kfree'd on teardown)
+	char *net_buf;		// page carved into per-message-handler buffers
+	util_thread_info thread;	// main dlm thread
+	struct inode *group;	// node manager group inode for this domain
+	u32 key;		// key passed with every net message for this domain
+	u16 group_index;	// this node's index within the group
+	u32 node_map[8];	// 256-bit node bitmap -- TODO confirm exact semantics
+	u32 recovery_map[8];	// 256-bit bitmap, presumably nodes needing recovery -- confirm
+	dlm_recovery_ctxt reco;	// recovery state
+};
+
+#define DLM_LOCK_RES_UNINITED 0x00000001
+#define DLM_LOCK_RES_RECOVERING 0x00000002
+#define DLM_LOCK_RES_READY 0x00000004
+#define DLM_LOCK_RES_DIRTY 0x00000008
+#define DLM_LOCK_RES_IN_PROGRESS 0x00000010
+
+/* A single named lock resource with its three lock queues. */
+typedef struct _dlm_lock_resource
+{
+	struct list_head list;		// hash bucket linkage
+	struct list_head granted;	// locks currently held
+	struct list_head converting;	// locks awaiting a mode conversion
+	struct list_head blocked;	// lock requests not yet granted
+	struct list_head dirty;		// presumably linkage on dlm->dirty_list -- confirm
+	struct list_head recovering; // dlm_recovery_ctxt.resources list
+	spinlock_t spinlock;		// protects the queues and the fields below
+	wait_queue_head_t wq;		// waited on until DLM_LOCK_RES_IN_PROGRESS clears
+	u16 owner; // node which owns the lock resource, or unknown
+	u16 state;		// DLM_LOCK_RES_* flags
+	struct qstr lockname;	// name, length and precomputed hash
+	char lvb[DLM_LVB_LEN];	// lock value block
+} dlm_lock_resource;
+
+typedef void (dlm_astlockfunc_t)(void *);
+typedef void (dlm_bastlockfunc_t)(void *, int);
+typedef void (dlm_astunlockfunc_t)(void *, dlm_status);
+
+typedef struct _dlm_lockstatus dlm_lockstatus;
+
+/* One lock on a resource; lives on exactly one of the resource's
+ * queues (granted/converting/blocked) via .list. */
+typedef struct _dlm_lock
+{
+	struct list_head list;		// queue linkage on the owning lockres
+	struct list_head ast_list;	// presumably pending ast delivery -- confirm
+	dlm_lock_resource *lockres;	// back-pointer to the owning resource
+	spinlock_t spinlock;
+
+	s8 type;		// current/requested mode (LKM_*MODE)
+	s8 convert_type;	// target mode of an in-flight convert, else LKM_IVMODE
+	s8 highest_blocked;	// presumably highest mode blocked behind this lock -- confirm
+	s8 reserved1;
+	u16 node;		// node that owns this lock (may not be local)
+	u16 reserved2;
+
+	dlm_astlockfunc_t *ast; // ast and bast must be callable while holding a spinlock!
+	dlm_bastlockfunc_t *bast;
+	void *astdata;		// opaque argument handed to ast/bast callbacks
+	u64 cookie;		// lock id, unique only per node (matched with .node in handlers)
+	dlm_lockstatus *lksb;	// caller's status block; status set on grant
+} dlm_lock;
+
+
+struct _dlm_lockstatus {
+ dlm_status status;
+ dlm_lock *lockid;
+ char lvb[DLM_LVB_LEN];
+};
+
+enum {
+ DLM_MLE_BLOCK,
+ DLM_MLE_MASTER
+};
+
+typedef struct _dlm_lock_name
+{
+ u8 len;
+ u8 name[0]; // [DLM_LOCKID_NAME_MAX]
+} dlm_lock_name;
+
+/* good god this needs to be trimmed down */
+typedef struct _dlm_master_list_entry
+{
+ struct list_head list;
+ dlm_ctxt *dlm;
+ spinlock_t spinlock;
+ wait_queue_head_t wq;
+ atomic_t woken;
+ atomic_t refcnt;
+ u32 maybe_map[8];
+ u32 vote_map[8];
+ u32 response_map[8];
+ u32 node_map[8];
+ u16 master;
+ u8 error;
+ u8 type; // BLOCK or MASTER
+ union {
+ dlm_lock_resource *res;
+ dlm_lock_name name;
+ } u;
+} dlm_master_list_entry;
+
+void dlm_put_mle(dlm_master_list_entry *mle);
+/* Take a reference on a master list entry; paired with dlm_put_mle(). */
+static inline void dlm_get_mle(dlm_master_list_entry *mle)
+{
+	atomic_inc(&mle->refcnt);
+}
+
+
+#define DLM_MASTER_REQUEST_MSG 500
+#define DLM_MASTER_REQUEST_RESP_MSG 501
+#define DLM_ASSERT_MASTER_MSG 502
+#define DLM_CREATE_LOCK_MSG 503
+#define DLM_CONVERT_LOCK_MSG 504
+#define DLM_PROXY_AST_MSG 505
+#define DLM_UNLOCK_LOCK_MSG 506
+
+
+enum {
+ DLM_MASTER_RESP_NO,
+ DLM_MASTER_RESP_YES,
+ DLM_MASTER_RESP_MAYBE,
+ DLM_MASTER_RESP_ERROR
+};
+
+typedef struct _dlm_master_request
+{
+ u16 node_idx;
+ u8 namelen;
+ u8 name[NM_MAX_NAME_LEN];
+} dlm_master_request;
+
+typedef struct _dlm_master_request_resp
+{
+ u16 node_idx;
+ u8 response;
+ u8 namelen;
+ u8 name[NM_MAX_NAME_LEN];
+} dlm_master_request_resp;
+
+typedef struct _dlm_assert_master
+{
+ u16 node_idx;
+ u8 namelen;
+ u8 name[NM_MAX_NAME_LEN];
+} dlm_assert_master;
+
+
+
+
+
+void dlm_shuffle_lists(dlm_ctxt *dlm, dlm_lock_resource *res);
+void dlm_thread_run_lock_resources(dlm_ctxt *dlm);
+int dlm_thread(void *data);
+int dlm_launch_thread(dlm_ctxt *dlm);
+void dlm_complete_thread(dlm_ctxt *dlm);
+
+dlm_status dlmlock(dlm_ctxt *dlm, int mode, dlm_lockstatus *lksb, int flags, char *name,
+ dlm_astlockfunc_t *ast, void *data, dlm_bastlockfunc_t *bast);
+
+
+dlm_status do_dlmlock(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lockstatus *lksb,
+ int flags, int type, dlm_astlockfunc_t *ast,
+ dlm_bastlockfunc_t *bast, void *data);
+dlm_status dlmlock_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags);
+dlm_status dlmlock_remote(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags);
+
+dlm_status do_dlmconvert(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type);
+dlm_status dlmconvert_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type);
+dlm_status dlmconvert_remote(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int flags, int type);
+
+dlm_status dlmunlock(dlm_ctxt *dlm, dlm_lockstatus *lksb, int flags, dlm_astunlockfunc_t *unlockast, void *data);
+dlm_status dlmunlock_local(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, dlm_lockstatus *lksb, int flags, int *call_ast);
+
+dlm_ctxt * dlm_register_domain(char *domain, char *group_name, u32 key);
+void dlm_unregister_domain(dlm_ctxt *dlm);
+dlm_lock_resource * dlm_get_lock_resource(dlm_ctxt *dlm, struct qstr *lockname, int flags);
+int dlm_lock_owner_broadcast(dlm_ctxt *dlm, dlm_lock_resource *res);
+int dlm_refresh_lock_resource(dlm_ctxt *dlm, dlm_lock_resource *res);
+int dlm_do_ast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock);
+int dlm_do_bast(dlm_ctxt *dlm, dlm_lock_resource *res, dlm_lock *lock, int blocked_type);
+u16 dlm_nm_this_node(dlm_ctxt *dlm);
+void dlm_kick_thread(dlm_ctxt *dlm, dlm_lock_resource *res);
+
+int dlm_nm_init(dlm_ctxt *dlm);
+int dlm_heartbeat_init(dlm_ctxt *dlm);
+
+dlm_lock_resource * dlm_lookup_lock(dlm_ctxt *dlm, struct qstr *lockname);
+dlm_ctxt * dlm_lookup_domain(char *domain);
+
+void dlm_hb_node_down_cb(struct inode *group, struct inode *node, int idx, void *data);
+void dlm_hb_node_up_cb(struct inode *group, struct inode *node, int idx, void *data);
+int dlm_hb_node_dead(dlm_ctxt *dlm, int node);
+int dlm_hb_node_up(dlm_ctxt *dlm, int node);
+int __dlm_hb_node_dead(dlm_ctxt *dlm, int node);
+int __dlm_hb_node_up(dlm_ctxt *dlm, int node);
+
+int dlm_lock_owner_broadcast(dlm_ctxt *dlm, dlm_lock_resource *res);
+int dlm_master_request_handler(net_msg *msg, u32 len, void *data);
+int dlm_master_request_resp_handler(net_msg *msg, u32 len, void *data);
+int dlm_assert_master_handler(net_msg *msg, u32 len, void *data);
+int dlm_do_master_request(dlm_master_list_entry *mle, int to);
+int dlm_do_master_request_resp(dlm_ctxt *dlm, struct qstr *name, int response, int to);
+int dlm_do_assert_master(dlm_master_list_entry *mle);
+void dlm_mle_node_down(struct inode *group, struct inode *node, int idx, void *data);
+void dlm_mle_node_up(struct inode *group, struct inode *node, int idx, void *data);
+dlm_lock_resource * __dlm_lookup_lock(dlm_ctxt *dlm, struct qstr *lockname);
+void dlm_init_lockres(dlm_lock_resource *res, struct qstr *lockname);
+void dlm_wait_on_lockres(dlm_lock_resource *res);
+void dlm_dump_everything(void);
+void dlm_dump_dlm(dlm_ctxt *dlm);
+
+/* Mode compatibility for the three supported modes (NL, PR, EX):
+ * NL is compatible with everything, EX with nothing but NL, and PR
+ * only with PR (and NL). */
+static inline int dlm_lock_compatible(int existing, int request)
+{
+	/* null locks never conflict */
+	if (existing == LKM_NLMODE || request == LKM_NLMODE)
+		return 1;
+
+	/* an exclusive request conflicts with any non-NL holder */
+	if (request == LKM_EXMODE)
+		return 0;
+
+	/* request is PR here: compatible only with an existing PR */
+	return existing == LKM_PRMODE;
+}
+
+/* Linear scan: is this exact lock structure linked on the given queue? */
+static inline int dlm_lock_on_list(struct list_head *head, dlm_lock *lock)
+{
+	struct list_head *pos;
+
+	list_for_each(pos, head) {
+		if (list_entry(pos, dlm_lock, list) == lock)
+			return 1;
+	}
+	return 0;
+}
+
+/* Does this master list entry refer to the given lock name within the
+ * given domain?  Blocked entries carry a raw name; master entries
+ * point at the lockres itself (which has a precomputed hash to
+ * short-circuit the compare). */
+static inline int dlm_mle_equal(dlm_ctxt *dlm, dlm_master_list_entry *mle, struct qstr *lockname)
+{
+	dlm_lock_resource *res;
+
+	if (mle->dlm != dlm)
+		return 0;
+
+	if (mle->type == DLM_MLE_BLOCK)
+		return (lockname->len == mle->u.name.len &&
+			strncmp(lockname->name, mle->u.name.name, lockname->len) == 0);
+
+	res = mle->u.res;
+	return (res->lockname.hash == lockname->hash &&
+		res->lockname.len == lockname->len &&
+		strncmp(res->lockname.name, lockname->name, lockname->len) == 0);
+}
+
+/* Map a (negative) kernel errno from the net layer onto the closest
+ * dlm_status code; anything unrecognized becomes DLM_BADARGS. */
+static inline dlm_status dlm_err_to_dlm_status(int err)
+{
+	if (err == -ENOMEM)
+		return DLM_SYSERR;
+	if (err == -ETIMEDOUT || net_link_down(err, NULL))
+		return DLM_NOLOCKMGR;
+	if (err == -EINVAL)
+		return DLM_BADPARAM;
+	if (err == -ENAMETOOLONG)
+		return DLM_IVBUFLEN;
+	return DLM_BADARGS;
+}
+
+#endif /* CLUSTER_DLMMOD_H */
Added: trunk/cluster/dlmrecovery.c
===================================================================
--- trunk/cluster/dlmrecovery.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/dlmrecovery.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,705 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmrecovery.c
+ *
+ * recovery stuff
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+
+
+#include "heartbeat.h"
+#include "nodemanager.h"
+#include "tcp.h"
+#include "dlmmod.h"
+
+static void dlm_do_local_recovery_cleanup(dlm_ctxt *dlm, u16 dead_node, int locked);
+
+int dlm_recovery_thread(void *data);
+void dlm_complete_recovery_thread(dlm_ctxt *dlm);
+int dlm_launch_recovery_thread(dlm_ctxt *dlm);
+void dlm_kick_recovery_thread(dlm_ctxt *dlm);
+
+u16 dlm_pick_recovery_master(dlm_ctxt *dlm, u16 *new_dead_node);
+static int dlm_remaster_locks_local(dlm_ctxt *dlm);
+int dlm_init_recovery_area(dlm_ctxt *dlm, u16 dead_node, u16 num_nodes);
+int dlm_request_all_locks(dlm_ctxt *dlm, u16 request_from, u16 dead_node);
+void dlm_destroy_recovery_area(dlm_ctxt *dlm, u16 dead_node);
+
+#define DLM_RECOVERY_THREAD_MS 2000
+
+#if 0
+/*
+ * RECOVERY THREAD
+ */
+
+/* Wake the per-domain recovery thread.  The woken flag is set before
+ * the wake_up so a thread that has just re-checked the flag does not
+ * miss the kick. */
+void dlm_kick_recovery_thread(dlm_ctxt *dlm)
+{
+ /* wake the recovery thread */
+ atomic_set(&dlm->reco.thread.woken, 1);
+ wake_up(&dlm->reco.thread.thread_wq);
+}
+
+/* Launch the recovery thread */
+/* Fork dlm_recovery_thread via kernel_thread, sharing fs/files/signal
+ * state with the caller.  Returns 0 on success or -EINVAL if the fork
+ * failed (the raw pid is kept in dlm->reco.thread.pid). */
+int dlm_launch_recovery_thread(dlm_ctxt *dlm)
+{
+ printk("starting recovery thread...\n");
+ dlm->reco.thread.pid = kernel_thread (dlm_recovery_thread, dlm, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+ if (dlm->reco.thread.pid < 0) {
+ printk("unable to launch recovery thread, error=%d", dlm->reco.thread.pid);
+ return -EINVAL;
+ }
+ printk("recovery thread running...\n");
+ return 0;
+}
+
+/* Ask the recovery thread to exit (SIGINT breaks it out of its wait)
+ * and block until it signals completion, then clear the task pointer. */
+void dlm_complete_recovery_thread(dlm_ctxt *dlm)
+{
+ printk ("waiting for recovery thread to exit....");
+ send_sig (SIGINT, dlm->reco.thread.task, 0);
+ wait_for_completion (&dlm->reco.thread.complete);
+ printk ("recovery thread exited\n");
+ dlm->reco.thread.task = NULL;
+}
+
+ /*
+ * this is lame, but here's how recovery works...
+ * 1) all recovery threads cluster wide will work on recovering
+ * ONE node at a time
+ * 2) negotiate who will take over all the locks for the dead node.
+ * that's right... ALL the locks.
+ * 3) once a new master is chosen, everyone scans all locks
+ * and moves aside those mastered by the dead guy
+ * 4) each of these locks should be locked until recovery is done
+ * 5) the new master collects up all of the secondary lock queue info
+ * one lock at a time, forcing each node to communicate back
+ * before continuing
+ * 6) each secondary lock queue responds with the full known lock info
+ * 7) once the new master has run all its locks, it sends an ALLDONE!
+ * message to everyone
+ * 8) upon receiving this message, the secondary queue node unlocks
+ * and responds to the ALLDONE
+ * 9) once the new master gets responses from everyone, he unlocks
+ * everything and recovery for this dead node is done
+ *10) go back to 2) while there are still dead nodes
+ *
+ */
+
+
+
+/*
+ * Per-domain recovery daemon.  Loop: pick a dead node out of
+ * dlm->recovery_map, elect a recovery master for it, then either
+ * remaster the dead node's locks locally (if we are the master) or
+ * wait for the elected master to finish.  Sleeps up to
+ * DLM_RECOVERY_THREAD_MS between passes and exits when the wait is
+ * interrupted by a signal (see dlm_complete_recovery_thread).
+ * Always returns 0.
+ */
+int dlm_recovery_thread(void *data)
+{
+	int status;
+	int dlm_num;
+	char name[12];
+	dlm_ctxt *dlm = data;
+
+	dlm_num = nm_get_group_global_index(dlm->group);
+	/* snprintf: a group index wider than 3 digits must not overrun
+	 * name[12] the way the old sprintf could */
+	snprintf(name, sizeof(name), "dlmreco-%03u", dlm_num);
+	util_daemonize (name, strlen(name), 1);
+	dlm->reco.thread.task = current;
+
+	while (1) {
+		spin_lock(&dlm->spinlock);
+
+		/* check to see if the new master has died */
+		if (dlm->reco.new_master != NM_INVALID_SLOT_NUM &&
+		    test_bit(dlm->reco.new_master, dlm->recovery_map)) {
+			printk("new master %u died while recovering %u!\n",
+			       dlm->reco.new_master, dlm->reco.dead_node);
+			/* unset the new_master, leave dead_node */
+			dlm->reco.new_master = NM_INVALID_SLOT_NUM;
+		}
+
+		/* select a target to recover */
+		if (dlm->reco.dead_node == NM_INVALID_SLOT_NUM) {
+			dlm->reco.dead_node = find_next_bit (dlm->recovery_map, NM_MAX_NODES, 0);
+			if (dlm->reco.dead_node >= NM_MAX_NODES)
+				dlm->reco.dead_node = NM_INVALID_SLOT_NUM;
+		} else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
+			/* BUG? should not happen while a target is chosen */
+			printk("dead_node %u no longer in recovery map!\n",
+			       dlm->reco.dead_node);
+			dlm->reco.dead_node = NM_INVALID_SLOT_NUM;
+		}
+
+		spin_unlock(&dlm->spinlock);
+
+		if (dlm->reco.dead_node == NM_INVALID_SLOT_NUM) {
+			printk("nothing to recover! sleeping now!\n");
+			goto sleep;
+		}
+
+		/* take write barrier */
+		/* (stops the list reshuffling thread, proxy ast handling) */
+		down_write(&dlm->recovery_sem);
+
+		/* choose a new master */
+		if (dlm->reco.new_master == NM_INVALID_SLOT_NUM) {
+			u16 new_dead_node = dlm->reco.dead_node;
+			dlm->reco.new_master = dlm_pick_recovery_master(dlm, &new_dead_node);
+			if (new_dead_node != dlm->reco.dead_node) {
+				/* master wants to recover a different node */
+				dlm->reco.dead_node = new_dead_node;
+
+				/* do local cleanup if heartbeat has not added the
+				 * node to the recovery map yet */
+				spin_lock(&dlm->spinlock);
+				if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
+					dlm_do_local_recovery_cleanup(dlm, dlm->reco.dead_node, 1);
+					set_bit(dlm->reco.dead_node, dlm->recovery_map);
+					clear_bit(dlm->reco.dead_node, dlm->node_map);
+				}
+				spin_unlock(&dlm->spinlock);
+			}
+		}
+
+		if (dlm->reco.new_master == dlm->group_index) {
+			/* we are the recovery master: pull the dead node's
+			 * lock state over, then reset recovery bookkeeping */
+			status = dlm_remaster_locks_local(dlm);
+			if (status < 0) {
+				printk("error remastering locks for node %u!!!! retrying!\n",
+				       dlm->reco.dead_node);
+			} else {
+				/* success! see if any other nodes need recovery */
+				spin_lock(&dlm->spinlock);
+				clear_bit(dlm->reco.dead_node, dlm->recovery_map);
+				spin_unlock(&dlm->spinlock);
+				dlm->reco.dead_node = NM_INVALID_SLOT_NUM;
+				dlm->reco.new_master = NM_INVALID_SLOT_NUM;
+				dlm->reco.sending_node = NM_INVALID_SLOT_NUM;
+				dlm->reco.next_seq = 0;
+			}
+			up_write(&dlm->recovery_sem);
+			/* pick another dead node */
+			continue;
+		} else {
+			/* sit around until new_master is dead or done;
+			 * we will get signalled by the waitqueue either way */
+			printk("new_master %u is recovering dead_node %u... waiting...\n",
+			       dlm->reco.new_master, dlm->reco.dead_node);
+		}
+
+		up_write(&dlm->recovery_sem);
+
+sleep:
+		atomic_set(&dlm->reco.thread.woken, 0);
+		status = util_wait_atomic_eq(&dlm->reco.thread.thread_wq,
+					     &dlm->reco.thread.woken,
+					     1, DLM_RECOVERY_THREAD_MS);
+		if (status == 0 || status == -ETIMEDOUT) {
+			if (atomic_read(&dlm->reco.thread.woken))
+				printk("aha!!! recovery thread woken!\n");
+			else
+				printk("timed out waiting, running again\n");
+			continue;
+		}
+		printk("recovery thread got %d while waiting\n", status);
+		break;
+	}
+
+	flush_scheduled_work();
+	complete (&dlm->reco.thread.complete);
+	printk("quitting recovery thread!!!!!!\n");
+	return 0;
+}
+
+/* +- if this node is NOT the new master... */
+/* +--- if master's dead_node is not the one we chose, do local cleanup again with proper dead_node */
+/* +--- wait for poll messages from new master: register net message handler, it will do the work */
+/* +--- check for death of new master */
+/* +--- if dead, unregister the handler, unset new_master, keep dead_node and goto "select a target" */
+/* |- on request, send header with number of packets, get response, then start blasting packets */
+/* |- retransmit any missed packets on request */
+/* |- once ALL DONE is received, run all locks again */
+/* +--- unset the RECOVERING flag */
+/* +--- set the new owner as new_master */
+/* +--- remove dead_node from recovery map */
+/* +--- unset new_master and dead_node and start all over */
+
+
+/* Act as recovery master for dlm->reco.dead_node: set up a temporary
+ * recovery area, then ask each live node in turn (per the snapshot of
+ * node_map) to transmit its lock state for the dead node.  Returns 0
+ * on success; on error, tears down the recovery area and returns the
+ * negative status from dlm_request_all_locks.
+ * NOTE(review): num_nodes is still 255 when handed to
+ * dlm_init_recovery_area and only narrowed afterwards -- confirm that
+ * is intended.  The loop also does not skip our own slot; presumably
+ * dlm_request_all_locks copes with a self-request -- verify. */
+static int dlm_remaster_locks_local(dlm_ctxt *dlm)
+{
+ int num_nodes = 255, i, status = 0;
+ u32 node_map[8];
+
+
+/* +- if this node is the new master, init the temp recovery area */
+/* |- poll each live node for lock state */
+/* |- collect the data from each node until node says it's done, or dead */
+/* +--- if node died, throw away temp recovery area, keep new_master and dead_node, goto "select a target" */
+/* |- apply all temp area changes to real lock */
+/* +- send ALL DONE message to each node */
+
+
+ status = dlm_init_recovery_area(dlm, dlm->reco.dead_node, num_nodes);
+ if (status < 0)
+ return status;
+
+ spin_lock(&dlm->spinlock);
+ num_nodes = nm_get_group_max_slots(dlm->group);
+ memcpy(node_map, dlm->node_map, sizeof(node_map));
+ spin_unlock(&dlm->spinlock);
+
+ for (i=0; i<num_nodes; i++) {
+ if (test_bit(i, node_map)) {
+ spin_lock(&dlm->spinlock);
+ dlm->reco.sending_node = i;
+ dlm->reco.next_seq = 0;
+ spin_unlock(&dlm->spinlock);
+ status = dlm_request_all_locks(dlm, i, dlm->reco.dead_node);
+ if (status < 0) {
+ spin_lock(&dlm->spinlock);
+ dlm->reco.sending_node = NM_INVALID_SLOT_NUM;
+ dlm->reco.next_seq = 0;
+ spin_unlock(&dlm->spinlock);
+ dlm_destroy_recovery_area(dlm, dlm->reco.dead_node);
+ return status;
+ }
+ }
+ }
+ return status;
+}
+
+/* Stub: will ask node @request_from to transmit all lock state it
+ * holds on behalf of @dead_node, then sleep until the transfer
+ * completes or errors.  Currently only logs and returns 0. */
+int dlm_request_all_locks(dlm_ctxt *dlm, u16 request_from, u16 dead_node)
+{
+ printk("dlm_request_all_locks: dead node is %u, sending request to %u\n",
+ dead_node, request_from);
+ // send message
+ // sleep until all received or error
+ return 0;
+}
+
+#endif
+
+#if 0
+
+int dlm_recovery_request_handler(net_msg *msg, u32 len, void *data);
+int dlm_recovery_response_handler(net_msg *msg, u32 len, void *data);
+int dlm_recovery_lock_arr_req_handler(net_msg *msg, u32 len, void *data);
+
+/* Wire representation of a single lock shipped during recovery. */
+typedef struct _dlm_reco_lock_info
+{
+ u16 node; /* slot that holds the lock */
+ u16 unused1; /* pad for alignment */
+ u64 cookie;
+ s8 type; /* granted mode (LKM_*) */
+ s8 convert_type; /* pending convert mode, if any */
+ u8 list; /* source queue index; presumably 0=granted, 1=converting, 2=blocked -- verify */
+ u8 lockname_len;
+ u8 lockname[DLM_LOCKID_NAME_MAX];
+} dlm_reco_lock_info;
+
+/* recovery network message subtypes (request_type field) */
+enum {
+ DLM_RECO_MASTER_REQUEST,
+ DLM_RECO_XMIT_LOCKS_REQUEST,
+ DLM_RECO_XMIT_LOCK_HDR_REQUEST,
+ DLM_RECO_XMIT_LOCK_ARR_REQUEST,
+ DLM_RECO_XMIT_COMPLETE_REQUEST,
+ DLM_RECO_ALL_DONE_REQUEST
+};
+
+/* yes/no answers carried in dlm_reco_response.response_type */
+enum {
+ DLM_RECO_NO_RESPONSE,
+ DLM_RECO_YES_RESPONSE
+};
+
+/* locks carried per DLM_RECO_XMIT_LOCK_ARR_REQUEST packet */
+#define DLM_LOCKS_PER_PACKET 40
+
+/* one sequenced packet of recovered locks for a dead node */
+typedef struct _dlm_reco_lock_arr_req
+{
+ u8 request_type;
+ u8 num_locks; /* <= DLM_LOCKS_PER_PACKET */
+ u16 dead_node;
+ u32 seqnum; /* checked against dlm->reco.next_seq */
+ dlm_reco_lock_info lock[DLM_LOCKS_PER_PACKET];
+} dlm_reco_lock_arr_req;
+
+/* generic recovery request header */
+typedef struct _dlm_reco_request
+{
+ u8 request_type;
+ u8 unused1;
+ u16 dead_node;
+ u32 num;
+} dlm_reco_request;
+
+/* generic recovery response */
+typedef struct _dlm_reco_response
+{
+ u8 response_type;
+ u8 unused1[7];
+} dlm_reco_response;
+
+/* Sanity-check a dlm_reco_lock_info received off the wire; returns 1
+ * if its mode fields and queue index are plausible.
+ * NOTE(review): convert_type is required to be NL/PR/EX, so a lock
+ * that is not converting (convert_type == LKM_IVMODE) would be
+ * rejected -- confirm senders always fill in a real mode here. */
+static inline int dlm_reco_lock_info_valid(dlm_reco_lock_info *info)
+{
+ if (info->type != LKM_NLMODE &&
+ info->type != LKM_PRMODE &&
+ info->type != LKM_EXMODE)
+ return 0;
+ if (info->convert_type != LKM_NLMODE &&
+ info->convert_type != LKM_PRMODE &&
+ info->convert_type != LKM_EXMODE)
+ return 0;
+ if (info->list > 2)
+ return 0;
+ return 1;
+}
+
+static inline int dlm_check_reco_lock_arr_msg(net_msg *msg, dlm_ctxt *dlm, int *out_of_order);
+
+/*
+ * Validate an incoming lock-array packet against the current recovery
+ * state.  Caller must hold dlm->spinlock.  Returns 0 if the packet
+ * should be processed, -EINVAL otherwise; *out_of_order is set when
+ * only the sequence number was wrong.
+ *
+ * Fixes from review:
+ *  - the "bad lock array" printk passed a single boolean expression
+ *    to a two-%u format; print both node numbers instead.
+ *  - seqnum/next_seq are u32, so use %u rather than %lu.
+ *  - a packet addressed TO this node is the valid case; the old code
+ *    flagged dest_node == group_index as an error (inverted test).
+ */
+static inline int dlm_check_reco_lock_arr_msg(net_msg *msg, dlm_ctxt *dlm, int *out_of_order)
+{
+	int ret = -EINVAL;
+	dlm_reco_lock_arr_req *req = (dlm_reco_lock_arr_req *)msg->buf;
+
+	/* check a bunch of ugly conditions */
+	*out_of_order = 0;
+	if (req->num_locks > DLM_LOCKS_PER_PACKET) {
+		printk("num_locks too large! %u\n", req->num_locks);
+	} else if (req->seqnum != dlm->reco.next_seq) {
+		printk("expected seq %u from node %u, got %u\n",
+		       dlm->reco.next_seq, msg->src_node,
+		       req->seqnum);
+		*out_of_order = 1;
+	} else if (dlm->reco.dead_node != req->dead_node) {
+		printk("bad lock array: dead node=%u, sent=%u\n",
+		       dlm->reco.dead_node, req->dead_node);
+	} else if (dlm->reco.new_master != dlm->group_index) {
+		printk("this node is not the recovery master!\n");
+	} else if (dlm->reco.sending_node != msg->src_node ||
+		   dlm->group_index != msg->dest_node) {
+		printk("eek. sending_node=%u, actual=%u, dest=%u, me=%u\n",
+		       dlm->reco.sending_node, msg->src_node,
+		       msg->dest_node, dlm->group_index);
+	} else
+		ret = 0;
+	return ret;
+}
+
+
+/*
+ * gawd i hate udp
+ */
+/*
+ * Receive one array of recovered locks from the sending node.
+ * Allocates a dlm_lock for each wire entry up front (no spinlock
+ * held), then revalidates the recovery state under dlm->spinlock
+ * before splicing them onto dlm->reco.received.  Returns 0 or -errno;
+ * on error every allocated lock is released.
+ *
+ * Fixes from review:
+ *  - INIT_LIST_HEAD() is the runtime initializer; LIST_HEAD_INIT is a
+ *    static-initializer macro and does not compile as a statement.
+ *  - locks come from dlm_lock_cache, so the error path must free them
+ *    back to the same cache (was dlm_reco_lock_info_cache).
+ */
+int dlm_recovery_lock_arr_req_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	dlm_reco_lock_arr_req *req = (dlm_reco_lock_arr_req *)msg->buf;
+	dlm_reco_lock_info *info;
+	dlm_lock **newlocks = NULL;
+	dlm_lock *lock = NULL;
+	int ret, i, out_of_order = 0;
+
+	// TODO: ntoh(req)
+
+	ret = 0;
+	if (req->num_locks == 0)
+		goto send_response;
+
+	/* check to see if it's worth kmallocing */
+	spin_lock(&dlm->spinlock);
+	ret = dlm_check_reco_lock_arr_msg(msg, dlm, &out_of_order);
+	spin_unlock(&dlm->spinlock);
+	if (ret < 0)
+		goto send_response;
+
+	newlocks = kmalloc(req->num_locks * sizeof(dlm_lock *), GFP_KERNEL);
+	if (!newlocks) {
+		printk("failed to alloc temp lock array!\n");
+		ret = -ENOMEM;
+		goto send_response;
+	}
+	memset(newlocks, 0, req->num_locks * sizeof(dlm_lock *));
+	for (i=0; i<req->num_locks; i++) {
+		info = &(req->lock[i]);
+		if (!dlm_reco_lock_info_valid(info)) {
+			ret = -EINVAL;
+			goto send_response;
+		}
+		lock = newlocks[i] = kmem_cache_alloc(dlm_lock_cache, GFP_KERNEL);
+		if (!newlocks[i]) {
+			ret = -ENOMEM;
+			goto send_response;
+		}
+		memset(lock, 0, sizeof(dlm_lock));
+		INIT_LIST_HEAD(&lock->list);
+		INIT_LIST_HEAD(&lock->ast_list);
+		spin_lock_init(&lock->spinlock);
+		lock->type = info->type;
+		lock->convert_type = info->convert_type;
+		lock->node = dlm->group_index;
+		lock->ast = NULL;
+		lock->bast = NULL;
+		lock->astdata = (void *)info->list; // cheating here...
+		lock->cookie = info->cookie;
+	}
+
+	spin_lock(&dlm->spinlock);
+	/* ok now that everything is allocated and the lock has
+	 * been taken again, recheck all those stupid conditions */
+	ret = dlm_check_reco_lock_arr_msg(msg, dlm, &out_of_order);
+	if (ret < 0) {
+		spin_unlock(&dlm->spinlock);
+		goto send_response;
+	}
+	for (i=0; i<req->num_locks; i++) {
+		lock = newlocks[i];
+		list_add_tail(&lock->list, &dlm->reco.received);
+	}
+	spin_unlock(&dlm->spinlock);
+
+send_response:
+	if (newlocks) {
+		if (ret < 0) {
+			/* free back to the cache they were allocated from */
+			for (i=0; i<req->num_locks; i++)
+				if (newlocks[i])
+					kmem_cache_free(dlm_lock_cache, newlocks[i]);
+		}
+		kfree(newlocks);
+	}
+
+	return ret;
+}
+/* Handle an incoming recovery request.  Not yet implemented: ack with
+ * 0 so the net layer does not consume an undefined return value (the
+ * stub previously fell off the end of a non-void function). */
+int dlm_recovery_request_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	/* TODO: dispatch on the request_type in msg->buf for @dlm */
+	return 0;
+}
+/* Handle an incoming recovery response.  Not yet implemented: ack
+ * with 0 so the net layer does not consume an undefined return value
+ * (the stub previously fell off the end of a non-void function). */
+int dlm_recovery_response_handler(net_msg *msg, u32 len, void *data)
+{
+	dlm_ctxt *dlm = data;
+	/* TODO: process the dlm_reco_response in msg->buf for @dlm */
+	return 0;
+}
+
+
+
+
+
+/* Package @buf as a DLM_NET_RECOVERY_REQUEST_MSG_TYPE net_msg keyed
+ * to this domain and send it to slot @to over @node's connection.
+ * Returns 0 or a negative errno; the packaged message is freed either
+ * way.  NOTE(review): net_send_udp_msg is passed sizeof(*buf), not
+ * the size of the packaged net_msg -- confirm the net layer accounts
+ * for its own header length. */
+static int dlm_send_reco_request(dlm_ctxt *dlm, dlm_reco_request *buf, u16 to, struct inode *node)
+{
+ int ret;
+ net_msg *msg = net_package_message(DLM_NET_RECOVERY_REQUEST_MSG_TYPE,
+ dlm->key, buf, sizeof(*buf),
+ dlm->group_index, to);
+ if (!msg)
+ return -ENOMEM;
+ ret = net_send_udp_msg (node, msg, sizeof(*buf));
+ kfree(msg);
+ return ret;
+}
+
+/* Stub: domain-wide recovery entry point; does nothing yet and
+ * always reports success. */
+static int dlm_recover_domain(dlm_ctxt *dlm)
+{
+
+
+ return 0;
+}
+
+
+#endif
+
+#warning may need to change kfree to put_lock and refcounting here
+/* Drop every lock held on behalf of @dead_node from one grant queue.
+ * Caller must hold res->spinlock. */
+static void dlm_free_dead_locks(struct list_head *queue, u16 dead_node)
+{
+	struct list_head *iter, *tmpiter;
+	dlm_lock *lock;
+
+	list_for_each_safe(iter, tmpiter, queue) {
+		lock = list_entry (iter, dlm_lock, list);
+		if (lock->node == dead_node) {
+			list_del(&lock->list);
+			kfree(lock);
+		}
+	}
+}
+
+/* Walk every lock resource in the hash.  Resources mastered by
+ * @dead_node are flagged DLM_LOCK_RES_RECOVERING and (re)queued on
+ * dlm->reco.resources; for locally mastered resources, any locks held
+ * by @dead_node are purged from all three queues (the three identical
+ * copy-pasted loops now share dlm_free_dead_locks).  @locked tells us
+ * whether the caller already holds dlm->spinlock. */
+static void dlm_do_local_recovery_cleanup(dlm_ctxt *dlm, u16 dead_node, int locked)
+{
+	struct list_head *iter;
+	dlm_lock_resource *res;
+	int i;
+	struct list_head *bucket;
+
+	if (!locked)
+		spin_lock(&dlm->spinlock);
+
+	for (i=0; i<DLM_HASH_SIZE; i++) {
+		bucket = &(dlm->resources[i]);
+		list_for_each(iter, bucket) {
+			res = list_entry (iter, dlm_lock_resource, list);
+			spin_lock(&res->spinlock);
+			if (res->owner == dead_node) {
+				res->state |= DLM_LOCK_RES_RECOVERING;
+				list_del(&res->recovering);
+				list_add_tail(&res->recovering, &dlm->reco.resources);
+			} else if (res->owner == dlm->group_index) {
+				dlm_free_dead_locks(&res->granted, dead_node);
+				dlm_free_dead_locks(&res->converting, dead_node);
+				dlm_free_dead_locks(&res->blocked, dead_node);
+			}
+			spin_unlock(&res->spinlock);
+		}
+	}
+
+	if (!locked)
+		spin_unlock(&dlm->spinlock);
+}
+
+
+/* Heartbeat callback: node @idx went down.  Clear it from the live
+ * node_map and, the first time the death is seen, add it to
+ * recovery_map and purge its locks (locked=1 because dlm->spinlock is
+ * held here).  NOTE(review): nothing here kicks the recovery thread;
+ * presumably it notices on its next timed wakeup -- confirm. */
+void dlm_hb_node_down_cb(struct inode *group, struct inode *node, int idx, void *data)
+{
+ //int ret;
+ //struct inode *group = ptr1;
+ //struct inode *node = ptr2;
+ dlm_ctxt *dlm = data;
+
+ spin_lock(&dlm->spinlock);
+
+ if (!test_bit(idx, dlm->node_map))
+ printk("node %u already removed from nodemap!\n", idx);
+ else
+ clear_bit(idx, dlm->node_map);
+
+ if (test_bit(idx, dlm->recovery_map))
+ printk("node %u already added to recovery map!\n", idx);
+ else {
+ set_bit(idx, dlm->recovery_map);
+ dlm_do_local_recovery_cleanup(dlm, idx, 1);
+ }
+ spin_unlock(&dlm->spinlock);
+}
+
+/* Heartbeat callback: node @idx came up.  Mark it live in node_map --
+ * unless it is still sitting in recovery_map, which is a bug worth
+ * shouting about. */
+void dlm_hb_node_up_cb(struct inode *group, struct inode *node, int idx, void *data)
+{
+	dlm_ctxt *dlm = data;
+
+	spin_lock(&dlm->spinlock);
+	if (test_bit(idx, dlm->recovery_map))
+		printk("BUG!!! node up message on node in recovery (%u)!!!\n", idx);
+	else if (test_bit(idx, dlm->node_map))
+		printk("node %u already in node map!!!\n", idx);
+	else
+		set_bit(idx, dlm->node_map);
+	spin_unlock(&dlm->spinlock);
+}
+
+/* Nonzero if @node is currently marked for recovery.  Caller must
+ * hold dlm->spinlock. */
+int __dlm_hb_node_dead(dlm_ctxt *dlm, int node)
+{
+	return test_bit(node, dlm->recovery_map) ? 1 : 0;
+}
+
+/* Nonzero if @node is currently in the live node map.  Caller must
+ * hold dlm->spinlock. */
+int __dlm_hb_node_up(dlm_ctxt *dlm, int node)
+{
+	return test_bit(node, dlm->node_map) ? 1 : 0;
+}
+
+/* Locking wrapper around __dlm_hb_node_dead. */
+int dlm_hb_node_dead(dlm_ctxt *dlm, int node)
+{
+ int ret;
+ spin_lock(&dlm->spinlock);
+ ret = __dlm_hb_node_dead(dlm, node);
+ spin_unlock(&dlm->spinlock);
+ return ret;
+}
+
+/* Locking wrapper around __dlm_hb_node_up. */
+int dlm_hb_node_up(dlm_ctxt *dlm, int node)
+{
+ int ret;
+ spin_lock(&dlm->spinlock);
+ ret = __dlm_hb_node_up(dlm, node);
+ spin_unlock(&dlm->spinlock);
+ return ret;
+}
+
+/* Elect the node that will master recovery for the current dead node
+ * (possibly updating *new_dead_node if the winner is recovering a
+ * different node first).  Stubbed: always returns slot 0; the #if 0
+ * sketch below races for DLM_RECOVERY_LOCK_NAME in EX to elect. */
+u16 dlm_pick_recovery_master(dlm_ctxt *dlm, u16 *new_dead_node)
+{
+ u16 master = 0;
+#if 0
+ dlm_status ret;
+ dlm_lockstatus lksb;
+
+ ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
+ DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast);
+
+ if (ret == DLM_NORMAL) {
+ // I am master
+ // send message to all nodes saying that I am beginning a recovery session for node XX,
+ // then call dlmunlock???
+
+ } else if (ret == DLM_NOTQUEUED) {
+ // another node is master
+ // wait on reco.new_master != NM_INVALID_SLOT_NUM
+ }
+
+ // at this point, every node in this domain should have reco.new_master and .dead_node set, even
+ // if they have not discovered the dead node on their own
+ //
+ //
+ // atomic_set(&dlm->reco.thread.woken, 0);
+ // 232 status = util_wait_atomic_eq(&dlm->reco.thread.thread_wq,
+ // 233 &dlm->reco.thread.woken,
+ // 234 1, DLM_RECOVERY_THREAD_MS);
+ //
+#endif
+ return master;
+}
Added: trunk/cluster/dlmthread.c
===================================================================
--- trunk/cluster/dlmthread.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/dlmthread.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,329 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmthread.c
+ *
+ * standalone DLM module
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+
+
+#include "heartbeat.h"
+#include "nodemanager.h"
+#include "tcp.h"
+#include "dlmmod.h"
+
+extern spinlock_t dlm_domain_lock;
+extern struct list_head dlm_domains;
+extern u16 dlm_global_index;
+
+#define dlm_lock_is_remote(dlm, lock) ((lock)->node != (dlm)->group_index)
+
+/*
+ * DLM THREAD
+ */
+
+/*
+ * Re-run the queues of @res: grant the head of the converting list if
+ * nothing incompatible holds the resource, then the head of the
+ * blocked list, queueing basts to the locks that stand in the way.
+ * Called with no spinlocks held; takes res->spinlock for the duration
+ * and each lock's spinlock while granting.
+ *
+ * FIX: when a holder blocks the target, record the *target's*
+ * requested mode in lock->highest_blocked.  The old code compared
+ * against target->type in the converting pass (instead of
+ * convert_type) and then stored the holder's own lock->type, so the
+ * bast reported the wrong blocked level.
+ */
+void dlm_shuffle_lists(dlm_ctxt *dlm, dlm_lock_resource *res)
+{
+	dlm_lock *lock, *target;
+	struct list_head *iter, *tmpiter;
+	LIST_HEAD(bast_list);
+	struct list_head *head;
+	s8 hi;
+
+	spin_lock(&res->spinlock);
+
+#if 0
+	{
+		int g=0, c=0, b=0;
+		list_for_each(iter, &res->granted) {
+			g++;
+		}
+		list_for_each(iter, &res->converting) {
+			c++;
+		}
+		list_for_each(iter, &res->blocked) {
+			b++;
+		}
+		printk("(%d) granted: %d, converting: %d, blocked: %d\n", current->pid, g, c, b);
+	}
+#endif
+
+converting:
+	if (list_empty(&res->converting))
+		goto blocked;
+	target = list_entry(res->converting.next, dlm_lock, list);
+	if (target->convert_type == LKM_IVMODE) {
+		printk("eeek!!! converting a lock with no convert_type!!!!\n");
+		BUG();
+	}
+	/* holders that conflict with the requested convert mode */
+	head = &res->granted;
+	list_for_each(iter, head) {
+		lock = list_entry(iter, dlm_lock, list);
+		if (lock==target)
+			continue;
+		if (!dlm_lock_compatible(lock->type, target->convert_type)) {
+			if (lock->highest_blocked == LKM_IVMODE)
+				list_add(&lock->ast_list, &bast_list);
+			if (lock->highest_blocked < target->convert_type)
+				lock->highest_blocked = target->convert_type;
+		}
+	}
+	head = &res->converting;
+	list_for_each(iter, head) {
+		lock = list_entry(iter, dlm_lock, list);
+		if (lock==target)
+			continue;
+		if (!dlm_lock_compatible(lock->type, target->convert_type)) {
+			if (lock->highest_blocked == LKM_IVMODE)
+				list_add(&lock->ast_list, &bast_list);
+			if (lock->highest_blocked < target->convert_type)
+				lock->highest_blocked = target->convert_type;
+		}
+	}
+
+	/* we can convert the lock */
+	if (list_empty(&bast_list)) {
+		spin_lock(&target->spinlock);
+		DLM_ASSERT(target->highest_blocked == LKM_IVMODE);
+
+		dlmprintk("calling ast for converting lock: %*s, have: %d, granting: %d, node: %u\n",
+			  res->lockname.len, res->lockname.name, target->type, target->convert_type, target->node);
+
+		target->type = target->convert_type;
+		target->convert_type = LKM_IVMODE;
+		list_del(&target->list);
+		list_add_tail(&target->list, &res->granted);
+
+		if (target->node == dlm->group_index) {
+			DLM_ASSERT(target->lksb);
+			DLM_ASSERT(target->lksb->status);
+
+			target->lksb->status = DLM_NORMAL;
+		} else {
+			dlmprintk0("nonlocal lock, not setting DLM_NORMAL in lksb\n");
+		}
+
+		spin_unlock(&target->spinlock);
+
+		if (dlm_do_ast(dlm, res, target) < 0)
+			printk("eek\n");
+		/* go back and check for more */
+		goto converting;
+	}
+
+blocked:
+	if (list_empty(&res->blocked)) {
+		goto basts;
+	}
+	target = list_entry(res->blocked.next, dlm_lock, list);
+
+	/* holders that conflict with the requested mode */
+	head = &res->granted;
+	list_for_each(iter, head) {
+		lock = list_entry(iter, dlm_lock, list);
+		if (lock==target)
+			continue;
+		if (!dlm_lock_compatible(lock->type, target->type)) {
+			if (lock->highest_blocked == LKM_IVMODE)
+				list_add(&lock->ast_list, &bast_list);
+			if (lock->highest_blocked < target->type)
+				lock->highest_blocked = target->type;
+		}
+	}
+
+	head = &res->converting;
+	list_for_each(iter, head) {
+		lock = list_entry(iter, dlm_lock, list);
+		if (lock==target)
+			continue;
+		if (!dlm_lock_compatible(lock->type, target->type)) {
+			if (lock->highest_blocked == LKM_IVMODE)
+				list_add(&lock->ast_list, &bast_list);
+			if (lock->highest_blocked < target->type)
+				lock->highest_blocked = target->type;
+		}
+	}
+
+	/* we can grant the blocked lock (only
+	 * possible if converting list empty) */
+	if (list_empty(&bast_list)) {
+		spin_lock(&target->spinlock);
+		DLM_ASSERT(target->highest_blocked == LKM_IVMODE);
+
+		dlmprintk("calling ast for blocked lock: %*s, granting: %d, node: %u\n",
+			  res->lockname.len, res->lockname.name, target->type, target->node);
+
+		// target->type is already correct
+		list_del(&target->list);
+		list_add_tail(&target->list, &res->granted);
+
+		if (target->node == dlm->group_index) {
+			DLM_ASSERT(target->lksb);
+			DLM_ASSERT(target->lksb->status);
+
+			target->lksb->status = DLM_NORMAL;
+		} else {
+			dlmprintk0("nonlocal lock, not setting DLM_NORMAL in lksb\n");
+		}
+
+		spin_unlock(&target->spinlock);
+
+		if (dlm_do_ast(dlm, res, target) < 0)
+			printk("eek\n");
+		/* go back and check for more */
+		goto converting;
+	}
+
+basts:
+	/* deliver one bast per blocked holder, at the highest mode that
+	 * was blocked behind it, and reset for the next pass */
+	list_for_each_safe(iter, tmpiter, &bast_list) {
+		lock = list_entry(iter, dlm_lock, ast_list);
+		spin_lock(&lock->spinlock);
+		DLM_ASSERT(lock->highest_blocked > LKM_IVMODE);
+		hi = lock->highest_blocked;
+		lock->highest_blocked = LKM_IVMODE;
+		list_del(&lock->ast_list);
+		spin_unlock(&lock->spinlock);
+
+		if (dlm_do_bast(dlm, res, lock, hi) < 0)
+			printk("eeek\n");
+	}
+	spin_unlock(&res->spinlock);
+}
+
+
+/* must have NO locks when calling this */
+/* If @res is given, put it on the domain dirty list (idempotent via
+ * DLM_LOCK_RES_DIRTY), then wake the dlm worker thread.  Takes
+ * dlm->spinlock then res->spinlock itself, hence the no-locks rule. */
+void dlm_kick_thread(dlm_ctxt *dlm, dlm_lock_resource *res)
+{
+ if (res) {
+ spin_lock(&dlm->spinlock);
+ spin_lock(&res->spinlock);
+ if (!(res->state & DLM_LOCK_RES_DIRTY)) {
+ list_add_tail(&res->dirty, &dlm->dirty_list);
+ res->state |= DLM_LOCK_RES_DIRTY;
+ }
+ spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
+ }
+
+ /* wake the dlm thread */
+ atomic_set(&dlm->thread.woken, 1);
+ wake_up(&dlm->thread.thread_wq);
+}
+
+/* Launch the dlm worker thread (dlm_thread) for this domain */
+/* Forks via kernel_thread, sharing fs/files/signal state with the
+ * caller.  Returns 0 or -EINVAL on fork failure. */
+int dlm_launch_thread(dlm_ctxt *dlm)
+{
+ printk("starting dlm thread...\n");
+ dlm->thread.pid = kernel_thread (dlm_thread, dlm, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+ if (dlm->thread.pid < 0) {
+ printk("unable to launch dlm thread, error=%d", dlm->thread.pid);
+ return -EINVAL;
+ }
+ printk("dlm thread running for %s...\n", dlm->name);
+ return 0;
+}
+
+/* Ask the dlm worker thread to exit (SIGINT breaks it out of its
+ * wait) and block until it completes, then clear the task pointer. */
+void dlm_complete_thread(dlm_ctxt *dlm)
+{
+ printk ("waiting for dlm thread to exit....");
+ send_sig (SIGINT, dlm->thread.task, 0);
+ wait_for_completion (&dlm->thread.complete);
+ printk ("dlm thread exited\n");
+ dlm->thread.task = NULL;
+}
+
+
+
+
+/* Per-domain worker daemon.  Under the recovery read barrier, run
+ * dlm_shuffle_lists on every dirty, locally-mastered resource and
+ * clear its dirty state; then sleep up to DLM_THREAD_MS or until
+ * kicked (dlm_kick_thread).  Exits when the wait is interrupted by a
+ * signal (see dlm_complete_thread).  Always returns 0.
+ * NOTE(review): resources owned by other nodes are skipped but left
+ * on the dirty list with DLM_LOCK_RES_DIRTY set, so they get
+ * rescanned on every pass -- confirm that is intended. */
+int dlm_thread(void *data)
+{
+ int status;
+ struct list_head *iter, *tmpiter;
+ dlm_lock_resource *res;
+ dlm_ctxt *dlm = data;
+
+ util_daemonize ("dlm_thread", strlen("dlm_thread"), 1);
+ dlm->thread.task = current;
+
+ while (1) {
+ down_read(&dlm->recovery_sem);
+ spin_lock(&dlm->spinlock);
+ list_for_each_safe(iter, tmpiter, &dlm->dirty_list) {
+ res = list_entry(iter, dlm_lock_resource, dirty);
+ /* don't shuffle secondary queues */
+ if (res->owner != dlm->group_index)
+ continue;
+ dlm_shuffle_lists(dlm, res);
+ spin_lock(&res->spinlock);
+ list_del(&res->dirty);
+ res->state &= ~DLM_LOCK_RES_DIRTY;
+ spin_unlock(&res->spinlock);
+ }
+ spin_unlock(&dlm->spinlock);
+ up_read(&dlm->recovery_sem);
+
+ atomic_set(&dlm->thread.woken, 0);
+ status = util_wait_atomic_eq(&dlm->thread.thread_wq,
+ &dlm->thread.woken,
+ 1, DLM_THREAD_MS);
+
+ if (status == 0 || status == -ETIMEDOUT) {
+#if 0
+ if (atomic_read(&dlm->thread.woken))
+ printk("aha!!! dlm thread woken!\n");
+ else
+ printk("timed out waiting, running again\n");
+#endif
+ continue;
+ }
+
+ printk("DLM thread got %d while waiting\n", status);
+ break;
+ }
+
+ flush_scheduled_work();
+ complete (&dlm->thread.complete);
+ printk("quitting DLM thread!!!!!!\n");
+ return 0;
+}
Added: trunk/cluster/heartbeat.c
===================================================================
--- trunk/cluster/heartbeat.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/heartbeat.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,869 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.c
+ *
+ * Keeps track of alive nodes in the cluster.
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+
+#include "warning_hack.h"
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/module.h>
+
+#include <linux/linkage.h>
+#include <linux/time.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/net.h>
+#include <linux/unistd.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+
+#include <asm/uaccess.h>
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+
+#include "compat_libfs.h"
+
+#ifndef __user
+#define __user
+#endif
+
+
+/* forward declarations -- everything below is private to heartbeat.c;
+ * the hb_nm_* callbacks are handed to nm_register_callback() in init_hb() */
+static void hb_teardown(void);
+static void hb_nm_group_node_add_cb(void *ptr1, void *ptr2, u16 idx);
+static void hb_nm_group_node_del_cb(void *ptr1, void *ptr2, u16 idx);
+static void hb_nm_node_add_cb(void *ptr1, void *ptr2, u16 idx);
+static void hb_nm_group_add_cb(void *ptr1, void *ptr2, u16 idx);
+static int hb_init_disk_hb_group(struct inode *group, kdev_t dev, u32 bits, u32 blocks, u64 start);
+static ssize_t write_disk(struct file *file, char *buf, size_t size);
+static void hb_do_callbacks(int type, void *ptr1, void *ptr2, int idx);
+static void hb_end_buffer_io_sync(struct buffer_head *bh, int uptodate);
+static int hb_do_node_down(struct inode *group, struct inode *node, int idx);
+static int hb_do_node_up(struct inode *group, struct inode *node, int idx);
+static int hb_do_disk_heartbeat(void *page);
+static int hb_thread(void *data);
+static void hb_complete_thread(void);
+static void hb_kick_thread(void);
+static int hb_launch_thread(void);
+static inline int hb_wait_on_callback_state(int type);
+
+
+
+/* globals */
+extern char *nm_nodename;
+/* hb_lock protects the two group lists below */
+static spinlock_t hb_lock = SPIN_LOCK_UNLOCKED;
+static LIST_HEAD(hb_net_groups);
+static LIST_HEAD(hb_disk_groups);
+/* per-type callback lists plus their FROZEN/READY state; both protected
+ * by hb_cb_lock (see hb_wait_on_callback_state()) */
+static int hb_callback_state[HB_NUM_CB];
+struct list_head hb_callbacks[HB_NUM_CB];
+static spinlock_t hb_cb_lock = SPIN_LOCK_UNLOCKED;
+/* state of the single heartbeat kernel thread */
+static struct task_struct *hb_task = NULL;
+static atomic_t hb_thread_woken = ATOMIC_INIT(0);
+static DECLARE_WAIT_QUEUE_HEAD(hb_thread_wait_queue);
+static struct completion hb_complete;
+static int hb_pid = -1;
+
+/* waiters blocked on a FROZEN callback list sleep here */
+static wait_queue_head_t hb_cb_wq;
+static atomic_t hb_cb_ready = ATOMIC_INIT(0);
+
+
+/* b_end_io handler for heartbeat block I/O: record the result on the
+ * buffer and drop the buffer lock so wait_on_buffer() callers proceed. */
+static void hb_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+{
+ if (!uptodate) {
+ printk("eek! EIO!\n");
+ clear_buffer_uptodate(bh);
+ } else
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+}
+
+
+
+/* Handle a node whose heartbeat margin ran out: fan the event out to all
+ * HB_NODE_DOWN_CB subscribers.  Actually removing the node from the group
+ * is deliberately left to those subscribers (see disabled call below).
+ * @idx is currently unused.  Always returns 0. */
+static int hb_do_node_down(struct inode *group, struct inode *node, int idx)
+{
+ printk("hb_do_node_down: group=%lu, node=%lu\n", group->i_ino, node->i_ino);
+ printk("NOT removing node from group\n");
+ /* intentionally disabled: nm_remove_node_from_group(group, node);
+ * (the unused "int ret" that went with it has been removed) */
+ hb_do_callbacks(HB_NODE_DOWN_CB, group, node, 0);
+ return 0;
+}
+
+/* Handle a node heartbeating for the first time: notify all
+ * HB_NODE_UP_CB subscribers.  @idx is currently unused.  Always 0. */
+static int hb_do_node_up(struct inode *group, struct inode *node, int idx)
+{
+ printk("hb_do_node_up: group=%lu, node=%lu\n", group->i_ino, node->i_ino);
+ hb_do_callbacks(HB_NODE_UP_CB, group, node, 0);
+ return 0;
+}
+
+/* Thin debug wrapper around submit_bh(): log direction, block number and
+ * mapped state before queueing the buffer for I/O. */
+static inline void hb_submit_bh(int rw, struct buffer_head *bh)
+{
+ const char *dir = (rw == WRITE) ? "write" : "read";
+ const char *mapped = buffer_mapped(bh) ? "yes" : "no";
+
+ printk("submit_bh: rw=%s, blocknr=%lu, mapped=%s\n",
+ dir, bh->b_blocknr, mapped);
+ submit_bh(rw, bh);
+}
+
+
+/* One heartbeat pass over every disk-heartbeat group:
+ *  1) write our own timestamp block and fire off reads of everyone else's,
+ *  2) wait for the I/O and compare each node's timestamp with the one we
+ *     saw last pass, decrementing its miss margin when unchanged,
+ *  3) run node-up/node-down notifications for any state changes.
+ * Called repeatedly from hb_thread().
+ * @page: one scratch page, reused across passes, for the live/dead arrays.
+ * Always returns 0. */
+static int hb_do_disk_heartbeat(void *page)
+{
+ nm_group_inode_private *priv;
+ struct inode *group, *node;
+ struct list_head *iter;
+ struct buffer_head *bh;
+ hb_disk_slot *slot;
+ hb_disk_heartbeat_block *hb_block;
+ int ino, idx, ret, i;
+ struct inode **dead_nodes, **live_nodes;
+ LIST_HEAD(tmplist);
+ u64 blkno;
+ cluster_disk *disk;
+
+ // NM_MAX_NODES is 255
+ /* carve the scratch page into two 256-entry inode-pointer arrays.
+ * NOTE(review): 2 * 256 * sizeof(void *) must fit in PAGE_SIZE --
+ * exactly fills a 4k page on 64-bit; confirm for other configs */
+ dead_nodes = page;
+ live_nodes = page + (sizeof(struct inode *) * 256);
+
+ /* steal the whole disk-group list so we can sleep while walking it */
+ spin_lock(&hb_lock);
+ list_splice_init(&hb_disk_groups, &tmplist);
+ spin_unlock(&hb_lock);
+
+ list_for_each(iter, &tmplist) {
+ priv = list_entry(iter, nm_group_inode_private, disk_list);
+ group = priv->inode;
+ disk = &priv->disk;
+
+ /* fresh live/dead arrays for this group */
+ memset(page, 0, PAGE_SIZE);
+ down(&group->i_sem);
+
+ /* pass 1: submit all I/O (write own slot, read the rest) */
+ idx = 0;
+ while ((slot = nm_iterate_group_disk_slots(group, &idx))) {
+retry_submit:
+ bh = slot->bh;
+ node = slot->inode;
+
+ ino = nm_get_node_global_index(node);
+
+ if (ino == nm_this_node(group)) {
+ lock_buffer(bh);
+ if (!buffer_mapped(bh)) {
+ /* re-acquire a mapped buffer and retry */
+ blkno = (unsigned long long) bh->b_blocknr;
+ unlock_buffer(bh);
+ brelse(bh);
+ slot->bh = getblk(disk->dev,
+ blkno,
+ (1 << disk->blocksize_bits));
+ goto retry_submit;
+ }
+ /* our own slot: stamp current time (0 is reserved
+ * to mean "no heartbeat yet", so bump it to 1) */
+ memset(bh->b_data, 0, bh->b_size);
+ hb_block = (hb_disk_heartbeat_block *)bh->b_data;
+ hb_block->time = CURRENT_TIME;
+ if (!hb_block->time)
+ hb_block->time = 1;
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
+ bh->b_end_io = hb_end_buffer_io_sync;
+ hb_submit_bh(WRITE, bh);
+ } else {
+ lock_buffer(bh);
+ if (!buffer_mapped(bh)) {
+ blkno = (unsigned long long) bh->b_blocknr;
+ unlock_buffer(bh);
+ brelse(bh);
+ slot->bh = getblk(disk->dev,
+ blkno,
+ (1 << disk->blocksize_bits));
+ goto retry_submit;
+ }
+ /* force a re-read of the other node's slot */
+ clear_buffer_uptodate(bh);
+ bh->b_end_io = hb_end_buffer_io_sync;
+ hb_submit_bh(READ, bh);
+ }
+ idx++;
+ }
+
+ /* pass 2: wait for the I/O and evaluate each node */
+ idx = 0;
+ while ((slot = nm_iterate_group_disk_slots(group, &idx))) {
+ bh = slot->bh;
+ node = slot->inode;
+
+ ino = nm_get_node_global_index(node);
+
+ wait_on_buffer(bh);
+ hb_block = (hb_disk_heartbeat_block *)bh->b_data;
+ /* NOTE(review): last_time is unsigned long but time is
+ * u64 -- this comparison truncates on 32-bit; verify */
+ if (hb_block->time != slot->last_time) {
+ if (slot->state == HB_NODE_STATE_INIT) {
+ printk("first time for this node!\n");
+ live_nodes[ino] = node;
+ slot->state = HB_NODE_STATE_UP;
+ }
+ /* timestamp changed: node is alive, reset margin */
+ node->i_atime = hb_block->time;
+ slot->last_time = hb_block->time;
+ slot->margin = HB_DISK_MARGIN;
+ hb_do_callbacks(HB_NODE_RESPONDED_CB, group, node, HB_TYPE_DISK);
+ } else {
+ slot->margin--;
+ printk("node %d missed. margin=%d\n", ino, slot->margin);
+ }
+
+ /* never declare ourselves dead */
+ if (ino != nm_this_node(group) && slot->margin <= 0) {
+ printk("node %d JUST DIED!!!!\n", ino);
+ dead_nodes[ino] = node;
+ slot->state = HB_NODE_STATE_DOWN;
+ }
+ idx++;
+ }
+
+ up(&group->i_sem);
+
+ /* deliberately NOT holding group->i_sem while running the
+ * node-up/down notifications: callbacks may need to modify
+ * the group and will take i_sem themselves */
+ for (i=0; i<NM_MAX_NODES; i++) {
+ if (live_nodes[i])
+ ret = hb_do_node_up(group, live_nodes[i], i);
+ else if (dead_nodes[i])
+ ret = hb_do_node_down(group, dead_nodes[i], i);
+ }
+ }
+
+ /* put the groups back on the global list */
+ spin_lock(&hb_lock);
+ list_splice(&tmplist, &hb_disk_groups);
+ spin_unlock(&hb_lock);
+ return 0;
+}
+
+
+/* Main loop of the global heartbeat kernel thread: run a disk heartbeat
+ * pass, then sleep until kicked or HB_THREAD_MS elapses.  Exits on any
+ * wait status other than 0/-ETIMEDOUT (i.e. -EINTR from
+ * hb_complete_thread()'s SIGINT).  Fix vs. original: the scratch page
+ * was never freed on exit. */
+static int hb_thread(void *data)
+{
+ int status;
+ void *page;
+
+ page = (void *) __get_free_page(GFP_KERNEL);
+ if (!page)
+ /* NOTE(review): bailing out here never signals hb_complete,
+ * so a later hb_complete_thread() would block forever --
+ * TODO: rework startup failure handling */
+ return -ENOMEM;
+
+ util_daemonize ("hb_thread", strlen("hb_thread"), 1);
+ hb_task = current;
+
+ while (1) {
+ status = hb_do_disk_heartbeat(page);
+
+ /* clear the wakeup flag BEFORE sleeping so kicks racing
+ * with the pass above are not lost */
+ atomic_set(&hb_thread_woken, 0);
+ status = util_wait_atomic_eq(&hb_thread_wait_queue,
+ &hb_thread_woken,
+ 1, HB_THREAD_MS);
+
+ if (status == 0 || status == -ETIMEDOUT) {
+#if 0
+ if (atomic_read(&hb_thread_woken))
+ printk("aha!!! hb thread woken!\n");
+ else
+ printk("hb thread timed out waiting, running again\n");
+#endif
+ continue;
+ }
+ printk("hb thread got %d while waiting\n", status);
+ break;
+ }
+
+ flush_scheduled_work();
+ free_page((unsigned long) page); /* was leaked in the original */
+ complete (&hb_complete);
+ printk("quitting hb thread!!!!!!\n");
+ return 0;
+}
+
+
+/* Wake the heartbeat thread for an immediate pass.  The flag must be set
+ * before the wake_up() so the thread's wait condition observes it. */
+static void hb_kick_thread(void)
+{
+ atomic_set(&hb_thread_woken, 1);
+ wake_up(&hb_thread_wait_queue);
+}
+
+/* Launch the hb thread for the mounted volume.
+ * Resets completion/task state first so hb_complete_thread() can later
+ * synchronize teardown.  Returns 0 on success or the negative error from
+ * kernel_thread() (the original flattened it to -EINVAL and dropped the
+ * newline in the error printk). */
+static int hb_launch_thread(void)
+{
+ hb_pid = -1;
+ hb_task = NULL;
+ init_completion (&hb_complete);
+
+ printk("starting hb thread...\n");
+ hb_pid = kernel_thread (hb_thread, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+ if (hb_pid < 0) {
+ printk("unable to launch hb thread, error=%d\n", hb_pid);
+ return hb_pid;
+ }
+ printk("hb thread running...\n");
+ return 0;
+}
+
+/* Stop the heartbeat thread and wait for it to exit.  Mirrors
+ * dlm_complete_thread(): SIGINT breaks the thread out of its wait and it
+ * signals hb_complete on the way out.  hb_task is cleared only after the
+ * completion fires. */
+static void hb_complete_thread(void)
+{
+ printk ("waiting for hb thread to exit....");
+ send_sig (SIGINT, hb_task, 0);
+ wait_for_completion (&hb_complete);
+ printk ("hb thread exited\n");
+ hb_task = NULL;
+}
+
+
+
+
+
+
+
+/* Attach on-disk heartbeat parameters to @group and mark it ready for
+ * member additions.  Takes an extra inode reference that heartbeat holds
+ * for as long as it tracks the group; the ref is dropped if
+ * nm_make_group_ready() fails.  Returns 0 on success (or if the group was
+ * already ready), negative error otherwise.
+ * Fix vs. original: the !priv path used to "goto leave" and iput() a
+ * reference that had never been igrab()ed. */
+static int hb_init_disk_hb_group(struct inode *group, kdev_t dev, u32 bits, u32 blocks, u64 start)
+{
+ int ret;
+ cluster_disk *disk;
+ nm_group_inode_private *priv;
+
+ priv = group->u.generic_ip;
+ if (!priv)
+ return -EINVAL; /* no ref taken yet, so no iput here */
+
+ if (priv->state == NM_GROUP_READY)
+ return 0;
+
+ /* hold an extra ref as long as hb keeps track of the group */
+ igrab(group);
+
+ disk = &priv->disk;
+ if (blocks > NM_MAX_NODES)
+ blocks = NM_MAX_NODES;
+ disk->dev = dev;
+ disk->blocksize_bits = bits;
+ disk->num_blocks = blocks;
+ disk->start_block = start;
+ util_init_rarray(&disk->slots, sizeof(hb_disk_slot));
+
+ /* start allowing group additions */
+ ret = nm_make_group_ready(group);
+ if (ret < 0)
+ iput(group); /* undo the igrab() above */
+
+ return ret;
+}
+
+
+/* transaction-file write handler for the hb filesystem's ".disk" node.
+ * @buf carries an hb_op request and is ALSO reused as the reply buffer:
+ * replies are "<errno>: <message>" text (GET_NODE_MAP additionally
+ * appends the raw node bitmap).  Returns the reply length in bytes. */
+static ssize_t write_disk(struct file *file, char *buf, size_t size)
+{
+ hb_op *data;
+ struct inode *group = NULL;
+ struct file *filp = NULL;
+ kdev_t dev;
+ int ret, tmpret;
+ nm_group_inode_private *priv;
+ u32 tmpmap[8]; /* 256 bits -- covers NM_MAX_NODES (255) */
+
+ printk("write_disk\n");
+
+ if (size < sizeof(*data))
+ return -EINVAL;
+ data = (hb_op *) buf; if (data->magic != HB_OP_MAGIC)
+ return -EINVAL;
+
+ switch (data->opcode)
+ {
+ case HB_OP_START_DISK_HEARTBEAT:
+ /* only 512..4096-byte heartbeat blocks are supported */
+ if (data->bits < 9 || data->bits > 12) {
+ ret = sprintf(buf, "%d: bad blocksize bits! %u", -EINVAL, data->bits);
+ break;
+ }
+ group = nm_get_group_by_num(data->group_num);
+ if (!group || !group->u.generic_ip) {
+ ret = sprintf(buf, "%d: bad group number! %u", -EINVAL, data->group_num);
+ break;
+ }
+ priv = group->u.generic_ip;
+ /* the caller's uuid must match the group's configured disk */
+ if (strncmp(priv->disk.uuid, data->disk_uuid, CLUSTER_DISK_UUID_LEN) != 0) {
+ ret = sprintf(buf, "%d: bad disk uuid!", -EINVAL);
+ break;
+ }
+ filp = fget(data->fd);
+ if (!filp) {
+ ret = sprintf(buf, "%d: bad fd!", -EINVAL);
+ break;
+ }
+ dev = filp->f_dentry->d_inode->i_rdev;
+ tmpret = hb_init_disk_hb_group(group, dev, data->bits, data->blocks, data->start);
+ if (tmpret < 0) {
+ fput(filp);
+ ret = sprintf(buf, "%d: failed to init disk heartbeat for group %u!",
+ -EINVAL, data->group_num);
+ } else {
+ /* NOTE(review): on success the fget() ref is kept,
+ * presumably to pin the device open while heartbeat
+ * uses it -- confirm who eventually fput()s it */
+ ret = sprintf(buf, "0: disk heartbeat started for group %u!",
+ data->group_num);
+ }
+ break;
+
+ case HB_OP_GET_NODE_MAP:
+ group = nm_get_group_by_num(data->group_num);
+ if (!group || !group->u.generic_ip) {
+ ret = sprintf(buf, "%d: bad group number! %u", -EINVAL, data->group_num);
+ break;
+ }
+
+ /* reply is "0: " followed by the raw bitmap bytes */
+ if ((ret = hb_fill_node_map(group, tmpmap, sizeof(tmpmap))) == 0) {
+ ret = sprintf(buf, "0: ");
+ buf += ret;
+ memcpy(buf, tmpmap, sizeof(tmpmap));
+ ret += sizeof(tmpmap);
+ } else {
+ ret = sprintf(buf, "%d: error occurred in hb_fill_node_map", ret);
+ }
+ break;
+
+ default:
+ ret = sprintf(buf, "%d: bad opcode! %u", -EINVAL, data->opcode);
+ break;
+ }
+
+ /* nm_get_group_by_num() returned a referenced inode */
+ if (group)
+ iput(group);
+
+ return ret;
+}
+
+
+extern struct file_operations transaction_ops;
+
+/*----------------------------------------------------------------------------*/
+/*
+ * populating the filesystem.
+ */
+/* Populate the hb filesystem superblock: one ".disk" transaction file
+ * whose writes are routed to write_disk() via the ops table stored in the
+ * generic sb member.  Returns simple_fill_super()'s result.
+ * Fix vs. original: the ops allocation only reserved ONE write_op slot,
+ * but write_op[HB_Disk] (index HB_WriteOpArraySize - 1) is written below
+ * -- size the trailing array for the full index range. */
+static int hb_fill_super(struct super_block * sb, void * data, int silent)
+{
+ int ret;
+ TA_write_ops *ops;
+ size_t size;
+ static struct tree_descr hb_files[] = {
+ [HB_Disk] = {".disk", &transaction_ops, S_IWUSR},
+ /* last one */ {""}
+ };
+
+ size = sizeof(TA_write_ops) + (HB_WriteOpArraySize * sizeof(TA_write_op *));
+ ops = kmalloc(size, GFP_KERNEL);
+ if (!ops)
+ return -ENOMEM;
+
+ memset(ops, 0, size);
+ ops->num_ops = HB_WriteOpArraySize;
+ ops->write_op[HB_Disk] = write_disk;
+
+ printk("calling simple_fill_super...\n");
+ ret = simple_fill_super(sb, 0x5551212f, hb_files);
+ if (ret >= 0)
+ TA_GENERIC_SB_MEMBER(sb) = ops;
+ else
+ kfree(ops);
+ return ret;
+}
+
+/* 2.4-style read_super entry point: delegate to hb_fill_super() and map
+ * its negative-error return onto the NULL-on-failure convention. */
+static struct super_block *hb_read_super (struct super_block *sb, void *data, int silent)
+{
+ printk("welcome to hb_read_super!!!\n");
+ if (hb_fill_super(sb, data, silent) < 0)
+ return NULL;
+ return sb;
+}
+
+
+static DECLARE_FSTYPE (hb_fs_type, "hb", hb_read_super, FS_SINGLE|FS_LITTER);
+
+
+/* TODO: make callbacks all return int */
+/* nodemanager callback: a node was added to @group (slot @idx).  For
+ * disk-heartbeat groups, grow the slot rarray, take a ref on the node
+ * inode and attach the buffer_head for the node's heartbeat block.
+ * Fix vs. original: valid block offsets are 0..num_blocks-1, so the
+ * bounds check must reject ino == num_blocks as well (was "ino >"). */
+static void hb_nm_group_node_add_cb(void *ptr1, void *ptr2, u16 idx)
+{
+ hb_disk_slot *slot;
+ struct inode *group = ptr1;
+ struct inode *node = ptr2;
+ cluster_disk *disk;
+ nm_group_inode_private *priv;
+ int ino, ret = 0;
+ u64 block;
+
+ printk("hb_nm_group_node_add_cb: group=%lu, node=%lu, idx=%u\n",
+ group->i_ino, node->i_ino, idx);
+
+ down(&group->i_sem);
+ priv = group->u.generic_ip;
+ if (!priv) {
+ printk("eek! bad group inode!\n");
+ goto leave;
+ }
+ disk = &priv->disk;
+ /* a non-empty uuid marks this as a disk-heartbeat group */
+ if (disk->uuid[0]) {
+ ret = util_resize_rarray(&disk->slots, idx+1);
+ if (ret < 0) {
+ printk("eeeeeeek!!!! failed to resize disk state data\n");
+ goto leave;
+ }
+
+ ino = nm_get_node_global_index(node);
+ if (ino >= disk->num_blocks) {
+ printk("disk heartbeat area does not have enough blocks!\n");
+ goto leave;
+ }
+ block = ino + disk->start_block;
+
+ slot = util_rarray_idx_to_slot(&disk->slots, idx);
+ if (!slot) {
+ printk("eeeeeeek!!!! failed to get disk state data pointer: %d\n", idx);
+ goto leave;
+ }
+ /* ref dropped in hb_nm_group_node_del_cb() */
+ slot->inode = igrab(node);
+ slot->last_time = 0;
+ slot->margin = HB_INITIAL_DISK_MARGIN;
+#warning needs to change for 2.6
+ slot->bh = getblk(disk->dev, (int)block, (1 << disk->blocksize_bits));
+ slot->state = HB_NODE_STATE_INIT;
+ } else {
+ printk("doing nothing for group add for non-disk heartbeat group\n");
+ }
+
+leave:
+ up(&group->i_sem);
+ return;
+}
+
+/* nodemanager callback: a node left @group (slot @idx).  Undo what
+ * hb_nm_group_node_add_cb() set up: drop the node-inode ref, wait out and
+ * release any in-flight heartbeat buffer, and clear the slot. */
+static void hb_nm_group_node_del_cb(void *ptr1, void *ptr2, u16 idx)
+{
+ hb_disk_slot *slot;
+ struct inode *group = ptr1;
+ struct inode *node = ptr2;
+ cluster_disk *disk;
+ nm_group_inode_private *priv;
+ int ret = -EINVAL;
+
+ printk("hb_nm_group_node_del_cb: group=%lu, node=%lu, idx=%u\n",
+ group->i_ino, node->i_ino, idx);
+
+ down(&group->i_sem);
+ priv = group->u.generic_ip;
+ if (!priv) {
+ printk("eek! bad group inode!\n");
+ goto leave;
+ }
+ disk = &priv->disk;
+ slot = util_rarray_idx_to_slot(&disk->slots, idx);
+ if (!slot) {
+ printk("eeeeeeek!!!! failed to get disk state data pointer: %d\n", idx);
+ goto leave;
+ }
+ /* sanity: the slot must still track the node being removed */
+ if (slot->inode!=node) {
+ printk("eeeeeeek!!!! node inode changed!\n");
+ goto leave;
+ }
+ /* drops the ref taken by igrab() in the add callback */
+ iput(node);
+ if (slot->bh) {
+ /* let any in-flight heartbeat I/O finish before release */
+ wait_on_buffer(slot->bh);
+ brelse(slot->bh);
+ }
+ memset(slot, 0, sizeof(hb_disk_slot));
+ ret = 0;
+leave:
+
+ up(&group->i_sem);
+ printk("hb_nm_group_node_del_cb done: %d\n", ret);
+ return;
+}
+
+/* nodemanager callback: a node was defined cluster-wide.  Heartbeat has
+ * nothing to do until the node joins a group, so this is a no-op stub. */
+static void hb_nm_node_add_cb(void *ptr1, void *ptr2, u16 idx)
+{
+ //struct inode *node = ptr1;
+}
+
+/* nodemanager callback: a new group was created.  Register it with
+ * heartbeat: every group goes on the net list, and groups with a disk
+ * uuid additionally go on the disk list that hb_do_disk_heartbeat()
+ * walks each pass. */
+static void hb_nm_group_add_cb(void *ptr1, void *ptr2, u16 idx)
+{
+ struct inode *group = ptr1;
+ nm_group_inode_private *priv;
+
+ printk("hb_nm_group_add_cb: group=%lu, idx=%u\n",
+ group->i_ino, idx);
+
+ priv = group->u.generic_ip;
+ if (!priv) {
+ printk("eek! bad group inode!\n");
+ return;
+ }
+
+ spin_lock(&hb_lock);
+ list_add_tail(&priv->net_list, &hb_net_groups);
+ if (priv->disk.uuid[0]) {
+ printk("adding priv=%p inode=%p to disk group list\n", priv, group);
+ list_add_tail(&priv->disk_list, &hb_disk_groups);
+ }
+ spin_unlock(&hb_lock);
+}
+
+/* callback-list states: FROZEN while one thread walks/modifies a list
+ * (others wait in hb_wait_on_callback_state()), READY otherwise */
+enum {
+ HB_CB_STATE_FROZEN = 0,
+ HB_CB_STATE_READY
+};
+
+/* Module init: set up callback lists, hook into nodemanager, start the
+ * heartbeat thread and register the "hb" filesystem.  On any failure the
+ * partial setup is unwound.  Fixes vs. original: the failure return is a
+ * real errno instead of -1, and a filesystem-registration failure now
+ * stops the already-launched heartbeat thread (hb_teardown() does not). */
+static int __init init_hb(void)
+{
+ int retval = -EINVAL, i;
+ printk("loading heartbeat module: nodename is %s\n", nm_nodename);
+
+ /* NOTE(review): result deliberately ignored; proc entry is
+ * best-effort and removed again in hb_teardown() */
+ if (proc_mkdir("cluster/heartbeat", 0)) {
+ // ???
+ }
+
+ //hb_net_timestamps = __get_free_page(GFP_KERNEL);
+ //if (!hb_net_timestamps)
+ // goto done;
+
+ for (i=HB_NODE_DOWN_CB; i<HB_NUM_CB; i++)
+ INIT_LIST_HEAD(&hb_callbacks[i]);
+ init_waitqueue_head(&hb_cb_wq);
+ for (i=HB_NODE_DOWN_CB; i<HB_NUM_CB; i++)
+ hb_callback_state[i] = HB_CB_STATE_READY;
+
+ if (nm_register_callback(NM_GROUP_NODE_DEL_CB, hb_nm_group_node_del_cb))
+ goto done;
+ if (nm_register_callback(NM_GROUP_NODE_ADD_CB, hb_nm_group_node_add_cb))
+ goto done;
+ if (nm_register_callback(NM_NODE_ADD_CB, hb_nm_node_add_cb))
+ goto done;
+ if (nm_register_callback(NM_GROUP_ADD_CB, hb_nm_group_add_cb))
+ goto done;
+
+ if (hb_launch_thread() < 0)
+ goto done;
+
+ retval = register_filesystem(&hb_fs_type);
+ if (retval)
+ hb_complete_thread(); /* don't leave the thread running */
+done:
+ if (retval)
+ hb_teardown();
+ return retval;
+}
+
+/* Module exit: freeze every callback list so no new callbacks run, stop
+ * the heartbeat thread, unhook from nodemanager and drop the filesystem.
+ * NOTE(review): unregister_filesystem() runs last, after the thread is
+ * gone -- verify no hb mount can still reach the freed state by then. */
+static void __exit exit_hb(void)
+{
+ int i;
+ spin_lock(&hb_cb_lock);
+ for (i=HB_NODE_DOWN_CB; i<HB_NUM_CB; i++) {
+ /* wait for in-flight callbacks, then freeze permanently */
+ hb_wait_on_callback_state(i);
+ hb_callback_state[i] = HB_CB_STATE_FROZEN;
+ }
+ spin_unlock(&hb_cb_lock);
+
+ hb_complete_thread();
+ hb_teardown();
+ unregister_filesystem(&hb_fs_type);
+ printk("unloading heartbeat module\n");
+}
+
+/* Undo init_hb()'s registrations.  Called both from the init failure path
+ * and from exit_hb(); nm_unregister_callback() is presumably safe to call
+ * for callbacks that never got registered (partial-init case). */
+static void hb_teardown(void)
+{
+ nm_unregister_callback(NM_GROUP_NODE_DEL_CB, hb_nm_group_node_del_cb);
+ nm_unregister_callback(NM_GROUP_NODE_ADD_CB, hb_nm_group_node_add_cb);
+ nm_unregister_callback(NM_NODE_ADD_CB, hb_nm_node_add_cb);
+ nm_unregister_callback(NM_GROUP_ADD_CB, hb_nm_group_add_cb);
+ remove_proc_entry("cluster/heartbeat", NULL);
+ //if (hb_net_timestamps)
+ // kfree(hb_net_timestamps);
+}
+
+module_init(init_hb)
+module_exit(exit_hb)
+
+
+/* Fill @map (a bitmap of @size bytes) with the currently-up nodes of
+ * @group: for disk-heartbeat groups one bit per slot in HB_NODE_STATE_UP,
+ * otherwise a straight copy of the group's slot bitmap.
+ * Returns 0, or -EINVAL if the map is too small. */
+int hb_fill_node_map(struct inode *group, void *map, int size)
+{
+ hb_disk_slot *slot;
+ int idx = 0;
+ nm_group_inode_private *priv;
+
+ priv = group->u.generic_ip;
+
+ memset(map, 0, size);
+ down(&group->i_sem);
+
+ if (priv->disk.uuid[0]) {
+ while ((slot = nm_iterate_group_disk_slots(group, &idx))) {
+ /* NOTE(review): idx is a BIT index but size is in
+ * bytes -- this bound looks mixed-unit (size bytes
+ * hold 8*size bits); confirm intended limit */
+ if (idx >= size-1) {
+ printk("map size (%d) too small for idx (%d)\n",
+ size, idx);
+ up(&group->i_sem);
+ return -EINVAL;
+ }
+ if (slot->state == HB_NODE_STATE_UP)
+ set_bit(idx, map);
+ idx++;
+ }
+ } else {
+ printk("filling straight from slot bitmap for non-disk heartbeat group\n");
+ memcpy(map, priv->slot_bitmap, size);
+ }
+
+ up(&group->i_sem);
+
+ return 0;
+}
+
+
+/* Wait until callback list @type is no longer FROZEN.
+ * Must be entered with hb_cb_lock held; the lock is dropped while
+ * sleeping and re-taken before re-checking, so the caller holds it again
+ * on return.  Returns 0 when READY, -EINTR if interrupted (in which case
+ * the lock IS still held -- caller must unlock). */
+static inline int hb_wait_on_callback_state(int type)
+{
+ while (hb_callback_state[type] == HB_CB_STATE_FROZEN) {
+ spin_unlock(&hb_cb_lock);
+ atomic_set(&hb_cb_ready, 0);
+ if (util_wait_atomic_eq(&hb_cb_wq, &hb_cb_ready, 1, 0) == -EINTR) {
+ return -EINTR;
+ }
+ spin_lock(&hb_cb_lock);
+ }
+ return 0;
+}
+
+/* Register @func (with opaque @data) for heartbeat event @type, keeping
+ * the per-type list sorted by ascending @priority (insert before the
+ * first entry with a strictly greater priority).
+ * Returns 0, -EINVAL for a bad type, -ENOMEM, or -EINTR if interrupted
+ * while the list was frozen. */
+int hb_register_callback(int type, hb_cb_func *func, void *data, int priority)
+{
+ hb_callback_func *f, *tmp;
+ struct list_head *iter;
+ int ret;
+
+ if (type < HB_NODE_DOWN_CB || type >= HB_NUM_CB)
+ return -EINVAL;
+ /* allocate before taking the lock */
+ f = kmalloc(sizeof(hb_callback_func), GFP_KERNEL);
+ if (f == NULL)
+ return -ENOMEM;
+ memset(f, 0, sizeof(hb_callback_func));
+ f->func = func;
+ f->data = data;
+ f->priority = priority;
+
+ spin_lock(&hb_cb_lock);
+ ret = hb_wait_on_callback_state(type);
+ if (ret < 0) {
+ spin_unlock(&hb_cb_lock);
+ kfree(f);
+ return ret;
+ }
+
+ /* priority-ordered insert: list_add_tail() before iter places the
+ * new entry ahead of the first higher-priority one */
+ list_for_each(iter, &hb_callbacks[type]) {
+ tmp = list_entry (iter, hb_callback_func, list);
+ if (priority < tmp->priority) {
+ list_add_tail(&f->list, iter);
+ spin_unlock(&hb_cb_lock);
+ return 0;
+ }
+ }
+ list_add_tail(&f->list, &hb_callbacks[type]);
+ spin_unlock(&hb_cb_lock);
+ return 0;
+}
+
+/* Remove the callback matching (@func, @data) from list @type.
+ * The list is marked FROZEN while being walked -- the walk itself runs
+ * unlocked, relying on the FROZEN state to exclude other writers and
+ * hb_do_callbacks().  Returns 0 on removal, -EINVAL if not found or bad
+ * type, -EINTR if interrupted while waiting for the list. */
+int hb_unregister_callback(int type, hb_cb_func *func, void *data)
+{
+ struct list_head *iter, *tmpiter;
+ int ret = -EINVAL;
+ hb_callback_func *f;
+
+ if (type < HB_NODE_DOWN_CB || type >= HB_NUM_CB)
+ return -EINVAL;
+
+ spin_lock(&hb_cb_lock);
+ ret = hb_wait_on_callback_state(type);
+ if (ret < 0) {
+ spin_unlock(&hb_cb_lock);
+ return ret;
+ }
+ hb_callback_state[type] = HB_CB_STATE_FROZEN;
+ spin_unlock(&hb_cb_lock);
+
+ list_for_each_safe(iter, tmpiter, &hb_callbacks[type]) {
+ f = list_entry (iter, hb_callback_func, list);
+ if (f->func == func && f->data == data) {
+ list_del(&f->list);
+ kfree(f);
+ ret = 0;
+ break;
+ }
+ }
+
+ /* thaw the list and wake anyone blocked on it */
+ spin_lock(&hb_cb_lock);
+ hb_callback_state[type] = HB_CB_STATE_READY;
+ atomic_set(&hb_cb_ready, 1);
+ wake_up(&hb_cb_wq);
+ spin_unlock(&hb_cb_lock);
+ return ret;
+}
+
+
+
+/* Invoke every registered callback of @type with (@ptr1, @ptr2, @idx).
+ * The list is FROZEN for the duration so registration/unregistration
+ * cannot race the walk; the callbacks themselves run without any lock
+ * held.  If the wait for a ready list is interrupted, the entire event
+ * is dropped (logged below). */
+static void hb_do_callbacks(int type, void *ptr1, void *ptr2, int idx)
+{
+ struct list_head *iter;
+ hb_callback_func *f;
+ int ret;
+
+ spin_lock(&hb_cb_lock);
+ ret = hb_wait_on_callback_state(type);
+ if (ret < 0) {
+ spin_unlock(&hb_cb_lock);
+ printk("missed hb callback(%d) due to EINTR!\n", type);
+ return;
+ }
+ hb_callback_state[type] = HB_CB_STATE_FROZEN;
+ spin_unlock(&hb_cb_lock);
+
+ list_for_each(iter, &hb_callbacks[type]) {
+ f = list_entry (iter, hb_callback_func, list);
+ (f->func) (ptr1, ptr2, idx, f->data);
+ }
+
+ /* thaw and wake waiters */
+ spin_lock(&hb_cb_lock);
+ hb_callback_state[type] = HB_CB_STATE_READY;
+ atomic_set(&hb_cb_ready, 1);
+ wake_up(&hb_cb_wq);
+ spin_unlock(&hb_cb_lock);
+}
Added: trunk/cluster/heartbeat.h
===================================================================
--- trunk/cluster/heartbeat.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/heartbeat.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,129 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_HEARTBEAT_H
+#define CLUSTER_HEARTBEAT_H
+
+
+/* per-node heartbeat liveness states */
+enum {
+ HB_NODE_STATE_INIT = 0, /* tracked, but no heartbeat seen yet */
+ HB_NODE_STATE_DOWN,
+ HB_NODE_STATE_UP
+};
+
+struct _heartbeat_ctxt
+{
+ int dummy;
+};
+
+/* per-node, per-group disk heartbeat tracking state */
+typedef struct _hb_disk_slot
+{
+ struct inode *inode; /* node inode; extra ref held while tracked */
+ struct buffer_head *bh; /* buffer for this node's heartbeat block */
+ struct list_head list;
+ u64 last_time; /* last timestamp read from disk; widened from
+ * unsigned long to match
+ * hb_disk_heartbeat_block.time (the old type
+ * truncated the comparison on 32-bit) */
+ u16 margin; /* remaining allowed missed heartbeats */
+ u16 state; /* HB_NODE_STATE_* */
+} hb_disk_slot;
+
+
+
+#define HB_THREAD_MS 2000 // every 2 seconds
+
+
+/* request header magic for hb_op writes to the ".disk" file */
+#define HB_OP_MAGIC 0xf00d
+enum {
+ HB_OP_START_DISK_HEARTBEAT=371,
+ HB_OP_GET_NODE_MAP
+};
+
+/* wire format written by userspace into the hb filesystem's ".disk"
+ * transaction file (see write_disk()) */
+typedef struct _hb_op
+{
+ u16 magic; /* must be HB_OP_MAGIC */
+ u16 opcode; /* HB_OP_* */
+ unsigned int fd; /* caller's open fd for the heartbeat device */
+ char disk_uuid[CLUSTER_DISK_UUID_LEN+1];
+ u16 group_num;
+ u32 bits; /* heartbeat blocksize bits (9..12 accepted) */
+ u32 blocks; /* number of heartbeat blocks (clamped to NM_MAX_NODES) */
+ u64 start; /* first heartbeat block on the device */
+} hb_op;
+
+/* heartbeat transport types, passed as idx to HB_NODE_RESPONDED_CB */
+enum {
+ HB_TYPE_DISK = 0,
+ HB_TYPE_NET
+};
+
+
+/* callback stuff */
+
+/* event types for hb_register_callback() */
+enum {
+ HB_NODE_DOWN_CB = 0,
+ HB_NODE_UP_CB,
+ HB_NODE_RESPONDED_CB, // this one is very chatty
+ HB_NUM_CB
+};
+
+/* callback signature: (group inode, node inode, idx, registered data) */
+typedef void (hb_cb_func)(struct inode *, struct inode *, int, void *);
+
+/* one registered callback; lives on hb_callbacks[type], kept sorted by
+ * ascending priority */
+typedef struct _hb_callback_func
+{
+ struct list_head list;
+ hb_cb_func *func;
+ void *data; /* opaque, handed back to func */
+ int priority;
+} hb_callback_func;
+
+
+/* tree_descr / write_op indices for the hb filesystem */
+enum {
+ HB_Root = 1,
+ HB_Disk,
+ HB_WriteOpArraySize
+};
+
+/* on-disk layout of one heartbeat block: just a timestamp (0 means "no
+ * heartbeat written yet") */
+typedef struct _hb_disk_heartbeat_block
+{
+ u64 time;
+} hb_disk_heartbeat_block;
+
+
+// number of initial allowed misses
+#define HB_INITIAL_DISK_MARGIN 60
+#define HB_INITIAL_NET_MARGIN 60
+
+// number of allowed misses in steady state
+#define HB_DISK_MARGIN 30
+#define HB_NET_MARGIN 30
+
+
+int hb_unregister_callback(int type, hb_cb_func *func, void *data);
+int hb_register_callback(int type, hb_cb_func *func, void *data, int priority);
+int hb_fill_node_map(struct inode *group, void *map, int size);
+
+
+
+#endif /* CLUSTER_HEARTBEAT_H */
Added: trunk/cluster/nodemanager.c
===================================================================
--- trunk/cluster/nodemanager.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/nodemanager.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,1330 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * nodemanager.c
+ *
+ * totally lame static node management placeholder
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/module.h>
+
+#include <linux/linkage.h>
+#include <linux/time.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/net.h>
+#include <linux/unistd.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/pagemap.h>
+#include <linux/hash.h>
+
+#include <asm/uaccess.h>
+
+#include "tcp.h"
+#include "dlmmod.h"
+#include "nodemanager.h"
+#include "heartbeat.h"
+
+#include "compat_libfs.h"
+
+#ifndef __user
+#define __user
+#endif
+
+
+/*
+ * This nm module is similar to nfsd/nfsctl.c in that it uses
+ * transaction files (in /proc/cluster/nm) to communicate with
+ * the kernel module instead of ioctls or other means.
+ *
+ * Files involved:
+ * /proc/cluster/nm/cluster - used to create/destroy cluster, adds
+ * nodes/groups to the cluster, queries info
+ * about the cluster
+ * /proc/cluster/nm/group - adds/removes nodes from a group, queries
+ * info about a group
+ * /proc/cluster/nm/node - changes info for a node, queries info about
+ * a node
+ *
+ * This nm implementation basically allows this node to live in exactly one
+ * cluster. All "clustered" nodes that are known to this node should be
+ * added to the cluster, and all nodes should see the same list of nodes in
+ * the same order at all times. The "slot" number given to a node in this
+ * global cluster list is fixed and never changes. Groups can be dynamically
+ * created within a cluster (TODO: currently static only) and be made up of
+ * one or more nodes (listed at most once) in the global list. A node may exist
+ * in many groups. Also, a group may have an optional disk UUID which is simply
+ * stored for later use by the heartbeat service. (The heartbeat service will
+ * do disk heartbeating only for those groups with valid UUIDs.)
+ *
+ * USAGE:
+ * For our purposes, the nm service can be autoloaded by an fstab entry or manually
+ * through mount (mount -t nm none /proc/cluster/nm). Once that is done, an init
+ * script (or single executable on an initrd) should be run to create the static
+ * cluster info, possibly from a file like /etc/nm.conf or similar. We should
+ * probably create a "dlm" or "everyone" group (with NO disk heartbeating) so that
+ * the dlm service can be used with the network only. This group should contain
+ * all known nodes. After this is done, the net, hb and dlm modules can come up.
+ * The nm service is now ready for use, since groups don't need to be created till
+ * later.
+ *
+ * A group services daemon can be written (by someone!? ;-) to run at this point.
+ * Since the "dlm" group has everything it needs for full dlmming (since it uses
+ * only network), the dlm itself can be used to arbitrate for group creation,
+ * and additions/deletions from groups. Callbacks should be registered with nm by
+ * other services that care on each of these events. For instance, heartbeat should
+ * register a callback with nm for group creation, and addition and deletion from
+ * a group so that it can make any necessary changes to its heartbeating (primarily
+ * so that it can begin/end disk heartbeat for any group/node that needs it).
+ *
+ * NOTE NOTE NOTE !!!!:
+ * This is intended to be a quickie implementation. (translation: lame) I do not
+ * want to step on anyone's toes who may have implemented something wayyy better.
+ * If something out there "wins", we will plug into that instead. If nothing really
+ * takes off, we at least have a (lame) reference to work off of. However, since this
+ * implementation exists solely to make ocfs2 work, and one of the major advantages
+ * of ocfs version 1 was ease of setup, we don't want to move to something
+ * substantially more complicated than this (one conf file).
+ *
+ */
+
+
+
+/* globals */
+nm_cluster cluster;			/* the single cluster this node belongs to */
+struct super_block *single_sb;		/* sb of the one mounted nm filesystem */
+char *nm_nodename;			/* this host's utsname, set at module load */
+static spinlock_t nm_lock = SPIN_LOCK_UNLOCKED;		/* protects cluster state + iget lookups */
+static spinlock_t nm_cb_lock = SPIN_LOCK_UNLOCKED;	/* protects nm_callbacks lists */
+struct list_head nm_callbacks[NM_NUM_CB];
+
+
+static void nm_teardown(void);
+static int nm_create_cluster(char *buf);
+static void nm_init_cluster(nm_cluster *cluster);
+int nm_create_node(char *buf, nm_op *data);
+int nm_name_cluster(char *buf, nm_op *data);
+int nm_destroy_cluster(char *buf);
+int nm_get_cluster_num_nodes(char *buf);
+int nm_get_cluster_num_groups(char *buf);
+int nm_get_node_info(char *buf, nm_op *data);
+int nm_get_group_info(char *buf, nm_op *data);
+nm_cluster *nm_get_cluster(void);
+struct inode *nm_get_group_by_name(char *node_name);
+struct inode *nm_get_node_by_name(char *node_name);
+int nm_init(dlm_ctxt *dlm);
+static void nm_do_callbacks(int type, void *ptr1, void *ptr2, u16 idx);
+
+/* support for adding files, dirs, hardlinks in /proc/cluster/nm/... */
+extern struct file_operations simple_dir_operations;
+extern struct inode_operations simple_dir_inode_operations;
+extern struct file_operations transaction_ops;
+
+static inline int nm_find_next_slot(void *bitmap, int max, int request);
+static struct dentry * nm_add_file(struct super_block *s, struct dentry *parent, struct tree_descr *file, int ino);
+static struct dentry * nm_add_link(struct super_block *s, struct dentry *parent, struct tree_descr *file, int ino);
+
+static ssize_t write_node(struct file *file, char *buf, size_t size);
+static ssize_t write_group(struct file *file, char *buf, size_t size);
+static ssize_t write_cluster(struct file *file, char *buf, size_t size);
+
+static struct inode * __nm_get_group_by_num(u16 group_num);
+static struct inode * __nm_get_node_by_num(u16 node_num);
+
+
+static u16 nm_get_group_index(struct inode *group, struct inode *inode, struct dentry **child);
+
+/* hash of nodes by primary IPv4 address, for fast lookup on incoming connects */
+#define NM_HASH_BITS 7
+#define NM_HASH_SIZE (1 << NM_HASH_BITS)
+#define NM_HASH_MASK (NM_HASH_SIZE - 1)
+
+static struct list_head *nm_ip_hash = NULL;
+static spinlock_t nm_ip_hash_lock;
+
+static int nm_init_ip_hash(void);
+static void nm_destroy_ip_hash(void);
+
+
+/* Release the IP-address hash table page (bucket contents are not yet
+ * torn down -- see the TODO). Safe to call when the table was never
+ * allocated. */
+static void nm_destroy_ip_hash(void)
+{
+	int bucket;
+
+	if (nm_ip_hash == NULL)
+		return;
+	for (bucket = 0; bucket < NM_HASH_SIZE; bucket++) {
+		/* TODO: cleanup */
+	}
+	free_page((unsigned long)nm_ip_hash);
+}
+
+/* Allocate and initialize the node IP hash table. The whole table is a
+ * single page of list heads; returns -ENOMEM if the page cannot be
+ * allocated. */
+static int nm_init_ip_hash(void)
+{
+	int bucket;
+
+	/* every bucket head must fit inside one page */
+	if (NM_HASH_SIZE > (PAGE_SIZE / sizeof(struct list_head))) {
+		printk("eek! hash size too big for this arch!\n");
+		BUG();
+	}
+
+	nm_ip_hash = (struct list_head *) __get_free_page(GFP_KERNEL);
+	if (nm_ip_hash == NULL)
+		return -ENOMEM;
+	bucket = 0;
+	while (bucket < NM_HASH_SIZE) {
+		INIT_LIST_HEAD(&nm_ip_hash[bucket]);
+		bucket++;
+	}
+	spin_lock_init(&nm_ip_hash_lock);
+	return 0;
+}
+
+
+
+
+
+static inline int nm_find_next_slot(void *bitmap, int max, int request)
+{
+ int start = 0, slot_num;
+ if (request != NM_INVALID_SLOT_NUM)
+ start = request;
+ slot_num = find_next_zero_bit (bitmap, max, start);
+ if (slot_num >= max)
+ return -1;
+ if (request != NM_INVALID_SLOT_NUM && slot_num != request)
+ return -1;
+ set_bit(slot_num, bitmap);
+ return slot_num;
+}
+
+
+
+
+/*
+ * Create a brand-new file or directory entry under @parent in the nm
+ * filesystem, with inode number @ino taken from the caller (not
+ * auto-assigned). Returns the new dentry, or ERR_PTR(-EINVAL) on any
+ * failure. Modeled on fs/libfs.c:simple_fill_super() entry creation.
+ */
+static struct dentry * nm_add_file(struct super_block *s, struct dentry *parent, struct tree_descr *file, int ino)
+{
+	struct qstr name;
+	struct dentry *dentry = ERR_PTR(-EINVAL);
+	struct inode *inode;
+
+	if (!file->name)
+		goto out;
+	name.name = file->name;
+	name.len = strlen(name.name);
+	printk("adding file %*s\n", name.len, name.name);
+	name.hash = full_name_hash(name.name, name.len);
+	dentry = d_alloc(parent, &name);
+	if (!dentry) {
+		dentry = ERR_PTR(-EINVAL);
+		goto out;
+	}
+	inode = new_inode(s);
+	if (!inode) {
+		dput(dentry);
+		dentry = ERR_PTR(-EINVAL);
+		goto out;
+	}
+	inode->i_mode = file->mode;
+	inode->i_uid = inode->i_gid = 0;
+	inode->i_blksize = PAGE_CACHE_SIZE;
+	inode->i_blocks = 0;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	/* directories get the generic libfs ops; files use the caller's fops */
+	if (file->mode & S_IFDIR) {
+		inode->i_op = &simple_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+	} else {
+		inode->i_fop = file->ops;
+	}
+	/* force the caller-chosen inode number so node/group numbers map
+	 * directly onto inode numbers (see NM_*_INODE_START) */
+	inode->i_ino = ino;
+	insert_inode_hash(inode);
+	d_add(dentry, inode);
+
+out:
+	return dentry;
+}
+
+
+/*
+ * Hardlink an EXISTING inode (number @ino, looked up via iget) under
+ * @parent with the name in @file. Used to place a node inode inside a
+ * group directory. The inode must already carry its private data in
+ * u.generic_ip, otherwise it is rejected. Returns the new dentry or
+ * ERR_PTR(-EINVAL).
+ */
+static struct dentry * nm_add_link(struct super_block *s, struct dentry *parent, struct tree_descr *file, int ino)
+{
+	struct qstr name;
+	struct dentry *dentry = ERR_PTR(-EINVAL);
+	struct inode *inode;
+
+	if (!file->name)
+		goto out;
+	name.name = file->name;
+	name.len = strlen(name.name);
+	printk("adding link %*s\n", name.len, name.name);
+	name.hash = full_name_hash(name.name, name.len);
+	dentry = d_alloc(parent, &name);
+	if (!dentry) {
+		printk("failed to d_alloc\n");
+		dentry = ERR_PTR(-EINVAL);
+		goto out;
+	}
+	inode = iget(s, ino);
+	if (!inode) {
+		printk("failed to iget\n");
+		dput(dentry);
+		dentry = ERR_PTR(-EINVAL);
+		goto out;
+	}
+	/* an inode without private data was never set up by create_node/group */
+	if (!inode->u.generic_ip) {
+		printk("bad inode: %d\n", ino);
+		iput(inode);
+		dput(dentry);
+		dentry = ERR_PTR(-EINVAL);
+		goto out;
+	}
+	/* the iget reference is consumed by d_add; bump the link count for
+	 * the extra name now pointing at this inode */
+	inode->i_nlink++;
+	d_add(dentry, inode);
+
+out:
+	return dentry;
+}
+
+
+
+
+
+/* cluster, node and group transaction files.
+ * here's where the actual work of nm takes place. */
+
+/* Bring the (single, global) cluster up. The result string is written
+ * back into @buf in the usual "errno: message" transaction-file format;
+ * the return value is the formatted length. */
+static int nm_create_cluster(char *buf)
+{
+	int ret;
+
+	printk("create cluster...\n");
+
+	spin_lock(&nm_lock);
+	if (cluster.state != NM_CLUSTER_UP) {
+		cluster.state = NM_CLUSTER_UP;
+		ret = sprintf(buf, "0: cluster state: UP");
+	} else {
+		ret = sprintf(buf, "%d: cluster already up\n", -EINVAL);
+	}
+	spin_unlock(&nm_lock);
+	return ret;
+}
+
+
+
+/*
+ * Create a new group directory in the nm filesystem. Claims a group
+ * number from the cluster-wide bitmap (a specific number may be
+ * requested), attaches an nm_group_inode_private to the new inode, and
+ * fires NM_GROUP_ADD_CB. Groups with a disk UUID start NOT_READY until
+ * the disk is discovered; UUID-less groups are immediately READY.
+ * Result string is formatted into @buf; negative return on failure.
+ */
+int nm_create_group(char *buf, nm_op *data)
+{
+	struct tree_descr desc;
+	struct dentry *dentry = NULL;
+	struct inode *inode = NULL;
+	int ino, group_num;
+	int ret = -EINVAL;
+	nm_group_inode_private *g = NULL;
+
+	printk("create group...\n");
+
+	/* force NUL-termination of the caller-supplied name */
+	data->arg_u.gc.name[NM_MAX_NAME_LEN] = '\0';
+	inode = nm_get_group_by_name(data->arg_u.gc.name);
+	if (inode) {
+		ret = sprintf(buf, "%d: group %u (%s) already exists", -EEXIST,
+			      nm_get_group_global_index(inode), data->arg_u.gc.name);
+		iput(inode);
+		return ret;
+	}
+
+	group_num = data->arg_u.gc.group_num;
+	if (group_num > NM_INVALID_SLOT_NUM)
+		goto leave;
+
+	spin_lock(&cluster.bitmap_lock);
+	group_num = nm_find_next_slot(&(cluster.group_bitmap[0]), 255, group_num);
+	spin_unlock(&cluster.bitmap_lock);
+
+	if (group_num < 0) {
+		printk("out of group slots!\n");
+		goto leave;
+	}
+
+	/* group number maps directly to an inode number */
+	ino = group_num + NM_GROUP_INODE_START;
+
+	desc.name = data->arg_u.gc.name;
+	desc.ops = NULL;
+	desc.mode = S_IFDIR | 0755;
+	dentry = nm_add_file(single_sb, single_sb->s_root, &desc, ino);
+	if (IS_ERR(dentry))
+		goto leave;
+	inode = igrab(dentry->d_inode);
+	if (!inode) {
+		printk("igrab failed!\n");
+		goto leave;
+	}
+
+	g = kmalloc(sizeof(nm_group_inode_private), GFP_KERNEL);
+	if (!g)
+		goto leave;
+
+	memset(g, 0, sizeof(nm_group_inode_private));
+	memcpy(g->disk.uuid, data->arg_u.gc.disk_uuid, CLUSTER_DISK_UUID_LEN);
+	spin_lock_init(&g->bitmap_lock);
+	/* a group with a disk uuid must wait for disk discovery */
+	if (g->disk.uuid[0])
+		g->state = NM_GROUP_NOT_READY;
+	else
+		g->state = NM_GROUP_READY;
+	g->inode = inode;
+	inode->u.generic_ip = g;
+
+	ret = sprintf(buf, "0: group %u (%s) added, uuid: %s", group_num,
+		      data->arg_u.gc.name, g->disk.uuid);
+	nm_do_callbacks(NM_GROUP_ADD_CB, inode, NULL, group_num);
+
+leave:
+	/* NOTE(review): on failure the claimed group_num bit is not cleared
+	 * from cluster.group_bitmap -- the slot stays leaked; verify intent */
+	if (ret < 0) {
+		if (inode) {
+			if (inode->u.generic_ip)
+				kfree(inode->u.generic_ip);
+			iput(inode);
+		}
+		if (dentry)
+			dput(dentry);
+	}
+	return ret;
+}
+
+
+/*
+ * Add a node to the cluster: claims a node number from the cluster
+ * bitmap, creates the node's file in the nm filesystem, attaches an
+ * nm_node_inode_private, hashes the node by its first IPv4 address,
+ * and fires NM_NODE_ADD_CB. Result string is formatted into @buf.
+ */
+int nm_create_node(char *buf, nm_op *data)
+{
+	struct tree_descr desc;
+	struct dentry *dentry = NULL;
+	struct inode *inode = NULL;
+	int ino, node_num, bucket;
+	int ret = -EINVAL;
+	nm_node_inode_private *n = NULL;
+
+	printk("add cluster node ...\n");
+
+	/* force NUL-termination of the caller-supplied name */
+	data->arg_u.node.node_name[NM_MAX_NAME_LEN] = '\0';
+	inode = nm_get_node_by_name(data->arg_u.node.node_name);
+	if (inode) {
+		ret = sprintf(buf, "%d: node %u (%s) already exists", -EEXIST,
+			      nm_get_node_global_index(inode),
+			      data->arg_u.node.node_name);
+		iput(inode);
+		return ret;
+	}
+
+	node_num = data->arg_u.node.node_num;
+	if (node_num > NM_INVALID_SLOT_NUM) {
+		printk("bad node_num: %d\n", node_num);
+		goto leave;
+	}
+
+	spin_lock(&cluster.bitmap_lock);
+	node_num = nm_find_next_slot(&(cluster.node_bitmap[0]), 255, node_num);
+	spin_unlock(&cluster.bitmap_lock);
+
+	if (node_num < 0) {
+		printk("out of node slots!\n");
+		goto leave;
+	}
+
+	/* node number maps directly to an inode number */
+	ino = node_num + NM_NODE_INODE_START;
+
+	desc.name = data->arg_u.node.node_name;
+	desc.ops = NULL;
+	desc.mode = S_IFREG | S_IWUSR;
+	dentry = nm_add_file(single_sb, single_sb->s_root, &desc, ino);
+	if (IS_ERR(dentry)) {
+		printk("bad dentry\n");
+		goto leave;
+	}
+	inode = igrab(dentry->d_inode);
+	if (!inode) {
+		printk("igrab failed!\n");
+		goto leave;
+	}
+
+	n = kmalloc(sizeof(nm_node_inode_private), GFP_KERNEL);
+	if (!n) {
+		printk("could not kmalloc\n");
+		goto leave;
+	}
+	memcpy(&n->node, &data->arg_u.node, sizeof(nm_node_info));
+	INIT_LIST_HEAD(&n->ip_hash);
+	n->net.sock = NULL;
+	INIT_LIST_HEAD(&n->net.list);
+	spin_lock_init(&n->net.sock_lock);
+	n->net.flags = 0;
+
+	/* hash on first ip address */
+	spin_lock(&nm_ip_hash_lock);
+	bucket = hash_long(n->node.ifaces[0].addr_u.ip_addr4, NM_HASH_BITS);
+	list_add_tail(&n->ip_hash, &nm_ip_hash[bucket]);
+	spin_unlock(&nm_ip_hash_lock);
+	printk("hashed ip %d.%d.%d.%d to bucket %d\n", NIPQUAD(n->node.ifaces[0].addr_u.ip_addr4), bucket);
+	n->inode = inode;
+	inode->u.generic_ip = n;
+
+	ret = sprintf(buf, "0: node %u (%s) added", node_num, n->node.node_name);
+	nm_do_callbacks(NM_NODE_ADD_CB, inode, NULL, node_num);
+
+leave:
+	/* NOTE(review): on failure the claimed node_num bit stays set in
+	 * cluster.node_bitmap -- slot is leaked; verify intent */
+	if (ret < 0) {
+		if (inode) {
+			if (inode->u.generic_ip)
+				kfree(inode->u.generic_ip);
+			iput(inode);
+		}
+		if (dentry)
+			dput(dentry);
+	}
+	return ret;
+}
+
+/* Mark a group READY (called once its heartbeat disk has been
+ * discovered). Returns -EINVAL if the inode carries no group private
+ * data. */
+int nm_make_group_ready(struct inode *group)
+{
+	nm_group_inode_private *priv = group->u.generic_ip;
+
+	if (priv == NULL)
+		return -EINVAL;
+	priv->state = NM_GROUP_READY;
+	return 0;
+}
+
+/*
+ * Link an existing node into a group directory, claiming a slot number
+ * within the group (a specific slot may be requested, or
+ * NM_INVALID_SLOT_NUM for "any free slot"). The membership is a
+ * hardlink named after the zero-padded slot number. Fires
+ * NM_GROUP_NODE_ADD_CB on success. Result string goes into @buf.
+ *
+ * Fix vs. original: if nm_add_link() fails after the slot bit was set,
+ * the bit is now cleared again instead of leaking the slot forever.
+ */
+int nm_add_node_to_group(char *buf, nm_op *data)
+{
+	struct tree_descr desc;
+	struct inode *inode = NULL;
+	struct dentry *dentry = NULL, *child = NULL;
+	nm_group_inode_private *g = NULL;
+	int group_num, slot_num;
+	int ret = -EINVAL;
+	u16 ino;
+	char tmpname[6];
+
+	printk("add node to group...\n");
+
+	group_num = data->arg_u.gc.group_num;
+	ino = data->arg_u.gc.node_num;
+	slot_num = data->arg_u.gc.slot_num;
+
+	/* request a certain slot, or NM_INVALID_SLOT_NUM for any slot */
+	if (slot_num > NM_INVALID_SLOT_NUM)
+		goto leave;
+
+	if (ino >= NM_INVALID_SLOT_NUM || group_num >= NM_INVALID_SLOT_NUM)
+		goto leave;
+
+	inode = __nm_get_group_by_num(group_num);
+	if (!inode)
+		goto leave;
+	if (list_empty(&inode->i_dentry))
+		goto leave;
+	dentry = dget(list_entry(inode->i_dentry.next, struct dentry, d_alias));
+	if (!dentry)
+		goto leave;
+	g = inode->u.generic_ip;
+	if (!g)
+		goto leave;
+
+	/* disk-backed groups cannot take members until the disk is found */
+	if (g->state == NM_GROUP_NOT_READY) {
+		ret = sprintf(buf, "%d: group disk has not been discovered. cannot add nodes.", -EROFS);
+		goto leave;
+	}
+
+	spin_lock(&g->bitmap_lock);
+	slot_num = nm_find_next_slot(&(g->slot_bitmap[0]), 255, slot_num);
+	spin_unlock(&g->bitmap_lock);
+	if (slot_num < 0)
+		goto leave;
+
+	/* create hardlink to ino with name "slot_num" */
+	sprintf(tmpname, "%03u", slot_num);
+	desc.name = &(tmpname[0]);
+	desc.ops = NULL;
+	desc.mode = 0;
+	child = nm_add_link(single_sb, dentry, &desc,
+			    NM_NODE_INODE_START+ino);
+	if (IS_ERR(child)) {
+		printk("error adding link for %s\n", tmpname);
+		child = NULL;
+		/* give the just-claimed slot back so a failed link does
+		 * not permanently leak it */
+		spin_lock(&g->bitmap_lock);
+		clear_bit(slot_num, (void *)(&g->slot_bitmap[0]));
+		spin_unlock(&g->bitmap_lock);
+		goto leave;
+	}
+
+	ret = sprintf(buf, "0: node %u added to group: %*s",
+		      ino, dentry->d_name.len, dentry->d_name.name);
+
+	/* hold an extra ref across the callbacks */
+	if (!igrab(child->d_inode))
+		goto leave;
+	nm_do_callbacks(NM_GROUP_NODE_ADD_CB, inode, child->d_inode, slot_num);
+	iput(child->d_inode);
+
+leave:
+	if (dentry)
+		dput(dentry);
+	if (child)
+		dput(child);
+	if (inode)
+		iput(inode);
+	return ret;
+}
+
+
+/*
+ * Remove a node's membership hardlink from a group directory, fire
+ * NM_GROUP_NODE_DEL_CB, and release the node's slot bit. Returns 0 on
+ * success, -EINVAL if the node is not in the group or the group has no
+ * private data.
+ */
+int nm_remove_node_from_group(struct inode *group, struct inode *node)
+{
+	struct dentry *child = NULL;
+	nm_group_inode_private *g = NULL;
+	int slot_num;
+	int ret = -EINVAL;
+
+	printk("remove node from group...\n");
+
+	/* find the slot number and the membership dentry (extra ref) */
+	slot_num = nm_get_group_index(group, node, &child);
+
+	if (slot_num == NM_MAX_NODES || !child)
+		goto leave;
+
+	g = group->u.generic_ip;
+	if (!g)
+		goto leave;
+
+	printk("killing the dentry now!!\n");
+	/* i_zombie serializes against concurrent directory operations
+	 * (2.4 VFS); drop the extra link count the hardlink added */
+	down(&group->i_zombie);
+	node->i_nlink--;
+	d_delete(child);
+	up(&group->i_zombie);
+	printk("done killing the dentry!!\n");
+
+
+	/* hold an extra ref on the node across the callbacks */
+	if (!igrab(node))
+		goto leave;
+	nm_do_callbacks(NM_GROUP_NODE_DEL_CB, group, node, slot_num);
+	iput(node);
+
+	spin_lock(&g->bitmap_lock);
+	clear_bit(slot_num, (void *)(&g->slot_bitmap[0]));
+	spin_unlock(&g->bitmap_lock);
+
+	ret = 0;
+
+leave:
+	if (child)
+		dput(child);
+	return ret;
+}
+
+
+
+/* Set the cluster name. Refused once the cluster is up. The result
+ * string is formatted into @buf; returns the formatted length. */
+int nm_name_cluster(char *buf, nm_op *data)
+{
+	int ret;
+
+	printk("name cluster...\n");
+	spin_lock(&nm_lock);
+	if (cluster.state == NM_CLUSTER_UP) {
+		ret = sprintf(buf, "%d: cluster name could not be set. cluster already up.", -EINVAL);
+	} else {
+		memset(cluster.name, 0, NM_MAX_NAME_LEN+1);
+		memcpy(cluster.name, data->arg_u.name, NM_MAX_NAME_LEN);
+		ret = sprintf(buf, "0: cluster name set: %s", cluster.name);
+	}
+	spin_unlock(&nm_lock);
+	return ret;
+}
+
+/*
+ * Tear the cluster back down to its initial state. No per-node or
+ * per-group cleanup is performed yet (see TODO). NOTE(review):
+ * nm_init_cluster() also re-initializes the nm_callbacks list heads,
+ * dropping (and leaking) any registered callbacks -- confirm intended.
+ */
+int nm_destroy_cluster(char *buf)
+{
+	int ret;
+	printk("destroy cluster...\n");
+
+	/* TODO */
+	spin_lock(&nm_lock);
+	nm_init_cluster(&cluster);
+	ret = sprintf(buf, "0: rudely destroyed cluster!!!");
+	spin_unlock(&nm_lock);
+	return ret;
+}
+
+/* Count set bits in the cluster node bitmap (8 x u32 words = 256 bits)
+ * and format the count into @buf. Returns the formatted length. */
+int nm_get_cluster_num_nodes(char *buf)
+{
+	int word, count = 0;
+
+	printk("get cluster num nodes...\n");
+
+	spin_lock(&cluster.bitmap_lock);
+	word = 0;
+	while (word < 8) {
+		count += hweight32(cluster.node_bitmap[word]);
+		word++;
+	}
+	spin_unlock(&cluster.bitmap_lock);
+
+	return sprintf(buf, "0: %d", count);
+}
+
+/* Count set bits in the cluster group bitmap and format the count into
+ * @buf. Returns the formatted length. */
+int nm_get_cluster_num_groups(char *buf)
+{
+	int word, count = 0;
+
+	printk("get cluster num groups...\n");
+
+	spin_lock(&cluster.bitmap_lock);
+	word = 0;
+	while (word < 8) {
+		count += hweight32(cluster.group_bitmap[word]);
+		word++;
+	}
+	spin_unlock(&cluster.bitmap_lock);
+
+	return sprintf(buf, "0: %d", count);
+}
+
+/* Count the members of a group by popcounting its slot bitmap.
+ * Returns the member count, or -EINVAL if the inode has no group
+ * private data. */
+int nm_get_group_num_nodes(struct inode *group)
+{
+	nm_group_inode_private *priv;
+	int word, count = 0;
+
+	printk("get group num nodes...\n");
+
+	priv = group->u.generic_ip;
+	if (priv == NULL)
+		return -EINVAL;
+
+	spin_lock(&priv->bitmap_lock);
+	for (word = 0; word < 8; word++)
+		count += hweight32(priv->slot_bitmap[word]);
+	spin_unlock(&priv->bitmap_lock);
+
+	return count;
+}
+
+/*
+ * Return the highest occupied slot number in the group, plus one (i.e.
+ * the number of slots that must be scanned to cover all members), or
+ * -EINVAL if the inode has no group private data.
+ *
+ * Fix vs. original: the per-word offset was computed as
+ * i * sizeof(word) -- a BYTE count (4) -- where a BIT count (32) is
+ * needed, so any slot above 31 reported a wildly wrong value. Also the
+ * printk was a copy-paste of "num nodes".
+ */
+int nm_get_group_max_slots(struct inode *group)
+{
+	int last=0, i;
+	nm_group_inode_private *g;
+
+	printk("get group max slots...\n");
+
+	g = group->u.generic_ip;
+	if (!g)
+		return -EINVAL;
+
+#warning need to change this for 64 bit
+	spin_lock(&g->bitmap_lock);
+	for (i=7; i>=0; i--) {
+		if (g->slot_bitmap[i]) {
+			/* fls() gives the 1-based highest bit in this word;
+			 * add the word's offset in BITS, not bytes */
+			last = fls(g->slot_bitmap[i]);
+			last += (i * sizeof(g->slot_bitmap[i]) * 8);
+			break;
+		}
+	}
+	spin_unlock(&g->bitmap_lock);
+
+	return last;
+}
+
+/*
+ * Iterator over a group's occupied slots: starting at *idx, find the
+ * next set bit, update *idx to it, and return the corresponding disk
+ * slot entry (via util_rarray_idx_to_slot). Returns NULL when there
+ * are no further occupied slots. NOTE(review): the bitmap is read
+ * without taking bitmap_lock -- presumably tolerable for iteration;
+ * confirm with callers.
+ */
+void * nm_iterate_group_disk_slots(struct inode *group, int *idx)
+{
+	nm_group_inode_private *priv;
+	int next;
+
+	if (*idx >= 255)
+		return NULL;
+	priv = group->u.generic_ip;
+	if (!priv)
+		return NULL;
+	next = find_next_bit(priv->slot_bitmap, 255, *idx);
+	if (next >= 255)
+		return NULL;
+	*idx = next;
+	return util_rarray_idx_to_slot(&priv->disk.slots, next);
+}
+
+/*
+ * Format a node's global index, name and interface list (port, IP
+ * version, IPv4 address) into @buf for the transaction-file reader.
+ * Returns the total formatted length, or -EINVAL if the node number in
+ * @data does not resolve to a known node.
+ */
+int nm_get_node_info(char *buf, nm_op *data)
+{
+	int ret, tmpret, i;
+	nm_node_inode_private *priv;
+	nm_network_iface *n;
+	struct inode *inode = NULL;
+	struct dentry *dentry;
+	u16 node_num;
+	u16 vers;
+
+	ret = -EINVAL;
+	node_num = data->arg_u.index;
+	inode = __nm_get_node_by_num(node_num);
+	if (inode) {
+		/* node name is the name of the inode's (single) dentry */
+		dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
+		priv = inode->u.generic_ip;
+		ret = sprintf(buf, "0: global_index=%u\n"
+			      "name=%*s\n",
+			      priv->node.node_num, dentry->d_name.len,
+			      dentry->d_name.name);
+		buf += ret;
+		for (i=0; i<NM_MAX_IFACES; i++) {
+			n = &priv->node.ifaces[i];
+			/* ip_version is stored in network byte order */
+			vers = ntohs(n->ip_version);
+			printk("ip_version=%u, vers=%u\n", n->ip_version, vers);
+			if (vers!=4 && vers!=6)
+				continue;
+			/* TODO: how to print ipv6? */
+			tmpret = sprintf(buf, "iface%d.port=%u\n"
+					 "iface%d.version=%d\n"
+					 "iface%d.addr=%d.%d.%d.%d\n",
+					 i, ntohs(n->ip_port), i, vers, i,
+					 NIPQUAD(n->addr_u.ip_addr4));
+			buf += tmpret;
+			ret += tmpret;
+		}
+		iput(inode);
+	}
+	return ret;
+}
+
+/*
+ * Format a group's number, name, disk uuid and member list (the names
+ * of its child dentries, i.e. slot numbers) into @buf. Returns the
+ * total formatted length, or -EINVAL if the group number in @data does
+ * not resolve to a known group.
+ */
+int nm_get_group_info(char *buf, nm_op *data)
+{
+	int ret, tmpret;
+	nm_group_inode_private *g = NULL;
+	struct inode *inode = NULL;
+	u16 group_num;
+	struct dentry *dentry, *child;
+
+	ret = -EINVAL;
+	group_num = data->arg_u.index;
+	inode = __nm_get_group_by_num(group_num);
+	if (inode) {
+		g = inode->u.generic_ip;
+		dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
+		ret = sprintf(buf, "0: group_num=%u\n"
+			      "name=%*s\n"
+			      "disk_uuid=%s\n",
+			      group_num, dentry->d_name.len,
+			      dentry->d_name.name, g->disk.uuid);
+		buf += ret;
+
+		/* dcache_lock protects the d_subdirs walk (2.4 VFS) */
+		spin_lock(&dcache_lock);
+		list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+			tmpret = sprintf(buf, "%*s\n", child->d_name.len,
+					 child->d_name.name);
+			buf += tmpret;
+			ret += tmpret;
+		}
+		spin_unlock(&dcache_lock);
+		iput(inode);
+	}
+	return ret;
+}
+
+
+
+/*
+ * Transaction-file handler for /proc/cluster/nm/.cluster: @buf holds an
+ * nm_op request on entry and is overwritten with the "errno: message"
+ * reply. Returns the reply length or a negative error for malformed
+ * requests.
+ */
+static ssize_t write_cluster(struct file *file, char *buf, size_t size)
+{
+	nm_op *data;
+	int ret;
+	u16 me;
+
+	printk("write_cluster\n");
+
+	if (size < sizeof(*data))
+		return -EINVAL;
+	data = (nm_op *) buf;
+	if (data->magic != NM_OP_MAGIC)
+		return -EINVAL;
+
+	switch (data->opcode) {
+		case NM_OP_CREATE_CLUSTER:
+			ret = nm_create_cluster(buf);
+			break;
+		case NM_OP_CREATE_GROUP:
+			ret = nm_create_group(buf, data);
+			break;
+		case NM_OP_NAME_CLUSTER:
+			ret = nm_name_cluster(buf, data);
+			break;
+		case NM_OP_DESTROY_CLUSTER:
+			ret = nm_destroy_cluster(buf);
+			break;
+		case NM_OP_ADD_CLUSTER_NODE:
+			ret = nm_create_node(buf, data);
+			break;
+		case NM_OP_GET_CLUSTER_NUM_NODES:
+			ret = nm_get_cluster_num_nodes(buf);
+			break;
+		case NM_OP_GET_GLOBAL_NODE_NUM:
+			ret = 0;
+			me = nm_this_node(NULL);
+			if (me >= NM_MAX_NODES)
+				ret = -EINVAL;
+			/* format "errno: node"; ret then becomes the length */
+			ret = sprintf(buf, "%d: %u", ret, me);
+			break;
+		default:
+			ret = sprintf(buf, "%d: bad opcode: %u", -EINVAL, data->opcode);
+			break;
+	}
+	printk("leaving!\n");
+	return ret;
+}
+
+/* Transaction-file handler for /proc/cluster/nm/.node: @buf holds an
+ * nm_op request and is overwritten with the reply string; returns the
+ * reply length or a negative error for malformed requests. */
+static ssize_t write_node(struct file *file, char *buf, size_t size)
+{
+	nm_op *request;
+	int ret;
+
+	printk("write_node\n");
+
+	if (size < sizeof(*request))
+		return -EINVAL;
+	request = (nm_op *) buf;
+	if (request->magic != NM_OP_MAGIC)
+		return -EINVAL;
+
+	if (request->opcode == NM_OP_GET_NODE_INFO)
+		ret = nm_get_node_info(buf, request);
+	else
+		ret = sprintf(buf, "%d: bad opcode: %u", -EINVAL, request->opcode);
+	printk("leaving!\n");
+	return ret;
+}
+
+/* Transaction-file handler for /proc/cluster/nm/.group: @buf holds an
+ * nm_op request and is overwritten with the reply string; returns the
+ * reply length or a negative error for malformed requests. */
+static ssize_t write_group(struct file *file, char *buf, size_t size)
+{
+	nm_op *request;
+	int ret;
+
+	printk("write_group\n");
+
+	if (size < sizeof(*request))
+		return -EINVAL;
+	request = (nm_op *) buf;
+	if (request->magic != NM_OP_MAGIC)
+		return -EINVAL;
+
+	printk("opcode is %u, add_group is %u\n", request->opcode, NM_OP_ADD_GROUP_NODE);
+	if (request->opcode == NM_OP_GET_GROUP_INFO)
+		ret = nm_get_group_info(buf, request);
+	else if (request->opcode == NM_OP_ADD_GROUP_NODE)
+		ret = nm_add_node_to_group(buf, request);
+	else
+		ret = sprintf(buf, "%d: bad opcode: %u", -EINVAL, request->opcode);
+	printk("leaving!\n");
+	return ret;
+}
+
+
+
+/* Look up a group inode by group number (caller holds nm_lock or
+ * otherwise serializes). An inode without private data was never set
+ * up as a group and is rejected. Returns a referenced inode or NULL. */
+static struct inode * __nm_get_group_by_num(u16 group_num)
+{
+	struct inode *inode;
+
+	inode = iget(single_sb, NM_GROUP_INODE_START + group_num);
+	if (inode == NULL)
+		return NULL;
+	if (inode->u.generic_ip == NULL) {
+		iput(inode);
+		inode = NULL;
+	}
+	return inode;
+}
+
+/* Look up a node inode by node number (caller serializes). An inode
+ * without private data was never set up as a node and is rejected.
+ * Returns a referenced inode or NULL. */
+static struct inode * __nm_get_node_by_num(u16 node_num)
+{
+	struct inode *inode;
+
+	inode = iget(single_sb, NM_NODE_INODE_START + node_num);
+	if (inode == NULL)
+		return NULL;
+	if (inode->u.generic_ip == NULL) {
+		iput(inode);
+		inode = NULL;
+	}
+	return inode;
+}
+
+/* ipv4 only for now... */
+/* ipv4 only for now... */
+/* Find the node whose first interface matches @addr (network byte
+ * order) via the IP hash table. Returns a referenced inode or NULL. */
+struct inode * nm_get_node_by_ip(u32 addr)
+{
+	struct inode *found = NULL;
+	nm_node_inode_private *priv;
+	struct list_head *pos;
+	int bucket = hash_long(addr, NM_HASH_BITS);
+
+	spin_lock(&nm_ip_hash_lock);
+	list_for_each(pos, &nm_ip_hash[bucket]) {
+		priv = list_entry(pos, nm_node_inode_private, ip_hash);
+		if (priv->node.ifaces[0].addr_u.ip_addr4 != addr)
+			continue;
+		found = igrab(priv->inode);
+		break;
+	}
+	spin_unlock(&nm_ip_hash_lock);
+	return found;
+}
+
+
+/* Locked wrapper around __nm_get_group_by_num(). */
+struct inode * nm_get_group_by_num(u16 group_num)
+{
+	struct inode *found;
+
+	spin_lock(&nm_lock);
+	found = __nm_get_group_by_num(group_num);
+	spin_unlock(&nm_lock);
+	return found;
+}
+
+/* Accessor for the single global cluster descriptor. */
+nm_cluster * nm_get_cluster(void)
+{
+	return &cluster;
+}
+
+/* Locked wrapper around __nm_get_node_by_num(). */
+struct inode * nm_get_node_by_num(u16 node_num)
+{
+	struct inode *found;
+
+	spin_lock(&nm_lock);
+	found = __nm_get_node_by_num(node_num);
+	spin_unlock(&nm_lock);
+	return found;
+}
+
+/*
+ * Look up the member of @group occupying slot @index by resolving the
+ * zero-padded slot-number name inside the group directory. Returns a
+ * referenced node inode, or NULL if the slot is empty or invalid.
+ *
+ * Fix vs. original: igrab() may return NULL (inode being freed); the
+ * original dereferenced the result unconditionally.
+ */
+struct inode * nm_get_group_node_by_index(struct inode *group, u16 index)
+{
+	struct dentry *dentry = NULL, *parent;
+	struct inode *inode = NULL;
+	char tmpname[6];
+
+	if (list_empty(&group->i_dentry))
+		return NULL;
+	parent = dget(list_entry(group->i_dentry.next, struct dentry, d_alias));
+	if (!parent)
+		return NULL;
+
+	sprintf(tmpname, "%03u", index);
+	dentry = lookup_one_len(tmpname, parent, strlen(tmpname));
+	if (!IS_ERR(dentry)) {
+		inode = dentry->d_inode;
+		if (inode) {
+			inode = igrab(inode);
+			/* igrab can fail; only validate a live inode */
+			if (inode &&
+			    (!inode->u.generic_ip || !S_ISREG (inode->i_mode))) {
+				printk("bad inode!\n");
+				iput(inode);
+				inode = NULL;
+			}
+		}
+		if (!inode)
+			dput(dentry);
+	}
+	dput(parent);
+	return inode;
+}
+
+
+/*
+ * Look up a node (dir=0) or group (dir=1) inode by name under the nm
+ * filesystem root. Returns a referenced inode or NULL.
+ *
+ * Fix vs. original: igrab() may return NULL (inode being freed); the
+ * original dereferenced the result unconditionally.
+ * NOTE(review): the dentry reference from lookup_one_len is never
+ * dput -- it stays pinned; preserved as-is, confirm intent.
+ */
+struct inode * __nm_get_node_by_name(char *node_name, int dir)
+{
+	struct dentry *dentry = NULL;
+	struct inode *inode = NULL;
+
+	dentry = lookup_one_len(node_name, single_sb->s_root, strlen(node_name));
+	if (!IS_ERR(dentry)) {
+		inode = dentry->d_inode;
+		if (inode) {
+			inode = igrab(inode);
+			/* igrab can fail; only validate a live inode */
+			if (inode &&
+			    (!inode->u.generic_ip ||
+			     (dir && !S_ISDIR (inode->i_mode)) ||
+			     (!dir && !S_ISREG (inode->i_mode)))) {
+				printk("bad inode!\n");
+				iput(inode);
+				inode = NULL;
+			}
+		}
+	}
+	return inode;
+}
+
+
+/*
+ * if group is NULL: return the global index for this node
+ * if group is non NULL: return the index within the group of this node
+ *
+ * NOTE: currently getting the group index is slow
+ * will need to change this somehow
+ */
+u16 nm_this_node(struct inode *group)
+{
+	struct inode *inode = NULL;
+	struct dentry *child = NULL;
+	u16 node_num = NM_MAX_NODES;
+
+	/* our own node is the one named after this host's utsname */
+	inode = nm_get_node_by_name(nm_nodename);
+	if (inode && inode->u.generic_ip) {
+		if (group)
+			node_num = nm_get_group_index(group, inode, &child);
+		else
+			node_num = nm_get_node_global_index(inode);
+
+	}
+	/* iput/dput are NULL-safe in this kernel -- TODO confirm for target tree */
+	iput(inode);
+	dput(child);
+	//printk("for group=%p, this node is %u\n", group, node_num);
+	return node_num;
+}
+
+/* slow */
+static u16 nm_get_group_index(struct inode *group, struct inode *inode, struct dentry **child)
+{
+ struct dentry *tmp = NULL, *parent = NULL;
+ u16 slot_num = NM_MAX_NODES;
+ struct list_head *iter;
+ char tmpname[6];
+ char *err;
+
+ *child = NULL;
+ parent = NULL;
+ if (list_empty(&group->i_dentry))
+ goto leave;
+ parent = dget(list_entry(group->i_dentry.next, struct dentry, d_alias));
+ if (!parent)
+ goto leave;
+
+ spin_lock(&dcache_lock);
+ list_for_each(iter, &parent->d_subdirs) {
+ tmp = list_entry(iter, struct dentry, d_child);
+ if (tmp->d_inode == inode)
+ break;
+ tmp = NULL;
+ }
+ if (tmp)
+ dget_locked(tmp);
+ spin_unlock(&dcache_lock);
+
+ if (!tmp || tmp->d_name.len > 3)
+ goto leave;
+ strncpy(tmpname, tmp->d_name.name, tmp->d_name.len);
+ tmpname[tmp->d_name.len] = '\0';
+ err=NULL;
+ slot_num = simple_strtoul(tmpname, &err, 10);
+
+ if (*err != '\0')
+ slot_num = NM_MAX_NODES; // error
+ else
+ *child = dget(tmp); // done, get extra ref for child
+
+leave:
+ dput(parent);
+ dput(tmp);
+
+ return slot_num;
+}
+
+/* Per-dlm-context nm initialization; currently a no-op stub. */
+int nm_init(dlm_ctxt *dlm)
+{
+	return 0;
+}
+
+/* Register @func on the callback list for event @type. Returns 0,
+ * -EINVAL for an out-of-range type, or -ENOMEM. */
+int nm_register_callback(int type, void (*func)(void *, void *, u16))
+{
+	nm_callback_func *entry;
+
+	if (type < NM_NODE_ADD_CB || type > NM_GROUP_NODE_DEL_CB)
+		return -EINVAL;
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (entry == NULL)
+		return -ENOMEM;
+	memset(entry, 0, sizeof(*entry));
+	entry->func = func;
+	spin_lock(&nm_cb_lock);
+	list_add_tail(&entry->list, &nm_callbacks[type]);
+	spin_unlock(&nm_cb_lock);
+	return 0;
+}
+
+#warning need to change nm callbacks to be like hb callbacks... no locks when calling.
+/* Remove the first registration of @func from the list for event
+ * @type. Returns 0 if found and removed, -EINVAL otherwise. */
+int nm_unregister_callback(int type, void (*func)(void *, void *, u16))
+{
+	nm_callback_func *entry;
+	struct list_head *pos, *next;
+	int ret = -EINVAL;
+
+	if (type < NM_NODE_ADD_CB || type > NM_GROUP_NODE_DEL_CB)
+		return ret;
+
+	spin_lock(&nm_cb_lock);
+	list_for_each_safe(pos, next, &nm_callbacks[type]) {
+		entry = list_entry(pos, nm_callback_func, list);
+		if (entry->func != func)
+			continue;
+		list_del(&entry->list);
+		kfree(entry);
+		ret = 0;
+		break;
+	}
+	spin_unlock(&nm_cb_lock);
+	return ret;
+}
+
+
+
+/*
+ * Invoke every callback registered for event @type. NOTE: callbacks
+ * run with nm_cb_lock held (see the #warning above) -- they must not
+ * sleep or re-enter nm_(un)register_callback.
+ */
+static void nm_do_callbacks(int type, void *ptr1, void *ptr2, u16 idx)
+{
+	struct list_head *iter;
+	nm_callback_func *f;
+
+	spin_lock(&nm_cb_lock);
+	list_for_each(iter, &nm_callbacks[type]) {
+		f = list_entry (iter, nm_callback_func, list);
+		(f->func) (ptr1, ptr2, idx);
+	}
+	spin_unlock(&nm_cb_lock);
+}
+
+
+/* Remove the /proc/cluster/nm and /proc/cluster entries created at
+ * module load. */
+static void nm_teardown(void)
+{
+	remove_proc_entry("cluster/nm", NULL);
+	remove_proc_entry("cluster", NULL);
+}
+
+/*
+ * Reset a cluster descriptor to the DOWN state. Also (re)initializes
+ * the GLOBAL nm_callbacks list heads -- calling this on a live cluster
+ * silently drops all registered callbacks (see nm_destroy_cluster).
+ */
+static void nm_init_cluster(nm_cluster *cluster)
+{
+	int i;
+	memset(cluster, 0, sizeof(nm_cluster));
+	cluster->state = NM_CLUSTER_DOWN;
+	spin_lock_init(&cluster->bitmap_lock);
+
+	for (i=NM_NODE_ADD_CB; i<=NM_GROUP_NODE_DEL_CB; i++)
+		INIT_LIST_HEAD(&nm_callbacks[i]);
+}
+
+
+
+
+
+/*----------------------------------------------------------------------------*/
+/*
+ * populating the filesystem.
+ */
+static int nm_fill_super(struct super_block * sb, void * data, int silent)
+{
+	int ret, sz;
+	TA_write_ops *ops;
+	/* the three transaction files that make up the nm interface */
+	static struct tree_descr nm_files[] = {
+		[NM_Cluster] = {".cluster", &transaction_ops, S_IWUSR},
+		[NM_Node] = {".node", &transaction_ops, S_IWUSR},
+		[NM_Group] = {".group", &transaction_ops, S_IWUSR},
+		/* last one */ {""}
+	};
+
+	/* build the write-op dispatch table indexed by file (NM_Cluster..) */
+	sz = sizeof(nm_files) / sizeof(struct tree_descr);
+	ops = kmalloc(sizeof(TA_write_ops) + (sz * sizeof(TA_write_op *)), GFP_KERNEL);
+	if (!ops)
+		return -ENOMEM;
+
+	memset(ops, 0, sizeof(TA_write_ops) + (sz * sizeof(TA_write_op *)));
+	ops->num_ops = sz;
+	ops->write_op[NM_Cluster] = write_cluster;
+	ops->write_op[NM_Node] = write_node;
+	ops->write_op[NM_Group] = write_group;
+
+	single_sb = NULL;
+	printk("calling simple_fill_super...\n");
+	ret = simple_fill_super(sb, 0x98675309, nm_files);
+	if (ret >= 0) {
+		TA_GENERIC_SB_MEMBER(sb) = ops;
+		single_sb = sb;
+	} else {
+		kfree(ops);
+	}
+	return ret;
+}
+
+/* 2.4-style read_super entry point: delegate to nm_fill_super and
+ * translate its error return into NULL. */
+static struct super_block *nm_read_super (struct super_block *sb, void *data, int silent)
+{
+	printk("welcome to nm_read_super!!!\n");
+	if (nm_fill_super(sb, data, silent) < 0)
+		return NULL;
+	return sb;
+}
+
+
+static DECLARE_FSTYPE (nm_fs_type, "nm", nm_read_super, FS_SINGLE|FS_LITTER);
+
+/*
+ * Module init: capture this host's nodename, allocate the IP hash,
+ * reset the cluster, create the /proc/cluster/nm directories and
+ * register the "nm" filesystem.
+ *
+ * Fix vs. original: error paths now release what was already
+ * allocated -- the original leaked nm_nodename when the IP hash
+ * allocation failed, and leaked both the hash page and nm_nodename
+ * when register_filesystem() failed.
+ */
+static int __init init_nm(void)
+{
+	int retval = -ENOMEM;
+
+	nm_nodename = kmalloc(strlen(system_utsname.nodename) + 1, GFP_KERNEL);
+	if (nm_nodename==NULL) {
+		printk("could not allocate a few bytes for nodename!\n");
+		goto out;
+	}
+	strcpy(nm_nodename, system_utsname.nodename);
+	printk("loading nm module: nodename is %s\n", nm_nodename);
+
+	if (nm_init_ip_hash() < 0) {
+		printk("failed to allocate node IP hash\n");
+		goto out_free_nodename;
+	}
+
+	nm_init_cluster(&cluster);
+
+	if (proc_mkdir("cluster", 0)) {
+		if (proc_mkdir("cluster/nm", 0)) {
+		}
+	}
+	printk("calling register_filesystem\n");
+	retval = register_filesystem(&nm_fs_type);
+	printk("done calling register_filesystem: ret=%d\n", retval);
+	if (retval)
+		goto out_teardown;
+	return 0;
+
+out_teardown:
+	nm_teardown();
+	nm_destroy_ip_hash();
+out_free_nodename:
+	kfree(nm_nodename);
+	nm_nodename = NULL;
+out:
+	return retval;
+}
+
+/* Module exit: undo init_nm in reverse order -- proc entries,
+ * filesystem registration, IP hash, nodename. */
+static void __exit exit_nm(void)
+{
+	nm_teardown();
+	unregister_filesystem(&nm_fs_type);
+	nm_destroy_ip_hash();
+	kfree(nm_nodename);
+	printk("unloading nm module\n");
+}
+
+
+
+
+MODULE_LICENSE("GPL");
+module_init(init_nm)
+module_exit(exit_nm)
Added: trunk/cluster/nodemanager.h
===================================================================
--- trunk/cluster/nodemanager.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/nodemanager.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,252 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * nodemanager.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_NODEMANAGER_H
+#define CLUSTER_NODEMANAGER_H
+
+
+
+/* Opaque node-manager context; currently just a placeholder member. */
+struct _nm_ctxt
+{
+	int dummy;
+};
+
+/* hard limits: interfaces per node, nodes per cluster, and the
+ * sentinel value meaning "no slot assigned" */
+#define NM_MAX_IFACES 2
+#define NM_MAX_NODES 255
+#define NM_INVALID_SLOT_NUM 255
+
+/* host name, group name, cluster name all 64 bytes */
+#define NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN
+
+
+/* base inode numbers for group and node pseudo-inodes; nm_valid_ino()
+ * derives the valid ranges from these */
+#define NM_GROUP_INODE_START 200000
+#define NM_NODE_INODE_START 100000
+
+/* cluster state */
+enum {
+	NM_CLUSTER_DOWN=0,
+	NM_CLUSTER_UP
+};
+
+/* group readiness state */
+enum {
+	NM_GROUP_NOT_READY=0,
+	NM_GROUP_READY
+};
+
+/* fixed inode numbers for the nm filesystem's well-known entries */
+enum {
+	NM_Root = 1,
+	NM_Cluster,
+	NM_Node,
+	NM_Group,
+};
+
+
+
+
+/* One network interface of a cluster node.  All fields are kept in
+ * network byte order (NBO) as noted below. */
+typedef struct _nm_network_iface
+{
+	u16 ip_port;	/* for simplicity, just define exactly one port for this if */
+	u16 ip_version;
+	union {
+		u32 ip_addr4;	/* IPv4 address in NBO */
+		u32 ip_addr6[4];	/* IPv6 address in NBO */
+	} addr_u;
+} nm_network_iface;
+
+/* Static description of a node: its number, name and interfaces. */
+typedef struct _nm_node_info
+{
+	u16 node_num;
+	char node_name[NM_MAX_NAME_LEN+1];
+	nm_network_iface ifaces[NM_MAX_IFACES];
+} nm_node_info;
+
+
+/* Cluster-wide state: which node and group numbers are in use.
+ * bitmap_lock protects both bitmaps (8 * 32 bits >= NM_MAX_NODES). */
+typedef struct _nm_cluster
+{
+	char name[NM_MAX_NAME_LEN+1];
+	int state;
+	spinlock_t bitmap_lock;
+	u32 group_bitmap[8];
+	u32 node_bitmap[8];
+} nm_cluster;
+
+
+/* Per-group private data hung off the group's pseudo-inode.
+ * bitmap_lock protects slot_bitmap. */
+typedef struct _nm_group_inode_private
+{
+	struct inode *inode;
+	struct list_head net_list;
+	struct list_head disk_list;
+	cluster_disk disk;
+	int state;
+	spinlock_t bitmap_lock;
+	u32 slot_bitmap[8];
+} nm_group_inode_private;
+
+#ifdef __KERNEL__
+/* TODO: move this */
+#define NET_FLAG_CREATING_SOCKET 0x00000001
+/* Per-node networking state: the TCP socket to that node plus the
+ * handler and receive-list linkage.  sock_lock protects sock. */
+typedef struct _net_inode_private
+{
+	struct socket *sock;
+	wait_queue_t sleep;
+	spinlock_t sock_lock;
+	struct list_head handlers;
+	struct list_head list;
+	int flags;
+} net_inode_private;
+
+/* Private data hung off a node's pseudo-inode: static node info,
+ * the IP-hash linkage and the live network state. */
+typedef struct _nm_node_inode_private
+{
+	struct inode *inode;
+	nm_node_info node;
+	struct list_head ip_hash;
+	net_inode_private net;
+} nm_node_inode_private;
+#endif
+
+/* transaction file nm_op stuff */
+
+/* magic value expected in nm_op.magic for requests to be honored */
+#define NM_OP_MAGIC 0xbeaf
+/* opcodes written through the nm transaction file */
+enum {
+	NM_OP_CREATE_CLUSTER=123,
+	NM_OP_DESTROY_CLUSTER,
+	NM_OP_NAME_CLUSTER,
+	NM_OP_ADD_CLUSTER_NODE,
+	NM_OP_GET_CLUSTER_NUM_NODES,
+	NM_OP_GET_NODE_INFO,
+	NM_OP_CREATE_GROUP,
+	NM_OP_GET_GROUP_INFO,
+	NM_OP_ADD_GROUP_NODE,
+	NM_OP_GET_GLOBAL_NODE_NUM
+};
+
+/* Argument block for group membership changes (create group,
+ * add/remove node from group). */
+typedef struct _nm_group_change
+{
+	u16 group_num;
+	u16 node_num;
+	u16 slot_num;
+	char disk_uuid[CLUSTER_DISK_UUID_LEN+1];
+	char name[NM_MAX_NAME_LEN+1];
+} nm_group_change;
+
+/* One operation as written to the transaction file: magic + opcode
+ * plus an opcode-specific argument union. */
+typedef struct _nm_op
+{
+	u16 magic;
+	u16 opcode;
+	union {
+		u16 index;
+		char name[NM_MAX_NAME_LEN+1];
+		nm_node_info node;
+		nm_group_change gc;
+	} arg_u;
+} nm_op;
+
+
+/* callback stuff */
+
+/* callback types that other modules can register interest in */
+enum {
+	NM_NODE_ADD_CB = 0,
+	NM_NODE_DEL_CB,
+	NM_GROUP_ADD_CB,
+	NM_GROUP_DEL_CB,
+	NM_GROUP_NODE_ADD_CB,
+	NM_GROUP_NODE_DEL_CB,
+	NM_NUM_CB
+};
+
+/* callback signature; the two void* args and the u16 are
+ * callback-type specific */
+typedef void (nm_cb_func)(void *, void *, u16);
+
+/* one registered callback, kept on a per-type list */
+typedef struct _nm_callback_func
+{
+	struct list_head list;
+	nm_cb_func *func;
+	//void (*func)(void *, void *, u16);
+} nm_callback_func;
+
+
+
+
+u16 nm_this_node(struct inode *group);
+int nm_init(struct _dlm_ctxt *dlm);
+nm_cluster * nm_get_cluster(void);
+int nm_register_callback(int type, void (*func)(void *, void *, u16));
+int nm_unregister_callback(int type, void (*func)(void *, void *, u16));
+int nm_get_group_num_nodes(struct inode *group);
+int nm_get_group_max_slots(struct inode *group);
+int nm_make_group_ready(struct inode *group);
+void * nm_iterate_group_disk_slots(struct inode *group, int *idx);
+int nm_remove_node_from_group(struct inode *group, struct inode *node);
+int nm_create_group(char *buf, nm_op *data);
+int nm_add_node_to_group(char *buf, nm_op *data);
+
+#ifdef __KERNEL__
+
+
+struct inode * nm_get_group_by_num(u16 group_num);
+struct inode * nm_get_node_by_num(u16 node_num);
+struct inode * __nm_get_node_by_name(char *node_name, int dir);
+struct inode * nm_get_node_by_ip(u32 addr);
+struct inode * nm_get_group_node_by_index(struct inode *group, u16 index);
+
+/* Look up a node inode by name (dir=0 selects the node directory). */
+static inline struct inode * nm_get_node_by_name(char *node_name)
+{
+	return __nm_get_node_by_name(node_name, 0);
+}
+/* Look up a group inode by name (dir=1 selects the group directory). */
+static inline struct inode * nm_get_group_by_name(char *group_name)
+{
+	return __nm_get_node_by_name(group_name, 1);
+}
+
+
+/* Map a node pseudo-inode back to its global node number
+ * (inverse of the NM_NODE_INODE_START offset). */
+static inline int nm_get_node_global_index(struct inode *node)
+{
+	return (node->i_ino - NM_NODE_INODE_START);
+}
+/* Map a group pseudo-inode back to its global group number. */
+static inline int nm_get_group_global_index(struct inode *group)
+{
+	return (group->i_ino - NM_GROUP_INODE_START);
+}
+#endif
+
+/* An ino is valid iff it falls inside the node range or the group
+ * range (NM_MAX_NODES entries each, starting at the respective
+ * NM_*_INODE_START bases). */
+static inline int nm_valid_ino(int ino)
+{
+	return (ino >= NM_NODE_INODE_START &&
+		ino < NM_NODE_INODE_START + NM_MAX_NODES) ||
+	       (ino >= NM_GROUP_INODE_START &&
+		ino < NM_GROUP_INODE_START + NM_MAX_NODES);
+}
+
+
+
+#endif /* CLUSTER_NODEMANAGER_H */
Added: trunk/cluster/tcp.c
===================================================================
--- trunk/cluster/tcp.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/tcp.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,1614 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * tcp.c
+ *
+ * tcp network stuff
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+#include "warning_hack.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/proc_fs.h>
+#include <linux/file.h>
+
+#include <asm/uaccess.h>
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+
+//#if 0
+#define netprintk(x, arg...) printk("(%d) " x, current->pid, ##arg)
+#define netprintk0(x) printk("(%d) " x, current->pid)
+//#else
+#if 0
+#define netprintk(x, arg...)
+#define netprintk0(x)
+#endif
+
+struct socket *recv_sock = NULL;
+static u16 ip_version, ip_port;
+static void *net_junk_buf = NULL;
+static struct inode *net_inode = NULL;
+static u16 net_node_num;
+
+char *gsd_buf = NULL;
+char *gsd_handler_buf = NULL;
+
+
+static spinlock_t net_handler_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t net_list_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t net_status_lock = SPIN_LOCK_UNLOCKED;
+static LIST_HEAD(net_handlers);
+static LIST_HEAD(net_recv_list);
+static LIST_HEAD(net_dispatch_list);
+static LIST_HEAD(net_status_list);
+
+static DECLARE_WAIT_QUEUE_HEAD(net_disp_thread_wait_queue);
+static DECLARE_WAIT_QUEUE_HEAD(net_recv_thread_wait_queue);
+static int net_recv_pid = -1;
+static struct task_struct *net_recv_task = NULL;
+static struct completion net_recv_complete;
+
+
+
+/////////////////////
+static void net_shutdown(void);
+static int net_startup(void);
+static int __init net_driver_entry (void);
+static int net_init_driver(void);
+static void __exit net_driver_exit (void);
+static void net_remove_handlers(void);
+static int net_check_message_valid(net_msg *msg, u32 len);
+static void net_dump_and_close_sock(struct socket *sock, struct inode *inode);
+static void net_dump_msg(struct socket *sock, struct inode *inode);
+static int net_recv_message_header(net_msg *hdr, struct socket *sock);
+static int net_init_tcp_recv_sock(void);
+static int net_receive_thread(void *data);
+static int net_receive(void);
+static int net_accept_tcp_connections(void);
+static void net_release_tcp_sock(void);
+static int net_dispatch_message(struct inode *inode, struct socket *sock, net_msg *hdr, net_msg_handler *hnd);
+static int net_ioctl (struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg);
+
+int gsd_message_action(gsd_message *g);
+int gsd_message_handler(net_msg *msg, u32 len, void *data);
+void gsd_teardown(void);
+int gsd_setup(void);
+
+
+//////////////////////
+
+
+
+
+/* use if already holding net_handler_lock */
+static inline void __net_get_handler(net_msg_handler *nmh)
+{
+	atomic_inc(&nmh->refcnt);
+}
+
+/* Take a reference on a handler, acquiring net_handler_lock itself. */
+static inline void net_get_handler(net_msg_handler *nmh)
+{
+	spin_lock(&net_handler_lock);
+	__net_get_handler(nmh);
+	spin_unlock(&net_handler_lock);
+}
+
+
+/* use if already holding net_handler_lock */
+static inline void __net_put_handler(net_msg_handler *nmh)
+{
+	/* dec-then-read is only safe because the caller holds
+	 * net_handler_lock, serializing against other put/get */
+	atomic_dec(&nmh->refcnt);
+	if (!atomic_read(&nmh->refcnt)) {
+		if (net_handler_in_use(nmh))
+			netprintk0("EEEEK! killing inuse handler! bugbug!\n");
+		kfree(nmh);
+	}
+}
+
+/* Drop a reference; frees the handler when the count hits zero.
+ * atomic_dec_and_lock only takes the lock on the final put. */
+static inline void net_put_handler(net_msg_handler *nmh)
+{
+	if (atomic_dec_and_lock(&nmh->refcnt, &net_handler_lock)) {
+		if (net_handler_in_use(nmh))
+			netprintk0("EEEEK! killing inuse handler! bugbug!\n");
+		kfree(nmh);
+		spin_unlock(&net_handler_lock);
+	}
+}
+
+
+
+DECLARE_MUTEX(net_state_lock);
+u32 net_driver_state = NET_DRIVER_UNINITED;
+u32 net_num_dispatched = 0;
+
+
+/*
+ * net_driver_entry()
+ *
+ * Driver entry point.  Called on insmod.  Creates /proc/cluster/net
+ * and hooks net_ioctl() into it.
+ * NOTE(review): this writes into de->proc_fops, which may be a shared
+ * file_operations struct -- confirm this does not affect other proc
+ * entries using the same fops.
+ */
+static int __init net_driver_entry (void)
+{
+	struct proc_dir_entry *de;
+	de = proc_mkdir("cluster/net", 0);
+	if (!de)
+		return -1;
+	de->proc_fops->ioctl = net_ioctl;
+
+	netprintk0("Loaded net Driver module\n");
+	return 0;
+} /* net_driver_entry */
+
+/*
+ * net_ioctl()
+ *
+ * ioctl entry for /proc/cluster/net.  Activates/queries the driver
+ * state and services the two GSD group-setup requests, either locally
+ * (target inode is our own node) or by forwarding over the network.
+ */
+static int net_ioctl (struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	net_ioc data;
+	gsd_ioc gsd_data;
+	int ret = 0;
+	gsd_message g;
+	int response = 0;
+	struct inode *to = NULL;
+	struct file *file = NULL;
+
+	if (_IOC_TYPE (cmd) != NET_IOC_MAGIC) {
+		ret = -ENOTTY;
+		goto exit_ioctl;
+	}
+
+	switch (cmd) {
+		case NET_IOC_ACTIVATE:
+			memset(&data, 0, sizeof(net_ioc));
+			down(&net_state_lock);
+			data.status = net_driver_state;
+			if (net_driver_state == NET_DRIVER_UNINITED) {
+				ret = net_init_driver();
+				if (ret < 0) {
+					netprintk("error trying to activate net driver: %d\n", ret);
+					data.status = NET_DRIVER_UNINITED;
+				} else {
+					netprintk0("activated net driver!\n");
+					net_driver_state = data.status = NET_DRIVER_READY;
+				}
+			}
+			up(&net_state_lock);
+
+			/* BUG FIX: copy_to_user returns the number of
+			 * bytes NOT copied, not an errno */
+			ret = copy_to_user ((net_ioc *) arg, &data,
+					    sizeof (net_ioc)) ? -EFAULT : 0;
+			break;
+		case NET_IOC_GETSTATE:
+			memset(&data, 0, sizeof(net_ioc));
+			down(&net_state_lock);
+			data.status = net_driver_state;
+			up(&net_state_lock);
+			ret = copy_to_user ((net_ioc *) arg, &data,
+					    sizeof (net_ioc)) ? -EFAULT : 0;
+			break;
+
+		case GSD_IOC_CREATE_GROUP:
+			memset(&gsd_data, 0, sizeof(gsd_ioc));
+			/* BUG FIX: copy_from_user result was ignored */
+			if (copy_from_user(&gsd_data, (gsd_ioc *)arg, sizeof(gsd_ioc))) {
+				ret = -EFAULT;
+				break;
+			}
+			/* SECURITY FIX: namelen comes from userspace and
+			 * was memcpy'd into g.name unchecked */
+			if (gsd_data.namelen > sizeof(g.name)) {
+				ret = -EINVAL;
+				break;
+			}
+
+			file = fget(gsd_data.fd);
+			if (!file || !file->f_dentry || !file->f_dentry->d_inode) {
+				ret = -EINVAL;
+				break;
+			}
+			to = file->f_dentry->d_inode;
+
+			g.action = GSD_ACTION_ADD_GROUP;
+			g.from = net_node_num;
+			g.namelen = gsd_data.namelen;
+			memcpy(g.name, gsd_data.name, gsd_data.namelen);
+
+			if (to == net_inode) {
+				/* create the group locally */
+				ret = gsd_message_action(&g);
+			} else {
+				/* create the group on remote node */
+				ret = net_send_message(GSD_MESSAGE, 0, &g, sizeof(g), to, &response);
+				if (ret == 0)
+					ret = response;
+			}
+
+			memset(&gsd_data, 0, sizeof(gsd_ioc));
+			gsd_data.status = ret;
+			ret = copy_to_user((gsd_ioc *)arg, &gsd_data, sizeof(gsd_ioc)) ? -EFAULT : 0;
+			break;
+
+		case GSD_IOC_ADD_GROUP_NODE:
+			memset(&gsd_data, 0, sizeof(gsd_ioc));
+			if (copy_from_user(&gsd_data, (gsd_ioc *)arg, sizeof(gsd_ioc))) {
+				ret = -EFAULT;
+				break;
+			}
+			if (gsd_data.namelen > sizeof(g.name)) {
+				ret = -EINVAL;
+				break;
+			}
+
+			file = fget(gsd_data.fd);
+			if (!file || !file->f_dentry || !file->f_dentry->d_inode) {
+				ret = -EINVAL;
+				break;
+			}
+			to = file->f_dentry->d_inode;
+
+			g.action = GSD_ACTION_ADD_GROUP_NODE;
+			g.from = net_node_num;
+			g.namelen = gsd_data.namelen;
+			memcpy(g.name, gsd_data.name, gsd_data.namelen);
+
+			if (to == net_inode) {
+				/* add the node locally */
+				ret = gsd_message_action(&g);
+			} else {
+				/* add the node on the remote node */
+				ret = net_send_message(GSD_MESSAGE, 0, &g, sizeof(g), to, &response);
+				if (ret == 0)
+					ret = response;
+			}
+			memset(&gsd_data, 0, sizeof(gsd_ioc));
+			gsd_data.status = ret;
+			ret = copy_to_user((gsd_ioc *)arg, &gsd_data, sizeof(gsd_ioc)) ? -EFAULT : 0;
+			break;
+		default:
+			ret = -ENOTTY;
+			break;
+	}
+
+exit_ioctl:
+
+	if (file)
+		fput(file);
+
+	return ret;
+} /* net_ioctl */
+
+/*
+ * net_init_driver()
+ *
+ * Resolve our own node number/inode via nm, pull the first interface's
+ * ip version/port, then start the receive thread and the GSD handler.
+ * Returns 0 on success, -1 on failure (all refs released on failure).
+ */
+static int net_init_driver(void)
+{
+	nm_node_info *info;
+	nm_node_inode_private *priv;
+
+	/* get the global node number for this node */
+	net_node_num = nm_this_node(NULL);
+	if (net_node_num >= NM_MAX_NODES) {
+		netprintk0("local nm node number not initialized!\n");
+		return -1;
+	}
+	net_inode = nm_get_node_by_num(net_node_num);
+	if (!net_inode) {
+		netprintk0("local nm node inode not initialized!\n");
+		return -1;
+	}
+	priv = (nm_node_inode_private *)net_inode->u.generic_ip;
+	if (!priv) {
+		iput(net_inode);
+		net_inode = NULL;
+		netprintk0("local nm node info not initialized!\n");
+		return -1;
+	}
+	info = &priv->node;
+	ip_version = info->ifaces[0].ip_version;
+	ip_port = info->ifaces[0].ip_port;
+
+	if (net_startup() < 0)
+		goto err_iput;
+
+	if (gsd_setup() < 0) {
+		/* stop the receive thread we just started */
+		net_shutdown();
+		goto err_iput;
+	}
+
+	return 0;
+
+err_iput:
+	/* BUG FIX: the inode reference taken above was leaked on the
+	 * failure paths before */
+	iput(net_inode);
+	net_inode = NULL;
+	return -1;
+} /* net_init_driver*/
+
+
+/*
+ * net_driver_exit()
+ *
+ * Called on rmmod.  Tears down the network thread, handlers and GSD
+ * state, but only if the driver was ever activated (READY); the proc
+ * entry is removed unconditionally.
+ */
+static void __exit net_driver_exit (void)
+{
+	down(&net_state_lock);
+	if (net_driver_state == NET_DRIVER_READY) {
+		netprintk0("shutting down network\n");
+		net_shutdown();
+		netprintk0("removing all net driver handlers\n");
+		net_remove_handlers();
+		gsd_teardown();
+		if (net_inode)
+			iput(net_inode);
+		net_driver_state = NET_DRIVER_UNINITED;
+	}
+	up(&net_state_lock);
+	remove_proc_entry("cluster/net", NULL);
+	netprintk0("Unloading net driver module\n");
+	return;
+} /* net_driver_exit */
+
+
+/*
+ * net_startup()
+ *
+ * Allocate the junk page and spawn the receive thread.  Returns 0 on
+ * success, -ENOMEM / -EINVAL on failure with everything released.
+ */
+static int net_startup(void)
+{
+	net_recv_pid = -1;
+	net_recv_task = NULL;
+	init_completion (&net_recv_complete);
+
+	net_junk_buf = (void *) __get_free_page(GFP_KERNEL);
+	if (!net_junk_buf)
+		return -ENOMEM;
+
+	netprintk0("starting net receive thread...\n");
+	net_recv_pid = kernel_thread (net_receive_thread, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+	if (net_recv_pid < 0) {
+		netprintk("unable to launch net receive thread, error=%d", net_recv_pid);
+		/* BUG FIX: the original called net_shutdown() here, which
+		 * signals a NULL net_recv_task and waits on a completion
+		 * that can never fire since the thread never started.
+		 * Just release the page instead. */
+		free_page((unsigned long)net_junk_buf);
+		net_junk_buf = NULL;
+		return -EINVAL;
+	}
+
+	netprintk0("net thread running...\n");
+	return 0;
+}
+
+/*
+ * net_shutdown()
+ *
+ * Signal the receive thread to exit, wait for it, then free the junk
+ * page.  Assumes the thread was successfully started (net_recv_task
+ * set and net_recv_complete armed) -- see net_startup().
+ */
+static void net_shutdown(void)
+{
+	netprintk ("waiting for net thread to exit....");
+	send_sig (SIGINT, net_recv_task, 0);
+	wait_for_completion (&net_recv_complete);
+	free_page((unsigned long)net_junk_buf);
+	netprintk ("net thread exited\n");
+}
+
+
+/*
+ * net_receive_thread()
+ *
+ * Kernel thread main loop: set up the listening socket, then
+ * alternately accept pending TCP connections and drain data from
+ * known sockets, sleeping up to 20s between passes (woken early by
+ * socket activity via recv_sock's wait queue).  Exits on any signal,
+ * releasing the listen socket and completing net_recv_complete.
+ */
+static int net_receive_thread(void *data)
+{
+	int status = 0;
+	DECLARE_WAITQUEUE(main_wait, current);
+
+	util_daemonize ("netrecv", strlen("netrecv"), 1);
+	net_recv_task = current;
+
+	status = net_init_tcp_recv_sock();
+	if (status >= 0 && recv_sock) {
+		add_wait_queue_exclusive(recv_sock->sk->sleep, &main_wait);
+		while (1) {
+			status = 0;
+			/* 2.4 internals: non-empty accept_queue means a
+			 * connection is waiting on the listen socket */
+			if (recv_sock->sk->tp_pinfo.af_tcp.accept_queue)
+				status = net_accept_tcp_connections();
+			if (!list_empty(&net_recv_list))
+				status = net_receive();
+
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(20*HZ);
+			current->state = TASK_RUNNING;
+
+			if (signal_pending(current)) {
+				netprintk0("net recv thread got signal!\n");
+				break;
+			}
+		}
+		remove_wait_queue(recv_sock->sk->sleep, &main_wait);
+	} else {
+		netprintk0("failed to initialize net_thread!\n");
+	}
+
+	/* Flush all scheduled tasks */
+	flush_scheduled_work();
+	net_release_tcp_sock();
+	net_recv_task = NULL;
+	complete (&net_recv_complete);
+	return 0;
+}
+
+/* Scratch union for viewing the 64-bit TSC as two 32-bit halves.
+ * NOTE(review): appears unused in this file -- candidate for removal. */
+typedef union _my_timing_t
+{
+	__u64 q;
+	__u32 lohi[2];
+} my_timing_t;
+
+
+/* Stub: message validation not implemented yet -- accepts everything.
+ * Called from net_recv_tcp_msg() on each received packet. */
+static int net_check_message_valid(net_msg *msg, u32 len)
+{
+	return 1;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+/* for lack of a better place to do this */
+
+/*
+ * gsd_setup()
+ *
+ * Allocate the two GSD scratch pages and register the GSD message
+ * handler.  Returns 0 on success, negative errno on failure with both
+ * pages released (the driver stays UNINITED on failure, so
+ * gsd_teardown() would never run to clean these up).
+ */
+int gsd_setup()
+{
+	int ret;
+
+	gsd_buf = (char *) __get_free_page(GFP_KERNEL);
+	if (!gsd_buf)
+		return -ENOMEM;
+	/* need this stupidity until I can divorce the actual nm actions
+	 * from the output they send to their user buffer */
+	gsd_handler_buf = (char *) __get_free_page(GFP_KERNEL);
+	if (!gsd_handler_buf) {
+		/* BUG FIX: gsd_buf was leaked on this path before */
+		free_page((unsigned long)gsd_buf);
+		gsd_buf = NULL;
+		return -ENOMEM;
+	}
+
+	ret = net_register_handler(GSD_MESSAGE, 0, 0, sizeof(gsd_message),
+				   gsd_message_handler, NULL, gsd_buf);
+	if (ret < 0) {
+		/* BUG FIX: both pages were leaked on this path before */
+		free_page((unsigned long)gsd_buf);
+		free_page((unsigned long)gsd_handler_buf);
+		gsd_buf = NULL;
+		gsd_handler_buf = NULL;
+	}
+
+	return ret;
+}
+
+/* Release the GSD scratch pages allocated by gsd_setup().
+ * free_page(0) is a no-op, so this is safe even if setup failed. */
+void gsd_teardown()
+{
+	free_page((unsigned long)gsd_buf);
+	free_page((unsigned long)gsd_handler_buf);
+}
+
+/* Network entry point for GSD messages: the payload is a gsd_message,
+ * handed straight to gsd_message_action(). */
+int gsd_message_handler(net_msg *msg, u32 len, void *data)
+{
+	return gsd_message_action((gsd_message *)msg->buf);
+}
+
+/*
+ * gsd_message_action()
+ *
+ * Execute a GSD request (from local ioctl or remote node): create a
+ * group, or add the sender (g->from) to an existing group.  Both
+ * operations are idempotent: if the group/node already exists the
+ * call returns 0.  Returns 0 on success, -EINVAL on bad input.
+ */
+int gsd_message_action(gsd_message *g)
+{
+	int ret;
+	nm_op op;
+	int namelen = g->namelen;
+	struct inode *node=NULL, *group=NULL;
+	char name[NM_MAX_NAME_LEN+1];
+
+	if (namelen > NM_MAX_NAME_LEN)
+		return -EINVAL;
+	strncpy(name, g->name, namelen);
+	name[namelen] = '\0';
+
+	memset(&op, 0, sizeof(op));
+	switch (g->action) {
+		case GSD_ACTION_ADD_GROUP:
+			group = nm_get_group_by_name(name);
+			if (group) {
+				ret = 0;
+				break;
+			}
+			op.arg_u.gc.group_num = NM_INVALID_SLOT_NUM;
+			memcpy(op.arg_u.gc.name, name, namelen);
+			/* NOTE(review): disk_uuid is CLUSTER_DISK_UUID_LEN+1
+			 * bytes but namelen may be up to NM_MAX_NAME_LEN (64)
+			 * -- confirm this cannot overflow disk_uuid */
+			memcpy(op.arg_u.gc.disk_uuid, name, namelen);
+
+			ret = nm_create_group(gsd_handler_buf, &op);
+			if (ret >= 0)
+				ret = 0;
+			break;
+
+		case GSD_ACTION_ADD_GROUP_NODE:
+			group = nm_get_group_by_name(name);
+			if (!group) {
+				ret = -EINVAL;
+				break;
+			}
+			node = nm_get_group_node_by_index(group, g->from);
+			if (node) {
+				/* already present: ok, but verify identity */
+				ret = 0;
+				if (nm_get_node_global_index(node) != g->from)
+					ret = -EINVAL;
+				break;
+			}
+			op.arg_u.gc.group_num = nm_get_group_global_index(group);
+			op.arg_u.gc.node_num = g->from;
+			op.arg_u.gc.slot_num = g->from;
+			ret = nm_add_node_to_group(gsd_handler_buf, &op);
+			if (ret >= 0)
+				ret = 0;
+			break;
+		default:
+			ret = -EINVAL;
+			break;
+	}
+
+	/* drop the inode refs taken by the lookups above */
+	if (node)
+		iput(node);
+	if (group)
+		iput(group);
+	return ret;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+
+/*
+ * net_register_handler()
+ *
+ * Register a callback for messages of (msg_type, key).  @max_len is
+ * the maximum payload size; if non-zero, @buf must be a caller-owned
+ * buffer of at least that size that received payloads are read into.
+ * Returns 0, -EINVAL on bad args, -ENOMEM, or -EEXIST if a handler
+ * for the same (type, key) is already registered.
+ */
+int net_register_handler(u32 msg_type, u32 key, int flags, u32 max_len,
+			 net_msg_handler_func *func, void *data, void *buf)
+{
+	net_msg_handler *nmh, *found=NULL;
+	u32 packet_len = sizeof(net_msg) + max_len;
+
+	if (packet_len < NET_MIN_MSG_LEN || packet_len > NET_MAX_MSG_LEN) {
+		netprintk("max_len for message handler out of range: %u\n",
+			  max_len);
+		return -EINVAL;
+	}
+
+	/* if expecting any message payload, must pass a prealloced buffer */
+	if (!buf && max_len) {
+		netprintk("max_len > 0 (%u), but no buffer supplied!\n",
+			  max_len);
+		return -EINVAL;
+	}
+
+	if (!msg_type) {
+		netprintk("no message type provided: %u, %p\n", msg_type, func);
+		return -EINVAL;
+
+	}
+	if (!func) {
+		netprintk("no message handler provided: %u, %p\n",
+			  msg_type, func);
+		return -EINVAL;
+	}
+
+	nmh = kmalloc(sizeof(net_msg_handler), GFP_KERNEL);
+	if (!nmh) {
+		return -ENOMEM;
+	}
+	memset(nmh, 0, sizeof(net_msg_handler));
+	nmh->func = func;
+	nmh->data = data;
+	nmh->msg_type = msg_type;
+	nmh->max_len = max_len;
+	nmh->key = key;
+	spin_lock_init(&nmh->lock);
+	atomic_set(&nmh->refcnt, 0);
+	/* zero-payload handlers reuse the embedded header as "buffer" */
+	if (max_len == 0) {
+		nmh->buf = &nmh->hdr;
+	} else {
+		nmh->buf = buf;
+	}
+	nmh->flags = flags;
+	INIT_LIST_HEAD(&nmh->list);
+	/* list's reference */
+	net_get_handler(nmh);
+
+
+	/* add the new handler, checking for pre-existing */
+	spin_lock(&net_handler_lock);
+	found = net_lookup_handler(msg_type, key);
+	if (!found) {
+		list_add_tail(&nmh->list, &net_handlers);
+	} else {
+		spin_unlock(&net_handler_lock);
+		/* drop the lookup's reference on the existing handler */
+		net_put_handler(found);
+		netprintk("message handler for type %u, key %u already exists!!!\n",
+			  msg_type, key);
+		/* this should destroy it */
+		net_put_handler(nmh);
+		return -EEXIST;
+	}
+	spin_unlock(&net_handler_lock);
+	return 0;
+}
+
+
+
+/* net_handler_lock should be held.
+ * Find the handler registered for (msg_type, key); returns it with an
+ * extra reference taken (caller must net_put_handler), or NULL. */
+net_msg_handler * net_lookup_handler(u32 msg_type, u32 key)
+{
+	net_msg_handler *ret;
+	struct list_head *iter;
+
+	list_for_each(iter, &net_handlers) {
+		ret = list_entry(iter, net_msg_handler, list);
+		if (ret->msg_type == msg_type && ret->key == key) {
+			__net_get_handler(ret);
+			return ret;
+		}
+	}
+	return NULL;
+}
+
+
+
+/*
+ * net_package_message()
+ *
+ * Allocate and fill a net_msg packet for (msg_type, key) carrying
+ * @len bytes of @data, after verifying a handler exists and the
+ * length is acceptable.  Returns a kmalloc'd packet the caller must
+ * kfree, or NULL on failure.
+ * NOTE(review): largely duplicates the packaging code inlined in
+ * net_send_message() -- candidate for sharing.
+ */
+net_msg * net_package_message(u32 msg_type, u32 key, void *data, u32 len)
+{
+	net_msg *ret = NULL;
+	net_msg_handler *handler = NULL;
+	u32 packet_len;
+
+	spin_lock(&net_handler_lock);
+	handler = net_lookup_handler(msg_type, key);
+	spin_unlock(&net_handler_lock);
+
+	if (!handler) {
+		netprintk("no such message type: %u/%u\n", msg_type, key);
+		return NULL;
+	}
+	/* non-zero return here means the length check failed */
+	if (net_handler_msg_len_ok(handler, len)) {
+		netprintk("len for message type %u incorrect: %u, should be %u\n",
+			  msg_type, len, handler->max_len);
+		goto done;
+	}
+	packet_len = len + sizeof(net_msg);
+	ret = kmalloc(packet_len, GFP_KERNEL);
+	if (!ret) {
+		netprintk("failed to allocate %u bytes for message!\n", packet_len);
+		goto done;
+	}
+	memset(ret, 0, packet_len);
+	ret->magic = NET_MSG_MAGIC;
+	ret->data_len = len;
+	ret->msg_type = msg_type;
+	ret->key = key;
+	if (len > 0)
+		memcpy(&(ret->buf[0]), data, len);
+
+done:
+	if (handler)
+		net_put_handler(handler);
+	return ret;
+}
+
+/* TODO Fix */
+/* Intentionally empty for now: handlers registered via
+ * net_register_handler() are not yet torn down on driver exit. */
+static void net_remove_handlers(void)
+{
+	/* TODO: make an iterator in nm for running over each global inode
+	 * do I have this already? then call destroy on each. last put
+	 * will do the work. doesnt matter if it's slow. this is only
+	 * on shutdown... */
+}
+
+
+
+
+/*
+ * net_recv_tcp_msg()
+ *
+ * Receive up to *packet_len bytes from @sock (or from the node's
+ * cached socket when @sock is NULL) into @data.  On success,
+ * *packet_len is updated with the received length.  Returns 0,
+ * -EBADF on shutdown (ERESTARTSYS), -EINVAL otherwise.
+ */
+int net_recv_tcp_msg (struct inode *inode, struct socket *sock, void *data, u32 *packet_len)
+{
+	nm_node_inode_private *priv;
+	nm_node_info *node;
+	int status = -EINVAL, error;
+	mm_segment_t oldfs;
+	struct sockaddr_in sin;
+	struct iovec iov = {
+		.iov_len = *packet_len,
+		.iov_base = data
+	};
+	struct msghdr msg = {
+		.msg_control = NULL,
+		.msg_controllen = 0,
+		.msg_iovlen = 1,
+		.msg_iov = &iov,
+		.msg_name = (struct sockaddr *) &sin,
+		.msg_namelen = sizeof (sin),
+		.msg_flags = 0
+	};
+
+
+	priv = (nm_node_inode_private *)inode->u.generic_ip;
+	node = &priv->node;
+	if (!sock) {
+		spin_lock(&priv->net.sock_lock);
+		/* TODO: sock refcounting... i think we can get/put the sk */
+		sock = priv->net.sock;
+		if (!sock) {
+			/* BUG FIX: the original returned here with
+			 * sock_lock still held */
+			spin_unlock(&priv->net.sock_lock);
+			return -EINVAL;
+		}
+		spin_unlock(&priv->net.sock_lock);
+	}
+
+	memset (&sin, 0, sizeof (sin));
+	/* kernel-space buffer: widen the address limit around recvmsg */
+	oldfs = get_fs ();
+	set_fs (get_ds ());
+	error = sock_recvmsg (sock, &msg, *packet_len, msg.msg_flags);
+	set_fs (oldfs);
+
+	status = 0;
+	if (error < 0) {
+		if (error == -ERESTARTSYS) {
+			status = -EBADF;
+			netprintk ("Shutting down\n");
+		} else {
+			status = -EINVAL;
+			netprintk ("unable to recvmsg, error=%d\n", error);
+		}
+		goto bail;
+	} else {
+		*packet_len = iov.iov_len;
+		status = 0;
+		netprintk("woot. recevied len=%d\n", *packet_len);
+		if (!net_check_message_valid(data, *packet_len)) {
+			netprintk0("eeeek bad net message!\n");
+			status = -EINVAL;
+		}
+	}
+
+	//netprintk ("Received packet from: %d.%d.%d.%d\n",
+	//	   NIPQUAD (sin.sin_addr.s_addr));
+
+bail:
+	return status;
+} /* net_recv_tcp_msg */
+
+
+/*
+ * net_send_tcp_msg()
+ *
+ * Send @packet_len bytes to the node described by @inode over @sock
+ * (or over the node's cached socket when @sock is NULL).  Returns 0
+ * on success, -EINVAL on failure.
+ */
+int net_send_tcp_msg (struct inode *inode, struct socket *sock, void *data, u32 packet_len)
+{
+	int status = 0, error;
+	struct sockaddr_in sin;
+	mm_segment_t oldfs;
+	nm_node_inode_private *priv;
+	nm_node_info *node;
+
+	priv = (nm_node_inode_private *)inode->u.generic_ip;
+	node = &priv->node;
+	if (!sock) {
+		spin_lock(&priv->net.sock_lock);
+		/* TODO: sock refcounting... i think we can get/put the sk */
+		sock = priv->net.sock;
+		spin_unlock(&priv->net.sock_lock);
+	}
+
+	oldfs = get_fs ();
+	netprintk("Sending msg to node=%u, name=%s\n", node->node_num, node->node_name);
+	memset (&sin, 0, sizeof (sin));
+	sin.sin_family = net_ip_version_to_family(node->ifaces[0].ip_version);
+	sin.sin_addr.s_addr = node->ifaces[0].addr_u.ip_addr4;
+	sin.sin_port = node->ifaces[0].ip_port;
+
+
+	/* stays -EINVAL if there is no socket at all */
+	status = -EINVAL;
+	if (sock) {
+		struct iovec iov = {
+			.iov_base = data,
+			.iov_len = packet_len
+		};
+		struct msghdr msg = {
+			.msg_iov = &iov,
+			.msg_iovlen = 1,
+			.msg_control = NULL,
+			.msg_controllen = 0,
+			.msg_name = (struct sockaddr *) &sin,
+			.msg_namelen = sizeof (sin),
+			.msg_flags = 0
+		};
+
+		status = 0;
+		set_fs (get_ds ());
+		error = sock_sendmsg (sock, &msg, packet_len);
+		set_fs (oldfs);
+
+		if (error < 0) {
+			netprintk ("unable to sendmsg, error=%d\n", error);
+			status = -EINVAL;
+		}
+	}
+	if (status < 0)
+		netprintk ("bad status: %d\n", status);
+
+	/* BUG FIX: the original unconditionally reset status to 0 here,
+	 * silently swallowing every send error (and making the
+	 * "bad status" branch above dead weight).  Report it. */
+	return status;
+} /* net_send_tcp_msg */
+
+static spinlock_t net_msg_num_lock = SPIN_LOCK_UNLOCKED;
+static u64 net_msg_num = 1;
+
+/*
+ * net_send_message
+ *
+ * - this is probably the function you are looking for
+ * - it will package up the message for you, verifying that
+ *   the message handler is there and the length is ok,
+ *   connect to the other node if there is not already a
+ *   socket for it, and optionally wait on a status return
+ *   from the other node
+ * - all you need prior to this call is to have inited the
+ *   net stuff, to have a valid inode for the node to contact
+ *   in nm, and to have registered the message handler
+ */
+int net_send_message(u32 msg_type, u32 key, void *data, u32 len, struct inode *inode, int *status)
+{
+	int ret = 0, tmpret;
+	net_msg *msg = NULL;
+	net_msg_handler *handler = NULL;
+	u32 packet_len;
+	net_status_ctxt nsc;
+	wait_queue_t sleep;
+	wait_queue_head_t *sleep_head = NULL;
+	nm_node_inode_private *priv = NULL;
+	net_inode_private *net = NULL;
+
+	if (!inode || !inode->u.generic_ip) {
+		netprintk0("bad inode, cannot send message\n");
+		return -EINVAL;
+	}
+	priv = (nm_node_inode_private *)inode->u.generic_ip;
+	net = &priv->net;
+	spin_lock(&net->sock_lock);
+	if (!net->sock) {
+		spin_unlock(&net->sock_lock);
+		ret = net_init_tcp_sock(inode);
+		if (!(ret == 0 || ret == -EEXIST)) {
+			netprintk0("failed to create socket!");
+			return -EINVAL;
+		}
+	} else {
+		/* BUG FIX: the original unlocked unconditionally after
+		 * the if, double-unlocking when the !sock branch was
+		 * taken */
+		spin_unlock(&net->sock_lock);
+	}
+
+
+	spin_lock(&net_handler_lock);
+	handler = net_lookup_handler(msg_type, key);
+	spin_unlock(&net_handler_lock);
+
+	if (!handler) {
+		netprintk("no such message type: %u/%u\n", msg_type, key);
+		return -EINVAL;
+	}
+
+	/* non-zero return here means the length check failed */
+	if (net_handler_msg_len_ok(handler, len)) {
+		netprintk("len for message type %u incorrect: %u, should be %u\n",
+			  msg_type, len, handler->max_len);
+		ret = -EINVAL;
+		goto done;
+	}
+	packet_len = len + sizeof(net_msg);
+	msg = kmalloc(packet_len, GFP_KERNEL);
+	if (!msg) {
+		netprintk("failed to allocate %u bytes for message!\n", packet_len);
+		ret = -ENOMEM;
+		goto done;
+	}
+	memset(msg, 0, packet_len);
+	msg->magic = NET_MSG_MAGIC;
+	msg->data_len = len;
+	msg->msg_type = msg_type;
+	msg->key = key;
+	spin_lock(&net_msg_num_lock);
+	/* NOTE(review): net_msg_num is read and wrap-checked but never
+	 * incremented here -- every message appears to go out with the
+	 * same msg_num; confirm against net_do_status_return matching */
+	msg->msg_num = net_msg_num;
+	if (net_msg_num == NET_MSG_NUM_MAX) {
+		printk("eek! net_msg_num wrapping to 1 now...\n");
+		net_msg_num = 1;
+	}
+	spin_unlock(&net_msg_num_lock);
+	if (len > 0)
+		memcpy(&(msg->buf[0]), data, len);
+
+	/* does the caller want to wait for a simple status? */
+	if (status) {
+		msg->status = 1;
+
+		INIT_LIST_HEAD(&nsc.list);
+		init_waitqueue_head(&nsc.wq);
+		atomic_set(&nsc.woken, 0);
+		nsc.msg_num = msg->msg_num;
+		nsc.status = 0;
+		spin_lock(&net_status_lock);
+		list_add(&nsc.list, &net_status_list);
+		spin_unlock(&net_status_lock);
+
+		init_waitqueue_entry(&sleep, current);
+		spin_lock(&net->sock_lock);
+		if (!net->sock) {
+			spin_unlock(&net->sock_lock);
+			netprintk0("caller wanted status return but socket went away!\n");
+			/* BUG FIX: take nsc back off the status list and
+			 * fall through to done so the handler ref and msg
+			 * are released (all three were leaked before) */
+			spin_lock(&net_status_lock);
+			list_del(&nsc.list);
+			spin_unlock(&net_status_lock);
+			ret = -EINVAL;
+			goto done;
+		}
+		/* remember which queue we joined so we can leave the
+		 * same one below */
+		sleep_head = net->sock->sk->sleep;
+		add_wait_queue(sleep_head, &sleep);
+		spin_unlock(&net->sock_lock);
+	}
+{
+	union {
+		u64 q;
+		u32 hilo[2];
+	} u1, u2;
+	rdtsc(u1.hilo[0], u1.hilo[1]);
+
+
+	ret = net_send_tcp_msg(inode, NULL, msg, packet_len);
+
+	rdtsc(u2.hilo[0], u2.hilo[1]);
+	netprintk("net_send_tcp_msg took %llu cycles\n", u2.q-u1.q);
+	if (status) {
+		if (ret >= 0) {
+			/* wait on other node's handler */
+			rdtsc(u1.hilo[0], u1.hilo[1]);
+			tmpret = util_wait_atomic_eq(&nsc.wq, &nsc.woken, 1, 0);
+			rdtsc(u2.hilo[0], u2.hilo[1]);
+			netprintk("waiting on status took %llu cycles\n", u2.q-u1.q);
+			*status = nsc.status;
+			netprintk("status return requested, status is %d\n", *status);
+		} else {
+			netprintk("status return requested, and error returned from net_send_tcp_msg=%d\n", ret);
+			/* return bad status right away */
+			*status = ret;
+			/* BUG FIX: nsc was left dangling on the status
+			 * list on this path before */
+			spin_lock(&net_status_lock);
+			list_del(&nsc.list);
+			spin_unlock(&net_status_lock);
+		}
+		/* BUG FIX: the original removed from recv_sock's wait
+		 * queue (we added to net->sock's) and only on the
+		 * success path */
+		remove_wait_queue(sleep_head, &sleep);
+	} else if (ret < 0) {
+		netprintk("no status return requested, but error returned from net_send_tcp_msg=%d\n", ret);
+	}
+}
+
+done:
+	if (handler)
+		net_put_handler(handler);
+	if (msg)
+		kfree(msg);
+	return ret;
+}
+
+
+
+
+
+/*
+ * net_receive: receive from and dispatch all sockets with data pending
+ *
+ * Walks net_recv_list under net_list_lock; when a socket has queued
+ * data, the entry is temporarily removed from the list, the lock is
+ * dropped, the message is received and dispatched, and the entry is
+ * re-added (unless the link died) before restarting the scan from the
+ * top via start_over.  The lock is therefore held exactly while the
+ * list is being traversed, never across socket I/O.
+ */
+static int net_receive(void)
+{
+	struct inode *inode;
+	struct list_head *iter, *tmpiter;
+	nm_node_inode_private *priv;
+	net_inode_private *net;
+	struct socket *sock;
+	struct sock *sk;
+	net_msg hdr;
+	net_msg_handler *hnd = NULL;
+	int err = 0;
+	int tmperr;
+	union {
+		u64 q;
+		u32 hilo[2];
+	} u1, u2, u3, u4, u5, u6;
+
+
+start_over:
+	spin_lock(&net_list_lock);
+	list_for_each_safe(iter, tmpiter, &net_recv_list) {
+		net = list_entry(iter, net_inode_private, list);
+		priv = container_of(net, nm_node_inode_private, net);
+		inode = priv->inode;
+		sock = net->sock;
+
+		if (!sock) {
+			//netprintk0("no socket yet....\n");
+			continue;
+		}
+
+		/* drop dead/half-dead connections */
+		if (sock->sk->state != TCP_ESTABLISHED &&
+		    sock->sk->state != TCP_CLOSE_WAIT) {
+			netprintk0("kill it and continue\n");
+			net_dump_and_close_sock(sock, inode);
+			continue;
+		}
+
+		sk = sock->sk;
+		if (skb_queue_empty(&sk->receive_queue)) {
+			//netprintk("queue empty for %lu\n", inode->i_ino);
+			continue;
+		}
+
+
+
+		/* take this entry off the list and drop the lock while
+		 * we do socket I/O on it */
+		list_del(&net->list);
+		spin_unlock(&net_list_lock);
+
+		memset(&hdr, 0, sizeof(net_msg));
+		err = net_recv_message_header(&hdr, sock);
+		if (err < 0) {
+			netprintk0("failed to receive message!\n");
+			goto error;
+		}
+		netprintk("received message header... magic=%u type=%u key=%u\n",
+			  hdr.magic, hdr.msg_type, hdr.key);
+
+		if (hdr.magic == NET_MSG_STATUS_MAGIC) {
+rdtsc(u1.hilo[0], u1.hilo[1]);
+			net_dump_msg(sock, inode);
+			/* special type for returning message status */
+rdtsc(u2.hilo[0], u2.hilo[1]);
+			net_do_status_return(hdr.msg_num, hdr.status);
+rdtsc(u3.hilo[0], u3.hilo[1]);
+printk("status return: net_dump_msg took %llu, net_do_status_return took %llu\n", u2.q-u1.q, u3.q-u2.q);
+			err = 0;
+			goto error;
+		} else if (hdr.magic != NET_MSG_MAGIC) {
+			netprintk("bad magic: %u\n", hdr.magic);
+			goto error;
+		}
+
+		if (net_is_valid_error_type(hdr.msg_type)) {
+			/* do error handling */
+			netprintk("this is a standard error message: type=%d\n", hdr.msg_type);
+			if (hdr.msg_type == NET_ALREADY_CONNECTED) {
+				netprintk0("error: there is already a socket for this connection\n");
+			} else if (hdr.msg_type == NET_UNKNOWN_HOST) {
+				netprintk0("error: unknown host\n");
+			}
+			net_dump_msg(sock, inode);
+			err = 0;
+			goto error;
+		}
+
+		/* find a handler for it */
+		spin_lock(&net_handler_lock);
+		hnd = net_lookup_handler(hdr.msg_type, hdr.key);
+		spin_unlock(&net_handler_lock);
+
+		if (!hnd) {
+			err = -EINVAL;
+			netprintk0("no handler for message.\n");
+			goto error;
+		}
+rdtsc(u1.hilo[0], u1.hilo[1]);
+		err = net_dispatch_message(inode, sock, &hdr, hnd);
+rdtsc(u2.hilo[0], u2.hilo[1]);
+printk("net_dispatch_message took %llu\n", u2.q-u1.q);
+
+		/* if node has requested status return, do it now */
+		if (hdr.status) {
+#ifdef BIG_NET_MSG
+			/* swap src/dst for the reply */
+			u16 n = hdr.src_node;
+			hdr.src_node = hdr.dst_node;
+			hdr.dst_node = n;
+#endif
+			hdr.status = err;
+			hdr.magic = NET_MSG_STATUS_MAGIC;  // twiddle the magic
+rdtsc(u3.hilo[0], u3.hilo[1]);
+			tmperr = net_send_tcp_msg(inode, sock, &hdr, sizeof(net_msg));
+rdtsc(u4.hilo[0], u4.hilo[1]);
+printk("status return (net_send_tcp_msg) took %llu\n", u4.q-u3.q);
+		} else if (err < 0) {
+			netprintk("dispatch (%u/%u) returned %d\n",
+				  hdr.msg_type, hdr.key, err);
+		}
+
+
+		net_put_handler(hnd);
+
+		// re-add this socket
+		spin_lock(&net_list_lock);
+		list_add_tail(&net->list, &net_recv_list);
+		spin_unlock(&net_list_lock);
+		goto start_over;
+
+error:
+		if (err < 0) {
+			if (net_link_down(err, sock)) {
+				// do NOT re-add this socket
+				netprintk("link down! err=%d\n", err);
+				net_dump_and_close_sock(sock, inode);
+			} else {
+				netprintk("bad message... node=%lu.\n", inode->i_ino);
+				net_dump_msg(sock, inode);
+				// re-add this socket
+				spin_lock(&net_list_lock);
+				list_add_tail(&net->list, &net_recv_list);
+				spin_unlock(&net_list_lock);
+			}
+		} else {
+			// re-add this socket
+			spin_lock(&net_list_lock);
+			list_add_tail(&net->list, &net_recv_list);
+			spin_unlock(&net_list_lock);
+		}
+		goto start_over;
+	}
+	spin_unlock(&net_list_lock);
+
+	return 0;
+}
+
+
+/* Deliver a status value to the waiter registered for @msg_num, if any.
+ * The matching context is unhooked from net_status_list under the lock;
+ * the wakeup happens after the lock is dropped. */
+void net_do_status_return(u64 msg_num, s32 status)
+{
+	struct list_head *pos;
+	net_status_ctxt *ctxt = NULL;
+
+	spin_lock(&net_status_lock);
+	list_for_each(pos, &net_status_list) {
+		net_status_ctxt *cur = list_entry(pos, net_status_ctxt, list);
+		if (cur->msg_num != msg_num)
+			continue;
+		ctxt = cur;
+		ctxt->status = status;
+		atomic_set(&ctxt->woken, 1);
+		list_del(&ctxt->list);
+		break;
+	}
+	spin_unlock(&net_status_lock);
+
+	/* wake outside the spinlock, as before */
+	if (ctxt)
+		wake_up(&ctxt->wq);
+}
+
+/*
+ * net_dispatch_message()
+ *
+ * Read one full packet (header + payload) from @sock into the handler's
+ * buffer and invoke the handler callback on it.  The handler is marked
+ * in-use for the duration; an oversized payload is rejected with
+ * -EINVAL.  Returns the callback's result or a negative error.
+ */
+static int net_dispatch_message(struct inode *inode, struct socket *sock, net_msg *hdr, net_msg_handler *hnd)
+{
+	int ret = -EINVAL;
+	u32 len, packet_len;	/* BUGFIX: were int, but net_recv_tcp_msg()
+				 * takes a u32 * and the printk below uses %u */
+
+	len = hdr->data_len;
+	packet_len = len + sizeof(net_msg);
+
+	spin_lock(&hnd->lock);
+	if (net_handler_in_use(hnd)) {
+		netprintk0("EEEEEK! handler in use! bugbug\n");
+		spin_unlock(&hnd->lock);
+		return -EINVAL;
+	}
+	if (len > hnd->max_len) {
+		netprintk("eek! advertised message data len is too large %u (max: %u)\n",
+			  len, hnd->max_len);
+		spin_unlock(&hnd->lock);
+		return -EINVAL;
+	}
+	hnd->flags |= (1 << NET_HND_IN_USE);
+	spin_unlock(&hnd->lock);
+
+	/* pull the whole packet off the queue (the header was only peeked) */
+	memset(hnd->buf, 0, packet_len);
+	ret = net_recv_tcp_msg(inode, sock, hnd->buf, &packet_len);
+	if (ret < 0) {
+		netprintk("net_recv_tcp_msg returned: %d\n", ret);
+	} else {
+		net_num_dispatched++;
+		ret = (hnd->func)((net_msg *)hnd->buf, packet_len, hnd->data);
+	}
+
+	spin_lock(&hnd->lock);
+	hnd->flags &= ~(1 << NET_HND_IN_USE);
+	spin_unlock(&hnd->lock);
+
+	return ret;
+}
+
+
+
+/*
+ * net_accept_tcp_connections()
+ *
+ */
+/*
+ * net_accept_tcp_connections()
+ *
+ * Drain the listening socket's accept queue.  Each accepted connection
+ * is matched against the node manager by source IP: a known node gets
+ * the socket stored in its net_inode_private (unless one exists, in
+ * which case NET_ALREADY_CONNECTED is sent back); an unknown host is
+ * sent NET_UNKNOWN_HOST and dropped.  Returns the last accept/getname
+ * result.
+ */
+static int net_accept_tcp_connections(void)
+{
+	int error, slen;
+	struct sockaddr_in sin;
+	struct socket *sock;
+	struct inode *inode;
+
+	if (!recv_sock) {
+		netprintk0("no socket!\n");
+		return 0;
+	}
+
+	if (!recv_sock->sk->tp_pinfo.af_tcp.accept_queue) {
+		//netprintk0("no connections on the queue\n");
+		return 0;
+	}
+	error = 0;
+	while (error >= 0) {
+		sock = sock_alloc();
+		if (!sock)
+			break;
+
+		sock->type = recv_sock->type;
+		sock->ops = recv_sock->ops;
+		error = recv_sock->ops->accept(recv_sock, sock, O_NONBLOCK);
+		if (error < 0) {
+			sock_release(sock);
+			break;
+		}
+		if (sock->sk->state == TCP_CLOSE) {
+			sock_release(sock);
+			continue;
+		}
+
+		slen = sizeof(sin);
+		error = sock->ops->getname(sock, (struct sockaddr *) &sin, &slen, 1);
+		if (error < 0) {
+			/* BUGFIX: this path used to leak the accepted socket */
+			sock_release(sock);
+			break;
+		}
+
+		netprintk("attempt to connect from %u.%u.%u.%u:%04x\n",
+			  NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
+
+		inode = nm_get_node_by_ip(sin.sin_addr.s_addr);
+		if (inode) {
+			int exists = 1;
+			nm_node_inode_private *priv = inode->u.generic_ip;
+			net_inode_private *net = NULL;
+
+			if (priv) {
+				net = &priv->net;
+				netprintk("connect from known host: %s\n",
+					  priv->node.node_name);
+				if (ntohs(sin.sin_port) >= 1024)
+					netprintk("warning: connect from unprivileged port: %u.%u.%u.%u:%d\n",
+						  NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
+				/* publish the socket only if none exists yet */
+				spin_lock(&priv->net.sock_lock);
+				if (!priv->net.sock) {
+					netprintk("new sock, doesnt exist\n");
+					exists = 0;
+					priv->net.sock = sock;
+					/* wake the receive thread, not the current
+					 * process, when data arrives */
+					if (current != net_recv_task) {
+						netprintk("net_recv_task=%p... maybe i should add THAT instead\n", net_recv_task);
+						if (net_recv_task == NULL)
+							BUG();
+						init_waitqueue_entry(&priv->net.sleep, net_recv_task);
+					} else {
+						netprintk("process %p added to waitqueue\n", current);
+						init_waitqueue_entry(&priv->net.sleep, current);
+					}
+					add_wait_queue(sock->sk->sleep, &(priv->net.sleep));
+				}
+				spin_unlock(&priv->net.sock_lock);
+
+				if (exists) {
+					netprintk0("already a socket for this connection!\n");
+					net_send_error(sock, NET_ALREADY_CONNECTED);
+					net_dump_and_close_sock(sock, inode);
+				} else {
+					spin_lock(&net_list_lock);
+					netprintk("added inode %lu to net_recv_list\n", inode->i_ino);
+					if (list_empty(&net->list))
+						list_add_tail(&net->list, &net_recv_list);
+					spin_unlock(&net_list_lock);
+				}
+			}
+
+			iput(inode);
+		} else {
+			netprintk0("connect from unknown host...\n");
+			net_send_error(sock, NET_UNKNOWN_HOST);
+			/* NULL inode is fine: net_dump_and_close_sock checks it */
+			net_dump_and_close_sock(sock, inode);
+		}
+	}
+	return error;
+}
+
+
+/*
+ * net_send_error()
+ *
+ * Send a bare net_msg carrying one of the standard error types
+ * (NET_ALREADY_CONNECTED / NET_UNKNOWN_HOST) to the peer on @sock.
+ * Returns the sock_sendmsg() result (bytes sent or negative error).
+ */
+int net_send_error(struct socket *sock, u32 err_type)
+{
+	struct msghdr msg;
+	mm_segment_t oldfs;
+	struct iovec iov;
+	int len;
+	net_msg err;	/* BUGFIX: was "static" -- a single shared buffer would
+			 * be clobbered by concurrent callers; the struct is
+			 * small, so keep it on the stack */
+
+	if (!net_is_valid_error_type(err_type)) {
+		netprintk("bug! bad error type! %u\n", err_type);
+		return -EINVAL;
+	}
+	memset(&err, 0, sizeof(net_msg));
+	err.magic = NET_MSG_MAGIC;
+	err.msg_type = err_type;
+
+	msg.msg_name = 0;
+	msg.msg_namelen = 0;
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags = MSG_NOSIGNAL;
+	msg.msg_iov->iov_len = (__kernel_size_t)sizeof(net_msg);
+	msg.msg_iov->iov_base = (char*) &err;
+
+	/* sending from kernel memory: widen the address-limit check */
+	oldfs = get_fs(); set_fs(KERNEL_DS);
+	len = sock_sendmsg(sock, &msg, (size_t)(sizeof(net_msg)));
+	set_fs(oldfs);
+
+	return len;
+}
+
+
+/*
+ * net_recv_message_header()
+ *
+ * Peek at the next message header on @sock (MSG_PEEK: the bytes stay on
+ * the receive queue for the subsequent full read).  Returns the number
+ * of bytes received, -EBADF when interrupted by a signal (shutdown), or
+ * -EINVAL on any other receive error.
+ */
+static int net_recv_message_header(net_msg *hdr, struct socket *sock)
+{
+	int status;
+	mm_segment_t oldfs;
+	struct iovec iov = {
+		.iov_base = hdr,
+		.iov_len = sizeof(net_msg)
+	};
+	struct msghdr msg = {
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+		.msg_control = NULL,
+		.msg_controllen = 0,
+		.msg_name = 0, // (struct sockaddr *) &sin,
+		.msg_namelen = 0, // sizeof (sin),
+		.msg_flags = 0
+	};
+
+	oldfs = get_fs(); set_fs(KERNEL_DS);
+	status = sock_recvmsg(sock, &msg, sizeof(net_msg), MSG_PEEK);
+	set_fs(oldfs);
+
+	if (status < 0) {
+		if (status == -ERESTARTSYS) {
+			status = -EBADF;
+			netprintk ("Shutting down\n");
+		} else {
+			/* BUGFIX: log the real error before overwriting it;
+			 * previously this always printed -EINVAL */
+			netprintk ("unable to recvmsg, error=%d\n", status);
+			status = -EINVAL;
+		}
+	}
+	// error or bytes received
+	return status;
+}
+
+/* Drain and release @sock.  If @inode identifies a known node, detach
+ * the receive-thread wait-queue entry and clear the node's cached
+ * socket pointer first. */
+static void net_dump_and_close_sock(struct socket *sock, struct inode *inode)
+{
+	nm_node_inode_private *node_priv;
+
+	/* throw away anything still queued on the socket */
+	net_dump_msg(sock, inode);
+
+	if (sock->sk && inode) {
+		node_priv = inode->u.generic_ip;
+		if (node_priv) {
+			spin_lock(&node_priv->net.sock_lock);
+			remove_wait_queue(sock->sk->sleep, &node_priv->net.sleep);
+			node_priv->net.sock = NULL;
+			spin_unlock(&node_priv->net.sock_lock);
+		}
+	}
+
+	sock_release(sock);
+}
+
+/* Discard all pending data on @sock by reading it non-blocking into
+ * net_junk_buf until the receive queue is empty.  @inode is unused. */
+static void net_dump_msg(struct socket *sock, struct inode *inode)
+{
+	struct msghdr msg;
+	struct iovec iov;
+	mm_segment_t oldfs;
+	int bytes;
+
+	if (!sock->sk)
+		return;
+
+	do {
+		/* recvmsg may scribble on the msghdr/iovec, so rebuild
+		 * them on every pass */
+		msg.msg_name = 0;
+		msg.msg_namelen = 0;
+		msg.msg_iov = &iov;
+		msg.msg_iovlen = 1;
+		msg.msg_control = NULL;
+		msg.msg_controllen = 0;
+		msg.msg_flags = MSG_DONTWAIT;
+		iov.iov_base = net_junk_buf;
+		iov.iov_len = (__kernel_size_t)PAGE_SIZE;
+
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		bytes = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT);
+		set_fs(oldfs);
+	} while (bytes > 0);
+}
+
+
+/*
+ * net_init_tcp_sock()
+ *
+ * Create a TCP socket for the node described by @inode and actively
+ * connect to that node's first configured interface.  On success the
+ * socket is published in the node's net_inode_private, the receive
+ * thread is registered on the socket's wait queue, and the node is put
+ * on net_recv_list.  Returns 0 on success, -EEXIST if a socket already
+ * exists or is being created, or a negative error code.
+ */
+int net_init_tcp_sock(struct inode *inode)
+{
+	nm_node_inode_private *priv;
+	nm_node_info *node;
+	net_inode_private *net = NULL;
+	struct sockaddr_in myaddr, remoteaddr;
+	int err = -EINVAL;
+	int i;
+	struct sock *sk;
+	struct socket *sock = NULL;
+
+	priv = inode->u.generic_ip;
+	if (!priv) {
+		netprintk0("bad inode\n");
+		return -EINVAL;
+	}
+	net = &priv->net;
+	node = &priv->node;
+
+	if ((err = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
+		netprintk("can't create socket: err=%d\n", err);
+		return err;
+	}
+
+	/* claim the exclusive right to create this node's socket */
+	spin_lock(&net->sock_lock);
+	if (net->sock || net->flags & NET_FLAG_CREATING_SOCKET) {
+		netprintk("socket already created or creating for inode %lu\n", inode->i_ino);
+		spin_unlock(&net->sock_lock);
+		sock_release(sock);
+		return -EEXIST;
+	}
+	net->flags |= NET_FLAG_CREATING_SOCKET;
+	spin_unlock(&net->sock_lock);
+
+	memset(&myaddr, 0, sizeof(myaddr));
+	myaddr.sin_family = AF_INET;
+	myaddr.sin_port = htons(0);  // any port
+	err = sock->ops->bind(sock, (struct sockaddr *) &myaddr, sizeof(myaddr));
+	if (err < 0) {
+		/* BUGFIX: the bind result used to be silently overwritten by
+		 * the connect below; bail out instead of connecting a socket
+		 * that failed to bind */
+		netprintk("bind failed: err=%d\n", err);
+		goto out;
+	}
+
+	memset (&remoteaddr, 0, sizeof (remoteaddr));
+	remoteaddr.sin_family = net_ip_version_to_family(node->ifaces[0].ip_version);
+	remoteaddr.sin_addr.s_addr = node->ifaces[0].addr_u.ip_addr4;
+	remoteaddr.sin_port = node->ifaces[0].ip_port;
+
+	//netprintk("connecting new socket: ip %d.%d.%d.%d, port %d\n", NIPQUAD(remoteaddr.sin_addr.s_addr), remoteaddr.sin_port);
+	err = sock->ops->connect(sock, (struct sockaddr *) &remoteaddr,
+				 sizeof(remoteaddr), 0); /* TODO put this back! O_NONBLOCK); */
+	//netprintk("connect status %d\n", err);
+
+	if (err >= 0) {
+		/* connected immediately: publish the socket and hook the
+		 * receive thread onto its wait queue */
+		spin_lock(&net->sock_lock);
+		net->sock = sock;
+		net->flags &= ~NET_FLAG_CREATING_SOCKET;
+
+		netprintk0("1) ok this node is actively trying to connect, add to waitqueue\n");
+		if (current != net_recv_task) {
+			netprintk("net_recv_task=%p... maybe i should add THAT instead\n", net_recv_task);
+			if (net_recv_task == NULL)
+				BUG();
+			init_waitqueue_entry(&net->sleep, net_recv_task);
+		} else {
+			netprintk("process %p added to waitqueue\n", current);
+			init_waitqueue_entry(&net->sleep, current);
+		}
+		add_wait_queue(sock->sk->sleep, &net->sleep);
+
+		spin_unlock(&net->sock_lock);
+		goto out;
+	}
+
+	sk = sock->sk;
+	switch (err) {
+	case -EALREADY:
+	case -EINPROGRESS:
+
+		/* TODO: awful awful awful -- poll for up to ~10s waiting for
+		 * the three-way handshake to finish */
+		for (i=0; i<100; i++) {
+			/* Protect against TCP socket state changes */
+			lock_sock(sk);
+			if (sk->state == TCP_ESTABLISHED) {
+				release_sock(sk);
+				netprintk0("woo! connected...\n");
+				err = 0;
+				spin_lock(&net->sock_lock);
+				net->flags &= ~NET_FLAG_CREATING_SOCKET;
+				net->sock = sock;
+
+				netprintk0("2) ok this node is actively trying to connect, add to waitqueue\n");
+				if (current != net_recv_task) {
+					netprintk("net_recv_task=%p... maybe i should add THAT instead\n", net_recv_task);
+					if (net_recv_task == NULL)
+						BUG();
+					init_waitqueue_entry(&net->sleep, net_recv_task);
+				} else {
+					netprintk("process %p added to waitqueue\n", current);
+					init_waitqueue_entry(&net->sleep, current);
+				}
+				add_wait_queue(sock->sk->sleep, &net->sleep);
+
+				spin_unlock(&net->sock_lock);
+				break;
+			} else {
+				netprintk("waiting for connection: pass %d, state %d\n", i, sk->state);
+				/* TODO */
+#if 0
+				task->tk_timeout = RPC_CONNECT_TIMEOUT;
+				/* if the socket is already closing, delay briefly */
+				if ((1<<sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV))
+					task->tk_timeout = RPC_REESTABLISH_TIMEOUT;
+				rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL);
+#endif
+				/* TODO: this is awful... change it later */
+			}
+			release_sock(sk);
+			util_sleep(100);
+		}
+		break;
+	case -ECONNREFUSED:
+	case -ECONNRESET:
+	case -ENOTCONN:
+		netprintk("conn refused, reset or not connected\n");
+		break;
+	default:
+		/* Report myriad other possible returns. If this file
+		 * system is soft mounted, just error out, like Solaris. */
+		netprintk("error %d connecting to server\n", err);
+		/* TODO */
+#if 0
+		/* This will prevent anybody else from connecting */
+		rpc_delay(task, RPC_REESTABLISH_TIMEOUT);
+		task->tk_status = status;
+#endif
+		break;
+	}
+
+out:
+	if (err < 0) {
+		/* failure: clear the creating flag and drop the socket */
+		if (net) {
+			spin_lock(&net->sock_lock);
+			if (net->sock)
+				netprintk0("wha?! there's a socket there already!!!!\n");
+			net->flags &= ~NET_FLAG_CREATING_SOCKET;
+			spin_unlock(&net->sock_lock);
+		}
+		if (sock)
+			sock_release(sock);
+	} else {
+		/* add this inode to the receive list, if not already */
+		spin_lock(&net_list_lock);
+		if (list_empty(&net->list))
+			list_add_tail(&net->list, &net_recv_list);
+		spin_unlock(&net_list_lock);
+	}
+
+	return err;
+}
+
+
+
+/*
+ * net_init_tcp_recv_sock()
+ *
+ */
+/*
+ * net_init_tcp_recv_sock()
+ *
+ * Create the global listening socket (recv_sock), bind it to ip_port on
+ * INADDR_ANY and put it in the listen state.  Returns 0 on success or a
+ * negative error code.
+ */
+static int net_init_tcp_recv_sock(void)
+{
+	struct sockaddr_in sin;
+	int status = -EINVAL;
+
+	/* Create Receive Socket */
+	status = sock_create(net_ip_version_to_family(ip_version),
+			     SOCK_STREAM, IPPROTO_TCP,
+			     &recv_sock);
+	if (status < 0) {
+		netprintk ("unable to create socket, error=%d", status);
+		goto bail;
+	}
+
+
+	/* Bind Receive Socket */
+	memset(&sin, 0, sizeof(sin));
+	sin.sin_family = net_ip_version_to_family(ip_version);
+	sin.sin_addr.s_addr = htonl(INADDR_ANY);
+	sin.sin_port = ip_port;	/* already network byte order (see ntohs below) */
+
+	status = recv_sock->ops->bind(recv_sock,
+				      (struct sockaddr *)&sin,
+				      sizeof(sin));
+	if (status < 0) {
+		netprintk ("unable to bind socket to port %d, error=%d",
+			   ntohs(ip_port), status);
+		/* BUGFIX: bail out here -- previously execution fell through
+		 * and called listen() on the unbound socket */
+		goto bail;
+	}
+
+	/* !!! dunno about these... */
+	recv_sock->sk->reuse = 1;
+	status = recv_sock->ops->listen(recv_sock, 64);
+
+bail:
+	return status;
+} /* net_init_tcp_recv_sock */
+
+
+/* Tear down the global listening socket, if it was ever created. */
+static void net_release_tcp_sock(void)
+{
+	if (!recv_sock)
+		return;
+
+	sock_release (recv_sock);
+	recv_sock = NULL;
+}
+
+
+/* module entry/exit points */
+module_init (net_driver_entry);
+module_exit (net_driver_exit);
Added: trunk/cluster/tcp.h
===================================================================
--- trunk/cluster/tcp.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/tcp.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,236 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * tcp.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_TCP_H
+#define CLUSTER_TCP_H
+
+#include <linux/socket.h>
+#ifdef __KERNEL__
+#include <net/sock.h>
+#else
+#include <sys/socket.h>
+#endif
+#include <linux/inet.h>
+#include <linux/in.h>
+
+#include "nodemanager.h"
+
+
+#ifdef __KERNEL__
+
+#define NET_DISP_THREAD_MS 5000 /* TODO */
+#define NET_RECV_THREAD_MS 5000 /* TODO */
+
+#ifdef BIG_NET_MSG
+/* Wire header for inter-node messages -- "big" variant, carrying
+ * explicit source/destination node numbers and wider fields. */
+#define NET_MSG_MAGIC ((u32)0xbc0ffa55)
+#define NET_MSG_STATUS_MAGIC ((u32)0xbc0ffa56)	/* marks status-return replies */
+#define NET_MSG_NUM_MAX ((u64)0xffffffffffffffffULL)
+typedef struct _net_msg
+{
+	__u32 magic;		/* NET_MSG_MAGIC or NET_MSG_STATUS_MAGIC */
+	__u32 data_len;		/* bytes of payload following this header */
+	__u16 src_node;
+	__u16 dst_node;
+	__u32 msg_type;		/* matched against registered handlers ... */
+	__u32 key;		/* ... together with this key */
+	__s32 status;		/* nonzero requests a status return; carries it back */
+	__u64 msg_num;
+	__u8 buf[0];		/* payload (old-style flexible array member) */
+} net_msg;
+#else
+
+/* Wire header -- compact variant, no node numbers. */
+#define NET_MSG_MAGIC ((u16)0xfa55)
+#define NET_MSG_STATUS_MAGIC ((u16)0xfa56)
+#define NET_MSG_NUM_MAX ((u32)0xffffffffUL)
+typedef struct _net_msg
+{
+	__u16 magic;
+	__u16 data_len;
+	__u16 msg_type;
+	__s16 status;
+	__u32 key;
+	__u32 msg_num;
+	__u8 buf[0];
+} net_msg;
+
+#endif
+
+/* Handler callback: invoked with the full packet (header + payload). */
+typedef int (net_msg_handler_func)(net_msg *msg, u32 len, void *data);
+
+/* A registered message handler, looked up by (msg_type, key). */
+typedef struct _net_msg_handler
+{
+	struct list_head list;
+	u32 msg_type;
+	u32 key;
+	net_msg_handler_func *func;
+	void *data;		/* opaque cookie passed through to func */
+	net_msg hdr;
+	u32 max_len;		/* largest payload this handler accepts */
+	void *buf;		/* buffer the incoming packet is read into */
+	spinlock_t lock;
+	atomic_t refcnt;
+	int flags;		/* bit mask of NET_HND_* bits */
+} net_msg_handler;
+
+/* Per-message wait state for a sender expecting a status return. */
+typedef struct _net_status_ctxt
+{
+	struct list_head list;
+	s32 status;		/* filled in by net_do_status_return() */
+	u64 msg_num;
+	wait_queue_head_t wq;
+	atomic_t woken;
+} net_status_ctxt;
+
+void net_do_status_return(u64 msg_num, s32 status);
+
+/* no clue for these yet... */
+#define NET_MIN_MSG_LEN (0)
+#define NET_MAX_MSG_LEN (8192)
+
+
+/* standard error message types (see net_send_error) */
+#define NET_ALREADY_CONNECTED 2
+#define NET_UNKNOWN_HOST 3
+
+
+static inline int net_is_valid_error_type(u32 err_type)
+{
+ if (err_type == NET_ALREADY_CONNECTED ||
+ err_type == NET_UNKNOWN_HOST)
+ return 1;
+ return 0;
+}
+
+/* Bit positions for net_msg_handler.flags. */
+enum {
+	NET_HND_VAR_LEN = 0,	/* handler accepts variable-length payloads */
+	NET_HND_IN_USE,		/* handler is currently dispatching a message */
+};
+
+#define net_handler_variable_len(h)  ((h)->flags & (1 << NET_HND_VAR_LEN))
+#define net_handler_in_use(h)   ((h)->flags & (1 << NET_HND_IN_USE))
+
+/* NOTE(review): despite the name, this appears to return nonzero when
+ * the length is *invalid* (len > max_len for variable-length handlers,
+ * len != max_len otherwise).  Check callers to confirm the intended
+ * polarity before relying on it. */
+static inline int net_handler_msg_len_ok(net_msg_handler *handler, u32 len)
+{
+	return (net_handler_variable_len(handler) ?
+		len > handler->max_len : len != handler->max_len);
+}
+
+
+/* Map an on-disk ip_version (network byte order) to a protocol family. */
+static inline int net_ip_version_to_family(u16 ip_version)
+{
+	printk("ip_version passed: %u, host byteorder: %u\n", ip_version, ntohs(ip_version));
+	/* NOTE(review): the unconditional return below short-circuits the
+	 * IPv4/IPv6 switch, so everything after it is dead code --
+	 * presumably a debugging stopgap until IPv6 is sorted out. */
+	return PF_INET;
+	switch (ntohs(ip_version)) {
+		case 4:
+			return PF_INET;
+		case 6:
+			return PF_INET6;
+		default:
+			BUG();
+	}
+
+	return 4;
+}
+
+
+
+/* TODO: figure this out.... */
+static inline int net_link_down(int err, struct socket *sock)
+{
+ if (sock) {
+ if (sock->sk->state != TCP_ESTABLISHED &&
+ sock->sk->state != TCP_CLOSE_WAIT)
+ return 1;
+ }
+
+ if (err >= 0)
+ return 0;
+ switch (err) {
+ /* ????????????????????????? */
+ case -ERESTARTSYS:
+ case -EBADF:
+ /* When the server has died, an ICMP port unreachable
+ * message prompts ECONNREFUSED. */
+ case -ECONNREFUSED:
+ case -ENOTCONN:
+ case -ECONNRESET:
+ case -EPIPE:
+ return 1;
+ }
+ return 0;
+}
+
+/* Global driver lifecycle states (net_driver_state). */
+enum {
+	NET_DRIVER_UNINITED,
+	NET_DRIVER_READY,
+};
+
+
+int net_register_handler(u32 msg_type, u32 key, int flags,
+ u32 max_len, net_msg_handler_func *func, void *data, void *buf);
+net_msg * net_package_message(u32 msg_type, u32 key, void *data, u32 len);
+int net_recv_tcp_msg (struct inode *inode, struct socket *sock, void *data, u32 *packet_len);
+int net_send_tcp_msg (struct inode *inode, struct socket *sock, void *data, u32 packet_len);
+int net_send_error(struct socket *sock, u32 err_type);
+int net_init_tcp_sock(struct inode *inode);
+int net_send_message(u32 msg_type, u32 key, void *data, u32 len, struct inode *inode, int *status);
+int net_broadcast_message(u32 msg_type, u32 key, void *data, u32 len, struct inode *group);
+net_msg_handler * net_lookup_handler(u32 msg_type, u32 key);
+
+#endif /* __KERNEL__ */
+
+/* ioctl argument for NET_IOC_ACTIVATE / NET_IOC_GETSTATE. */
+typedef struct _net_ioc
+{
+	u32 status;
+} net_ioc;
+
+/* ioctl argument for the group-service (GSD) ioctls. */
+typedef struct _gsd_ioc
+{
+	int fd;
+	int namelen;		/* bytes of name[] actually used */
+	char name[NM_MAX_NAME_LEN+1];
+	int status;
+} gsd_ioc;
+
+#define NET_IOC_MAGIC          'O'
+#define NET_IOC_ACTIVATE       _IOR(NET_IOC_MAGIC, 1, net_ioc)
+#define NET_IOC_GETSTATE       _IOR(NET_IOC_MAGIC, 2, net_ioc)
+#define GSD_IOC_CREATE_GROUP   _IOR(NET_IOC_MAGIC, 3, gsd_ioc)
+#define GSD_IOC_ADD_GROUP_NODE _IOR(NET_IOC_MAGIC, 4, gsd_ioc)
+
+/* GSD message type and its action codes */
+#define GSD_MESSAGE   130
+#define GSD_ACTION_ADD_GROUP        (0x01)
+#define GSD_ACTION_ADD_GROUP_NODE   (0x02)
+
+/* Payload of a GSD_MESSAGE: sender, requested action, group/node name. */
+typedef struct _gsd_message
+{
+	u16 from;		/* sending node number */
+	u8 action;		/* GSD_ACTION_* */
+	u8 namelen;
+	u8 name[NM_MAX_NAME_LEN];
+} gsd_message;
+
+#endif /* CLUSTER_TCP_H */
Added: trunk/cluster/test.c
===================================================================
--- trunk/cluster/test.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/test.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,811 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * test.c
+ *
+ * test module
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#endif
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <net/sock.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/proc_fs.h>
+
+#include <asm/uaccess.h>
+
+#include "dlm_compat.h"
+#include "util.h"
+#include "dlmcommon.h"
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+#include "dlmmod.h"
+
+#include "compat_libfs.h"
+
+/* printk wrappers that tag test output with the calling pid */
+#define testprintk(x, arg...)    printk("TEST: (%d) " x, current->pid, ##arg)
+#define testprintk0(x)           printk("TEST: (%d) " x, current->pid)
+
+
+static ssize_t write_net_register(struct file *file, char *buf, size_t size);
+static ssize_t write_net_send(struct file *file, char *buf, size_t size);
+static ssize_t write_net_get_num(struct file *file, char *buf, size_t size);
+static ssize_t write_dlm_poop(struct file *file, char *buf, size_t size);
+static ssize_t write_dlm_poop2(struct file *file, char *buf, size_t size);
+static ssize_t write_dlm_poop3(struct file *file, char *buf, size_t size);
+static ssize_t write_dlm_register(struct file *file, char *buf, size_t size);
+
+enum {
+ TEST_Root = 1,
+ TEST_NetRegister,
+ TEST_NetSend,
+ TEST_NetGetNum,
+ TEST_DLMPoop,
+ TEST_DLMPoop2,
+ TEST_DLMPoop3,
+ TEST_DLMRegister
+};
+
+extern spinlock_t net_state_lock;
+extern u32 net_driver_state;
+extern struct file_operations transaction_ops;
+extern char *nm_nodename;
+extern u32 net_num_dispatched;
+
+
+static void test_teardown(void);
+
+int test_small_msg_func(net_msg *msg, u32 len, void *data);
+
+static int test_net_send(int arg);
+static int test_net_register(int arg);
+static int test_net_get_num(int arg);
+static int test_dlm_poop(int arg);
+static int test_dlm_poop2(int arg);
+static int test_dlm_poop3(int arg);
+static int test_dlm_register(int arg);
+
+
+
+/* Handler callback for the test message type: just log what arrived
+ * along with the registered private data. */
+int test_small_msg_func(net_msg *msg, u32 len, void *data)
+{
+	int *priv = data;
+
+	testprintk("got a message! type=%u, len=%u, data=%d\n", msg->msg_type, len, *priv);
+	return 0;
+}
+
+/* arbitrary type/key used to register the test handler */
+#define TEST_MSG_TYPE1    87654321
+#define TEST_KEY1         12378534
+
+int test_data1 = 723123123;	/* private data handed to test_small_msg_func */
+
+/*
+ * test_net_register()
+ *
+ * Exercise handler registration and outbound socket setup against node
+ * number @arg: register the test handler, verify that a duplicate
+ * registration is refused, then create a TCP socket to the node.
+ */
+static int test_net_register(int arg)
+{
+	int ret;
+	struct inode *dest_inode;
+	u16 dest_node_num = (u16)arg;
+
+	testprintk("running test_net_register: will contact node %u\n", dest_node_num);
+
+	dest_inode = nm_get_node_by_num(dest_node_num);
+	if (!dest_inode) {
+		testprintk("eeek! failed to find node %u\n", dest_node_num);
+		return 0;
+	}
+	{
+		/* fish the node's name out of the inode's dentry for logging */
+		struct dentry *dentry = list_entry(dest_inode->i_dentry.next, struct dentry, d_alias);
+		testprintk("found node %u, name %*s\n", dest_node_num, dentry->d_name.len, dentry->d_name.name);
+	}
+
+	ret = net_register_handler(TEST_MSG_TYPE1, TEST_KEY1, 0, 0,
+				   test_small_msg_func, &test_data1, NULL);
+	if (ret < 0) {
+		testprintk0("eek! register failed!\n");
+		return -1;
+	}
+	/* a second registration with the same type/key must fail */
+	ret = net_register_handler(TEST_MSG_TYPE1, TEST_KEY1, 0, 0,
+				   test_small_msg_func, &test_data1, NULL);
+	if (ret >= 0) {
+		testprintk0("eek! re-register was supposed to fail but didnt!!!\n");
+		return -1;
+	}
+	testprintk0("sweet. re-register failed like it should have.\n");
+
+	testprintk0("creating socket now...\n");
+	ret = net_init_tcp_sock(dest_inode);
+	if (ret < 0) {
+		testprintk0("failed to make socket\n");
+		return -1;
+	}
+	testprintk("net_init_tcp_sock returned %d\n", ret);
+
+	testprintk0("leaving test_net_register!\n");
+	return 0;
+}
+
+
+/*
+ * test_net_send()
+ *
+ * Send one TEST_MSG_TYPE1 message (no payload) to node number @arg.
+ * Busy-polls (100ms sleeps) until the driver reports NET_DRIVER_READY
+ * before sending.
+ */
+static int test_net_send(int arg)
+{
+	int ret;
+	struct inode *dest_inode;
+	u16 dest_node_num = (u16)arg;
+
+	testprintk("running test_net_send: will contact node %u\n", dest_node_num);
+
+	dest_inode = nm_get_node_by_num(dest_node_num);
+	if (!dest_inode) {
+		testprintk("eeek! failed to find node %u\n", dest_node_num);
+		return 0;
+	}
+	{
+		/* fish the node's name out of the inode's dentry for logging */
+		struct dentry *dentry = list_entry(dest_inode->i_dentry.next, struct dentry, d_alias);
+		testprintk("found node %u, name %*s\n", dest_node_num, dentry->d_name.len, dentry->d_name.name);
+	}
+
+	testprintk0("packaging message now\n");
+
+	{
+		testprintk0("woo! made a message packet... lets try sending it to ourself...\n");
+		testprintk0("waiting for socket to be created\n");
+		/* spin until the net driver is fully up */
+		while (1) {
+			printk(".");
+			spin_lock(&net_state_lock);
+			if (net_driver_state == NET_DRIVER_READY) {
+				spin_unlock(&net_state_lock);
+				break;
+			}
+			spin_unlock(&net_state_lock);
+			util_sleep (100);
+		}
+		printk(". done... let's go!\n");
+		ret = net_send_message(TEST_MSG_TYPE1, TEST_KEY1, NULL, 0, dest_inode, NULL);
+		testprintk("sent!!!! ret=%d\n", ret);
+	}
+	testprintk0("leaving test_net_send!\n");
+	return 0;
+
+}
+
+/* Report how many messages the net layer has dispatched so far. */
+static int test_net_get_num(int arg)
+{
+	u32 dispatched = net_num_dispatched;
+
+	testprintk("number of messages dispatched: %u\n", dispatched);
+	return 0;
+}
+
+void my_ast(void *data);
+void my_bast(void *data, int blocked_type);
+
+dlm_lockstatus lksb1, lksb2;	/* lock status blocks for the dlm tests */
+wait_queue_head_t convert_wq;	/* woken by my_bast */
+atomic_t convert_flag;
+
+dlm_ctxt *the_dlm = NULL;	/* dlm context dumped by test_dlm_poop(); set elsewhere */
+
+/*
+ * test_dlm_poop()
+ *
+ * Dump the state of the saved dlm context (the_dlm), if any.  The
+ * #if 0 block below is an older single-process lock/convert exercise
+ * kept for reference.
+ */
+static int test_dlm_poop(int arg)
+{
+	testprintk("calling dlm_dump_dlm(%p)\n", the_dlm);
+	if (the_dlm)
+		dlm_dump_dlm(the_dlm);
+
+#if 0
+	dlm_ctxt *dlm;
+	dlm_status status;
+	void *data1 = &lksb1;
+	void *data2 = &lksb2;
+	int ret;
+
+	/* NOTE(review): the second memset repeats lksb1; lksb2 is never cleared */
+	memset(&lksb1, 0, sizeof(dlm_lockstatus));
+	memset(&lksb1, 0, sizeof(dlm_lockstatus));
+
+	testprintk0("calling dlm_register_domain...\n");
+	dlm = dlm_register_domain("mylittledomain", "grupo2", 0x6543abcd);
+	testprintk("dlm_register_domain returned %p\n", dlm);
+
+	testprintk0("calling dlmlock...\n");
+	status = dlmlock(dlm, LKM_EXMODE, &lksb1, 0, "lock1", my_ast, data1, my_bast);
+	testprintk("dlmlock returned %d. lksb.status=%d, lock=%p\n", status, lksb1.status, lksb1.lockid);
+
+	testprintk0("calling dlmlock to do a convert...\n");
+	status = dlmlock(dlm, LKM_PRMODE, &lksb1, LKM_CONVERT, "lock1", my_ast, data1, my_bast);
+	testprintk("dlmlock returned %d\n", status);
+
+	init_waitqueue_head (&convert_wq);
+	atomic_set(&convert_flag, 0);
+
+	testprintk0("calling second dlmlock...\n");
+	status = dlmlock(dlm, LKM_EXMODE, &lksb2, 0, "lock1", my_ast, data2, my_bast);
+	testprintk("dlmlock returned %d. lksb.status=%d, lock=%p\n", status, lksb2.status, lksb2.lockid);
+
+	testprintk0("sleeping now!\n");
+	ret = util_wait_atomic_eq(&convert_wq, &convert_flag, 1, 20000);
+	testprintk("wait returned %d\n", ret);
+
+	testprintk0("calling dlmlock to do a convert the blocking lock to NL...\n");
+	status = dlmlock(dlm, LKM_NLMODE, &lksb1, LKM_CONVERT, "lock1", my_ast, data2, my_bast);
+	testprintk("dlmlock returned %d\n", status);
+
+	testprintk0("sleeping\n");
+	util_sleep(10000);
+	testprintk0("DONE!\n");
+#endif
+	return 0;
+}
+
+
+/* AST callback for the single-process dlm test: log which lock the
+ * grant/convert completed on. */
+void my_ast(void *data)
+{
+	dlm_lockstatus *lksb = data;
+	dlm_lock *lk = lksb->lockid;
+	dlm_lock_resource *resource = lk->lockres;
+
+	testprintk("AST!!!: lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n",
+		   lksb, lk, resource, resource->lockname.len, resource->lockname.name, lk->type);
+}
+
+/* Blocking-AST callback: log the contending request, then release
+ * whoever is waiting on convert_wq. */
+void my_bast(void *data, int blocked_type)
+{
+	dlm_lockstatus *lksb = data;
+	dlm_lock *lk = lksb->lockid;
+	dlm_lock_resource *resource = lk->lockres;
+
+	testprintk("BAST!!!: blocked=%d, lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n",
+		   blocked_type, lksb, lk, resource, resource->lockname.len, resource->lockname.name, lk->type);
+	atomic_set(&convert_flag, 1);
+	wake_up(&convert_wq);
+}
+
+atomic_t finish;	/* set to 1 to tell the stress-test threads to stop */
+
+/* per-thread context for the dlm stress test */
+typedef struct _poo
+{
+	struct task_struct *task;
+	dlm_ctxt *dlm;
+	dlm_lockstatus *lksb;
+	wait_queue_head_t wq;		/* woken by my_ast2 / my_bast2 */
+	atomic_t ast_flag;
+	atomic_t bast_flag;
+	struct completion complete;	/* signalled when the thread exits */
+} poo;
+void my_ast2(void *data);
+void my_bast2(void *data, int blocked_type);
+int test_dlm_thread(void *data);
+atomic_t asts_fired, basts_fired;	/* global counters reported by test_dlm_poop2 */
+
+/* lets a 64-bit rdtsc value be read as two 32-bit halves */
+typedef union _my_timing_t
+{
+	__u64 q;
+	__u32 lohi[2];
+} my_timing_t;
+
+
+/*
+ * test_dlm_poop2()
+ *
+ * dlm stress test: register a domain, spawn two kernel threads that
+ * ping-pong "lock1" between EX and NL, and report ast/bast counts every
+ * 30 seconds until one of the threads sets `finish`.
+ */
+static int test_dlm_poop2(int arg)
+{
+	dlm_ctxt *dlm;
+	int pid1, pid2;
+	poo *poo1, *poo2;
+	my_timing_t t1, t2, t3;
+
+	/* BUGFIX: the allocations were used without being checked */
+	poo1 = kmalloc(sizeof(poo), GFP_KERNEL);
+	if (!poo1)
+		return -ENOMEM;
+testprintk("poo1=%p\n", poo1);
+	poo2 = kmalloc(sizeof(poo), GFP_KERNEL);
+	if (!poo2) {
+		kfree(poo1);
+		return -ENOMEM;
+	}
+testprintk("poo2=%p\n", poo2);
+
+	atomic_set(&finish, 0);
+	atomic_set(&asts_fired, 0);
+	atomic_set(&basts_fired, 0);
+
+	testprintk0("calling dlm_register_domain...\n");
+	dlm = dlm_register_domain("mylittledomain", "grupo2", 0x6543abcd);
+	testprintk("dlm_register_domain returned %p\n", dlm);
+	if (!dlm) {
+		/* registration failed: nothing was handed off yet */
+		kfree(poo1);
+		kfree(poo2);
+		return -EINVAL;
+	}
+
+	poo1->dlm = dlm;
+	poo2->dlm = dlm;
+	init_completion(&poo1->complete);
+	init_completion(&poo2->complete);
+
+	rdtsc(t1.lohi[0], t1.lohi[1]);
+	pid1 = kernel_thread (test_dlm_thread, poo1, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+	if (pid1 < 0) {
+		printk("unable to launch thread, error=%d", pid1);
+		/* BUGFIX: no thread started, both contexts can be freed */
+		kfree(poo1);
+		kfree(poo2);
+		return -EINVAL;
+	}
+	pid2 = kernel_thread (test_dlm_thread, poo2, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+	if (pid2 < 0) {
+		printk("unable to launch thread, error=%d", pid2);
+		/* poo1 is now owned by the running first thread and cannot be
+		 * freed here (that would be a use-after-free); poo2 can.
+		 * NOTE(review): poo1 still leaks on this path, as it always
+		 * did -- reclaiming it would require joining thread 1. */
+		kfree(poo2);
+		return -EINVAL;
+	}
+	testprintk("dlm threads running for %s...\n", dlm->name);
+	testprintk("poo1->dlm=%p, ->task=%p\n", poo1->dlm, poo1->task);
+	testprintk("poo2->dlm=%p, ->task=%p\n", poo2->dlm, poo2->task);
+	/* poll until one of the threads flags completion */
+	while (1) {
+		util_sleep(30000);
+		rdtsc(t3.lohi[0], t3.lohi[1]);
+		testprintk("another 30 sec: asts=%d, basts=%d, diff=%llu\n",
+			   atomic_read(&asts_fired), atomic_read(&basts_fired),
+			   t3.q - t1.q);
+		if (atomic_read(&finish)==1) {
+			printk("finish set!\n");
+			break;
+		}
+	}
+	wait_for_completion (&poo1->complete);
+	wait_for_completion (&poo2->complete);
+	rdtsc(t2.lohi[0], t2.lohi[1]);
+	kfree(poo1);
+	kfree(poo2);
+	testprintk("leaving! asts=%d, basts=%d, diff=%llu\n", atomic_read(&asts_fired), atomic_read(&basts_fired),
+		   t2.q - t1.q);
+	return 0;
+}
+
+
+/*
+ * test_dlm_thread()
+ *
+ * Body of each stress-test thread: acquire "lock1" in EX, wait for a
+ * bast from the sibling thread, downconvert to NL, reconvert to EX,
+ * and loop until `finish` is set.  Always signals mypoo->complete on
+ * the way out so the parent's wait_for_completion() cannot hang.
+ */
+int test_dlm_thread(void *data)
+{
+	dlm_status status;
+	int ret;
+	dlm_lockstatus *lksb;
+	poo *mypoo = data;
+	dlm_ctxt *dlm = mypoo->dlm;
+
+	testprintk("mypoo=%p, dlm=%p\n", mypoo, dlm);
+	mypoo->task = current;
+	/* BUGFIX: check the allocation -- the old code memset/deref'd a
+	 * possibly-NULL pointer.  kfree(NULL) in the exit path is a no-op. */
+	lksb = kmalloc(sizeof(dlm_lockstatus), GFP_KERNEL);
+	mypoo->lksb = lksb;
+	if (!lksb) {
+		testprintk0("failed to allocate lksb\n");
+		goto leave;
+	}
+	memset(lksb, 0, sizeof(dlm_lockstatus));
+
+	init_waitqueue_head(&mypoo->wq);
+
+	atomic_set(&mypoo->ast_flag, 0);
+	atomic_set(&mypoo->bast_flag, 0);
+
+	testprintk("mypoo=%p, dlm=%p, task=%p\n", mypoo, dlm, mypoo->task);
+
+	testprintk("calling dlmlock(%p, %d, %p, 0, \"lock1\", %p, %p, %p) to create the lock...\n",
+		   dlm, LKM_EXMODE, lksb, my_ast2, data, my_bast2);
+	status = dlmlock(dlm, LKM_EXMODE, lksb, 0, "lock1", my_ast2, data, my_bast2);
+	testprintk("dlmlock returned %d. lksb.status=%d, lock=%p\n", status, lksb->status, lksb->lockid);
+
+again:
+	/* wait for the EX grant to be signalled by my_ast2 */
+	ret = util_wait_atomic_eq(&mypoo->wq, &mypoo->ast_flag, 1, 0);
+	if (ret < 0) {
+		testprintk("1: waiting on ast converting to EX, ret=%d, type=%d, convtype=%d\n",
+			   ret, lksb->lockid->type, lksb->lockid->convert_type);
+		if (ret == -EINTR)
+			goto leave;
+		goto again;
+	}
+	atomic_set(&mypoo->ast_flag, 0);
+
+
+
+wait_bast:
+	/* hold EX until the sibling thread basts us */
+	ret = util_wait_atomic_eq(&mypoo->wq, &mypoo->bast_flag, 1, 0);
+	if (ret < 0) {
+		testprintk("2: waiting on bast after converting to EX, ret=%d, type=%d, convtype=%d\n",
+			   ret, lksb->lockid->type, lksb->lockid->convert_type);
+		if (ret == -EINTR)
+			goto leave;
+		goto wait_bast;
+	}
+	atomic_set(&mypoo->bast_flag, 0);
+
+
+
+
+	atomic_set(&mypoo->ast_flag, 0);
+
+	/* downconvert to NL so the sibling can take EX */
+	status = dlmlock(dlm, LKM_NLMODE, lksb, LKM_CONVERT, "lock1", my_ast2, data, my_bast2);
+
+wait_ast:
+	ret = util_wait_atomic_eq(&mypoo->wq, &mypoo->ast_flag, 1, 0);
+	if (ret < 0) {
+		testprintk("3: waiting on ast converting to NL, ret=%d, type=%d, convtype=%d\n",
+			   ret, lksb->lockid->type, lksb->lockid->convert_type);
+		if (ret == -EINTR)
+			goto leave;
+		goto wait_ast;
+	}
+
+	atomic_set(&mypoo->ast_flag, 0);
+	atomic_set(&mypoo->bast_flag, 0);
+
+	/* reconvert to EX and go around again */
+	status = dlmlock(dlm, LKM_EXMODE, lksb, LKM_CONVERT, "lock1", my_ast2, data, my_bast2);
+
+
+	if (atomic_read(&finish) == 0)
+		goto again;
+leave:
+
+	atomic_set(&finish, 1);		/* tell the sibling to stop too */
+	kfree(mypoo->lksb);
+	complete (&mypoo->complete);
+	testprintk0("exiting thread\n");
+	return 0;
+}
+
+
+void my_ast2(void *data)
+{
+ poo *mypoo = data;
+ dlm_lockstatus *l = mypoo->lksb;
+ dlm_lock *lock = l->lockid;
+ dlm_lock_resource *res = lock->lockres;
+
+ atomic_inc(&asts_fired);
+ //testprintk("AST!!!: lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n",
+ // l, lock, res, res->lockname.len, res->lockname.name, lock->type);
+ atomic_set(&mypoo->ast_flag, 1);
+ wake_up(&mypoo->wq);
+}
+
+void my_bast2(void *data, int blocked_type)
+{
+ poo *mypoo = data;
+ dlm_lockstatus *l = mypoo->lksb;
+ dlm_lock *lock = l->lockid;
+ dlm_lock_resource *res = lock->lockres;
+
+ atomic_inc(&basts_fired);
+ //testprintk("BAST!!!: blocked=%d, lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n",
+ // blocked_type, l, lock, res, res->lockname.len, res->lockname.name, lock->type);
+ atomic_set(&mypoo->bast_flag, 1);
+ wake_up(&mypoo->wq);
+}
+
+wait_queue_head_t wq3;
+atomic_t ast_flag3, bast_flag3;
+dlm_lockstatus *lksb3;
+
+void my_bast3(void *data, int blocked_type);
+void my_ast3(void *data);
+
+void my_ast3(void *data)
+{
+ dlm_lock *lock = lksb3->lockid;
+ dlm_lock_resource *res = lock->lockres;
+
+ atomic_inc(&asts_fired);
+ testprintk("AST!!!: lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n",
+ lksb3, lock, res, res->lockname.len, res->lockname.name, lock->type);
+ atomic_set(&ast_flag3, 1);
+ wake_up(&wq3);
+}
+
+void my_bast3(void *data, int blocked_type)
+{
+ dlm_lock *lock = lksb3->lockid;
+ dlm_lock_resource *res = lock->lockres;
+
+ atomic_inc(&basts_fired);
+ testprintk("BAST!!!: blocked=%d, lockstatus=%p, lock=%p, lockres=%p, lockname=%*s, type=%d\n",
+ blocked_type, lksb3, lock, res, res->lockname.len, res->lockname.name, lock->type);
+ atomic_set(&bast_flag3, 1);
+ wake_up(&wq3);
+}
+
+static int test_dlm_poop3(int arg)
+{
+ dlm_ctxt *dlm;
+ dlm_status status;
+ int ret, i;
+ my_timing_t t1, t2, t3, t4;
+
+ atomic_set(&finish, 0);
+ atomic_set(&asts_fired, 0);
+ atomic_set(&basts_fired, 0);
+
+ dlm = dlm_register_domain("mylittledomain", "grupo2", 0x6543abcd);
+
+ lksb3 = kmalloc(sizeof(dlm_lockstatus), GFP_KERNEL);
+ memset(lksb3, 0, sizeof(dlm_lockstatus));
+
+ init_waitqueue_head(&wq3);
+
+ atomic_set(&ast_flag3, 0);
+ atomic_set(&bast_flag3, 0);
+
+ i = 0;
+ rdtsc(t1.lohi[0], t1.lohi[1]);
+
+ /* CREATE -> NL */
+ testprintk0("creating lock\n");
+rdtsc(t3.lohi[0], t3.lohi[1]);
+ status = dlmlock(dlm, LKM_NLMODE, lksb3, 0, "lock1", my_ast3, NULL, my_bast3);
+
+ while (1) {
+ testprintk("%d: waiting on ast\n", i);
+ ret = util_wait_atomic_eq(&wq3, &ast_flag3, 1, 0);
+ if (ret == -EINTR)
+ break;
+rdtsc(t4.lohi[0], t4.lohi[1]);
+testprintk("%d: ->NL took: %llu\n", i, t4.q - t3.q);
+ testprintk("%d: no bast for NL\n", i);
+
+ atomic_set(&ast_flag3, 0);
+ atomic_set(&bast_flag3, 0);
+
+ if (i == 10) {
+ testprintk("%d: reached 10, goodbye\n", i);
+ break;
+ }
+ dlm_dump_dlm(dlm);
+
+ /* CONVERT -> EX */
+ testprintk("%d: converting dlmlock->EX\n", i);
+rdtsc(t3.lohi[0], t3.lohi[1]);
+ status = dlmlock(dlm, LKM_EXMODE, lksb3, LKM_CONVERT, "lock1", my_ast3, NULL, my_bast3);
+
+ testprintk("%d: waiting on ast\n", i);
+ ret = util_wait_atomic_eq(&wq3, &ast_flag3, 1, 0);
+ if (ret == -EINTR)
+ break;
+rdtsc(t4.lohi[0], t4.lohi[1]);
+testprintk("%d: ->EX took: %llu\n", i, t4.q - t3.q);
+ atomic_set(&ast_flag3, 0);
+
+ testprintk("%d: waiting on bast\n", i);
+ ret = util_wait_atomic_eq(&wq3, &bast_flag3, 1, 0);
+ if (ret == -EINTR)
+ break;
+ atomic_set(&ast_flag3, 0);
+ atomic_set(&bast_flag3, 0);
+
+ /* CONVERT -> NL */
+ testprintk("%d: converting dlmlock->NL\n", i);
+rdtsc(t3.lohi[0], t3.lohi[1]);
+ status = dlmlock(dlm, LKM_NLMODE, lksb3, LKM_CONVERT, "lock1", my_ast3, NULL, my_bast3);
+
+ /* WAIT ON AST AGAIN */
+ i++;
+ }
+
+ /* DOWNCONVERT LAST TIME */
+ /* TODO: replace with dlmunlock once implemented */
+ status = dlmlock(dlm, LKM_NLMODE, lksb3, LKM_CONVERT, "lock1", my_ast3, NULL, my_bast3);
+
+ kfree(lksb3);
+
+ rdtsc(t2.lohi[0], t2.lohi[1]);
+ testprintk("leaving! asts=%d, basts=%d, diff=%llu\n", atomic_read(&asts_fired), atomic_read(&basts_fired),
+ t2.q - t1.q);
+ return 0;
+}
+
+
+static int test_dlm_register(int arg)
+{
+ dlm_ctxt *dlm;
+
+ testprintk0("calling dlm_register_domain...\n");
+ dlm = dlm_register_domain("mylittledomain", "grupo2", 0x6543abcd);
+ testprintk("dlm_register_domain returned %p\n", dlm);
+
+ the_dlm = dlm;
+ testprintk0("leaving!\n");
+ return 0;
+}
+
+
+
+
+/*
+ * module stuff
+ */
+
+
+static ssize_t write_net_register(struct file *file, char *buf, size_t size)
+{
+ int arg = 0, tmpret, ret;
+ if (size > 0)
+ arg = simple_strtoul(buf, NULL, 0);
+ printk("calling test_net_register(%d)\n", arg);
+ tmpret = test_net_register(arg);
+ ret = sprintf(buf, "test_net_register(%d) returned: %d\n", arg, tmpret);
+ return ret;
+}
+static ssize_t write_net_send(struct file *file, char *buf, size_t size)
+{
+ int arg = 0, tmpret, ret;
+ if (size > 0)
+ arg = simple_strtoul(buf, NULL, 0);
+ printk("calling test_net_send(%d)\n", arg);
+ tmpret = test_net_send(arg);
+ ret = sprintf(buf, "test_net_send(%d) returned: %d\n", arg, tmpret);
+ return ret;
+}
+static ssize_t write_net_get_num(struct file *file, char *buf, size_t size)
+{
+ int arg = 0, tmpret, ret;
+ if (size > 0)
+ arg = simple_strtoul(buf, NULL, 0);
+ printk("calling test_net_get_num(%d)\n", arg);
+ tmpret = test_net_get_num(arg);
+ ret = sprintf(buf, "test_net_get_num(%d) returned: %d\n", arg, tmpret);
+ return ret;
+}
+static ssize_t write_dlm_poop(struct file *file, char *buf, size_t size)
+{
+ int arg = 0, tmpret, ret;
+ if (size > 0)
+ arg = simple_strtoul(buf, NULL, 0);
+ printk("calling test_dlm_poop(%d)\n", arg);
+ tmpret = test_dlm_poop(arg);
+ ret = sprintf(buf, "test_dlm_poop(%d) returned: %d\n", arg, tmpret);
+ return ret;
+}
+static ssize_t write_dlm_poop2(struct file *file, char *buf, size_t size)
+{
+ int arg = 0, tmpret, ret;
+ if (size > 0)
+ arg = simple_strtoul(buf, NULL, 0);
+ printk("calling test_dlm_poop2(%d)\n", arg);
+ tmpret = test_dlm_poop2(arg);
+ ret = sprintf(buf, "test_dlm_poop2(%d) returned: %d\n", arg, tmpret);
+ return ret;
+}
+static ssize_t write_dlm_poop3(struct file *file, char *buf, size_t size)
+{
+ int arg = 0, tmpret, ret;
+ if (size > 0)
+ arg = simple_strtoul(buf, NULL, 0);
+ printk("calling test_dlm_poop3(%d)\n", arg);
+ tmpret = test_dlm_poop3(arg);
+ ret = sprintf(buf, "test_dlm_poop3(%d) returned: %d\n", arg, tmpret);
+ return ret;
+}
+
+static ssize_t write_dlm_register(struct file *file, char *buf, size_t size)
+{
+ int arg = 0, tmpret, ret;
+ if (size > 0)
+ arg = simple_strtoul(buf, NULL, 0);
+ printk("calling test_dlm_register(%d)\n", arg);
+ tmpret = test_dlm_register(arg);
+ ret = sprintf(buf, "test_dlm_register(%d) returned: %d\n", arg, tmpret);
+ return ret;
+}
+
+
+
+
+
+/*----------------------------------------------------------------------------*/
+/*
+ * populating the filesystem.
+ */
+static int test_fill_super(struct super_block * sb, void * data, int silent)
+{
+ int ret, sz;
+ TA_write_ops *ops;
+ static struct tree_descr test_files[] = {
+ [TEST_NetRegister] = {"net-register", &transaction_ops, S_IWUSR},
+ [TEST_NetSend] = {"net-send", &transaction_ops, S_IWUSR},
+ [TEST_NetGetNum] = {"net-get-num", &transaction_ops, S_IWUSR},
+ [TEST_DLMPoop] = {"dlm-poop", &transaction_ops, S_IWUSR},
+ [TEST_DLMPoop2] = {"dlm-poop2", &transaction_ops, S_IWUSR},
+ [TEST_DLMPoop3] = {"dlm-poop3", &transaction_ops, S_IWUSR},
+ [TEST_DLMRegister] = {"dlm-register", &transaction_ops, S_IWUSR},
+ /* last one */ {""}
+ };
+
+ sz = sizeof(test_files) / sizeof(struct tree_descr);
+ ops = kmalloc(sizeof(TA_write_ops) + (sz * sizeof(TA_write_op *)), GFP_KERNEL);
+ if (!ops)
+ return -ENOMEM;
+
+ memset(ops, 0, sizeof(TA_write_ops) + (sz * sizeof(TA_write_op *)));
+ ops->num_ops = sz;
+ ops->write_op[TEST_NetRegister] = write_net_register;
+ ops->write_op[TEST_NetSend] = write_net_send;
+ ops->write_op[TEST_NetGetNum] = write_net_get_num;
+ ops->write_op[TEST_DLMPoop] = write_dlm_poop;
+ ops->write_op[TEST_DLMPoop2] = write_dlm_poop2;
+ ops->write_op[TEST_DLMPoop3] = write_dlm_poop3;
+ ops->write_op[TEST_DLMRegister] = write_dlm_register;
+
+ printk("calling simple_fill_super...\n");
+ ret = simple_fill_super(sb, 0x12beAf00L, test_files);
+ if (ret >= 0) {
+ TA_GENERIC_SB_MEMBER(sb) = ops;
+ } else {
+ kfree(ops);
+ }
+ return ret;
+}
+
+static struct super_block *test_read_super (struct super_block *sb, void *data, int silent)
+{
+ printk("welcome to test_read_super!!!\n");
+ return (test_fill_super(sb, data, silent) < 0) ? NULL : sb;
+}
+
+
+static DECLARE_FSTYPE (test_fs_type, "test", test_read_super, FS_SINGLE|FS_LITTER);
+
+static int __init init_test(void)
+{
+ int retval;
+ void *ret;
+
+ printk("loading test module: nodename is %s\n", nm_nodename);
+
+ ret = proc_mkdir("cluster/test", 0);
+ printk("proc_mkdir of cluster/test returned %p\n", ret);
+
+ printk("calling register_filesystem\n");
+ retval = register_filesystem(&test_fs_type);
+ printk("done calling register_filesystem: ret=%d\n", retval);
+ if (retval) {
+ printk("oopsy that did not work\n");
+ test_teardown();
+ } else
+ printk("woot. good to go.\n");
+ return retval;
+}
+
+static void __exit exit_test(void)
+{
+ test_teardown();
+ unregister_filesystem(&test_fs_type);
+ printk("unloading test module\n");
+}
+
+static void test_teardown(void)
+{
+ printk("removing cluster/test\n");
+ remove_proc_entry("cluster/test", NULL);
+}
+
+
+
+
+
+MODULE_LICENSE("GPL");
+module_init(init_test)
+module_exit(exit_test)
Added: trunk/cluster/util.c
===================================================================
--- trunk/cluster/util.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/util.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,349 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * util.c
+ *
+ * General purpose code
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#include "warning_hack.h"
+
+#include "dlm_compat.h"
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include "util.h"
+
+static void util_timeout_func(unsigned long data);
+
+/* block all but 'mask' sigs, optionally saving off our previous
+ * signal state. */
+void util_block_sigs(sigset_t *oldsigs, unsigned long mask)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+ sigset_t tmpsig;
+
+ siginitsetinv(&tmpsig, mask);
+ sigprocmask(SIG_BLOCK, &tmpsig, oldsigs);
+#else
+#ifdef HAVE_NPTL
+	spin_lock_irq (&current->sighand->siglock);
+	if (oldsigs)
+		*oldsigs = current->blocked;
+	siginitsetinv (&current->blocked, mask);
+	recalc_sigpending ();
+	spin_unlock_irq (&current->sighand->siglock);
+#else
+	spin_lock_irq (&current->sigmask_lock);
+	if (oldsigs)
+		*oldsigs = current->blocked;
+	siginitsetinv (&current->blocked, mask);
+	recalc_sigpending (current);
+	spin_unlock_irq (&current->sigmask_lock);
+#endif
+#endif
+}
+
+void util_unblock_sigs(sigset_t newsig)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+ sigprocmask(SIG_SETMASK, &newsig, NULL);
+#else
+#ifdef HAVE_NPTL
+	spin_lock_irq (&current->sighand->siglock);
+	current->blocked = newsig;
+	recalc_sigpending ();
+	spin_unlock_irq (&current->sighand->siglock);
+#else
+	spin_lock_irq (&current->sigmask_lock);
+	current->blocked = newsig;
+	recalc_sigpending (current);
+	spin_unlock_irq (&current->sigmask_lock);
+#endif
+#endif
+}
+
+/*
+ * util_daemonize()
+ *
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+/* yes, len is unused but kept here for backwards compatibility. */
+void util_daemonize (char *name, int len, int shutdown_sigs)
+{
+ sigset_t tmpsig;
+
+ daemonize (name);
+
+ if (shutdown_sigs) {
+ /* Unblock SIGKILL, SIGSTOP, SIGHUP and SIGINT */
+ sigemptyset(&tmpsig);
+ sigaddsetmask(&tmpsig, SHUTDOWN_SIGS);
+ sigprocmask(SIG_UNBLOCK, &tmpsig, NULL);
+ }
+
+ return;
+} /* util_daemonize */
+#else
+void util_daemonize (char *name, int len, int shutdown_sigs)
+{
+ daemonize ();
+ reparent_to_init ();
+
+ if (len > 0) {
+ if (len > 15)
+ BUG();
+ strncpy (current->comm, name, len);
+ current->comm[len] = '\0';
+ }
+
+ if (shutdown_sigs)
+ util_block_sigs(NULL, SHUTDOWN_SIGS);
+ else
+ util_block_sigs(NULL, 0);
+ return;
+} /* util_daemonize */
+#endif
+
+/*
+ * util_sleep()
+ *
+ * The interval time is in milliseconds
+ *
+ * This function needs to be removed.
+ * Instead call schedule_timeout() directly and handle signals.
+ */
+int util_sleep (__u32 ms)
+{
+ __u32 numJiffies;
+
+ /* 10ms = 1 jiffy, minimum resolution is one jiffy */
+ numJiffies = ms * HZ / 1000;
+ numJiffies = (numJiffies < 1) ? 1 : numJiffies;
+
+ set_current_state (TASK_INTERRUPTIBLE);
+ numJiffies = schedule_timeout (numJiffies);
+
+ return 0;
+} /* util_sleep */
+
+/* prefetch has been declared to allow to build in debug mode */
+#ifdef DEBUG
+#ifndef ARCH_HAS_PREFETCH
+inline void prefetch (const void *x)
+{;
+}
+#endif
+#endif
+
+
+static void util_timeout_func(unsigned long data)
+{
+ util_timeout *to = (util_timeout *)data;
+
+ to->timed_out = 1;
+ wake_up(&to->wait);
+}
+
+void util_init_timeout(util_timeout *to)
+{
+ init_timer(&to->timer);
+ to->timer.data = (unsigned long)to;
+ to->timer.function = util_timeout_func;
+ to->timed_out = 0;
+ init_waitqueue_head(&to->wait);
+}
+
+void util_set_timeout(util_timeout *to, __u32 timeout)
+{
+ __u32 how_long;
+
+ if (!timeout) {
+ to->timed_out = 1;
+ return ;
+ }
+
+ how_long = (timeout * HZ / 1000);
+ if (how_long < 1)
+ how_long = 1;
+
+ to->timer.expires = jiffies + how_long;
+ add_timer(&to->timer);
+}
+
+void util_clear_timeout(util_timeout *to)
+{
+ del_timer_sync(&to->timer);
+}
+
+int __util_wait_atomic_eq(wait_queue_head_t *wq, atomic_t *var, int val, int ms)
+{
+ int ret;
+ util_timeout timeout;
+ DECLARE_WAITQUEUE(wait, current);
+ DECLARE_WAITQUEUE(to_wait, current);
+
+ util_init_timeout(&timeout);
+
+ if (ms) {
+ util_set_timeout(&timeout, ms);
+ if (timeout.timed_out) {
+ util_clear_timeout(&timeout);
+ }
+ }
+ add_wait_queue(wq, &wait);
+ add_wait_queue(&timeout.wait, &to_wait);
+ do {
+ ret = 0;
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (atomic_read(var)==val)
+ break;
+ ret = -ETIMEDOUT;
+ if (timeout.timed_out)
+ break;
+ schedule();
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+ } while (1);
+
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(wq, &wait);
+ remove_wait_queue(&timeout.wait, &to_wait);
+
+ if (ms)
+ util_clear_timeout(&timeout);
+
+ return ret;
+}
+
+/* resizable (using chained pages) array stuff */
+void util_init_rarray(util_rarray *arr, u16 elem_size)
+{
+ arr->elements = 0;
+ arr->max_elem = 0;
+ arr->elem_size = elem_size;
+ arr->page = NULL;
+}
+
+
+void * util_rarray_idx_to_slot(util_rarray *arr, int idx)
+{
+ int pgnum, pgoff;
+ util_rarray_page *pg;
+
+ if (idx >= arr->max_elem) {
+ printk("eek! asked for %d, but only %d elements\n",
+ idx, arr->max_elem);
+ return NULL;
+ }
+
+ pgnum = idx / UTIL_RARRAY_ELEM_PER_BUF(arr);
+ pgoff = idx % UTIL_RARRAY_ELEM_PER_BUF(arr);
+ pg = (util_rarray_page *)arr->page;
+ while (pgnum--) {
+ if (!pg->next) {
+ printk("eeek! no next page!\n");
+ return NULL;
+ }
+ pg = pg->next;
+ }
+ return (((char *)pg->buf) + (pgoff * arr->elem_size));
+}
+
+
+void * util_get_new_rarray_slot(util_rarray *arr, int *index)
+{
+ char *tmp;
+ util_rarray_page *newpg, *pg;
+
+ if (arr->max_elem == arr->elements) {
+ newpg = (util_rarray_page *) __get_free_page(GFP_KERNEL);
+ if (!newpg) {
+ printk("could not grow array!!!\n");
+ return NULL;
+ }
+ memset(newpg, 0, PAGE_SIZE);
+ if (arr->page) {
+ pg = (util_rarray_page *)arr->page;
+ while (pg->next)
+ pg = pg->next;
+ pg->next = newpg;
+ } else
+ arr->page = newpg;
+ arr->max_elem += UTIL_RARRAY_ELEM_PER_BUF(arr);
+ }
+
+ tmp = util_rarray_idx_to_slot(arr, arr->elements);
+ if (tmp) {
+ if (index)
+ *index = arr->elements;
+ arr->elements++;
+ }
+ return tmp;
+}
+
+
+int util_add_to_rarray(util_rarray *arr, void *new)
+{
+ void *slot;
+ int idx;
+
+ slot = util_get_new_rarray_slot(arr, &idx);
+ if (slot == NULL)
+ return -EINVAL;
+ memcpy(slot, new, arr->elem_size);
+ return idx;
+}
+
+/* resizes rarray to at least newelem elements */
+int util_resize_rarray(util_rarray *arr, int newelem)
+{
+ util_rarray_page *newpg, *pg;
+
+ printk("util_resize_rarray: newsize=%d, maxelem=%d\n", newelem, arr->max_elem);
+ while (arr->max_elem < newelem) {
+ newpg = (util_rarray_page *) __get_free_page(GFP_KERNEL);
+ if (!newpg) {
+ printk("could not grow array!!!\n");
+ return -ENOMEM;
+ }
+ memset(newpg, 0, PAGE_SIZE);
+ if (arr->page) {
+ pg = (util_rarray_page *)arr->page;
+ while (pg->next)
+ pg = pg->next;
+ pg->next = newpg;
+ } else
+ arr->page = newpg;
+ arr->max_elem += UTIL_RARRAY_ELEM_PER_BUF(arr);
+ }
+ printk("leaving util_resize_rarray: newsize=%d, maxelem=%d\n", newelem, arr->max_elem);
+
+ return 0;
+}
+
+
Added: trunk/cluster/util.h
===================================================================
--- trunk/cluster/util.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/util.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,109 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * util.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef CLUSTER_UTIL_H
+#define CLUSTER_UTIL_H
+
+#ifdef __KERNEL__
+#define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGHUP) | \
+ sigmask(SIGINT) | sigmask(SIGQUIT))
+
+/* timeout structure taken from Ben's aio.c */
+typedef struct _util_timeout {
+ struct timer_list timer;
+ int timed_out;
+ wait_queue_head_t wait;
+} util_timeout;
+
+void util_clear_timeout(util_timeout *to);
+void util_daemonize(char *name, int len, int shutdown_sigs);
+void util_init_timeout(util_timeout *to);
+void util_set_timeout(util_timeout *to, __u32 timeout);
+void util_show_stack(unsigned long *esp);
+void util_show_trace(unsigned long *stack);
+int util_sleep(__u32 ms);
+int __util_wait_atomic_eq(wait_queue_head_t *wq, atomic_t *var, int val, int ms);
+void util_block_sigs(sigset_t *oldsigs, unsigned long mask);
+void util_unblock_sigs(sigset_t newsig);
+
+/* exits when var == val, or on timeout */
+static inline int util_wait_atomic_eq(wait_queue_head_t *wq, atomic_t *var, int val, int timeout)
+{
+ int ret = 0;
+ if (atomic_read(var) != val)
+ ret = __util_wait_atomic_eq(wq, var, val, timeout);
+ return ret;
+}
+
+#endif /* __KERNEL__ */
+
+/* resizable array */
+typedef struct _util_rarray
+{
+ void *page;
+ u16 elements;
+ u16 max_elem;
+ u16 elem_size;
+ u16 reserved1;
+} util_rarray;
+
+#define UTIL_RARRAY_PAGE_BUF_SIZE (PAGE_SIZE - offsetof(util_rarray_page, buf))
+#define UTIL_RARRAY_ELEM_PER_BUF(r) ((UTIL_RARRAY_PAGE_BUF_SIZE) / (r)->elem_size)
+typedef struct _util_rarray_page
+{
+ void *next;
+ char buf[0];
+} util_rarray_page;
+
+void util_init_rarray(util_rarray *arr, u16 elem_size);
+void * util_get_new_rarray_slot(util_rarray *arr, int *index);
+int util_add_to_rarray(util_rarray *arr, void *new);
+void * util_rarray_idx_to_slot(util_rarray *arr, int idx);
+int util_resize_rarray(util_rarray *arr, int newelem);
+
+#ifdef __KERNEL__
+typedef struct _util_thread_info
+{
+ wait_queue_head_t thread_wq;
+ atomic_t woken;
+ struct task_struct *task;
+ struct completion complete;
+ int pid;
+} util_thread_info;
+
+
+static inline void util_thread_info_init(util_thread_info *info)
+{
+ init_waitqueue_head(&info->thread_wq);
+ atomic_set(&info->woken, 0);
+ info->task = NULL;
+ info->pid = -1;
+ init_completion(&info->complete);
+}
+#endif /* __KERNEL__ */
+
+#endif /* CLUSTER_UTIL_H */
Added: trunk/cluster/warning_hack.h
===================================================================
--- trunk/cluster/warning_hack.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/cluster/warning_hack.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,40 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * warning_hack.h
+ *
+ * just to get rid of stupid warnings
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Authors: Kurt Hackel
+ */
+
+#ifndef WARNING_HACK_H
+#define WARNING_HACK_H
+
+struct mem_dqinfo;
+struct request;
+
+extern __inline__ int generic_fls(int x);
+extern __inline__ int get_bitmask_order(unsigned int count);
+extern inline void mark_info_dirty(struct mem_dqinfo *info);
+extern inline int rq_data_dir(struct request *rq);
+
+
+#endif /* WARNING_HACK_H */
Modified: trunk/src/Makefile
===================================================================
--- trunk/src/Makefile 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/Makefile 2004-12-06 21:45:32 UTC (rev 1693)
@@ -69,7 +69,7 @@
buffer_head_io.c \
dcache.c \
dir.c \
- dlm.c \
+ dlmglue.c \
extent_map.c \
file.c \
heartbeat.c \
@@ -77,22 +77,18 @@
ioctl.c \
journal.c \
localalloc.c \
- lockres.c \
namei.c \
- nm.c \
proc.c \
+ slot_map.c \
suballoc.c \
super.c \
symlink.c \
sysfile.c \
util.c \
ver.c \
- volcfg.c \
vote.c
-
HFILES = \
ocfs2_fs.h \
- ocfs2_disk_dlm.h \
ocfs1_fs_compat.h \
ocfs.h \
ocfs_log.h \
@@ -102,7 +98,7 @@
alloc.h \
dcache.h \
dir.h \
- dlm.h \
+ dlmglue.h \
extent_map.h \
file.h \
heartbeat.h \
@@ -110,19 +106,16 @@
ioctl.h \
journal.h \
localalloc.h \
- lockres.h \
namei.h \
- nm.h \
proc.h \
+ slot_map.h \
suballoc.h \
super.h \
symlink.h \
sysfile.h \
util.h \
ver.h \
- volcfg.h \
vote.h
-
VERSION_FILES = $(CFILES) $(HFILES)
VERSION_SRC = ver.c
VERSION_PREFIX = OCFS
@@ -195,7 +188,7 @@
BASE_DEFINES = -DMODULE -DLINUX -D__KERNEL__
DEFINES += $(BASE_DEFINES) $(GLOBAL_DEFINES)
-INCLUDES = -I. -I$(KERNELINC) -I$(GCCINC)
+INCLUDES = -I. -I$(TOPDIR) -I$(KERNELINC) -I$(GCCINC)
CFLAGS = $(OPTS) $(MACH_CFLAGS) -pipe -nostdinc -fno-strict-aliasing \
-fno-common -fomit-frame-pointer $(MODVERSIONS) $(WARNINGS)
@@ -244,8 +237,8 @@
INSTALL_RULES = install-ocfs
install-ocfs: $(INSTALL_MODULE)
- $(TOPDIR)/mkinstalldirs $(DESTDIR)$(MODULEDIR)
- $(INSTALL_DATA) $< $(DESTDIR)$(MODULEDIR)/$<
+ $(TOPDIR)/mkinstalldirs $(DESTDIR)$(MODULEDIR)/ocfs2
+ $(INSTALL_DATA) $< $(DESTDIR)$(MODULEDIR)/ocfs2/$<
include $(TOPDIR)/Postamble.make
@@ -271,7 +264,7 @@
STAMP_DIR = $(OCFS_SRC_DIR)
include $(OCFS_SRC_DIR)/../Versioning.make
-EXTRA_CFLAGS += $(GLOBAL_DEFINES)
+EXTRA_CFLAGS += $(GLOBAL_DEFINES) -I$(CLUSTERINC)
CFLAGS_$(VERSION_OBJ) += $(VERDEFS)
Modified: trunk/src/alloc.c
===================================================================
--- trunk/src/alloc.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/alloc.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -35,7 +35,7 @@
#include "ocfs2.h"
#include "alloc.h"
-#include "dlm.h"
+#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
#include "localalloc.h"
@@ -210,7 +210,7 @@
/* we always use node zeros suballocator */
eb->h_suballoc_node = 0;
#else
- eb->h_suballoc_node = osb->node_num;
+ eb->h_suballoc_node = osb->slot_num;
#endif
eb->h_suballoc_bit = suballoc_bit_start;
eb->h_list.l_count = ocfs2_extent_recs_per_eb(osb->sb);
@@ -1170,12 +1170,16 @@
down_write(&OCFS_I(inode)->ip_alloc_sem);
+ target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
+ inode->i_size);
+
+ /* the extent map gets truncated in ocfs_do_truncate */
+ ocfs2_lvb_set_trunc_clusters(inode, target_i_clusters);
+
last_eb_bh = tc->tc_last_eb_bh;
tc->tc_last_eb_bh = NULL;
handle = tc->tc_handle;
- target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
- inode->i_size);
fe = (ocfs2_dinode *) fe_bh->b_data;
if (fe->id2.i_list.l_tree_depth) {
@@ -1236,6 +1240,14 @@
LOG_ERROR_STATUS(status);
goto bail;
}
+ /* Since we got our cluster lock from caller and we
+ * don't add it to the handle: */
+ ocfs_set_inode_lock_trans(osb->journal, inode);
+
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ status = ocfs_mark_inode_dirty(handle, inode, fe_bh);
+ if (status < 0)
+ LOG_ERROR_STATUS(status);
} else {
status = ocfs_extend_trans(handle, credits);
if (status < 0) {
@@ -1346,15 +1358,15 @@
}
ocfs_handle_add_inode(handle, ext_alloc_inode);
- status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 0,
- &ext_alloc_bh, ext_alloc_inode);
+ status = ocfs2_meta_lock(ext_alloc_inode,
+ handle,
+ &ext_alloc_bh,
+ 1);
if (status < 0) {
if (status != -EINTR)
LOG_ERROR_STATUS (status);
goto bail;
}
- ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE,
- 0, ext_alloc_inode);
}
data_alloc_inode = ocfs_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, -1);
@@ -1365,15 +1377,12 @@
}
ocfs_handle_add_inode(handle, data_alloc_inode);
- status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE,
- 0, &data_alloc_bh, data_alloc_inode);
+ status = ocfs2_meta_lock(data_alloc_inode, handle, &data_alloc_bh, 1);
if (status < 0) {
if (status != -EINTR)
LOG_ERROR_STATUS (status);
goto bail;
}
- ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE,
- 0, data_alloc_inode);
(*tc)->tc_bitmap_inode = data_alloc_inode;
(*tc)->tc_bitmap_bh = data_alloc_bh;
Modified: trunk/src/aops.c
===================================================================
--- trunk/src/aops.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/aops.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -83,7 +83,7 @@
/* We don't use the page cache to create symlink data, so if
* need be, copy it over from the buffer cache. */
- if (!buffer_uptodate(bh_result) && ocfs_inode_is_new(osb, inode)) {
+ if (!buffer_uptodate(bh_result) && ocfs_inode_is_new(inode)) {
buffer_cache_bh = sb_getblk(osb->sb,
fe->id2.i_list.l_recs[0].e_blkno + iblock);
if (!buffer_cache_bh) {
@@ -96,7 +96,7 @@
* the bh, even if it commits while we're doing the
* copy, the data is still good. */
if (buffer_jbd(buffer_cache_bh)
- && ocfs_inode_is_new(osb, inode)) {
+ && ocfs_inode_is_new(inode)) {
kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
if (!kaddr) {
LOG_ERROR_ARGS("couldn't kmap!\n");
@@ -125,12 +125,11 @@
}
static int ocfs_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+ struct buffer_head *bh_result, int create)
{
int err = -EIO;
u64 vbo = 0;
u64 p_blkno;
- int open_direct;
LOG_ENTRY_ARGS("(0x%p, %llu, 0x%p, %d)\n", inode,
(unsigned long long)iblock, bh_result, create);
@@ -140,8 +139,6 @@
inode, inode->i_ino);
}
- open_direct = OCFS_I(inode)->ip_open_flags & OCFS_OIN_OPEN_FOR_DIRECTIO;
-
if (S_ISLNK(inode->i_mode)) {
/* this always does I/O for some reason. */
err = ocfs_symlink_get_block (inode, iblock, bh_result,
@@ -162,13 +159,8 @@
}
spin_unlock(&OCFS_I(inode)->ip_lock);
- if (!open_direct)
- down_read(&OCFS_I(inode)->ip_node_extend_sem);
-
err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
NULL);
- if (!open_direct)
- up_read(&OCFS_I(inode)->ip_node_extend_sem);
if (err) {
LOG_ERROR_ARGS("Error %d from get_blocks(0x%p, %llu, 1, %llu, NULL)\n",
@@ -500,55 +492,39 @@
* called like this: dio->get_blocks(dio->inode, fs_startblk,
* fs_count, map_bh, dio->rw == WRITE);
*/
-static int ocfs_direct_IO_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks, struct buffer_head *bh_result, int create)
+static int ocfs_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
+ unsigned long max_blocks,
+ struct buffer_head *bh_result, int create)
{
int ret = -1;
int status;
- ocfs_super *osb = NULL;
u64 vbo_max; /* file offset, max_blocks from iblock */
u64 p_blkno;
int contig_blocks;
- int set_new = 0; /* flag */
unsigned char blocksize_bits;
if (!inode || !bh_result) {
- LOG_ERROR_STR("ocfs_direct_IO_get_blocks: inode or bh_result is null");
+ LOG_ERROR_STR("inode or bh_result is null");
return -EIO;
}
- osb = OCFS_SB(inode->i_sb);
blocksize_bits = inode->i_sb->s_blocksize_bits;
- /* make sure we're up to date... */
- if (atomic_read(&OCFS_I(inode)->ip_needs_verification)) {
- LOG_TRACE_STR ("ocfs_direct_IO_get_blocks: verify oin.");
- status = ocfs_verify_update_inode (osb, inode);
- if (status < 0) {
- LOG_TRACE_STR ("ocfs_verify_update_inode failed");
- ret = -EIO;
- goto bail;
- }
- }
/* This function won't even be called if the request isn't all
* nicely aligned and of the right size, so there's no need
* for us to check any of that. */
- vbo_max = (u64)(iblock + max_blocks) << blocksize_bits;
+ vbo_max = ((u64)iblock + max_blocks) << blocksize_bits;
- /* NOTE: create flag is set when we ?may? have to allocate some
- blocks for the file. */
- if (create &&
- (vbo_max > ocfs2_clusters_to_bytes(inode->i_sb,
- OCFS_I(inode)->ip_clusters))) {
- /* WARNING: How much do we really want to extend the file? */
- status = ocfs_extend_file(osb, inode, vbo_max);
- if (status < 0) {
- status = -ENOSPC;
- LOG_ERROR_STR("ocfs_direct_IO_get_blocks: failed to extend the file!");
- goto bail;
- }
- set_new = 1;
+ spin_lock(&OCFS_I(inode)->ip_lock);
+ if ((iblock + max_blocks) >
+ ocfs2_clusters_to_blocks(inode->i_sb,
+ OCFS_I(inode)->ip_clusters)) {
+ spin_unlock(&OCFS_I(inode)->ip_lock);
+ err = -EIO;
+ goto bail;
}
+ spin_unlock(&OCFS_I(inode)->ip_lock);
/* This figure out the size of the next contiguous block, and
* our logical offset */
@@ -561,16 +537,7 @@
goto bail;
}
- /* Do whatever we need to the buffer_head */
- if (set_new) {
- set_buffer_new(bh_result);
- /* Do we really want to set bh_result->b_blocknr here too? */
- bh_result->b_blocknr = p_blkno;
- } else {
- clear_buffer_new(bh_result);
- /* is the last argument here correct? */
- map_bh(bh_result, inode->i_sb, p_blkno);
- }
+ map_bh(bh_result, inode->i_sb, p_blkno);
/* make sure we don't map more than max_blocks blocks here as
that's all the kernel will handle at this point. */
Modified: trunk/src/dcache.c
===================================================================
--- trunk/src/dcache.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/dcache.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -38,7 +38,6 @@
#include "alloc.h"
#include "dcache.h"
#include "file.h"
-#include "vote.h"
#define OCFS_DEBUG_CONTEXT OCFS_DEBUG_CONTEXT_DCACHE
@@ -72,13 +71,16 @@
goto bail;
}
+ spin_lock(&OCFS_I(inode)->ip_lock);
/* did we or someone else delete this inode? */
if (INODE_DELETED(inode)) {
+ spin_unlock(&OCFS_I(inode)->ip_lock);
LOG_TRACE_ARGS("dentry_revalidate: inode (%llu) deleted, "
"returning false\n",
OCFS_I(inode)->ip_blkno);
goto bail;
}
+ spin_unlock(&OCFS_I(inode)->ip_lock);
#warning "should we do this for all files?"
if (S_ISDIR(inode->i_mode) && (!inode->i_nlink)) {
@@ -88,23 +90,8 @@
goto bail;
}
- if (ocfs_node_map_is_only(osb, &osb->publ_map, osb->node_num)) {
- LOG_TRACE_STR ("Only node alive. revalidate=true.");
- ret = 1;
- goto bail;
- }
-
- /* if I hold cache lock, no revalidate needed */
- if (ocfs_is_local_cache_lock(osb, inode)) {
- ret = 1;
- goto bail;
- }
-
ret = 1;
- /* TODO: Is this really necessary? */
- atomic_set(&OCFS_I(inode)->ip_needs_verification, 1);
-
bail:
LOG_EXIT_INT (ret);
Modified: trunk/src/dir.c
===================================================================
--- trunk/src/dir.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/dir.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -49,7 +49,7 @@
#include "alloc.h"
#include "dir.h"
-#include "dlm.h"
+#include "dlmglue.h"
#include "extent_map.h"
#include "file.h"
#include "inode.h"
@@ -85,7 +85,6 @@
struct inode *inode = filp->f_dentry->d_inode;
struct super_block * sb = inode->i_sb;
int have_disk_lock = 0;
- ocfs_super *osb = OCFS_SB(sb);
LOG_SET_CONTEXT(READDIR);
@@ -94,7 +93,7 @@
stored = 0;
bh = NULL;
- error = ocfs_acquire_lock_ro(osb, inode);
+ error = ocfs2_meta_lock(inode, NULL, NULL, 0);
if (error < 0) {
if (error != -EINTR)
LOG_ERROR_STATUS (error);
@@ -201,11 +200,8 @@
stored = 0;
bail:
- if (have_disk_lock) {
- error = ocfs_release_lock_ro (osb, inode);
- if (error < 0)
- LOG_ERROR_STATUS (error);
- }
+ if (have_disk_lock)
+ ocfs2_meta_unlock(inode, 0);
LOG_EXIT_STATUS(stored);
LOG_CLEAR_CONTEXT();
@@ -224,7 +220,6 @@
struct ocfs2_dir_entry **dirent)
{
int status = -ENOENT;
- int tmpstat;
int lock_acq = 0;
LOG_ENTRY_ARGS ("(osb=%p, parent=%llu, name='%*s', blkno=%p, inode=%p)\n",
@@ -232,7 +227,7 @@
if (take_lock) {
/* Get a lock on the directory... */
- status = ocfs_acquire_lock_ro (osb, inode);
+ status = ocfs2_meta_lock(inode, NULL, NULL, 0);
if (status < 0) {
/* Volume should be disabled in this case */
if (status != -EINTR)
@@ -254,13 +249,7 @@
leave:
if (take_lock && lock_acq)
- {
- tmpstat = ocfs_release_lock_ro (osb, inode);
- if (tmpstat < 0) {
- LOG_ERROR_STATUS (tmpstat);
- /* Volume should be disabled in this case */
- }
- }
+ ocfs2_meta_unlock(inode, 0);
if (status < 0) {
*dirent = NULL;
Deleted: trunk/src/dlm.c
===================================================================
--- trunk/src/dlm.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/dlm.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -1,732 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlm.c
- *
- * Distributed lock manager. Requests and processes lock votes.
- *
- * Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include "ocfs_compat.h"
-
-#include <linux/fs.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-#include <linux/random.h>
-
-#include "ocfs_log.h"
-#include "ocfs.h"
-#include "ocfs2.h"
-
-#include "alloc.h"
-#include "dcache.h"
-#include "dlm.h"
-#include "inode.h"
-#include "lockres.h"
-#include "nm.h"
-#include "util.h"
-#include "vote.h"
-
-#include "ocfs_journal.h"
-#include "buffer_head_io.h"
-
-#define WAIT_FOR_VOTE_INCREMENT 200
-
-/* Tracing */
-#define OCFS_DEBUG_CONTEXT OCFS_DEBUG_CONTEXT_DLM
-
-/* inode is definitely non NULL */
-static inline int ocfs_wait_for_readonly_drop(ocfs_super *osb, struct inode *inode)
-{
- int status = 0;
- ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
-
- if (ocfs_node_map_is_empty(&lockres->readonly_map))
- return status;
- status = ocfs_drop_readonly_cache_lock(osb, inode, 0);
- return status;
-}
-
-/*
- * ocfs_update_disk_lock()
- * inode is definitely non NULL
- */
-void ocfs_update_disk_lock (ocfs_super * osb,
- struct buffer_head *bh,
- struct inode *inode)
-{
- ocfs2_dinode *fe = NULL;
- ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
-
- LOG_ENTRY ();
-
- fe = (ocfs2_dinode *) bh->b_data;
-
- /* We do our own I/O here to lock out dirty readers from
- * refreshing the bh when we're in the middle of changing
- * it. We shouldn't ever get here if it's a journalled buffer
- * so io_sem is not necessary. */
- if (buffer_jbd(bh)) {
- printk("Ugh, block %llu has the JBD bit set!\n",
- (unsigned long long)bh->b_blocknr);
- BUG();
- }
-
- if ((DISK_LOCK(fe)->dl_master == lockres->master_node_num)
- && (DISK_LOCK(fe)->dl_level == lockres->lock_type))
- goto skip_write;
-
- lock_buffer(bh);
-
- if (buffer_jbd(bh)) {
- printk("Ugh, block %llu has the JBD bit set!\n",
- (unsigned long long)bh->b_blocknr);
- BUG();
- }
-
- DISK_LOCK(fe)->dl_master = lockres->master_node_num;
- DISK_LOCK(fe)->dl_level = lockres->lock_type;
-
- set_buffer_uptodate(bh);
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
- /*
- * mark_buffer_clean() doesn't exist in 2.6.x kernels.
- * Not many places actually used mark_buffer_clean, but
- * at least reiserfs uses clear_buffer_dirty() as
- * a replacment.
- */
- clear_buffer_dirty(bh);
-#else
- mark_buffer_clean(bh);
-#endif
- bh->b_end_io = ocfs_end_buffer_io_sync;
- submit_bh(WRITE, bh);
- wait_on_buffer(bh);
- SET_BH_SEQNUM(inode, bh);
-
-skip_write:
- LOG_EXIT ();
-} /* ocfs_update_disk_lock */
-
-int ocfs_notify_cluster(ocfs_super *osb,
- struct inode *inode,
- u32 message_flags)
-{
- int status = -EAGAIN;
- ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
- u32 flags;
-
- LOG_ENTRY_ARGS("(inode = %llu, flags = 0x%x)\n",
- OCFS_I(inode)->ip_blkno, message_flags);
-
- while (status == -EAGAIN) {
- ocfs_acquire_lockres_write(inode);
-
- flags = message_flags;
- if (ocfs_inode_is_new(osb, inode))
- flags |= FLAG_FAST_PATH_LOCK;
-
- if (ocfs_task_interruptible ()) {
- ocfs_release_lockres_write (inode);
- LOG_TRACE_ARGS("interrupted... inode = %llu\n",
- OCFS_I(inode)->ip_blkno);
- status = -EINTR;
- goto bail;
- }
-
- status = new_lock_function(osb, lockres->lock_type, flags, NULL, inode);
-
- if (status < 0) {
- if (status != -EAGAIN)
- LOG_ERROR_STATUS (status);
- ocfs_release_lockres_write (inode); // ocfs_file_open ocfs_symlink
- if (status == -EAGAIN || status == -ETIMEDOUT) {
- ocfs_sleep (50);
- status = -EAGAIN;
- continue;
- }
-
- goto bail;
- }
- ocfs_release_lockres_write (inode); // ocfs_file_open
- }
-bail:
- LOG_EXIT_STATUS (status);
- return status;
-}
-
-enum {
- invalid_path = 0,
- fast_path,
- become_master,
- get_x,
- wait_for_release,
- master_request,
- num_paths
-};
-
-static const char *lock_path_strs[] = {
- "invalid_path", "fast_path", "become_master",
- "get_x", "wait_for_release", "master_request"
-};
-
-static inline const char * lock_path_str(int lock_path);
-static inline const char * lock_path_str(int lock_path)
-{
- if (lock_path >= num_paths || lock_path <= invalid_path)
- return lock_path_strs[0];
- return lock_path_strs[lock_path];
-}
-
-/*
- * ocfs_acquire_lock()
- * inode is definitely non NULL
- */
-int ocfs_acquire_lock (ocfs_super * osb, __u32 lock_type,
- __u32 flags, struct buffer_head **bh, struct inode *inode)
-{
- int status;
- __u32 updated;
- ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
- int k = 0;
- int no_owner = 0, owner_dead = 0, wait_on_recovery = 0;
- __u32 extra_lock_flags = 0;
- __u64 lock_id;
-
- LOG_ENTRY_ARGS ("(0x%p, %u, %u, 0x%p)\n",
- osb, lock_type, flags, bh);
-
- OCFS_ASSERT(lock_type != OCFS_LKM_NLMODE);
- OCFS_ASSERT(inode);
- OCFS_ASSERT(bh);
- OCFS_ASSERT(!journal_current_handle());
-
- lock_id = OCFS_I(inode)->ip_blkno;
- LOG_TRACE_ARGS("lock_id = %llu\n", lock_id);
-
- flags |= FLAG_ACQUIRE_LOCK;
-
- *bh = sb_getblk(osb->sb, OCFS_I(inode)->ip_blkno);
- if (*bh == NULL) {
- status = -EIO;
- LOG_ERROR_STATUS(status);
- goto finally;
- }
-
- updated = 0;
-again:
- ocfs_acquire_lockres_write (inode);
-
- LOG_TRACE_ARGS("attempting to get lock, pass: %d\n", ++k);
-
- /* if updated = 1 then we've read a valid bh so skip the
- * update_lockres if we can trust it. */
- if (updated && (lockres->master_node_num != osb->node_num))
- updated = 0;
-
- if (!updated) {
- status = ocfs_update_lockres(osb, *bh, inode, 1);
- if (status < 0) {
- ocfs_release_lockres_write (inode);
- LOG_ERROR_STATUS (status);
- goto finally;
- }
- updated = 1;
- }
-
-reevaluate:
- no_owner = (lockres->master_node_num == OCFS_INVALID_NODE_NUM);
-
- /* master node is an invalid node */
- if (unlikely(lockres->master_node_num >= osb->max_nodes && !no_owner)) {
- LOG_ERROR_ARGS("lockres: master_node=%d, owner=%s, lockid=%llu\n",
- lockres->master_node_num, no_owner?"no":"yes",
- lock_id);
- LOG_ERROR_STATUS (status = -EINVAL);
- ocfs_release_lockres_write (inode); // ocfs_acquire_lock
- goto finally;
- }
-
- wait_on_recovery =
- ocfs_node_is_recovering(osb, lockres->master_node_num);
- owner_dead = !(no_owner ||
- ocfs_node_is_alive(&osb->publ_map,
- lockres->master_node_num));
- if (owner_dead || wait_on_recovery) {
- // if owner is dead or in recovery and the lockres
- // has the readonly flag set, clear it
- clear_bit(LOCK_STATE_READONLY, &lockres->readonly_state);
- }
-
- status = 0;
- extra_lock_flags = 0;
-
- if (flags & FLAG_READONLY) {
- if (test_bit(LOCK_STATE_READONLY, &lockres->readonly_state) ||
- (lockres->master_node_num == osb->node_num &&
- lockres->lock_type == OCFS_LKM_EXMODE)) {
- /* already readonly or local node is master */
- /* THIS node will see it as readonly, but OTHER
- * nodes will have to wait until lock_holders drops
- * to 0 (to finish journal flush on this inode) */
-#ifdef VERBOSE_LOCKING_TRACE
- printk("acquire_lock: lockid %llu, setting readonly\n",
- lock_id);
-#endif
- set_bit(LOCK_STATE_READONLY, &lockres->readonly_state);
- goto skip_lock_write;
- }
-
- if (lockres->master_node_num == OCFS_INVALID_NODE_NUM ||
- owner_dead || wait_on_recovery) {
- /* no master or dead master */
- extra_lock_flags = FLAG_REMASTER;
- } else {
- /* valid master, but either not cachelock or elsewhere */
- if (lockres->lock_type != OCFS_LKM_EXMODE) {
- /* treat just like a normal master change request */
- extra_lock_flags = FLAG_CHANGE_MASTER;
- }
- }
- goto do_lock;
- }
-
-#warning NEED MORE HANDLING HERE NOW FOR DROPPING LOCAL READONLY!!!
- // anything else is NOT a readonly request
- if (lockres->master_node_num != osb->node_num)
- clear_bit(LOCK_STATE_READONLY, &lockres->readonly_state);
-
- status = ocfs_wait_for_readonly_drop(osb, inode);
- if (status < 0) {
- if (status == -EAGAIN) {
- // the rodrop thread is already running and needs the lockres
- ocfs_release_lockres_write(inode);
- ocfs_sleep (OCFS_NM_HEARTBEAT_TIME / 10);
- ocfs_acquire_lockres_write(inode);
- goto reevaluate;
- }
- LOG_ERROR_STATUS(status);
- goto finally;
- }
-
- if (ocfs_inode_is_new(osb, inode)) {
- if (lockres->master_node_num != osb->node_num) {
- printk("inode is new, but lockres is out of date! "
- "owner = %d, type = %d\n",
- lockres->master_node_num, lockres->lock_type);
-
- BUG();
- }
- extra_lock_flags |= FLAG_FAST_PATH_LOCK;
- }
-
- /* some lock requests are simple messages and don't require a
- * master change. */
- if (flags & FLAG_TRUNCATE_PAGES)
- goto do_lock;
-
- if ((lockres->master_node_num != osb->node_num)
- && (wait_on_recovery || no_owner || owner_dead)) {
- extra_lock_flags |= FLAG_REMASTER;
- } else if (lockres->master_node_num != osb->node_num) {
- extra_lock_flags |= FLAG_CHANGE_MASTER;
- }
-
-do_lock:
- LOG_TRACE_ARGS("lockres: master=%d, locktype=%d, flags: %08x\n",
- lockres->master_node_num, lockres->lock_type,
- flags|extra_lock_flags);
-
-#ifdef VERBOSE_LOCKING_TRACE
- printk("acquire_lock: lockid=%llu, this=%d, master=%d, locktype=%d, "
- "flags=%08x, readonly=%s\n", lock_id, osb->node_num,
- lockres->master_node_num, lockres->lock_type, flags|extra_lock_flags,
- test_bit(LOCK_STATE_READONLY, &lockres->readonly_state) ? "yes" : "no");
-#endif
- if (wait_on_recovery
- && !((flags|extra_lock_flags) & FLAG_FILE_RECOVERY)) {
- int waitcnt = 0;
- LOG_TRACE_ARGS("Waiting on node %u to be recovered\n",
- lockres->master_node_num);
- while (1) {
- LOG_TRACE_ARGS("waitcnt = %d\n", waitcnt);
- if (!ocfs_node_is_recovering(osb,
- lockres->master_node_num))
- break;
- ocfs_sleep(500);
- }
- }
-
- if (ocfs_task_interruptible ()) {
- ocfs_release_lockres_write (inode);
- LOG_TRACE_ARGS("interrupted... inode %llu\n",
- OCFS_I(inode)->ip_blkno);
- status = -EINTR;
- goto finally;
- }
- status = new_lock_function(osb, lock_type, flags|extra_lock_flags, *bh, inode);
-
- if (status < 0) {
- ocfs_release_lockres_write (inode); // ocfs_acquire_lock
- if (status == -EAGAIN || status == -ETIMEDOUT) {
- if (status == -ETIMEDOUT)
- LOG_ERROR_ARGS("Timed out acquiring lock for inode "
- "%llu, retrying...\n", OCFS_I(inode)->ip_blkno);
- ocfs_sleep (50);
- goto again;
- }
- goto finally;
- }
-
- /* We got the lock */
- status = 0;
-
-skip_lock_write:
- OCFS_ASSERT(status == 0);
- lockres->lock_holders++;
- if ((extra_lock_flags & FLAG_FAST_PATH_LOCK)
- && ((flags & FLAG_FILE_EXTEND) || (flags & FLAG_FILE_TRUNCATE)))
- lockres->uncommitted_holders++;
- LOG_TRACE_ARGS("lockres->lock_holders = %u\n", lockres->lock_holders);
- LOG_TRACE_ARGS("lockres->uncommitted_holders = %u\n",
- lockres->uncommitted_holders);
- ocfs_release_lockres_write (inode); // ocfs_acquire_lock
-
- if (atomic_read(&OCFS_I(inode)->ip_needs_verification)) {
- ocfs2_dinode *fe = (ocfs2_dinode *) (*bh)->b_data;
- status = ocfs_refresh_inode(inode, fe);
- if (status < 0)
- LOG_ERROR_STATUS(status);
- status = 0;
- }
-finally:
-
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_acquire_lock */
-
-
-/*
- * ocfs_release_lock_full()
- * inode is definitely non NULL
- */
-int ocfs_release_lock_full (ocfs_super * osb, __u32 lock_type, __u32 flags, struct inode *inode, __u32 num_ident)
-{
- int status = 0;
- int vote_status = 0;
- ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
- __u64 lock_id;
- __u32 num_to_send;
- ocfs_node_map votemap;
-
- LOG_ENTRY_ARGS ("(0x%p, %u, %u, 0x%p)\n",
- osb, lock_type, flags, lockres);
-
- OCFS_ASSERT(inode);
- OCFS_ASSERT(num_ident);
-
- lock_id = OCFS_I(inode)->ip_blkno;
- LOG_TRACE_ARGS("lock_id = %llu", lock_id);
-
- flags |= FLAG_RELEASE_LOCK;
-
- ocfs_acquire_lockres_write(inode);
-
- if ((lockres->lock_type == OCFS_LKM_EXMODE) &&
- (lockres->master_node_num == osb->node_num) &&
- !(flags & (FLAG_FILE_EXTEND|FLAG_FILE_TRUNCATE))) {
- status = 0;
- goto finally;
- }
-
- if (flags & FLAG_READONLY) {
- if (lockres->lock_type != OCFS_LKM_EXMODE ||
- lockres->master_node_num == OCFS_INVALID_NODE_NUM ||
- !(test_bit(LOCK_STATE_READONLY, &lockres->readonly_state))) {
- LOG_ERROR_ARGS("READONLY release has issues! type=%d, master=%d, readonly=%s\n",
- lockres->lock_type, lockres->master_node_num,
- test_bit(LOCK_STATE_READONLY, &lockres->readonly_state) ? "yes" : "no");
- BUG();
- }
- status = 0;
- goto finally;
- }
-
- OCFS_ASSERT(lockres->uncommitted_holders <= lockres->lock_holders);
-
- num_to_send = num_ident;
- /* we don't want to send over a count for any size change
- * which includes acquires which we also didn't broadcast. */
- if ((flags & FLAG_FILE_EXTEND) || (flags & FLAG_FILE_TRUNCATE)) {
- if (num_ident <= lockres->uncommitted_holders) {
- /* it breaks the rules to send zero or
- * negative lock releases! */
- num_to_send = 0;
- lockres->uncommitted_holders -= num_ident;
- status = 0;
- goto finally;
- }
- num_to_send -= lockres->uncommitted_holders;
- lockres->uncommitted_holders = 0;
- }
-
- OCFS_ASSERT(num_to_send);
-
- ocfs_node_map_dup(osb, &votemap, &osb->publ_map);
- ocfs_node_map_clear_bit(&votemap, osb->node_num);
- if (ocfs_node_map_is_empty(&votemap))
- goto finally;
-
- if (!(flags & FLAG_FILE_UPDATE_OIN))
- goto finally;
-
- status = -EAGAIN;
- while (status == -EAGAIN) {
- // remove dead nodes
- ocfs_node_map_and(&votemap, &osb->publ_map);
- if (ocfs_node_map_is_empty(&votemap)) {
- // last node in map died, so this node gets the lock
- status = 0;
- break;
- }
- status = ocfs_send_dlm_request_msg (osb, lock_id, lock_type, flags,
- &votemap, inode, num_to_send, &vote_status);
- if (status >= 0 || status == -EAGAIN) {
- if (status != -EAGAIN)
- status = vote_status;
-
- if (status >= 0) {
- break;
- } else if (status == -EAGAIN) {
- LOG_TRACE_ARGS ("EAGAIN on net vote, id=%llu\n", lock_id);
- continue;
- } else {
- LOG_ERROR_STATUS (status);
- break;
- }
- } else if (status == -ETIMEDOUT) {
- LOG_TRACE_ARGS ("ETIMEDOUT on net vote, id=%llu\n", lock_id);
- status = -EAGAIN;
-
- LOG_ERROR_ARGS("Timed out releasing lock for inode %llu, retrying...\n", OCFS_I(inode)->ip_blkno);
- ocfs_release_lockres_write(inode);
- ocfs_sleep(200);
- ocfs_acquire_lockres_write(inode);
- continue;
- } else
- LOG_ERROR_STATUS (status);
- }
-
-finally:
- if (lockres->lock_holders - num_ident < 0) {
- printk("About to decrement lock_holders one too many! lockid "
- "= %llu\n", lock_id);
- BUG();
- }
-#warning "is this wise, or shouldn't we be retrying the lock release later?"
- lockres->lock_holders -= num_ident;
- LOG_TRACE_ARGS("lockres->lock_holders = %u\n", lockres->lock_holders);
-
- ocfs_release_lockres_write (inode);
- LOG_EXIT_STATUS (status);
- return (status);
-} /* ocfs_release_lock_full */
-
-/* inode is definitely non NULL */
-int new_lock_function(ocfs_super * osb, __u32 requested_lock, __u32 flags, struct buffer_head *bh, struct inode *inode)
-{
- ocfs_node_map vote_map;
- ocfs2_dinode *fe = NULL;
- __u64 lock_id;
- __u32 lock_type = requested_lock;
- int need_to_zap_buffers = 0, need_lock_write = 1;
- int is_readonly = (flags & FLAG_READONLY) ? 1 : 0;
- int status = 0, vote_status = 0;
- ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
-
- LOG_ENTRY ();
-
- lock_id = OCFS_I(inode)->ip_blkno;
-
- if (flags & FLAG_READONLY) {
- if (flags & (FLAG_CHANGE_MASTER | FLAG_REMASTER)) {
- /* not currently readonly. treat like normal change master. */
- flags &= ~FLAG_READONLY;
- }
- } else if (flags & FLAG_CHANGE_MASTER) {
- /* non-readonly with CHANGE_MASTER should have no readonly flag */
- if (test_bit(LOCK_STATE_READONLY, &lockres->readonly_state)) {
- LOG_ERROR_ARGS("change_master but currently readonly\n");
- clear_bit(LOCK_STATE_READONLY, &lockres->readonly_state);
- }
- }
- if (flags & (FLAG_CHANGE_MASTER | FLAG_REMASTER)) {
- /* on a master change... */
- need_to_zap_buffers = 1; /* need to dump local buffers */
- need_lock_write = 1; /* and rewrite the lock */
- } else if (flags & (FLAG_DROP_READONLY | FLAG_TRUNCATE_PAGES)) {
- need_lock_write = 0;
- need_to_zap_buffers = 0;
- } else if (flags & FLAG_READONLY) {
- need_lock_write = 0;
- need_to_zap_buffers = 1;
- } else if (!bh) {
- need_lock_write = 0;
- need_to_zap_buffers = 0;
- } else {
- fe = (ocfs2_dinode *) bh->b_data;
- /* may not need to rewrite the lock later if
- * we already have a cachelock */
- if ((DISK_LOCK(fe)->dl_master == osb->node_num)
- && (DISK_LOCK(fe)->dl_level != requested_lock))
- need_lock_write = 1;
- else
- need_lock_write = 0;
- need_to_zap_buffers = 0;
- }
-
- /* that's why it's called fast path */
- if (flags & FLAG_FAST_PATH_LOCK)
- goto vote_success;
-
-
-#define BROADCAST_FLAGS (FLAG_FILE_DELETE | FLAG_FILE_RENAME | FLAG_RELEASE_DENTRY | FLAG_FILE_EXTEND | FLAG_FILE_TRUNCATE | FLAG_FILE_UPDATE_OIN | FLAG_TRUNCATE_PAGES | FLAG_DROP_READONLY | FLAG_REMASTER)
-
- /* figure out who to vote with */
- if (flags & BROADCAST_FLAGS) {
- ocfs_node_map_dup(osb, &vote_map, &osb->publ_map); /* broadcast */
- /* only nodes that see this is readonly */
- if (flags & FLAG_DROP_READONLY)
- ocfs_node_map_and(&vote_map, &lockres->readonly_map);
- } else {
- ocfs_node_map_init(osb, &vote_map);
- ocfs_node_map_set_bit(&vote_map, lockres->master_node_num); /* just owner */
- lock_type = lockres->lock_type;
- }
- ocfs_node_map_clear_bit(&vote_map, osb->node_num);
-
- // remove dead nodes
- ocfs_node_map_and(&vote_map, &osb->publ_map);
-
- if (ocfs_node_map_is_empty(&vote_map)) {
- /* As this is the only node alive, make it master of the lock */
- goto vote_success;
- }
-
- status = ocfs_send_dlm_request_msg (osb, lock_id, lock_type,
- flags, &vote_map,
- inode, 1,
- &vote_status);
- if (status >= 0) {
- status = vote_status;
- }
-
- if (status < 0) {
- if (status != -EAGAIN &&
- status != -ETIMEDOUT &&
- status != -EINTR &&
- status != -EBUSY)
- LOG_ERROR_STATUS(status);
- goto bail;
- }
-
-vote_success:
- if (need_to_zap_buffers)
- ocfs_inc_inode_seq(osb, inode);
-
- /* just alerting owner on open */
- if (flags & FLAG_TRUNCATE_PAGES)
- goto bail;
-
- /* converted EX to readonly EX */
- if (flags & FLAG_READONLY)
- goto bail;
-
- /* drop readonly should remove anyone who has responded */
- if (flags & FLAG_DROP_READONLY) {
- ocfs_node_map_clear_bits(&lockres->readonly_map, &vote_map);
- goto bail;
- }
-
- /* update the disk lock */
- if (need_lock_write) {
- lockres->lock_type = requested_lock;
- lockres->master_node_num = osb->node_num;
- if (!bh) {
- printk("We're trying to write a lock but I wasn't "
- "passed a buffer: inode %llu, flags %u\n",
- OCFS_I(inode)->ip_blkno, flags);
- BUG();
- }
-
- /* want to refresh the lock from the latest on disk
- * state before writing it back out. */
- status = ocfs_read_block(osb, lock_id, &bh, 0, inode);
- if (!status)
- ocfs_update_disk_lock(osb, bh, inode);
-
- if (status < 0)
- LOG_ERROR_STATUS(status);
-
- atomic_set(&OCFS_I(inode)->ip_needs_verification, 1);
- }
-
-bail:
- /* if we removed FLAG_READONLY above, or converted an
- * EX to readonly, set the readonly state now */
- if (status >= 0 && (is_readonly || flags & FLAG_READONLY)) {
- set_bit(LOCK_STATE_READONLY, &lockres->readonly_state);
- }
-
- LOG_EXIT_STATUS (status);
- return status;
-}
-
-void ocfs_compute_dlm_stats(int status, int vote_status, ocfs_dlm_stats *stats)
-{
- atomic_inc (&stats->total);
- if (status == -ETIMEDOUT)
- atomic_inc (&stats->etimedout);
- else {
- switch (vote_status) {
- case -EAGAIN:
- case FLAG_VOTE_UPDATE_RETRY:
- atomic_inc (&stats->eagain);
- break;
- case -ENOENT:
- case FLAG_VOTE_FILE_DEL:
- atomic_inc (&stats->enoent);
- break;
- case -EBUSY:
- case -ENETUNREACH:
- case FLAG_VOTE_OIN_ALREADY_INUSE:
- atomic_inc (&stats->efail);
- break;
- case 0:
- case FLAG_VOTE_NODE:
- case FLAG_VOTE_OIN_UPDATED:
- atomic_inc (&stats->okay);
- break;
- default:
- atomic_inc (&stats->def);
- break;
- }
- }
-}
Deleted: trunk/src/dlm.h
===================================================================
--- trunk/src/dlm.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/dlm.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -1,78 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlm.h
- *
- * Function prototypes
- *
- * Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef OCFS2_DLM_H
-#define OCFS2_DLM_H
-
-int new_lock_function(ocfs_super *osb, __u32 requested_lock,
- __u32 flags, struct buffer_head *bh,
- struct inode *inode);
-int ocfs_acquire_lock(ocfs_super *osb, __u32 lock_type,
- __u32 flags, struct buffer_head **bh,
- struct inode *inode);
-void ocfs_compute_dlm_stats(int status, int vote_status,
- ocfs_dlm_stats *stats);
-#define ocfs_release_lock(osb, lock_type, flags, inode) \
- ocfs_release_lock_full(osb, lock_type, flags, inode, 1)
-int ocfs_release_lock_full(ocfs_super *osb, __u32 lock_type,
- __u32 flags, struct inode *inode, __u32 num_ident);
-
-#define ocfs_acquire_lock_ro(osb, inode) \
-({ \
- int status; \
- struct buffer_head *junkbh = NULL;\
- status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, FLAG_READONLY, &junkbh, inode);\
- if (junkbh) \
- brelse(junkbh);\
- (status); \
- })
-
-#define ocfs_release_lock_ro(osb, inode) ocfs_release_lock(osb, OCFS_LKM_EXMODE, FLAG_READONLY, inode)
-
-
-void ocfs_get_publish_vote_map(ocfs_super *osb, ocfs_publish *publish,
- ocfs_node_map *vote_map);
-int ocfs_notify_cluster(ocfs_super *osb,
- struct inode *inode,
- u32 message_flags);
-static inline int ocfs_notify_on_rename(ocfs_super *osb, struct inode *inode)
-{
- /* whatcha tryin' to do to us! */
- OCFS_ASSERT(!S_ISDIR(inode->i_mode));
-
- return(ocfs_notify_cluster(osb,
- inode,
- FLAG_RELEASE_DENTRY|FLAG_FILE_RENAME));
-}
-static inline int ocfs_notify_on_open(ocfs_super *osb, struct inode *inode)
-{
- return(ocfs_notify_cluster(osb,
- inode,
- FLAG_TRUNCATE_PAGES));
-}
-void ocfs_update_disk_lock (ocfs_super * osb,
- struct buffer_head *bh,
- struct inode *inode);
-#endif /* OCFS2_DLM_H */
Added: trunk/src/dlmglue.c
===================================================================
--- trunk/src/dlmglue.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/dlmglue.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,1818 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmglue.c
+ *
+ * Code which implements an OCFS2 specific interface to our DLM.
+ *
+ * Copyright (C) 2003, 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ocfs_compat.h"
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/smp_lock.h>
+
+#include <cluster/util.h>
+#include <cluster/dlmcommon.h>
+#include <cluster/heartbeat.h>
+#include <cluster/nodemanager.h>
+#include <cluster/tcp.h>
+#include <cluster/dlmmod.h>
+
+#include "ocfs_log.h"
+#include "ocfs.h"
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "slot_map.h"
+#include "util.h"
+#include "vote.h"
+
+#include "ocfs_journal.h"
+#include "buffer_head_io.h"
+
+#define OCFS_DEBUG_CONTEXT OCFS_DEBUG_CONTEXT_DLMGLUE
+
+/* lock ids are made up in the following manner:
+ * name[0] --> type
+ * name[1-6] --> 6 pad characters, reserved for now
+ * name[7-22] --> block number, expressed in hex as 16 chars
+ * name[23-30] --> i_generation, expressed in hex 8 chars
+ * name[31] --> '\0' */
+#define OCFS2_LOCK_ID_MAX_LEN 32
+#define OCFS2_LOCK_ID_PAD "000000"
+
+/* One-character lock-name prefix per lock type (see the lock id
+ * format comment above). NOTE(review): uses the old GCC
+ * "[index] value" designated-initializer syntax (no '='). */
+static char ocfs2_lock_type_char[OCFS_NUM_LOCK_TYPES] = {
+ [OCFS_TYPE_META] 'M',
+ [OCFS_TYPE_DATA] 'D',
+ [OCFS_TYPE_SUPER] 'S'
+};
+
+static int ocfs2_build_lock_name(enum ocfs2_lock_type type,
+ u64 blkno,
+ u32 generation,
+ char **ret);
+
+static void ocfs2_inode_ast_func(void *opaque);
+static void ocfs2_inode_bast_func(void *opaque, int level);
+static void ocfs2_super_ast_func(void *opaque);
+static void ocfs2_super_bast_func(void *opaque, int level);
+/* so far, all locks have gotten along with the same unlock ast */
+static void ocfs2_unlock_ast_func(void *opaque,
+ dlm_status status);
+static int ocfs2_do_unblock_meta(struct inode *inode,
+ int *requeue);
+static int ocfs2_unblock_meta(ocfs2_lock_res *lockres,
+ int *requeue);
+static int ocfs2_unblock_data(ocfs2_lock_res *lockres,
+ int *requeue);
+static int ocfs2_unblock_super(ocfs2_lock_res *lockres,
+ int *requeue);
+typedef void (ocfs2_convert_worker_t)(ocfs2_lock_res *, int);
+static int ocfs2_generic_unblock_lock(ocfs_super *osb,
+ ocfs2_lock_res *lockres,
+ int *requeue,
+ ocfs2_convert_worker_t *worker);
+
+/* Per-lock-type callback table: the ast/bast/unlock_ast handlers are
+ * handed to dlmlock()/dlmunlock(); unblock is invoked by the vote
+ * thread to downconvert a lock another node is blocked on. */
+struct ocfs2_lock_res_ops {
+ void (*ast)(void *);
+ void (*bast)(void *, int);
+ void (*unlock_ast)(void *, dlm_status);
+ int (*unblock)(ocfs2_lock_res *, int *);
+};
+
+struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
+ .ast = ocfs2_inode_ast_func,
+ .bast = ocfs2_inode_bast_func,
+ .unlock_ast = ocfs2_unlock_ast_func,
+ .unblock = ocfs2_unblock_meta,
+};
+
+static void ocfs2_data_convert_worker(ocfs2_lock_res *lockres,
+ int blocking);
+
+struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
+ .ast = ocfs2_inode_ast_func,
+ .bast = ocfs2_inode_bast_func,
+ .unlock_ast = ocfs2_unlock_ast_func,
+ .unblock = ocfs2_unblock_data,
+};
+
+struct ocfs2_lock_res_ops ocfs2_super_lops = {
+ .ast = ocfs2_super_ast_func,
+ .bast = ocfs2_super_bast_func,
+ .unlock_ast = ocfs2_unlock_ast_func,
+ .unblock = ocfs2_unblock_super,
+};
+
+/* An "inode lock" is either of the two per-inode resources: meta or
+ * data. */
+static inline int ocfs2_is_inode_lock(ocfs2_lock_res *lockres)
+{
+ switch (lockres->l_type) {
+ case OCFS_TYPE_META:
+ case OCFS_TYPE_DATA:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+/* True only for the superblock lock resource. */
+static inline int ocfs2_is_super_lock(ocfs2_lock_res *lockres)
+{
+ return (lockres->l_type == OCFS_TYPE_SUPER) ? 1 : 0;
+}
+
+/* The superblock lock stashes its owning ocfs_super in l_priv;
+ * recover it, asserting we really hold a super lock. */
+static inline ocfs_super * ocfs2_lock_res_super(ocfs2_lock_res *lockres)
+{
+ ocfs_super *osb;
+
+ OCFS_ASSERT(ocfs2_is_super_lock(lockres));
+ osb = (ocfs_super *) lockres->l_priv;
+ return osb;
+}
+
+/* Inode locks (meta/data) stash their owning inode in l_priv;
+ * recover it, asserting we really hold an inode lock. */
+static inline struct inode * ocfs2_lock_res_inode(ocfs2_lock_res *lockres)
+{
+ struct inode *inode;
+
+ OCFS_ASSERT(ocfs2_is_inode_lock(lockres));
+ inode = (struct inode *) lockres->l_priv;
+ return inode;
+}
+
+static void ocfs2_lock_res_init_common(ocfs2_lock_res *res,
+ enum ocfs2_lock_type type,
+ void *priv);
+static int ocfs2_lock_create(ocfs_super *osb,
+ ocfs2_lock_res *lockres,
+ int level,
+ int flags);
+static inline int ocfs2_may_continue_on_blocked_lock(ocfs2_lock_res *lockres,
+ int wanted);
+static int ocfs2_cluster_lock(ocfs_super *osb,
+ ocfs2_lock_res *lockres,
+ int level,
+ int lkm_flags);
+static void ocfs2_cluster_unlock(ocfs_super *osb,
+ ocfs2_lock_res *lockres,
+ int level);
+static inline void ocfs2_generic_handle_downconvert_action(ocfs2_lock_res *lockres);
+static inline void ocfs2_generic_handle_convert_action(ocfs2_lock_res *lockres);
+static inline void ocfs2_generic_handle_attach_action(ocfs2_lock_res *lockres);
+static void ocfs2_generic_handle_bast(ocfs2_lock_res *lockres, int level);
+static inline void ocfs2_handle_meta_convert_action(struct inode *inode,
+ ocfs2_lock_res *lockres);
+static void ocfs2_inc_inode_seq(ocfs_super *osb,
+ struct inode *inode);
+static void ocfs2_schedule_blocked_lock(ocfs_super *osb,
+ ocfs2_lock_res *lockres);
+static void ocfs2_schedule_blocked_inode_lock(struct inode *inode,
+ ocfs2_lock_res *lockres);
+static inline void ocfs2_recover_from_dlm_error(ocfs2_lock_res *lockres,
+ int convert);
+static void ocfs2_vote_on_unlock(ocfs_super *osb,
+ ocfs2_lock_res *lockres);
+/* Called after we refresh our inode, only has any effect if we have
+ * an EX lock. This populates the LVB with the initial values for our
+ * change set. */
+static void ocfs2_reset_meta_lvb_values(struct inode *inode);
+static void __ocfs2_stuff_meta_lvb(struct inode *inode);
+static void ocfs2_refresh_inode_from_lvb(struct inode *inode);
+static void __ocfs2_lvb_on_downconvert(ocfs2_lock_res *lockres,
+ int new_level);
+static int ocfs2_meta_lock_update(struct inode *inode,
+ struct buffer_head **bh);
+static int __ocfs2_drop_lock(ocfs_super *osb,
+ ocfs2_lock_res *lockres);
+static void ocfs2_drop_super_lock(ocfs_super *osb);
+static inline int ocfs2_highest_compat_lock_level(int level);
+static int __ocfs2_downconvert_lock(ocfs_super *osb,
+ ocfs2_lock_res *lockres,
+ int new_level,
+ int lvb);
+static int __ocfs2_cancel_convert(ocfs_super *osb,
+ ocfs2_lock_res *lockres);
+static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
+ ocfs2_lock_res *lockres,
+ int new_level);
+
+/* Returns 1 if the LVB sequence number matches the one we recorded
+ * locally -- i.e. nothing has invalidated the LVB since we last used
+ * it. A zero lvb_seq means the LVB was never initialized and is
+ * never trusted. */
+static inline int ocfs2_lvb_is_trustable(ocfs2_lock_res *lockres)
+{
+ ocfs2_lvb *lvb = (ocfs2_lvb *) lockres->l_lksb.lvb;
+ int ret = 0;
+
+ spin_lock(&lockres->l_lock);
+ if (lvb->lvb_seq &&
+     lockres->l_local_seq == lvb->lvb_seq)
+ ret = 1;
+ spin_unlock(&lockres->l_lock);
+
+ return ret;
+}
+
+/* Record the LVB's current sequence number as our local one, so a
+ * later ocfs2_lvb_is_trustable() succeeds. Skipped while the LVB is
+ * still uninitialized (lvb_seq == 0). */
+static inline void ocfs2_set_local_seq_from_lvb(ocfs2_lock_res *lockres)
+{
+ ocfs2_lvb *lvb = (ocfs2_lvb *) lockres->l_lksb.lvb;
+
+ spin_lock(&lockres->l_lock);
+ if (lvb->lvb_seq)
+ lockres->l_local_seq = lvb->lvb_seq;
+ spin_unlock(&lockres->l_lock);
+}
+
+/* fill in new values as we add them to the lvb. */
+/* Copy the values we trust out of the *metadata* LVB -- currently
+ * only the truncate cluster count. Requires a granted level above
+ * LKM_NLMODE so the LVB contents are valid.
+ *
+ * Fix: the assert checked for OCFS_TYPE_DATA, but this helper reads
+ * the meta LVB and its only caller (ocfs2_meta_lock_update) passes
+ * the meta lockres -- the old assert fired on every call. */
+static inline void ocfs2_meta_lvb_get_values(ocfs2_lock_res *lockres,
+ unsigned int *trunc_clusters)
+{
+ ocfs2_meta_lvb *lvb;
+ OCFS_ASSERT(lockres->l_type == OCFS_TYPE_META);
+
+ spin_lock(&lockres->l_lock);
+ OCFS_ASSERT(lockres->l_level > LKM_NLMODE);
+
+ lvb = (ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+ if (trunc_clusters)
+ *trunc_clusters = lvb->lvb_trunc_clusters;
+
+ spin_unlock(&lockres->l_lock);
+}
+
+/* Build a dlm lock name for (type, blkno, generation) per the format
+ * documented at the top of this file. On success returns the name
+ * length and hands back a kmalloc()'d, NUL-terminated string through
+ * *ret (caller frees); returns -ENOMEM on allocation failure.
+ *
+ * Fix: the length assert used '=' (assignment -- always true, and it
+ * clobbered len) where '==' was intended. The fixed-width format
+ * always produces exactly OCFS2_LOCK_ID_MAX_LEN - 1 characters, so
+ * the returned value is unchanged. */
+static int ocfs2_build_lock_name(enum ocfs2_lock_type type,
+ u64 blkno,
+ u32 generation,
+ char **ret)
+{
+ int len;
+ char *name = NULL;
+
+ LOG_ENTRY();
+
+ OCFS_ASSERT(type < OCFS_NUM_LOCK_TYPES);
+
+ name = kmalloc(OCFS2_LOCK_ID_MAX_LEN, GFP_KERNEL);
+ if (!name) {
+ len = -ENOMEM;
+ goto bail;
+ }
+ memset(name, 0, OCFS2_LOCK_ID_MAX_LEN);
+
+ len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN - 1, "%c%s%016llx%08x",
+        ocfs2_lock_type_char[type], OCFS2_LOCK_ID_PAD, blkno,
+        generation);
+
+ OCFS_ASSERT(len == (OCFS2_LOCK_ID_MAX_LEN - 1));
+
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+ printk("built lock resource with name: %s\n", name);
+#endif
+ *ret = name;
+bail:
+ LOG_EXIT();
+ return (len);
+}
+
+/* Zero and initialize the fields shared by every lock resource.
+ * l_level starts at LKM_IVMODE (no lock attached yet); priv is the
+ * object (inode or ocfs_super) the lock protects. */
+static void ocfs2_lock_res_init_common(ocfs2_lock_res *res,
+ enum ocfs2_lock_type type,
+ void *priv)
+{
+ memset(res, 0, sizeof(ocfs2_lock_res));
+ spin_lock_init(&res->l_lock);
+ init_waitqueue_head(&res->l_event);
+ res->l_type = type;
+ res->l_level = LKM_IVMODE;
+ INIT_LIST_HEAD(&res->l_blocked_list);
+ res->l_priv = priv;
+}
+
+/* Initialize a per-inode (meta or data) lock resource, picking the
+ * matching callback table and building the dlm name from the inode's
+ * block number and generation. Returns 0 or -ENOMEM. Pair with
+ * ocfs2_lock_res_free(). */
+int ocfs2_inode_lock_res_init(ocfs2_lock_res *res,
+ enum ocfs2_lock_type type,
+ struct inode *inode)
+{
+ int status;
+
+ LOG_ENTRY();
+
+ OCFS_ASSERT(type == OCFS_TYPE_META ||
+     type == OCFS_TYPE_DATA);
+
+ ocfs2_lock_res_init_common(res, type, inode);
+
+ if (type == OCFS_TYPE_META)
+ res->l_ops = &ocfs2_inode_meta_lops;
+ else
+ res->l_ops = &ocfs2_inode_data_lops;
+
+ /* note: a positive return (the name length) is not an error. */
+ status = ocfs2_build_lock_name(type,
+        OCFS_I(inode)->ip_blkno,
+        inode->i_generation,
+        &res->l_name);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+bail:
+ LOG_EXIT_STATUS(status);
+ return status;
+}
+
+/* Initialize the per-mount superblock lock resource. The dlm name is
+ * keyed on the fixed superblock block number with generation 0, so
+ * all nodes contend on the same resource. */
+int ocfs2_super_lock_res_init(ocfs2_lock_res *res,
+ ocfs_super *osb)
+{
+ enum ocfs2_lock_type type = OCFS_TYPE_SUPER;
+ int status;
+
+ LOG_ENTRY();
+
+ ocfs2_lock_res_init_common(res, type, osb);
+
+ res->l_ops = &ocfs2_super_lops;
+
+ status = ocfs2_build_lock_name(type,
+        OCFS2_SUPER_BLOCK_BLKNO,
+        0,
+        &res->l_name);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+bail:
+ LOG_EXIT_STATUS(status);
+ return status;
+}
+
+/* Release the storage behind a lock resource -- currently just the
+ * kmalloc()'d name. kfree(NULL) is a no-op, so no guard is needed. */
+void ocfs2_lock_res_free(ocfs2_lock_res *res)
+{
+ kfree(res->l_name);
+}
+
+/* Bump the local holder count for the given grant level. Caller
+ * holds l_lock. Only EX and PR are ever held by users. */
+static inline void ocfs2_inc_holders(ocfs2_lock_res *lockres,
+ int level)
+{
+ OCFS_ASSERT(lockres);
+
+ if (level == LKM_EXMODE)
+ lockres->l_ex_holders++;
+ else if (level == LKM_PRMODE)
+ lockres->l_ro_holders++;
+ else
+ BUG();
+}
+
+/* Drop the local holder count for the given grant level. Caller
+ * holds l_lock; dropping below zero is a bug. */
+static inline void ocfs2_dec_holders(ocfs2_lock_res *lockres,
+ int level)
+{
+ OCFS_ASSERT(lockres);
+
+ if (level == LKM_EXMODE) {
+ OCFS_ASSERT(lockres->l_ex_holders);
+ lockres->l_ex_holders--;
+ } else if (level == LKM_PRMODE) {
+ OCFS_ASSERT(lockres->l_ro_holders);
+ lockres->l_ro_holders--;
+ } else
+ BUG();
+}
+
+/* AST arrived for a downconvert we requested: record the new (lower)
+ * level, clear the blocked/busy state and wake anyone waiting in
+ * ocfs2_cluster_lock(). Caller holds l_lock. */
+static inline void ocfs2_generic_handle_downconvert_action(ocfs2_lock_res *lockres)
+{
+ OCFS_ASSERT(lockres->l_flags & OCFS2_LOCK_BUSY);
+ OCFS_ASSERT(lockres->l_flags & OCFS2_LOCK_ATTACHED);
+ OCFS_ASSERT(lockres->l_flags & OCFS2_LOCK_BLOCKED);
+
+ lockres->l_level = lockres->l_requested;
+ lockres->l_blocking = LKM_NLMODE;
+ lockres->l_flags &= ~OCFS2_LOCK_BLOCKED;
+ lockres->l_flags &= ~OCFS2_LOCK_BUSY;
+ wake_up(&lockres->l_event);
+}
+
+/* Advance the global clean-buffer sequence and stamp this inode with
+ * the new value, invalidating cached-buffer assumptions after the
+ * meta lock was regrown from NL. */
+static void ocfs2_inc_inode_seq(ocfs_super *osb,
+ struct inode *inode)
+{
+ atomic_t *seq = GET_INODE_CLEAN_SEQ(inode);
+
+ LOG_TRACE_ARGS("incrementing inode seq... current is %d\n",
+        atomic_read(seq));
+
+ /* wrap to ONE after 13 bits, will need a spinlock */
+ /* NOTE(review): the "13 bits" above presumably matches
+  * STATE_BIT_MAX, which is defined elsewhere -- confirm. */
+ spin_lock (&osb->clean_buffer_lock);
+ if ((atomic_read(&osb->clean_buffer_seq)+1) % STATE_BIT_MAX == 0)
+ atomic_set(&osb->clean_buffer_seq, 1);
+ else
+ atomic_inc(&osb->clean_buffer_seq);
+ spin_unlock (&osb->clean_buffer_lock);
+
+ /* doesn't matter if this another process */
+ /* has already incremented the global seq */
+ atomic_set(seq, atomic_read(&osb->clean_buffer_seq));
+
+ LOG_TRACE_ARGS("done incrementing inode seq... new is %d\n",
+        atomic_read(seq));
+}
+
+/* AST arrived for an upconvert we requested: record the new level
+ * and clear BUSY. Caller holds l_lock. */
+static inline void ocfs2_generic_handle_convert_action(ocfs2_lock_res *lockres)
+{
+ OCFS_ASSERT(lockres->l_flags & OCFS2_LOCK_BUSY);
+ OCFS_ASSERT(lockres->l_flags & OCFS2_LOCK_ATTACHED);
+
+ /* Convert from RO to EX doesn't really need anything as our
+  * information is already up to date. Convert from NL to
+  * *anything* however should mark ourselves as needing an
+  * update */
+ if (lockres->l_level == LKM_NLMODE)
+ lockres->l_flags |= OCFS2_LOCK_NEEDS_REFRESH;
+
+ lockres->l_level = lockres->l_requested;
+ lockres->l_flags &= ~OCFS2_LOCK_BUSY;
+}
+
+/* Meta-lock variant of the convert action: additionally bump the
+ * inode sequence when we are growing out of NL, since cached state
+ * may be stale. Caller holds l_lock. */
+static inline void ocfs2_handle_meta_convert_action(struct inode *inode,
+ ocfs2_lock_res *lockres)
+{
+ ocfs_super *osb = OCFS2_SB(inode->i_sb);
+
+ /* generic_handle_convert_action will set the refresh flag for us. */
+ if (lockres->l_level == LKM_NLMODE)
+ ocfs2_inc_inode_seq(osb, inode);
+ ocfs2_generic_handle_convert_action(lockres);
+}
+
+/* AST arrived for the initial lock attach: mark the resource
+ * attached at the granted level. A non-NL initial grant needs a
+ * refresh unless the lock was created LOCAL (brand-new inode no
+ * other node can see). Caller holds l_lock. */
+static inline void ocfs2_generic_handle_attach_action(ocfs2_lock_res *lockres)
+{
+ OCFS_ASSERT(lockres->l_flags & OCFS2_LOCK_BUSY);
+ OCFS_ASSERT(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
+
+ if (lockres->l_requested > LKM_NLMODE &&
+     !(lockres->l_flags & OCFS2_LOCK_LOCAL))
+ lockres->l_flags |= OCFS2_LOCK_NEEDS_REFRESH;
+
+ lockres->l_level = lockres->l_requested;
+ lockres->l_flags |= OCFS2_LOCK_ATTACHED;
+ lockres->l_flags &= ~OCFS2_LOCK_BUSY;
+}
+
+/* AST handler for both per-inode locks (meta and data). Dispatches
+ * on the action we recorded before calling dlmlock(); resets
+ * l_action afterwards so a spurious second AST is caught by BUG().
+ * NOTE(review): on a non-DLM_NORMAL lksb status this returns without
+ * clearing BUSY, leaving waiters stuck -- presumably deliberate
+ * "fail loudly" behavior at this stage; confirm. */
+static void ocfs2_inode_ast_func(void *opaque)
+{
+ ocfs2_lock_res *lockres = opaque;
+ struct inode *inode = ocfs2_lock_res_inode(lockres);
+ ocfs_super *osb = OCFS2_SB(inode->i_sb);
+ dlm_lockstatus *lksb;
+
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+ printk("AST fired for inode %llu\n", OCFS_I(inode)->ip_blkno);
+#endif
+ OCFS_ASSERT(ocfs2_is_inode_lock(lockres));
+
+ spin_lock(&lockres->l_lock);
+ lksb = &(lockres->l_lksb);
+ if (lksb->status != DLM_NORMAL) {
+ printk("ocfs2_inode_ast_func: lksb status value of %u on "
+        "inode %llu\n", lksb->status, OCFS_I(inode)->ip_blkno);
+ spin_unlock(&lockres->l_lock);
+ return;
+ }
+
+ switch(lockres->l_action) {
+ case OCFS2_AST_ATTACH:
+ if (lockres->l_type == OCFS_TYPE_META &&
+     lockres->l_requested > LKM_NLMODE &&
+     !(lockres->l_flags & OCFS2_LOCK_LOCAL))
+ ocfs2_inc_inode_seq(osb, inode);
+
+ ocfs2_generic_handle_attach_action(lockres);
+ break;
+ case OCFS2_AST_CONVERT:
+ if (lockres->l_type == OCFS_TYPE_META)
+ ocfs2_handle_meta_convert_action(inode, lockres);
+ else
+ ocfs2_generic_handle_convert_action(lockres);
+ break;
+ case OCFS2_AST_DOWNCONVERT:
+ ocfs2_generic_handle_downconvert_action(lockres);
+ break;
+ default:
+ BUG();
+ }
+
+ /* data locking ignores refresh flag for now. */
+ if (lockres->l_type == OCFS_TYPE_DATA)
+ lockres->l_flags &= ~OCFS2_LOCK_NEEDS_REFRESH;
+
+ /* set it to something invalid so if we get called again we
+  * can catch it. */
+ lockres->l_action = OCFS2_AST_INVALID;
+ spin_unlock(&lockres->l_lock);
+ wake_up(&lockres->l_event);
+}
+
+/* Another node wants this lock: mark us blocked and remember the
+ * highest level requested against us so far. */
+static void ocfs2_generic_handle_bast(ocfs2_lock_res *lockres, int level)
+{
+ spin_lock(&lockres->l_lock);
+ lockres->l_flags |= OCFS2_LOCK_BLOCKED;
+ if (lockres->l_blocking < level)
+ lockres->l_blocking = level;
+ spin_unlock(&lockres->l_lock);
+}
+
+/* BAST handler for per-inode locks: record the blocking request,
+ * queue the lockres for the vote thread and wake it so the
+ * downconvert happens promptly. */
+static void ocfs2_inode_bast_func(void *opaque, int level)
+{
+ ocfs2_lock_res *lockres = opaque;
+ struct inode *inode = ocfs2_lock_res_inode(lockres);
+ ocfs_super *osb = OCFS2_SB(inode->i_sb);
+
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+ printk("BAST fired for inode %llu\n", OCFS_I(inode)->ip_blkno);
+#endif
+ ocfs2_generic_handle_bast(lockres, level);
+
+ ocfs2_schedule_blocked_inode_lock(inode, lockres);
+ ocfs2_kick_vote_thread(osb);
+}
+
+/* AST handler for the superblock lock; same action dispatch as the
+ * inode variant but with no per-inode work.
+ *
+ * Fix: the verbose-trace printk was copied from the inode AST and
+ * referenced 'inode', which does not exist in this function -- a
+ * compile error whenever OCFS2_VERBOSE_LOCKING_TRACE is defined. */
+static void ocfs2_super_ast_func(void *opaque)
+{
+ ocfs2_lock_res *lockres = opaque;
+ dlm_lockstatus *lksb;
+
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+ printk("Superblock AST fired\n");
+#endif
+ OCFS_ASSERT(ocfs2_is_super_lock(lockres));
+
+ spin_lock(&lockres->l_lock);
+ lksb = &(lockres->l_lksb);
+ if (lksb->status != DLM_NORMAL) {
+ printk("ocfs2_super_ast_func: lksb status value of %u!\n",
+        lksb->status);
+ spin_unlock(&lockres->l_lock);
+ return;
+ }
+
+ switch(lockres->l_action) {
+ case OCFS2_AST_ATTACH:
+ ocfs2_generic_handle_attach_action(lockres);
+ break;
+ case OCFS2_AST_CONVERT:
+ ocfs2_generic_handle_convert_action(lockres);
+ break;
+ case OCFS2_AST_DOWNCONVERT:
+ ocfs2_generic_handle_downconvert_action(lockres);
+ break;
+ default:
+ BUG();
+ }
+ /* set it to something invalid so if we get called again we
+  * can catch it. */
+ lockres->l_action = OCFS2_AST_INVALID;
+ spin_unlock(&lockres->l_lock);
+ wake_up(&lockres->l_event);
+}
+
+/* BAST handler for the superblock lock: record the blocking request
+ * and hand the lockres to the vote thread for downconversion. */
+static void ocfs2_super_bast_func(void *opaque, int level)
+{
+ ocfs2_lock_res *lockres = opaque;
+ ocfs_super *osb = ocfs2_lock_res_super(lockres);
+
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+ printk("Superblock BAST fired\n");
+#endif
+ ocfs2_generic_handle_bast(lockres, level);
+
+ ocfs2_schedule_blocked_lock(osb, lockres);
+ ocfs2_kick_vote_thread(osb);
+}
+
+/* A dlmlock/dlmunlock call failed outright: clear BUSY and reset the
+ * pending action so the lockres is usable again. convert != 0 means
+ * the failure was a lock/convert request (reset l_action); otherwise
+ * it was an unlock (reset l_unlock_action). */
+static inline void ocfs2_recover_from_dlm_error(ocfs2_lock_res *lockres,
+ int convert)
+{
+ spin_lock(&lockres->l_lock);
+ lockres->l_flags &= ~OCFS2_LOCK_BUSY;
+ if (convert)
+ lockres->l_action = OCFS2_AST_INVALID;
+ else
+ lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+ spin_unlock(&lockres->l_lock);
+}
+
+/* Attach a brand-new dlm lock at the given level. No-op if the
+ * resource is already attached. The AST completes the attach
+ * asynchronously; on immediate dlmlock failure the busy/action state
+ * is rolled back. Returns 0 or -ENOENT. */
+static int ocfs2_lock_create(ocfs_super *osb,
+ ocfs2_lock_res *lockres,
+ int level,
+ int flags)
+{
+ int ret = 0;
+ dlm_status status;
+
+ LOG_ENTRY();
+
+ spin_lock(&lockres->l_lock);
+ if (lockres->l_flags & OCFS2_LOCK_ATTACHED) {
+ spin_unlock(&lockres->l_lock);
+ goto bail;
+ }
+
+ lockres->l_action = OCFS2_AST_ATTACH;
+ lockres->l_requested = level;
+ lockres->l_flags |= OCFS2_LOCK_BUSY;
+ spin_unlock(&lockres->l_lock);
+
+ status = dlmlock(osb->dlm,
+   level,
+   &lockres->l_lksb,
+   flags,
+   lockres->l_name,
+   lockres->l_ops->ast,
+   lockres,
+   lockres->l_ops->bast);
+ if (status != DLM_NORMAL) {
+ LOG_ERROR_ARGS("Dlm returns %d\n", status);
+ ret = -ENOENT;
+ ocfs2_recover_from_dlm_error(lockres, 1);
+ }
+
+bail:
+ LOG_EXIT_STATUS(ret);
+ return ret;
+}
+
+/* Sample the given l_flags bit(s) under the lockres spinlock; used
+ * as the condition for the wait_event helpers below. */
+static inline int ocfs2_check_wait_flag(ocfs2_lock_res *lockres,
+ int flag)
+{
+ int set;
+
+ spin_lock(&lockres->l_lock);
+ set = (lockres->l_flags & flag);
+ spin_unlock(&lockres->l_lock);
+
+ return set;
+}
+
+/* Sleep until a pending dlmlock call (BUSY) completes. NOTE(review):
+ * a signal also ends the wait; callers recheck state afterwards. */
+static inline void ocfs2_wait_on_busy_lock(ocfs2_lock_res *lockres)
+
+{
+ wait_event_interruptible(lockres->l_event,
+     !ocfs2_check_wait_flag(lockres,
+       OCFS2_LOCK_BUSY));
+}
+
+/* Sleep until the lock is no longer blocked on behalf of another
+ * node. NOTE(review): a signal also ends the wait; callers recheck
+ * state afterwards. */
+static inline void ocfs2_wait_on_blocked_lock(ocfs2_lock_res *lockres)
+
+{
+ wait_event_interruptible(lockres->l_event,
+     !ocfs2_check_wait_flag(lockres,
+       OCFS2_LOCK_BLOCKED));
+}
+
+/* Sleep until another thread finishes refreshing this lockres.
+ * NOTE(review): a signal also ends the wait; the caller rechecks. */
+static inline void ocfs2_wait_on_refreshing_lock(ocfs2_lock_res *lockres)
+{
+ wait_event_interruptible(lockres->l_event,
+     !ocfs2_check_wait_flag(lockres,
+       OCFS2_LOCK_REFRESHING));
+}
+
+/* predict what lock level we'll be dropping down to on behalf
+ * of another node, and return true if the currently wanted
+ * level will be compatible with it. */
+static inline int ocfs2_may_continue_on_blocked_lock(ocfs2_lock_res *lockres,
+ int wanted)
+{
+ /* Caller holds l_lock and has verified BLOCKED is set. */
+ OCFS_ASSERT(lockres->l_flags & OCFS2_LOCK_BLOCKED);
+
+ return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking);
+}
+
+/* Core lock acquisition: loop until the granted level covers the
+ * requested one, creating the lock at NL first if needed and
+ * upconverting via dlmlock otherwise. Waits (interruptibly) whenever
+ * the resource is busy or blocked for another node. On success the
+ * holder count for 'level' is incremented; returns 0, -EINTR,
+ * -EAGAIN (LKM_NOQUEUE) or -ENOENT. */
+static int ocfs2_cluster_lock(ocfs_super *osb,
+ ocfs2_lock_res *lockres,
+ int level,
+ int lkm_flags)
+{
+ int ret;
+ dlm_status status;
+
+ LOG_ENTRY();
+
+again:
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ goto bail;
+ }
+
+ spin_lock(&lockres->l_lock);
+
+ /* We only compare against the currently granted level
+  * here. If the lock is blocked waiting on a downconvert,
+  * we'll get caught below. */
+ if (lockres->l_flags & OCFS2_LOCK_BUSY &&
+     level > lockres->l_level) {
+ /* is someone sitting in dlm_lock? If so, wait on
+  * them. */
+ spin_unlock(&lockres->l_lock);
+ ocfs2_wait_on_busy_lock(lockres);
+ goto again;
+ }
+
+ if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+ /* lock has not been created yet. */
+ spin_unlock(&lockres->l_lock);
+ ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
+ if (ret < 0) {
+ LOG_ERROR_STATUS(ret);
+ goto bail;
+ }
+ goto again;
+ }
+
+ if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
+     !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
+ /* is the lock is currently blocked on behalf of
+  * another node */
+ spin_unlock(&lockres->l_lock);
+ ocfs2_wait_on_blocked_lock(lockres);
+ goto again;
+ }
+
+ if (level > lockres->l_level) {
+ lockres->l_action = OCFS2_AST_CONVERT;
+ lockres->l_requested = level;
+ lockres->l_flags |= OCFS2_LOCK_BUSY;
+ spin_unlock(&lockres->l_lock);
+
+ /* call dlm_lock to upgrade lock now */
+ status = dlmlock(osb->dlm,
+   level,
+   &lockres->l_lksb,
+   lkm_flags|LKM_CONVERT|LKM_VALBLK,
+   lockres->l_name,
+   lockres->l_ops->ast,
+   lockres,
+   lockres->l_ops->bast);
+ if (status != DLM_NORMAL) {
+ if ((lkm_flags & LKM_NOQUEUE) &&
+     (status == DLM_NOTQUEUED))
+ ret = -EAGAIN;
+ else {
+ LOG_ERROR_ARGS("Dlm returns %d\n", status);
+ ret = -ENOENT;
+ }
+ ocfs2_recover_from_dlm_error(lockres, 1);
+ goto bail;
+ }
+
+ ocfs2_wait_on_busy_lock(lockres);
+ goto again;
+ }
+
+ /* Ok, if we get here then we're good to go. */
+ ocfs2_inc_holders(lockres, level);
+
+ spin_unlock(&lockres->l_lock);
+
+ ret = 0;
+bail:
+ LOG_EXIT_STATUS(ret);
+ return ret;
+}
+
+/* Drop one holder reference at the given level and, if another node
+ * is now unblocked by that, kick the vote thread. Note the lock is
+ * not released to the dlm here -- only the local refcount drops. */
+static void ocfs2_cluster_unlock(ocfs_super *osb,
+ ocfs2_lock_res *lockres,
+ int level)
+{
+ spin_lock(&lockres->l_lock);
+ ocfs2_dec_holders(lockres, level);
+ ocfs2_vote_on_unlock(osb, lockres);
+ spin_unlock(&lockres->l_lock);
+}
+
+/* Grants us an EX lock on the data and metadata resources, skipping
+ * the normal cluster directory lookup. Use this ONLY on newly created
+ * inodes which other nodes can't possibly see, and which haven't been
+ * hashed in the inode hash yet. This can give us a good performance
+ * increase as it'll skip the network broadcast normally associated
+ * with creating a new lock resource. */
+int ocfs2_create_new_inode_locks(struct inode *inode)
+{
+ int status;
+ ocfs_super *osb = OCFS2_SB(inode->i_sb);
+ ocfs2_lock_res *lockres;
+
+ OCFS_ASSERT(inode);
+ OCFS_ASSERT(ocfs_inode_is_new(inode));
+
+ LOG_ENTRY();
+
+ /* NOTE: That we don't increment any of the holder counts, nor
+  * do we add anything to a journal handle. Since this is
+  * supposed to be a new inode which the cluster doesn't know
+  * about yet, there is no need to. As far as the LVB handling
+  * is concerned, this is basically like acquiring an EX lock
+  * on a resource which has an invalid one -- we'll set it
+  * valid when we release the EX. */
+
+ /* LOCK_LOCAL suppresses the NEEDS_REFRESH flag on attach; it
+  * is cleared again right after each create. */
+ lockres = &OCFS_I(inode)->ip_meta_lockres;
+ OCFS_ASSERT(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
+ lockres->l_flags |= OCFS2_LOCK_LOCAL;
+
+ status = ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL);
+ lockres->l_flags &= ~OCFS2_LOCK_LOCAL;
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+ lockres = &OCFS_I(inode)->ip_data_lockres;
+ OCFS_ASSERT(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
+ lockres->l_flags |= OCFS2_LOCK_LOCAL;
+
+ status = ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL);
+ lockres->l_flags &= ~OCFS2_LOCK_LOCAL;
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+ status = 0;
+bail:
+ LOG_EXIT_STATUS(status);
+ return status;
+}
+
+/* Take the per-inode DATA cluster lock: EX for writers, PR for
+ * readers. Returns 0, -EINTR on signal, or a dlm error. Pair with
+ * ocfs2_data_unlock() with the same 'write' value. */
+int ocfs2_data_lock(struct inode *inode,
+ int write)
+{
+ int status, level;
+ ocfs2_lock_res *lockres;
+
+ OCFS_ASSERT(inode);
+
+ LOG_ENTRY();
+
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+ printk("ocfs2: (%u) inode %llu, take %s DATA lock\n",
+        current->pid, OCFS_I(inode)->ip_blkno,
+        write ? "EXMODE" : "PRMODE");
+#endif
+
+ lockres = &OCFS_I(inode)->ip_data_lockres;
+
+ level = write ? LKM_EXMODE : LKM_PRMODE;
+
+ status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0);
+ if (status < 0 && status != -EINTR)
+ LOG_ERROR_STATUS(status);
+
+ LOG_EXIT_STATUS(status);
+ return status;
+}
+
+/* Called with l_lock held after a holder is dropped: if a remote
+ * request is blocked on us and the remaining holders are now
+ * compatible with it, wake the vote thread to do the downconvert. */
+static void ocfs2_vote_on_unlock(ocfs_super *osb,
+ ocfs2_lock_res *lockres)
+{
+ int kick = 0;
+
+ /* If we know that another node is waiting on our lock, kick
+  * the vote thread * pre-emptively when we reach a release
+  * condition. */
+ if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
+ switch(lockres->l_blocking) {
+ case LKM_EXMODE:
+ if (!lockres->l_ex_holders && !lockres->l_ro_holders)
+ kick = 1;
+ break;
+ case LKM_PRMODE:
+ if (!lockres->l_ex_holders)
+ kick = 1;
+ break;
+ default:
+ BUG();
+ }
+ }
+
+ if (kick)
+ ocfs2_kick_vote_thread(osb);
+}
+
+/* Drop the per-inode DATA cluster lock taken by ocfs2_data_lock().
+ *
+ * Fix: the trace printk passed (blkno, pid) to a "(%u) inode %llu"
+ * format, feeding the u64 block number to %u and the pid to %llu --
+ * mismatched format arguments. Order corrected to (pid, blkno), the
+ * same as ocfs2_data_lock(). */
+void ocfs2_data_unlock(struct inode *inode,
+ int write)
+{
+ int level = write ? LKM_EXMODE : LKM_PRMODE;
+ ocfs2_lock_res *lockres = &OCFS_I(inode)->ip_data_lockres;
+
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+ printk("ocfs2: (%u) inode %llu drop %s DATA lock\n",
+        current->pid, OCFS_I(inode)->ip_blkno,
+        write ? "EXMODE" : "PRMODE");
+#endif
+
+ ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+}
+
+/* Block (interruptibly) until no node is in the recovery map.
+ * Returns -EINTR if interrupted by a signal, 0 otherwise. */
+static inline int ocfs2_wait_on_recovery(ocfs_super *osb)
+{
+ wait_event_interruptible(osb->recovery_event,
+     ocfs_node_map_is_empty(osb,
+       &osb->recovery_map));
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ return 0;
+}
+
+/* Call this with the lockres locked. I am reasonably sure we don't
+ * need ip_lock in this function as anyone who would be changing those
+ * values is supposed to be blocked in ocfs2_meta_lock right now. */
+/* Copy the inode's current metadata into the LVB so the next node to
+ * take the meta lock can skip the disk read. */
+static void __ocfs2_stuff_meta_lvb(struct inode *inode)
+{
+ ocfs_inode_private *oip = OCFS_I(inode);
+ ocfs2_lock_res *lockres = &oip->ip_meta_lockres;
+ ocfs2_meta_lvb *lvb = (ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+
+ lvb->lvb_iclusters = oip->ip_clusters;
+ lvb->lvb_iuid = inode->i_uid;
+ lvb->lvb_igid = inode->i_gid;
+ lvb->lvb_isize = inode->i_size;
+ lvb->lvb_imode = inode->i_mode;
+ lvb->lvb_inlink = inode->i_nlink;
+ lvb->lvb_iatime = ocfs_get_seconds(inode->i_atime);
+ lvb->lvb_ictime = ocfs_get_seconds(inode->i_ctime);
+ lvb->lvb_imtime = ocfs_get_seconds(inode->i_mtime);
+}
+
+/* Fast-path inode refresh: pull the metadata out of a trusted LVB
+ * instead of reading the dinode from disk. i_blocks is recomputed
+ * from i_size since it is not carried in the LVB. */
+static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
+{
+ ocfs_inode_private *oip = OCFS_I(inode);
+ ocfs2_lock_res *lockres = &oip->ip_meta_lockres;
+ ocfs2_meta_lvb *lvb = (ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+
+ /* We're safe here without the lockres lock... */
+ spin_lock(&oip->ip_lock);
+ oip->ip_clusters = lvb->lvb_iclusters;
+ inode->i_uid = lvb->lvb_iuid;
+ inode->i_gid = lvb->lvb_igid;
+ inode->i_size = lvb->lvb_isize;
+ inode->i_mode = lvb->lvb_imode;
+ inode->i_nlink = lvb->lvb_inlink;
+ inode->i_blocks = (inode->i_size + inode->i_sb->s_blocksize - 1)
+  >> inode->i_sb->s_blocksize_bits;
+ OCFS_SET_INODE_TIME(inode, i_atime, lvb->lvb_iatime);
+ OCFS_SET_INODE_TIME(inode, i_ctime, lvb->lvb_ictime);
+ OCFS_SET_INODE_TIME(inode, i_mtime, lvb->lvb_imtime);
+ spin_unlock(&oip->ip_lock);
+}
+
+/* After a refresh: if we hold EX (and so own the LVB contents), seed
+ * the truncate cluster count with the inode's current cluster count
+ * as the baseline for our change set. */
+static void ocfs2_reset_meta_lvb_values(struct inode *inode)
+{
+ ocfs2_lock_res *lockres = &OCFS_I(inode)->ip_meta_lockres;
+ ocfs2_meta_lvb *lvb = (ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+ u32 i_clusters;
+
+ spin_lock(&OCFS_I(inode)->ip_lock);
+ i_clusters = OCFS_I(inode)->ip_clusters;
+ spin_unlock(&OCFS_I(inode)->ip_lock);
+
+ spin_lock(&lockres->l_lock);
+ if (lockres->l_level == LKM_EXMODE)
+ lvb->lvb_trunc_clusters = i_clusters;
+ spin_unlock(&lockres->l_lock);
+}
+
+/* Maintain the LVB sequence number across a downconvert. From EX we
+ * bump lvb_seq (skipping 0, which means "uninitialized"); dropping
+ * all the way to NL additionally desyncs l_local_seq so we stop
+ * trusting the LVB. From PR we only desync the local copy.
+ * NOTE(review): the exact seq protocol is shared with
+ * ocfs2_lvb_is_trustable(); confirm against the unblock paths. */
+static void __ocfs2_lvb_on_downconvert(ocfs2_lock_res *lockres,
+ int new_level)
+{
+ ocfs2_lvb *lvb = (ocfs2_lvb *) lockres->l_lksb.lvb;
+
+ if (lockres->l_level == LKM_EXMODE) {
+ lvb->lvb_seq++;
+ /* Overflow? */
+ if (!lvb->lvb_seq)
+ lvb->lvb_seq = 1;
+ lockres->l_local_seq = lvb->lvb_seq;
+ if (new_level == LKM_NLMODE)
+ lockres->l_local_seq++;
+ } else if (lockres->l_level == LKM_PRMODE) {
+ if (lvb->lvb_seq)
+ lockres->l_local_seq++;
+ }
+}
+
+/* Determine whether a lock resource needs to be refreshed, and
+ * arbitrate who gets to refresh it.
+ *
+ * -1 means error, 0 means no refresh needed, > 0 means you need to
+ * refresh this and you MUST call ocfs2_complete_lock_res_refresh
+ * afterwards. */
+static int ocfs2_should_refresh_lock_res(ocfs2_lock_res *lockres)
+{
+
+ int status = 0;
+ LOG_ENTRY();
+
+refresh_check:
+ spin_lock(&lockres->l_lock);
+ if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
+ spin_unlock(&lockres->l_lock);
+ goto bail;
+ }
+
+ /* Someone else is already refreshing -- wait for them and then
+  * recheck from the top. */
+ if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
+ spin_unlock(&lockres->l_lock);
+ if (signal_pending(current)) {
+ status = -EINTR;
+ goto bail;
+ }
+ ocfs2_wait_on_refreshing_lock(lockres);
+ goto refresh_check;
+ }
+
+ /* Ok, I'll be the one to refresh this lock. */
+ lockres->l_flags |= OCFS2_LOCK_REFRESHING;
+ spin_unlock(&lockres->l_lock);
+
+ status = 1;
+bail:
+ LOG_EXIT_STATUS(status);
+ return status;
+}
+
+/* If status is non zero, I'll mark it as not being in refresh
+ * anymore, but i won't clear the needs refresh flag. */
+static inline void ocfs2_complete_lock_res_refresh(ocfs2_lock_res *lockres,
+ int status)
+{
+ spin_lock(&lockres->l_lock);
+ lockres->l_flags &= ~OCFS2_LOCK_REFRESHING;
+ if (!status)
+ lockres->l_flags &= ~OCFS2_LOCK_NEEDS_REFRESH;
+ spin_unlock(&lockres->l_lock);
+
+ /* wake waiters in ocfs2_wait_on_refreshing_lock(). */
+ wake_up(&lockres->l_event);
+}
+
+/* may or may not return a bh if it went to disk. */
+/* Refresh the inode after acquiring the meta lock: use the LVB when
+ * trustable (and not a bitmap file), otherwise read the dinode from
+ * disk, validate it, and refresh from that. Also truncates the
+ * extent map to the trusted cluster count and re-seeds the LVB. */
+static int ocfs2_meta_lock_update(struct inode *inode,
+ struct buffer_head **bh)
+{
+ int status;
+ u32 trustable_clusters = 0;
+ ocfs2_lock_res *lockres;
+ ocfs2_dinode *fe;
+
+ lockres = &OCFS_I(inode)->ip_meta_lockres;
+
+ status = ocfs2_should_refresh_lock_res(lockres);
+ if (!status)
+ goto bail;
+ if (status < 0) {
+ if (status != -EINTR)
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+ /* we don't want to use the LVB for bitmap files as the
+  * used/set bit union is not currently sent over the wire. */
+ if (!(OCFS_I(inode)->ip_flags & OCFS_INODE_BITMAP) &&
+     ocfs2_lvb_is_trustable(lockres)) {
+ /* yay, fastpath! */
+ ocfs2_meta_lvb_get_values(lockres, &trustable_clusters);
+ ocfs2_refresh_inode_from_lvb(inode);
+ } else {
+ /* Boo, we have to go to disk. */
+ /* read bh, cast, ocfs_refresh_inode */
+ status = ocfs_read_block(OCFS2_SB(inode->i_sb),
+    OCFS_I(inode)->ip_blkno, bh,
+    OCFS_BH_CACHED, inode);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+ fe = (ocfs2_dinode *) (*bh)->b_data;
+
+ /* This is a good chance to make sure we're not
+  * locking an invalid object. */
+ OCFS_ASSERT(IS_VALID_FILE_ENTRY(fe));
+ OCFS_ASSERT(inode->i_generation ==
+     le32_to_cpu(fe->i_generation));
+ if ((fe->i_dtime) || (!(fe->i_flags & OCFS2_VALID_FL)))
+ BUG();
+
+ ocfs_refresh_inode(inode, fe);
+ }
+
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+ printk("inode %llu, I can only trust %u clusters\n",
+        OCFS_I(inode)->ip_blkno, trustable_clusters);
+#endif
+
+ ocfs2_extent_map_trunc(inode, trustable_clusters);
+
+ ocfs2_set_local_seq_from_lvb(lockres);
+ ocfs2_reset_meta_lvb_values(inode);
+
+ ocfs2_complete_lock_res_refresh(lockres, 0);
+bail:
+ return status;
+}
+
+/* Take the per-inode META cluster lock (EX or PR), refreshing the
+ * inode afterwards. Optionally returns the dinode buffer through
+ * ret_bh (caller brelse's) and registers the lock with a journal
+ * handle. Returns 0, -EINTR, -EAGAIN (NOQUEUE) or a dlm error.
+ *
+ * Fixes: (1) the trace used "#ifdef VERBOSE_LOCKING_TRACE" while the
+ * rest of the file tests OCFS2_VERBOSE_LOCKING_TRACE, so it could
+ * never be enabled; (2) its printk passed (blkno, pid) to a
+ * "(%u) inode %llu" format -- mismatched arguments, now (pid, blkno)
+ * as in ocfs2_data_lock(). */
+int ocfs2_meta_lock_flags(struct inode *inode,
+ ocfs_journal_handle *handle,
+ struct buffer_head **ret_bh,
+ int ex,
+ int flags)
+{
+ int status, level, dlm_flags;
+ ocfs2_lock_res *lockres;
+ ocfs_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *bh = NULL;
+
+ OCFS_ASSERT(inode);
+
+ /* a journalled meta lock must be exclusive. */
+ if (handle && !ex)
+ BUG();
+
+ LOG_ENTRY();
+
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+ printk("ocfs2: (%u) inode %llu, take %s META lock\n",
+        current->pid, OCFS_I(inode)->ip_blkno,
+        ex ? "EXMODE" : "PRMODE");
+#endif
+
+ if (!(flags & OCFS2_META_LOCK_RECOVERY)) {
+ status = ocfs2_wait_on_recovery(osb);
+ if (status < 0)
+ goto bail;
+ }
+
+ lockres = &OCFS_I(inode)->ip_meta_lockres;
+ level = ex ? LKM_EXMODE : LKM_PRMODE;
+ dlm_flags = 0;
+ if (flags & OCFS2_META_LOCK_NOQUEUE)
+ dlm_flags |= LKM_NOQUEUE;
+
+ status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags);
+ if (status < 0) {
+ if (status != -EINTR && status != -EAGAIN)
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+ /* recheck: recovery may have started while we slept in
+  * ocfs2_cluster_lock(). */
+ if (!(flags & OCFS2_META_LOCK_RECOVERY)) {
+ status = ocfs2_wait_on_recovery(osb);
+ if (status < 0)
+ goto bail;
+ }
+
+ status = ocfs2_meta_lock_update(inode, &bh);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+ if (ret_bh && !bh) {
+ /* caller wants a buffer head but we haven't read it yet. */
+ status = ocfs_read_block(osb, OCFS_I(inode)->ip_blkno, &bh,
+    OCFS_BH_CACHED, inode);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+ }
+ if (ret_bh) {
+ *ret_bh = bh;
+ get_bh(*ret_bh);
+ }
+ if (handle) {
+ status = ocfs_handle_add_lock(handle, inode);
+ if (status < 0)
+ LOG_ERROR_STATUS(status);
+ }
+bail:
+ if (bh)
+ brelse(bh);
+
+ LOG_EXIT_STATUS(status);
+ return status;
+}
+
+/* Drop a meta data cluster lock taken via ocfs2_meta_lock_flags().
+ * 'ex' must match the level the lock was acquired at. */
+void ocfs2_meta_unlock(struct inode *inode,
+		       int ex)
+{
+	int level = ex ? LKM_EXMODE : LKM_PRMODE;
+	ocfs2_lock_res *lockres = &OCFS_I(inode)->ip_meta_lockres;
+
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+	/* fixed: arguments were swapped relative to the format string
+	 * ("%u" is the pid, "%llu" the block number). */
+	printk("ocfs2: (%u) inode %llu drop %s META lock\n",
+	       current->pid, OCFS_I(inode)->ip_blkno,
+	       ex ? "EXMODE" : "PRMODE");
+#endif
+
+	ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+}
+
+/* Take the superblock cluster lock at EX ('ex' non zero) or PR and,
+ * if the dlm indicates our cached state may be stale, refresh the
+ * resources it covers (currently the slot map) while holding it.
+ * Returns 0 or -errno; -EINTR is not logged as an error. */
+int ocfs2_super_lock(ocfs_super *osb,
+		     int ex)
+{
+	int status;
+	int level = ex ? LKM_EXMODE : LKM_PRMODE;
+	ocfs2_lock_res *lockres = &osb->super_lockres;
+	struct buffer_head *bh;
+	ocfs2_slot_info *si = osb->slot_info;
+
+	LOG_ENTRY();
+
+	status = ocfs2_cluster_lock(osb, lockres, level, 0);
+	if (status < 0) {
+		if (status != -EINTR)
+			LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	/* The super block lock path is really in the best position to
+	 * know when resources covered by the lock need to be
+	 * refreshed, so we do it here. Of course, making sense of
+	 * everything is up to the caller :) */
+	status = ocfs2_should_refresh_lock_res(lockres);
+	if (status < 0) {
+		if (status != -EINTR)
+			LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	/* positive return means: we hold the refresh, go do it */
+	if (status) {
+		/* re-read the slot map block in place; bh aliases
+		 * si->si_bh, so presumably ocfs_read_block reuses the
+		 * passed buffer_head -- TODO confirm. */
+		bh = si->si_bh;
+		status = ocfs_read_block(osb, bh->b_blocknr, &bh, 0,
+					 si->si_inode);
+		if (status < 0)
+			LOG_ERROR_STATUS(status);
+
+		/* pass the read status so a failed refresh is recorded */
+		ocfs2_complete_lock_res_refresh(lockres, status);
+	}
+bail:
+	LOG_EXIT_STATUS(status);
+	return status;
+}
+
+/* Release the superblock cluster lock at the level it was taken
+ * ('ex' must match the ocfs2_super_lock() call). */
+void ocfs2_super_unlock(ocfs_super *osb,
+			int ex)
+{
+	ocfs2_cluster_unlock(osb, &osb->super_lockres,
+			     ex ? LKM_EXMODE : LKM_PRMODE);
+}
+
+/* Bring up cluster locking for this mount: launch the vote thread,
+ * join the dlm domain (named after the group) and initialize the
+ * superblock lock resource. Returns 0 or -errno. */
+int ocfs2_dlm_init(ocfs_super *osb)
+{
+	int status, pid;
+	u32 dlm_key;
+	dlm_ctxt *dlm = NULL;
+
+	LOG_ENTRY();
+
+	/* launch vote thread */
+	init_completion (&osb->vote_event_init);
+	pid = kernel_thread(ocfs2_vote_thread, osb,
+			    CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+	if (pid < 0) {
+		status = pid;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	/* the thread completes this once its setup is done */
+	wait_for_completion(&osb->vote_event_init);
+
+	/* used by the dlm code to make message headers unique, each
+	 * node in this domain must agree on this. For now we'll just
+	 * yank that off uuid. */
+	memcpy(&dlm_key, osb->uuid, sizeof(dlm_key));
+
+	/* for now, group_name == domain */
+	dlm = dlm_register_domain(osb->group_name, osb->group_name, dlm_key);
+	if (!dlm) {
+		/* This is a best guess on return value... */
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		/* NOTE(review): the vote thread started above is left
+		 * running on this error path -- confirm the caller
+		 * follows up with ocfs2_dlm_shutdown() to reap it. */
+		goto bail;
+	}
+
+	osb->dlm = dlm;
+
+	status = ocfs2_super_lock_res_init(&osb->super_lockres, osb);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+bail:
+
+	LOG_EXIT_STATUS(status);
+	return status;
+}
+
+/* Tear down what ocfs2_dlm_init() set up: drop the superblock lock,
+ * stop the vote thread if it ever started, free the superblock lock
+ * resource and leave the dlm domain. */
+void ocfs2_dlm_shutdown(ocfs_super *osb)
+{
+	int wait_on_vote_task = 0;
+
+	ocfs2_drop_super_lock(osb);
+
+	/* needs to be able to deal with the dlm being in many
+	 * different states. */
+	spin_lock(&osb->vote_task_lock);
+	if (osb->vote_task) {
+		osb->vote_exit = 1;
+		ocfs2_kick_vote_thread(osb);
+		wait_on_vote_task = 1;
+	}
+	spin_unlock(&osb->vote_task_lock);
+
+	/* wait outside the spinlock; presumably the vote thread
+	 * signals vote_event_complete as it exits (thread body is in
+	 * another file) -- TODO confirm. */
+	if (wait_on_vote_task)
+		wait_for_completion(&osb->vote_event_complete);
+
+	ocfs2_lock_res_free(&osb->super_lockres);
+	dlm_unregister_domain(osb->dlm);
+}
+
+/* Unlock ast: fired by the dlm when a dlmunlock() request completes,
+ * for both convert cancels (LKM_CANCEL) and full lock drops. Clears
+ * the in-flight state recorded on the lockres, drops the BUSY flag
+ * and wakes waiters on l_event. */
+static void ocfs2_unlock_ast_func(void *opaque, dlm_status status)
+{
+	ocfs2_lock_res *lockres = opaque;
+
+	if (status != DLM_NORMAL)
+		LOG_ERROR_ARGS("Dlm returns status %d\n", status);
+
+	spin_lock(&lockres->l_lock);
+	switch(lockres->l_unlock_action) {
+	case OCFS2_UNLOCK_CANCEL_CONVERT:
+		/* the convert was cancelled: forget the pending action */
+		lockres->l_action = OCFS2_AST_INVALID;
+		break;
+	case OCFS2_UNLOCK_DROP_LOCK:
+		/* the lock is fully gone from the dlm */
+		lockres->l_level = LKM_IVMODE;
+		break;
+	default:
+		/* nobody should get here without an unlock in flight */
+		BUG();
+	}
+	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+	lockres->l_flags &= ~OCFS2_LOCK_BUSY;
+	spin_unlock(&lockres->l_lock);
+
+	wake_up(&lockres->l_event);
+}
+
+/* BEWARE: called with lockres lock, and always drops it.
+ *
+ * Tear a lock down completely: issue a dlmunlock (with LKM_VALBLK so
+ * our lock value block is written back) and wait for the unlock ast.
+ * Returns 0, -EINTR if a signal arrived while waiting, or -ENOENT on
+ * dlm error. */
+static int __ocfs2_drop_lock(ocfs_super *osb,
+			     ocfs2_lock_res *lockres)
+{
+	int ret = 0;
+	dlm_status status;
+
+	/* either of these means someone is still using the lock --
+	 * we proceed anyway, but complain loudly. */
+	if (lockres->l_flags & OCFS2_LOCK_BUSY)
+		printk("ocfs2: destroying busy lock!\n");
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+		printk("ocfs2: destroying blocked lock!\n");
+
+	/* never attached to the dlm: nothing to unlock */
+	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+		spin_unlock(&lockres->l_lock);
+		goto bail;
+	}
+
+	lockres->l_flags &= ~OCFS2_LOCK_ATTACHED;
+
+	/* make sure we never get here while waiting for an ast to
+	 * fire. */
+	OCFS_ASSERT(lockres->l_action == OCFS2_AST_INVALID);
+
+	/* is this necessary? */
+	lockres->l_flags |= OCFS2_LOCK_BUSY;
+	lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
+	spin_unlock(&lockres->l_lock);
+
+	status = dlmunlock(osb->dlm,
+			   &lockres->l_lksb,
+			   LKM_VALBLK,
+			   lockres->l_ops->unlock_ast,
+			   lockres);
+	if (status != DLM_NORMAL) {
+		/* NOTE(review): on this path the lockres keeps
+		 * OCFS2_LOCK_BUSY and a pending l_unlock_action --
+		 * confirm that is acceptable on a tear-down path. */
+		LOG_ERROR_ARGS("Dlm returns %d\n", status);
+		ret = -ENOENT;
+		goto bail;
+	}
+
+	/* block until the unlock ast clears OCFS2_LOCK_BUSY */
+	ocfs2_wait_on_busy_lock(lockres);
+	if (signal_pending(current)) {
+		printk("ocfs2_drop_lock: Signal caught!\n");
+		ret = -EINTR;
+	}
+bail:
+	LOG_EXIT_STATUS(ret);
+	return ret;
+}
+
+/* Drop the superblock cluster lock during shutdown. Takes l_lock and
+ * hands it to __ocfs2_drop_lock(), which always releases it. */
+static void ocfs2_drop_super_lock(ocfs_super *osb)
+{
+	int status;
+	ocfs2_lock_res *lockres;
+
+	lockres = &osb->super_lockres;
+
+	spin_lock(&lockres->l_lock);
+	status = __ocfs2_drop_lock(osb, lockres);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+}
+
+/* Drop both of an inode's cluster locks (data, then meta). Both
+ * drops are always attempted; the first error seen is returned. */
+int ocfs2_drop_inode_locks(struct inode *inode)
+{
+	int status, err;
+	ocfs2_lock_res *lockres;
+
+	lockres = &OCFS_I(inode)->ip_data_lockres;
+	spin_lock(&lockres->l_lock);
+	/* __ocfs2_drop_lock releases l_lock for us */
+	err = __ocfs2_drop_lock(OCFS2_SB(inode->i_sb), lockres);
+	if (err < 0)
+		LOG_ERROR_STATUS(err);
+
+	status = err;
+
+	/* the metadata lock requires a bit more work as we have an
+	 * LVB to worry about. */
+	lockres = &OCFS_I(inode)->ip_meta_lockres;
+
+	spin_lock(&lockres->l_lock);
+	if (lockres->l_flags & OCFS2_LOCK_ATTACHED) {
+		/* publish our inode state in the lvb before the drop
+		 * writes it back (LKM_VALBLK); only an EX holder may
+		 * stuff the lvb. */
+		if (lockres->l_level == LKM_EXMODE)
+			__ocfs2_stuff_meta_lvb(inode);
+		/* fake an NLMODE downconvert for the lvb code. */
+		__ocfs2_lvb_on_downconvert(lockres, LKM_NLMODE);
+	}
+	err = __ocfs2_drop_lock(OCFS2_SB(inode->i_sb), lockres);
+	if (err < 0)
+		LOG_ERROR_STATUS(err);
+	if (err < 0 && !status)
+		status = err;
+
+	return status;
+}
+
+/* WARNING: This function lives in a world where the only three lock
+ * levels are EX, PR, and NL. It *will* have to be adjusted when more
+ * lock types are added.
+ *
+ * Returns the highest mode this node may hold while another node
+ * holds 'level'. */
+static inline int ocfs2_highest_compat_lock_level(int level)
+{
+	switch (level) {
+	case LKM_EXMODE:
+		return LKM_NLMODE;	/* nothing is compatible with EX */
+	case LKM_PRMODE:
+		return LKM_PRMODE;	/* readers share with readers */
+	default:
+		return LKM_EXMODE;	/* NL blocks nothing */
+	}
+}
+
+/* called with the spinlock held, and WILL drop it.
+ *
+ * Ask the dlm to convert this lock down to new_level; completion
+ * arrives via the lock type's ast. 'lvb' requests that our lock
+ * value block be written back with the convert (LKM_VALBLK).
+ * Returns 0, or -ENOENT on dlm error after rolling back the
+ * in-flight state. */
+static int __ocfs2_downconvert_lock(ocfs_super *osb,
+				    ocfs2_lock_res *lockres,
+				    int new_level,
+				    int lvb)
+{
+	int status, flags = LKM_CONVERT;
+	/* only makes sense when someone is actually blocked on us and
+	 * we genuinely go down in mode */
+	OCFS_ASSERT(lockres->l_blocking > LKM_NLMODE);
+	OCFS_ASSERT(lockres->l_level > new_level);
+
+	/* record the in-flight convert before dropping l_lock so the
+	 * ast knows what to complete */
+	lockres->l_action = OCFS2_AST_DOWNCONVERT;
+	lockres->l_requested = new_level;
+	lockres->l_flags |= OCFS2_LOCK_BUSY;
+	spin_unlock(&lockres->l_lock);
+
+	if (lvb)
+		flags |= LKM_VALBLK;
+
+	status = dlmlock(osb->dlm,
+			 new_level,
+			 &lockres->l_lksb,
+			 flags,
+			 lockres->l_name,
+			 lockres->l_ops->ast,
+			 lockres,
+			 lockres->l_ops->bast);
+	if (status != DLM_NORMAL) {
+		LOG_ERROR_ARGS("Dlm returns %d\n", status);
+		status = -ENOENT;
+		ocfs2_recover_from_dlm_error(lockres, 1);
+		goto bail;
+	}
+	status = 0;
+bail:
+	return status;
+}
+
+/* called with the spinlock held, and WILL drop it.
+ *
+ * Ask the dlm to cancel an in-flight convert. Returns 0 on success
+ * (either the cancel is in flight and the unlock ast will complete
+ * it, or the convert's ast beat us and we just reset our state), or
+ * -ENOENT on dlm error. */
+static int __ocfs2_cancel_convert(ocfs_super *osb,
+				  ocfs2_lock_res *lockres)
+{
+	int status;
+
+	/* were we in a convert when we got the bast fire? */
+	OCFS_ASSERT(lockres->l_action == OCFS2_AST_CONVERT ||
+		    lockres->l_action == OCFS2_AST_DOWNCONVERT);
+	/* set things up for the unlockast to know to just
+	 * clear out the ast_action and unset busy, etc. */
+	lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;
+	spin_unlock(&lockres->l_lock);
+
+	status = dlmunlock(osb->dlm,
+			   &lockres->l_lksb,
+			   LKM_CANCEL,
+			   lockres->l_ops->unlock_ast,
+			   lockres);
+	if (status == DLM_NORMAL) {
+		/* cancel accepted; the unlock ast finishes the job */
+		status = 0;
+	} else if (status == DLM_CANCELGRANT) {
+		/* If we got this, then the ast was fired
+		 * before we could cancel. We cleanup our
+		 * state, and restart the function. */
+		spin_lock(&lockres->l_lock);
+		lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+		spin_unlock(&lockres->l_lock);
+		status = 0;
+	} else {
+		/* fixed: the old code set status to 0 on DLM_NORMAL
+		 * *before* the DLM_CANCELGRANT comparison, so the
+		 * success path fell through to this error branch and
+		 * returned -ENOENT; CANCELGRANT also never reported
+		 * success. */
+		LOG_ERROR_ARGS("Dlm returns %d\n", status);
+		status = -ENOENT;
+		ocfs2_recover_from_dlm_error(lockres, 0);
+	}
+
+	return status;
+}
+
+/* May the meta lock be downconverted to new_level right now? A
+ * downconvert requires that no local holder is incompatible with the
+ * target mode AND that the journal has checkpointed everything this
+ * lock covers. */
+static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
+						  ocfs2_lock_res *lockres,
+						  int new_level)
+{
+	int compat;
+
+	OCFS_ASSERT(new_level == LKM_NLMODE || new_level == LKM_PRMODE);
+
+	if (new_level == LKM_PRMODE)
+		/* PR shares with readers; only writers conflict */
+		compat = (lockres->l_ex_holders == 0);
+	else
+		/* NL shares with nobody */
+		compat = (lockres->l_ro_holders == 0 &&
+			  lockres->l_ex_holders == 0);
+
+	return compat && ocfs_inode_fully_checkpointed(inode);
+}
+
+/* Respond to a blocking ast on the meta lock: cancel an in-flight
+ * convert, or downconvert once no incompatible holders remain and
+ * the journal is checkpointed, stuffing the lvb first when we held
+ * EX. Sets *requeue to 1 when the caller should schedule another
+ * pass. Returns 0 or -errno. */
+static int ocfs2_do_unblock_meta(struct inode *inode,
+				 int *requeue)
+{
+	int new_level;
+	int set_lvb = 0;
+	ocfs2_lock_res *lockres = &OCFS_I(inode)->ip_meta_lockres;
+
+	spin_lock(&lockres->l_lock);
+	if (!(lockres->l_flags & OCFS2_LOCK_BLOCKED)) {
+		spin_unlock(&lockres->l_lock);
+		return 0;
+	}
+
+	OCFS_ASSERT(lockres->l_level == LKM_EXMODE ||
+		    lockres->l_level == LKM_PRMODE);
+
+	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		/* fixed: was "*requeue++", which incremented the
+		 * pointer rather than setting the caller's flag. */
+		*requeue = 1;
+		if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
+			/* If we're already trying to cancel a lock conversion
+			 * then just drop the spinlock and requeue ourselves
+			 * to check again later. */
+			spin_unlock(&lockres->l_lock);
+			return 0;
+		}
+
+		/* drops l_lock for us */
+		return __ocfs2_cancel_convert(OCFS2_SB(inode->i_sb),
+					      lockres);
+	}
+
+	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
+	if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) {
+		/* only an EX holder may have dirtied the lvb; publish
+		 * our inode state in it before giving up the lock. */
+		if (lockres->l_level == LKM_EXMODE) {
+			__ocfs2_stuff_meta_lvb(inode);
+			set_lvb = 1;
+		}
+		__ocfs2_lvb_on_downconvert(lockres, new_level);
+		/* drops l_lock for us */
+		return __ocfs2_downconvert_lock(OCFS2_SB(inode->i_sb),
+						lockres, new_level,
+						set_lvb);
+	}
+	/* can't downconvert yet -- kick the journal so the checkpoint
+	 * completes, and have the vote thread try us again. */
+	if (!ocfs_inode_fully_checkpointed(inode))
+		ocfs_start_checkpoint(OCFS2_SB(inode->i_sb));
+
+	/* fixed: was "*requeue++" (pointer increment) here too. */
+	*requeue = 1;
+	spin_unlock(&lockres->l_lock);
+
+	return 0;
+}
+
+/* Generic response to a blocking ast: cancel any in-flight convert,
+ * or -- once no local holders are incompatible with the blocked
+ * request -- downconvert to the highest still-compatible mode. An
+ * optional 'worker' runs before the downconvert (with l_lock
+ * dropped, and it may sleep) so lock types can flush state first.
+ * Sets *requeue when the vote thread should try again later. */
+static int ocfs2_generic_unblock_lock(ocfs_super *osb,
+				      ocfs2_lock_res *lockres,
+				      int *requeue,
+				      ocfs2_convert_worker_t *worker)
+{
+	int blocking;
+	int new_level;
+
+	spin_lock(&lockres->l_lock);
+	if (!(lockres->l_flags & OCFS2_LOCK_BLOCKED)) {
+		spin_unlock(&lockres->l_lock);
+		*requeue = 0;
+		return 0;
+	}
+
+	/* on entry the lock must not be busy; the BUSY test just
+	 * below only matters when we loop back to 'recheck' after
+	 * running the worker with l_lock dropped. */
+	OCFS_ASSERT(!(lockres->l_flags & OCFS2_LOCK_BUSY));
+
+recheck:
+	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		*requeue = 1;
+		if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
+			/* If we're already trying to cancel a lock conversion
+			 * then just drop the spinlock and requeue ourselves
+			 * to check again later. */
+			spin_unlock(&lockres->l_lock);
+			return 0;
+		}
+
+		/* drops l_lock for us */
+		return __ocfs2_cancel_convert(osb, lockres);
+	}
+
+	/* if we're blocking an exclusive and we have *any* holders,
+	 * then requeue. */
+	if ((lockres->l_blocking == LKM_EXMODE)
+	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
+		spin_unlock(&lockres->l_lock);
+		*requeue = 1;
+		return 0;
+	}
+
+	/* If it's a PR we're blocking, then only
+	 * requeue if we've got any EX holders */
+	if (lockres->l_blocking == LKM_PRMODE &&
+	    lockres->l_ex_holders) {
+		spin_unlock(&lockres->l_lock);
+		*requeue = 1;
+		return 0;
+	}
+
+	/* If we get here, then we know that there are no more
+	 * incompatible holders (and anyone asking for an incompatible
+	 * lock is blocked). We can now downconvert the lock */
+	if (!worker)
+		goto downconvert;
+
+	/* Some lockres types want to do a bit of work before
+	 * downconverting a lock. Allow that here. The worker function
+	 * may sleep, so we save off a copy of what we're blocking as
+	 * it may change while we're not holding the spin lock. */
+	blocking = lockres->l_blocking;
+	spin_unlock(&lockres->l_lock);
+
+	worker(lockres, blocking);
+
+	spin_lock(&lockres->l_lock);
+	if (blocking != lockres->l_blocking) {
+		/* If this changed underneath us, then we can't drop
+		 * it just yet. */
+		goto recheck;
+	}
+
+downconvert:
+	*requeue = 0;
+	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
+	/* drops l_lock for us */
+	return __ocfs2_downconvert_lock(osb, lockres, new_level, 0);
+}
+
+/* Pre-downconvert worker for data locks: push our dirty buffers to
+ * disk and, when we're losing to an EX request, drop our page cache
+ * so pages are re-read after the other node writes. Called with
+ * l_lock dropped; may sleep. */
+static void ocfs2_data_convert_worker(ocfs2_lock_res *lockres,
+				      int blocking)
+{
+	struct inode *inode = ocfs2_lock_res_inode(lockres);
+
+	sync_mapping_buffers(inode->i_mapping);
+	if (blocking == LKM_EXMODE)
+		ocfs_truncate_inode_pages(inode, 0);
+}
+
+/* Unblock handler (l_ops->unblock) for data locks: run the generic
+ * machinery with the data worker, then translate any requeue request
+ * into an inode-aware reschedule so the inode reference counting
+ * stays balanced. Always reports *requeue = 0 to the caller. */
+int ocfs2_unblock_data(ocfs2_lock_res *lockres,
+		       int *requeue)
+{
+	int status;
+	struct inode *inode = ocfs2_lock_res_inode(lockres);
+	ocfs_super *osb = OCFS2_SB(inode->i_sb);
+
+	status = ocfs2_generic_unblock_lock(osb,
+					    lockres,
+					    requeue,
+					    ocfs2_data_convert_worker);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+
+	/* because of inode ref counting, we never want to propagate
+	 * up requeue requests for inode locks. Instead we do it
+	 * ourselves here, and lose the extra ref we got from queueing
+	 * when we came in. */
+	if (*requeue)
+		ocfs2_schedule_blocked_inode_lock(inode, lockres);
+
+	iput(inode);
+	*requeue = 0;
+
+	return status;
+}
+
+/* Unblock handler (l_ops->unblock) for meta locks. Like
+ * ocfs2_unblock_data(), requeueing is handled here (with an inode
+ * ref) rather than propagated to the caller. */
+int ocfs2_unblock_meta(ocfs2_lock_res *lockres,
+		       int *requeue)
+{
+	int status;
+	struct inode *inode = ocfs2_lock_res_inode(lockres);
+
+	status = ocfs2_do_unblock_meta(inode, requeue);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+
+	/* if you're confused by this, see the comment in
+	 * ocfs2_unblock_data */
+	if (*requeue)
+		ocfs2_schedule_blocked_inode_lock(inode, lockres);
+
+	iput(inode);
+	*requeue = 0;
+
+	return status;
+}
+
+/* Unblock handler for the superblock lock; no pre-downconvert
+ * worker is required, so the generic path runs with a NULL worker. */
+static int ocfs2_unblock_super(ocfs2_lock_res *lockres,
+			       int *requeue)
+{
+	ocfs_super *osb = ocfs2_lock_res_super(lockres);
+	int ret;
+
+	ret = ocfs2_generic_unblock_lock(osb, lockres, requeue, NULL);
+	if (ret < 0)
+		LOG_ERROR_STATUS(ret);
+
+	return ret;
+}
+
+/* Entry point for the vote thread: dispatch a queued blocked lock to
+ * its type's unblock handler and re-queue it when the handler asks
+ * for another pass. */
+void ocfs2_process_blocked_lock(ocfs_super *osb,
+				ocfs2_lock_res *lockres)
+{
+	int status;
+	int requeue = 0;
+
+	OCFS_ASSERT(lockres);
+	OCFS_ASSERT(lockres->l_ops);
+	OCFS_ASSERT(lockres->l_ops->unblock);
+
+	status = lockres->l_ops->unblock(lockres, &requeue);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+
+	if (requeue)
+		ocfs2_schedule_blocked_lock(osb, lockres);
+}
+
+/* Queue a lockres on the osb's blocked list for the vote thread; a
+ * non-empty l_blocked_list means it is already queued, so it is
+ * never queued twice. */
+static void ocfs2_schedule_blocked_lock(ocfs_super *osb,
+					ocfs2_lock_res *lockres)
+{
+	spin_lock(&osb->vote_task_lock);
+	if (list_empty(&lockres->l_blocked_list)) {
+		list_add_tail(&lockres->l_blocked_list,
+			      &osb->blocked_lock_list);
+		osb->blocked_lock_count++;
+	}
+	spin_unlock(&osb->vote_task_lock);
+}
+
+/* needed for inodes as we have to take a reference on them.. */
+static void ocfs2_schedule_blocked_inode_lock(struct inode *inode,
+					      ocfs2_lock_res *lockres)
+{
+	/* igrab failing means the inode is mid clear_inode -- in that
+	 * case the lock is simply not rescheduled. */
+	if (!igrab(inode)) {
+		LOG_ERROR_ARGS("Inode %llu asked to be scheduled during "
+			       "clear_inode!\n", OCFS_I(inode)->ip_blkno);
+		return;
+	}
+
+	ocfs2_schedule_blocked_lock(OCFS2_SB(inode->i_sb), lockres);
+}
Added: trunk/src/dlmglue.h
===================================================================
--- trunk/src/dlmglue.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/dlmglue.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,131 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmglue.h
+ *
+ * description here
+ *
+ * Copyright (C) 2002, 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+
+/* guard renamed from the leftover placeholder "MIDDLE_H" to match
+ * the file name (the closing #endif carries no name). */
+#ifndef DLMGLUE_H
+#define DLMGLUE_H
+
+/*
+ * LVB Sequence number rules:
+ * local seq and lvb seq are initialized to zero.
+ *
+ * Note that the lvb is basically invalid until the 1st EX downconvert
+ * as he's the only guy that can set it valid. This is ok though as PR
+ * holders would have to do an I/O under lock anyway.
+ *
+ * NL->PR:
+ * NL->EX:
+ * If LVB is valid:
+ * if local seq == lvb seq, then we are up to date with the contents.
+ * otherwise, we take the slow path to get up to date and then set our
+ * local seq to the lvb seq.
+ *
+ * PR->NL:
+ * If LVB is valid:
+ * We increment our local seq. -- this allows up to
+ * one set of changes to the lvb before we consider ourselves
+ * invalid.
+ *
+ * PR->EX:
+ * Do nothing.
+ *
+ * EX->NL:
+ * EX->PR:
+ * Set the LVB as valid.
+ * Populate the LVB contents (this is lock type specific)
+ * Increment the LVB seq.
+ * Set my local seq to the LVB seq.
+ * if (EX->NL)
+ * do an additional increment of my local seq.
+ */
+/* Common lvb header: the sequence number driving the staleness rules
+ * described above. */
+typedef struct _ocfs2_lvb {
+	u32 lvb_seq;
+} ocfs2_lvb;
+/* Meta lock lvb: inode fields mirrored into the lock value block so
+ * another node can refresh its cached inode without a disk read when
+ * the sequence numbers match. Cast from l_lksb.lvb, so layout
+ * matters. */
+typedef struct _ocfs2_meta_lvb {
+	ocfs2_lvb lvb;			/* common header, kept first */
+	u32       lvb_trunc_clusters;	/* clusters still trustworthy
+					 * after a truncate (clamped by
+					 * ocfs2_lvb_set_trunc_clusters) */
+	u32       lvb_iclusters;
+	u32       lvb_iuid;
+	u32       lvb_igid;
+	u64       lvb_isize;
+	u16       lvb_imode;
+	u16       lvb_inlink;
+	u64       lvb_iatime;
+	u64       lvb_ictime;
+	u64       lvb_imtime;
+} ocfs2_meta_lvb;
+
+int ocfs2_dlm_init(ocfs_super *osb);
+void ocfs2_dlm_shutdown(ocfs_super *osb);
+int ocfs2_inode_lock_res_init(ocfs2_lock_res *res,
+ enum ocfs2_lock_type type,
+ struct inode *inode);
+int ocfs2_super_lock_res_init(ocfs2_lock_res *res,
+ ocfs_super *osb);
+void ocfs2_lock_res_free(ocfs2_lock_res *res);
+int ocfs2_create_new_inode_locks(struct inode *inode);
+int ocfs2_drop_inode_locks(struct inode *inode);
+int ocfs2_data_lock(struct inode *inode,
+ int write);
+void ocfs2_data_unlock(struct inode *inode,
+ int write);
+/* don't wait on recovery. */
+#define OCFS2_META_LOCK_RECOVERY (0x01)
+/* Instruct the dlm not to queue ourselves on the other node. */
+#define OCFS2_META_LOCK_NOQUEUE (0x02)
+/* 99% of the time we don't want to supply any additional flags --
+ * those are for very specific cases only. */
+#define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_flags(i, h, b, e, 0)
+int ocfs2_meta_lock_flags(struct inode *inode,
+ ocfs_journal_handle *handle,
+ struct buffer_head **ret_bh,
+ int ex,
+ int flags);
+void ocfs2_meta_unlock(struct inode *inode,
+ int ex);
+int ocfs2_super_lock(ocfs_super *osb,
+ int ex);
+void ocfs2_super_unlock(ocfs_super *osb,
+ int ex);
+/* for the vote thread */
+void ocfs2_process_blocked_lock(ocfs_super *osb,
+ ocfs2_lock_res *lockres);
+
+/* Clamp (never raise) the meta lvb's trunc_clusters to
+ * 'trunc_clusters' after a truncate. The caller must hold the meta
+ * lock EX (asserted) -- only an EX holder may modify the lvb. */
+static inline void ocfs2_lvb_set_trunc_clusters(struct inode *inode,
+						unsigned int trunc_clusters)
+{
+	ocfs2_lock_res *lockres = &OCFS_I(inode)->ip_meta_lockres;
+	ocfs2_meta_lvb *lvb;
+
+	spin_lock(&lockres->l_lock);
+	OCFS_ASSERT(lockres->l_level == LKM_EXMODE);
+
+	lvb = (ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+	if (lvb->lvb_trunc_clusters > trunc_clusters)
+		lvb->lvb_trunc_clusters = trunc_clusters;
+	spin_unlock(&lockres->l_lock);
+}
+
+#endif
Modified: trunk/src/file.c
===================================================================
--- trunk/src/file.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/file.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -38,7 +38,7 @@
#include "alloc.h"
#include "dir.h"
-#include "dlm.h"
+#include "dlmglue.h"
#include "extent_map.h"
#include "file.h"
#include "sysfile.h"
@@ -63,188 +63,62 @@
ocfs2_dinode *fe,
u64 new_size);
-static void ocfs_fe_set_attributes(ocfs2_dinode *fe, struct iattr *attr)
-{
- if (attr->ia_valid & ATTR_SIZE)
- fe->i_size = attr->ia_size;
- if (attr->ia_valid & ATTR_UID)
- fe->i_uid = attr->ia_uid;
- if (attr->ia_valid & ATTR_GID)
- fe->i_gid = attr->ia_gid;
- if (attr->ia_valid & ATTR_MODE)
- fe->i_mode = attr->ia_mode;
- if (attr->ia_valid & ATTR_CTIME)
- fe->i_ctime = ocfs_get_seconds(attr->ia_ctime);
- if (attr->ia_valid & ATTR_ATIME)
- fe->i_atime = ocfs_get_seconds(attr->ia_atime);
- if (attr->ia_valid & ATTR_MTIME)
- fe->i_mtime = ocfs_get_seconds(attr->ia_mtime);
-}
-
int ocfs_sync_inode(struct inode *inode)
{
filemap_fdatawrite(inode->i_mapping);
return sync_mapping_buffers(inode->i_mapping);
}
-static inline int ocfs_wait_on_first_open(ocfs_super *osb,
- struct inode *inode)
+/* Checks an open request against our currently open mode */
+static inline int ocfs2_valid_open(int mode, int open_direct)
{
- int status = 0;
- sigset_t tmpsig;
+ int ret = 1;
- ocfs_block_sigs(&tmpsig, SHUTDOWN_SIGS);
-again:
- if (signal_pending(current)) {
- status = -EINTR;
- goto bail;
+ if (mode & O_DIRECT) {
+ if (!open_direct)
+ ret = 0;
+ } else {
+ if (open_direct && !(mode & O_RDONLY))
+ ret = 0;
}
-
- spin_lock(&OCFS_I(inode)->ip_lock);
- if (!(OCFS_I(inode)->ip_open_flags & OCFS_IN_FIRST_OPEN))
- goto bail;
- spin_unlock(&OCFS_I(inode)->ip_lock);
-
- interruptible_sleep_on(&osb->open_event);
- goto again;
-
-bail:
- spin_unlock(&OCFS_I(inode)->ip_lock);
- ocfs_unblock_sigs(tmpsig);
-
- return(status);
+ return ret;
}
-static inline void ocfs_notify_openers(ocfs_super *osb)
-{
- wake_up(&osb->open_event);
-}
-
/*
* ocfs_file_open()
*
*/
static int ocfs_file_open(struct inode *inode, struct file *file)
{
- int ret =0, err = 0, status = 0, first_open = 0;
+ int status;
int mode = file->f_flags;
- ocfs_super *osb = OCFS_SB(inode->i_sb);
ocfs_inode_private *oip = OCFS_I(inode);
LOG_ENTRY_ARGS ("(0x%p, 0x%p, '%*s')\n", inode, file,
file->f_dentry->d_name.len,
file->f_dentry->d_name.name);
- if (osb->osb_flags & OCFS_OSB_FLAGS_SHUTDOWN) {
- LOG_ERROR_STR ("Volume has been shutdown");
- status = -EACCES;
- goto leave;
- }
+ status = -EACCES;
- if (atomic_read(&oip->ip_needs_verification)) {
- down(&inode->i_sem);
- status = ocfs_verify_update_inode (osb, inode);
- up(&inode->i_sem);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto leave;
- }
- }
-
spin_lock(&oip->ip_lock);
- if (!oip->ip_open_cnt++) {
- first_open = 1;
- oip->ip_open_flags |= OCFS_IN_FIRST_OPEN;
+ if (oip->ip_open_cnt &&
+ !ocfs2_valid_open(mode,
+ oip->ip_flags & OCFS_INODE_OPEN_DIRECT)) {
+ spin_unlock(&oip->ip_lock);
+ goto leave;
}
- spin_unlock(&oip->ip_lock);
- if (!first_open)
- status = ocfs_wait_on_first_open(osb, inode);
+ if (mode & O_DIRECT)
+ oip->ip_flags |= OCFS_INODE_OPEN_DIRECT;
+ else
+ oip->ip_flags &= ~OCFS_INODE_OPEN_DIRECT;
- if (status < 0) {
- if (status != -EINTR)
- LOG_ERROR_STATUS(status);
- spin_lock(&oip->ip_lock);
- oip->ip_open_cnt--;
- goto leave_unlock;
- }
-
- /* TODO: if we're not opening for write then lets send an additional
- * flag over to tell the other node it's not necessary to do the
- * truncate_inode_pages (he just has to sync). */
-
- status = 0;
- if (!(mode & O_DIRECT)
- && (first_open || (mode & (O_WRONLY|O_RDWR))))
- status = ocfs_notify_on_open(osb, inode);
-
- spin_lock(&oip->ip_lock);
- if (first_open) {
- oip->ip_open_flags &= ~OCFS_IN_FIRST_OPEN;
- ocfs_notify_openers(osb);
- }
-
- if (status < 0) {
- oip->ip_open_cnt--;
- if (status != -EINTR)
- LOG_ERROR_STATUS(status);
- goto leave_unlock;
- }
-
- if (oip->ip_open_cnt > 1) {
- /* We're not the only person who has it open right
- * now so lets check whether the requested
- * access/share access conflicts with the existing
- * open operations. */
-
- LOG_TRACE_ARGS ("oin->ip_open_cnt > 0! : %u\n",
- oip->ip_open_cnt);
- if (!(mode & O_DIRECT)) {
- if ((oip->ip_open_flags & OCFS_OIN_OPEN_FOR_DIRECTIO) && !(mode & O_RDONLY)) {
- oip->ip_open_cnt--;
- status = -EACCES;
- LOG_TRACE_STR("file is already open O_DIRECT, "
- "cannot open non O_DIRECT");
- goto leave_unlock;
- }
- } else if (mode & O_DIRECT) {
- if (!(oip->ip_open_flags & OCFS_OIN_OPEN_FOR_DIRECTIO)) {
- oip->ip_open_cnt--;
- status = -EACCES;
- LOG_TRACE_STR("file is already open non " \
- "O_DIRECT, cannot open " \
- "O_DIRECT");
- goto leave_unlock;
- }
- }
- status = 0;
- } else {
- if (mode & O_DIRECT)
- OCFS_SET_FLAG(oip->ip_open_flags, OCFS_OIN_OPEN_FOR_DIRECTIO);
- else
- OCFS_CLEAR_FLAG(oip->ip_open_flags, OCFS_OIN_OPEN_FOR_DIRECTIO);
- }
-
-leave_unlock:
+ oip->ip_open_cnt++;
spin_unlock(&oip->ip_lock);
-
+ status = 0;
leave:
- if (status < 0) {
- if (status != -ENOENT && status != -ENOMEM &&
- status != -EACCES && status != -EINTR) {
- LOG_ERROR_STATUS (status);
- ret = -EACCES;
- } else
- ret = status;
- } else {
- ret = 0;
- }
-
- LOG_TRACE_ARGS
- ("exiting file_open: file=%p dentry=%p inode=%p kiovec=%d\n",
- file, file->f_dentry, file->f_dentry->d_inode, err);
- LOG_EXIT_INT (ret);
- return ret;
+ LOG_EXIT_STATUS(status);
+ return status;
} /* ocfs_file_open */
static int ocfs_file_release(struct inode *inode, struct file *file)
@@ -258,7 +132,7 @@
spin_lock(&oip->ip_lock);
if (!--oip->ip_open_cnt)
- oip->ip_open_flags &= ~OCFS_OIN_OPEN_FOR_DIRECTIO;
+ oip->ip_flags &= ~OCFS_INODE_OPEN_DIRECT;
spin_unlock(&oip->ip_lock);
LOG_EXIT_INT(0);
@@ -271,7 +145,7 @@
*
*/
static int ocfs_sync_file(struct file *file, struct dentry *dentry,
- int datasync)
+ int datasync)
{
int err = 0;
journal_t *journal;
@@ -303,87 +177,8 @@
return (err < 0) ? -EIO : 0;
} /* ocfs_sync_file */
-/* ocfs_change_file_attrib()
- *
- */
-static int ocfs_change_file_attrib(ocfs_super *osb, struct iattr *attr,
- struct inode *inode)
-{
- int status = 0;
- ocfs2_dinode *fe = NULL;
- struct buffer_head *bh = NULL;
- ocfs_journal_handle *handle = NULL;
- LOG_ENTRY ();
-
#ifdef PURE_EVIL
- if (evil_filename_check(EVIL_INODE, inode)) {
- LOG_ERROR_STR("EVIL ATTRIB");
- }
-#endif
-
- handle = ocfs_alloc_handle(osb);
- if (handle == NULL) {
- LOG_ERROR_STATUS(status);
- goto leave;
- }
-
- status = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE, 0, &bh, inode);
- if (status < 0) {
- if (status != -EINTR)
- LOG_ERROR_STATUS (status);
- goto leave;
- }
- ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, FLAG_FILE_UPDATE_OIN,
- inode);
-
- /* Start a transaction - need a minimal amount of block credits (1) */
- handle = ocfs_start_trans(osb, handle, OCFS_INODE_UPDATE_CREDITS);
- if (handle == NULL) {
- LOG_ERROR_STATUS(status);
- goto leave;
- }
-
- fe = (ocfs2_dinode *) bh->b_data;
-
- OCFS_ASSERT_RO(IS_VALID_FILE_ENTRY(fe));
-
- status = ocfs_journal_access(handle, inode, bh,
- OCFS_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- LOG_ERROR_STATUS(status);
- goto leave;
- }
-
- fe = (ocfs2_dinode *) bh->b_data;
-
- fe->i_mtime = OCFS_CURRENT_TIME;
-
- ocfs_fe_set_attributes(fe, attr);
-
- status = ocfs_journal_dirty(handle, bh);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto leave;
- }
-
-leave:
- if (handle)
- ocfs_commit_trans(handle);
-
- if (bh != NULL)
- brelse(bh);
-
- if (status < 0)
- if (status != -ENOSPC && status != -EINTR)
- LOG_ERROR_STATUS (status);
-
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_change_file_attrib */
-
-
-#ifdef PURE_EVIL
int evil_filename_check(int type, void *ptr)
{
struct file *filp = ptr;
@@ -423,7 +218,7 @@
size_t count, loff_t *ppos)
{
int ret = 0;
- int writingAtEOF = 0;
+ int extended = 0;
ocfs_super *osb = NULL;
struct dentry *dentry = filp->f_dentry;
struct inode *inode = dentry->d_inode;
@@ -433,24 +228,15 @@
int do_direct_io = 0;
int sector_size;
int have_i_sem = 0;
+ int level = filp->f_flags & O_APPEND;
+ loff_t saved_ppos;
LOG_SET_CONTEXT(WRITE);
LOG_ENTRY_ARGS ("(0x%p, 0x%p, %u, '%*s')\n", filp, buf,
(unsigned int)count,
- filp->f_dentry->d_name.len, filp->f_dentry->d_name.name);
-
-#ifdef PURE_EVIL
- if (evil_filename_check(EVIL_DENTRY, dentry)) {
- int z;
- LOG_ERROR_ARGS("EVIL FILE_WRITE: count=%u, ppos=%llu, flags=%d\n", (unsigned int)count, *ppos, filp->f_flags);
- for (z=0; z<(count<16?count:16); z++) {
- printk("data[%d]=%02x ", z, ((char)buf[z]) & 0xff);
- }
- printk("\n");
- }
-#endif
-
+ filp->f_dentry->d_name.len,
+ filp->f_dentry->d_name.name);
/* happy write of zero bytes */
if (count == 0) {
ret = 0;
@@ -466,19 +252,25 @@
osb = OCFS_SB(inode->i_sb);
sector_size = 1 << osb->s_sectsize_bits;
- if (osb->osb_flags & OCFS_OSB_FLAGS_SHUTDOWN) {
- LOG_TRACE_STR ("Volume has already started shutdown");
- ret = -EIO;
- goto bail;
- }
-
down(&inode->i_sem);
have_i_sem = 1;
+lock:
+ status = ocfs2_meta_lock(inode, NULL, NULL, level);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ ret = status;
+ goto bail;
+ }
+
+ /* work on a copy of ppos until we're sure that we won't have
+ * to recalculate it due to relocking. */
+ saved_ppos = *ppos;
+
if (filp->f_flags & O_APPEND) {
LOG_TRACE_ARGS("O_APPEND: inode->i_size=%llu, ppos was %llu\n",
- inode->i_size, *ppos);
- *ppos = inode->i_size;
+ inode->i_size, saved_ppos);
+ saved_ppos = inode->i_size;
/* ugh, work around some applications which open
* everything O_DIRECT + O_APPEND and really don't
@@ -490,40 +282,38 @@
if (filp->f_flags & O_DIRECT) {
/* anything special for o_direct? */
LOG_TRACE_STR ("O_DIRECT");
- if (((*ppos) & (sector_size - 1)) || (count & (sector_size - 1)) ||
- ((unsigned long)buf & (sector_size - 1)) ) {
+ if ((saved_ppos & (sector_size - 1)) ||
+ (count & (sector_size - 1)) ||
+ ((unsigned long)buf & (sector_size - 1))) {
do_direct_io = 0;
filp->f_flags |= O_SYNC;
} else
do_direct_io = 1;
}
- if (atomic_read(&OCFS_I(inode)->ip_needs_verification)) {
- LOG_TRACE_STR ("OIN_NEEDS_VERIFICATION");
- status = ocfs_verify_update_inode (osb, inode);
- if (status < 0) {
- LOG_TRACE_STR ("ocfs_verify_update_inode failed");
- LOG_TRACE_STR ("TODO: disable volume");
- ret = -EIO;
- goto bail;
- }
- }
- newsize = count + *ppos;
+ newsize = count + saved_ppos;
if (filp->f_flags & O_APPEND)
newsize = count + inode->i_size;
LOG_TRACE_ARGS ("ppos=%llu newsize=%llu cursize=%llu\n",
- *ppos, newsize, inode->i_size);
+ saved_ppos, newsize, inode->i_size);
if (newsize > inode->i_size) {
- writingAtEOF = 1;
+ if (!level) {
+ /* we want an extend, but need a higher
+ * level cluster lock. */
+ LOG_TRACE_ARGS("inode %llu, had a PR, looping back "
+ "for EX\n", OCFS_I(inode)->ip_blkno);
+ ocfs2_meta_unlock(inode, level);
+ level = 1;
+ goto lock;
+ }
+ extended = 1;
- LOG_TRACE_ARGS
- ("Writing at EOF, will need more allocation: have=%llu, "
- "need=%llu\n",
- ocfs2_clusters_to_bytes(inode->i_sb,
- OCFS_I(inode)->ip_clusters),
- newsize);
+ LOG_TRACE_ARGS("Writing at EOF, will need more allocation: "
+ "i_size=%llu, need=%llu\n",
+ inode->i_size, newsize);
+
status = ocfs_extend_file(osb, inode, newsize);
if (status < 0) {
if (status != -EINTR && status != -ENOSPC) {
@@ -534,10 +324,28 @@
ret = -ENOSPC;
} else
ret = status;
+
+ ocfs2_meta_unlock(inode, level);
goto bail;
}
}
+ /* we've got whatever cluster lock is appropriate now, so we
+ * can stuff *ppos back. */
+ *ppos = saved_ppos;
+
+ if (!do_direct_io) {
+ status = ocfs2_data_lock(inode, 1);
+ if (status < 0) {
+ if (status != -EINTR)
+ LOG_ERROR_STATUS(status);
+ ret = status;
+
+ ocfs2_meta_unlock(inode, level);
+ goto bail;
+ }
+ }
+
down_read(&OCFS_I(inode)->ip_alloc_sem);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
/*
@@ -565,7 +373,10 @@
ret = generic_file_write_nolock (filp, buf, count, ppos);
#endif
up_read(&OCFS_I(inode)->ip_alloc_sem);
- if (writingAtEOF) {
+ if (!do_direct_io)
+ ocfs2_data_unlock(inode, 1);
+
+ if (extended) {
LOG_TRACE_STR
("Generic_file_write ok, asking for OIN update now");
inode->i_size = newsize;
@@ -586,6 +397,7 @@
LOG_ERROR_ARGS("Unable to pre-zero extension of inode (%d)", status);
}
}
+ ocfs2_meta_unlock(inode, level);
bail:
if (have_i_sem)
@@ -644,14 +456,23 @@
} else
do_direct_io = 1;
}
- if (atomic_read(&OCFS_I(inode)->ip_needs_verification)) {
- down(&inode->i_sem);
- status = ocfs_verify_update_inode (osb, inode);
- up(&inode->i_sem);
+
+ /* yay, PR (shared) locks all 'round :) */
+ status = ocfs2_meta_lock(inode, NULL, NULL, 0);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ /* is this ret code correct? */
+ ret = status;
+ goto bail;
+ }
+
+ if (!do_direct_io) {
+ status = ocfs2_data_lock(inode, 0);
if (status < 0) {
- LOG_TRACE_STR ("ocfs_verify_update_inode failed");
- LOG_TRACE_STR ("TODO: disable volume");
- ret = -EIO;
+ if (status != -EINTR)
+ LOG_ERROR_STATUS(status);
+ /* is this ret code correct? */
+ ret = status;
goto bail;
}
}
@@ -684,6 +505,9 @@
if (ret == -EINVAL)
LOG_ERROR_STR ("Generic_file_read returned -EINVAL");
+ if (!do_direct_io)
+ ocfs2_data_unlock(inode, 0);
+ ocfs2_meta_unlock(inode, 0);
bail:
LOG_EXIT_INT (ret);
@@ -729,9 +553,10 @@
grow = new_i_size > inode->i_size;
inode->i_size = new_i_size;
- OCFS_SET_INODE_TIME(inode, i_mtime, OCFS_CURRENT_TIME);
inode->i_blocks = (new_i_size + sb->s_blocksize - 1)
>> sb->s_blocksize_bits;
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
status = ocfs_mark_inode_dirty(handle, inode, fe_bh);
if (status < 0) {
LOG_ERROR_STATUS (status);
@@ -797,22 +622,14 @@
LOG_ENTRY_ARGS("(inode = %llu, new_i_size = %llu\n",
OCFS_I(inode)->ip_blkno, new_i_size);
- handle = ocfs_alloc_handle(osb);
- if (handle == NULL) {
- LOG_ERROR_STATUS (status = -ENOMEM);
- goto bail;
- }
+ ocfs_truncate_inode_pages(inode, new_i_size);
- status = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE,
- FLAG_FILE_TRUNCATE|FLAG_FILE_UPDATE_OIN,
- &fe_bh, inode);
+ status = ocfs_read_block(osb, OCFS_I(inode)->ip_blkno, &fe_bh,
+ OCFS_BH_CACHED, inode);
if (status < 0) {
- if (status != -EINTR)
- LOG_ERROR_STATUS (status);
+ LOG_ERROR_STATUS(status);
goto bail;
}
- ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE,
- FLAG_FILE_TRUNCATE|FLAG_FILE_UPDATE_OIN, inode);
fe = (ocfs2_dinode *) fe_bh->b_data;
OCFS_ASSERT(IS_VALID_FILE_ENTRY(fe));
@@ -841,19 +658,32 @@
"truncate\n", fe->i_clusters);
/* No allocation change is required, so lets fast path
* this truncate. */
- handle = ocfs_start_trans(osb, handle,
+ handle = ocfs_start_trans(osb, NULL,
OCFS_INODE_UPDATE_CREDITS);
if (handle == NULL) {
LOG_ERROR_STATUS (status = -ENOMEM);
goto bail;
}
+ /* Since we got our cluster lock from caller and we
+ * don't add it to the handle: */
+ ocfs_set_inode_lock_trans(osb->journal, inode);
+
status = ocfs_set_inode_size(handle, inode, fe_bh, new_i_size);
if (status < 0)
LOG_ERROR_STATUS (status);
goto bail;
}
+ /* This forces other nodes to sync and drop their pages */
+ status = ocfs2_data_lock(inode, 1);
+ if (status < 0) {
+ if (status != -EINTR)
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+ ocfs2_data_unlock(inode, 1);
+
/* alright, we're going to need to do a full blown alloc size
* change. Orphan the inode so that recovery can complete the
* truncate if necessary. This does the task of marking
@@ -1046,7 +876,7 @@
/* TODO: We will keep a small history of allocs on the filp
* and calculate a reasonable overalloc based on that data
* here. */
- return(0);
+ return 0;
}
/* ocfs_extend_file()
@@ -1085,26 +915,20 @@
goto leave;
}
- status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, FLAG_FILE_EXTEND,
- &bh, inode);
+ status = ocfs_read_block(osb, OCFS_I(inode)->ip_blkno, &bh,
+ OCFS_BH_CACHED, inode);
if (status < 0) {
- if (status != -EINTR)
- LOG_ERROR_STATUS (status);
+ LOG_ERROR_STATUS(status);
goto leave;
}
- ocfs_handle_add_lock(handle,
- OCFS_LKM_EXMODE,
- FLAG_FILE_EXTEND|FLAG_FILE_UPDATE_OIN,
- inode);
fe = (ocfs2_dinode *) bh->b_data;
OCFS_ASSERT(IS_VALID_FILE_ENTRY(fe));
- OCFS_ASSERT(new_i_size >= fe->i_size);
+ OCFS_ASSERT(inode->i_size == fe->i_size);
+ OCFS_ASSERT(new_i_size >= inode->i_size);
- if (fe->i_size == new_i_size) {
- OCFS_ASSERT(inode->i_size == new_i_size);
- goto leave;
- }
+ if (inode->i_size == new_i_size)
+ goto leave;
clusters_to_add = ocfs2_clusters_for_bytes(osb->sb, new_i_size)
- fe->i_clusters;
@@ -1114,14 +938,14 @@
OCFS_I(inode)->ip_blkno, new_i_size, inode->i_size,
fe->i_clusters, clusters_to_add);
- if (!clusters_to_add)
+ if (!clusters_to_add)
goto do_start_trans;
overalloc_bits = 0;
if (!skip_overalloc) {
- overalloc_bits = ocfs_calc_overalloc_bits(osb,
- NULL,
- fe,
+ overalloc_bits = ocfs_calc_overalloc_bits(osb,
+ NULL,
+ fe,
new_i_size);
clusters_to_add += overalloc_bits;
skip_overalloc = 1;
@@ -1171,6 +995,9 @@
goto leave;
}
+ /* Since we got our cluster lock from caller and we don't add
+ * it to the handle: */
+ ocfs_set_inode_lock_trans(osb->journal, inode);
restarted_transaction:
/* reserve a write to the file entry early on - that we if we
* run out of credits in the allocation path, we can still
@@ -1249,14 +1076,14 @@
fe->i_size = ocfs2_clusters_to_bytes(osb->sb, fe->i_clusters);
else
fe->i_size = new_i_size;
-#warning "is there a reason why we don't update i_blocks here?"
+
LOG_TRACE_ARGS("fe: i_clusters = %u, i_size=%llu\n",
fe->i_clusters, fe->i_size);
LOG_TRACE_ARGS("inode: ip_clusters=%u, i_size=%llu\n",
OCFS_I(inode)->ip_clusters, inode->i_size);
- fe->i_mtime = OCFS_CURRENT_TIME;
+ fe->i_ctime = fe->i_mtime = OCFS_CURRENT_TIME;
status = ocfs_journal_dirty(handle, bh);
if (status < 0) {
@@ -1299,33 +1126,18 @@
*/
int ocfs_setattr(struct dentry *dentry, struct iattr *attr)
{
+ int status = 0;
+ int unlock = 0;
+ u64 newsize;
struct inode *inode = dentry->d_inode;
- int error = 0;
- __u64 newsize;
- int status;
- ocfs_super *osb = NULL;
struct super_block *sb = inode->i_sb;
+ ocfs_super *osb = OCFS2_SB(sb);
+ struct buffer_head *bh = NULL;
+ ocfs_journal_handle *handle = NULL;
- LOG_SET_CONTEXT(SETATTR);
-
LOG_ENTRY_ARGS ("(0x%p, '%*s')\n", dentry,
dentry->d_name.len, dentry->d_name.name);
- osb = OCFS_SB(inode->i_sb);
-
-#ifdef PURE_EVIL
- if (evil_filename_check(EVIL_DENTRY, dentry)) {
- LOG_ERROR_ARGS("EVIL SETATTR\n");
- }
-#endif
-
- if (!dentry->d_parent || !dentry->d_parent->d_inode) {
- LOG_ERROR_STR ("bad inode or root inode");
- goto bail;
- }
-
- newsize = attr->ia_size;
-
if (attr->ia_valid & ATTR_MODE)
LOG_TRACE_ARGS ("mode change: %d\n", attr->ia_mode);
if (attr->ia_valid & ATTR_UID)
@@ -1337,38 +1149,43 @@
if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
LOG_TRACE_STR ("time change...");
- if (!(attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME |
- ATTR_SIZE | ATTR_GID | ATTR_UID | ATTR_MODE))) {
- LOG_TRACE_STR
- ("can only change mode, uid, gid, size and time. exiting!");
- goto bail;
+#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
+ | ATTR_GID | ATTR_UID | ATTR_MODE)
+ if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
+ LOG_TRACE_ARGS("can't handle attrs: 0x%x\n", attr->ia_valid);
+ return 0;
}
- error = inode_change_ok (inode, attr);
- if (error)
+ status = inode_change_ok (inode, attr);
+ if (status)
+ return status;
+
+ newsize = attr->ia_size;
+
+ status = ocfs2_meta_lock(inode, NULL, &bh, 1);
+ if (status < 0) {
+ if (status != -EINTR)
+ LOG_ERROR_STATUS(status);
goto bail;
+ }
+ unlock = 1;
- /* get the file and parent offsets, and the file oin if present */
- if (attr->ia_valid & ATTR_SIZE) {
- if (inode->i_size > newsize) {
- ocfs_truncate_inode_pages(inode, newsize);
- status = ocfs_truncate_file(osb, newsize,
- inode);
- } else {
+ if (attr->ia_valid & ATTR_SIZE &&
+ newsize != inode->i_size) {
+ if (inode->i_size > newsize)
+ status = ocfs_truncate_file(osb, newsize, inode);
+ else
status = ocfs_extend_file(osb, inode, newsize);
- }
if (status < 0) {
if (status != -EINTR && status != -ENOSPC)
- LOG_ERROR_STATUS (status);
- error = -ENOSPC;
+ LOG_ERROR_STATUS(status);
+ status = -ENOSPC;
goto bail;
}
-
spin_lock(&OCFS_I(inode)->ip_lock);
inode->i_size = newsize;
inode->i_blocks = (newsize + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
- if (OCFS_I(inode)->ip_open_flags &
- OCFS_OIN_OPEN_FOR_DIRECTIO) {
+ if (OCFS_I(inode)->ip_flags & OCFS_INODE_OPEN_DIRECT) {
/* This is a total broken hack for O_DIRECT crack */
OCFS_I(inode)->ip_mmu_private = inode->i_size;
}
@@ -1380,20 +1197,42 @@
}
}
- status = ocfs_change_file_attrib(osb, attr, inode);
+ handle = ocfs_start_trans(osb, NULL, OCFS_INODE_UPDATE_CREDITS);
+ if (handle == NULL) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+ /* Ok, this is the last transaction we'll do for a setattr so
+ * just add our lock to the handle and let commit_trans deal
+ * with it. */
+ status = ocfs_handle_add_lock(handle, inode);
+ if (status < 0)
+ LOG_ERROR_STATUS(status);
+ unlock = 0;
+
+ status = inode_setattr (inode, attr);
if (status < 0) {
- if (status != -EINTR)
- LOG_ERROR_STATUS (status);
- error = -EIO;
+ LOG_ERROR_STATUS(status);
goto bail;
}
- error = inode_setattr (inode, attr);
+ status = ocfs_mark_inode_dirty(handle, inode, bh);
+ if (status < 0) {
+ LOG_ERROR_STATUS (status);
+ goto bail;
+ }
+
bail:
- LOG_EXIT_INT (error);
+ if (handle)
+ ocfs_commit_trans(handle);
+ if (unlock)
+ ocfs2_meta_unlock(inode, 1);
+ if (bh)
+ brelse(bh);
- LOG_CLEAR_CONTEXT();
- return error;
+ LOG_EXIT_STATUS(status);
+ return status;
} /* ocfs_setattr */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
Modified: trunk/src/heartbeat.c
===================================================================
--- trunk/src/heartbeat.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/heartbeat.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -3,7 +3,8 @@
*
* heartbeat.c
*
- * Keeps track of alive nodes in the cluster.
+ * Register ourselves with the heartbeat service, keep our node maps
+ * up to date, and fire off recovery when needed.
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
@@ -30,6 +31,10 @@
#include <linux/slab.h>
#include <linux/highmem.h>
+#include <cluster/util.h>
+#include <cluster/dlmcommon.h>
+#include <cluster/heartbeat.h>
+
#include "ocfs_log.h"
#include "ocfs.h"
#include "ocfs2.h"
@@ -37,8 +42,6 @@
#include "alloc.h"
#include "heartbeat.h"
#include "util.h"
-#include "volcfg.h"
-#include "vote.h"
#include "ocfs_journal.h"
#include "buffer_head_io.h"
@@ -46,311 +49,386 @@
/* Tracing */
#define OCFS_DEBUG_CONTEXT OCFS_DEBUG_CONTEXT_HEARTBEAT
-/*
- * ocfs_nm_heart_beat()
- *
- * @osb: ocfs super block for the volume
- * @flag: type of heart beat
- * @read_publish: if the publish sector needs to be re-read
- *
- * Updates the timestamp in the nodes publish sector.
- * NOTE: must be called while holding publish_lock!
- *
- * Returns 0 if success, < 0 if error.
- */
-int ocfs_nm_heart_beat (ocfs_super * osb, __u32 flag, int read_publish)
+#define OCFS2_HB_NODE_DOWN_PRI (0x0000001)
+#define OCFS2_HB_NODE_UP_PRI OCFS2_HB_NODE_DOWN_PRI
+
+static void ocfs2_hb_node_down_cb(struct inode *group,
+ struct inode *node,
+ int node_num,
+ void *data);
+static void ocfs2_hb_node_up_cb(struct inode *group,
+ struct inode *node,
+ int node_num,
+ void *data);
+
+static void __ocfs_node_map_dup(ocfs_super *osb,
+ ocfs_node_map *target,
+ ocfs_node_map *from);
+static inline void __ocfs_node_map_set_bit(ocfs_node_map *map,
+ int bit);
+static inline void __ocfs_node_map_clear_bit(ocfs_node_map *map,
+ int bit);
+static inline int __ocfs_node_map_is_empty(ocfs_node_map *map);
+static void __ocfs_node_map_dup(ocfs_super *osb,
+ ocfs_node_map *target,
+ ocfs_node_map *from);
+static void __ocfs_node_map_set(ocfs_node_map *target, ocfs_node_map *from);
+
+void ocfs2_init_node_maps(ocfs_super *osb)
{
- int status = 0;
- ocfs_publish *publish = NULL;
- int publish_idx = OCFS_VOLCFG_NEWCFG_SECTORS + osb->node_num;
- struct buffer_head **pub_bh = &osb->autoconfig_bhs[publish_idx];
+ spin_lock_init(&osb->node_map_lock);
+ ocfs_node_map_init(osb, &osb->mounted_map);
+ ocfs_node_map_init(osb, &osb->recovery_map);
+ ocfs_node_map_init(osb, &osb->umount_map);
+}
- LOG_ENTRY_ARGS ("(0x%p, %u, %s)\n", osb, flag,
- read_publish ? "true" : "false");
+static void ocfs2_hb_node_down_cb(struct inode *group,
+ struct inode *node,
+ int node_num,
+ void *data)
+{
+ ocfs_super *osb = data;
- if (flag & HEARTBEAT_METHOD_DISK) {
- if (pub_bh == NULL && !read_publish)
- BUG();
+ if (osb->group_inode != group)
+ return;
- if (read_publish) {
- status = ocfs_read_block(osb,
- (osb->publish_blkno + osb->node_num),
- pub_bh, 0, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finally;
- }
- }
+ OCFS_ASSERT(osb->node_num != node_num);
- publish = (ocfs_publish *) (*pub_bh)->b_data;
- if ((publish->dirty) && (!osb->publish_dirty)) {
- LOG_TRACE_STR(("NMThread reads the bit as dirty"));
- publish->dirty = 0;
- }
- /* Write the current time in local node's publish sector */
- publish->time = jiffies;
- /* Dissallow 0 */
- if (!publish->time)
- publish->time = 1;
- spin_lock (&OcfsGlobalCtxt.comm_seq_lock);
- publish->comm_seq_num = OcfsGlobalCtxt.comm_seq_num;
- spin_unlock (&OcfsGlobalCtxt.comm_seq_lock);
+ printk("ocfs2: node down event for %d\n", node_num);
- status = ocfs_write_block(osb, *pub_bh, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finally;
- }
+ if (ocfs_node_map_test_bit(osb, &osb->umount_map, node_num)) {
+ /* If a node is in the umount map, then we've been
+ * expecting him to go down and we know ahead of time
+ * that recovery is not necessary. */
+ ocfs_node_map_clear_bit(osb, &osb->umount_map, node_num);
+ return;
}
- if (flag & HEARTBEAT_METHOD_IPC) {
- /* Plug this in later... */
- }
+ ocfs_recovery_thread(osb, node_num);
+}
-finally:
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_nm_heart_beat */
+static void ocfs2_hb_node_up_cb(struct inode *group,
+ struct inode *node,
+ int node_num,
+ void *data)
+{
+ ocfs_super *osb = data;
+ if (osb->group_inode != group)
+ return;
-/*
- * ocfs_update_publish_map()
- *
- * @osb: ocfs super block for the volume
- * @buffer: publish sectors read in the last round
- * @first_time: if true, the buffer needs to be initialized
- *
- * Reads the publish sectors and compares the timestamp of each node
- * to the one it read in the last round. As long as the timestamp keeps
- * changing, the node is marked alive. Conversely, if the timestamp does
- * not change over time, the node is marked dead. The function marks all
- * the live nodes in the publishmap.
- *
- */
-void ocfs_update_publish_map (ocfs_super * osb, struct buffer_head *bhs[], int first_time)
+ OCFS_ASSERT(osb->node_num != node_num);
+
+ printk("ocfs2: node up event for %d\n", node_num);
+ ocfs_node_map_clear_bit(osb, &osb->umount_map, node_num);
+}
+
+/* Most functions here are just stubs for now... */
+int ocfs2_register_hb_callbacks(ocfs_super *osb)
{
- ocfs_publish *publish;
- ocfs_vol_node_map *node_map;
- int i;
- __u16 num_nodes;
+ int status;
- LOG_ENTRY_ARGS ("(0x%p, 0x%p, %u)\n", osb, bhs, first_time);
+ status = hb_register_callback(HB_NODE_DOWN_CB,
+ ocfs2_hb_node_down_cb,
+ osb,
+ OCFS2_HB_NODE_DOWN_PRI);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
- num_nodes = osb->max_nodes;
- node_map = osb->vol_node_map;
+ status = hb_register_callback(HB_NODE_UP_CB,
+ ocfs2_hb_node_up_cb,
+ osb,
+ OCFS2_HB_NODE_UP_PRI);
+ if (status < 0)
+ LOG_ERROR_STATUS(status);
- /* First time thru, update buffer with timestamps for all nodes */
- if (first_time) {
- /* Read the last comm_seq_num */
- publish = (ocfs_publish *) bhs[osb->node_num]->b_data;
- spin_lock (&OcfsGlobalCtxt.comm_seq_lock);
- OcfsGlobalCtxt.comm_seq_num = publish->comm_seq_num + 10;
- spin_unlock (&OcfsGlobalCtxt.comm_seq_lock);
+bail:
+ return status;
+}
- /* Refresh local buffers */
- for (i = 0; i < num_nodes; i++) {
- publish = (ocfs_publish *) bhs[i]->b_data;
- node_map[i].time = publish->time;
- if (publish->mounted && i != osb->node_num) {
- printk("ocfs2: Adding %s (node %d) to "
- "clustered device (%u,%u)\n",
- osb->node_cfg_info[i]->node_name, i,
- MAJOR(osb->sb->s_dev),
- MINOR(osb->sb->s_dev));
- node_map[i].miss_cnt = 0;
- ocfs_publish_map_set(&osb->publ_map, i);
- }
- }
- goto bail; /* exit */
- }
+void ocfs2_clear_hb_callbacks(ocfs_super *osb)
+{
+ int status;
- for (i = 0; i < num_nodes; i++) {
- publish = (ocfs_publish *) bhs[i]->b_data;
+ status = hb_unregister_callback(HB_NODE_DOWN_CB,
+ ocfs2_hb_node_down_cb, osb);
+ if (status < 0)
+ LOG_ERROR_STATUS(status);
- /* Loop if slot is unused */
- if (publish->time == 0ULL)
- continue;
+ status = hb_unregister_callback(HB_NODE_UP_CB,
+ ocfs2_hb_node_up_cb, osb);
+ if (status < 0)
+ LOG_ERROR_STATUS(status);
- /* Check if the node is hung or not by comparing the disk */
- /* and memory timestamp values */
- if (node_map[i].time == publish->time) {
- if (ocfs_node_is_alive(&osb->publ_map, i)) {
- char *err_msg = NULL;
- if (atomic_read (&(node_map[i].dismount))) {
- node_map[i].miss_cnt = MISS_COUNT_NODE_DEAD;
- atomic_set (&(node_map[i].dismount), 0);
- ocfs_publish_map_clear(&osb->publ_map, i);
- err_msg = "Received dismount message. Removing %s "
- "(node %d) from clustered device (%u,%u).\n";
- } else {
- (node_map[i].miss_cnt)++;
+}
- if (node_map[i].miss_cnt == MISS_COUNT_WARNING)
- err_msg = "warning: %s (node %d) may be ejected from cluster "
- "on device (%u.%u)... %d misses so far\n";
- else if (node_map[i].miss_cnt == MISS_COUNT_EMERGENCY)
- err_msg = "warning: %s (node %d) WILL BE EJECTED from cluster "
- "on device (%u.%u)... %d misses so far\n";
- else if (node_map[i].miss_cnt >= MISS_COUNT_NODE_DEAD)
- err_msg = "Removing %s (node %d) from clustered device "
- "(%u,%u) after %d misses\n";
- }
+/* special case -1 for now
+ * TODO: should *really* make sure the calling func never passes -1!! */
+void ocfs_node_map_init(ocfs_super *osb,
+ ocfs_node_map *map)
+{
+ map->num_nodes = osb->max_nodes;
+ memset(map->map, 0, BITS_TO_LONGS(OCFS_NODE_MAP_MAX_NODES) *
+ sizeof(unsigned long));
+}
- if (err_msg)
- LOG_ERROR_ARGS(err_msg, osb->node_cfg_info[i]->node_name, i,
- MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev),
- node_map[i].miss_cnt);
- if (node_map[i].miss_cnt >= MISS_COUNT_NODE_DEAD) {
- ocfs_recovery_map_set(osb, i);
- ocfs_publish_map_clear(&osb->publ_map, i);
+static inline void __ocfs_node_map_set_bit(ocfs_node_map *map,
+ int bit)
+{
+ set_bit(bit, map->map);
+}
- /* Ok, we'd better recover him now...*/
- ocfs_recovery_thread(osb, i);
- }
- }
- } else {
- if (!ocfs_node_is_alive(&osb->publ_map, i) &&
- (osb->node_num != i))
- printk ("ocfs2: Adding %s (node %d) to clustered device (%u,%u)\n",
- osb->node_cfg_info[i]->node_name, i,
- MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
- node_map[i].miss_cnt = 0;
- node_map[i].time = publish->time;
- ocfs_publish_map_set(&osb->publ_map, i);
+void ocfs_node_map_set_bit(ocfs_super *osb,
+ ocfs_node_map *map,
+ int bit)
+{
+ if (bit==-1)
+ return;
+ OCFS_ASSERT(bit < map->num_nodes);
+ spin_lock(&osb->node_map_lock);
+ __ocfs_node_map_set_bit(map, bit);
+ spin_unlock(&osb->node_map_lock);
+}
- }
+static inline void __ocfs_node_map_clear_bit(ocfs_node_map *map,
+ int bit)
+{
+ clear_bit(bit, map->map);
+}
+
+void ocfs_node_map_clear_bit(ocfs_super *osb,
+ ocfs_node_map *map,
+ int bit)
+{
+ if (bit==-1)
+ return;
+ OCFS_ASSERT(bit < map->num_nodes);
+ spin_lock(&osb->node_map_lock);
+ __ocfs_node_map_clear_bit(map, bit);
+ spin_unlock(&osb->node_map_lock);
+}
+
+int ocfs_node_map_test_bit(ocfs_super *osb,
+ ocfs_node_map *map,
+ int bit)
+{
+ int ret;
+ if (bit >= map->num_nodes) {
+ LOG_ERROR_ARGS("bit=%d map->num_nodes=%d\n", bit, map->num_nodes);
+ BUG();
}
+ spin_lock(&osb->node_map_lock);
+ ret = test_bit(bit, map->map);
+ spin_unlock(&osb->node_map_lock);
+ return ret;
+}
-bail:
- LOG_EXIT ();
- return;
-} /* ocfs_update_publish_map */
+static inline int __ocfs_node_map_is_empty(ocfs_node_map *map)
+{
+ int bit;
+ bit = find_next_bit(map->map, map->num_nodes, 0);
+ if (bit < map->num_nodes)
+ return 0;
+ return 1;
+}
+int ocfs_node_map_is_empty(ocfs_super *osb,
+ ocfs_node_map *map)
+{
+ int ret;
+ OCFS_ASSERT(map->num_nodes > 0);
+ spin_lock(&osb->node_map_lock);
+ ret = __ocfs_node_map_is_empty(map);
+ spin_unlock(&osb->node_map_lock);
+ return ret;
+}
-/* half a second timeout */
-#define OCFS_HEARTBEAT_JIFFIES (HZ >> 1)
+static void __ocfs_node_map_dup(ocfs_super *osb,
+ ocfs_node_map *target,
+ ocfs_node_map *from)
+{
+ OCFS_ASSERT(from->num_nodes > 0);
+ ocfs_node_map_init(osb, target);
+ __ocfs_node_map_set(target, from);
+}
-/*
- * ocfs_heartbeat_thread()
- *
- * This function is executed as a kernel thread for each mounted ocfs volume.
- */
-int ocfs_heartbeat_thread (void *arg)
+/* returns 1 if bit is the only bit set in target, 0 otherwise */
+int ocfs_node_map_is_only(ocfs_super *osb,
+ ocfs_node_map *target,
+ int bit)
{
- ocfs_super *osb;
- char proc[16];
- int status = 0;
- __u8 *buffer = NULL;
- ocfs_publish *publish;
- unsigned long j;
- __u16 num_nodes = 0;
- ocfs_node_config_hdr *node_cfg_hdr = NULL;
- __u64 cfg_seq_num;
- struct buffer_head *bh = NULL;
- siginfo_t info;
+ ocfs_node_map temp;
+ int ret;
- LOG_ENTRY ();
+ spin_lock(&osb->node_map_lock);
+ __ocfs_node_map_dup(osb, &temp, target);
+ __ocfs_node_map_clear_bit(&temp, bit);
+ ret = __ocfs_node_map_is_empty(&temp);
+ spin_unlock(&osb->node_map_lock);
- osb = arg;
+ return ret;
+}
- sprintf (proc, "ocfs2nm-%d", osb->osb_id);
- ocfs_daemonize (proc, strlen(proc), 1);
+static void __ocfs_node_map_set(ocfs_node_map *target,
+ ocfs_node_map *from)
+{
+ int num_longs, i;
- osb->dlm_task = current;
+ OCFS_ASSERT(target->num_nodes == from->num_nodes);
+ OCFS_ASSERT(target->num_nodes > 0);
- osb->hbt = OCFS_HEARTBEAT_JIFFIES + jiffies;
- /* The delay changes based on multiplier */
- while (!(OcfsGlobalCtxt.flags & OCFS_FLAG_SHUTDOWN_VOL_THREAD) &&
- !(osb->osb_flags & OCFS_OSB_FLAGS_BEING_DISMOUNTED)) {
+ num_longs = BITS_TO_LONGS(target->num_nodes);
+ for (i=0; i<num_longs; i++)
+ target->map[i] = from->map[i];
+}
- buffer = NULL;
+void ocfs_recovery_map_set(ocfs_super *osb,
+ int num)
+{
+ spin_lock(&osb->node_map_lock);
+ __ocfs_node_map_clear_bit(&osb->mounted_map, num);
+ __ocfs_node_map_set_bit(&osb->recovery_map, num);
+ spin_unlock(&osb->node_map_lock);
+}
- if (!time_after (jiffies, (unsigned long) (osb->hbt)))
- goto again;
+void ocfs_recovery_map_clear(ocfs_super *osb,
+ int num)
+{
+ ocfs_node_map_clear_bit(osb, &osb->recovery_map, num);
+}
- /* lock publish to prevent overwrites from vote_req and vote_reset */
- down (&(osb->publish_lock));
+int ocfs_node_map_iterate(ocfs_super *osb,
+ ocfs_node_map *map,
+ int idx)
+{
+ int i = idx;
- /* Read disk for 4 autoconfig blocks + all nodes publish blocks */
- status = ocfs_read_blocks(osb,
- osb->new_autoconfig_blkno,
- osb->total_autoconfig_blocks,
- osb->autoconfig_bhs, 0, NULL);
- if (status < 0) {
- up (&(osb->publish_lock));
- LOG_ERROR_STATUS (status);
- BUG();
+ idx = OCFS_INVALID_NODE_NUM;
+ spin_lock(&osb->node_map_lock);
+ if ((i != OCFS_INVALID_NODE_NUM) &&
+ (i >= 0) &&
+ (i < map->num_nodes)) {
+ while(i < map->num_nodes) {
+ if (test_bit(i, map->map)) {
+ idx = i;
+ break;
+ }
+ i++;
}
+ }
+ spin_unlock(&osb->node_map_lock);
+ return idx;
+}
- bh = osb->autoconfig_bhs[OCFS_VOLCFG_NEWCFG_SECTORS + osb->node_num];
- publish = (ocfs_publish *) bh->b_data;
- if ((osb->check_mounted) && (publish->mounted == 0)) {
- printk("ocfs2: Heartbeat timed out, volume has been "
- "recovered from another node!\n");
+#if 0
+/* unused (for now) node map functions. */
- BUG();
- }
- bh = NULL;
+/* uses the heartbeat api to test whether a given global node num is
+ * heartbeating. Warning: this function can sleep in
+ * hb_fill_node_map() */
+int ocfs2_is_node_alive(ocfs_super *osb,
+ unsigned int node_num)
+{
+ int ret;
+ ocfs_node_map tmpmap;
- ocfs_nm_heart_beat (osb, HEARTBEAT_METHOD_DISK, 0);
+ ocfs_node_map_init(osb, &tmpmap);
- /* release publish lock */
- up (&(osb->publish_lock));
+ ret = hb_fill_node_map(osb->group_inode, &tmpmap, sizeof(tmpmap.map));
+ if (ret < 0) {
+ LOG_ERROR_STATUS(ret);
+ goto bail;
+ }
- /* If another node was added to the config read and update the cfg */
- node_cfg_hdr =
- (ocfs_node_config_hdr *) osb->autoconfig_bhs[1]->b_data;
- num_nodes = node_cfg_hdr->num_nodes;
- cfg_seq_num = node_cfg_hdr->cfg_seq_num;
+ ret = ocfs_node_map_test_bit(osb, &tmpmap, node_num);
- if ((osb->cfg_seq_num != cfg_seq_num) ||
- (osb->num_cfg_nodes != num_nodes)) {
- down (&(osb->cfg_lock));
- status = ocfs_chk_update_config (osb);
- up (&(osb->cfg_lock));
- if (status < 0)
- LOG_ERROR_STATUS (status);
- }
+bail:
+ return ret;
+}
- num_nodes = osb->max_nodes;
+static int ocfs_node_map_stringify(ocfs_node_map *map, char **str)
+{
+ int i, n;
+ char *s;
- /* Refresh the publish map */
- ocfs_update_publish_map (osb, &(osb->autoconfig_bhs[OCFS_VOLCFG_NEWCFG_SECTORS]), 0);
+ OCFS_ASSERT(map->num_nodes > 0);
- /* send signal to mount thread to continue */
- if (atomic_read (&osb->nm_init) < OCFS_HEARTBEAT_INIT) {
- atomic_inc (&osb->nm_init);
- } else if (atomic_read(&osb->nm_init) == OCFS_HEARTBEAT_INIT) {
- wake_up (&osb->nm_init_event);
- atomic_inc (&osb->nm_init);
+ *str = kmalloc( strlen("123 ") * map->num_nodes, GFP_KERNEL);
+ if (!(*str))
+ return -ENOMEM;
+
+ memset(*str, 0, strlen("123 ") * map->num_nodes);
+
+ s = *str;
+ for (i=0; i<map->num_nodes; i++) {
+ if (ocfs_node_map_test_bit(map, i)) {
+ n = sprintf(s, "%3d ", i);
+ if (n != strlen("123 ")) {
+ kfree(*str);
+ return -ENOMEM;
+ }
+ s += n;
}
+ }
+ return 0;
+}
- osb->hbt = OCFS_HEARTBEAT_JIFFIES + jiffies;
+void ocfs_node_map_and(ocfs_node_map *target, ocfs_node_map *mask)
+{
+ int num_longs, i;
-again:
- status = 0;
+ OCFS_ASSERT(target->num_nodes == mask->num_nodes);
+ OCFS_ASSERT(target->num_nodes > 0);
- if ((OcfsGlobalCtxt.flags & OCFS_FLAG_SHUTDOWN_VOL_THREAD) ||
- (osb->osb_flags & OCFS_OSB_FLAGS_BEING_DISMOUNTED))
- break;
- j = jiffies;
- if (time_after (j, (unsigned long) (osb->hbt))) {
- osb->hbt = OCFS_HEARTBEAT_JIFFIES + j;
- }
- set_current_state (TASK_INTERRUPTIBLE);
- schedule_timeout (osb->hbt - j);
+ num_longs = BITS_TO_LONGS(target->num_nodes);
+ for (i=0; i<num_longs; i++)
+ target->map[i] &= mask->map[i];
+}
- /* ignore the actual signal */
- if (signal_pending(current)) {
- dequeue_signal_lock(current, &current->blocked, &info);
- }
+int ocfs_node_map_is_equal(ocfs_node_map *map1, ocfs_node_map *map2)
+{
+ int num_longs, i;
+
+ OCFS_ASSERT(map1->num_nodes == map2->num_nodes);
+ OCFS_ASSERT(map1->num_nodes > 0);
+
+ num_longs = BITS_TO_LONGS(map1->num_nodes);
+ for (i=0; i<num_longs; i++) {
+ if (map1->map[i] != map2->map[i])
+ return 0;
}
+ return 1;
+}
- /* Flush all scheduled tasks */
- flush_scheduled_work();
- complete (&(osb->dlm_complete));
+// clear all the bits in "target" which are set in "mask"
+static void __ocfs_node_map_clear_bits(ocfs_node_map *target,
+ ocfs_node_map *mask)
+{
+ int bit, prev=0;
+ while (1) {
+ bit = find_next_bit (mask->map, mask->num_nodes, prev);
+ if (bit >= mask->num_nodes)
+ break;
+ ocfs_node_map_clear_bit(target, bit);
+ prev = bit+1;
+ }
+}
- LOG_EXIT_INT (0);
- return 0;
-} /* ocfs_heartbeat_thread */
+// set all the bits in "target" which are set in "mask"
+void __ocfs_node_map_set_bits(ocfs_node_map *target,
+ ocfs_node_map *mask)
+{
+ int bit, prev=0;
+ while (1) {
+ bit = find_next_bit (mask->map, mask->num_nodes, prev);
+ if (bit >= mask->num_nodes)
+ break;
+ ocfs_node_map_set_bit(target, bit);
+ prev = bit+1;
+ }
+}
+#endif
+
Modified: trunk/src/heartbeat.h
===================================================================
--- trunk/src/heartbeat.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/heartbeat.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -26,9 +26,40 @@
#ifndef OCFS2_HEARTBEAT_H
#define OCFS2_HEARTBEAT_H
-int ocfs_nm_heart_beat(ocfs_super *osb, __u32 flag, int read_publish);
-void ocfs_update_publish_map(ocfs_super *osb, struct buffer_head *bhs[],
- int first_time);
-int ocfs_heartbeat_thread(void *arg);
+void ocfs2_init_node_maps(ocfs_super *osb);
+int ocfs2_register_hb_callbacks(ocfs_super *osb);
+void ocfs2_clear_hb_callbacks(ocfs_super *osb);
+
+/* node map functions - used to keep track of mounted and in-recovery
+ * nodes. */
+void ocfs_node_map_init(ocfs_super *osb, ocfs_node_map *map);
+int ocfs_node_map_is_empty(ocfs_super *osb,
+ ocfs_node_map *map);
+void ocfs_node_map_set_bit(ocfs_super *osb,
+ ocfs_node_map *map,
+ int bit);
+void ocfs_node_map_clear_bit(ocfs_super *osb,
+ ocfs_node_map *map,
+ int bit);
+int ocfs_node_map_test_bit(ocfs_super *osb,
+ ocfs_node_map *map,
+ int bit);
+int ocfs_node_map_iterate(ocfs_super *osb,
+ ocfs_node_map *map,
+ int idx);
+static inline int ocfs_node_map_first_set_bit(ocfs_super *osb,
+ ocfs_node_map *map)
+{
+ return ocfs_node_map_iterate(osb, map, 0);
+}
+void ocfs_recovery_map_set(ocfs_super *osb,
+ int num);
+void ocfs_recovery_map_clear(ocfs_super *osb,
+ int num);
+/* returns 1 if bit is the only bit set in target, 0 otherwise */
+int ocfs_node_map_is_only(ocfs_super *osb,
+ ocfs_node_map *target,
+ int bit);
+
#endif /* OCFS2_HEARTBEAT_H */
Modified: trunk/src/inode.c
===================================================================
--- trunk/src/inode.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/inode.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -39,11 +39,10 @@
#include "ocfs2.h"
#include "alloc.h"
-#include "dlm.h"
+#include "dlmglue.h"
#include "extent_map.h"
#include "file.h"
#include "inode.h"
-#include "lockres.h"
#include "namei.h"
#include "suballoc.h"
#include "super.h"
@@ -57,7 +56,6 @@
#define OCFS_DEBUG_CONTEXT OCFS_DEBUG_CONTEXT_INODE
-extern struct semaphore recovery_list_sem;
extern struct address_space_operations ocfs_aops;
typedef struct _ocfs_find_inode_args
@@ -307,27 +305,22 @@
i->ip_flags = 0;
atomic_set(&i->ip_clean_buffer_seq, 0);
- init_rwsem(&(i->ip_node_extend_sem));
i->ip_open_cnt = 0;
spin_lock_init(&i->ip_lock);
ocfs2_extent_map_init(inode);
- INIT_LIST_HEAD(&i->ip_recovery_list);
INIT_LIST_HEAD(&i->ip_handle_list);
i->ip_handle = NULL;
i->ip_next_orphan = NULL;
init_rwsem(&i->ip_alloc_sem);
init_MUTEX(&(i->ip_io_sem));
- atomic_set(&i->ip_needs_verification, 0);
- INIT_LIST_HEAD(&i->ip_pending_locks);
- INIT_LIST_HEAD(&i->ip_j_inode);
/* These should be set in read_inode2. */
i->ip_clusters = 0;
i->ip_blkno = 0ULL;
i->ip_mmu_private = 0ULL;
- OCFS_SET_FLAG (i->ip_flags, OCFS_INODE_INITIALIZED);
+ i->ip_flags |= OCFS_INODE_INITIALIZED;
return 0;
} /* ocfs_inode_init_private */
@@ -405,7 +398,10 @@
inode->i_nlink = fe->i_links_count;
if (le32_to_cpu(fe->i_flags) & OCFS2_LOCAL_ALLOC_FL) {
+ OCFS_I(inode)->ip_flags |= OCFS_INODE_BITMAP;
LOG_TRACE_ARGS("local alloc inode: i_ino=%lu\n", inode->i_ino);
+ } else if (le32_to_cpu(fe->i_flags) & OCFS2_BITMAP_FL) {
+ OCFS_I(inode)->ip_flags |= OCFS_INODE_BITMAP;
} else if (le32_to_cpu(fe->i_flags) & OCFS2_SUPER_BLOCK_FL) {
LOG_TRACE_ARGS("superblock inode: i_ino=%lu\n", inode->i_ino);
// we can't actually hit this as read_inode can't handle
@@ -439,7 +435,16 @@
break;
}
- status = 0;
+ status = ocfs2_inode_lock_res_init(&OCFS_I(inode)->ip_meta_lockres,
+ OCFS_TYPE_META, inode);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+ status = ocfs2_inode_lock_res_init(&OCFS_I(inode)->ip_data_lockres,
+ OCFS_TYPE_DATA, inode);
+ if (status < 0)
+ LOG_ERROR_STATUS(status);
bail:
LOG_EXIT_STATUS (status);
return status;
@@ -502,14 +507,8 @@
BUG();
if (sysfile)
- OCFS_SET_FLAG(OCFS_I(inode)->ip_flags, OCFS_INODE_SYSTEM_FILE);
+ OCFS_I(inode)->ip_flags |= OCFS_INODE_SYSTEM_FILE;
- ocfs_init_lockres (osb, inode);
- status = ocfs_update_lockres(osb, bh, inode, 0);
- if (status < 0) {
- make_bad_inode(inode);
- goto bail;
- }
status = 0;
bail:
@@ -613,31 +612,51 @@
goto bail;
}
+ spin_lock(&OCFS_I(inode)->ip_lock);
if (OCFS_I(inode)->ip_flags & OCFS_INODE_SKIP_DELETE) {
+ spin_unlock(&OCFS_I(inode)->ip_lock);
LOG_TRACE_ARGS("Skipping delete of %lu because another node "
"has done this for us.\n", inode->i_ino);
goto bail;
}
+ spin_unlock(&OCFS_I(inode)->ip_lock);
/* If we're coming from process_vote we can't go into our own
* voting [hello, deadlock city!], so unforuntately we just
* have to skip deleting this guy. That's OK though because
* the node who's doing the actual deleting should handle it
* anyway. */
- if (osb->voting_ino == inode->i_ino) {
+ if (current == osb->vote_task) {
LOG_TRACE_ARGS("Skipping delete of %lu because we're currently"
"in process_vote\n", inode->i_ino);
goto bail;
}
- /* acquire_lock and friends will igrab / iput this guy, so we
+ /* ocfs2_meta_lock and friends might igrab / iput this guy, so we
* take an extra ref. to avoid recursive calls to
* delete_inode. */
atomic_inc(&inode->i_count);
- status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, FLAG_FILE_DELETE,
- &fe_bh, inode);
- atomic_set(&inode->i_count, 0);
+ status = ocfs2_meta_lock(inode, NULL, &fe_bh, 1);
+ atomic_dec(&inode->i_count);
if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+ /* While we were waiting for the lock, another node might have
+ * asked to delete the inode. Recheck our flags to catch this
+ * race and just clear_inode instead.*/
+ spin_lock(&OCFS_I(inode)->ip_lock);
+ if (OCFS_I(inode)->ip_flags & OCFS_INODE_SKIP_DELETE) {
+ spin_unlock(&OCFS_I(inode)->ip_lock);
+ LOG_TRACE_ARGS("Skipping delete of %lu because another node "
+ "has done this for us.\n", inode->i_ino);
+ goto bail;
+ }
+ spin_unlock(&OCFS_I(inode)->ip_lock);
+
+ status = ocfs2_request_delete_vote(inode);
+ if (status < 0) {
/* EBUSY here is assumed to mean that other nodes are
* still using the inode. We're done here though, so
* avoid doing anything on disk and let them worry
@@ -685,14 +704,11 @@
goto bail;
}
ocfs_handle_add_inode(handle, orphan_dir_inode);
- status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 0,
- &orphan_dir_bh, orphan_dir_inode);
+ status = ocfs2_meta_lock(orphan_dir_inode, handle, &orphan_dir_bh, 1);
if (status < 0) {
LOG_ERROR_STATUS(status);
goto bail;
}
- ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0,
- orphan_dir_inode);
/* we do this while holding the orphan dir lock because we
* don't want recovery being run from another node to vote for
@@ -711,14 +727,11 @@
goto bail;
}
ocfs_handle_add_inode(handle, inode_alloc_inode);
- status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 0,
- &inode_alloc_bh, inode_alloc_inode);
+ status = ocfs2_meta_lock(inode_alloc_inode, handle, &inode_alloc_bh, 1);
if (status < 0) {
LOG_ERROR_STATUS(status);
goto bail;
}
- ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0,
- inode_alloc_inode);
handle = ocfs_start_trans(osb, handle, OCFS_DELETE_INODE_CREDITS);
if (handle == NULL) {
@@ -785,6 +798,7 @@
*/
void ocfs_clear_inode (struct inode *inode)
{
+ int status;
ocfs_super *osb;
LOG_SET_CONTEXT(CLEAR_INODE);
@@ -805,7 +819,7 @@
goto bail;
}
- OCFS_CLEAR_FLAG (OCFS_I(inode)->ip_flags, OCFS_INODE_INITIALIZED);
+ OCFS_I(inode)->ip_flags &= ~OCFS_INODE_INITIALIZED;
if (OCFS_I(inode)->ip_blkno == -1)
BUG();
@@ -819,10 +833,12 @@
ocfs2_extent_map_drop(inode, 0);
- down(&recovery_list_sem);
- list_del(&OCFS_I(inode)->ip_recovery_list);
- up(&recovery_list_sem);
+ status = ocfs2_drop_inode_locks(inode);
+ if (status < 0)
+ LOG_ERROR_STATUS(status);
+ ocfs2_lock_res_free(&OCFS_I(inode)->ip_meta_lockres);
+ ocfs2_lock_res_free(&OCFS_I(inode)->ip_data_lockres);
/* clean out the inode private ... why?! */
memset(inode->u.generic_ip, 0, sizeof(ocfs_inode_private));
@@ -904,7 +920,6 @@
struct inode *inode = dentry->d_inode;
int status = 0;
ocfs_super *osb;
- ocfs_lock_res *lockres;
LOG_SET_CONTEXT(REVALIDATE);
@@ -928,28 +943,13 @@
}
spin_unlock(&OCFS_I(inode)->ip_lock);
- if (ocfs_node_map_is_only(osb, &osb->publ_map, osb->node_num)) {
- LOG_TRACE_STR ("Only node alive.");
+ status = ocfs2_meta_lock(inode, NULL, NULL, 0);
+ if (status < 0) {
+ if (status != -EINTR)
+ LOG_ERROR_STATUS(status);
goto bail;
}
-
- lockres = GET_INODE_LOCKRES(inode);
- /* if I hold cache lock, no revalidate needed */
- ocfs_acquire_lockres_read(inode);
- if (ocfs_is_local_cache_lock(osb, inode)) {
- ocfs_release_lockres_read(inode);
- LOG_TRACE_STR("local cache lock\n");
- goto bail;
- }
- ocfs_release_lockres_read(inode);
-
- atomic_set(&OCFS_I(inode)->ip_needs_verification, 1);
- status = ocfs_verify_update_inode(osb, inode);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- status = -ENOENT;
- }
-
+ ocfs2_meta_unlock(inode, 0);
bail:
LOG_EXIT_STATUS(status);
@@ -998,10 +998,8 @@
// fe->i_generation = inode->i_generation;
status = ocfs_journal_dirty(handle, bh);
- if (status < 0) {
+ if (status < 0)
LOG_ERROR_STATUS(status);
- goto leave;
- }
status = 0;
leave:
@@ -1016,156 +1014,30 @@
* Updates a struct inode from a disk inode.
* does no i/o, only takes ip_lock.
*/
-int ocfs_refresh_inode(struct inode *inode,
- ocfs2_dinode *fe)
+void ocfs_refresh_inode(struct inode *inode,
+ ocfs2_dinode *fe)
{
- int status = 0;
- int drop_map = 0;
+ ocfs_inode_private *oip = OCFS_I(inode);
ocfs_super *osb = OCFS2_SB(inode->i_sb);
- spin_lock(&OCFS_I(inode)->ip_lock);
+ spin_lock(&oip->ip_lock);
- if (INODE_DELETED(inode)) {
- LOG_TRACE_ARGS("Inode %llu was marked as deleted!",
- OCFS_I(inode)->ip_blkno);
- status = -ENOENT;
- goto bail;
+ oip->ip_clusters = fe->i_clusters;
+ inode->i_size = fe->i_size;
+ if (S_ISREG(inode->i_mode)) {
+ oip->ip_mmu_private = inode->i_size;
}
+ inode->i_nlink = fe->i_links_count;
+ inode->i_blocks = (inode->i_size + osb->sb->s_blocksize - 1) >> osb->sb->s_blocksize_bits;
+ inode->i_uid = fe->i_uid;
+ inode->i_gid = fe->i_gid;
+ inode->i_mode = fe->i_mode;
+ inode->i_blksize = (__u32) osb->s_clustersize;
+ OCFS_SET_INODE_TIME(inode, i_ctime, fe->i_ctime);
+ OCFS_SET_INODE_TIME(inode, i_atime, fe->i_atime);
+ OCFS_SET_INODE_TIME(inode, i_mtime, fe->i_mtime);
- /* Add checks as needed */
- if ((fe->i_dtime) || (!(fe->i_flags & OCFS2_VALID_FL))) {
- if (fe->i_dtime)
- LOG_ERROR_ARGS("Inode %lu has dtime = %llu\n",
- inode->i_ino, fe->i_dtime);
- else
- LOG_TRACE_STR ("File Entry is invalid");
-
- status = -ENOENT;
- goto bail;
- }
-
- if (inode->i_generation != le32_to_cpu(fe->i_generation)) {
- LOG_ERROR_ARGS("Inode %llu is stale! (%u, %u)\n",
- OCFS_I(inode)->ip_blkno,
- inode->i_generation,
- le32_to_cpu(fe->i_generation));
- SET_INODE_DELETED(inode);
- status = -ENOENT;
- goto bail;
- }
-
- if ((OCFS_I(inode)->ip_clusters != fe->i_clusters) ||
- (inode->i_size != fe->i_size) ||
- inode->i_uid != fe->i_uid ||
- inode->i_gid != fe->i_gid ||
- inode->i_mode != fe->i_mode ||
- inode->i_nlink != fe->i_links_count){
-
- if (OCFS_I(inode)->ip_clusters > fe->i_clusters) {
- LOG_TRACE_ARGS("destroying extent maps for %llu, "
- "ip_clusters = %u, i_clusters = %u\n",
- OCFS_I(inode)->ip_blkno,
- OCFS_I(inode)->ip_clusters,
- fe->i_clusters);
- drop_map = 1; /* Because we have the lock here */
- }
-
- LOG_TRACE_STR("Allocsize, filesize or seq no did not match");
- OCFS_I(inode)->ip_clusters = fe->i_clusters;
- inode->i_size = fe->i_size;
- if (S_ISREG(inode->i_mode)) {
- OCFS_I(inode)->ip_mmu_private = inode->i_size;
- }
- LOG_TRACE_ARGS("verifyupdate: setting nlink from %d to %d for %llu\n",
- inode->i_nlink, fe->i_links_count,
- OCFS_I(inode)->ip_blkno);
- inode->i_nlink = fe->i_links_count;
- inode->i_blocks = (inode->i_size + osb->sb->s_blocksize - 1) >> osb->sb->s_blocksize_bits;
- inode->i_uid = fe->i_uid;
- inode->i_gid = fe->i_gid;
- inode->i_mode = fe->i_mode;
- inode->i_blksize = (__u32) osb->s_clustersize;
- OCFS_SET_INODE_TIME(inode, i_ctime, fe->i_ctime);
- OCFS_SET_INODE_TIME(inode, i_atime, fe->i_atime);
- OCFS_SET_INODE_TIME(inode, i_mtime, fe->i_mtime);
-
- if (S_ISCHR(fe->i_mode) ||
- S_ISBLK(fe->i_mode) ||
- S_ISFIFO(fe->i_mode) ||
- S_ISSOCK(fe->i_mode)) {
- inode->i_rdev = 0;
- init_special_inode(inode, inode->i_mode,
- huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)));
- }
- }
-
- atomic_set(&OCFS_I(inode)->ip_needs_verification, 0);
-bail:
-
- spin_unlock(&OCFS_I(inode)->ip_lock);
-
- if (drop_map)
- ocfs2_extent_map_trunc(inode, fe->i_clusters);
-
- return(status);
+ spin_unlock(&oip->ip_lock);
} /* ocfs_refresh_inode */
-/*
- * ocfs_verify_update_inode()
- */
-int ocfs_verify_update_inode (ocfs_super * osb, struct inode * inode)
-{
- int status = 0;
- struct buffer_head *fe_bh = NULL;
- ocfs2_dinode *fe;
- /* We are setting the oin Updated flag in the end. */
- LOG_ENTRY ();
-
- OCFS_ASSERT (inode);
-
- if (OCFS_I(inode)->ip_blkno == 0) {
- LOG_ERROR_ARGS("inode 0x%lu has zero blkno\n", inode->i_ino);
- status = -EINVAL;
- goto leave;
- }
-
- spin_lock(&OCFS_I(inode)->ip_lock);
- if (INODE_DELETED(inode)) {
- spin_unlock(&OCFS_I(inode)->ip_lock);
- LOG_TRACE_ARGS("Inode %llu was marked as deleted!",
- OCFS_I(inode)->ip_blkno);
- status = -ENOENT;
- goto leave;
- }
- spin_unlock(&OCFS_I(inode)->ip_lock);
-
- status = ocfs_read_block(osb, OCFS_I(inode)->ip_blkno, &fe_bh,
- OCFS_BH_CACHED, inode);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto leave;
- }
-
- fe = (ocfs2_dinode *) fe_bh->b_data;
-
- status = ocfs_refresh_inode(inode, fe);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto leave;
- }
-
- ocfs_acquire_lockres_write(inode);
- status = ocfs_update_lockres (osb, fe_bh, inode, 0);
- ocfs_release_lockres_write(inode);
-
- status = 0;
-leave:
-
- if (fe_bh)
- brelse(fe_bh);
-
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_verify_update_inode */
-
Modified: trunk/src/inode.h
===================================================================
--- trunk/src/inode.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/inode.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -41,9 +41,8 @@
ssize_t ocfs_rw_direct(int rw, struct file *filp, char *buf,
size_t size, loff_t *offp);
void ocfs_sync_blockdev(struct super_block *sb);
-int ocfs_verify_update_inode(ocfs_super *osb, struct inode *inode);
-int ocfs_refresh_inode(struct inode *inode,
- ocfs2_dinode *fe);
+void ocfs_refresh_inode(struct inode *inode,
+ ocfs2_dinode *fe);
int ocfs_mark_inode_dirty(ocfs_journal_handle *handle,
struct inode *inode,
struct buffer_head *bh);
Modified: trunk/src/journal.c
===================================================================
--- trunk/src/journal.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/journal.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -35,14 +35,14 @@
#include "ocfs2.h"
#include "alloc.h"
-#include "dlm.h"
+#include "dlmglue.h"
#include "extent_map.h"
+#include "heartbeat.h"
#include "inode.h"
#include "journal.h"
#include "localalloc.h"
-#include "lockres.h"
#include "namei.h"
-#include "nm.h"
+#include "slot_map.h"
#include "super.h"
#include "util.h"
#include "vote.h"
@@ -55,124 +55,42 @@
spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED;
-static int ocfs_reset_publish (ocfs_super * osb, __u64 node_num);
static int ocfs_force_read_journal(struct inode *inode);
static int ocfs_recover_node(struct _ocfs_super *osb, int node_num);
static int __ocfs_recovery_thread(void *arg);
static int ocfs_commit_cache (ocfs_super * osb);
static int ocfs_wait_on_mount(ocfs_super *osb);
-static void ocfs_handle_move_locks(ocfs_journal *journal,
- ocfs_journal_handle *handle);
-static void ocfs_journal_optimize_lock_list(ocfs_journal *journal);
+static int ocfs2_commit_thread_sleep(ocfs_super *osb);
+static void ocfs_handle_cleanup_locks(ocfs_journal *journal,
+ ocfs_journal_handle *handle,
+ int set_id);
static void ocfs_commit_unstarted_handle(ocfs_journal_handle *handle);
+static int ocfs_journal_toggle_dirty(ocfs_super *osb,
+ int dirty);
+static int ocfs2_trylock_journal(ocfs_super *osb,
+ int slot_num);
-static void ocfs_journal_optimize_lock_list(ocfs_journal *journal)
-{
- ocfs_journal_lock *lock = NULL;
- ocfs_journal_lock *first = NULL; /* 1st update_oin release for
- * an inode. */
- struct inode *curr_inode = NULL;
- struct list_head *p, *n;
-
- /* 1st pass: optimize away all UPDATE_OIN messages into one.
- * 2 types of UPDATE_OIN messages as of now: with size change
- * (truncate / extend) or without size change. We will take
- * *all* of them (regardless of type) and cram it into one
- * message. */
- curr_inode = NULL;
- list_for_each_safe(p, n, &journal->checkpointing_locks) {
- lock = list_entry(p, ocfs_journal_lock, lock_list);
-
- if (lock->inode != curr_inode) {
- /* Ok, new inode. */
- first = NULL;
-
- curr_inode = lock->inode;
- }
-
- /* if it's not an update oin then just continue */
- if (!(lock->flags & FLAG_FILE_UPDATE_OIN))
- continue;
-
- if (!first) {
- first = lock;
- /* nothing else to do to the 1st update release. */
- continue;
- }
-
- /* only the 1st update_oin lock stays -- the
- * rest are either num_ident into it or
- * dropped */
- list_del(&lock->lock_list);
- atomic_dec(&journal->num_chkpt_locks);
-
- /* we only incement first->num_ident if the current
- * lock has a size change AND first has recorded at
- * least one size change (which can include
- * itself). Otherwise we'll have an off by one as a
- * first with no size change recorded will keep at
- * least 1 reference for itself. */
- if ((lock->flags & FLAG_FILE_EXTEND)
- || (lock->flags & FLAG_FILE_TRUNCATE)) {
- if ((first->flags & FLAG_FILE_EXTEND)
- || (first->flags & FLAG_FILE_TRUNCATE))
- first->num_ident++;
- else
- first->drop_holders++;
- } else {
- /* Ok, no size change on this particular lock,
- * so we're discarding it without updating the
- * num_ident value -- that's fine but we need
- * to manually drop the lockres->lock_holders
- * value on it's behalf */
- first->drop_holders++;
- }
-
- /* record a size change in first if there was one. */
- if (lock->flags & FLAG_FILE_EXTEND)
- first->flags = first->flags | FLAG_FILE_EXTEND;
- if (lock->flags & FLAG_FILE_TRUNCATE)
- first->flags = first->flags | FLAG_FILE_TRUNCATE;
-
- iput(lock->inode);
-
- kmem_cache_free(OcfsGlobalCtxt.lock_cache, lock);
- }
-
- return;
-}
-
/*
* ocfs_commit_cache()
- *
- * This is in journal.c for lack of a better place.
- *
*/
static int ocfs_commit_cache(ocfs_super *osb)
{
- int status = 0, tmpstat;
- unsigned int flushed = 0;
- unsigned int cmt_locks;
+ int status = 0;
+ unsigned int flushed;
+ unsigned long old_id;
ocfs_journal * journal = NULL;
- struct list_head *p, *n;
- ocfs_journal_lock *lock = NULL;
- struct inode *inode;
- ocfs_inode_private *ip;
- ocfs_lock_res *lockres;
LOG_ENTRY();
journal = osb->journal;
- /* Step 1: flush all pending commits and checkpoint the journal. */
+ /* Flush all pending commits and checkpoint the journal. */
down_write(&journal->trans_barrier);
if (atomic_read(&journal->num_trans) == 0) {
+ up_write(&journal->trans_barrier);
LOG_TRACE_STR("No transactions for me to flush!");
- /* now, we may have locks left to drop even though no
- * transactions are in the journal. */
-
- goto drop_locks;
+ goto finally;
}
journal_lock_updates(journal->k_journal);
@@ -184,111 +102,18 @@
goto finally;
}
- ocfs_inc_trans_id(journal);
+ old_id = ocfs_inc_trans_id(journal);
-drop_locks:
flushed = atomic_read(&journal->num_trans);
atomic_set(&journal->num_trans, 0);
-
- /* Step 2: Drop any locks acquired during transactions which
- * have just been checkpointed. */
- spin_lock(&journal->cmt_lock);
-
- cmt_locks = atomic_read(&journal->num_cmt_locks);
-
- atomic_add(atomic_read(&journal->num_cmt_locks),
- &journal->num_chkpt_locks);
- atomic_set(&journal->num_cmt_locks, 0);
-
- /* move the locks off each inode onto the commit threads list. */
- list_for_each_safe(p, n, &journal->committing_inodes) {
- ip = list_entry(p, ocfs_inode_private, ip_j_inode);
- inode = ip->ip_inode;
-
- if (!list_empty(&OCFS_I(inode)->ip_pending_locks))
- list_splice_init(&OCFS_I(inode)->ip_pending_locks,
- (&journal->checkpointing_locks)->prev);
-
- /* we can now remove the inode from the committing
- * list. */
- list_del_init(&OCFS_I(inode)->ip_j_inode);
- }
- osb->needs_flush = 0;
-
- spin_unlock(&journal->cmt_lock);
-
- /* TODO: Can we assert this anymore and move these lock
- * releases back up?
- *
- * Once we've got cmt_lock, we can let
- * transactions start again -- it should protect us against
- * people mucking with the committed list... */
up_write(&journal->trans_barrier);
#ifdef VERBOSE_COMMIT_THREAD
- if (flushed || cmt_locks)
- printk("(%u) commit_thread: flushed %u transactions, "
- "releasing %u locks\n", current->pid, flushed,
- cmt_locks);
+ printk("(%u) commit_thread: flushed transaction %lu (%u handles)\n",
+ current->pid, journal->trans_id, flushed);
#endif
- ocfs_journal_optimize_lock_list(journal);
-#ifdef VERBOSE_COMMIT_THREAD
- if (flushed || cmt_locks)
- printk("(%u) commit_thread: after optimization, %u locks "
- "to release\n", current->pid,
- atomic_read(&journal->num_chkpt_locks));
-#endif
-
- p = n = NULL;
- list_for_each_safe(p, n, &journal->checkpointing_locks) {
- if (!atomic_read(&journal->num_chkpt_locks))
- BUG();
-
- lock = list_entry(p, ocfs_journal_lock, lock_list);
-
- list_del(&(lock->lock_list));
-
- if (!lock->inode)
- BUG();
-
-#if 0
- /* enable this for tons of output, which will likely
- * hang your box :) */
- printk("commit_thread: release lock %u (inode %llu)\n",
- atomic_read(&journal->num_chkpt_locks),
- OCFS_I(lock->inode)->ip_blkno);
-#endif
- tmpstat = 0;
- if (!INODE_DELETED(lock->inode))
- tmpstat = ocfs_release_lock_full(osb,
- lock->type,
- lock->flags,
- lock->inode,
- lock->num_ident);
- else
- LOG_ERROR_ARGS("commit_thread: Skipping release for "
- "inode %llu!\n",
- OCFS_I(lock->inode)->ip_blkno);
- if (tmpstat < 0)
- LOG_ERROR_ARGS("commit_thread: release_lock status is"
- " %d releasing lock on inode %llu!\n",
- tmpstat, OCFS_I(lock->inode)->ip_blkno);
-
- if (lock->drop_holders) {
- lockres = GET_INODE_LOCKRES(lock->inode);
- ocfs_acquire_lockres_write(lock->inode);
- OCFS_ASSERT(lockres->lock_holders >= lock->drop_holders);
- lockres->lock_holders -= lock->drop_holders;
- ocfs_release_lockres_write(lock->inode);
- }
-
- iput(lock->inode);
-
- atomic_dec(&journal->num_chkpt_locks);
- kmem_cache_free(OcfsGlobalCtxt.lock_cache, lock);
- }
-
+ ocfs2_kick_vote_thread(osb);
finally:
LOG_EXIT_STATUS (status);
return status;
@@ -384,7 +209,7 @@
{
ocfs_journal_handle * retval = NULL;
- retval = ocfs_malloc(sizeof(*retval));
+ retval = kmalloc(sizeof(*retval), GFP_KERNEL);
if (!retval) {
LOG_ERROR_STR("Failed to allocate memory for journal handle!");
return(NULL);
@@ -534,10 +359,7 @@
/* You are allowed to add journal locks before the transaction
* has started. */
osb = handle->osb;
- ocfs_handle_move_locks(osb->journal, handle);
- spin_lock(&osb->journal->cmt_lock);
- osb->needs_flush = 1;
- spin_unlock(&osb->journal->cmt_lock);
+ ocfs_handle_cleanup_locks(osb->journal, handle, 0);
kfree(handle);
LOG_EXIT();
@@ -587,10 +409,7 @@
BUG();
}
- ocfs_handle_move_locks(osb->journal, handle);
- spin_lock(&osb->journal->cmt_lock);
- osb->needs_flush = 1;
- spin_unlock(&osb->journal->cmt_lock);
+ ocfs_handle_cleanup_locks(osb->journal, handle, 1);
up_read(&journal->trans_barrier);
@@ -612,8 +431,9 @@
* during the transaction, so make sure they were taken *before*
* start_trans or we'll have ordering deadlocks.
*
- * This function would be alot simpler if we didn't have to worry
- * about abort.
+ * WARNING2: Note that we do *not* drop trans_barrier here. This is
+ * good because transaction ids haven't yet been recorded on the
+ * cluster locks associated with this handle.
*/
int ocfs_extend_trans(ocfs_journal_handle *handle, int nblocks)
{
@@ -732,90 +552,65 @@
return(status);
} /* ocfs_journal_dirty */
-void ocfs_handle_add_lock(ocfs_journal_handle *handle, __u32 type,
- __u32 flags, struct inode *inode)
+/* We always assume you're adding a metadata lock at level 'ex' */
+int ocfs_handle_add_lock(ocfs_journal_handle *handle,
+ struct inode *inode)
{
+ int status;
ocfs_journal_lock *lock;
OCFS_ASSERT(inode);
-
- LOG_ENTRY_ARGS("(inode=%llu, type=%u, flags=%u)\n",
- OCFS_I(inode)->ip_blkno, type, flags);
-
lock = kmem_cache_alloc(OcfsGlobalCtxt.lock_cache, GFP_NOFS);
- if (lock == NULL) {
- LOG_ERROR_STR("Out of memory -- cannot add lock to release.");
+ if (!lock) {
+ status = -ENOMEM;
LOG_ERROR_STATUS(-ENOMEM);
-
- BUG();
+ goto bail;
}
- lock->type = type;
- lock->flags = flags;
- lock->inode = inode;
-
- /* stuff for commit thread optimization. */
- lock->num_ident = 1;
- /* this is for *additional* decrements of lock_holders, not
- * the one given by ocfs_release_lock... */
- lock->drop_holders = 0;
-
if (!igrab(inode))
BUG();
+ lock->jl_inode = inode;
- list_add_tail(&(lock->lock_list), &(handle->locks));
+ list_add_tail(&(lock->jl_lock_list), &(handle->locks));
handle->num_locks++;
- spin_lock(&handle->journal->cmt_lock);
- atomic_inc(&handle->journal->num_cmt_locks);
- spin_unlock(&handle->journal->cmt_lock);
- LOG_EXIT();
- return;
+ status = 0;
+bail:
+ LOG_EXIT_STATUS(status);
+ return status;
}
-/* move the locks off a journal handle and onto the
- * inode->ip_pending_locks. makes sure the inodes are on
- * journal->committing_inodes so that the commit thread can get them
- * after checkpoint.
- *
- * You want to do this before dropping trans_barrier to prevent the commit
- * thread from missing the locks.
- *
- * TODO: When we get rid of the last checkpointed transactions, we can
- * just put locks right on the inode in ocfs_handle_add_lock...
- */
-static void ocfs_handle_move_locks(ocfs_journal *journal,
- ocfs_journal_handle *handle)
+static void ocfs_handle_cleanup_locks(ocfs_journal *journal,
+ ocfs_journal_handle *handle,
+ int set_id)
{
struct list_head *p, *n;
ocfs_journal_lock *lock;
struct inode *inode;
list_for_each_safe(p, n, &(handle->locks)) {
- lock = list_entry(p, ocfs_journal_lock, lock_list);
- list_del(&lock->lock_list);
+ lock = list_entry(p, ocfs_journal_lock, jl_lock_list);
+ list_del(&lock->jl_lock_list);
handle->num_locks--;
- inode = lock->inode;
-
- spin_lock(&journal->cmt_lock);
- /* add the lock to the inode */
- list_add_tail(&lock->lock_list,
- &OCFS_I(inode)->ip_pending_locks);
- /* and make sure the inode is on the journals list */
- if (list_empty(&OCFS_I(inode)->ip_j_inode))
- list_add_tail(&OCFS_I(inode)->ip_j_inode,
- &journal->committing_inodes);
- spin_unlock(&journal->cmt_lock);
+ inode = lock->jl_inode;
+ if (set_id)
+ ocfs_set_inode_lock_trans(journal, inode);
+ ocfs2_meta_unlock(inode, 1);
+ if (atomic_read(&inode->i_count) == 1)
+ LOG_ERROR_ARGS("Inode %llu, I'm doing a last iput "
+ "for!", OCFS_I(inode)->ip_blkno);
+ iput(inode);
+ kmem_cache_free(OcfsGlobalCtxt.lock_cache, lock);
}
- return;
}
#define OCFS_DEFAULT_COMMIT_INTERVAL (HZ * 5)
/*
* Setup the journal using the journal system file
*/
-int ocfs_journal_init(ocfs_super *osb)
+int ocfs_journal_init(ocfs_super *osb,
+ int *dirty)
{
int status = -1;
struct inode *inode = NULL; /* the journal inode */
@@ -828,10 +623,9 @@
if (!osb)
BUG();
- spin_lock_init(&(osb->journal->cmt_lock));
-
/* already have the inode for our journal */
- inode = ocfs_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, osb->node_num);
+ inode = ocfs_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+ osb->slot_num);
if (inode == NULL) {
LOG_ERROR_STR("access error");
status = -EACCES;
@@ -847,8 +641,7 @@
SET_INODE_JOURNAL(inode);
- status = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE,
- 0, &bh, inode);
+ status = ocfs2_meta_lock(inode, NULL, &bh, 1);
if (status < 0) {
if (status != -EINTR)
LOG_ERROR_STR("Could not get lock on journal!");
@@ -893,21 +686,16 @@
/* yay, pass the proper info back to our journal structure. */
osb->journal->osb = osb;
- /* eventually this will be a value passed into us */
- osb->journal->node_num = osb->node_num;
osb->journal->k_journal = k_journal;
osb->journal->k_inode = inode;
osb->journal->version = OCFS_JOURNAL_CURRENT_VERSION;
osb->journal->lockbh = bh;
atomic_set(&(osb->journal->num_trans), 0);
- atomic_set(&(osb->journal->num_cmt_locks), 0);
- atomic_set(&(osb->journal->num_chkpt_locks), 0);
init_rwsem(&(osb->journal->trans_barrier));
osb->journal->state = OCFS_JOURNAL_LOADED;
osb->journal->trans_id = (unsigned long) 1;
- INIT_LIST_HEAD(&(osb->journal->committing_inodes));
- INIT_LIST_HEAD(&(osb->journal->checkpointing_locks));
+ *dirty = (fe->id1.journal1.i_flags & OCFS2_JOURNAL_DIRTY_FL);
status = 0;
done:
if (status < 0) {
@@ -924,8 +712,34 @@
return(status);
} /* ocfs_journal_init */
+static int ocfs_journal_toggle_dirty(ocfs_super *osb,
+ int dirty)
+{
+ int status;
+ ocfs_journal * journal = osb->journal;
+ struct buffer_head *bh = journal->lockbh;
+ ocfs2_dinode *fe;
+
+ LOG_ENTRY();
+
+ fe = (ocfs2_dinode *) bh->b_data;
+ OCFS_ASSERT(IS_VALID_FILE_ENTRY(fe));
+
+ if (dirty)
+ fe->id1.journal1.i_flags |= OCFS2_JOURNAL_DIRTY_FL;
+ else
+ fe->id1.journal1.i_flags &= ~OCFS2_JOURNAL_DIRTY_FL;
+
+ status = ocfs_write_block(osb, bh, journal->k_inode);
+ if (status < 0)
+ LOG_ERROR_STATUS(status);
+
+ LOG_EXIT_STATUS(status);
+ return status;
+}
+
/*
- if the journal has been ocfs_malloc'd it needs to be freed after this call.
+ if the journal has been kmalloc'd it needs to be freed after this call.
*/
void ocfs_journal_shutdown(ocfs_super *osb)
{
@@ -968,8 +782,8 @@
if (osb->commit && osb->commit->c_task) {
/* Wait for the commit thread */
LOG_TRACE_STR ("Waiting for ocfs2commit to exit....");
- atomic_set (&osb->flush_event_woken, 1);
- wake_up (&osb->flush_event);
+ atomic_set (&osb->needs_checkpoint, 1);
+ wake_up (&osb->checkpoint_event);
wait_for_completion(&osb->commit->c_complete);
osb->commit->c_task = NULL;
kfree(osb->commit);
@@ -977,15 +791,17 @@
OCFS_ASSERT(atomic_read(&(osb->journal->num_trans)) == 0);
+ status = ocfs_journal_toggle_dirty(osb, 0);
+ if (status < 0)
+ LOG_ERROR_STATUS(status);
+
/* Shutdown the kernel journal system */
journal_destroy(journal->k_journal);
OCFS_I(inode)->ip_open_cnt--;
/* unlock our journal */
- status = ocfs_release_lock (osb, OCFS_LKM_EXMODE, 0, inode);
- if (status < 0)
- LOG_ERROR_STATUS (status);
+ ocfs2_meta_unlock(inode, 1);
brelse (journal->lockbh);
journal->lockbh = NULL;
@@ -1004,6 +820,7 @@
{
int status = 0;
int olderr = 0;
+ int child_pid;
ocfs_super *osb;
LOG_ENTRY();
@@ -1026,6 +843,30 @@
journal_clear_err(journal->k_journal);
}
+ status = ocfs_journal_toggle_dirty(osb, 1);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto done;
+ }
+
+ /* Launch the commit thread */
+ osb->commit = kmalloc(sizeof(ocfs_commit_task), GFP_KERNEL);
+ if (osb->commit == NULL) {
+ LOG_ERROR_STATUS(status = -ENOMEM);
+ goto done;
+ }
+ memset(osb->commit, 0, sizeof(ocfs_commit_task));
+ child_pid = kernel_thread (ocfs_commit_thread, osb,
+ CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+ if (child_pid < 0) {
+ LOG_ERROR_ARGS ("unable to launch ocfs2commit thread, error=%d",
+ child_pid);
+ status = child_pid;
+ goto done;
+ } else {
+ init_completion (&osb->commit->c_complete);
+ }
+
done:
LOG_EXIT_STATUS(status);
return(status);
@@ -1044,7 +885,16 @@
BUG();
status = journal_wipe(journal->k_journal, full);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+ status = ocfs_journal_toggle_dirty(journal->osb, 0);
+ if (status < 0)
+ LOG_ERROR_STATUS(status);
+
+bail:
LOG_EXIT_STATUS(status);
return(status);
}
@@ -1116,116 +966,137 @@
return(status);
}
-struct ocfs_recover_arg {
- ocfs_super *osb;
- int node_num;
-};
-
static int __ocfs_recovery_thread(void *arg)
{
- struct ocfs_recover_arg *recover_arg = arg;
- ocfs_super *osb = recover_arg->osb;
- int node_num = recover_arg->node_num;
+ ocfs_super *osb = arg;
int status = 0;
+ int node_num;
char proc[16];
- LOG_ENTRY_ARGS("(node_num=%d, osb->node_num = %d)\n", node_num,
- osb->node_num);
+ LOG_ENTRY();
sprintf (proc, "ocfs2rec-%d", osb->osb_id);
ocfs_daemonize (proc, strlen(proc), 0);
-#ifdef HAVE_NPTL
- spin_lock_irq (¤t->sighand->siglock);
- sigfillset(¤t->blocked);
- recalc_sigpending();
- spin_unlock_irq (¤t->sighand->siglock);
-#else
- spin_lock_irq(¤t->sigmask_lock);
- sigfillset(¤t->blocked);
- recalc_sigpending(current);
- spin_unlock_irq(¤t->sigmask_lock);
-#endif
+ status = ocfs_wait_on_mount(osb);
+ if (status < 0) {
+ if (status == -EBUSY)
+ status = 0;
+ goto bail;
+ }
- status = ocfs_recover_node(osb, node_num);
- if (status < 0)
+restart:
+ status = ocfs2_super_lock(osb, 1);
+ if (status < 0) {
LOG_ERROR_STATUS(status);
+ goto bail;
+ }
- LOG_EXIT_STATUS(status);
+ while(!ocfs_node_map_is_empty(osb, &osb->recovery_map)) {
+ node_num = ocfs_node_map_first_set_bit(osb,
+ &osb->recovery_map);
+ if (node_num == OCFS_INVALID_NODE_NUM) {
+ LOG_TRACE_ARGS("Out of nodes to recover.\n");
+ break;
+ }
- kfree(arg);
- return status;
-}
+ ocfs_recovery_map_clear(osb, node_num);
+ /* TODO: Figure out how we're going to save all the
+ * local alloc stuff for after recovery on all nodes
+ * is complete? */
+ status = ocfs_recover_node(osb, node_num);
+ if (status < 0) {
+ printk("ocfs2: Error %d recovering node %d on device "
+ "(%u,%u)!\n", status, node_num,
+ MAJOR(osb->sb->s_dev),MINOR(osb->sb->s_dev));
+ printk("ocfs2: Volume requires unmount.\n");
+ continue;
+ }
+ }
+ ocfs2_super_unlock(osb, 1);
-void ocfs_recovery_thread(ocfs_super *osb, int node_num)
-{
- struct ocfs_recover_arg *arg;
+bail:
+ down(&osb->recovery_lock);
+ if (!ocfs_node_map_is_empty(osb, &osb->recovery_map)) {
+ up(&osb->recovery_lock);
+ goto restart;
+ }
- LOG_ENTRY_ARGS("(node_num=%d, osb->node_num = %d)\n", node_num,
- osb->node_num);
+ osb->recovery_launched = 0;
+ wake_up(&osb->recovery_event);
- arg = ocfs_malloc(sizeof(struct ocfs_recover_arg));
- if (arg == NULL) {
- LOG_ERROR_STATUS(-ENOMEM);
- goto done;
- }
+ up(&osb->recovery_lock);
- arg->osb = osb;
- arg->node_num = node_num;
+ LOG_EXIT_STATUS(status);
+ return status;
+}
- /* atomic_inc this here and let recover_vol dec it when
- * done. We do it this way to avoid races with umount. */
- atomic_inc(&osb->num_recovery_threads);
+void ocfs_recovery_thread(ocfs_super *osb, int node_num)
+{
+ LOG_ENTRY_ARGS("(node_num=%d, osb->node_num = %d)\n",
+ node_num, osb->node_num);
- LOG_TRACE_STR("starting recovery thread...");
+ down(&osb->recovery_lock);
+ if (!osb->disable_recovery) {
+ /* People waiting on recovery will wait on
+ * the recovery map to empty. */
+ ocfs_recovery_map_set(osb, node_num);
- kernel_thread(__ocfs_recovery_thread, arg,
- CLONE_VM | CLONE_FS | CLONE_FILES);
+ LOG_TRACE_STR("starting recovery thread...");
-done:
+ if (!osb->recovery_launched) {
+ kernel_thread(__ocfs_recovery_thread, osb,
+ CLONE_VM | CLONE_FS | CLONE_FILES);
+ osb->recovery_launched = 1;
+ }
+ }
+ up(&osb->recovery_lock);
+ wake_up(&osb->recovery_event);
+
LOG_EXIT();
return;
}
static int ocfs_recover_node(ocfs_super *osb, int node_num)
{
- int status = -1;
- int tmpstat;
+ int status = 0;
+// int tmpstat;
+ int slot_num;
ocfs2_dinode *fe;
ocfs2_dinode *local_alloc = NULL;
struct inode *inode = NULL;
journal_t *k_journal = NULL;
struct buffer_head *bh = NULL;
ocfs_journal * journal = NULL;
- int recovery_lock = 0, got_lock = 0, clean_orphans = 0;
+ int got_lock = 0, clean_orphans = 0;
+ ocfs2_slot_info *si = osb->slot_info;
- LOG_ENTRY_ARGS("(node_num=%d, osb->node_num = %d)\n", node_num,
- osb->node_num);
+ LOG_ENTRY_ARGS("(node_num=%d, osb->node_num = %d)\n",
+ node_num, osb->node_num);
- if (!osb || (node_num >= osb->max_nodes)) {
- LOG_ERROR_STATUS (status = -EINVAL);
- goto done;
- }
+ printk("ocfs2_recover_node: checking node %d\n", node_num);
- status = ocfs_wait_on_mount(osb);
- if (status < 0) {
- if (status == -EBUSY)
- status = 0;
- goto done;
- }
- journal = osb->journal;
+ /* Should not ever be called to recover ourselves -- in that
+ * case we should've called ocfs_journal_load instead. */
+ if (osb->node_num == node_num)
+ BUG();
- /* Grab the local recovery resource to ensure no other thread
- * comes in from this node for recovery */
- down(&(osb->recovery_lock));
- recovery_lock = 1;
- if (osb->disable_recovery) {
- LOG_TRACE_STR("Shutting down so skipping reovery.");
+ ocfs2_update_slot_info(si);
+ slot_num = ocfs2_node_num_to_slot(si, node_num);
+ if (slot_num == OCFS_INVALID_NODE_NUM) {
+ printk("ocfs2_recover_node: no slot for this node, so no "
+ "recovery required.\n");
goto done;
}
+ printk("ocfs2_recover_node: node %d was using slot %d\n", node_num,
+ slot_num);
+
+ journal = osb->journal;
+
/* Ok, look up the inode for our journal */
- inode = ocfs_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, node_num);
+ inode = ocfs_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+ slot_num);
if (inode == NULL) {
LOG_ERROR_STR("access error");
status = -EACCES;
@@ -1241,16 +1112,10 @@
SET_INODE_JOURNAL(inode);
- /* Should not ever be called to recover ourselves -- in that
- * case we should've called ocfs_journal_load instead. */
- if (osb->node_num == node_num)
- BUG();
-
- status = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE, FLAG_FILE_RECOVERY,
- &bh, inode);
-
+ status = ocfs2_meta_lock_flags(inode, NULL, &bh, 1,
+ OCFS2_META_LOCK_RECOVERY);
if (status < 0) {
- LOG_TRACE_ARGS("status returned from acquire_lock=%d\n",
+ LOG_TRACE_ARGS("status returned from ocfs2_meta_lock=%d\n",
status);
if (status != -EINTR)
LOG_ERROR_STR("Could not lock journal!");
@@ -1258,19 +1123,20 @@
}
got_lock = 1;
- /* check if that nodes publish sector has been reset (mounted
- * is set false) if so, we can unlock and quit. otherwise we
- * should recover. */
- if (!ocfs_publish_get_mount_state(osb, node_num)) {
+ fe = (ocfs2_dinode *) bh->b_data;
+
+ if (!(fe->id1.journal1.i_flags & OCFS2_JOURNAL_DIRTY_FL)) {
LOG_TRACE_ARGS("No recovery required for node %d\n", node_num);
- status = 0;
+ printk("ocfs2_recover_node: No recovery required for node "
+ "%d\n", node_num);
goto clear_node;
}
- printk("ocfs2: Recovering node %d from device (%u,%u)\n", node_num,
- MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+
+ printk("ocfs2: Recovering node %d from slot %d on device (%u,%u)\n",
+ node_num, slot_num, MAJOR(osb->sb->s_dev),
+ MINOR(osb->sb->s_dev));
clean_orphans = 1;
- fe = (ocfs2_dinode *) bh->b_data;
OCFS_I(inode)->ip_clusters = fe->i_clusters;
status = ocfs_force_read_journal(inode);
@@ -1305,74 +1171,155 @@
if (status < 0)
LOG_ERROR_STATUS(status);
+ /* mark the node clean. */
+ fe->id1.journal1.i_flags &= ~OCFS2_JOURNAL_DIRTY_FL;
+ status = ocfs_write_block(osb, bh, inode);
+ if (status < 0)
+ LOG_ERROR_STATUS(status);
+
if (ocfs_inc_icount(inode) < 0)
BUG();
/* shutdown the journal */
journal_destroy(k_journal);
+#warning "we can't complete local alloc recovery in this function!"
/* recover his local alloc file, AFTER recovering his journal... */
- status = ocfs_begin_local_alloc_recovery(osb, node_num, &local_alloc);
+ status = ocfs_begin_local_alloc_recovery(osb, slot_num, &local_alloc);
if (status < 0) {
LOG_ERROR_STATUS(status);
goto done;
}
- /* clear the publish sector (mark it unmounted and clean) */
- status = ocfs_reset_publish(osb, node_num);
- if (status < 0)
- LOG_ERROR_STATUS(status);
-
status = 0;
clear_node:
- ocfs_recovery_map_clear(osb, node_num);
- ocfs_recover_oin_locks(osb, node_num);
+ ocfs2_clear_slot(si, slot_num);
+ status = ocfs2_update_disk_slots(osb, si);
+ if (status < 0)
+ LOG_ERROR_STATUS(status);
done:
- if (recovery_lock)
- up(&(osb->recovery_lock));
-
/* drop the lock on this nodes journal */
- if (got_lock) {
- tmpstat = ocfs_release_lock(osb, OCFS_LKM_EXMODE,
- FLAG_FILE_RECOVERY,
- inode);
- if (tmpstat < 0)
- LOG_ERROR_STATUS(tmpstat);
- }
+ if (got_lock)
+ ocfs2_meta_unlock(inode, 1);
if (inode)
iput(inode);
if (bh)
brelse(bh);
-
+#if 0
if (local_alloc && !status) {
tmpstat = ocfs_complete_local_alloc_recovery(osb, local_alloc);
if (tmpstat < 0)
LOG_ERROR_STATUS(tmpstat);
}
-
+#endif
if (local_alloc)
kfree(local_alloc);
-
+#if 0
if (clean_orphans && !status) {
tmpstat = ocfs_recover_orphans(osb);
if (tmpstat < 0)
LOG_ERROR_STATUS(tmpstat);
}
+#endif
- atomic_dec(&osb->num_recovery_threads);
-
LOG_EXIT_STATUS(status);
return(status);
}
+/* Test node liveness by trylocking his journal. If we get the lock,
+ * we drop it here. Return 0 if we got the lock, -EAGAIN if node is
+ * still alive (we couldn't get the lock) and < 0 on error. */
+static int ocfs2_trylock_journal(ocfs_super *osb,
+ int slot_num)
+{
+ int status, flags;
+ struct inode *inode = NULL;
+
+ inode = ocfs_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+ slot_num);
+ if (inode == NULL) {
+ LOG_ERROR_STR("access error");
+ status = -EACCES;
+ goto bail;
+ }
+ if (is_bad_inode (inode)) {
+ LOG_ERROR_STR("access error (bad inode)");
+ iput (inode);
+ inode = NULL;
+ status = -EACCES;
+ goto bail;
+ }
+ SET_INODE_JOURNAL(inode);
+
+ flags = OCFS2_META_LOCK_RECOVERY|OCFS2_META_LOCK_NOQUEUE;
+ status = ocfs2_meta_lock_flags(inode, NULL, NULL, 1, flags);
+ if (status < 0) {
+ if (status != -EAGAIN && status != -EINTR) /* only log unexpected errors */
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+ ocfs2_meta_unlock(inode, 1);
+bail:
+ if (inode)
+ iput(inode);
+
+ return status;
+}
+
+/* Call this underneath ocfs2_super_lock. It also assumes that the
+ * slot info struct has been updated from disk. */
+int ocfs2_mark_dead_nodes(ocfs_super *osb)
+{
+ int status, i, node_num;
+ ocfs2_slot_info *si = osb->slot_info;
+
+ /* This is called with the super block cluster lock, so we
+ * know that the slot map can't change underneath us. */
+
+ /* si_lock is held at the top of every loop iteration: the
+ * continue paths below keep it, and the bottom of the loop
+ * re-takes it before looping. */
+ spin_lock(&si->si_lock);
+ for(i = 0; i < si->si_num_slots; i++) {
+ node_num = si->si_global_node_nums[i];
+ /* skip our own slot */
+ if (i == osb->slot_num)
+ continue;
+ /* skip unoccupied slots */
+ if (node_num == OCFS_INVALID_NODE_NUM)
+ continue;
+ /* nodes already in the recovery map are known dead;
+ * nothing more to discover about them here */
+ if (ocfs_node_map_test_bit(osb, &osb->recovery_map, node_num))
+ continue;
+ /* drop the spinlock across the trylock below -- it
+ * takes a cluster lock and may block */
+ spin_unlock(&si->si_lock);
+
+ /* Ok, we have a slot occupied by another node which
+ * is not in the recovery map. We trylock his journal
+ * file here to test if he's alive. */
+ status = ocfs2_trylock_journal(osb, i);
+ if (!status) {
+ /* Since we're called from mount, we know that
+ * the recovery thread can't race us on
+ * setting / checking the recovery bits. */
+ ocfs_recovery_thread(osb, node_num);
+ } else if ((status < 0) && (status != -EAGAIN)) {
+ /* -EAGAIN means the node is alive; anything
+ * else (other than -EINTR) is a real error */
+ if (status != -EINTR)
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+ spin_lock(&si->si_lock);
+ }
+ spin_unlock(&si->si_lock);
+
+ status = 0;
+bail:
+ LOG_EXIT_STATUS(status);
+ return status;
+}
+
int ocfs_recover_orphans(ocfs_super *osb)
{
int status = 0;
int have_disk_lock = 0;
- int tmpstat;
struct inode *inode = NULL;
struct inode *iter;
struct inode *orphan_dir_inode = NULL;
@@ -1381,8 +1328,6 @@
struct ocfs2_dir_entry *de;
struct super_block *sb = osb->sb;
- down(&osb->orphan_recovery_lock);
-
orphan_dir_inode = ocfs_get_system_file_inode(osb,
ORPHAN_DIR_SYSTEM_INODE,
-1);
@@ -1393,7 +1338,7 @@
}
down(&orphan_dir_inode->i_sem);
- status = ocfs_acquire_lock_ro(osb, orphan_dir_inode);
+ status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0);
if (status < 0) {
up(&orphan_dir_inode->i_sem);
LOG_ERROR_STATUS(status);
@@ -1467,12 +1412,8 @@
}
up(&orphan_dir_inode->i_sem);
- status = ocfs_release_lock_ro(osb, orphan_dir_inode);
+ ocfs2_meta_unlock(orphan_dir_inode, 0);
have_disk_lock = 0;
- if (status < 0) {
- LOG_ERROR_STATUS(status);
- goto bail;
- }
iput(orphan_dir_inode);
orphan_dir_inode = NULL;
@@ -1484,14 +1425,9 @@
}
bail:
- up(&osb->orphan_recovery_lock);
+ if (have_disk_lock)
+ ocfs2_meta_unlock(orphan_dir_inode, 0);
- if (have_disk_lock) {
- tmpstat = ocfs_release_lock_ro(osb, orphan_dir_inode);
- if (tmpstat < 0)
- LOG_ERROR_STATUS(tmpstat);
- }
-
if (orphan_dir_inode)
iput(orphan_dir_inode);
@@ -1520,57 +1456,40 @@
goto retry;
}
-/*
- * ocfs_reset_publish()
- *
- *
- * called by: old_ocfs_recover_node()
- *
- * NOTE: This function is unused. I keep it here because it may be
- * useful in the future. --Mark (Sept. 22, 2003)
- */
-static int ocfs_reset_publish (ocfs_super * osb, __u64 node_num)
+static int ocfs2_commit_thread_sleep(ocfs_super *osb)
{
- int status = 0;
- ocfs_publish *publish = NULL;
- struct buffer_head *publish_bh = NULL;
+ int status;
+ signed long timeout = OCFS_CHECKPOINT_INTERVAL;
+ DECLARE_WAITQUEUE(wait, current);
- LOG_ENTRY_ARGS("(0x%p, %llu)\n", osb, node_num);
+ if (atomic_read(&osb->needs_checkpoint))
+ return 0;
- /* take a lock on the publish sector */
- down (&(osb->publish_lock));
+ status = 0;
+ add_wait_queue(&osb->checkpoint_event, &wait);
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
- /* Read the publish sector */
- status = ocfs_read_block(osb, (osb->publish_blkno + node_num),
- &publish_bh, 0, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finally;
- }
-
- publish = (ocfs_publish *) publish_bh->b_data;
+ if (atomic_read(&osb->needs_checkpoint))
+ break;
- publish->dirty = 0;
- publish->mounted = 0;
-
- /* Write the publish sector */
- status = ocfs_write_block(osb, publish_bh, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finally;
+ if (!signal_pending(current)) {
+ timeout = schedule_timeout(timeout);
+ if (!timeout) {
+ status = -ETIMEDOUT;
+ break;
+ }
+ continue;
+ }
+ status = -EINTR;
+ break;
}
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&osb->checkpoint_event, &wait);
-finally:
- /* unlock it */
- up (&(osb->publish_lock));
+ return status;
+}
- if (publish_bh)
- brelse(publish_bh);
-
- LOG_EXIT_STATUS (status);
- return (status);
-} /* ocfs_reset_publish */
-
int ocfs_commit_thread(void *arg)
{
int status = 0, misses = 0;
@@ -1587,15 +1506,12 @@
misses = 0;
while (1) {
- status = ocfs_wait (osb->flush_event,
- atomic_read (&osb->flush_event_woken),
- OCFS_CHECKPOINT_INTERVAL);
-
- atomic_set (&osb->flush_event_woken, 0);
+ status = ocfs2_commit_thread_sleep(osb);
+ atomic_set (&osb->needs_checkpoint, 0);
switch (status) {
case -ETIMEDOUT:
- LOG_TRACE_STR("FLUSH_EVENT: timed out");
+ LOG_TRACE_STR("timed out");
break;
case -EINTR:
LOG_ERROR_STR("Commit thread got a signal!");
@@ -1607,10 +1523,10 @@
}
break;
case 0:
- LOG_TRACE_STR("FLUSH_EVENT: woken!!!");
+ LOG_TRACE_STR("woken\n");
break;
default:
- LOG_TRACE_STR("FLUSH_EVENT: ??????");
+ LOG_ERROR_STR("invalid status!\n");
break;
}
@@ -1627,15 +1543,12 @@
/* we can trust num_trans here because we're
* in shutdown and nobody other than ourselves
* should be able to start more. */
- if ((atomic_read(&journal->num_trans) == 0)
- && (atomic_read(&journal->num_cmt_locks) == 0))
+ if (atomic_read(&journal->num_trans) == 0)
break;
#ifdef VERBOSE_COMMIT_THREAD
- printk("(%u) commit_thread: %u transactions, %u locks"
- "pending on shutdown\n",
- current->pid,
- atomic_read(&journal->num_trans),
- atomic_read(&journal->num_cmt_locks));
+ printk("(%u) commit_thread: %u transactions pending "
+ "on shutdown\n",
+ current->pid, atomic_read(&journal->num_trans));
#endif
goto skip_sleep;
}
@@ -1644,4 +1557,3 @@
complete (&(commit->c_complete));
return 0;
}
-
Modified: trunk/src/localalloc.c
===================================================================
--- trunk/src/localalloc.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/localalloc.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -38,7 +38,7 @@
#include "ocfs2.h"
#include "alloc.h"
-#include "dlm.h"
+#include "dlmglue.h"
#include "localalloc.h"
#include "suballoc.h"
#include "sysfile.h"
@@ -54,11 +54,11 @@
static inline int ocfs_local_alloc_window_bits(ocfs_super *osb);
-static __u32 ocfs_local_alloc_count_bits(ocfs2_dinode *alloc);
+static u32 ocfs_local_alloc_count_bits(ocfs2_dinode *alloc);
static int ocfs_local_alloc_find_clear_bits(ocfs_super *osb,
ocfs2_dinode *alloc,
- __u32 numbits);
+ u32 numbits);
static void ocfs_clear_local_alloc(ocfs2_dinode *alloc);
@@ -137,7 +137,7 @@
int status = 0;
ocfs2_dinode *alloc = NULL;
struct buffer_head *alloc_bh = NULL;
- __u32 num_used;
+ u32 num_used;
struct inode *inode = NULL;
LOG_ENTRY();
@@ -148,7 +148,7 @@
/* read the alloc off disk */
inode = ocfs_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
- osb->node_num);
+ osb->slot_num);
if (!inode) {
LOG_ERROR_STATUS(status=-EINVAL);
goto bail;
@@ -237,7 +237,7 @@
local_alloc_inode =
ocfs_get_system_file_inode(osb,
LOCAL_ALLOC_SYSTEM_INODE,
- osb->node_num);
+ osb->slot_num);
if (!local_alloc_inode) {
status = -ENOENT;
LOG_ERROR_STATUS(status);
@@ -272,15 +272,12 @@
}
ocfs_handle_add_inode(handle, main_bm_inode);
- status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE,
- 0, &main_bm_bh, main_bm_inode);
+ status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
if (status < 0) {
if (status != -EINTR)
LOG_ERROR_STATUS (status);
goto bail;
}
- ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE,
- 0, main_bm_inode);
/* WINDOW_MOVE_CREDITS is a bit heavy... */
handle = ocfs_start_trans(osb, handle, OCFS_WINDOW_MOVE_CREDITS);
@@ -352,7 +349,7 @@
* caller to process with ocfs_complete_local_alloc_recovery
*/
int ocfs_begin_local_alloc_recovery(ocfs_super *osb,
- int node_num,
+ int slot_num,
ocfs2_dinode **alloc_copy)
{
int status = 0;
@@ -360,13 +357,13 @@
struct inode *inode = NULL;
ocfs2_dinode *alloc;
- LOG_ENTRY_ARGS("(node_num = %d)\n", node_num);
+ LOG_ENTRY_ARGS("(slot_num = %d)\n", slot_num);
*alloc_copy = NULL;
inode = ocfs_get_system_file_inode(osb,
LOCAL_ALLOC_SYSTEM_INODE,
- node_num);
+ slot_num);
if (!inode) {
LOG_ERROR_STATUS(status=-EINVAL);
goto bail;
@@ -442,15 +439,12 @@
}
ocfs_handle_add_inode(handle, main_bm_inode);
- status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE,
- 0, &main_bm_bh, main_bm_inode);
+ status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
if (status < 0) {
if (status != -EINTR)
LOG_ERROR_STATUS (status);
goto bail;
}
- ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE,
- 0, main_bm_inode);
handle = ocfs_start_trans(osb, handle, OCFS_WINDOW_MOVE_CREDITS);
if (!handle) {
@@ -508,7 +502,7 @@
local_alloc_inode =
ocfs_get_system_file_inode(osb,
LOCAL_ALLOC_SYSTEM_INODE,
- osb->node_num);
+ osb->slot_num);
if (!local_alloc_inode) {
status = -ENOENT;
LOG_ERROR_STATUS(status);
@@ -627,18 +621,17 @@
/*
* ocfs_local_alloc_count_bits
*/
-static __u32 ocfs_local_alloc_count_bits(ocfs2_dinode *alloc)
+static u32 ocfs_local_alloc_count_bits(ocfs2_dinode *alloc)
{
int i;
- __u8 *buffer;
- __u32 count = 0;
+ u8 *buffer;
+ u32 count = 0;
LOG_ENTRY();
buffer = LOCAL_ALLOC(alloc)->la_bitmap;
- for (i = 0; i < LOCAL_ALLOC(alloc)->la_size; i++) {
+ for (i = 0; i < LOCAL_ALLOC(alloc)->la_size; i++)
count += hweight8(buffer[i]);
- }
LOG_EXIT_ULONG ((unsigned long)count);
return(count);
@@ -649,7 +642,7 @@
*/
static int ocfs_local_alloc_find_clear_bits(ocfs_super *osb,
ocfs2_dinode *alloc,
- __u32 numbits)
+ u32 numbits)
{
int numfound, bitoff, left, startoff, lastzero;
void *bitmap = NULL;
Deleted: trunk/src/lockres.c
===================================================================
--- trunk/src/lockres.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/lockres.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -1,128 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * lockres.c
- *
- * lock resource handling
- *
- * Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include "ocfs_compat.h"
-
-#include <linux/fs.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-
-#include "ocfs_log.h"
-#include "ocfs.h"
-
-#include "dlm.h"
-#include "lockres.h"
-#include "util.h"
-#include "vote.h"
-
-#include "buffer_head_io.h"
-
-/* Tracing */
-#define OCFS_DEBUG_CONTEXT OCFS_DEBUG_CONTEXT_LOCKRES
-
-
-/*
- * ocfs_update_lockres()
- *
- * @osb: ocfs super block for the volume
- * @fe: corresponding file entry
- *
- * the lockres is refreshed from the disk.
- *
- * Returns 0 if success, < 0 if error.
- */
-int ocfs_update_lockres(ocfs_super *osb, struct buffer_head *bh,
- struct inode *inode, int reread)
-{
- int status = 0;
- ocfs2_dinode *fe;
- int flags;
- int drop_bh = 0;
- ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
-
- LOG_ENTRY_ARGS("(0x%p, %llu, 0x%p, 0x%p)\n", osb,
- OCFS_I(inode)->ip_blkno, lockres, bh);
-
- /* hey, you can't do that! ;) */
- if ((!bh) && !reread)
- BUG();
-
- if (!bh)
- drop_bh = 1;
-
- /* Behavior for process_vote: if you don't pass a buffer, then
- * we'll only read if you're not he master. */
- if ((bh == NULL) && (lockres->master_node_num == osb->node_num))
- goto out;
-
- if (reread) {
- flags = lockres->master_node_num == osb->node_num ?
- OCFS_BH_CACHED : 0;
- status = ocfs_read_block(osb, OCFS_I(inode)->ip_blkno, &bh,
- flags, inode);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto out;
- }
- }
-
- fe = (ocfs2_dinode *) bh->b_data;
- if (!IS_VALID_FILE_ENTRY(fe))
- BUG();
-
- lockres->lock_type = DISK_LOCK(fe)->dl_level;
- lockres->master_node_num = DISK_LOCK(fe)->dl_master;
-
-out:
- if (bh && drop_bh)
- brelse(bh);
-
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_update_lockres */
-
-
-/*
- * ocfs_init_lockres()
- *
- */
-void ocfs_init_lockres (ocfs_super * osb, struct inode *inode)
-{
- ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
-
- LOG_ENTRY_ARGS ("(0x%p, 0x%p)\n", osb, lockres);
-
- lockres->master_node_num = OCFS_INVALID_NODE_NUM;
- lockres->lock_holders = 0;
- lockres->readonly_state = 0;
- lockres->uncommitted_holders = 0;
- lockres->lock_type = OCFS_LKM_NLMODE;
- init_rwsem(&lockres->lock);
- ocfs_node_map_init(osb, &lockres->readonly_map);
-
- LOG_EXIT ();
- return;
-} /* ocfs_init_lockres */
Deleted: trunk/src/lockres.h
===================================================================
--- trunk/src/lockres.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/lockres.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -1,118 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * lockres.h
- *
- * Function prototypes
- *
- * Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef OCFS2_LOCKRES_H
-#define OCFS2_LOCKRES_H
-
-#include "util.h"
-
-/* lockres.c */
-
-
-/*
- * ocfs_acquire_lockres_write_timeout()
- *
- * @lockres: lockres to acquire
- * @timeout: timeout in ms, 0 == no timeout
- */
-static inline int ocfs_acquire_lockres_write_timeout (struct inode *inode, __u32 timeout)
-{
- unsigned long jif = jiffies + (timeout * HZ / 1000);
- ocfs_lock_res * lockres = GET_INODE_LOCKRES(inode);
-
- while(1) {
- if (down_write_trylock(&lockres->lock))
- return 0;
-
- if (jif < jiffies)
- return -ETIMEDOUT;
-
- ocfs_sleep (OCFS_NM_HEARTBEAT_TIME / 10);
- }
-
- return 0;
-}
-
-/*
- * ocfs_acquire_lockres_write()
- */
-static inline int ocfs_acquire_lockres_write (struct inode *inode)
-{
- ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
- down_write(&lockres->lock);
- return 0;
-}
-
-/*
- * ocfs_acquire_lockres_read_timeout()
- *
- * @lockres: lockres to acquire
- * @timeout: timeout in ms, 0 == no timeout
- */
-static inline int ocfs_acquire_lockres_read_timeout (struct inode *inode, __u32 timeout)
-{
- ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
- unsigned long jif = jiffies + (timeout * HZ / 1000);
-
- while(1) {
- if (down_read_trylock(&lockres->lock))
- return 0;
-
- if (jif < jiffies)
- return -ETIMEDOUT;
-
- ocfs_sleep (OCFS_NM_HEARTBEAT_TIME / 10);
- }
-
- return 0;
-}
-
-/*
- * ocfs_acquire_lockres_read()
- */
-static inline int ocfs_acquire_lockres_read (struct inode *inode)
-{
- ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
- down_read(&lockres->lock);
- return 0;
-}
-
-static inline void ocfs_release_lockres_write(struct inode *inode)
-{
- ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
- up_write(&lockres->lock);
-}
-static inline void ocfs_release_lockres_read(struct inode *inode)
-{
- ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
- up_read(&lockres->lock);
-}
-
-void ocfs_init_lockres(ocfs_super *osb, struct inode *inode);
-int ocfs_update_lockres(ocfs_super *osb, struct buffer_head *bh,
- struct inode *inode, int reread);
-
-
-#endif /* OCFS2_LOCKRES_H */
Modified: trunk/src/namei.c
===================================================================
--- trunk/src/namei.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/namei.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -50,12 +50,11 @@
#include "alloc.h"
#include "dcache.h"
#include "dir.h"
-#include "dlm.h"
+#include "dlmglue.h"
#include "extent_map.h"
#include "file.h"
#include "sysfile.h"
#include "inode.h"
-#include "lockres.h"
#include "namei.h"
#include "suballoc.h"
#include "util.h"
@@ -101,10 +100,8 @@
static int ocfs_double_lock(ocfs_super *osb,
ocfs_journal_handle *handle,
- __u32 type1, __u32 flags1,
struct buffer_head **bh1,
struct inode *inode1,
- __u32 type2, __u32 flags2,
struct buffer_head **bh2,
struct inode *inode2);
@@ -292,11 +289,6 @@
/* get our super block */
osb = OCFS_SB(dir->i_sb);
- if (osb->osb_flags & OCFS_OSB_FLAGS_SHUTDOWN) {
- LOG_ERROR_STR ("Volume has been shutdown");
- status = -EACCES;
- goto leave;
- }
if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
printk("inode %llu has i_nlink of %u\n",
@@ -323,15 +315,12 @@
goto leave;
}
- /* lock the parent directory */
- status = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE, 0,
- &parent_fe_bh, dir);
+ status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
if (status < 0) {
if (status != -EINTR)
LOG_ERROR_STATUS (status);
goto leave;
}
- ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0, dir);
dirfe = (ocfs2_dinode *) parent_fe_bh->b_data;
if (!dirfe->i_links_count) {
@@ -392,12 +381,13 @@
BUG();
}
- file_off = fe->i_blkno << dir->i_sb->s_blocksize_bits;
ocfs_inode_set_new(osb, inode);
+ status = ocfs2_create_new_inode_locks(inode);
+ if (status < 0)
+ LOG_ERROR_STATUS(status);
- ocfs_init_lockres(osb, inode);
+ file_off = fe->i_blkno << dir->i_sb->s_blocksize_bits;
- status = ocfs_update_lockres(osb, new_fe_bh, inode, 0);
if (S_ISDIR (mode)) {
status = ocfs_fill_new_dir(osb, handle, dir, inode,
new_fe_bh, data_ac);
@@ -530,7 +520,7 @@
fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
fe->i_blkno = fe_blkno;
fe->i_suballoc_bit = suballoc_bit;
- fe->i_suballoc_node = osb->node_num;
+ fe->i_suballoc_node = osb->slot_num;
fe->i_uid = current->fsuid;
if (dir->i_mode & S_ISGID) {
fe->i_gid = dir->i_gid;
@@ -553,8 +543,6 @@
fe->i_last_eb_blk = 0;
strcpy (fe->i_signature, OCFS2_INODE_SIGNATURE);
fe->i_flags |= OCFS2_VALID_FL;
- DISK_LOCK(fe)->dl_master = osb->node_num;
- DISK_LOCK(fe)->dl_level = OCFS_LKM_EXMODE;
fe->i_atime = fe->i_ctime = fe->i_mtime = OCFS_CURRENT_TIME;
fe->i_dtime = 0;
@@ -662,15 +650,12 @@
goto bail;
}
- /* lock the parent directory */
- err = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE, 0,
- &parent_fe_bh, dir);
+ err = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
if (err < 0) {
if (err != -EINTR)
LOG_ERROR_STATUS (err);
goto bail;
}
- ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0, dir);
err = ocfs_prepare_dir_for_insert(osb, dir, parent_fe_bh,
dentry->d_name.name,
@@ -680,14 +665,12 @@
goto bail;
}
- err = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE, 0, &fe_bh, inode);
+ err = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
if (err < 0) {
if (err != -EINTR)
LOG_ERROR_STATUS (err);
goto bail;
}
- ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, FLAG_FILE_UPDATE_OIN,
- inode);
fe = (ocfs2_dinode *) fe_bh->b_data;
if (fe->i_links_count >= OCFS2_LINK_MAX) {
@@ -791,13 +774,11 @@
goto leave;
}
- status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 0,
- &parent_node_bh, dir);
+ status = ocfs2_meta_lock(dir, handle, &parent_node_bh, 1);
if (status < 0) {
LOG_ERROR_STATUS(status);
goto leave;
}
- ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0, dir);
status = ocfs_find_files_on_disk(osb, dentry->d_name.name,
dentry->d_name.len, &blkno,
@@ -811,16 +792,20 @@
if (blkno != OCFS_I(inode)->ip_blkno)
BUG();
- status = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE,
- FLAG_RELEASE_DENTRY, &fe_bh, inode);
+ status = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
if (status < 0) {
if (status != -EINTR)
LOG_ERROR_STATUS (status);
goto leave;
}
- ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, FLAG_RELEASE_DENTRY,
- inode);
+ status = ocfs2_request_unlink_vote(inode);
+ if (status < 0) {
+ /* This vote should succeed under all normal circumstances. */
+ LOG_ERROR_STATUS(status);
+ goto leave;
+ }
+
if (S_ISDIR (inode->i_mode)) {
if (!ocfs_empty_dir(inode)) {
status = -ENOTEMPTY;
@@ -942,16 +927,14 @@
*/
static int ocfs_double_lock(ocfs_super *osb,
ocfs_journal_handle *handle,
- __u32 type1, __u32 flags1,
struct buffer_head **bh1,
struct inode *inode1,
- __u32 type2, __u32 flags2,
struct buffer_head **bh2,
struct inode *inode2)
{
- int status = 0;
- __u64 tmpid, id1, id2;
- __u32 tmptype, tmpflags;
+ int status;
+ ocfs_inode_private *oip1 = OCFS_I(inode1);
+ ocfs_inode_private *oip2 = OCFS_I(inode2);
struct buffer_head **tmpbh;
struct inode *tmpinode;
@@ -961,31 +944,16 @@
OCFS_ASSERT(handle);
- id1 = OCFS_I(inode1)->ip_blkno;
- id2 = OCFS_I(inode2)->ip_blkno;
-
if (*bh1)
*bh1 = NULL;
if (*bh2)
*bh2 = NULL;
/* we always want to lock the one with the lower lockid first. */
- if (id1 != id2) {
- if (id1 < id2) {
+ if (oip1->ip_blkno != oip2->ip_blkno) {
+ if (oip1->ip_blkno < oip2->ip_blkno) {
/* switch id1 and id2 around */
LOG_TRACE_STR("switching them around...");
- tmpid = id2;
- id2 = id1;
- id1 = tmpid;
-
- tmptype = type2;
- type2 = type1;
- type1 = tmptype;
-
- tmpflags = flags2;
- flags2 = flags1;
- flags1 = tmpflags;
-
tmpbh = bh2;
bh2 = bh1;
bh1 = tmpbh;
@@ -995,21 +963,18 @@
inode1 = tmpinode;
}
/* lock id2 */
- status = ocfs_acquire_lock(osb, type2, flags2, bh2, inode2);
+ status = ocfs2_meta_lock(inode2, handle, bh2, 1);
if (status < 0) {
LOG_ERROR_STATUS (status);
goto bail;
}
- ocfs_handle_add_lock(handle, type2, flags2, inode2);
}
/* lock id1 */
- status = ocfs_acquire_lock(osb, type1, flags1,
- bh1, inode1);
+ status = ocfs2_meta_lock(inode1, handle, bh1, 1);
if (status < 0) {
LOG_ERROR_STATUS (status);
goto bail;
}
- ocfs_handle_add_lock(handle, type1, flags1, inode1);
bail:
LOG_EXIT_STATUS(status);
return(status);
@@ -1045,7 +1010,6 @@
struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
// this is the 1st dirent bh
nlink_t old_dir_nlink = old_dir->i_nlink, new_dir_nlink = new_dir->i_nlink;
- struct buffer_head *lockbh = NULL;
LOG_SET_CONTEXT(RENAME);
@@ -1077,11 +1041,7 @@
/* if old and new are the same, this'll just do one lock. */
status = ocfs_double_lock(osb, handle,
- OCFS_LKM_EXMODE,
- 0,
&old_dir_bh, old_dir,
- OCFS_LKM_EXMODE,
- 0,
&new_dir_bh, new_dir);
if (status < 0) {
LOG_ERROR_STATUS(status);
@@ -1105,20 +1065,17 @@
/* Directories actually require metadata updates to
* the directory info so we can't get away with not
* doing node locking on it. */
- status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE,
- FLAG_RELEASE_DENTRY|FLAG_FILE_RENAME,
- &lockbh, old_inode);
- if (lockbh) {
- brelse(lockbh);
- lockbh = NULL;
+ status = ocfs2_meta_lock(old_inode, handle, NULL, 1);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
}
+
+ status = ocfs2_request_rename_vote(old_inode);
if (status < 0) {
LOG_ERROR_STATUS(status);
goto bail;
}
- ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE,
- FLAG_RELEASE_DENTRY|FLAG_FILE_RENAME,
- old_inode);
status = -EIO;
old_inode_de_bh = ocfs_bread (old_inode, 0, &status, 0);
@@ -1136,7 +1093,7 @@
} else {
/* Ah, the simple case - we're a file so just send a
* message. */
- status = ocfs_notify_on_rename(osb, old_inode);
+ status = ocfs2_request_rename_vote(old_inode);
if (status < 0) {
LOG_ERROR_STATUS(status);
goto bail;
@@ -1187,16 +1144,18 @@
if (newfe_blkno != OCFS_I(new_inode)->ip_blkno)
BUG();
- status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE,
- FLAG_RELEASE_DENTRY, &newfe_bh,
- new_inode);
+ status = ocfs2_meta_lock(new_inode, handle, &newfe_bh, 1);
if (status < 0) {
LOG_ERROR_STATUS(status);
goto bail;
}
- ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE,
- FLAG_RELEASE_DENTRY, new_inode);
+ status = ocfs2_request_unlink_vote(new_inode);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
newfe = (ocfs2_dinode *) newfe_bh->b_data;
LOG_TRACE_ARGS("aha rename over existing... new_de=%p "
@@ -1426,7 +1385,7 @@
goto bail;
}
- bhs = ocfs_malloc(sizeof(struct buffer_head *) * blocks);
+ bhs = kmalloc(sizeof(struct buffer_head *) * blocks, GFP_KERNEL);
if (!bhs) {
status = -ENOMEM;
LOG_ERROR_STATUS(status);
@@ -1560,14 +1519,12 @@
}
/* lock the parent directory */
- status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 0,
- &parent_fe_bh, dir);
+ status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
if (status < 0) {
if (status != -EINTR)
LOG_ERROR_STATUS (status);
goto bail;
}
- ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0, dir);
dirfe = (ocfs2_dinode *) parent_fe_bh->b_data;
if (!dirfe->i_links_count) {
@@ -1622,8 +1579,7 @@
}
ocfs_inode_set_new(osb, inode);
- ocfs_init_lockres(osb, inode);
- status = ocfs_update_lockres(osb, new_fe_bh, inode, 0);
+ status = ocfs2_create_new_inode_locks(inode);
if (status < 0)
LOG_ERROR_STATUS(status);
@@ -2054,14 +2010,11 @@
}
ocfs_handle_add_inode(handle, orphan_dir_inode);
- status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 0,
- &orphan_dir_bh, orphan_dir_inode);
+ status = ocfs2_meta_lock(orphan_dir_inode, handle, &orphan_dir_bh, 1);
if (status < 0) {
LOG_ERROR_STATUS(status);
goto leave;
}
- ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0,
- orphan_dir_inode);
status = ocfs_prepare_dir_for_insert(osb, orphan_dir_inode,
orphan_dir_bh, name, namelen,
Deleted: trunk/src/nm.c
===================================================================
--- trunk/src/nm.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/nm.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -1,1150 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * nm.c
- *
- * net and disk process vote, nm thread, etc.
- *
- * Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include "ocfs_compat.h"
-
-#include <linux/fs.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-#include <linux/bitops.h>
-#include <linux/net.h>
-
-#include "ocfs_log.h"
-#include "ocfs.h"
-#include "ocfs2.h"
-
-#include "alloc.h"
-#include "dlm.h"
-#include "extent_map.h"
-#include "file.h"
-#include "heartbeat.h"
-#include "inode.h"
-#include "lockres.h"
-#include "nm.h"
-#include "util.h"
-#include "vote.h"
-
-#include "ocfs_journal.h"
-#include "buffer_head_io.h"
-
-/* Tracing */
-#define OCFS_DEBUG_CONTEXT OCFS_DEBUG_CONTEXT_NM
-
-/* for lack of a better name... protects inode_private->ip_num_extends. */
-static spinlock_t oin_num_ext_lock = SPIN_LOCK_UNLOCKED;
-struct semaphore recovery_list_sem;
-
-static inline int need_write_lock(ocfs_super *osb, ocfs_lock_res *lockres, __u32 flags);
-static inline int get_process_vote_action(ocfs_super * osb, ocfs_lock_res *lockres, __u32 node_num,
- __u32 flags, int *master_alive, int *write_lock,
- int *change_master, struct inode *inode);
-
-static int ocfs_process_vote_pre_change_master(int vote_type, int flags, int *vote_response, struct inode *inode);
-static int ocfs_process_vote_change_master(ocfs_super *osb, int *vote_response, int *status, struct inode *inode, ocfs_lock_res *lockres, __s16 node_num, __u64 lock_id);
-static int ocfs_process_vote_post_change_master(ocfs_super *osb, int vote_type, int flags, int *vote_response, struct inode *inode, ocfs_lock_res *lockres, int *status, __s16 node_num, int *inc_seq);
-static int ocfs_lock_busy(ocfs_super *osb, struct inode *inode, ocfs_lock_res *lockres);
-
-static int _ocfs_drop_readonly_cache_lock(void *arg);
-
-typedef struct _ocfs_ro_cache_drop_ctxt
-{
- ocfs_super *osb;
- ocfs_lock_res *lockres;
- struct inode *inode;
- int yield;
-} ocfs_ro_cache_drop_ctxt;
-
-static void ocfs_mark_inode_for_extend(ocfs_super *osb, struct inode *inode,
- __u32 node_num);
-static void ocfs_clear_inode_for_extend(ocfs_super *osb, struct inode *inode,
- __u32 node_num, u32 num_rel);
-
-static int ocfs_process_inode_delete(struct inode *inode);
-static void ocfs_commit_inode_delete(struct inode *inode);
-
-static const char *process_vote_strings[] = {
- "INVALID_REQUEST", // reply with a NO vote
- "UPDATE_OIN_INODE", // update both oin and inode
- "DELETE_ACQUIRE",// delete or rename request
- "CHANGE_MASTER", // request to change master to requestor
- "NOT_MASTER", // I am not master, retry
- "REMASTER_THIS", // remaster lock to me
- "REMASTER_REQUESTOR", // remaster lock to requestor
- "DROP_READONLY", // RO cachelock needs to convert to RW
- "READONLY",
- "RELEASE_DENTRY",
- "TRUNCATE_PAGES"
-};
-
-/*
- * ocfs_recv_thread()
- *
- */
-int ocfs_recv_thread (void *unused)
-{
- int status = 0;
- ocfs_recv_ctxt *recv_ctxt = NULL;
-
- LOG_ENTRY ();
-
-#define LISTENER_PROCESS_NAME "ocfs2lsnr"
- ocfs_daemonize (LISTENER_PROCESS_NAME, strlen(LISTENER_PROCESS_NAME),
- 1);
-
- OcfsIpcCtxt.task = current;
-
- while (1) {
- recv_ctxt = ocfs_malloc (sizeof (ocfs_recv_ctxt));
- if (recv_ctxt == NULL) {
- LOG_ERROR_STATUS (-ENOMEM);
- goto bail;
- }
-
- memset (recv_ctxt, 0, sizeof (ocfs_recv_ctxt));
- recv_ctxt->msg_len = OCFS_MAX_DLM_PKT_SIZE;
-
- status = ocfs_recv_udp_msg (recv_ctxt);
- if (status < 0) {
- kfree(recv_ctxt);
- if (status != -EBADF) {
- LOG_ERROR_STATUS (status);
- } else {
- /* Thread is being killed. */
- goto finally;
- }
- }
- }
-
-finally:
- /* Flush all scheduled tasks */
- flush_scheduled_work();
-
- if (OcfsIpcCtxt.send_sock) {
- sock_release (OcfsIpcCtxt.send_sock);
- OcfsIpcCtxt.send_sock = NULL;
- }
-
- if (OcfsIpcCtxt.recv_sock) {
- sock_release (OcfsIpcCtxt.recv_sock);
- OcfsIpcCtxt.recv_sock = NULL;
- }
-
- OcfsIpcCtxt.task = NULL;
-
- /* signal main thread of ipcdlm's exit */
- complete (&(OcfsIpcCtxt.complete));
-
-bail:
- LOG_EXIT ();
- return 0;
-} /* ocfs_recv_thread */
-
-// gets a best guess (based on dirty read of lockres)
-// of whether down_read or down_write should be used on lockres
-// NOTE: always RECHECK after getting the lock and follow what
-// get_process_vote_action says
-static inline int need_write_lock(ocfs_super *osb, ocfs_lock_res *lockres, __u32 flags)
-{
- // always need write access to lockres if not master
- if (lockres->master_node_num != osb->node_num)
- return 1;
- // usually need write access for these so just get it
- if (flags & (FLAG_CHANGE_MASTER|FLAG_DROP_READONLY|FLAG_READONLY))
- return 1;
- // nothing else will need it, assuming it didnt just change under us
- return 0;
-}
-
-static inline int get_process_vote_action(ocfs_super * osb, ocfs_lock_res *lockres, __u32 node_num,
- __u32 flags, int *master_alive, int *write_lock,
- int *change_master, struct inode *inode)
-{
- int vote_type = INVALID_REQUEST;
- int my_node_wins = 0;
- int this_node_master = 0;
- __u64 lockid = 0;
- ocfs_vote_obj_lookup_data data;
-
- LOG_ENTRY_ARGS("(node_num=%d, flags=%08x)\n", node_num, flags);
-
- OCFS_ASSERT(inode);
- OCFS_ASSERT(lockres);
-
- lockid = OCFS_I(inode)->ip_blkno;
-
- *change_master = 0;
- *write_lock = 0;
- this_node_master = (lockres->master_node_num == osb->node_num);
- *master_alive = (lockres->master_node_num != OCFS_INVALID_NODE_NUM) &&
- ocfs_node_is_alive(&osb->publ_map, lockres->master_node_num);
-
- // if an outstanding vote request is found on this lockid
- // and this node number is higher, this node wins
- data.func = ocfs_lookup_obj_by_lockid;
- data.u.s.lock_id = lockid;
- data.ret = NULL;
- if (ocfs_lookup_vote_request_obj(osb, &data) == 0)
- my_node_wins = (node_num < osb->node_num);
-
- /* NOTE: FLAG_CHANGE_MASTER may be combined with
- * other flags and result in a process_vote action
- * other than CHANGE_MASTER. the change_master
- * value returned here is independent of this action */
- if (this_node_master && flags & FLAG_CHANGE_MASTER) {
- *write_lock = 1;
- *change_master = 1;
- }
-
- // if this node is not master, we will need to update the lockres
- if (!this_node_master)
- *write_lock = 1;
-
- if (flags & (FLAG_RELEASE_DENTRY | FLAG_FILE_RENAME)) {
- vote_type = RELEASE_DENTRY;
- goto done;
- }
-
- if (flags & FLAG_DROP_READONLY) {
- vote_type = DROP_READONLY;
- *write_lock = 1;
- goto done;
- } else if (flags & FLAG_READONLY) {
- if (this_node_master && lockres->lock_type == OCFS_LKM_EXMODE) {
- vote_type = READONLY;
- *write_lock = 1;
- } else
- vote_type = INVALID_REQUEST;
- goto done;
- }
-
- if (flags & FLAG_FILE_DELETE) {
- if (flags & FLAG_RELEASE_LOCK)
- vote_type = INVALID_REQUEST;
- else if (flags & FLAG_ACQUIRE_LOCK)
- vote_type = DELETE_ACQUIRE;
- else
- vote_type = INVALID_REQUEST;
- } else if (flags & FLAG_FILE_UPDATE_OIN) {
- if ((flags & FLAG_FILE_TRUNCATE) &&
- (flags & FLAG_ACQUIRE_LOCK))
- vote_type = TRUNCATE_PAGES;
- else
- vote_type = UPDATE_OIN_INODE;
- } else if (flags & FLAG_TRUNCATE_PAGES) {
- vote_type = TRUNCATE_PAGES;
- } else if (this_node_master) {
- if (flags & FLAG_CHANGE_MASTER)
- vote_type = CHANGE_MASTER;
- else {
- LOG_TRACE_STR("(INVALID_REQUEST) am master, but no more types");
- vote_type = INVALID_REQUEST;
- }
- } else {
- if (*master_alive)
- vote_type = NOT_MASTER;
- else if (my_node_wins)
- vote_type = REMASTER_THIS;
- else
- vote_type = REMASTER_REQUESTOR;
- }
-
-done:
- LOG_EXIT_STATUS(vote_type);
- return vote_type;
-}
-
-/* this function requires that callers to it be serialized (isn't
- * really a problem as vote_sem does that for us. */
-static void ocfs_mark_inode_for_extend(ocfs_super *osb, struct inode *inode,
- __u32 node_num)
-{
- spin_lock(&oin_num_ext_lock);
-
- if (OCFS_I(inode)->ip_num_extends < 0)
- BUG();
-
- /* this isn't the 1st extend against the inode, so just inc
- * the counter. */
- if (OCFS_I(inode)->ip_num_extends > 0) {
- OCFS_I(inode)->ip_num_extends++;
-
- printk("ocfs_mark_inode_for_extend: inode %llu, num = %d\n",
- OCFS_I(inode)->ip_blkno, OCFS_I(inode)->ip_num_extends);
-
- spin_unlock(&oin_num_ext_lock);
- return;
- }
-
- /* ok, we're going to have to take the extend sem. We can't do
- * this holding ip_node_extend_sem so we drop it and recheck after
- * we've got it. */
- spin_unlock(&oin_num_ext_lock);
-
- /* take the extend_sem on behalf of
- * this other node. It won't be
- * released until he does his last
- * release broadcast. This has the
- * effect of locking out
- * ocfs2_extent_map lookups
- * inode. */
- down_write(&OCFS_I(inode)->ip_node_extend_sem);
-
- atomic_inc(&inode->i_count);
-
- /* Ok, we've still got it open. Put this guy on the recovery
- * list in case the extending node dies. */
- down(&recovery_list_sem);
- spin_lock(&oin_num_ext_lock);
-
- if (OCFS_I(inode)->ip_num_extends < 0)
- BUG();
-
- OCFS_I(inode)->ip_num_extends++;
- list_add_tail(&OCFS_I(inode)->ip_recovery_list,
- &osb->lock_recovery_lists[node_num]);
-
- LOG_TRACE_PROCESS_VOTE("inode %llu, num = %d\n",
- OCFS_I(inode)->ip_blkno, OCFS_I(inode)->ip_num_extends);
-
- spin_unlock(&oin_num_ext_lock);
- up(&recovery_list_sem);
-
- return;
-}
-
-static void ocfs_clear_inode_for_extend(ocfs_super *osb, struct inode *inode,
- __u32 node_num, u32 num_rel)
-{
- int dec = 0;
-
- down(&recovery_list_sem);
- spin_lock(&oin_num_ext_lock);
-
- if ((OCFS_I(inode)->ip_num_extends - (s32) num_rel) < 0) {
- /* We don't force to zero here in order to cover up a
- * bug, but rather because it's perfectly valid for us
- * to get a release with a count > what we've had if
- * we mount after the acquires have been sent. */
-
- LOG_TRACE_PROCESS_VOTE("inode %llu, num_rel of "
- "%d would result in negative count (ip_num_extends "
- "= %d)\n",
- OCFS_I(inode)->ip_blkno, num_rel,
- OCFS_I(inode)->ip_num_extends);
- OCFS_I(inode)->ip_num_extends = 0;
- } else {
- OCFS_I(inode)->ip_num_extends -= num_rel;
- }
-
- LOG_TRACE_PROCESS_VOTE("inode %llu, num = %d\n",
- OCFS_I(inode)->ip_blkno, OCFS_I(inode)->ip_num_extends);
-
- if (!OCFS_I(inode)->ip_num_extends) {
- list_del(&OCFS_I(inode)->ip_recovery_list);
- INIT_LIST_HEAD(&OCFS_I(inode)->ip_recovery_list);
-
- up_write(&OCFS_I(inode)->ip_node_extend_sem);
-
- dec = 1;
- }
-
- spin_unlock(&oin_num_ext_lock);
- up(&recovery_list_sem);
-
- /* we want iputs to happen outside of as many locks as possible. */
- if (dec)
- iput(inode);
-
- return;
-}
-
-
-static int ocfs_process_inode_delete(struct inode *inode)
-{
- int status;
-
- LOG_TRACE_ARGS("DELETE vote on inode %lu, read "
- "lnk_cnt = %u\n", inode->i_ino,
- inode->i_nlink);
-
- /* force this as ours may be out of date. */
- inode->i_nlink = 0;
-
- spin_lock(&OCFS_I(inode)->ip_lock);
- /* vote no if the file is still open. */
- if (OCFS_I(inode)->ip_open_cnt > 0) {
- LOG_TRACE_PROCESS_VOTE("open count = %u\n",
- OCFS_I(inode)->ip_open_cnt);
- spin_unlock(&OCFS_I(inode)->ip_lock);
- status = 0;
- goto done;
- }
- spin_unlock(&OCFS_I(inode)->ip_lock);
-
- /* vote no if someone's extending it. */
- spin_lock(&oin_num_ext_lock);
- if (OCFS_I(inode)->ip_num_extends) {
- spin_unlock(&oin_num_ext_lock);
- LOG_TRACE_PROCESS_VOTE("extends pending\n");
- status = 0;
- goto done;
- }
- spin_unlock(&oin_num_ext_lock);
-
- /* directories are a bit ugly... What if someone is sitting in
- * it? We want to make sure the inode is removed completely as
- * a result of the iput in process_vote. */
- if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) {
- LOG_TRACE_PROCESS_VOTE("i_count = %u\n",
- atomic_read(&inode->i_count));
- status = 0;
- goto done;
- }
-
- status = 1;
-done:
- return(status);
-}
-
-static void ocfs_commit_inode_delete(struct inode *inode)
-{
- SET_INODE_DELETED(inode);
- /* if we vote yes, then we set the SKIP_DELETE
- * flag on the inode so we don't try to delete
- * it in delete_inode ourselves. */
- OCFS_SET_FLAG(OCFS_I(inode)->ip_flags,
- OCFS_INODE_SKIP_DELETE);
-
- d_prune_aliases (inode);
- sync_mapping_buffers(inode->i_mapping);
- ocfs_truncate_inode_pages(inode, 0);
- ocfs2_extent_map_drop(inode, 0);
-}
-
-
-/*
- * ocfs_process_vote()
- *
- * @osb:
- * @publish:
- * @node_num: node asking for the vote
- *
- */
-int ocfs_process_vote (ocfs_super * osb, ocfs_dlm_msg *dlm_msg)
-{
- int status = 0;
- int tmpstat = 0;
- ocfs_lock_res *lockres = NULL;
- __u32 flags, num_ident;
- __u16 num_nodes;
- int vote_type = INVALID_REQUEST, vote_response = 0;
- struct inode *inode = NULL;
- int master_alive = 1, change_master = 0, write_lock = 0;
- int inc_inode_seq = 0;
- int change_master_succeeded = 0;
- __s16 node_num = dlm_msg->src_node;
- __u64 lock_id, seq_num;
- ocfs_dlm_req_master *req_master = NULL;
- int lockres_lock_held = NO_LOCK;
-
- LOG_ENTRY_ARGS ("(0x%p, 0x%p)\n", osb, dlm_msg);
-
- down(&osb->vote_sem);
-
- num_nodes = osb->max_nodes;
-
- req_master = (ocfs_dlm_req_master *)dlm_msg->msg_buf;
- flags = req_master->flags;
- lock_id = req_master->lock_id;
- seq_num = req_master->lock_seq_num;
- num_ident = req_master->num_ident;
-
- if (!num_ident) {
- printk("flags = 0x%x, lock_id = %llu, node_num = %u\n",
- flags, lock_id, node_num);
- BUG();
- }
-
- LOG_TRACE_ARGS ("node=%u, id=%llu, seq=%llu\n", node_num,
- lock_id, seq_num);
-
- /* if we timeout on any of the locks, we want to send a retry
- * instead of letting the other guy's network timeout. */
- vote_response = FLAG_VOTE_UPDATE_RETRY;
-
- if (flags & FLAG_TRUNCATE_PAGES) {
- inode = ocfs_ilookup(osb, lock_id);
- if(!inode) {
- vote_type = TRUNCATE_PAGES;
- goto got_vote_type;
- }
- } else {
- inode = ocfs_iget(osb, lock_id);
- }
-
- if (!inode) {
- status = -ENOENT;
- LOG_ERROR_ARGS("Could not find inode: lock_id = %llu, "
- "node=%u, seq=%llu, flags=0x%x\n",
- lock_id, node_num, seq_num, flags);
- LOG_ERROR_STATUS(status);
- goto vote;
- }
-
- /* ahh, so you find yourself asking "what the
- * heck is this?"
- * Please see the note in ocfs_delete_inode. */
- osb->voting_ino = inode->i_ino;
-
- lockres = GET_INODE_LOCKRES(inode);
-
- // take a good guess...
- // at worst, we will take 2 passes through
- write_lock = need_write_lock(osb, lockres, flags);
-
-retake_lock:
- OCFS_ASSERT(lockres_lock_held == NO_LOCK);
- if (write_lock)
- status = ocfs_acquire_lockres_write_timeout (inode, (OCFS_NM_HEARTBEAT_TIME/2));
- else
- status = ocfs_acquire_lockres_read_timeout (inode, (OCFS_NM_HEARTBEAT_TIME/2));
-
- if (status < 0) {
- LOG_TRACE_ARGS("Timedout locking lockres for id: %llu\n",
- OCFS_I(inode)->ip_blkno);
- goto vote;
- } else
- lockres_lock_held = (write_lock ? WRITE_LOCK : READ_LOCK);
-
- // find out everything now that a lock is held
- vote_type = get_process_vote_action(osb, lockres, node_num, flags,
- &master_alive, &write_lock,
- &change_master, inode);
-
- // bummer. we got the wrong lock. get the write lock and start over.
- if (write_lock && lockres_lock_held == READ_LOCK) {
- ocfs_release_lockres_read(inode);
- lockres_lock_held = NO_LOCK;
- goto retake_lock;
- }
-
- if (lockres->master_node_num != osb->node_num) {
- /* since we pass a NULL bh, this'll only do a read if
- * we're not the master. */
- OCFS_ASSERT(lockres_lock_held == WRITE_LOCK);
- status = ocfs_update_lockres (osb, NULL, inode, 1);
-
- if (status < 0) {
- if (status != -ETIMEDOUT)
- LOG_ERROR_STATUS (status);
- goto leave;
- }
- }
-
-got_vote_type:
-
- LOG_TRACE_PROCESS_VOTE("type: %s, lockid: %llu, action: (%u) %s, num_ident: %u, "
- "alive: %d, write: %d, change: %d, held: %d\n",
- flags & FLAG_RELEASE_LOCK ? "RELEASE" :
- (flags & FLAG_ACQUIRE_LOCK ? "ACQUIRE" : "MODIFY"), lock_id,
- vote_type, process_vote_strings[vote_type], num_ident,
- master_alive, write_lock, change_master, lockres_lock_held);
-
- if (vote_type == INVALID_REQUEST)
- printk("Invalid request! flags = 0x%x master=%d, readonly=%s\n",
- flags, lockres->master_node_num,
- test_bit(LOCK_STATE_READONLY, &lockres->readonly_state) ? "yes" : "no");
-
- /* get_process_vote_action will only allow CHANGE_MASTER on a CACHE lock
- * held by this node. the CHANGE_MASTER path needs to check the readonly
- * map to see if any nodes need to be updated. */
-
- vote_response = FLAG_VOTE_NODE;
-
- if (ocfs_process_vote_pre_change_master(vote_type, flags, &vote_response, inode))
- goto vote;
-
- if (change_master) {
- tmpstat = ocfs_process_vote_change_master(osb, &vote_response, &status,
- inode, lockres, node_num, lock_id);
- if (tmpstat < 0)
- goto leave;
- else if (tmpstat == 1)
- goto vote;
- change_master_succeeded = 1;
- inc_inode_seq = 1;
- }
-
- tmpstat = ocfs_process_vote_post_change_master(osb, vote_type, flags, &vote_response, inode,
- lockres, &status, node_num, &inc_inode_seq);
-
- /* if we made it this far, and change_master, then it had better be voting yes */
- if (change_master && vote_response != FLAG_VOTE_NODE)
- BUG();
-
- if (inode && (flags & (FLAG_FILE_EXTEND|FLAG_FILE_TRUNCATE)) &&
- ((flags & FLAG_ACQUIRE_LOCK && vote_response==FLAG_VOTE_NODE) ||
- (flags & FLAG_RELEASE_LOCK))) {
- LOG_TRACE_ARGS("responding YES to %s %s request, inode=%p, node=%u\n", flags & FLAG_FILE_EXTEND ?
- "extend" : "truncate", flags & FLAG_RELEASE_LOCK ?
- "release" : "acquire", inode, node_num);
-
- if (flags & FLAG_ACQUIRE_LOCK)
- ocfs_mark_inode_for_extend(osb, inode, node_num);
- else if (flags & FLAG_RELEASE_LOCK)
- ocfs_clear_inode_for_extend(osb, inode, node_num,
- num_ident);
- else {
- printk("uhoh, bad vote flags! 0x%x\n", flags);
- BUG();
- }
- }
-
-vote:
- status = ocfs_send_vote_reply(osb, dlm_msg, vote_response);
-
- LOG_TRACE_PROCESS_VOTE("vote: lockid=%llu, node=%d, seqnum=%llu, response=%d\n",
- lock_id, node_num, seq_num, vote_response);
-
- if (status < 0)
- LOG_ERROR_STATUS (status);
- else {
- ocfs_compute_dlm_stats (0, vote_response,
- &(OcfsGlobalCtxt.net_reply_stats));
- ocfs_compute_dlm_stats (0, vote_response,
- &(osb->net_reply_stats));
- }
-
-leave:
- if (lockres_lock_held == READ_LOCK)
- ocfs_release_lockres_read (inode);
- else if (lockres_lock_held == WRITE_LOCK)
- ocfs_release_lockres_write (inode);
- lockres_lock_held = NO_LOCK;
-
- if (!inode)
- goto no_inode_leave;
-
- if (inc_inode_seq) {
- ocfs_inc_inode_seq(osb, inode);
- sync_mapping_buffers(inode->i_mapping);
- }
- iput(inode);
-
-no_inode_leave:
- osb->voting_ino = 0;
-
- up(&osb->vote_sem);
-
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_process_vote */
-
-
-/* some lock requests need to be processed before a possible
- * change master. Beware however that the change_master might
- * very well send a no vote, so you can't do things here that
- * cannot be rolled back. */
-
-/* Returns: 1 if process_vote should vote immediately, 0 otherwise */
-
-static int ocfs_process_vote_pre_change_master(int vote_type, int flags, int *vote_response, struct inode *inode)
-{
- if (vote_type == DELETE_ACQUIRE) {
- LOG_TRACE_STR("DELETE_ACQUIRE (part one)");
- if (!ocfs_process_inode_delete(inode)) {
- *vote_response = FLAG_VOTE_OIN_ALREADY_INUSE;
- return 1;
- }
- *vote_response = FLAG_VOTE_NODE;
- return 0;
- }
- if (vote_type == TRUNCATE_PAGES) {
- LOG_TRACE_STR("TRUNCATE_PAGES");
- *vote_response = FLAG_VOTE_NODE;
- if (inode) {
- if (ocfs_sync_inode(inode) < 0) {
- LOG_ERROR_ARGS("sync inode failed for inode %lu!\n", inode->i_ino);
- BUG();
- }
- ocfs_truncate_inode_pages(inode, 0);
- spin_lock(&OCFS_I(inode)->ip_lock);
-
- /* truncate may send this */
- if (flags & FLAG_FILE_UPDATE_OIN)
- atomic_set(&OCFS_I(inode)->ip_needs_verification, 1);
- spin_unlock(&OCFS_I(inode)->ip_lock);
-
- /* Do we need this? */
- ocfs2_extent_map_drop(inode, 0);
- }
- return 0;
- }
-
- *vote_response = 0;
- if (vote_type == INVALID_REQUEST) {
- /* we catch INVALID_REQUEST up here now as we
- * don't want to do a change_master on a
- * messed up vote... */
- LOG_TRACE_STR("INVALID_REQUEST");
- return 1;
- }
- return 0;
-}
-
-
-
-
-static int ocfs_lock_busy(ocfs_super *osb, struct inode *inode, ocfs_lock_res *lockres)
-{
- /* requestor will need to retry if anyone is using the lockres */
- if (lockres->lock_holders > 0) {
- LOG_TRACE_PROCESS_VOTE("Lock id (%llu) has %u holders\n",
- OCFS_I(inode)->ip_blkno, lockres->lock_holders);
- // kick the commit thread
- atomic_set(&osb->flush_event_woken, 1);
- wake_up(&osb->flush_event);
-
- return 1;
- }
- return 0;
-}
-
-
-/* Returns: <0 if an I/O error occurred,
- * 1 if process_vote should vote immediately,
- * 0 if change master succeeded */
-
-static int ocfs_process_vote_change_master(ocfs_super *osb, int *vote_response, int *status, struct inode *inode,
- ocfs_lock_res *lockres, __s16 node_num, __u64 lock_id)
-{
- struct buffer_head *fe_bh = NULL;
-
- /* lockres is held with down_write throughout this call */
-
- LOG_TRACE_STR("CHANGE_MASTER");
- LOG_TRACE_PROCESS_VOTE("doing CHANGE_MASTER for this request\n");
-
- if (ocfs_lock_busy(osb, inode, lockres)) {
- *vote_response = FLAG_VOTE_UPDATE_RETRY;
- *status = 0;
- return 1;
- }
-
- /* this is currently a readonly EX lock.
- * need to communicate to all the nodes in the
- * map that lock will be changing to RW before we
- * continue. RETRY this request while we spawn
- * off a thread to collect up the communication */
- if (!ocfs_node_map_is_empty(&lockres->readonly_map)) {
- // assumption: node asking for vote has already dropped readonly
- ocfs_node_map_clear_bit(&lockres->readonly_map, node_num);
- // should not be in there, but...
- ocfs_node_map_clear_bit(&lockres->readonly_map, osb->node_num);
- if (!ocfs_node_map_is_empty(&lockres->readonly_map)) {
- OCFS_ASSERT(test_bit(LOCK_STATE_READONLY, &lockres->readonly_state) &&
- lockres->master_node_num == osb->node_num);
- OCFS_ASSERT(inode);
- *status = ocfs_drop_readonly_cache_lock(osb, inode, 1);
- if (*status < 0)
- LOG_ERROR_STATUS(*status);
- LOG_TRACE_PROCESS_VOTE("node map not empty on RO drop request\n");
- *vote_response = FLAG_VOTE_UPDATE_RETRY;
- // did not change master, send response
- return 1;
- }
- // noone left in map, so continue
- clear_bit(LOCK_STATE_READONLY, &lockres->readonly_state);
- }
-
- sync_mapping_buffers(inode->i_mapping);
-
-#warning do we need a truncate_inode_pages here?
- if (lockres->master_node_num != osb->node_num) {
- printk("are we giving away a lock we don't own!?! "
- "inode %llu\n", OCFS_I(inode)->ip_blkno);
- BUG();
- }
-
- *status = ocfs_read_block(osb, lock_id, &fe_bh, OCFS_BH_CACHED, inode);
- if (*status < 0) {
- LOG_ERROR_STATUS ((*status));
- return *status;
- }
- lockres->master_node_num = node_num;
- lockres->lock_type = OCFS_LKM_NLMODE;
- ocfs_update_disk_lock(osb, fe_bh, inode);
- brelse(fe_bh);
- *vote_response = FLAG_VOTE_NODE;
- *status = 0;
-
- // master successfully changed
- return 0;
-}
-
-
-
-/* Returns: 1 if process_vote should vote immediately,
- * 0 on success */
-
-/* we can't have any of these cases failing if the change master already succeeded */
-static int ocfs_process_vote_post_change_master(ocfs_super *osb, int vote_type, int flags, int *vote_response, struct inode *inode, ocfs_lock_res *lockres, int *status, __s16 node_num, int *inc_seq)
-{
- switch (vote_type) {
- case TRUNCATE_PAGES:
- case CHANGE_MASTER:
- /* we dealt with this all above. */
- break;
-
- case UPDATE_OIN_INODE:
- LOG_TRACE_STR("UPDATE_OIN_INODE");
- atomic_set(&OCFS_I(inode)->ip_needs_verification, 1);
- *vote_response = FLAG_VOTE_OIN_UPDATED;
- break;
-
- case RELEASE_DENTRY:
- OCFS_ASSERT(inode);
-
- /* we always vote yes on this one. */
- *vote_response = FLAG_VOTE_NODE;
-
- /* do nothing in the release case... hmm,
- * perhaps we should just do a verify_update
- * or something in case the guy aborted... */
- if (flags & FLAG_RELEASE_LOCK)
- break;
-
- d_prune_aliases (inode);
-
- /* for rename, we don't drop link counts */
- if (!(flags & FLAG_FILE_RENAME)) {
- if (S_ISDIR(inode->i_mode))
- inode->i_nlink = 0;
- else
- inode->i_nlink--;
- }
-
- LOG_TRACE_ARGS("pruned dentries for inode %lu, nlink = %u\n",
- inode->i_ino, inode->i_nlink);
- break;
-
- case DELETE_ACQUIRE:
- LOG_TRACE_STR("DELETE_ACQUIRE (part two)");
- /* If we got this far, then we assume we've
- * done the 1st part of the DELETE_ACQUIRE
- * case and we just have to commit it. */
- if (*vote_response != FLAG_VOTE_NODE)
- BUG();
-
- ocfs_commit_inode_delete(inode);
- break;
-
- case READONLY:
- LOG_TRACE_STR("READONLY");
- // WRITELOCK
- OCFS_ASSERT(!(test_bit(LOCK_STATE_READONLY, &lockres->readonly_state)) ||
- lockres->master_node_num == osb->node_num);
-
- if (ocfs_lock_busy(osb, inode, lockres)) {
- *vote_response = FLAG_VOTE_UPDATE_RETRY;
- *status = 0;
- return 1;
- }
-
- // if the requestor just wants to do readonly, we
- // drop our buffers, so switch to readonly and done
- sync_mapping_buffers(inode->i_mapping);
-
- ocfs_node_map_set_bit(&lockres->readonly_map, node_num);
- set_bit(LOCK_STATE_READONLY, &lockres->readonly_state);
- *vote_response = FLAG_VOTE_NODE;
- *status = 0;
- break;
-
- case DROP_READONLY:
- /* TODO: may need locking in here to lock out
- * the actual IO that a readdir may have in
- * progress, if it's possible to have a corrupt
- * readdir. for now, skip it.
- * NOTE: can't just take io_sem because lock order
- * needs to be io_sem->lockres... would have to
- * drop lockres, take io_sem, take lockres, then
- * recheck all the conditions to see if still
- * appropriate, then do the work and drop both.
- * seems like a lot of work. almost as many lines
- * of code as there are lines of comments right here.
- */
-
- /* this path should always succeed on the vote *
- * even in the error case. do nothing for error. */
-
- // WRITELOCK
- if (lockres->master_node_num != node_num ||
- lockres->lock_type != OCFS_LKM_EXMODE ||
- !ocfs_node_map_is_empty(&lockres->readonly_map))
- LOG_ERROR_ARGS("(drop-ro) master=%d node_num=%d locktype=%d readonly=%s\n",
- lockres->master_node_num, node_num, lockres->lock_type,
- test_bit(LOCK_STATE_READONLY, &lockres->readonly_state) ? "yes" : "no");
- else
- clear_bit(LOCK_STATE_READONLY, &lockres->readonly_state);
-
- *status = 0;
- *vote_response = FLAG_VOTE_NODE;
- *inc_seq = 1;
- break;
-
- case NOT_MASTER:
- LOG_TRACE_STR("NOT_MASTER");
- *vote_response = FLAG_VOTE_UPDATE_RETRY;
- if (flags & (FLAG_FILE_EXTEND|FLAG_FILE_TRUNCATE) &&
- lockres->master_node_num == node_num) {
- LOG_TRACE_STR("owner is requesting extend/truncate");
- *vote_response = FLAG_VOTE_NODE;
- }
- break;
-
- case REMASTER_THIS:
- LOG_TRACE_STR("REMASTER_THIS");
- *vote_response = FLAG_VOTE_UPDATE_RETRY;
- break;
-
- case REMASTER_REQUESTOR:
- LOG_TRACE_STR("REMASTER_REQUESTOR");
- *vote_response = FLAG_VOTE_NODE;
- break;
-
- case INVALID_REQUEST:
- default:
- LOG_TRACE_STR("INVALID_REQUEST");
- *vote_response = 0;
- break;
- }
- return 0;
-}
-
-
-/* inode is definitely non NULL */
-void ocfs_inc_inode_seq(ocfs_super *osb, struct inode *inode)
-{
- atomic_t *seq = GET_INODE_CLEAN_SEQ(inode);
-
- LOG_TRACE_ARGS("incrementing inode seq... current is %d\n",
- atomic_read(seq));
-
- /* wrap to ONE after 13 bits, will need a spinlock */
- spin_lock (&osb->clean_buffer_lock);
- if ((atomic_read(&osb->clean_buffer_seq)+1) % STATE_BIT_MAX == 0)
- atomic_set(&osb->clean_buffer_seq, 1);
- else
- atomic_inc(&osb->clean_buffer_seq);
- spin_unlock (&osb->clean_buffer_lock);
-
- /* doesn't matter if this another process */
- /* has already incremented the global seq */
- atomic_set(seq, atomic_read(&osb->clean_buffer_seq));
-
- LOG_TRACE_ARGS("done incrementing inode seq... new is %d\n",
- atomic_read(seq));
-}
-
-
-void ocfs_recover_oin_locks(ocfs_super *osb, __u32 node_num)
-{
- struct list_head *iter, *temp;
- struct inode *inode;
- ocfs_inode_private *i;
-
- LOG_ENTRY_ARGS("(node_num = %u)\n", node_num);
-
-start:
- down(&recovery_list_sem);
- list_for_each_safe (iter, temp, &osb->lock_recovery_lists[node_num]) {
- i = list_entry (iter, ocfs_inode_private, ip_recovery_list);
-
- inode = i->ip_inode;
- spin_lock(&oin_num_ext_lock);
-
- if (OCFS_I(inode)->ip_num_extends) {
- OCFS_I(inode)->ip_num_extends = 0;
- list_del(&OCFS_I(inode)->ip_recovery_list);
- INIT_LIST_HEAD(&OCFS_I(inode)->ip_recovery_list);
- up_write(&OCFS_I(inode)->ip_node_extend_sem);
-
- spin_unlock(&oin_num_ext_lock);
- up (&recovery_list_sem);
- iput(inode);
- goto start;
- } else
- LOG_ERROR_STR("oin is in recovery list, but has zero extend counter value!");
-
- spin_unlock(&oin_num_ext_lock);
- }
-
- up (&recovery_list_sem);
-
- LOG_EXIT();
-}
-
-static int _ocfs_drop_readonly_cache_lock_thread(void *arg);
-
-/* inode is definitely non NULL */
-int ocfs_drop_readonly_cache_lock(ocfs_super *osb, struct inode *inode, int yield)
-{
- ocfs_ro_cache_drop_ctxt *arg;
- int status = 0;
- ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
-
- if (test_bit(LOCK_STATE_READONLY_DROPPING, &lockres->readonly_state)) {
- // if coming from process_vote, go about our merry way
- if (yield)
- return 0;
- // if coming from acquire_lock, we are holding the
- // lockres and the rodrop thread needs it.
- // return -EAGAIN to drop and try again.
- return -EAGAIN;
- }
-
- arg = kmalloc(sizeof(ocfs_ro_cache_drop_ctxt), GFP_KERNEL);
- if (arg == NULL)
- return -ENOMEM;
-
- atomic_inc(&inode->i_count);
- arg->osb = osb;
- arg->lockres = lockres;
- arg->inode = inode;
- arg->yield = yield;
-
- if (yield)
- kernel_thread(_ocfs_drop_readonly_cache_lock_thread, arg,
- CLONE_VM | CLONE_FS | CLONE_FILES);
- else
- status = _ocfs_drop_readonly_cache_lock(arg);
-
- return status;
-}
-
-static int _ocfs_drop_readonly_cache_lock(void *arg)
-{
- ocfs_ro_cache_drop_ctxt *ctxt = arg;
- ocfs_super *osb = ctxt->osb;
- ocfs_lock_res *lockres = ctxt->lockres;
- struct inode *inode = ctxt->inode;
- int status = 0;
- int yield = ctxt->yield;
-
- /* this will wait until process_vote gets to the release */
- if (yield)
- ocfs_acquire_lockres_write(inode);
- /* check these under the lock */
- if (!(test_bit(LOCK_STATE_READONLY, &lockres->readonly_state)) ||
- lockres->master_node_num != osb->node_num ||
- lockres->lock_type != OCFS_LKM_EXMODE) {
- LOG_ERROR_ARGS("inode %llu: bad RO lockres! this=%d, readonly=%s, master=%d, locktype=%u\n", OCFS_I(inode)->ip_blkno,
- osb->node_num,
- test_bit(LOCK_STATE_READONLY, &lockres->readonly_state) ? "yes" : "no",
- lockres->master_node_num, lockres->lock_type);
- status = -EINVAL;
- goto leave;
- }
-
- if (test_bit(LOCK_STATE_READONLY_DROPPING, &lockres->readonly_state)) {
- status = 0;
- goto leave;
- }
-
- set_bit(LOCK_STATE_READONLY_DROPPING, &lockres->readonly_state);
- /* remove this node */
- ocfs_node_map_clear_bit(&lockres->readonly_map, osb->node_num);
-
- status = 0;
- while (!ocfs_node_map_is_empty(&lockres->readonly_map)) {
- /* remove all dead nodes */
- ocfs_node_map_and(&lockres->readonly_map, &osb->publ_map);
- status = new_lock_function(osb, OCFS_LKM_EXMODE, FLAG_DROP_READONLY,
- NULL, inode);
- if (status == -EAGAIN) {
- status = 0;
- if (yield) {
- /* from nm thread, give some time to waiters */
- ocfs_release_lockres_write(inode);
- ocfs_sleep(50);
- ocfs_acquire_lockres_write(inode);
- }
- continue;
- }
- if (status < 0)
- LOG_ERROR_STATUS (status);
- break;
- }
-
- if (ocfs_node_map_is_empty(&lockres->readonly_map) &&
- test_bit(LOCK_STATE_READONLY, &lockres->readonly_state) &&
- lockres->master_node_num == osb->node_num)
- clear_bit(LOCK_STATE_READONLY, &lockres->readonly_state);
-
- clear_bit(LOCK_STATE_READONLY_DROPPING, &lockres->readonly_state);
-
-leave:
- if (yield)
- ocfs_release_lockres_write(inode); // ocfs_process_vote ocfs_acquire_lock
-
- if (inode)
- iput(inode);
- kfree(arg);
-
- return status;
-}
-
-static int _ocfs_drop_readonly_cache_lock_thread(void *arg)
-{
- int status = 0;
- siginfo_t info;
-
-#define OCFS_DROP_RO_THREAD_NAME "ocfs2dropro"
-
- ocfs_daemonize (OCFS_DROP_RO_THREAD_NAME,
- strlen(OCFS_DROP_RO_THREAD_NAME),
- 0);
- status = _ocfs_drop_readonly_cache_lock(arg);
-
- /* ignore the actual signal */
- if (signal_pending(current)) {
- dequeue_signal_lock(current, ¤t->blocked, &info);
- }
-
- /* Flush all scheduled tasks */
- flush_scheduled_work();
-
-#warning need a way to wait on all of these threads on dismount
-/*
- * The way to do this is to create a wait queue on the osb. When one of
- * these guys start, you bump a counter. When it ends, it decrements
- * the counter and wake_up()s the wait queue. The counter probably can
- * be protected by a spinlock on the OSB. The dismount handling just
- * waits on that wait queue until readonly_threads == 0.
- */
- return status;
-}
Deleted: trunk/src/nm.h
===================================================================
--- trunk/src/nm.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/nm.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -1,36 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * nm.h
- *
- * Function prototypes
- *
- * Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef OCFS2_NM_H
-#define OCFS2_NM_H
-
-int ocfs_drop_readonly_cache_lock(ocfs_super *osb, struct inode *inode,
- int yield);
-void ocfs_inc_inode_seq(ocfs_super *osb, struct inode *inode);
-int ocfs_process_vote (ocfs_super * osb, ocfs_dlm_msg *dlm_msg);
-int ocfs_recv_thread(void *unused);
-void ocfs_recover_oin_locks(ocfs_super *osb, __u32 node_num);
-
-#endif /* OCFS2_NM_H */
Modified: trunk/src/ocfs.h
===================================================================
--- trunk/src/ocfs.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/ocfs.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -42,37 +42,12 @@
# include <linux/tqueue.h>
#endif
-enum
-{
- OCFS_VOTE_REQUEST = 1,
- OCFS_VOTE_REPLY,
- OCFS_INFO_DISMOUNT
-};
+#include <cluster/util.h>
+#include <cluster/dlmcommon.h>
+#include <cluster/nodemanager.h>
+#include <cluster/tcp.h>
+#include <cluster/dlmmod.h>
-enum {
- DISK_VOTE,
- COMM_VOTE
-};
-
-enum {
- INVALID_REQUEST, // reply with a NO vote
- UPDATE_OIN_INODE, // update both oin and inode
- DELETE_ACQUIRE,// delete or rename acquire request
- DELETE_RELEASE,// delete or rename release request
- CHANGE_MASTER, // request to change master to requestor
- NOT_MASTER, // I am not master, retry
- REMASTER_THIS, // remaster lock to me
- REMASTER_REQUESTOR, // remaster lock to requestor
- DROP_READONLY, // RO cachelock needs to convert to RW
- READONLY, // a RW or RO cachelock, requesting RO
- RELEASE_DENTRY,
- TRUNCATE_PAGES // truncate page caches of a file
-};
-
-#define OCFS_MAX_DLM_PKT_SIZE 256
-#define OCFS_DLM_MAX_MSG_SIZE 256
-#define OCFS_DLM_MSG_MAGIC 0x79677083
-
/* convenience macro */
#define OCFS_ASSERT(x) do { if (!(x)) BUG(); } while (0)
@@ -88,134 +63,14 @@
#define OCFS_CURRENT_TIME ocfs_get_seconds(CURRENT_TIME)
#define OCFS_SET_INODE_TIME(i, x, y) (ocfs_get_seconds(i->x) = (y))
-
-#define MISS_COUNT_WARNING 20
-#define MISS_COUNT_EMERGENCY 40
-#define MISS_COUNT_NODE_DEAD 60
-
-/*
-** The following flag values reflect the operation to be performed
-** by ocfs_create_modify_file
-*/
-// FILEFLAG MASK
-#define FLAG_ACQUIRE_LOCK 0x00000001
-#define FLAG_RELEASE_LOCK 0x00000002
-#define FLAG_FILE_EXTEND 0x00000004
-#define FLAG_FILE_DELETE 0x00000008
-#define FLAG_FILE_RENAME 0x00000010
-#define FLAG_FILE_RECOVERY 0x00000020
-#define FLAG_FILE_UPDATE_OIN 0x00000040
-#define FLAG_RELEASE_DENTRY 0x00000080
-#define FLAG_CHANGE_MASTER 0x00000100
-#define FLAG_DIR 0x00000200
-#define FLAG_REMASTER 0x00000400
-#define FLAG_FAST_PATH_LOCK 0x00000800
-#define FLAG_TRUNCATE_PAGES 0x00001000
-#define FLAG_FILE_TRUNCATE 0x00002000
-#define FLAG_DROP_READONLY 0x00004000
-#define FLAG_READONLY 0x00008000
-#define FLAG_FILE_UNUSED01 0x00010000
-#define FLAG_FILE_UNUSED02 0x00020000
-#define FLAG_FILE_UNUSED03 0x00040000
-#define FLAG_FILE_UNUSED04 0x00080000
-#define FLAG_FILE_UNUSED05 0x00100000
-#define FLAG_FILE_UNUSED06 0x00200000
-#define FLAG_FILE_UNUSED07 0x00400000
-#define FLAG_FILE_UNUSED08 0x00800000
-#define FLAG_FILE_UNUSED09 0x01000000
-#define FLAG_FILE_UNUSED10 0x02000000
-#define FLAG_FILE_UNUSED11 0x04000000
-#define FLAG_FILE_UNUSED12 0x08000000
-#define FLAG_FILE_UNUSED13 0x10000000
-#define FLAG_FILE_UNUSED14 0x20000000
-#define FLAG_FILE_UNUSED15 0x40000000
-#define FLAG_FILE_UNUSED16 0x80000000
-
#define OCFS_MAX_OSB_ID 65536
-
-#define HEARTBEAT_METHOD_DISK (1)
-#define HEARTBEAT_METHOD_IPC (2)
-
-
-enum
-{
- LEFT_NO_OVERLAP,
- LEFT_ADJACENT,
- LEFT_OVERLAP,
- FULLY_CONTAINED,
- FULLY_CONTAINING,
- RIGHT_OVERLAP,
- RIGHT_ADJACENT,
- RIGHT_NO_OVERLAP
-};
-
-
-/*
-** Extents Defines
-*/
-
-typedef enum _ocfs_ext_flag {
- LOCAL_EXT = 1,
- NONLOCAL_EXT = 2
-} ocfs_ext_flag;
-
-/* The following are standard DLM lock types, of which we currently
- * only use a couple. */
-#define OCFS_LKM_NLMODE (0) /* null lock */
-#define OCFS_LKM_CRMODE (1) /* concurrent read */
-#define OCFS_LKM_CWMODE (2) /* concurrent write */
-#define OCFS_LKM_PRMODE (3) /* protected read */
-#define OCFS_LKM_PWMODE (4) /* protected write */
-#define OCFS_LKM_EXMODE (5) /* exclusive */
-
#define OCFS_INVALID_NODE_NUM -1
-/* lockres->lock_state bits */
-enum {
- LOCK_STATE_READONLY,
- LOCK_STATE_READONLY_DROPPING,
- LOCK_STATE_BLOCK_EXCLUSIVE,
- LOCK_STATE_BLOCK_READONLY
-};
-
-enum {
- NO_LOCK=0,
- READ_LOCK,
- WRITE_LOCK
-};
-
-
-
-/* osb->osb_flags flags */
-#define OCFS_OSB_FLAGS_BEING_DISMOUNTED (0x00000004)
-#define OCFS_OSB_FLAGS_SHUTDOWN (0x00000008)
-#define OCFS_OSB_FLAGS_INITIALIZED (0x00000020)
-
/* OcfsGlobalCtxt.flags flags */
#define OCFS_FLAG_GLBL_CTXT_RESOURCE_INITIALIZED (0x00000001)
#define OCFS_FLAG_MEM_LISTS_INITIALIZED (0x00000002)
-#define OCFS_FLAG_SHUTDOWN_VOL_THREAD (0x00000004)
-/*
-** Information on Publish sector of each node
-*/
-#define DISK_HBEAT_COMM_ON 20 /* in the order of 5 secs */
-#define DISK_HBEAT_NO_COMM 4 /* in the order of 1 sec */
-#define DISK_HBEAT_INVALID 0 /* in the order of 100ms */
-
-
-/*
-** Information on Vote sector of each node
-*/
-// VOTEFLAG MASK
-#define FLAG_VOTE_NODE 0x1
-#define FLAG_VOTE_OIN_UPDATED 0x2
-#define FLAG_VOTE_OIN_ALREADY_INUSE 0x4
-#define FLAG_VOTE_UPDATE_RETRY 0x8
-#define FLAG_VOTE_FILE_DEL 0x10
-
-
#define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGHUP) | \
sigmask(SIGINT) | sigmask(SIGQUIT))
@@ -223,21 +78,12 @@
#define OCFS_LINUX_MAX_FILE_SIZE 9223372036854775807LL
-#define OCFS_VOLCFG_LOCK_ITERATE (HZ/10) /* in jiffies */
-#define OCFS_VOLCFG_LOCK_TIME 1000 /* in ms */
-#define OCFS_VOLCFG_HDR_SECTORS 2 /* in sectors */
-#define OCFS_VOLCFG_NEWCFG_SECTORS 4 /* in sectors */
-
-#define OCFS_NM_HEARTBEAT_TIME 500 /* in ms */
-#define OCFS_HEARTBEAT_INIT 10 /* number of NM iterations to stabilize the publish map */
-
#ifndef O_DIRECT
#warning this depends on the architecture!
#define O_DIRECT 040000
#endif
-/* sm - ocfs 1.0 fails to set fe->sig for dirs */
#define IS_VALID_FILE_ENTRY(ptr) \
(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
#define IS_VALID_EXTENT_BLOCK(ptr) \
@@ -246,27 +92,9 @@
(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
/*
-** Macros
-*/
-#define OCFS_SET_FLAG(flag, value) ((flag) |= (value))
-#define OCFS_CLEAR_FLAG(flag, value) ((flag) &= ~(value))
-
-
-/*
** Structures...
*/
-
-static inline void *ocfs_malloc(size_t size)
-{
- void *p = kmalloc(size, GFP_KERNEL);
- if (p)
- memset(p, 0, size);
- return p;
-}
-
-
-
#define OCFS_NAME "OCFS2"
/* ioctl commands */
@@ -276,28 +104,10 @@
#define OCFS_SB(sb) ((ocfs_super *)OCFS_GENERIC_SB_MEMBER(sb))
#define OCFS2_SB(sb) ((ocfs_super *)OCFS_GENERIC_SB_MEMBER(sb))
-#define OCFS_IPC_DEFAULT_PORT 7001
-
-
-#define OCFS_IPC_DLM_VERSION 0x0201
-
-
-/* =========================================================== */
-
/* This totally sucks that we have to include these here
* FIXME: Make them seperately includable. */
#include "ocfs2_fs.h"
-#include "ocfs2_disk_dlm.h"
-typedef struct _BARF_BARF_BARF
-{
- char node_name[MAX_NODE_NAME_LENGTH];
- ocfs_guid guid;
- ocfs_ipc_config_info ipc_config;
-}
-BARF_BARF_BARF;
-
-
typedef struct _ocfs_super ocfs_super;
typedef struct _ocfs_lock_res ocfs_lock_res;
@@ -310,17 +120,6 @@
unsigned long map[BITS_TO_LONGS(OCFS_NODE_MAP_MAX_NODES)];
} ocfs_node_map;
-struct _ocfs_lock_res
-{
- __s16 master_node_num; /* Master Node */
- __u32 lock_holders;
- __u32 uncommitted_holders;
- __u8 lock_type;
- struct rw_semaphore lock;
- unsigned long readonly_state;
- ocfs_node_map readonly_map;
-};
-
struct _ocfs_journal_handle;
/* I hate our includes */
@@ -329,6 +128,65 @@
struct rb_root em_extents;
};
+enum ocfs2_ast_action {
+ OCFS2_AST_INVALID = 0,
+ OCFS2_AST_ATTACH,
+ OCFS2_AST_CONVERT,
+ OCFS2_AST_DOWNCONVERT,
+};
+
+/* actions for an unlockast function to take. */
+enum ocfs2_unlock_action {
+ OCFS2_UNLOCK_INVALID = 0,
+ OCFS2_UNLOCK_CANCEL_CONVERT,
+ OCFS2_UNLOCK_DROP_LOCK,
+};
+
+enum ocfs2_lock_type {
+ OCFS_TYPE_META = 0,
+ OCFS_TYPE_DATA,
+ OCFS_TYPE_SUPER,
+ OCFS_NUM_LOCK_TYPES
+};
+
+/* ocfs2_lock_res->l_flags flags. */
+#define OCFS2_LOCK_ATTACHED (0x00000001) /* have we initialized
+ * the lvb */
+#define OCFS2_LOCK_BUSY (0x00000002) /* we are currently in
+ * dlm_lock */
+#define OCFS2_LOCK_BLOCKED (0x00000004) /* blocked waiting to
+ * downconvert*/
+#define OCFS2_LOCK_LOCAL (0x00000008) /* newly created inode */
+#define OCFS2_LOCK_NEEDS_REFRESH (0x00000010)
+#define OCFS2_LOCK_REFRESHING (0x00000020)
+
+struct ocfs2_lock_res_ops;
+
+typedef struct _ocfs2_lock_res {
+ void *l_priv;
+ struct ocfs2_lock_res_ops *l_ops;
+ spinlock_t l_lock;
+
+ struct list_head l_blocked_list;
+
+ enum ocfs2_lock_type l_type;
+ int l_flags;
+ char *l_name;
+ int l_level;
+ unsigned int l_ro_holders;
+ unsigned int l_ex_holders;
+ dlm_lockstatus l_lksb;
+ u32 l_local_seq;
+
+ /* used from AST/BAST funcs. */
+ enum ocfs2_ast_action l_action;
+ enum ocfs2_unlock_action l_unlock_action;
+ int l_requested;
+ int l_blocking;
+
+ wait_queue_head_t l_event;
+} ocfs2_lock_res;
+
/* OCFS2 Inode Private Data */
typedef struct _ocfs_inode_private
{
@@ -338,19 +196,19 @@
u64 ip_blkno;
+ ocfs2_lock_res ip_meta_lockres;
+ ocfs2_lock_res ip_data_lockres;
+
/* protects allocation changes on this inode. */
struct rw_semaphore ip_alloc_sem;
/* These fields are protected by ip_lock */
spinlock_t ip_lock;
- __u32 ip_open_cnt;
+ u32 ip_open_cnt;
u32 ip_clusters;
u64 ip_mmu_private;
- __u32 ip_open_flags;
struct ocfs2_extent_map ip_map;
- atomic_t ip_needs_verification;
-
struct semaphore ip_io_sem;
/* Used by the journalling code to attach an inode to a
@@ -360,40 +218,21 @@
struct list_head ip_handle_list;
struct _ocfs_journal_handle *ip_handle;
- /* ip_node_extend_sem locks out extends on behalf of other nodes. */
- struct rw_semaphore ip_node_extend_sem;
-
- struct list_head ip_recovery_list;/* protected by recovery_list_sem */
- __s32 ip_num_extends; /* protected by oin_num_ext_lock */
-
atomic_t ip_clean_buffer_seq;
- __u32 ip_flags; /* see below */
+ u32 ip_flags; /* see below */
/* protected by recovery_lock. */
struct inode *ip_next_orphan;
- ocfs_lock_res ip_lockres;
- __u32 ip_dir_start_lookup;
+ u32 ip_dir_start_lookup;
- /* ip_pending_locks and ip_j_inode are protected by the
- * journals cmt_lock.
- * ip_pending_locks: disk locks for this inode which have to be
- * released once their transaction checkpoints
- * ip_j_inode: list_head for journal->committing_inodes. */
- struct list_head ip_pending_locks;
- struct list_head ip_j_inode;
-
- /* protected by trans_inc_lock, which transaction were we
- * created on? Zero if none. */
+ /* next two are protected by trans_inc_lock */
+ /* which transaction were we created on? Zero if none. */
unsigned long ip_created_trans;
+ /* last transaction we were a part of. */
+ unsigned long ip_last_trans;
} ocfs_inode_private;
-/* Eventually, the 'flags' and 'open_flags' fields need to be
- * merged. */
-/* open flags */
-#define OCFS_OIN_OPEN_FOR_DIRECTIO (0x00000001)
-#define OCFS_IN_FIRST_OPEN (0x00000002)
-
/* 'flags' flags. */
/* has this inode been deleted, either from this node or from another node. */
#define OCFS_INODE_DELETED 0x00000001
@@ -405,6 +244,9 @@
#define OCFS_INODE_SYSTEM_FILE 0x00000008
/* are we going to let another node deal with deletion of this inode? */
#define OCFS_INODE_SKIP_DELETE 0x00000010
+#define OCFS_INODE_IN_REFRESH 0x00000020
+#define OCFS_INODE_BITMAP 0x00000040
+#define OCFS_INODE_OPEN_DIRECT 0x00000080
#define OCFS_I(i) ((ocfs_inode_private *)(i->u.generic_ip))
@@ -430,14 +272,6 @@
}
ocfs_vol_state;
-typedef struct _ocfs_vol_node_map
-{
- __u64 time;
- __u32 miss_cnt;
- atomic_t dismount;
-}
-ocfs_vol_node_map;
-
typedef struct _ocfs_commit_task
{
struct completion c_complete;
@@ -469,6 +303,7 @@
} ocfs_alloc_stats;
struct _ocfs_journal;
+struct _ocfs2_slot_info;
/*
* ocfs_super
@@ -477,34 +312,27 @@
*/
struct _ocfs_super
{
- struct semaphore osb_res; /* resource to protect the ocfs_super */
struct list_head osb_next; /* list of ocfs_super(s) */
__u32 osb_id; /* id used by the proc interface */
- struct completion dlm_complete;
- struct task_struct *dlm_task;
ocfs_commit_task *commit;
- __u32 osb_flags;
- ocfs_node_map publ_map;
struct super_block *sb;
struct inode *root_inode;
struct inode *sys_root_inode;
struct inode *system_inodes[NUM_SYSTEM_INODES];
-
+
+ struct _ocfs2_slot_info *slot_info;
+
+ spinlock_t node_map_lock;
+ ocfs_node_map mounted_map;
+ ocfs_node_map recovery_map;
+ ocfs_node_map umount_map;
+
/* new */
u32 num_clusters;
u64 root_blkno;
u64 system_dir_blkno;
u64 bitmap_blkno;
u32 bitmap_cpg;
- u64 publish_blkno;
- u32 publish_blocks;
- u64 vote_blkno;
- u32 vote_blocks;
- u64 autoconfig_blkno;
- u32 autoconfig_blocks;
- u64 new_autoconfig_blkno;
- u32 new_autoconfig_blocks;
- u32 total_autoconfig_blocks;
u8 *uuid;
u8 *vol_label;
u64 first_cluster_group_blkno;
@@ -517,73 +345,69 @@
spinlock_t s_next_gen_lock;
u32 s_next_generation;
- ocfs_vol_node_map *vol_node_map;
- struct semaphore cfg_lock;
- BARF_BARF_BARF **node_cfg_info;
- __u64 cfg_seq_num;
- int cfg_initialized;
u16 max_nodes;
- u16 num_cfg_nodes;
u16 num_nodes;
s16 node_num;
+ s16 slot_num;
int reclaim_id; /* reclaim the original node number*/
- __u32 hbt;
int s_sectsize_bits;
int s_clustersize;
int s_clustersize_bits;
- int needs_flush;
struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */
atomic_t vol_state;
- struct semaphore orphan_recovery_lock;
struct semaphore recovery_lock;
- spinlock_t recovery_map_lock;
- ocfs_node_map recovery_map;
+ int recovery_launched;
int disable_recovery;
- atomic_t num_recovery_threads;
- struct timer_list lock_timer;
- atomic_t lock_stop;
- wait_queue_head_t lock_event;
- atomic_t lock_event_woken;
- struct semaphore comm_lock; /* protects ocfs_comm_process_vote_reply */
- atomic_t nm_init;
- wait_queue_head_t nm_init_event;
- __u32 prealloc_lock;
- struct buffer_head **autoconfig_bhs;
- struct semaphore publish_lock; /* protects r/w to publish sector */
- atomic_t node_req_vote; /* set when node's vote req pending */
- int publish_dirty;
- wait_queue_head_t flush_event;
- atomic_t flush_event_woken;
+ wait_queue_head_t checkpoint_event;
+ atomic_t needs_checkpoint;
struct _ocfs_journal *journal;
atomic_t clean_buffer_seq;
spinlock_t clean_buffer_lock;
- struct list_head *lock_recovery_lists;
- __u64 *last_publ_seq_num;
+
int have_local_alloc;
struct buffer_head *local_alloc_bh;
- __u8 check_mounted; /* tell nm to check mounted flag, protected by publish_lock*/
+
+ /* Next two fields are for local node slot recovery during
+ * mount. */
+ int dirty;
+ ocfs2_dinode *local_alloc_copy;
+
ocfs_dlm_stats net_reqst_stats; /* stats of netdlm vote requests */
ocfs_dlm_stats net_reply_stats; /* stats of netdlm vote reponses */
ocfs_alloc_stats alloc_stats;
char dev_str[20]; /* "major,minor" of the device */
- struct semaphore vote_sem; /* protects calls to ocfs_process_vote */
- struct list_head vote_obj_queue;
- spinlock_t vote_obj_queue_lock;
- unsigned long voting_ino; /* only safe from the process_vote pid */
- wait_queue_head_t open_event;
-};
-typedef struct _ocfs_comm_info
-{
- u16 ip_version; /* IP version in NBO */
- u16 ip_port; /* IP port in NBO */
- union {
- __u32 ip_addr4; /* IPv4 address in NBO */
- __u32 ip_addr6[4]; /* IPv6 address in NBO */
- } addr_u;
-} ocfs_comm_info;
+ char *group_name;
+ struct inode *group_inode;
+ dlm_ctxt *dlm;
+ ocfs2_lock_res super_lockres;
+ wait_queue_head_t recovery_event;
+
+ spinlock_t vote_task_lock;
+ struct task_struct *vote_task;
+ wait_queue_head_t vote_event;
+ atomic_t wake_vote_task;
+ int vote_exit;
+
+ struct list_head blocked_lock_list;
+ unsigned long blocked_lock_count;
+
+ struct list_head vote_list;
+ int vote_count;
+
+ struct completion vote_event_complete;
+ struct completion vote_event_init;
+
+ u32 net_key;
+ char *net_vote_buf;
+ char *net_response_buf;
+ spinlock_t net_response_lock;
+ unsigned int net_response_ids;
+ struct list_head net_response_list;
+};
+
typedef struct _ocfs_global_ctxt
{
struct semaphore global_res;
@@ -591,11 +415,8 @@
kmem_cache_t *inode_cache;
kmem_cache_t *lock_cache;
__u32 flags;
- __s16 pref_node_num; /* preferred... osb has the real one */
- ocfs_guid guid; /* uniquely identifies a node */
char *node_name; /* human readable node identification */
char *cluster_name; /* unused */
- ocfs_comm_info comm_info; /* ip address, etc for listener */
int comm_info_read; /* ipc info loaded from config file */
spinlock_t comm_seq_lock; /* protects comm_seq_num */
__u64 comm_seq_num; /* local node seq num used in ipcdlm */
@@ -605,160 +426,11 @@
}
ocfs_global_ctxt;
-typedef struct _ocfs_ipc_ctxt
-{
- __u32 dlm_msg_size;
- __u16 version;
- int init;
- struct socket *send_sock;
- struct socket *recv_sock;
- struct completion complete;
- struct task_struct *task;
-}
-ocfs_ipc_ctxt;
-
-
-extern ocfs_ipc_ctxt OcfsIpcCtxt;
-
-typedef struct _ocfs_ipc_dlm_config
-{
- __u16 version;
- __u32 msg_size;
- __u32 num_recv_threads;
-}
-ocfs_ipc_dlm_config;
-
/*
** Globals ...
*/
extern ocfs_global_ctxt OcfsGlobalCtxt;
-
-/*
- * DLM network stuff
- */
-typedef struct _ocfs_dlm_msg_hdr
-{
- __u64 lock_id;
- __u64 lock_seq_num;
- __u32 flags;
- __u8 odmh_pad[4];
- __u32 num_ident; /* number of identical messages, always >= 1 */
-} ocfs_dlm_msg_hdr;
-
-typedef ocfs_dlm_msg_hdr ocfs_dlm_req_master;
-
-typedef struct _ocfs_dlm_reply_master
-{
- ocfs_dlm_msg_hdr h;
- __u32 status;
-}
-ocfs_dlm_reply_master;
-
-typedef struct _ocfs_dlm_msg
-{
- __u32 magic;
- __u32 msg_len;
- __u8 vol_id[MAX_VOL_ID_LENGTH];
- __s16 src_node;
- __s16 dst_node;
- __u32 msg_type;
- __u32 check_sum;
- __u8 msg_buf[0];
-} ocfs_dlm_msg;
-
-typedef struct _ocfs_vote_obj
-{
- struct list_head list;
- wait_queue_head_t voted_event;
- atomic_t voted_event_woken;
- atomic_t refcount;
- spinlock_t lock;
- __u32 vote_state;
- __u32 req_lock_type;
- int vote_status;
- ocfs_node_map req_vote_map;
- ocfs_node_map got_vote_map;
- //ocfs_node_map tmp_openmap;
- __u64 seq_num;
- pid_t pid;
- ocfs_dlm_msg m;
-} ocfs_vote_obj;
-
-enum {
- VOTE_OBJ_STATE_UNSENT,
- VOTE_OBJ_STATE_SENT,
- VOTE_OBJ_STATE_PARTIAL_REPLY,
- VOTE_OBJ_STATE_FULL_REPLY,
- VOTE_OBJ_STATE_DESTROYING
-};
-
-
-
-typedef struct _ocfs_vote_obj_lookup_data ocfs_vote_obj_lookup_data;
-
-struct _ocfs_vote_obj_lookup_data
-{
- union {
- struct {
- __u64 seq_num;
- __u64 lock_id;
- } s;
- struct {
- char *page;
- int *len;
- int max;
- } proc;
- } u;
- int (*func) (ocfs_vote_obj *obj, struct _ocfs_vote_obj_lookup_data *data);
- ocfs_vote_obj **ret;
-};
-
-
-
-
-typedef struct _ocfs_recv_ctxt
-{
- __s32 msg_len;
- __u8 msg[OCFS_MAX_DLM_PKT_SIZE];
- int status;
- struct work_struct ipc_wq;
-}
-ocfs_recv_ctxt;
-
-typedef struct _ocfs_cfg_task
-{
- struct work_struct cfg_wq;
- ocfs_super *osb;
- __u64 lock_off;
- __u8 *buffer;
- struct buffer_head *bh;
-}
-ocfs_cfg_task;
-
-typedef enum _ocfs_volcfg_op
-{
- OCFS_VOLCFG_ADD,
- OCFS_VOLCFG_UPD
-}
-ocfs_volcfg_op;
-
-typedef struct _ocfs_vote_request_ctxt
-{
- __s16 node_num;
- int status;
- ocfs_dlm_msg *dlm_msg;
-} ocfs_vote_request_ctxt;
-
-typedef struct _ocfs_vote_reply_ctxt
-{
- int reply_method;
- int *status;
- ocfs_node_map *got_vote_map;
- __u32 flags;
- ocfs_dlm_reply_master *reply;
-} ocfs_vote_reply_ctxt;
-
struct ocfs_ioc
{
char name[255]; /* "OCFS" */
@@ -767,114 +439,11 @@
char nodename[255]; /* node name */
};
-/* timeout structure taken from Ben's aio.c */
-typedef struct _ocfs_timeout {
- struct timer_list timer;
- int timed_out;
- wait_queue_head_t wait;
-} ocfs_timeout;
-
#define NAMEI_RA_CHUNKS 2
#define NAMEI_RA_BLOCKS 4
#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
-#define __ocfs_wait(wq, condition, timeo, ret) \
-do { \
- ocfs_timeout __to; \
- \
- DECLARE_WAITQUEUE(__wait, current); \
- DECLARE_WAITQUEUE(__to_wait, current); \
- \
- ocfs_init_timeout(&__to); \
- \
- if (timeo) { \
- ocfs_set_timeout(&__to, timeo); \
- if (__to.timed_out) { \
- ocfs_clear_timeout(&__to); \
- } \
- } \
- \
- add_wait_queue(&wq, &__wait); \
- add_wait_queue(&__to.wait, &__to_wait); \
- do { \
- ret = 0; \
- set_current_state(TASK_INTERRUPTIBLE); \
- if (condition) \
- break; \
- ret = -ETIMEDOUT; \
- if (__to.timed_out) \
- break; \
- schedule(); \
- if (signal_pending(current)) { \
- ret = -EINTR; \
- break; \
- } \
- } while (1); \
- \
- set_current_state(TASK_RUNNING); \
- remove_wait_queue(&wq, &__wait); \
- remove_wait_queue(&__to.wait, &__to_wait); \
- \
- if (timeo) \
- ocfs_clear_timeout(&__to); \
- \
-} while(0)
-
-#define __ocfs_wait_uninterruptible(wq, condition, timeo, ret) \
-do { \
- ocfs_timeout __to; \
- \
- DECLARE_WAITQUEUE(__wait, current); \
- DECLARE_WAITQUEUE(__to_wait, current); \
- \
- ocfs_init_timeout(&__to); \
- \
- if (timeo) { \
- ocfs_set_timeout(&__to, timeo); \
- if (__to.timed_out) { \
- ocfs_clear_timeout(&__to); \
- } \
- } \
- \
- add_wait_queue(&wq, &__wait); \
- add_wait_queue(&__to.wait, &__to_wait); \
- do { \
- ret = 0; \
- set_current_state(TASK_UNINTERRUPTIBLE); \
- if (condition) \
- break; \
- ret = -ETIMEDOUT; \
- if (__to.timed_out) \
- break; \
- schedule(); \
- } while (1); \
- \
- set_current_state(TASK_RUNNING); \
- remove_wait_queue(&wq, &__wait); \
- remove_wait_queue(&__to.wait, &__to_wait); \
- \
- if (timeo) \
- ocfs_clear_timeout(&__to); \
- \
-} while(0)
-
-#define ocfs_wait(wq, condition, timeout) \
-({ \
- int __ret = 0; \
- if (!(condition)) \
- __ocfs_wait(wq, condition, timeout, __ret); \
- __ret; \
-})
-
-#define ocfs_wait_uninterruptible(wq, condition, timeout) \
-({ \
- int __ret = 0; \
- if (!(condition)) \
- __ocfs_wait_uninterruptible(wq, condition, timeout, __ret); \
- __ret; \
-})
-
static inline unsigned long ino_from_blkno(struct super_block *sb,
u64 blkno)
{
@@ -900,15 +469,6 @@
return -EINVAL;
}
-static inline int ocfs_is_local_cache_lock(ocfs_super *osb, struct inode *inode)
-{
- ocfs_lock_res *lockres = GET_INODE_LOCKRES(inode);
- if (lockres->lock_type == OCFS_LKM_EXMODE &&
- lockres->master_node_num == osb->node_num)
- return 1;
- return 0;
-}
-
typedef struct _ocfs_journal_handle ocfs_journal_handle;
#endif /* !OCFS_H */
Modified: trunk/src/ocfs1_fs_compat.h
===================================================================
--- trunk/src/ocfs1_fs_compat.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/ocfs1_fs_compat.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -33,6 +33,7 @@
#define MAX_VOL_ID_LENGTH_V1 16
#define MAX_VOL_LABEL_LEN_V1 64
#define MAX_CLUSTER_NAME_LEN_V1 64
+#define MAX_NODE_NAME_LENGTH 32
#define OCFS1_MAJOR_VERSION (2)
#define OCFS1_MINOR_VERSION (0)
Modified: trunk/src/ocfs2.h
===================================================================
--- trunk/src/ocfs2.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/ocfs2.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -26,6 +26,8 @@
#ifndef OCFS2_H
#define OCFS2_H
+#define OCFS2_MAX_NODE_NAME_LENGTH 65
+
static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb,
u32 clusters)
{
Deleted: trunk/src/ocfs2_disk_dlm.h
===================================================================
--- trunk/src/ocfs2_disk_dlm.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/ocfs2_disk_dlm.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -1,130 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ocfs2_disk_dlm.h
- *
- * On-disk structures involved in disk publish/vote for OCFS2.
- *
- * Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License, version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef _OCFS2_DISK_DLM_H
-#define _OCFS2_DISK_DLM_H
-
-/*
- * On-disk IPC configuration for an OCFS2 node.
- */
-typedef struct _ocfs_ipc_config_info
-{
-/*00*/ __u16 ip_version; /* IP version in NBO */
- __u16 ip_port; /* IP port in NBO */
- __u32 ip_reserved1;
- __u64 ip_reserved2;
-/*10*/ union {
- __u32 ip_addr4; /* IPv4 address in NBO */
- __u32 ip_addr6[4]; /* IPv6 address in NBO */
- } addr_u;
-/*20*/
-} ocfs_ipc_config_info;
-
-/*
- * On-disk structure representing a Global Unique ID for an OCFS2 node.
- *
- * The GUID has two parts. The host_id is a generally-randomly-unique
- * hex-as-ascii string of 20 characters (10 bytes). The mad_id field
- * is, unsurprisingly, the MAC address of the network card that the
- * IPC mechanism will be using (the address in
- * ocfs_ipc_config_info.addr_u). This should (ha-ha) provide a unique
- * identifier for a node in the OCFS2 cluster. It has the added
- * benefit of detecting when a node has changed network cards
- * (host_id is the same, mac_id has changed) or when an identical
- * mac address is on a different mode (the converse).
- */
-typedef union _ocfs_guid
-{
-/*00*/ struct
- {
- char host_id[OCFS2_GUID_HOSTID_LEN];
- char mac_id[OCFS2_GUID_MACID_LEN];
- } id;
- __u8 guid[OCFS2_GUID_LEN];
-/*20*/
-} ocfs_guid;
-
-/*
- * On-disk configuration information for an OCFS2 node. A node
- * populates its own info for other nodes to read and use.
- */
-typedef struct _ocfs_node_config_info
-{
-/*00*/ ocfs2_disk_lock disk_lock; /* Lock on the info */
-/*30*/ ocfs_guid guid; /* GUID */
-/*50*/ ocfs_ipc_config_info ipc_config; /* IPC info */
-/*70*/ __u8 node_name[MAX_NODE_NAME_LENGTH+1]; /* Name */
-/*91*/ __u8 name_pad[7]; /* Pad to align (UGH) */
-/*98*/
-} ocfs_node_config_info;
-
-/*
- * On-disk ... for OCFS2. FIXME this description.
- */
-typedef struct _ocfs_node_config_hdr
-{
-/*00*/ ocfs2_disk_lock disk_lock;
-/*30*/ __u8 signature[OCFS2_NODE_CONFIG_SIGN_LEN];
- __u32 version;
- __u16 num_nodes;
- __u16 reserved1;
-/*40*/ __u32 last_node;
- __u32 onch_pad;
- __u64 cfg_seq_num;
-/*50*/
-} ocfs_node_config_hdr;
-
-/*
- * On-disk lock / state change request for OCFS2.
- */
-typedef struct _ocfs_publish
-{
-/*00*/ __u64 time; /* Time of publish */
- __s32 vote_UNUSED;
- __u32 dirty; /* Is the node in a clean state */
-/*10*/ __u32 vote_type_UNUSED; /* Type required */
- __u32 mounted; /* Does the publisher have it mounted */
-/*18*/ __u32 vote_map_UNUSED[8]; /* Who needs to vote */
-/*38*/ __u64 reserved1;
-/*50*/ __u64 publ_seq_num_UNUSED; /* Sequence for vote */
- __u64 lock_id_UNUSED; /* Lock vote is requested for */
- /* last seq num used in comm voting */
-/*60*/ __u64 comm_seq_num;
- __u32 num_ident;
-/*72*/
-} ocfs_publish;
-
-typedef struct _ocfs_vote
-{
-/*00*/ __u8 type_UNUSED; /* Vote type */
- __u8 node_UNUSED; /* Node voting */
- __u8 reserved1[30]; /* used to be vote[32] */
-/*20*/ __u64 vote_seq_num_UNUSED; /* Vote sequence */
- __u64 lock_id_UNUSED; /* Lock being voted on */
-/*30*/ __u8 open_handle_UNUSED;/* Does the voter have it open */
- __u8 ov_pad[7];
-/*38*/
-} ocfs_vote;
-
-#endif /* _OCFS2_DISK_DLM_H */
Modified: trunk/src/ocfs2_fs.h
===================================================================
--- trunk/src/ocfs2_fs.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/ocfs2_fs.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -96,6 +96,11 @@
#define OCFS2_DLM_FL (0x00000200) /* DLM area */
#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
+/*
+ * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
+ */
+#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */
+
/* Limit of space in ocfs2_dir_entry */
#define OCFS2_MAX_FILENAME_LENGTH 255
@@ -115,8 +120,9 @@
enum {
BAD_BLOCK_SYSTEM_INODE = 0,
GLOBAL_INODE_ALLOC_SYSTEM_INODE,
+ SLOT_MAP_SYSTEM_INODE,
+#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
DLM_SYSTEM_INODE,
-#define OCFS2_FIRST_ONLINE_SYSTEM_INODE DLM_SYSTEM_INODE
GLOBAL_BITMAP_SYSTEM_INODE,
ORPHAN_DIR_SYSTEM_INODE,
#define OCFS2_LAST_GLOBAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE
@@ -134,6 +140,7 @@
[GLOBAL_INODE_ALLOC_SYSTEM_INODE] "global_inode_alloc",
/* These are used by the running filesystem */
+ [SLOT_MAP_SYSTEM_INODE] "slot_map",
[DLM_SYSTEM_INODE] "dlm",
[GLOBAL_BITMAP_SYSTEM_INODE] "global_bitmap",
[ORPHAN_DIR_SYSTEM_INODE] "orphan_dir",
@@ -191,7 +198,6 @@
* Convenience casts
*/
#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super))
-#define DISK_LOCK(dinode) (&((dinode)->i_disk_lock))
#define LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab))
/* TODO: change these? */
@@ -200,8 +206,6 @@
#define OCFS2_NODE_CONFIG_VER 2
#define OCFS2_NODE_MIN_SUPPORTED_VER 2
-#define MAX_NODE_NAME_LENGTH 32
-
#define OCFS2_GUID_HOSTID_LEN 20
#define OCFS2_GUID_MACID_LEN 12
#define OCFS2_GUID_LEN (OCFS2_GUID_HOSTID_LEN + OCFS2_GUID_MACID_LEN)
@@ -280,17 +284,6 @@
} ocfs2_extent_block;
/*
- * On disk lock structure for OCFS2
- */
-typedef struct _ocfs2_disk_lock
-{
-/*00*/ __s16 dl_master; /* Node number of current master */
- __u8 dl_level; /* Lock level */
- __u8 dl_reserved1;
-/*04*/
-} ocfs2_disk_lock;
-
-/*
* On disk superblock for OCFS2
* Note that it is contained inside an ocfs2_dinode, so all offsets
* are relative to the start of ocfs2_dinode.id2.
@@ -349,7 +342,7 @@
belongs to */
__u16 i_suballoc_bit; /* Bit offset in suballocater
block group */
-/*10*/ ocfs2_disk_lock i_disk_lock; /* Lock structure */
+ __u32 i_reserved0;
/*14*/ __u32 i_clusters; /* Cluster count */
/*18*/ __u32 i_uid; /* Owner UID */
__u32 i_gid; /* Owning GID */
@@ -365,8 +358,8 @@
__u64 i_last_eb_blk; /* Pointer to last extent
block */
/*60*/ __u32 i_fs_generation; /* Generation per fs-instance */
- __u32 i_reserved0; /* Generation per fs-instance */
-/*68*/ __u64 i_reserved1[10];
+ __u32 i_reserved1; /* Generation per fs-instance */
+/*68*/ __u64 i_reserved2[10];
/*B8*/ union {
__u64 i_pad1; /* Generic way to refer to this
64bit union */
@@ -379,6 +372,11 @@
__u32 i_total; /* Total bits (clusters)
available */
} bitmap1;
+ struct { /* Info for journal system
+ inodes */
+ __u32 i_flags; /* Mounted, version, etc. */
+ __u32 i_j_pad;
+ } journal1;
} id1; /* Inode type dependant 1 */
/*C0*/ union {
ocfs2_super_block i_super;
Modified: trunk/src/ocfs_journal.h
===================================================================
--- trunk/src/ocfs_journal.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/ocfs_journal.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -30,7 +30,7 @@
#include <linux/jbd.h>
#define OCFS_JOURNAL_CURRENT_VERSION 1
-#define OCFS_CHECKPOINT_INTERVAL 8000
+#define OCFS_CHECKPOINT_INTERVAL (8 * HZ)
enum ocfs_journal_state {
OCFS_JOURNAL_FREE = 0,
@@ -59,7 +59,6 @@
* which we usually run
* from (recovery,
* etc) */
- __u32 node_num; /* Whose journal are we? */
struct buffer_head *lockbh; /* Journal disk lock, used
to access file entry */
atomic_t num_trans; /* Number of transactions
@@ -67,58 +66,51 @@
unsigned long trans_id;
/* locking order: trans_lock -> cmt_lock */
spinlock_t cmt_lock; /* protects the committed list */
- atomic_t num_cmt_locks; /* number of delayed
- * locks */
- atomic_t num_chkpt_locks;
struct rw_semaphore trans_barrier;
-
- struct list_head committing_inodes; /* list of all
- * inodes that
- * have committed
- * and are
- * awaiting a
- * checkpoint. Protected
- * by cmt_lock. */
- struct list_head checkpointing_locks; /* locks
- * pending release
- * after a checkpoint
- * -- this variable
- * is unlocked as
- * commit_thread is
- * the only guy who
- * looks at it! */
};
extern spinlock_t trans_inc_lock;
/* wrap trans_id so we never have it equal to zero. */
-static inline void ocfs_inc_trans_id(ocfs_journal *j)
+static inline unsigned long ocfs_inc_trans_id(ocfs_journal *j)
{
+ unsigned long old_id;
spin_lock(&trans_inc_lock);
- j->trans_id++;
+ old_id = j->trans_id++;
if (!j->trans_id)
j->trans_id = 1;
spin_unlock(&trans_inc_lock);
+ return old_id;
}
-static inline int ocfs_trans_checkpointed(ocfs_journal *j,
- unsigned long trans_id)
+static inline void ocfs_set_inode_lock_trans(ocfs_journal *journal,
+ struct inode *inode)
{
+ spin_lock(&trans_inc_lock);
+ OCFS_I(inode)->ip_last_trans = journal->trans_id;
+ spin_unlock(&trans_inc_lock);
+}
+
+/* Used to figure out whether it's safe to drop a metadata lock on an
+ * inode. Returns true if all the inode's changes have been
+ * checkpointed to disk. You should be holding the spinlock on the
+ * metadata lock while calling this to be sure that nobody can take
+ * the lock and put it on another transaction. */
+static inline int ocfs_inode_fully_checkpointed(struct inode *inode)
+{
int ret;
+ ocfs_journal *journal = OCFS2_SB(inode->i_sb)->journal;
+
spin_lock(&trans_inc_lock);
- ret = time_after(trans_id, j->trans_id);
+ ret = time_after(journal->trans_id, OCFS_I(inode)->ip_last_trans);
spin_unlock(&trans_inc_lock);
return ret;
}
-/* convenience function to check if an inode has been checkpointed
- * yet. Replaces ocfs_journal_new_file_search. Will do you a favor and
- * set created_trans = 0 when you've been checkpointed.
- * returns '1' if the inode hasn't been checkpointed yet.
- *
- */
-static inline int ocfs_inode_is_new(ocfs_super *osb,
- struct inode *inode)
+/* convenience function to check if an inode is still new (has never
+ * hit disk). Will do you a favor and set created_trans = 0 when you've
+ * been checkpointed. Returns '1' if the inode is still new. */
+static inline int ocfs_inode_is_new(struct inode *inode)
{
int ret;
@@ -126,10 +118,10 @@
* mkfs. This helps us early during mount, before we have the
* journal open and trans_id could be junk. */
if (OCFS_I(inode)->ip_flags & OCFS_INODE_SYSTEM_FILE)
- return(0);
+ return 0;
spin_lock(&trans_inc_lock);
- ret = !(time_after(osb->journal->trans_id,
- OCFS_I(inode)->ip_created_trans));
+ ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->trans_id,
+ OCFS_I(inode)->ip_created_trans));
if (!ret)
OCFS_I(inode)->ip_created_trans = 0;
spin_unlock(&trans_inc_lock);
@@ -146,20 +138,8 @@
typedef struct _ocfs_journal_lock ocfs_journal_lock;
struct _ocfs_journal_lock {
- /* release_lock arguments. */
- __u32 type;
- __u32 flags;
- struct inode *inode;
- unsigned int num_ident;
-
- /* used by commit_cache */
- unsigned int drop_holders;
- /* lock_list: we are either on
- * - handle->locks: if still running
- * - inode->ip_pending_locks: if waiting for checkpoint
- * - journal->checkpointing_locks: awaiting release after checkpoint
- */
- struct list_head lock_list;
+ struct inode *jl_inode;
+ struct list_head jl_lock_list;
};
struct _ocfs_journal_handle {
@@ -198,7 +178,7 @@
/*
* Journal Control:
- * Initialize, Load, Shutdown, Wipe, Create a journal.
+ * Initialize, Load, Shutdown, Wipe a journal.
*
* ocfs_journal_init - Initialize journal structures in the OSB.
* ocfs_journal_load - Load the given journal off disk. Replay it if
@@ -208,12 +188,21 @@
* ocfs_journal_wipe - Wipe transactions from a journal. Optionally
* zero out each block.
* ocfs_recovery_thread - Perform recovery on a node. osb is our own osb.
+ * ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat
+ * event on.
+ * ocfs_start_checkpoint - Kick the commit thread to do a checkpoint.
*/
-int ocfs_journal_init(struct _ocfs_super *osb);
+int ocfs_journal_init(struct _ocfs_super *osb, int *dirty);
void ocfs_journal_shutdown(struct _ocfs_super *osb);
int ocfs_journal_wipe(ocfs_journal *journal, int full);
int ocfs_journal_load(ocfs_journal *journal);
void ocfs_recovery_thread(struct _ocfs_super *osb, int node_num);
+int ocfs2_mark_dead_nodes(ocfs_super *osb);
+static inline void ocfs_start_checkpoint(struct _ocfs_super *osb)
+{
+ atomic_set(&osb->needs_checkpoint, 1);
+ wake_up(&osb->checkpoint_event);
+}
/*
* Transaction Handling:
@@ -292,10 +281,8 @@
*/
int ocfs_journal_dirty(ocfs_journal_handle *handle,
struct buffer_head *bh);
-void ocfs_handle_add_lock(ocfs_journal_handle *handle,
- __u32 type,
- __u32 flags,
- struct inode *inode);
+int ocfs_handle_add_lock(ocfs_journal_handle *handle,
+ struct inode *inode);
/*
* Use this to protect from other processes reading buffer state while
* it's in flight.
Modified: trunk/src/ocfs_log.h
===================================================================
--- trunk/src/ocfs_log.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/ocfs_log.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -100,29 +100,29 @@
#define OCFS_DEBUG_CONTEXT_ALLOC 0x00000001 /* alloc.c */
#define OCFS_DEBUG_CONTEXT_DIR 0x00000002 /* dir.c */
#define OCFS_DEBUG_CONTEXT_EXTMAP 0x00000004 /* extmap.c */
-#define OCFS_DEBUG_CONTEXT_HEARTBEAT 0x00000008 /* hearbeat.c */
+#define OCFS_DEBUG_CONTEXT_SLOTMAP 0x00000008 /* slotmap.c */
#define OCFS_DEBUG_CONTEXT_IOCTL 0x00000010 /* ioctl.c */
-#define OCFS_DEBUG_CONTEXT_NM 0x00000020 /* nm.c */
+#define OCFS_DEBUG_CONTEXT_VOTE 0x00000020 /* vote.c */
#define OCFS_DEBUG_CONTEXT_PROC 0x00000040 /* proc.c */
#define OCFS_DEBUG_CONTEXT_SYMLINK 0x00000080 /* symlink.c */
#define OCFS_DEBUG_CONTEXT_BITMAP 0x00000100 /* bitmap.c */
#define OCFS_DEBUG_CONTEXT_FILE 0x00000200 /* file.c */
#define OCFS_DEBUG_CONTEXT_INODE 0x00000400 /* inode.c */
#define OCFS_DEBUG_CONTEXT_JOURNAL 0x00000800 /* journal.c */
-#define OCFS_DEBUG_CONTEXT_CHAINALLOC 0x00001000 /* */
-#define OCFS_DEBUG_CONTEXT_LOCALALLOC 0x00002000 /* */
+#define OCFS_DEBUG_CONTEXT_CHAINALLOC 0x00001000 /* chainalloc */
+#define OCFS_DEBUG_CONTEXT_LOCALALLOC 0x00002000 /* localalloc */
#define OCFS_DEBUG_CONTEXT_SYSFILE 0x00004000 /* sysfile.c */
#define OCFS_DEBUG_CONTEXT_VOLCFG 0x00008000 /* volcfg.c */
#define OCFS_DEBUG_CONTEXT_DCACHE 0x00010000 /* dcache.c */
-#define OCFS_DEBUG_CONTEXT_DLM 0x00020000 /* dlm.c */
+#define OCFS_DEBUG_CONTEXT_DLMGLUE 0x00020000 /* dlmglue.c */
#define OCFS_DEBUG_CONTEXT_HASH 0x00040000 /* hash.c */
#define OCFS_DEBUG_CONTEXT_IO 0x00080000 /* io.c */
#define OCFS_DEBUG_CONTEXT_NAMEI 0x00100000 /* namei.c */
#define OCFS_DEBUG_CONTEXT_OSB 0x00200000 /* osb.c */
#define OCFS_DEBUG_CONTEXT_SUPER 0x00400000 /* super.c */
#define OCFS_DEBUG_CONTEXT_UTIL 0x00800000 /* util.c */
-#define OCFS_DEBUG_CONTEXT_VOTE 0x01000000 /* vote.c */
-#define OCFS_DEBUG_CONTEXT_LOCKRES 0x02000000 /* lockres.c */
+#define OCFS_DEBUG_CONTEXT_UNUSED3 0x01000000 /* */
+#define OCFS_DEBUG_CONTEXT_UNUSED4 0x02000000 /* */
#ifdef OCFS_DBG_TIMING
Modified: trunk/src/proc.c
===================================================================
--- trunk/src/proc.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/proc.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -38,8 +38,8 @@
#include "ocfs2.h"
#include "proc.h"
-#include "vote.h"
#include "alloc.h"
+#include "heartbeat.h"
#include "ocfs_journal.h"
@@ -48,18 +48,16 @@
#define OCFS2_PROC_BASENAME "fs/ocfs2"
-static int ocfs_proc_globalctxt(char *page, char **start, off_t off, int count, int *eof, void *data);
static int ocfs_proc_dlm_stats(char *page, char **start, off_t off, int count, int *eof, void *data);
static int ocfs_proc_version (char *page, char **start, off_t off, int count, int *eof, void *data);
static int ocfs_proc_nodenum (char *page, char **start, off_t off, int count, int *eof, void *data);
+static int ocfs_proc_slotnum (char *page, char **start, off_t off, int count, int *eof, void *data);
static int ocfs_proc_nodename (char *page, char **start, off_t off, int count, int *eof, void *data);
static int ocfs_proc_mountpoint (char *page, char **start, off_t off, int count, int *eof, void *data);
static int ocfs_proc_statistics (char *page, char **start, off_t off, int count, int *eof, void *data);
static int ocfs_proc_device (char *page, char **start, off_t off, int count, int *eof, void *data);
static int ocfs_proc_nodes (char *page, char **start, off_t off, int count, int *eof, void *data);
-static int ocfs_proc_net_vote_obj (char *page, char **start, off_t off, int count, int *eof, void *data);
static int ocfs_proc_alloc_stat(char *page, char **start, off_t off, int count, int *eof, void *data);
-static int ocfs_proc_guid (char *page, char **start, off_t off, int count, int *eof, void *data);
static int ocfs_proc_label (char *page, char **start, off_t off, int count, int *eof, void *data);
typedef struct _ocfs_proc_list
@@ -72,20 +70,18 @@
ocfs_proc_list top_dir[] = {
{ "version", NULL, ocfs_proc_version },
{ "nodename", NULL, ocfs_proc_nodename },
- { "globalctxt", NULL, ocfs_proc_globalctxt },
{ "lockstat", NULL, ocfs_proc_dlm_stats },
{ NULL } };
ocfs_proc_list sub_dir[] = {
{ "nodenum", NULL, ocfs_proc_nodenum },
{ "mountpoint", NULL, ocfs_proc_mountpoint },
+ { "slotnum", NULL, ocfs_proc_slotnum },
{ "statistics", NULL, ocfs_proc_statistics },
{ "lockstat", NULL, ocfs_proc_dlm_stats },
{ "device", NULL, ocfs_proc_device },
{ "nodes", NULL, ocfs_proc_nodes },
- { "sent-votes", NULL, ocfs_proc_net_vote_obj },
{ "allocstat", NULL, ocfs_proc_alloc_stat },
- { "guid", NULL, ocfs_proc_guid },
{ "label", NULL, ocfs_proc_label },
{ NULL } };
@@ -159,35 +155,7 @@
return len;
} /* ocfs_proc_calc_metrics */
-
/*
- * ocfs_proc_globalctxt()
- *
- */
-static int ocfs_proc_globalctxt(char *page, char **start, off_t off,
- int count, int *eof, void *data)
-{
- int len = 0;
- int ret;
-
- LOG_ENTRY ();
-
- len += sprintf(page + len, "ip addr/port : 0x%08u/%u\n",
- ntohl(OcfsGlobalCtxt.comm_info.addr_u.ip_addr4),
- ntohs(OcfsGlobalCtxt.comm_info.ip_port));
- len += sprintf(page + len, "guid : ");
- strncat(page + len, OcfsGlobalCtxt.guid.guid, OCFS2_GUID_LEN);
- len += OCFS2_GUID_LEN;
- strncat(page + len, "\n", 1);
- len++;
-
- ret = ocfs_proc_calc_metrics(page, start, off, count, eof, len);
-
- LOG_EXIT_INT (ret);
- return ret;
-} /* ocfs_proc_version */
-
-/*
* ocfs_proc_dlm_stats()
*
*/
@@ -331,6 +299,29 @@
} /* ocfs_proc_nodenum */
/*
+ * ocfs_proc_slotnum()
+ *
+ */
+static int ocfs_proc_slotnum (char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ int len;
+ int ret;
+ ocfs_super *osb;
+
+ LOG_ENTRY ();
+
+ osb = data;
+ sprintf (page, "%d\n", osb->slot_num);
+ len = strlen (page);
+
+ ret = ocfs_proc_calc_metrics (page, start, off, count, eof, len);
+
+ LOG_EXIT_INT (ret);
+ return ret;
+} /* ocfs_proc_slotnum */
+
+/*
* ocfs_proc_nodename()
*
*/
@@ -431,50 +422,25 @@
int count, int *eof, void *data)
{
int len;
- char *pubmap = NULL;
ocfs_super *osb;
- int ret = 0, i;
- char *ptr;
+ int ret = 0;
LOG_ENTRY ();
osb = data;
- pubmap = ocfs_malloc (100);
- if (!pubmap) {
- LOG_ERROR_STATUS (-ENOMEM);
- goto bail;
- }
-
- ptr = pubmap;
- for (i = 0; i < osb->max_nodes; i++) {
- if (ocfs_node_map_test_bit(&osb->publ_map, i))
- ptr += sprintf (ptr, "%d ", i);
- }
- if (pubmap != ptr)
- *(ptr - 1) = '\0';
-
#define PROC_STATS \
- "Publish map : %s\n" \
"Number of nodes : %u\n" \
"Cluster size : %d\n" \
"Volume size : %llu\n" \
- "Open Transactions: : %u\n" \
- "Delayed Locks : %u\n" \
- "Checkpointing Locks : %u\n"
+ "Open Transactions: : %u\n"
- len = sprintf (page, PROC_STATS, pubmap,
- osb->num_nodes, osb->s_clustersize,
+ len = sprintf (page, PROC_STATS, osb->num_nodes, osb->s_clustersize,
ocfs2_clusters_to_bytes(osb->sb, osb->num_clusters),
- atomic_read(&osb->journal->num_trans),
- atomic_read(&osb->journal->num_cmt_locks),
- atomic_read(&osb->journal->num_chkpt_locks));
+ atomic_read(&osb->journal->num_trans));
ret = ocfs_proc_calc_metrics (page, start, off, count, eof, len);
-bail:
- if (pubmap)
- kfree(pubmap);
LOG_EXIT_INT (ret);
return ret;
} /* ocfs_proc_statistics */
@@ -512,7 +478,6 @@
int i;
int ret;
ocfs_super *osb;
- BARF_BARF_BARF *node;
char mount;
LOG_ENTRY ();
@@ -520,23 +485,10 @@
osb = data;
if (osb) {
- down (&(osb->cfg_lock));
for (i = 0; i < osb->max_nodes; i++) {
- node = osb->node_cfg_info[i];
- if (!node)
- continue;
- mount = ocfs_node_map_test_bit(&osb->publ_map, i) ? 'M' : ' ';
- len += sprintf (page + len,
- "%2d %c %-32s 0x%08u %-6u ",
- i, mount, node->node_name,
- ntohl(node->ipc_config.addr_u.ip_addr4),
- ntohs(node->ipc_config.ip_port));
- strncat(page + len, node->guid.guid,
- OCFS2_GUID_LEN);
- len += OCFS2_GUID_LEN;
- len += sprintf (page + len, "\n");
+ mount = ocfs_node_map_test_bit(osb, &osb->mounted_map, i) ? 'M' : ' ';
+ len += sprintf(page + len, "%2d %c\n", i, mount);
}
- up (&(osb->cfg_lock));
}
ret = ocfs_proc_calc_metrics (page, start, off, count, eof, len);
@@ -546,63 +498,6 @@
} /* ocfs_proc_nodes */
/*
- * ocfs_proc_net_votes()
- *
- */
-static int ocfs_proc_net_vote_obj (char *page, char **start, off_t off,
- int count, int *eof, void *data)
-{
- int len = 0, ret;
- ocfs_super *osb;
- ocfs_vote_obj_lookup_data d; // 24 bytes
-
- LOG_ENTRY ();
-
- osb = data;
-
- d.func = ocfs_lookup_obj_for_proc;
- d.ret = NULL;
- d.u.proc.page = page;
- d.u.proc.len = &len;
- d.u.proc.max = 4096;
- ret = ocfs_lookup_vote_request_obj (osb, &d);
- ret = ocfs_proc_calc_metrics (page, start, off, count, eof, len);
-
- LOG_EXIT_INT (ret);
- return ret;
-} /* ocfs_proc_net_vote_obj */
-
-/*
- * ocfs_proc_guid()
- *
- */
-static int ocfs_proc_guid (char *page, char **start, off_t off,
- int count, int *eof, void *data)
-{
- int len;
- int ret;
- ocfs_super *osb;
- char *p;
- int i;
-
- LOG_ENTRY ();
-
- osb = (ocfs_super *) data;
-
- for (i = 0, p = page; i < MAX_VOL_ID_LENGTH; i++, p += 2)
- sprintf(p, "%02X", osb->uuid[i]);
- *p = '\n'; ++p; *p = '\0';
-
- len = strlen (page);
-
- ret = ocfs_proc_calc_metrics (page, start, off, count, eof, len);
-
- LOG_EXIT_INT (ret);
- return ret;
-} /* ocfs_proc_guid */
-
-
-/*
* ocfs_proc_label()
*
*/
Added: trunk/src/slot_map.c
===================================================================
--- trunk/src/slot_map.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/slot_map.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,288 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * slot_map.c
+ *
+ *
+ *
+ * Copyright (C) 2002, 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ocfs_compat.h"
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/smp_lock.h>
+
+#include "ocfs_log.h"
+#include "ocfs.h"
+#include "ocfs2.h"
+
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "slot_map.h"
+#include "sysfile.h"
+
+#include "buffer_head_io.h"
+
+#define OCFS_DEBUG_CONTEXT OCFS_DEBUG_CONTEXT_SLOTMAP
+
+static s16 __ocfs2_node_num_to_slot(ocfs2_slot_info *si,
+ s16 global);
+static void __ocfs2_fill_slot(ocfs2_slot_info *si,
+ s16 slot_num,
+ s16 node_num);
+
+/* Use the slot information we've collected to create a map of mounted
+ * nodes. Should be holding an EX on super block. assumes slot info is
+ * up to date. Note that we call this *after* we find a slot, so our
+ * own node should be set in the map too... */
+void ocfs2_populate_mounted_map(ocfs_super *osb)
+{
+ int i;
+ ocfs2_slot_info *si = osb->slot_info;
+
+ spin_lock(&si->si_lock);
+
+ for (i = 0; i < si->si_size; i++)
+ if (si->si_global_node_nums[i] != OCFS_INVALID_NODE_NUM)
+ ocfs_node_map_set_bit(osb, &osb->mounted_map,
+ si->si_global_node_nums[i]);
+
+ spin_unlock(&si->si_lock);
+}
+
+/* post the slot information on disk into our slot_info struct. */
+void ocfs2_update_slot_info(ocfs2_slot_info *si)
+{
+ int i;
+ s16 *disk_info;
+
+ /* we don't read the slot block here as ocfs2_super_lock
+ * should've made sure we have the most recent copy. */
+ spin_lock(&si->si_lock);
+ disk_info = (s16 *) si->si_bh->b_data;
+
+ for (i = 0; i < si->si_size; i++)
+ si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
+
+ spin_unlock(&si->si_lock);
+}
+
+/* post our slot info stuff into its destination bh and write it
+ * out. */
+int ocfs2_update_disk_slots(ocfs_super *osb,
+ ocfs2_slot_info *si)
+{
+ int status, i;
+ s16 *disk_info = (s16 *) si->si_bh->b_data;
+
+ spin_lock(&si->si_lock);
+ for (i = 0; i < si->si_size; i++)
+ disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
+ spin_unlock(&si->si_lock);
+
+ status = ocfs_write_block(osb, si->si_bh, si->si_inode);
+ if (status < 0)
+ LOG_ERROR_STATUS(status);
+
+ return status;
+}
+
+/* try to find global node in the slot info. Returns
+ * OCFS_INVALID_NODE_NUM if nothing is found. */
+static s16 __ocfs2_node_num_to_slot(ocfs2_slot_info *si,
+ s16 global)
+{
+ int i;
+ s16 ret = OCFS_INVALID_NODE_NUM;
+
+ for(i = 0; i < si->si_num_slots; i++) {
+ if (global == si->si_global_node_nums[i]) {
+ ret = (s16) i;
+ break;
+ }
+ }
+ return ret;
+}
+
+s16 ocfs2_node_num_to_slot(ocfs2_slot_info *si,
+ s16 global)
+{
+ s16 ret;
+
+ spin_lock(&si->si_lock);
+ ret = __ocfs2_node_num_to_slot(si, global);
+ spin_unlock(&si->si_lock);
+ return ret;
+}
+
+static void __ocfs2_fill_slot(ocfs2_slot_info *si,
+ s16 slot_num,
+ s16 node_num)
+{
+ OCFS_ASSERT(slot_num != OCFS_INVALID_NODE_NUM);
+ OCFS_ASSERT(slot_num < si->si_num_slots);
+ OCFS_ASSERT((node_num == OCFS_INVALID_NODE_NUM) ||
+ (node_num < OCFS2_MAX_NODES));
+
+ si->si_global_node_nums[slot_num] = node_num;
+}
+
+void ocfs2_clear_slot(ocfs2_slot_info *si,
+ s16 slot_num)
+{
+ spin_lock(&si->si_lock);
+ __ocfs2_fill_slot(si, slot_num, OCFS_INVALID_NODE_NUM);
+ spin_unlock(&si->si_lock);
+}
+
+int ocfs2_init_slot_info(ocfs_super *osb)
+{
+ int status, i;
+ u64 blkno;
+ struct inode *inode = NULL;
+ struct buffer_head *bh = NULL;
+ ocfs2_slot_info *si;
+
+ si = kmalloc(sizeof(ocfs2_slot_info), GFP_KERNEL);
+ if (!si) {
+ status = -ENOMEM;
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+ memset(si, 0, sizeof(ocfs2_slot_info));
+ spin_lock_init(&si->si_lock);
+ si->si_num_slots = osb->max_nodes;
+ si->si_size = OCFS2_MAX_NODES;
+
+ for(i = 0; i < si->si_num_slots; i++)
+ si->si_global_node_nums[i] = OCFS_INVALID_NODE_NUM;
+
+ inode = ocfs_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, -1);
+ if (!inode) {
+ LOG_ERROR_STATUS(status = -EINVAL);
+ goto bail;
+ }
+
+ status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+ status = ocfs_read_block(osb, blkno, &bh, 0, inode);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+ si->si_inode = inode;
+ si->si_bh = bh;
+ osb->slot_info = si;
+bail:
+ if (status < 0 && si)
+ ocfs2_free_slot_info(si);
+
+ return status;
+}
+
+void ocfs2_free_slot_info(ocfs2_slot_info *si)
+{
+ if (si->si_inode)
+ iput(si->si_inode);
+ if (si->si_bh)
+ brelse(si->si_bh);
+ kfree(si);
+}
+
+int ocfs2_find_slot(ocfs_super *osb)
+{
+ int status;
+ s16 slot;
+ ocfs2_slot_info *si;
+
+ LOG_ENTRY();
+
+ si = osb->slot_info;
+
+ ocfs2_update_slot_info(si);
+
+ spin_lock(&si->si_lock);
+	/* search for ourselves first and take the slot if it already
+	 * exists. Perhaps we need to mark this in a variable for our
+	 * own journal recovery? Possibly not, though we certainly
+	 * need to warn the user */
+ slot = __ocfs2_node_num_to_slot(si, osb->node_num);
+ if (slot == OCFS_INVALID_NODE_NUM) {
+ /* if no slot yet, then just take 1st available
+ * one. */
+ slot = __ocfs2_node_num_to_slot(si, OCFS_INVALID_NODE_NUM);
+ if (slot == OCFS_INVALID_NODE_NUM) {
+ spin_unlock(&si->si_lock);
+ printk("ocfs2: no free slots available!\n");
+ status = -EINVAL;
+ goto bail;
+ }
+ } else
+ printk("ocfs2: slot %d is already allocated to this node!\n",
+ slot);
+
+ __ocfs2_fill_slot(si, slot, osb->node_num);
+ osb->slot_num = slot;
+ spin_unlock(&si->si_lock);
+
+ printk("ocfs2: taking node slot %d\n", osb->slot_num);
+
+ status = ocfs2_update_disk_slots(osb, si);
+ if (status < 0)
+ LOG_ERROR_STATUS(status);
+
+bail:
+ LOG_EXIT_STATUS(status);
+ return status;
+}
+
+void ocfs2_put_slot(ocfs_super *osb)
+{
+ int status;
+ ocfs2_slot_info *si = osb->slot_info;
+
+ if (!si)
+ return;
+
+ ocfs2_update_slot_info(si);
+
+ spin_lock(&si->si_lock);
+ __ocfs2_fill_slot(si, osb->slot_num, OCFS_INVALID_NODE_NUM);
+ osb->slot_num = OCFS_INVALID_NODE_NUM;
+ spin_unlock(&si->si_lock);
+
+ status = ocfs2_update_disk_slots(osb, si);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+bail:
+ osb->slot_info = NULL;
+ kfree(si);
+}
+
Added: trunk/src/slot_map.h
===================================================================
--- trunk/src/slot_map.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/slot_map.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -0,0 +1,57 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * slot_map.h
+ *
+ * Declarations for the node slot map: tracks which global node
+ *
+ * Copyright (C) 2002, 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+
+#ifndef SLOTMAP_H
+#define SLOTMAP_H
+
+typedef struct _ocfs2_slot_info {
+ spinlock_t si_lock;
+
+ struct inode *si_inode;
+ struct buffer_head *si_bh;
+ unsigned int si_num_slots;
+ unsigned int si_size;
+ s16 si_global_node_nums[OCFS2_MAX_NODES];
+} ocfs2_slot_info;
+
+int ocfs2_init_slot_info(ocfs_super *osb);
+void ocfs2_free_slot_info(ocfs2_slot_info *si);
+
+int ocfs2_find_slot(ocfs_super *osb);
+void ocfs2_put_slot(ocfs_super *osb);
+
+void ocfs2_update_slot_info(ocfs2_slot_info *si);
+int ocfs2_update_disk_slots(ocfs_super *osb,
+ ocfs2_slot_info *si);
+
+s16 ocfs2_node_num_to_slot(ocfs2_slot_info *si,
+ s16 global);
+void ocfs2_clear_slot(ocfs2_slot_info *si,
+ s16 slot_num);
+
+void ocfs2_populate_mounted_map(ocfs_super *osb);
+
+#endif
Modified: trunk/src/suballoc.c
===================================================================
--- trunk/src/suballoc.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/suballoc.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -36,7 +36,7 @@
#include "ocfs2.h"
#include "alloc.h"
-#include "dlm.h"
+#include "dlmglue.h"
#include "localalloc.h"
#include "util.h"
#include "suballoc.h"
@@ -425,15 +425,12 @@
OCFS_ASSERT(!(handle->flags & OCFS_HANDLE_STARTED));
ocfs_handle_add_inode(handle, alloc_inode);
- status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE,
- 0, &bh, alloc_inode);
+ status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1);
if (status < 0) {
if (status != -EINTR)
LOG_ERROR_STATUS (status);
goto bail;
}
- ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE,
- 0, alloc_inode);
fe = (ocfs2_dinode *) bh->b_data;
OCFS_ASSERT_RO(IS_VALID_FILE_ENTRY(fe));
@@ -494,7 +491,7 @@
#ifndef OCFS_USE_ALL_METADATA_SUBALLOCATORS
alloc_inode = ocfs_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
#else
- alloc_inode = ocfs_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, osb->node_num);
+ alloc_inode = ocfs_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, osb->slot_num);
#endif
if (!alloc_inode) {
status = -ENOMEM;
@@ -543,7 +540,7 @@
(*ac)->ac_handle = handle;
(*ac)->ac_which = OCFS_AC_USE_INODE;
- alloc_inode = ocfs_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, osb->node_num);
+ alloc_inode = ocfs_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, osb->slot_num);
if (!alloc_inode) {
status = -ENOMEM;
LOG_ERROR_STATUS(status);
Modified: trunk/src/super.c
===================================================================
--- trunk/src/super.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/super.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -42,6 +42,10 @@
#include <linux/socket.h>
#include <linux/inet.h>
+#include <cluster/util.h>
+#include <cluster/dlmcommon.h>
+#include <cluster/nodemanager.h>
+
#include "ocfs_log.h"
#include "ocfs.h"
#include "ocfs2.h"
@@ -50,18 +54,18 @@
#include "ocfs1_fs_compat.h"
#include "alloc.h"
+#include "dlmglue.h"
#include "extent_map.h"
#include "heartbeat.h"
#include "inode.h"
#include "journal.h"
#include "localalloc.h"
-#include "nm.h"
#include "proc.h"
+#include "slot_map.h"
#include "super.h"
#include "sysfile.h"
#include "util.h"
#include "ver.h"
-#include "volcfg.h"
#include "vote.h"
#include "ocfs_journal.h"
@@ -75,19 +79,10 @@
ocfs_global_ctxt OcfsGlobalCtxt;
spinlock_t osb_id_lock;
__u32 osb_id; /* Keeps track of next available OSB Id */
-spinlock_t mount_cnt_lock;
-__u32 mount_cnt; /* Number of volumes currently mounted */
-char *node_name = NULL;
-__s32 node_number = OCFS_INVALID_NODE_NUM;
__u32 debug_context = 0;
__u32 debug_level = 0;
__u32 debug_exclude = 0;
-char *ip_address = NULL;
-__u32 ip_port_v2 = 0;
-char *guid = NULL;
-__u32 cs = 0;
-char *ocfs_hostname;
#ifdef EVIL_TRACE
__u64 debug_mask = 0;
@@ -128,45 +123,26 @@
//MODULE_DESCRIPTION("Oracle Clustered FileSystem");
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
-module_param (node_name, charp, 0);
-module_param (node_number, int, 0);
module_param (debug_context, uint, 0);
module_param (debug_level, uint, 0);
module_param (debug_exclude, uint, 0);
-module_param (ip_address, charp, 0);
-module_param (ip_port_v2, uint, 0);
-module_param (guid, charp, 0);
-module_param (cs, uint, 0);
#else /* 2.6.x kernel */
-MODULE_PARM (node_name, "s");
-MODULE_PARM_DESC(node_name, "Name of this machine in the cluster");
-MODULE_PARM (node_number, "i");
-MODULE_PARM_DESC(node_number, "Slot number for this machine within volume");
MODULE_PARM (debug_context, "i");
MODULE_PARM_DESC(debug_context, "Debug context");
MODULE_PARM (debug_level, "i");
MODULE_PARM_DESC(debug_level, "Debug level");
MODULE_PARM (debug_exclude, "i");
MODULE_PARM_DESC(debug_exclude, "Process ID to exclude from tracing");
-MODULE_PARM (ip_address, "s");
-MODULE_PARM_DESC(ip_address, "IP address for the network dlm on this node");
-MODULE_PARM (ip_port_v2, "i");
-MODULE_PARM_DESC(ip_port_v2, "Port number for the network dlm on this node");
-MODULE_PARM (guid, "s");
-MODULE_PARM_DESC(guid, "GUID for this machine");
-MODULE_PARM (cs, "i");
-MODULE_PARM_DESC(cs, "Checksum");
#endif /* Linux 2.4 stuff */
-extern struct semaphore recovery_list_sem;
-static int ocfs_parse_options (char *options, __u32 * uid, __u32 * gid, int * reclaim_id);
+static int ocfs_parse_options (char *options, __u32 * uid, __u32 * gid, int * reclaim_id, char **group_name);
static int __init ocfs_driver_entry (void);
static void __exit ocfs_driver_exit (void);
static void ocfs_put_super (struct super_block *sb);
-static int ocfs_mount_volume (struct super_block *sb, int reclaim_id, struct inode *root);
-static int ocfs_dismount_volume (struct super_block *sb);
-static int ocfs_read_params(void);
+static int ocfs_mount_volume (struct super_block *sb, int reclaim_id,
+ char **group_name, struct inode *root);
+static void ocfs_dismount_volume(struct super_block *sb);
static int ocfs_initialize_mem_lists (void);
static void ocfs_free_mem_lists (void);
static void ocfs_delete_osb (ocfs_super * osb);
@@ -182,10 +158,8 @@
static int ocfs_init_global_system_inodes(ocfs_super *osb);
static int ocfs_init_local_system_inodes(ocfs_super *osb);
static int ocfs_release_system_inodes(ocfs_super *osb);
-static int ocfs_publish_set_unmounted(ocfs_super *osb, int node_num);
-static int ocfs_publish_set_mounted(ocfs_super *osb, int node_num);
-static int ocfs_publish_toggle_mounted(ocfs_super *osb, int node_num,
- int value);
+static int ocfs2_fill_local_node_info(ocfs_super *osb, char **group_name);
+static int ocfs2_complete_mount_recovery(ocfs_super *osb);
static int ocfs_check_volume(ocfs_super * osb);
static int ocfs_verify_volume(ocfs2_dinode *di, struct buffer_head *bh,
__u32 sectsize);
@@ -269,7 +243,7 @@
for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
- new = ocfs_get_system_file_inode(osb, i, osb->node_num);
+ new = ocfs_get_system_file_inode(osb, i, osb->slot_num);
if (!new) {
ocfs_release_system_inodes(osb);
LOG_ERROR_STATUS(status = -EINVAL);
@@ -298,7 +272,7 @@
LOG_ENTRY();
for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; i < NUM_SYSTEM_INODES ; i++) {
- new = ocfs_get_system_file_inode(osb, i, osb->node_num);
+ new = ocfs_get_system_file_inode(osb, i, osb->slot_num);
if (!new) {
ocfs_release_system_inodes(osb);
LOG_ERROR_STATUS(status = -EINVAL);
@@ -349,11 +323,12 @@
__u32 uid = current->fsuid;
__u32 gid = current->fsgid;
int reclaim_id;
+ char *group_name = NULL;
ocfs_super *osb = NULL;
LOG_ENTRY_ARGS ("%p, %p, %i", sb, data, silent);
- if (ocfs_parse_options (data, &uid, &gid, &reclaim_id) != 0) {
+ if (ocfs_parse_options (data, &uid, &gid, &reclaim_id, &group_name) != 0) {
status = -EINVAL;
LOG_ERROR_STR ("ocfs_read_super: bad mount option");
goto read_super_error;
@@ -366,7 +341,7 @@
/* this is needed to support O_LARGE_FILE */
sb->s_maxbytes = OCFS_LINUX_MAX_FILE_SIZE;
- status = ocfs_mount_volume (sb, reclaim_id, NULL);
+ status = ocfs_mount_volume (sb, reclaim_id, &group_name, NULL);
if (status < 0)
goto read_super_error;
@@ -394,11 +369,23 @@
sb->s_root = root;
- printk ("ocfs2: Mounting device (%u,%u) on %s (node %d)\n",
+ printk ("ocfs2: Mounting device (%u,%u) on %s (node %d, slot %d)\n",
MAJOR(sb->s_dev), MINOR(sb->s_dev),
- osb->node_cfg_info[osb->node_num]->node_name, osb->node_num);
+ OcfsGlobalCtxt.node_name, osb->node_num, osb->slot_num);
atomic_set(&osb->vol_state, VOLUME_MOUNTED);
+
+ if (osb->dirty) {
+ /* This must happen *after* setting the volume to
+ * MOUNTED as we may sleep on any recovery threads. */
+ status = ocfs2_complete_mount_recovery(osb);
+ if (status < 0)
+ LOG_EXIT_STATUS(status);
+ }
+
+ if (group_name)
+ kfree(group_name);
+
LOG_EXIT_STATUS(status);
return status;
@@ -411,6 +398,9 @@
if (inode)
iput (inode);
+ if (group_name)
+ kfree(group_name);
+
LOG_EXIT_STATUS(status);
return status;
}
@@ -447,11 +437,12 @@
*
* e.g., gid=9999,uid=9999,[no]cache,reclaimid
*/
-static int ocfs_parse_options (char *options, __u32 * uid, __u32 * gid, int * reclaim_id)
+static int ocfs_parse_options (char *options, __u32 * uid, __u32 * gid, int * reclaim_id, char **group_name)
{
char *c;
char *value;
int ret = 1;
+ int size;
LOG_ENTRY ();
@@ -492,6 +483,24 @@
}
} else if (!strcmp (c, "reclaimid")) {
*reclaim_id = 1;
+ } else if (!strcmp(c, "group")) {
+ if (!value || !*value) {
+ LOG_ERROR_STR
+ ("group option requires an argument");
+ goto bail;
+ }
+ LOG_TRACE_ARGS("group name passed = %s\n", value);
+
+ size = strlen(value) + 1;
+ *group_name = kmalloc(size, GFP_KERNEL);
+ if (!(*group_name)) {
+ LOG_ERROR_STATUS(-ENOMEM);
+ goto bail;
+ }
+ memset(*group_name, 0, size);
+ printk("ocfs2: group name passed = %s, size = %d\n",
+ value, size);
+ strcpy(*group_name, value);
} else {
LOG_ERROR_ARGS ("Invalid mount option: %s", c);
goto bail;
@@ -519,16 +528,7 @@
if (init_ocfs2_extent_maps())
return -ENOMEM;
-
- ocfs_hostname = kmalloc(strlen(system_utsname.nodename) + 1, GFP_KERNEL);
- if (ocfs_hostname == NULL) {
- status = -EINVAL;
- goto leave;
- }
- strcpy(ocfs_hostname, system_utsname.nodename);
- printk("ocfs2: hostname is %s\n", ocfs_hostname);
-
ocfs_table_header = register_sysctl_table(ocfs_root_table, 0);
if (!ocfs_table_header) {
LOG_ERROR_STATUS(status = -ENOMEM);
@@ -536,18 +536,28 @@
}
memset (&OcfsGlobalCtxt, 0, sizeof (ocfs_global_ctxt));
- memset (&OcfsIpcCtxt, 0, sizeof (ocfs_ipc_ctxt));
INIT_LIST_HEAD (&(OcfsGlobalCtxt.osb_next));
INIT_LIST_HEAD (&(OcfsGlobalCtxt.osb_next));
- /* Read remaining insmod params */
- if ((status = ocfs_read_params ()) < 0)
- goto leave;
+ /* Ok, just use utsname for now. Eventually we need to
+ * get this from the node config subsystem. */
+ OcfsGlobalCtxt.node_name = kmalloc(OCFS2_MAX_NODE_NAME_LENGTH,
+ GFP_KERNEL);
+ if (!OcfsGlobalCtxt.node_name) {
+ status = -ENOMEM;
+ LOG_ERROR_STATUS(status);
+ goto leave;
+ }
+ memset(OcfsGlobalCtxt.node_name, 0, OCFS2_MAX_NODE_NAME_LENGTH);
+ strncpy(OcfsGlobalCtxt.node_name, system_utsname.nodename,
+ OCFS2_MAX_NODE_NAME_LENGTH - 1);
+ printk("ocfs2: node name is %s\n", OcfsGlobalCtxt.node_name);
+
/* Initialize the global data resource */
init_MUTEX (&(OcfsGlobalCtxt.global_res));
- OCFS_SET_FLAG (OcfsGlobalCtxt.flags, OCFS_FLAG_GLBL_CTXT_RESOURCE_INITIALIZED);
+ OcfsGlobalCtxt.flags |= OCFS_FLAG_GLBL_CTXT_RESOURCE_INITIALIZED;
/* Initialize the memory slabs for oin and file entry */
status = ocfs_initialize_mem_lists ();
@@ -561,17 +571,11 @@
osb_id = 0;
spin_unlock (&osb_id_lock);
- spin_lock_init (&mount_cnt_lock);
- spin_lock (&mount_cnt_lock);
- mount_cnt = 0;
- spin_unlock (&mount_cnt_lock);
-
spin_lock_init (&OcfsGlobalCtxt.comm_seq_lock);
spin_lock (&OcfsGlobalCtxt.comm_seq_lock);
OcfsGlobalCtxt.comm_seq_num = 0;
spin_unlock (&OcfsGlobalCtxt.comm_seq_lock);
- init_MUTEX (&recovery_list_sem);
/* Initialize the proc interface */
ocfs_proc_init ();
@@ -583,8 +587,7 @@
/* Delete the global context resource */
if (OcfsGlobalCtxt.flags & OCFS_FLAG_GLBL_CTXT_RESOURCE_INITIALIZED)
- OCFS_CLEAR_FLAG (OcfsGlobalCtxt.flags,
- OCFS_FLAG_GLBL_CTXT_RESOURCE_INITIALIZED);
+ OcfsGlobalCtxt.flags &= ~OCFS_FLAG_GLBL_CTXT_RESOURCE_INITIALIZED;
if (ocfs_table_header)
unregister_sysctl_table(ocfs_table_header);
@@ -601,79 +604,6 @@
} /* ocfs_driver_entry */
/*
- * ocfs_read_params()
- *
- * Read insmod params
- */
-static int ocfs_read_params(void)
-{
- int status = 0;
- __u32 check_sum = 0;
- int i;
-
- /* Read remaining insmod params */
- if (node_number != OCFS_INVALID_NODE_NUM) {
- // this will be validated later
- OcfsGlobalCtxt.pref_node_num = node_number;
- LOG_TRACE_ARGS("Preferred node number: %d\n", node_number);
- }
-
- if (ip_port_v2 == 0)
- OcfsGlobalCtxt.comm_info.ip_port =
- htons(OCFS_IPC_DEFAULT_PORT);
- else if (ip_port_v2 & 0xFFFF0000) {
- status = -EINVAL;
- LOG_ERROR_STR("'ip_port_v2' is too large'");
- }
- else
- OcfsGlobalCtxt.comm_info.ip_port =
- htons((u16)ip_port_v2);
- LOG_TRACE_ARGS("IP port: %d\n",
- ntohs(OcfsGlobalCtxt.comm_info.ip_port));
-
- if (node_name && strlen(node_name) < MAX_NODE_NAME_LENGTH) {
- OcfsGlobalCtxt.node_name = node_name;
- LOG_TRACE_ARGS ("Node name: %s\n", OcfsGlobalCtxt.node_name);
- } else {
- status = -EINVAL;
- LOG_ERROR_STR ("'node_name' not set or too long");
- }
-
-#define MAX_IPv4_ADDR_STR_LEN 15 /* 4x '255' + 3x '.' */
- if (ip_address && strlen (ip_address) <= MAX_IPv4_ADDR_STR_LEN) {
- OcfsGlobalCtxt.comm_info.addr_u.ip_addr4 =
- in_aton(ip_address);
- LOG_TRACE_ARGS ("IP address: %s\n", ip_address);
- } else {
- status = -EINVAL;
- LOG_ERROR_STR ("'ip_address' not set or too long");
- }
-
- if (guid && strlen (guid) == OCFS2_GUID_LEN) {
- memcpy(&OcfsGlobalCtxt.guid.guid, guid, OCFS2_GUID_LEN);
- LOG_TRACE_ARGS ("Node guid: %s\n", guid);
- } else {
- status = -EINVAL;
- LOG_ERROR_STR ("'guid' not set correctly");
- }
-
- if (status == 0) {
- for (i = 0; i < OCFS2_GUID_LEN; ++i)
- check_sum += (__u32) guid[i];
- if (cs != check_sum) {
- status = -EINVAL;
- LOG_ERROR_STR ("load module using load_ocfs2");
- }
- }
-
- /* hardcoding... not used yet */
- OcfsGlobalCtxt.comm_info.ip_version = htons(4);
-
- return status;
-} /* ocfs_read_params */
-
-
-/*
* ocfs_driver_exit()
*
* Called on rmmod
@@ -688,7 +618,6 @@
/* Signal DLM thread to exit */
down (&(OcfsGlobalCtxt.global_res));
- OCFS_SET_FLAG (OcfsGlobalCtxt.flags, OCFS_FLAG_SHUTDOWN_VOL_THREAD);
if (OcfsGlobalCtxt.flags & OCFS_FLAG_MEM_LISTS_INITIALIZED)
ocfs_free_mem_lists ();
@@ -702,6 +631,9 @@
exit_ocfs2_extent_maps();
+ if (OcfsGlobalCtxt.node_name)
+ kfree(OcfsGlobalCtxt.node_name);
+
printk("Unloaded OCFS Driver module\n");
LOG_EXIT ();
return;
@@ -812,7 +744,7 @@
sizeof(ocfs_journal_lock), 0, SLAB_NO_REAP | SLAB_HWCACHE_ALIGN,
NULL, NULL);
- OCFS_SET_FLAG (OcfsGlobalCtxt.flags, OCFS_FLAG_MEM_LISTS_INITIALIZED);
+ OcfsGlobalCtxt.flags |= OCFS_FLAG_MEM_LISTS_INITIALIZED;
return 0;
} /* ocfs_initialize_mem_lists */
@@ -825,7 +757,7 @@
{
kmem_cache_destroy (OcfsGlobalCtxt.inode_cache);
kmem_cache_destroy (OcfsGlobalCtxt.lock_cache);
- OCFS_CLEAR_FLAG (OcfsGlobalCtxt.flags, OCFS_FLAG_MEM_LISTS_INITIALIZED);
+ OcfsGlobalCtxt.flags &= ~OCFS_FLAG_MEM_LISTS_INITIALIZED;
} /* ocfs_free_mem_lists */
static int ocfs2_sb_probe(struct super_block *sb,
@@ -926,18 +858,58 @@
return 0;
}
+static int ocfs2_fill_local_node_info(ocfs_super *osb, char **group_name)
+{
+ int status, i;
+ struct inode *group = NULL;
+ char *p;
+ if (group_name) {
+ osb->group_name = *group_name;
+ *group_name = NULL;
+ } else {
+ osb->group_name = kmalloc(NM_MAX_NAME_LEN + 1, GFP_KERNEL);
+ if (!osb->group_name) {
+ status = -ENOMEM;
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+ memset(osb->group_name, 0, NM_MAX_NAME_LEN + 1);
+ for (i = 0, p = osb->uuid; i < MAX_VOL_ID_LENGTH; i++, p += 2)
+ sprintf(p, "%02X", osb->uuid[i]);
+ }
+
+ group = nm_get_group_by_name(osb->group_name);
+ if (!group) {
+ printk("ocfs2: could not join group \"%s\"\n",
+ osb->group_name);
+ status = -EINVAL;
+ goto bail;
+ }
+
+ osb->group_inode = group;
+ osb->node_num = nm_this_node(group);
+
+ printk("ocfs2: I am node %d, a member of group %s\n", osb->node_num,
+ osb->group_name);
+
+ status = 0;
+bail:
+
+ return status;
+}
+
/*
* ocfs_mount_volume()
*
*/
-static int ocfs_mount_volume (struct super_block *sb, int reclaim_id, struct inode *root)
+static int ocfs_mount_volume (struct super_block *sb, int reclaim_id,
+ char **group_name, struct inode *root)
{
- int status;
- ocfs_super *osb;
- int child_pid;
+ int status, sector_size;
+ int unlock_super = 0;
+ ocfs_super *osb = NULL;
struct buffer_head *bh = NULL;
- int sector_size;
LOG_ENTRY ();
@@ -948,7 +920,7 @@
goto leave;
}
- if ((osb = ocfs_malloc (sizeof (ocfs_super))) == NULL) {
+ if ((osb = kmalloc (sizeof(ocfs_super), GFP_KERNEL)) == NULL) {
LOG_ERROR_STATUS (status = -ENOMEM);
goto leave;
}
@@ -972,97 +944,79 @@
goto leave;
}
- down(&(osb->osb_res));
+ status = ocfs2_fill_local_node_info(osb, group_name);
+ if (status < 0) {
+ LOG_ERROR_STATUS (status);
+ goto leave;
+ }
- /* Launch the NM thread for the mounted volume */
- osb->dlm_task = NULL;
- child_pid = kernel_thread (ocfs_heartbeat_thread, osb,
- CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
- if (child_pid < 0) {
- LOG_ERROR_ARGS ("unable to launch ocfs2nm thread, error=%d",
- child_pid);
- up (&(osb->osb_res));
- status = child_pid;
+ status = ocfs2_register_hb_callbacks(osb);
+ if (status < 0) {
+ LOG_ERROR_STATUS (status);
goto leave;
- } else {
- init_completion (&osb->dlm_complete);
}
- up (&(osb->osb_res));
+ status = ocfs2_dlm_init(osb);
+ if (status < 0) {
+ LOG_ERROR_STATUS (status);
+ goto leave;
+ }
- /* Add proc entry for this volume */
- ocfs_proc_add_volume (osb);
+ /* requires vote_thread to be running. */
+ status = ocfs2_register_net_handlers(osb);
+ if (status < 0) {
+ LOG_ERROR_STATUS (status);
+ goto leave;
+ }
- /* GlobalMountCount */
- spin_lock (&mount_cnt_lock);
- mount_cnt++;
- if (mount_cnt == 1) {
- OcfsIpcCtxt.dlm_msg_size = OCFS_DLM_MAX_MSG_SIZE;
- OcfsIpcCtxt.version = OCFS_IPC_DLM_VERSION;
- /* start the listener thread */
- status = ocfs_init_udp_sock(&OcfsIpcCtxt.send_sock,
- &OcfsIpcCtxt.recv_sock);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto leave;
- }
- OcfsIpcCtxt.task = NULL;
- child_pid = kernel_thread (ocfs_recv_thread, NULL,
- CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
- if (child_pid >= 0) {
- init_completion (&(OcfsIpcCtxt.complete));
- } else {
- status = child_pid;
- LOG_ERROR_ARGS ("unable to launch ocfs2lsnr thread, error=%d", child_pid);
- goto leave;
- }
+ status = ocfs2_super_lock(osb, 1);
+ if (status < 0) {
+ LOG_ERROR_STATUS (status);
+ goto leave;
}
- spin_unlock (&mount_cnt_lock);
+ unlock_super = 1;
- /* wait for nm thread to be init */
- ocfs_wait (osb->nm_init_event, (atomic_read (&osb->nm_init) >= OCFS_HEARTBEAT_INIT ), 0);
+ /* This will load up the node map and add ourselves to it. */
+ status = ocfs2_find_slot(osb);
+ if (status < 0) {
+ LOG_ERROR_STATUS (status);
+ goto leave;
+ }
- down(&(osb->osb_res));
- down (&(osb->publish_lock));
- ocfs_nm_heart_beat (osb, HEARTBEAT_METHOD_DISK, 1);
- up (&(osb->publish_lock));
+ ocfs2_populate_mounted_map(osb);
- ocfs_node_map_set_bit(&osb->publ_map, osb->node_num);
- up (&(osb->osb_res));
+ /* load all node-local system inodes */
+ status = ocfs_init_local_system_inodes(osb);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto leave;
+ }
+ /* Add proc entry for this volume */
+ ocfs_proc_add_volume (osb);
+
/* Read the publish sector for this node and cleanup dirent being */
/* modified when we crashed. */
LOG_TRACE_STR ("ocfs_check_volume...");
- down(&(osb->osb_res));
status = ocfs_check_volume (osb);
if (status < 0) {
- up(&(osb->osb_res));
LOG_ERROR_STATUS (status);
goto leave;
}
- /* Launch the commit thread */
- osb->commit = ocfs_malloc(sizeof(ocfs_commit_task));
- if (osb->commit == NULL) {
- LOG_ERROR_STATUS(status = -ENOMEM);
- up (&(osb->osb_res));
- goto leave;
- }
- memset(osb->commit, 0, sizeof(ocfs_commit_task));
- child_pid = kernel_thread (ocfs_commit_thread, osb,
- CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
- if (child_pid < 0) {
- LOG_ERROR_ARGS ("unable to launch ocfs2commit thread, error=%d",
- child_pid);
- up (&(osb->osb_res));
- status = child_pid;
- goto leave;
- } else {
- init_completion (&osb->commit->c_complete);
- }
- up (&(osb->osb_res));
+ /* This should be sent *after* we recovered our journal as it
+ * will cause other nodes to unmark us as needing
+ * recovery. However, we need to send it *before* dropping the
+ * super block lock as otherwise their recovery threads might
+ * try to clean us up while we're live! */
+ status = ocfs2_request_mount_vote(osb);
+ if (status < 0)
+ LOG_ERROR_STATUS (status);
leave:
+ if (unlock_super)
+ ocfs2_super_unlock(osb, 1);
+
if (bh != NULL)
brelse(bh);
LOG_EXIT_STATUS (status);
@@ -1074,232 +1028,67 @@
* ocfs_dismount_volume()
*
*/
-static int ocfs_dismount_volume (struct super_block *sb)
+static void ocfs_dismount_volume (struct super_block *sb)
{
- int status;
- int AcquiredOSB = 0;
+ int tmp;
ocfs_super *osb = NULL;
- int i;
LOG_ENTRY_ARGS ("(0x%p)\n", sb);
- if (sb == NULL) {
- LOG_ERROR_STATUS (status = -EINVAL);
- goto leave;
- }
-
+ OCFS_ASSERT(sb);
osb = OCFS_SB(sb);
+ OCFS_ASSERT(osb);
- if (osb == NULL) {
- LOG_ERROR_STATUS (status = -EINVAL);
- goto leave;
- }
+ ocfs_shutdown_local_alloc(osb);
/* disable any new recovery threads and wait for any currently
* running ones to exit. Do this before setting the vol_state. */
down(&osb->recovery_lock);
osb->disable_recovery = 1;
- up(&osb->recovery_lock);
- while (atomic_read(&osb->num_recovery_threads)) {
+ while (osb->recovery_launched) {
+ up(&osb->recovery_lock);
LOG_TRACE_STR("Waiting on a recovery thread to complete.");
schedule();
+ down(&osb->recovery_lock);
}
+ up(&osb->recovery_lock);
- down(&(osb->osb_res));
- AcquiredOSB = 1;
-
- ocfs_shutdown_local_alloc(osb);
ocfs_journal_shutdown(osb);
- /* unset the mounted flag -- we're done with the journal and
- * the local alloc bitmap */
- status = ocfs_publish_set_unmounted(osb, osb->node_num);
- if (status < 0)
- LOG_ERROR_STR("Could not set mounted flag!");
-
ocfs_sync_blockdev(sb);
/* Remove the proc element for this volume */
ocfs_proc_remove_volume (osb);
- /* Dismount */
- OCFS_SET_FLAG (osb->osb_flags, OCFS_OSB_FLAGS_BEING_DISMOUNTED);
-
- /* Wait for this volume's NM thread to exit */
- if (osb->dlm_task) {
- LOG_TRACE_STR ("Waiting for ocfs2nm to exit....");
- send_sig (SIGINT, osb->dlm_task, 0);
- wait_for_completion (&(osb->dlm_complete));
- osb->dlm_task = NULL;
+ tmp = ocfs2_super_lock(osb, 1);
+ if (tmp < 0) {
+ LOG_ERROR_STATUS(tmp);
+ return;
}
- /* send dismount msg to all */
- status = ocfs_send_dismount_msg (osb);
- if (status < 0)
- LOG_ERROR_STATUS (status);
+ tmp = ocfs2_request_umount_vote(osb);
+ if (tmp < 0)
+ LOG_ERROR_STATUS(tmp);
- /* decrement mount count */
- spin_lock (&mount_cnt_lock);
- mount_cnt--;
- if (mount_cnt == 0) {
- /* Shutdown ocfslsnr */
- if (OcfsIpcCtxt.task) {
- LOG_TRACE_STR ("Waiting for ocfs2lsnr to exit....");
- send_sig (SIGINT, OcfsIpcCtxt.task, 0);
- wait_for_completion (&(OcfsIpcCtxt.complete));
- OcfsIpcCtxt.task = NULL;
- }
- }
- spin_unlock (&mount_cnt_lock);
+ ocfs2_put_slot(osb);
+ ocfs2_dlm_shutdown(osb);
+
+ ocfs2_clear_hb_callbacks(osb);
+
atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
- if (AcquiredOSB) {
- up (&(osb->osb_res));
- AcquiredOSB = 0;
- }
printk ("ocfs2: Unmounting device (%u,%u) on %s (node %d)\n",
MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev),
- osb->node_cfg_info[osb->node_num]->node_name, osb->node_num);
+ OcfsGlobalCtxt.node_name, osb->node_num);
- /* Free all nodecfgs */
- for (i = 0; i < osb->max_nodes; ++i) {
- BARF_BARF_BARF *p;
-
- p = osb->node_cfg_info[i];
- if (p)
- kfree(p);
- }
-
ocfs_release_system_inodes(osb);
ocfs_delete_osb (osb);
kfree(osb);
sb->s_dev = 0;
-
-leave:
- if (AcquiredOSB) {
- up (&(osb->osb_res));
- AcquiredOSB = 0;
- }
-
- LOG_EXIT_STATUS (status);
- return status;
} /* ocfs_dismount_volume */
-
-/* true if mounted, false otherwise */
-int ocfs_publish_get_mount_state(ocfs_super *osb, int node_num)
-{
- int status;
- ocfs_publish *publish;
- struct buffer_head *publish_bh = NULL;
- int retval = 0;
- int flags = 0;
-
- LOG_ENTRY();
-
- /* read it in */
- /* we may be called during mount in which case our publish
- * sector might be dirty. */
- if (node_num == osb->node_num)
- flags = OCFS_BH_CACHED;
- status = ocfs_read_block(osb, (osb->publish_blkno + node_num),
- &publish_bh, flags, NULL);
- if (status < 0) {
- brelse(publish_bh);
- LOG_ERROR_STR("Could not read publish sector, mounted value"
- " may be incorrect!");
- LOG_ERROR_STATUS (status);
- goto done;
- }
- publish = (ocfs_publish *) publish_bh->b_data;
-
- retval = publish->mounted;
-
- brelse(publish_bh);
-done:
- LOG_EXIT_STATUS(retval);
- return(retval);
-}
-
-static int ocfs_publish_toggle_mounted(ocfs_super *osb, int node_num, int value)
-{
- int status;
- ocfs_publish *publish;
- struct buffer_head * publish_bh = NULL;
-
- LOG_ENTRY_ARGS("(node_num=%d, value=%d)\n", node_num, value);
-
- /* read it in */
- status = ocfs_read_block(osb, (osb->publish_blkno + node_num),
- &publish_bh, 0, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto done;
- }
- publish = (ocfs_publish *) publish_bh->b_data;
-
- /* change it */
- publish->mounted = value;
-
- /* write it back out */
- status = ocfs_write_block(osb, publish_bh, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto done;
- }
-
-done:
- if (publish_bh)
- brelse(publish_bh);
-
- LOG_EXIT_STATUS(status);
-
- return(status);
-}
-
-/* set the 'mounted' bit in the publish sector */
-static int ocfs_publish_set_mounted(ocfs_super *osb, int node_num)
-{
- int retval;
-
- LOG_ENTRY_ARGS("(node_num=%d)\n", node_num);
-
- down (&(osb->publish_lock));
-
- retval = ocfs_publish_toggle_mounted(osb, node_num, 1);
- if (osb->node_num == node_num)
- osb->check_mounted = 1;
-
- up (&(osb->publish_lock));
-
- LOG_EXIT_STATUS(retval);
-
- return(retval);
-}
-
-/* unset the 'mounted' bit in the publish sector */
-static int ocfs_publish_set_unmounted(ocfs_super *osb, int node_num)
-{
- int retval;
-
- LOG_ENTRY_ARGS("(node_num=%d)\n", node_num);
-
- down (&(osb->publish_lock));
-
- if (osb->node_num == node_num)
- osb->check_mounted = 0;
-
- retval = ocfs_publish_toggle_mounted(osb, node_num, 0);
-
- up (&(osb->publish_lock));
-
- LOG_EXIT_STATUS(retval);
-
- return(retval);
-}
-
/*
* ocfs_initialize_osb()
*
@@ -1307,14 +1096,10 @@
static int ocfs_initialize_osb(ocfs_super *osb, struct buffer_head *bh)
{
int status = 0;
- ocfs_publish *publish = NULL;
- u64 p_blkno;
- struct buffer_head *publish_bh = NULL; /* our own publish sector */
- struct buffer_head **publish_bhs = NULL; /* all the publish sectors */
- struct buffer_head *bitmap_bh = NULL;
int i;
ocfs2_dinode *di = NULL;
struct inode *inode = NULL;
+ struct buffer_head *bitmap_bh = NULL;
LOG_ENTRY ();
@@ -1323,16 +1108,15 @@
if (!osb->vol_label) {
LOG_ERROR_STR("unable to alloc vol label");
status = -ENOMEM;
- goto done_nojournal;
+ goto bail;
}
osb->uuid = kmalloc(MAX_VOL_ID_LENGTH, GFP_KERNEL);
if (!osb->uuid) {
LOG_ERROR_STR("unable to alloc uuid");
status = -ENOMEM;
- goto done_nojournal;
+ goto bail;
}
-
/* this needs to be done before most other initializations */
di = (ocfs2_dinode *) bh->b_data;
osb->max_nodes = le32_to_cpu(di->id2.i_super.s_max_nodes);
@@ -1340,7 +1124,7 @@
LOG_ERROR_ARGS("Invalid number of nodes (%u)\n",
osb->max_nodes);
status = -EINVAL;
- goto done_nojournal;
+ goto bail;
}
printk("max_nodes for this device: %u\n", osb->max_nodes);
@@ -1354,16 +1138,23 @@
if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) {
LOG_ERROR_ARGS("couldn't mount because of unsupported "
"optional features (%x).\n", i);
- goto done_nojournal;
+ goto bail;
}
if (!(osb->sb->s_flags & MS_RDONLY) &&
(i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) {
LOG_ERROR_ARGS("couldn't mount RDWR because of "
"unsupported optional features (%x).\n",
i);
- goto done_nojournal;
+ goto bail;
}
+ init_waitqueue_head(&osb->recovery_event);
+ atomic_set(&osb->wake_vote_task, 0);
+ spin_lock_init(&osb->vote_task_lock);
+ init_waitqueue_head(&osb->vote_event);
+ INIT_LIST_HEAD(&osb->blocked_lock_list);
+ osb->blocked_lock_count = 0;
+ INIT_LIST_HEAD(&osb->vote_list);
spin_lock_init(&osb->s_next_gen_lock);
get_random_bytes(&osb->s_next_generation, sizeof(u32));
@@ -1380,93 +1171,33 @@
if (!osb->journal) {
LOG_ERROR_STR("unable to alloc journal");
status = -ENOMEM;
- goto done_nojournal;
+ goto bail;
}
memset(osb->journal, 0, sizeof(ocfs_journal));
- publish_bhs = kmalloc(sizeof(struct buffer_head *) * osb->max_nodes, GFP_KERNEL);
- if (publish_bhs == NULL) {
- LOG_ERROR_STATUS(status = -ENOMEM);
- goto finally;
- }
- memset(publish_bhs, 0, sizeof(struct buffer_head *) * osb->max_nodes);
+ ocfs2_init_node_maps(osb);
- osb->vol_node_map = kmalloc(sizeof(ocfs_vol_node_map) * osb->max_nodes, GFP_KERNEL);
- if (!osb->vol_node_map) {
- LOG_ERROR_STATUS(status = -ENOMEM);
- goto bail;
- }
- memset(osb->vol_node_map, 0, sizeof(ocfs_vol_node_map) * osb->max_nodes);
-
- osb->lock_recovery_lists = kmalloc(sizeof(struct list_head) * osb->max_nodes, GFP_KERNEL);
- if (!osb->lock_recovery_lists) {
- LOG_ERROR_STATUS(status = -ENOMEM);
- goto bail;
- }
- memset(osb->lock_recovery_lists, 0, sizeof(struct list_head) * osb->max_nodes);
-
- osb->last_publ_seq_num = kmalloc(sizeof(__u64) * osb->max_nodes, GFP_KERNEL);
- if (!osb->last_publ_seq_num) {
- LOG_ERROR_STATUS(status = -ENOMEM);
- goto bail;
- }
- memset(osb->last_publ_seq_num, 0, sizeof(__u64) * osb->max_nodes);
-
- osb->node_cfg_info = kmalloc(sizeof(BARF_BARF_BARF *) * osb->max_nodes, GFP_KERNEL);
- if (!osb->node_cfg_info) {
- LOG_ERROR_STATUS(status = -ENOMEM);
- goto bail;
- }
- memset(osb->node_cfg_info, 0, sizeof(BARF_BARF_BARF *) * osb->max_nodes);
-
- ocfs_node_map_init(osb, &osb->publ_map);
-
-
- OCFS_CLEAR_FLAG (osb->osb_flags, OCFS_OSB_FLAGS_SHUTDOWN);
-
INIT_LIST_HEAD (&(osb->osb_next));
snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
- init_MUTEX (&(osb->osb_res));
init_MUTEX (&(osb->recovery_lock));
- init_MUTEX (&(osb->orphan_recovery_lock));
- init_MUTEX (&(osb->comm_lock));
- init_MUTEX (&(osb->cfg_lock));
- init_MUTEX (&(osb->vote_sem));
- spin_lock_init(&osb->recovery_map_lock);
- ocfs_node_map_init(osb, &osb->recovery_map);
-
- osb->needs_flush = 0;
osb->disable_recovery = 0;
+ osb->recovery_launched = 0;
- init_MUTEX (&(osb->publish_lock));
- atomic_set (&osb->node_req_vote, 0);
-
- atomic_set (&osb->num_recovery_threads, 0);
-
- init_waitqueue_head (&osb->nm_init_event);
- atomic_set (&osb->nm_init, 0);
-
- osb->publish_dirty = 0;
- init_waitqueue_head (&osb->flush_event);
- atomic_set (&osb->flush_event_woken, 0);
+ init_waitqueue_head (&osb->checkpoint_event);
+ atomic_set (&osb->needs_checkpoint, 0);
atomic_set (&osb->clean_buffer_seq, 1);
spin_lock_init (&osb->clean_buffer_lock);
- spin_lock_init (&osb->vote_obj_queue_lock);
- INIT_LIST_HEAD (&(osb->vote_obj_queue));
- for (i=0; i<osb->max_nodes; i++) {
- INIT_LIST_HEAD(&(osb->lock_recovery_lists[i]));
- }
osb->node_num = OCFS_INVALID_NODE_NUM;
+ osb->slot_num = OCFS_INVALID_NODE_NUM;
osb->have_local_alloc = 0;
osb->local_alloc_bh = NULL;
- init_waitqueue_head (&osb->open_event);
/* not using any of these sb fields yet */
#if 0
di->i_ctime = cpu_to_le64(format_time); // use this as s_wtime (write time)
@@ -1533,63 +1264,16 @@
goto bail;
}
- status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &p_blkno,
- NULL);
- if (status < 0) {
- LOG_ERROR_STATUS(status);
- goto bail;
- }
-
- // i_size must be at least
- // (2 + osb->max_nodes + 4) + osb->max_nodes + osb->max_nodes
- if (inode->i_size >> osb->sb->s_blocksize_bits <
- (OCFS_VOLCFG_HDR_SECTORS + osb->max_nodes) + // autoconfig
- OCFS_VOLCFG_NEWCFG_SECTORS + // new autoconfig
- osb->max_nodes + // publish
- osb->max_nodes ) { // vote
+ if (inode->i_size >> osb->sb->s_blocksize_bits < OCFS2_MAX_NODES) {
LOG_ERROR_ARGS("dlm area size incorrect: "
"found=%llu, need=%u\n",
inode->i_size,
- (OCFS_VOLCFG_HDR_SECTORS +
- OCFS_VOLCFG_NEWCFG_SECTORS +
- (osb->max_nodes*3)) <<
- osb->sb->s_blocksize_bits);
+ OCFS2_MAX_NODES << osb->sb->s_blocksize_bits);
status = -EINVAL;
goto bail;
}
- osb->autoconfig_blkno = p_blkno;
- osb->autoconfig_blocks = OCFS_VOLCFG_HDR_SECTORS + osb->max_nodes;
-
- osb->new_autoconfig_blkno = osb->autoconfig_blkno + osb->autoconfig_blocks;
- osb->new_autoconfig_blocks = OCFS_VOLCFG_NEWCFG_SECTORS;
- osb->total_autoconfig_blocks = OCFS_VOLCFG_NEWCFG_SECTORS + osb->max_nodes;
-
- osb->publish_blkno = osb->new_autoconfig_blkno + osb->new_autoconfig_blocks;
- osb->publish_blocks = osb->max_nodes;
-
- osb->vote_blkno = osb->publish_blkno + osb->publish_blocks;
- osb->vote_blocks = osb->max_nodes;
-
- printk("autoconfig: blkno=%llu, blocks=%u newblkno=%llu newblocks=%u\n",
- osb->autoconfig_blkno, osb->autoconfig_blocks,
- osb->new_autoconfig_blkno, osb->new_autoconfig_blocks);
- printk("publish: blkno=%llu, blocks=%u\n", osb->publish_blkno,
- osb->publish_blocks);
- printk("vote: blkno=%llu, blocks=%u\n", osb->vote_blkno, osb->vote_blocks);
-
- osb->autoconfig_bhs = ocfs_malloc (osb->total_autoconfig_blocks
- * sizeof(struct buffer_head *));
- if (!osb->autoconfig_bhs) {
- LOG_ERROR_STATUS (status = -ENOMEM);
- goto bail;
- }
- memset(osb->autoconfig_bhs, 0,
- osb->total_autoconfig_blocks * sizeof(struct buffer_head *));
-
iput(inode);
-
-
/*
* global bitmap
@@ -1599,7 +1283,7 @@
LOG_ERROR_STATUS(status = -EINVAL);
goto bail;
}
-
+
osb->bitmap_blkno = OCFS_I(inode)->ip_blkno;
status = ocfs_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0, inode);
@@ -1608,6 +1292,7 @@
LOG_ERROR_STATUS(status);
goto bail;
}
+
di = (ocfs2_dinode *) bitmap_bh->b_data;
osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
osb->num_clusters = le32_to_cpu(di->id1.bitmap1.i_total);
@@ -1615,75 +1300,18 @@
printk("cluster bitmap inode: %llu, clusters per group: %u\n",
osb->bitmap_blkno, osb->bitmap_cpg);
- osb->prealloc_lock = 0;
-
-
- status = ocfs_get_config (osb);
+ status = ocfs2_init_slot_info(osb);
if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finally;
- }
-
- /* Read the Publish Sector of local Node */
- status = ocfs_read_block(osb, (osb->publish_blkno + osb->node_num),
- &publish_bh, 0, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finally;
- }
- publish = (ocfs_publish *) publish_bh->b_data;
-
- /*
- * FIXME: This really ought to be something exported by the
- * identical code in heartbeat.c
- */
- publish->time = jiffies;
- /* Disallow 0 */
- if (!publish->time)
- publish->time = 1;
-
- publish = NULL;
-
- status = ocfs_write_block(osb, publish_bh, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finally;
- }
- brelse(publish_bh);
-
- /* Read disk for all Publish Sectors */
- status = ocfs_read_blocks(osb, osb->publish_blkno, osb->max_nodes,
- publish_bhs, 0, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finally;
- }
-
- ocfs_update_publish_map(osb, publish_bhs, 1);
-
- for(i = 0; i < osb->max_nodes; i++)
- osb->last_publ_seq_num[i] = (__u64) (-1);
-
-
- /* load all node-local system inodes */
- status = ocfs_init_local_system_inodes(osb);
- if (status < 0) {
LOG_ERROR_STATUS(status);
goto bail;
}
- /* We might need to add a variable in Global List of osb to */
- /* delay any creation, if any other node is already creating a file */
-
/* Link this osb onto the global linked list of all osb structures. */
/* The Global Link List is mainted for the whole driver . */
down (&(OcfsGlobalCtxt.global_res));
list_add_tail (&(osb->osb_next), &(OcfsGlobalCtxt.osb_next));
up (&(OcfsGlobalCtxt.global_res));
- /* Mark the fact that this osb structure is initialized. */
- OCFS_SET_FLAG (osb->osb_flags, OCFS_OSB_FLAGS_INITIALIZED);
-
spin_lock (&osb_id_lock);
osb->osb_id = osb_id;
if (osb_id < OCFS_MAX_OSB_ID)
@@ -1696,35 +1324,7 @@
}
spin_unlock (&osb_id_lock);
-
- /* skip the frees which happen on error only */
- goto finally;
-
bail:
- if (osb->autoconfig_bhs)
- kfree(osb->autoconfig_bhs);
- if (osb->vol_node_map)
- kfree(osb->vol_node_map);
- if (osb->lock_recovery_lists)
- kfree(osb->lock_recovery_lists);
- if (osb->last_publ_seq_num)
- kfree(osb->last_publ_seq_num);
- if (osb->node_cfg_info)
- kfree(osb->node_cfg_info);
-finally:
- if (publish) {
- if (publish_bh) {
- brelse(publish_bh);
- }
- }
- if (publish_bhs[0]) {
- int i;
- for(i = 0; i < osb->max_nodes; i++)
- if (publish_bhs[i])
- brelse(publish_bhs[i]);
- }
-
-done_nojournal:
LOG_EXIT_STATUS (status);
return status;
} /* ocfs_initialize_osb */
@@ -1754,7 +1354,6 @@
OCFS2_MAJOR_REV_LEVEL ||
le16_to_cpu(di->id2.i_super.s_minor_rev_level) !=
OCFS2_MINOR_REV_LEVEL) {
-#warning dont know what is appropriate on minor rev difference
LOG_ERROR_ARGS("found superblock with bad version: "
"found %u.%u, should be %u.%u\n",
le16_to_cpu(di->id2.i_super.s_major_rev_level),
@@ -1789,6 +1388,35 @@
return status;
} /* ocfs_verify_volume */
+/* This part of local node recovery needs to happen after we've
+ * discovered all other nodes that need recovery and we've recovered
+ * them. */
+static int ocfs2_complete_mount_recovery(ocfs_super *osb)
+{
+ int status = 0;
+ ocfs2_dinode *local_alloc = osb->local_alloc_copy;
+
+ osb->local_alloc_copy = NULL;
+
+ if (osb->dirty) {
+ status = ocfs_complete_local_alloc_recovery(osb, local_alloc);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto finally;
+ }
+
+ status = ocfs_recover_orphans(osb);
+ if (status < 0)
+ LOG_ERROR_STATUS(status);
+ }
+ osb->dirty = 0;
+
+finally:
+ if (local_alloc)
+ kfree(local_alloc);
+ return status;
+}
+
/*
* ocfs_check_volume()
*
@@ -1796,32 +1424,15 @@
static int ocfs_check_volume (ocfs_super * osb)
{
int status = 0;
- ocfs_publish *publish = NULL;
- int node_num = osb->node_num;
- struct buffer_head * publish_bh = NULL;
- int mounted;
+ int dirty;
ocfs2_dinode *local_alloc = NULL; /* only used if we
* recover
* ourselves. */
LOG_ENTRY ();
- /* Read the node's publish sector */
- status = ocfs_read_block(osb, (osb->publish_blkno + osb->node_num),
- &publish_bh, 0, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finally;
- }
-
- publish = (ocfs_publish *) publish_bh->b_data;
- /* we copy this out of the publish sector and then unlock
- * the bh as other functions will need to modify it. */
- mounted = publish->mounted;
- publish = NULL;
-
/* Init our journal object. */
- status = ocfs_journal_init(osb);
+ status = ocfs_journal_init(osb, &dirty);
if (status < 0) {
LOG_ERROR_STR("Could not initialize journal!");
goto finally;
@@ -1830,7 +1441,7 @@
/* If the journal was unmounted cleanly then we don't want to
* recover anything. Otherwise, journal_load will do that
* dirty work for us :) */
- if (!mounted) {
+ if (!dirty) {
status = ocfs_journal_wipe(osb->journal, 0);
if (status < 0) {
LOG_ERROR_STATUS(status);
@@ -1844,10 +1455,10 @@
/* will play back anything left in the journal. */
ocfs_journal_load(osb->journal);
- if (mounted) {
+ if (dirty) {
/* recover my local alloc if we didn't unmount cleanly. */
status = ocfs_begin_local_alloc_recovery(osb,
- node_num,
+ osb->slot_num,
&local_alloc);
if (status < 0) {
LOG_ERROR_STATUS(status);
@@ -1857,36 +1468,33 @@
* ourselves as mounted. */
}
- /* 'mounted' flag in publish sector should not be set until
- * after we successfully load the journal. */
- status = ocfs_publish_set_mounted(osb, osb->node_num);
- if (status < 0)
- LOG_ERROR_STR("Could not set mounted flag!");
LOG_TRACE_STR("Journal loaded.");
status = ocfs_load_local_alloc(osb);
- if (status < 0)
+ if (status < 0) {
LOG_ERROR_STATUS(status);
+ goto finally;
+ }
- if (mounted) {
- status = ocfs_complete_local_alloc_recovery(osb, local_alloc);
- if (status < 0) {
- LOG_ERROR_STATUS(status);
- goto finally;
- }
-
- status = ocfs_recover_orphans(osb);
- if (status < 0)
- LOG_ERROR_STATUS(status);
+ if (dirty) {
+ /* Recovery will be completed after we've mounted the
+ * rest of the volume. */
+ osb->dirty = 1;
+ osb->local_alloc_copy = local_alloc;
+ local_alloc = NULL;
}
+ /* go through each journal, trylock it and if you get the
+ * lock, and it's marked as dirty, set the bit in the recover
+ * map and launch a recovery thread for it. */
+ status = ocfs2_mark_dead_nodes(osb);
+ if (status < 0)
+ LOG_ERROR_STATUS(status);
+
finally:
if (local_alloc)
kfree(local_alloc);
- if (publish_bh)
- brelse(publish_bh);
-
LOG_EXIT_STATUS (status);
return status;
} /* ocfs_check_volume */
@@ -1902,7 +1510,6 @@
*/
static void ocfs_delete_osb (ocfs_super * osb)
{
- int i;
LOG_ENTRY ();
/* This function assumes that the caller has the main osb resource */
@@ -1914,31 +1521,22 @@
list_del (&(osb->osb_next));
up (&(OcfsGlobalCtxt.global_res));
- for (i=0; i<osb->max_nodes; i++)
- ocfs_recover_oin_locks(osb, i);
+ if (osb->slot_info)
+ ocfs2_free_slot_info(osb->slot_info);
- for(i = 0; i < osb->total_autoconfig_blocks; i++)
- if (osb->autoconfig_bhs[i])
- brelse(osb->autoconfig_bhs[i]);
+ if (osb->group_inode)
+ iput(osb->group_inode);
- if (osb->autoconfig_bhs)
- kfree(osb->autoconfig_bhs);
- if (osb->vol_node_map)
- kfree(osb->vol_node_map);
- if (osb->lock_recovery_lists)
- kfree(osb->lock_recovery_lists);
- if (osb->last_publ_seq_num)
- kfree(osb->last_publ_seq_num);
- if (osb->node_cfg_info)
- kfree(osb->node_cfg_info);
-
/* FIXME
* This belongs in journal shutdown, but because we have to
* allocate osb->journal at the start of ocfs_initalize_osb(),
* we free it here.
*/
kfree(osb->journal);
-
+ if (osb->group_name)
+ kfree(osb->group_name);
+ if (osb->local_alloc_copy)
+ kfree(osb->local_alloc_copy);
memset (osb, 0, sizeof (ocfs_super));
LOG_EXIT ();
Modified: trunk/src/sysfile.c
===================================================================
--- trunk/src/sysfile.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/sysfile.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -46,10 +46,10 @@
/* Tracing */
#define OCFS_DEBUG_CONTEXT OCFS_DEBUG_CONTEXT_SYSFILE
-static struct inode * _ocfs_get_system_file_inode(ocfs_super *osb, int type, __u32 node);
+static struct inode * _ocfs_get_system_file_inode(ocfs_super *osb, int type, __u32 slot);
static inline int is_global_system_inode(int type);
-static inline int is_in_system_inode_array(ocfs_super *osb, int type, __u32 node);
+static inline int is_in_system_inode_array(ocfs_super *osb, int type, __u32 slot);
static inline int is_global_system_inode(int type)
{
@@ -57,19 +57,19 @@
type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE);
}
-static inline int is_in_system_inode_array(ocfs_super *osb, int type, __u32 node)
+static inline int is_in_system_inode_array(ocfs_super *osb, int type, __u32 slot)
{
- return (node == osb->node_num || is_global_system_inode(type));
+ return (slot == osb->slot_num || is_global_system_inode(type));
}
struct inode *ocfs_get_system_file_inode(ocfs_super *osb, int type,
- __u32 node)
+ __u32 slot)
{
struct inode *inode = NULL;
struct inode **arr = NULL;
/* avoid the lookup if cached in local system file array */
- if (is_in_system_inode_array(osb, type, node))
+ if (is_in_system_inode_array(osb, type, slot))
arr = &(osb->system_inodes[type]);
if (arr && ((inode = *arr) != NULL)) {
@@ -82,7 +82,7 @@
}
/* this gets one ref thru iget */
- inode = _ocfs_get_system_file_inode(osb, type, node);
+ inode = _ocfs_get_system_file_inode(osb, type, slot);
/* add one more if putting into array for first time */
if (arr && inode) {
@@ -93,7 +93,7 @@
return inode;
}
-static struct inode * _ocfs_get_system_file_inode(ocfs_super *osb, int type, __u32 node)
+static struct inode * _ocfs_get_system_file_inode(ocfs_super *osb, int type, __u32 slot)
{
char namebuf[40];
struct inode *inode = NULL;
@@ -104,7 +104,7 @@
ocfs2_sprintf_system_inode_name(namebuf,
sizeof(namebuf),
- type, node);
+ type, slot);
status = ocfs_find_files_on_disk(osb, namebuf, strlen(namebuf),
&blkno, osb->sys_root_inode,
Modified: trunk/src/sysfile.h
===================================================================
--- trunk/src/sysfile.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/sysfile.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -26,6 +26,6 @@
#ifndef OCFS2_SYSFILE_H
#define OCFS2_SYSFILE_H
-struct inode * ocfs_get_system_file_inode(ocfs_super *osb, int type, __u32 node);
+struct inode * ocfs_get_system_file_inode(ocfs_super *osb, int type, __u32 slot);
#endif /* OCFS2_SYSFILE_H */
Modified: trunk/src/util.c
===================================================================
--- trunk/src/util.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/util.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -38,8 +38,6 @@
/* Tracing */
#define OCFS_DEBUG_CONTEXT OCFS_DEBUG_CONTEXT_UTIL
-static void ocfs_timeout_func(unsigned long data);
-
/* block all but 'mask' sigs, optionally saving off our previous
* signal state. */
void ocfs_block_sigs(sigset_t *oldsigs, unsigned long mask)
@@ -129,31 +127,6 @@
} /* ocfs_daemonize */
#endif
-/*
- * ocfs_sleep()
- *
- * The interval time is in milliseconds
- *
- * This function needs to be removed.
- * Instead call schedule_timeout() directly and handle signals.
- */
-int ocfs_sleep (__u32 ms)
-{
- __u32 numJiffies;
-
- LOG_ENTRY ();
-
- /* 10ms = 1 jiffy, minimum resolution is one jiffy */
- numJiffies = ms * HZ / 1000;
- numJiffies = (numJiffies < 1) ? 1 : numJiffies;
-
- set_current_state (TASK_INTERRUPTIBLE);
- numJiffies = schedule_timeout (numJiffies);
-
- LOG_EXIT ();
- return 0;
-} /* ocfs_sleep */
-
/* prefetch has been declared to allow to build in debug mode */
#ifdef DEBUG
#ifndef ARCH_HAS_PREFETCH
@@ -163,46 +136,6 @@
#endif
#endif
-
-static void ocfs_timeout_func(unsigned long data)
-{
- ocfs_timeout *to = (ocfs_timeout *)data;
-
- to->timed_out = 1;
- wake_up(&to->wait);
-}
-
-void ocfs_init_timeout(ocfs_timeout *to)
-{
- init_timer(&to->timer);
- to->timer.data = (unsigned long)to;
- to->timer.function = ocfs_timeout_func;
- to->timed_out = 0;
- init_waitqueue_head(&to->wait);
-}
-
-void ocfs_set_timeout(ocfs_timeout *to, __u32 timeout)
-{
- __u32 how_long;
-
- if (!timeout) {
- to->timed_out = 1;
- return ;
- }
-
- how_long = (timeout * HZ / 1000);
- if (how_long < 1)
- how_long = 1;
-
- to->timer.expires = jiffies + how_long;
- add_timer(&to->timer);
-}
-
-void ocfs_clear_timeout(ocfs_timeout *to)
-{
- del_timer_sync(&to->timer);
-}
-
void ocfs_truncate_inode_pages(struct inode *inode, loff_t off)
{
LOG_TRACE_ARGS("truncating pages for inode %llu (%p) from offset %llu\n",
@@ -275,46 +208,5 @@
}
-int __ocfs_wait_atomic_eq(wait_queue_head_t *wq, atomic_t *var, int val, int ms)
-{
- int ret;
- ocfs_timeout timeout;
- DECLARE_WAITQUEUE(wait, current);
- DECLARE_WAITQUEUE(to_wait, current);
- ocfs_init_timeout(&timeout);
- if (ms) {
- ocfs_set_timeout(&timeout, ms);
- if (timeout.timed_out) {
- ocfs_clear_timeout(&timeout);
- }
- }
- add_wait_queue(wq, &wait);
- add_wait_queue(&timeout.wait, &to_wait);
- do {
- ret = 0;
- set_current_state(TASK_INTERRUPTIBLE);
- if (atomic_read(var)==val)
- break;
- ret = -ETIMEDOUT;
- if (timeout.timed_out)
- break;
- schedule();
- if (signal_pending(current)) {
- ret = -EINTR;
- break;
- }
- } while (1);
-
- set_current_state(TASK_RUNNING);
- remove_wait_queue(wq, &wait);
- remove_wait_queue(&timeout.wait, &to_wait);
-
- if (ms)
- ocfs_clear_timeout(&timeout);
-
- return ret;
-}
-
-
Modified: trunk/src/util.h
===================================================================
--- trunk/src/util.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/util.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -26,26 +26,11 @@
#ifndef OCFS2_UTIL_H
#define OCFS2_UTIL_H
-void ocfs_clear_timeout(ocfs_timeout *to);
void ocfs_daemonize(char *name, int len, int shutdown_sigs);
-void ocfs_init_timeout(ocfs_timeout *to);
-void ocfs_set_timeout(ocfs_timeout *to, __u32 timeout);
void ocfs_show_stack(unsigned long *esp);
void ocfs_show_trace(unsigned long *stack);
-int ocfs_sleep(__u32 ms);
void ocfs_truncate_inode_pages(struct inode *inode, loff_t off);
-int __ocfs_wait_atomic_eq(wait_queue_head_t *wq, atomic_t *var, int val, int ms);
void ocfs_block_sigs(sigset_t *oldsigs, unsigned long mask);
void ocfs_unblock_sigs(sigset_t newsig);
-/* exits when var == val, or on timeout */
-static inline int ocfs_wait_atomic_eq(wait_queue_head_t *wq, atomic_t *var, int val, int timeout)
-{
- int ret = 0;
- if (atomic_read(var) != val)
- ret = __ocfs_wait_atomic_eq(wq, var, val, timeout);
- return ret;
-}
-
-
#endif /* OCFS2_UTIL_H */
Deleted: trunk/src/volcfg.c
===================================================================
--- trunk/src/volcfg.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/volcfg.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -1,970 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * volcfg.c
- *
- * Auto configuration, namely, node number.
- *
- * Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include "ocfs_compat.h"
-
-#include <linux/fs.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-
-#include "ocfs_log.h"
-#include "ocfs.h"
-
-#include "util.h"
-#include "volcfg.h"
-
-#include "buffer_head_io.h"
-
-/* Tracing */
-#define OCFS_DEBUG_CONTEXT OCFS_DEBUG_CONTEXT_VOLCFG
-
-static void ocfs_worker (void *arg);
-static void ocfs_assert_lock_owned (unsigned long arg);
-static int ocfs_add_to_disk_config (ocfs_super * osb, __s16 pref_node_num, ocfs_node_config_info * new_disk_node);
-static int ocfs_write_volcfg_header (ocfs_super * osb, ocfs_volcfg_op op);
-static int ocfs_update_disk_config (ocfs_super * osb, __u32 node_num, ocfs_node_config_info * disk);
-static int ocfs_release_disk_lock (ocfs_super * osb, __u64 lock_off);
-static int ocfs_add_node_to_config (ocfs_super * osb);
-static int ocfs_has_node_config_changed (ocfs_super * osb);
-static int ocfs_refresh_node_config (ocfs_super * osb);
-static void ocfs_show_all_node_cfgs (ocfs_super * osb);
-static int ocfs_disknode_to_node (BARF_BARF_BARF ** node, ocfs_node_config_info * disk);
-static void ocfs_volcfg_gblctxt_to_node(BARF_BARF_BARF *node);
-static void ocfs_volcfg_gblctxt_to_disknode(ocfs_node_config_info *disk);
-
-/*
- * ocfs_worker()
- *
- * This function reiterates the lock on the disk from this node once
- * it has obtained it.
- */
-static void ocfs_worker (void *arg)
-{
- __u32 length;
- int status;
- ocfs_super *osb;
- __u64 offset;
- ocfs_cfg_task *cfg_task;
- struct buffer_head *bh;
-
- LOG_ENTRY ();
-
- cfg_task = arg;
-
- /* Obtain the volume for which we need to reiterate the lock */
- osb = cfg_task->osb;
- bh = cfg_task->bh;
- length = osb->sb->s_blocksize;
- offset = cfg_task->lock_off;
-
- /* Write the sector back */
- status = ocfs_write_block(osb, bh, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- /* deliberate no exit jump here */
- }
-
- if (atomic_read (&osb->lock_stop)) {
- LOG_TRACE_ARGS ("Last Lock written : %lu\n", jiffies);
- atomic_set (&osb->lock_event_woken, 1);
- brelse(bh);
- wake_up (&osb->lock_event);
- } else {
- LOG_TRACE_ARGS ("Lock written : %lu\n", jiffies);
- mod_timer (&osb->lock_timer, jiffies + OCFS_VOLCFG_LOCK_ITERATE);
- }
-
- LOG_EXIT ();
- return;
-} /* ocfs_worker */
-
-/*
- * ocfs_assert_lock_owned()
- *
- * Routine called by a timer to reiterate the disk lock.
- */
-static void ocfs_assert_lock_owned (unsigned long arg)
-{
- ocfs_cfg_task *cfg_task;
-
- LOG_ENTRY ();
-
- cfg_task = (ocfs_cfg_task *) arg;
-
- /* initialize the task and submit it */
- INIT_WORK(&cfg_task->cfg_wq, ocfs_worker, cfg_task);
- schedule_work(&cfg_task->cfg_wq);
-
- LOG_EXIT ();
- return ;
-} /* ocfs_assert_lock_owned */
-
-/*
- * ocfs_add_to_disk_config()
- *
- */
-static int ocfs_add_to_disk_config (ocfs_super * osb, __s16 pref_node_num, ocfs_node_config_info * new_disk_node)
-{
- int status = 0;
- int i;
- ocfs_node_config_info *disk_node = NULL;
- __s16 node_num;
- struct buffer_head **cfg_bhs = NULL;
-
- LOG_ENTRY ();
-
- cfg_bhs = kmalloc(sizeof(struct buffer_head *) * osb->max_nodes, GFP_KERNEL);
- if (cfg_bhs == NULL) {
- LOG_ERROR_STATUS(status = -ENOMEM);
- goto finally;
- }
- memset(cfg_bhs, 0, osb->max_nodes * sizeof(struct buffer_head *));
-
- /* Read the nodecfg info for all nodes from disk */
- status = ocfs_read_blocks(osb,
- (osb->autoconfig_blkno + OCFS_VOLCFG_HDR_SECTORS),
- (osb->autoconfig_blocks - OCFS_VOLCFG_HDR_SECTORS),
- cfg_bhs, 0, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finally;
- }
-
- /* Check if preferred node num is available */
- node_num = OCFS_INVALID_NODE_NUM;
- if (pref_node_num < osb->max_nodes) {
- disk_node = (ocfs_node_config_info *) cfg_bhs[pref_node_num]->b_data;
- if (disk_node->node_name[0] == '\0')
- node_num = pref_node_num;
- }
-
- /* if not, find the first available empty slot */
- if (node_num == OCFS_INVALID_NODE_NUM) {
- for (node_num = 0; node_num < osb->max_nodes; node_num++) {
- disk_node = (ocfs_node_config_info *) cfg_bhs[node_num]->b_data;
- if (disk_node->node_name[0] == '\0')
- break;
- }
- }
-
- /* If no free slots, error out */
- if (node_num >= osb->max_nodes) {
- LOG_ERROR_STR ("Unable to allocate node number as no slots " \
- "are available");
- status = -ENOSPC;
- goto finally;
- }
-
- /* Copy the new nodecfg into the memory buffer */
- memcpy (cfg_bhs[node_num]->b_data, new_disk_node, osb->sb->s_blocksize);
-
- /* Write the new node details on disk */
- status = ocfs_write_block(osb, cfg_bhs[node_num], NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finally;
- }
-
- /* Update the nodecfg hdr on disk */
- status = ocfs_write_volcfg_header (osb, OCFS_VOLCFG_ADD);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finally;
- }
-
-finally:
- for (i = 0; i < osb->max_nodes; i++)
- if (cfg_bhs[i])
- brelse(cfg_bhs[i]);
- if (cfg_bhs)
- kfree(cfg_bhs);
-
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_add_to_disk_config */
-
-/*
- * ocfs_write_volcfg_header()
- *
- */
-static int ocfs_write_volcfg_header (ocfs_super * osb, ocfs_volcfg_op op)
-{
- int status = 0;
- ocfs_node_config_hdr *hdr, *hdr_copy;
- struct buffer_head *node_cfg_bhs[2];
-
- LOG_ENTRY ();
-
- node_cfg_bhs[0] = node_cfg_bhs[1] = NULL;
- /* Read the nodecfg header */
- status = ocfs_read_block(osb, osb->autoconfig_blkno, &node_cfg_bhs[0],
- 0, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto bail;
- }
-
- status = ocfs_read_block(osb, (osb->new_autoconfig_blkno + 1),
- &node_cfg_bhs[1], 0, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto bail;
- }
-
- hdr = (ocfs_node_config_hdr *) node_cfg_bhs[0]->b_data;
- hdr_copy = (ocfs_node_config_hdr *) node_cfg_bhs[1]->b_data;
-
- if (op == OCFS_VOLCFG_ADD)
- hdr->num_nodes++;
-
- /* Increment the seq# to trigger other nodes to re-read node cfg */
- hdr->cfg_seq_num++;
-
- memcpy(hdr_copy, hdr, osb->sb->s_blocksize);
- /* Write the nodecfg header */
- /* Write the nodecfg hdr into the second sector of newcfg also. */
- /* We do so so that we can read the nodecfg hdr easily when we */
- /* read the publish sector, for e.g. in ocfs_nm_thread() */
- status = ocfs_write_blocks(osb, node_cfg_bhs, 2, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto bail;
- }
- brelse(node_cfg_bhs[0]);
- brelse(node_cfg_bhs[1]);
- node_cfg_bhs[0] = node_cfg_bhs[1] = NULL;
-
-bail:
- if (node_cfg_bhs[0])
- brelse(node_cfg_bhs[0]);
- if (node_cfg_bhs[1])
- brelse(node_cfg_bhs[1]);
-
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_write_volcfg_header */
-
-/*
- * ocfs_config_with_disk_lock()
- *
- * This function tries to obtain the lock on the disk for the volume
- * specified. The logic for obtaining a disk lock is as follows :
- *
- * Read the volcfg lock sector. If it is not locked, lock it by stamping
- * ones node number. Read the same sector after OCFS_VOLCFG_LOCK_TIME.
- * If the contents have not been modified, the lock is ours. Retain the
- * lock by reiterating the lock write operation every OCFS_VOLCFG_ITERATE_TIME.
- *
- * If the volcfg lock sector is owned by someone else, wait for
- * OCFS_VOLCFG_LOCK_TIME and read the lock sector again. If the lock sector
- * is owned by the same node as before attempt to break the lock as the
- * node may have died. If however, the lock sector is now owned by someone
- * else, wait for OCFS_VOLCFG_LOCK_TIME before repeating the entire exercise
- * again.
- *
- * Returns 0 if success, < 0 if error.
- */
-static int ocfs_config_with_disk_lock (ocfs_super * osb, __u64 lock_off, __u8 * cfg_buf, __s16 node_num, ocfs_volcfg_op op)
-{
- int status = 0;
- char *lock_buf;
- int tried_acq = 0;
- int break_lock = 0;
- ocfs2_disk_lock *disk_lock;
- ocfs_cfg_task *cfg_task;
- __s16 lock_node_num = OCFS_INVALID_NODE_NUM;
- struct buffer_head *bh = NULL;
- int i;
-
- LOG_ENTRY ();
-
- cfg_task = ocfs_malloc (sizeof (ocfs_cfg_task));
- if (cfg_task == NULL)
- {
- LOG_ERROR_STATUS (status = -ENOMEM);
- goto finito;
- }
-
- /* initialize cfg_task with info reqd to reiterate the volcfg lock */
- cfg_task->osb = osb;
- cfg_task->lock_off = lock_off;
-
- /* Initialize the kernel timer */
- init_timer(&osb->lock_timer);
- osb->lock_timer.function = ocfs_assert_lock_owned;
- osb->lock_timer.expires = 0;
- osb->lock_timer.data = (unsigned long) cfg_task;
-
- init_waitqueue_head (&osb->lock_event);
- atomic_set (&osb->lock_event_woken, 0);
- atomic_set (&osb->lock_stop, 0);
-
- status = ocfs_read_block(osb, lock_off >> osb->sb->s_blocksize_bits,
- &bh, 0, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finito;
- }
- cfg_task->bh = bh;
-
- for (i = 0; i < 50; i++) {
- /* Read the volcfg lock sector */
- status = ocfs_read_block(osb,
- lock_off >> osb->sb->s_blocksize_bits,
- &bh, 0, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finito;
- }
-
- disk_lock = (ocfs2_disk_lock *) bh->b_data;
- lock_node_num = disk_lock->dl_master;
-
- if (disk_lock->dl_level == 0 || break_lock) {
- if (disk_lock->dl_level != 0)
- LOG_TRACE_STR ("Try to break node config lock");
- else
- LOG_TRACE_STR ("Lock node config");
-
- /* Attempt to lock volcfg */
- memcpy(disk_lock, cfg_buf, osb->sb->s_blocksize);
-
- disk_lock->dl_master = osb->node_num;
- disk_lock->dl_level = 1;
- memcpy(cfg_buf, disk_lock, osb->sb->s_blocksize);
-
- /* Write into volcfg lock sector... */
- status = ocfs_write_block(osb, bh, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finito;
- }
- tried_acq = 1;
- }
-
- ocfs_sleep (OCFS_VOLCFG_LOCK_TIME);
-
- /* Read the volcfg lock sector again... */
- status = ocfs_read_block(osb,
- lock_off >> osb->sb->s_blocksize_bits,
- &bh, 0, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finito;
- }
-
- lock_buf = bh->b_data;
-
- /* If we tried to acquire and we still own it we take it... */
- if ((tried_acq) && (memcmp (lock_buf, cfg_buf, osb->sb->s_blocksize) == 0)) {
- memcpy (lock_buf, cfg_buf, osb->sb->s_blocksize);
-
- /* Set timer to reiterate lock every few jiffies */
- LOG_TRACE_ARGS ("Start Timer: %lu\n", jiffies);
- osb->lock_timer.expires = jiffies +
- OCFS_VOLCFG_LOCK_ITERATE;
- /* we get_bh here because we brelse later in
- * this function, and so does the timer routine. */
- get_bh(bh);
- add_timer(&osb->lock_timer);
-
- /* Write the config info into the disk */
- disk_lock = (ocfs2_disk_lock *)cfg_buf;
- disk_lock->dl_master = OCFS_INVALID_NODE_NUM;
- disk_lock->dl_level = 0;
-
- if (op == OCFS_VOLCFG_ADD)
- status = ocfs_add_to_disk_config (osb, node_num,
- (ocfs_node_config_info *) cfg_buf);
- else if (op == OCFS_VOLCFG_UPD)
- status = ocfs_update_disk_config(osb, node_num,
- (ocfs_node_config_info *) cfg_buf);
- else
- status = -EINVAL;
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finito;
- }
- break;
- } else {
- disk_lock = (ocfs2_disk_lock *)lock_buf;
- if (disk_lock->dl_master == lock_node_num)
- break_lock = 1;
- else {
- LOG_TRACE_ARGS ("Node config locked by node: %d\n",
- disk_lock->dl_master);
- ocfs_sleep (OCFS_VOLCFG_LOCK_TIME);
- }
- }
-
- }
- if (i >= 50)
- status = -EINVAL;
-
-finito:
- ocfs_release_disk_lock (osb, lock_off);
-
- if (cfg_task)
- kfree(cfg_task);
- if (bh)
- brelse(bh);
-
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_config_with_disk_lock */
-
-/*
- * ocfs_release_disk_lock()
- *
- * This function Cancels the timer to reiterate we own the disk lock and
- * then frees it by writing the sector for the disk lock.
- *
- * Returns 0 if success, < 0 if error.
- */
-static int ocfs_release_disk_lock (ocfs_super * osb, __u64 lock_off)
-{
- int status = 0;
- struct buffer_head *bh;
- struct super_block *sb;
- __u64 blocknum;
-
- LOG_ENTRY ();
-
- sb = osb->sb;
-
- blocknum = lock_off >> sb->s_blocksize_bits;
- bh = sb_getblk(sb, blocknum);
- if (bh == NULL) {
- LOG_ERROR_STATUS (status = -EIO);
- goto finally;
- }
-
- /* reset lock... */
- memset (bh->b_data, 0, osb->sb->s_blocksize);
-
- /* Release the lock */
- status = ocfs_write_block(osb, bh, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finally;
- }
-
- /* Cancel the timer so that we don't reiterate the lock anymore */
- LOG_TRACE_STR ("Waiting for osb->lock_event");
- atomic_set (&osb->lock_stop, 1);
- ocfs_wait (osb->lock_event, atomic_read (&osb->lock_event_woken), 0);
- atomic_set (&osb->lock_event_woken, 0);
- del_timer_sync(&osb->lock_timer);
-
- /* reset lock... */
- memset (bh->b_data, 0, osb->sb->s_blocksize);
-
- /* Release the lock */
- status = ocfs_write_block(osb, bh, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finally;
- }
-
-finally:
- if (bh)
- brelse(bh);
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_release_disk_lock */
-
-/*
- * ocfs_add_node_to_config()
- *
- */
-static int ocfs_add_node_to_config (ocfs_super * osb)
-{
- int status;
- ocfs_node_config_info *disk;
- void *buffer;
- __u64 offset;
-
- LOG_ENTRY ();
-
- buffer = ocfs_malloc (osb->sb->s_blocksize);
- if (buffer == NULL) {
- LOG_ERROR_STATUS (status = -ENOMEM);
- goto bail;
- }
- memset (buffer, 0, osb->sb->s_blocksize);
-
- disk = buffer;
-
- /* populate the disknodecfg info from global context */
- ocfs_volcfg_gblctxt_to_disknode (disk);
-
- /* Write this nodes config onto disk */
- offset = (osb->new_autoconfig_blkno << osb->sb->s_blocksize_bits);
- status = ocfs_config_with_disk_lock (osb, offset, (__u8 *) disk,
- OcfsGlobalCtxt.pref_node_num,
- OCFS_VOLCFG_ADD);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto bail;
- }
-
- status = ocfs_chk_update_config (osb);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto bail;
- }
-
-bail:
- if (buffer)
- kfree(buffer);
-
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_add_node_to_config */
-
-/*
- * ocfs_disknode_to_node()
- *
- */
-static int ocfs_disknode_to_node (BARF_BARF_BARF ** node, ocfs_node_config_info * disk)
-{
- int status = 0;
-
- LOG_ENTRY ();
-
- if (*node == NULL) {
- if ((*node = (BARF_BARF_BARF *)
- ocfs_malloc (sizeof (BARF_BARF_BARF))) == NULL) {
- LOG_ERROR_STATUS (status = -ENOMEM);
- goto bail;
- }
- memset (*node, 0, sizeof (BARF_BARF_BARF));
- }
-
- strncpy ((*node)->node_name, disk->node_name, MAX_NODE_NAME_LENGTH);
-
- memcpy((*node)->guid.guid, disk->guid.guid, OCFS2_GUID_LEN);
-
- (*node)->ipc_config.ip_port = disk->ipc_config.ip_port;
- (*node)->ipc_config.addr_u.ip_addr4 =
- disk->ipc_config.addr_u.ip_addr4;
- (*node)->ipc_config.ip_version = disk->ipc_config.ip_version;
-
-bail:
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_disknode_to_node */
-
-/*
- * ocfs_update_disk_config()
- *
- */
-static int ocfs_update_disk_config (ocfs_super * osb, __u32 node_num, ocfs_node_config_info * disk)
-{
- int status = 0;
- __u32 blocknum;
- struct buffer_head *bh = NULL;
- struct super_block *sb = NULL;
-
- LOG_ENTRY ();
-
- sb = osb->sb;
- /* Write the node details */
- blocknum = osb->autoconfig_blkno + OCFS_VOLCFG_HDR_SECTORS + node_num;
-
- bh = sb_getblk(sb, blocknum);
- if (bh == NULL) {
- status = -EIO;
- LOG_ERROR_STATUS(status);
- goto finally;
- }
-
- memcpy(bh->b_data, disk, osb->sb->s_blocksize);
-
- status = ocfs_write_block(osb, bh, NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finally;
- }
-
- status = ocfs_write_volcfg_header (osb, OCFS_VOLCFG_UPD);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finally;
- }
-
-finally:
- if (bh)
- brelse(bh);
-
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_update_disk_config */
-
-/*
- * ocfs_volcfg_gblctxt_to_disknode()
- *
- */
-static void ocfs_volcfg_gblctxt_to_disknode(ocfs_node_config_info *disk)
-{
- ocfs_ipc_config_info *ipc;
- ocfs_comm_info *g_ipc;
-
- LOG_ENTRY ();
-
- ipc = &(disk->ipc_config);
- g_ipc = &(OcfsGlobalCtxt.comm_info);
-
- if (OcfsGlobalCtxt.node_name)
- strncpy (disk->node_name, OcfsGlobalCtxt.node_name,
- MAX_NODE_NAME_LENGTH);
-
- memcpy(disk->guid.guid, OcfsGlobalCtxt.guid.guid,
- OCFS2_GUID_LEN);
-
- ipc->ip_port = g_ipc->ip_port;
- ipc->ip_version = g_ipc->ip_version;
- ipc->addr_u.ip_addr4 = g_ipc->addr_u.ip_addr4;
-
- LOG_EXIT ();
- return ;
-} /* ocfs_volcfg_gblctxt_to_disknode */
-
-/*
- * ocfs_volcfg_gblctxt_to_node()
- *
- */
-static void ocfs_volcfg_gblctxt_to_node(BARF_BARF_BARF *node)
-{
- ocfs_ipc_config_info *ipc;
- ocfs_comm_info *g_ipc;
-
- LOG_ENTRY ();
-
- ipc = &(node->ipc_config);
- g_ipc = &(OcfsGlobalCtxt.comm_info);
-
- if (OcfsGlobalCtxt.node_name)
- strncpy (node->node_name, OcfsGlobalCtxt.node_name,
- MAX_NODE_NAME_LENGTH);
-
- memcpy(node->guid.guid, OcfsGlobalCtxt.guid.guid,
- OCFS2_GUID_LEN);
-
- ipc->ip_port = g_ipc->ip_port;
- ipc->ip_version = g_ipc->ip_version;
- ipc->addr_u.ip_addr4 = g_ipc->addr_u.ip_addr4;
-
- LOG_EXIT ();
- return ;
-} /* ocfs_volcfg_gblctxt_to_node */
-
-/*
- * ocfs_chk_update_config()
- *
- */
-int ocfs_chk_update_config (ocfs_super * osb)
-{
- int status = 0;
- ocfs_node_config_hdr *hdr = NULL;
- ocfs_node_config_info *disk = NULL;
- __s32 i;
- struct buffer_head **cfg_bhs = NULL;
-
- LOG_ENTRY ();
-
- /* Read in the config on the disk */
- cfg_bhs = ocfs_malloc(osb->autoconfig_blocks *
- sizeof(*cfg_bhs));
- if (cfg_bhs == NULL) {
- status = -ENOMEM;
- LOG_ERROR_STATUS(status);
- goto finally;
- }
- memset(cfg_bhs, 0, osb->autoconfig_blocks * sizeof(*cfg_bhs));
-
- status = ocfs_read_blocks(osb, osb->autoconfig_blkno,
- (osb->autoconfig_blocks), cfg_bhs, 0,
- NULL);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finally;
- }
-
- /* 1st block in buffer is the NodeCfgHdr */
- hdr = (ocfs_node_config_hdr *) cfg_bhs[0]->b_data;
-
- if (strncmp (hdr->signature, OCFS2_NODE_CONFIG_HDR_SIGN,
- OCFS2_NODE_CONFIG_SIGN_LEN)) {
- LOG_ERROR_STR ("Invalid node config signature");
- status = -EINVAL;
- goto finally;
- }
-
- if (hdr->version < OCFS2_NODE_MIN_SUPPORTED_VER ||
- hdr->version > OCFS2_NODE_CONFIG_VER) {
- LOG_ERROR_ARGS ("Node config version mismatch, (%d) < minimum" \
- " (%d) or > current (%d)", hdr->version,
- OCFS2_NODE_MIN_SUPPORTED_VER, OCFS2_NODE_CONFIG_VER);
- status = -EINVAL;
- goto finally;
- }
-
- /* Exit if nodecfg on disk has remained unchanged... */
- if ((osb->cfg_initialized) && (osb->cfg_seq_num == hdr->cfg_seq_num) &&
- (osb->num_cfg_nodes == hdr->num_nodes))
- goto finally;
-
- /* ... else refresh nodecfg in memory */
-
- /* Read the nodecfg for all possible nodes as there may be holes */
- /* i.e., node numbers need not be dolled out in sequence */
- for (i = 0; i < osb->max_nodes; i++) {
- int which;
- which = i + OCFS_VOLCFG_HDR_SECTORS;
- disk = (ocfs_node_config_info *) cfg_bhs[which]->b_data;
-
- if (disk->node_name[0] == '\0')
- continue;
-
- status = ocfs_disknode_to_node (&osb->node_cfg_info[i], disk);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto finally;
- }
-
- /* If nodenum is set, goto next node */
- if (osb->node_num != OCFS_INVALID_NODE_NUM)
- continue;
-
- /*
- * If node num is not set, set it if guid matches.
- * If guid does not match and the hostid also does not
- * match, goto next slot.
- * However if the guid does not natch but the hostid
- * matches, it means that the user re-ran ocfs_uid_gen
- * with the -r option to reclaim its node number. In
- * this case, allow the reclaim only if the user mounts
- * the volume with the reclaimid option. Else, error.
- */
- if (!memcmp(&OcfsGlobalCtxt.guid.guid, disk->guid.guid,
- OCFS2_GUID_LEN)) {
- osb->node_num = i;
- continue;
- }
-
- /* If the hostid does not match, goto next... */
- if (memcmp(&OcfsGlobalCtxt.guid.id.host_id,
- disk->guid.id.host_id,
- OCFS2_GUID_HOSTID_LEN))
- continue;
-
- /* ...else allow node to reclaim the number if reclaimid set */
- if (osb->reclaim_id) {
- osb->node_num = i;
- /* Write this node's cfg with the new guid on disk */
- status = ocfs_refresh_node_config (osb);
- if (status < 0) {
- LOG_ERROR_STATUS(status);
- goto finally;
- }
- }
- else {
- LOG_ERROR_STR("Re-mount volume with the reclaimid " \
- "option to reclaim the node number");
- status = -EBUSY;
- goto finally;
- }
- }
-
- osb->cfg_initialized = 1;
- osb->cfg_seq_num = hdr->cfg_seq_num;
- osb->num_cfg_nodes = hdr->num_nodes;
- LOG_TRACE_ARGS ("Num of configured nodes (%u)\n", osb->num_cfg_nodes);
- IF_TRACE(ocfs_show_all_node_cfgs (osb));
-
-finally:
- if (cfg_bhs) {
- for (i = 0; i < osb->autoconfig_blocks; i++)
- if (cfg_bhs[i])
- brelse(cfg_bhs[i]);
- kfree(cfg_bhs);
- }
-
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_chk_update_config */
-
-/*
- * ocfs_get_config()
- *
- */
-int ocfs_get_config (ocfs_super * osb)
-{
- int status = 0;
-
- LOG_ENTRY ();
-
- /* Update our config info for this volume from the disk */
- status = ocfs_chk_update_config (osb);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto bail;
- }
-
- if (osb->node_num == OCFS_INVALID_NODE_NUM) {
- if (osb->reclaim_id) {
- LOG_ERROR_STR ("unable to reclaim id");
- status = -EINVAL;
- goto bail;
- }
- status = ocfs_add_node_to_config (osb);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto bail;
- }
- } else {
- if (ocfs_has_node_config_changed (osb)) {
- status = ocfs_refresh_node_config (osb);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto bail;
- }
- }
- }
-
- LOG_TRACE_ARGS ("Node Num: %d\n", osb->node_num);
-
-bail:
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_get_config */
-
-/*
- * ocfs_has_node_config_changed()
- *
- */
-static int ocfs_has_node_config_changed (ocfs_super * osb)
-{
- BARF_BARF_BARF *node;
- ocfs_ipc_config_info *ipc;
- ocfs_comm_info *g_ipc;
- int chg = 0;
-
- LOG_ENTRY ();
-
- node = osb->node_cfg_info[osb->node_num];
- ipc = &(node->ipc_config);
- g_ipc = &(OcfsGlobalCtxt.comm_info);
-
- if (OcfsGlobalCtxt.node_name &&
- strncmp (node->node_name, OcfsGlobalCtxt.node_name,
- MAX_NODE_NAME_LENGTH))
- chg = 1;
-
- if (!chg && (ipc->ip_version != g_ipc->ip_version))
- chg = 1;
-
- if (!chg && (ipc->ip_port != g_ipc->ip_port))
- chg = 1;
-
- if (!chg && (ipc->addr_u.ip_addr4 != g_ipc->addr_u.ip_addr4))
- chg = 1;
-
- LOG_EXIT_INT (chg);
- return chg;
-} /* ocfs_has_node_config_changed */
-
-/*
- * ocfs_refresh_node_config()
- *
- */
-static int ocfs_refresh_node_config (ocfs_super * osb)
-{
- BARF_BARF_BARF *node;
- ocfs_node_config_info *disk;
- __u64 offset;
- __u8 *buffer;
- int status;
-
- LOG_ENTRY ();
-
- buffer = ocfs_malloc (osb->sb->s_blocksize);
- if (buffer == NULL) {
- LOG_ERROR_STATUS (status = -ENOMEM);
- goto bail;
- }
-
- memset (buffer, 0, osb->sb->s_blocksize);
- disk = (ocfs_node_config_info *) buffer;
-
- /* populate the nodecfg info in disk from global context */
- ocfs_volcfg_gblctxt_to_disknode (disk);
-
- /* populate the nodecfg info in mem from global context */
- node = osb->node_cfg_info[osb->node_num];
- ocfs_volcfg_gblctxt_to_node (node);
-
- /* Update the nodecfg on disk with the new info */
- offset = (osb->new_autoconfig_blkno << osb->sb->s_blocksize_bits);
- status = ocfs_config_with_disk_lock (osb, offset, (__u8 *) disk,
- osb->node_num, OCFS_VOLCFG_UPD);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto bail;
- }
-
-bail:
- if (buffer)
- kfree(buffer);
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_refresh_node_config */
-
-/*
- * ocfs_show_all_node_cfgs()
- *
- */
-static void ocfs_show_all_node_cfgs (ocfs_super * osb)
-{
- BARF_BARF_BARF *node;
- __u32 i;
-
- for (i = 0; i < osb->max_nodes; i++) {
- node = osb->node_cfg_info[i];
-
- if (!node || node->node_name[0] == '\0')
- continue;
-
- LOG_TRACE_ARGS ("Node (%u) is (%s)\n", i,
- node->node_name);
- LOG_TRACE_ARGS ("ip=0x%08u, port=%d\n",
- ntohl(node->ipc_config.addr_u.ip_addr4),
- ntohs(node->ipc_config.ip_port));
- }
-
- return;
-} /* ocfs_show_all_node_cfgs */
Deleted: trunk/src/volcfg.h
===================================================================
--- trunk/src/volcfg.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/volcfg.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -1,32 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * volcfg.h
- *
- * Function prototypes
- *
- * Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef OCFS2_VOLCFG_H
-#define OCFS2_VOLCFG_H
-
-int ocfs_chk_update_config(ocfs_super *osb);
-int ocfs_get_config(ocfs_super *osb);
-
-#endif /* OCFS2_VOLCFG_H */
Modified: trunk/src/vote.c
===================================================================
--- trunk/src/vote.c 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/vote.c 2004-12-06 21:45:32 UTC (rev 1693)
@@ -3,9 +3,9 @@
*
* vote.c
*
- * netdlm listener, receive, verify and send messages
+ * description here
*
- * Copyright (C) 2002, 2004 Oracle. All rights reserved.
+ * Copyright (C) 2003, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -25,1125 +25,831 @@
#include "ocfs_compat.h"
-#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
-#include <linux/inet.h>
-#include <linux/net.h>
-#include <linux/in.h>
+#include <linux/highmem.h>
+#include <linux/smp_lock.h>
-#include <asm/uaccess.h>
+#include <cluster/util.h>
+#include <cluster/dlmcommon.h>
+#include <cluster/heartbeat.h>
+#include <cluster/nodemanager.h>
+#include <cluster/tcp.h>
+#include <cluster/dlmmod.h>
#include "ocfs_log.h"
#include "ocfs.h"
+#include "ocfs2.h"
-#include "dlm.h"
-#include "nm.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "slot_map.h"
#include "util.h"
#include "vote.h"
-/* Tracing */
-#define OCFS_DEBUG_CONTEXT OCFS_DEBUG_CONTEXT_VOTE
+#include "ocfs_journal.h"
+#include "buffer_head_io.h"
+#define OCFS_DEBUG_CONTEXT OCFS_DEBUG_CONTEXT_VOTE
+#define OCFS2_MESSAGE_TYPE_VOTE (0x1)
+#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
+typedef struct _ocfs2_msg_hdr
+{
+ u32 h_response_id; /* used to lookup message handle on sending
+ * node. */
+ u32 h_request;
+ u64 h_blkno;
+ u32 h_generation;
+ u32 h_node_num; /* node sending this particular message. */
+} ocfs2_msg_hdr;
-ocfs_ipc_ctxt OcfsIpcCtxt;
+typedef struct _ocfs2_vote_msg
+{
+ ocfs2_msg_hdr v_hdr;
+ /* may put stuff in here... */
+} ocfs2_vote_msg;
-static const char vote_state_str[] = { 'U', 'S', 'P', 'F', 'D' };
+typedef struct _ocfs2_response_msg
+{
+ ocfs2_msg_hdr r_hdr;
+ s32 r_response; /* this maps to '0' or a -value in errno.h */
+} ocfs2_response_msg;
-static ocfs_vote_obj * ocfs_alloc_vote_obj (ocfs_super *osb, int bytes, __u32 reqlock, ocfs_node_map * votemap);
-static void ocfs_dlm_recv_msg (void *val);
-static int ocfs_check_ipc_msg (__u8 * msg, __u32 msg_len);
-static int ocfs_comm_process_vote_reply (ocfs_super * osb, ocfs_dlm_msg * dlm_msg);
-static int ocfs_comm_process_msg (__u8 * msg);
-static void ocfs_init_dlm_msg (ocfs_super * osb, ocfs_dlm_msg * dlm_msg, __u32 msg_len, __u32 type);
+typedef struct _ocfs2_vote_work {
+ struct list_head w_list;
+ ocfs2_vote_msg w_msg;
+} ocfs2_vote_work;
-static int ocfs_send_bcast (ocfs_super * osb, ocfs_node_map *votemap, ocfs_dlm_msg * dlm_msg);
-static int ocfs_node_map_stringify(ocfs_node_map *map, char **str);
+enum ocfs2_vote_request {
+ OCFS2_VOTE_REQ_INVALID = 0,
+ OCFS2_VOTE_REQ_DELETE,
+ OCFS2_VOTE_REQ_UNLINK,
+ OCFS2_VOTE_REQ_RENAME,
+ OCFS2_VOTE_REQ_MOUNT,
+ OCFS2_VOTE_REQ_UMOUNT
+};
-static void ocfs_put_vote_obj(ocfs_vote_obj *obj)
+typedef struct _ocfs2_net_wait_ctxt {
+ struct list_head n_list;
+ u32 n_response_id;
+ wait_queue_head_t n_event;
+ ocfs_node_map n_node_map;
+ int n_response; /* an agreggate response. 0 if
+ * all nodes are go, < 0 on any
+ * negative response from any
+ * node or network error. */
+} ocfs2_net_wait_ctxt;
+
+static void ocfs2_vote_thread_do_work(ocfs_super *osb);
+static void ocfs2_process_vote(ocfs_super *osb,
+ ocfs2_vote_msg *msg);
+static int ocfs2_do_request_vote(ocfs_super *osb,
+ u64 blkno,
+ unsigned int generation,
+ enum ocfs2_vote_request type);
+
+static void ocfs2_process_mount_request(ocfs_super *osb,
+ unsigned int node_num)
{
- if (atomic_dec_and_test(&obj->refcount))
- kfree(obj);
+ printk("MOUNT vote from node %u\n", node_num);
+ /* The other node only sends us this message when he has an EX
+ * on the superblock, so our recovery threads (if having been
+ * launched) are waiting on it.*/
+ ocfs_recovery_map_clear(osb, node_num);
+ ocfs_node_map_set_bit(osb, &osb->mounted_map, node_num);
}
-static void ocfs_get_vote_obj(ocfs_vote_obj *obj)
+static void ocfs2_process_umount_request(ocfs_super *osb,
+ unsigned int node_num)
{
- atomic_inc(&obj->refcount);
+ printk("UMOUNT vote from node %u\n", node_num);
+ ocfs_node_map_clear_bit(osb, &osb->mounted_map, node_num);
+ ocfs_node_map_set_bit(osb, &osb->umount_map, node_num);
}
-
-/*
- * ocfs_recv_udp_msg()
- *
- */
-int ocfs_recv_udp_msg (ocfs_recv_ctxt * recv_ctxt)
+static int ocfs2_process_delete_request(struct inode *inode)
{
- int status = -ENETUNREACH, error;
- mm_segment_t oldfs;
- struct sockaddr_in sin;
- struct iovec iov = {
- .iov_len = recv_ctxt->msg_len,
- .iov_base = recv_ctxt->msg
- };
- struct msghdr msg = {
- .msg_control = NULL,
- .msg_controllen = 0,
- .msg_iovlen = 1,
- .msg_iov = &iov,
- .msg_name = (struct sockaddr *) &sin,
- .msg_namelen = sizeof (sin),
- .msg_flags = 0
- };
+ int response = -EBUSY;
- LOG_ENTRY ();
+ LOG_TRACE_ARGS("DELETE vote on inode %lu, read "
+ "lnk_cnt = %u\n", inode->i_ino,
+ inode->i_nlink);
- /* Initialize the workitem with our worker routine and Q it. */
- INIT_WORK (&recv_ctxt->ipc_wq, ocfs_dlm_recv_msg, recv_ctxt);
+ /* force this as ours may be out of date. */
+ inode->i_nlink = 0;
- memset (&sin, 0, sizeof (sin));
- oldfs = get_fs ();
- set_fs (get_ds ());
- error = sock_recvmsg (OcfsIpcCtxt.recv_sock, &msg, recv_ctxt->msg_len, msg.msg_flags);
- set_fs (oldfs);
+ spin_lock(&OCFS_I(inode)->ip_lock);
+ /* vote no if the file is still open. */
+ if (OCFS_I(inode)->ip_open_cnt > 0) {
+ LOG_TRACE_PROCESS_VOTE("open count = %u\n",
+ OCFS_I(inode)->ip_open_cnt);
+ spin_unlock(&OCFS_I(inode)->ip_lock);
+ goto done;
+ }
+ spin_unlock(&OCFS_I(inode)->ip_lock);
- if (error < 0) {
- if (error == -ERESTARTSYS) {
- status = -EBADF;
- LOG_TRACE_STR ("Shutting down ocfs2lsnr");
- } else {
- status = -ENETUNREACH;
- LOG_ERROR_ARGS ("unable to recvmsg, error=%d", error);
- LOG_ERROR_STATUS (status);
- }
- goto bail;
- } else if (msg.msg_namelen) {
- recv_ctxt->msg_len = iov.iov_len;
- status = 0;
+ /* directories are a bit ugly... What if someone is sitting in
+ * it? We want to make sure the inode is removed completely as
+ * a result of the iput in process_vote. */
+ if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) {
+ LOG_TRACE_PROCESS_VOTE("i_count = %u\n",
+ atomic_read(&inode->i_count));
+ goto done;
}
- LOG_TRACE_ARGS ("Received packet from: %d.%d.%d.%d\n",
- NIPQUAD (sin.sin_addr.s_addr));
+ /* If we get here, then we're voting 'yes', so commit the
+ * delete on our side. */
+ response = 0;
- if (status == 0)
- schedule_work(&recv_ctxt->ipc_wq);
+ spin_lock(&OCFS_I(inode)->ip_lock);
+ SET_INODE_DELETED(inode);
+ /* We set the SKIP_DELETE flag on the inode so we don't try to
+ * delete it in delete_inode ourselves. */
+ OCFS_I(inode)->ip_flags |= OCFS_INODE_SKIP_DELETE;
+ spin_unlock(&OCFS_I(inode)->ip_lock);
-bail:
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_recv_udp_msg */
+ d_prune_aliases (inode);
+ /* TODO: How much of this is really necessary? */
+ sync_mapping_buffers(inode->i_mapping);
+ ocfs_truncate_inode_pages(inode, 0);
+ ocfs2_extent_map_trunc(inode, 0);
-static inline int ocfs2_comm_ip_version_to_family(u16 ip_version)
-{
- switch (ntohs(ip_version)) {
- case 4:
- return PF_INET;
- case 6:
- return PF_INET6;
- default:
- BUG();
- }
-
- return 4;
+done:
+ return response;
}
-/*
- * ocfs_send_bcast()
- *
- */
-static int ocfs_send_bcast (ocfs_super * osb, ocfs_node_map *votemap, ocfs_dlm_msg * dlm_msg)
+static void ocfs2_process_dentry_request(struct inode *inode,
+ int rename)
{
- int status = 0, error;
- __s16 num;
- BARF_BARF_BARF *node;
- struct sockaddr_in sin;
- mm_segment_t oldfs;
+ d_prune_aliases (inode);
- LOG_ENTRY ();
-
- oldfs = get_fs ();
- for (num=0; num<osb->max_nodes; num++) {
- if (num == osb->node_num)
- continue;
-
- if (!ocfs_node_map_test_bit(votemap, num))
- continue;
-
- node = osb->node_cfg_info[num];
- if (!node)
- continue;
-
- LOG_TRACE_ARGS("Sending msg to node=%u, name=%s\n",
- num, node->node_name);
- memset (&sin, 0, sizeof (sin));
- sin.sin_family = ocfs2_comm_ip_version_to_family(node->ipc_config.ip_version);
- sin.sin_addr.s_addr = node->ipc_config.addr_u.ip_addr4;
- sin.sin_port = node->ipc_config.ip_port;
-
- LOG_TRACE_ARGS("about to send to 0x%08u:%u\n",
- ntohl(node->ipc_config.addr_u.ip_addr4),
- ntohs(node->ipc_config.ip_port));
-
- status = -ENETUNREACH;
- if (OcfsIpcCtxt.send_sock) {
- struct iovec iov = {
- .iov_base = dlm_msg,
- .iov_len = dlm_msg->msg_len
- };
- struct msghdr msg = {
- .msg_iov = &iov,
- .msg_iovlen = 1,
- .msg_control = NULL,
- .msg_controllen = 0,
- .msg_name = (struct sockaddr *) &sin,
- .msg_namelen = sizeof (sin),
- .msg_flags = 0
- };
-
- status = 0;
- set_fs (get_ds ());
- error = sock_sendmsg (OcfsIpcCtxt.send_sock, &msg, dlm_msg->msg_len);
- set_fs (oldfs);
-
- if (error < 0) {
- LOG_ERROR_ARGS ("unable to sendmsg, error=%d", error);
- status = -ENETUNREACH;
- }
- }
- if (status < 0)
- LOG_ERROR_STATUS (status);
+ /* for rename, we don't drop link counts */
+ if (!rename) {
+ if (S_ISDIR(inode->i_mode))
+ inode->i_nlink = 0;
+ else
+ inode->i_nlink--;
}
+}
- status = 0;
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_send_bcast */
-
-/*
- * ocfs_init_udp_sock()
- *
- */
-int ocfs_init_udp_sock(struct socket **send_sock,
- struct socket **recv_sock)
+static void ocfs2_process_vote(ocfs_super *osb,
+ ocfs2_vote_msg *msg)
{
- struct sockaddr_in sin;
- int status;
- ocfs_comm_info *comm;
+ int net_status, vote_response;
+ int rename = 0;
+ unsigned int node_num, generation;
+ u64 blkno;
+ enum ocfs2_vote_request request;
+ struct inode *inode = NULL;
+ struct inode *remote_node;
+ ocfs2_msg_hdr *hdr = &msg->v_hdr;
+ ocfs2_response_msg response;
- LOG_ENTRY ();
+ /* decode the network mumbo jumbo into local variables. */
+ request = ntohl(hdr->h_request);
+ blkno = be64_to_cpu(hdr->h_blkno);
+ generation = ntohl(hdr->h_generation);
+ node_num = ntohl(hdr->h_node_num);
- comm = &(OcfsGlobalCtxt.comm_info);
+ printk("ocfs2: processing vote: request = %u, blkno = %llu, "
+ "generation = %u, node_num = %u\n", request, blkno, generation,
+ node_num);
- /* Create Send Socket */
- status = sock_create(ocfs2_comm_ip_version_to_family(comm->ip_version),
- SOCK_DGRAM, IPPROTO_UDP,
- send_sock);
- if (status < 0) {
- LOG_ERROR_ARGS ("unable to create socket, error=%d", status);
- goto bail;
+ vote_response = 0;
+ switch (request) {
+ case OCFS2_VOTE_REQ_UMOUNT:
+ ocfs2_process_umount_request(osb, node_num);
+ goto respond;
+ case OCFS2_VOTE_REQ_MOUNT:
+ ocfs2_process_mount_request(osb, node_num);
+ goto respond;
+ default:
+ /* avoids a gcc warning */
+ break;
}
- /* Bind Send Socket */
- memset(&sin, 0, sizeof (sin));
- sin.sin_family = ocfs2_comm_ip_version_to_family(comm->ip_version);
- sin.sin_addr.s_addr = htonl (INADDR_ANY);
- sin.sin_port = htons(0);
+ /* We cannot process the remaining message types before we're
+ * fully mounted. It's perfectly safe however to send a 'yes'
+ * response as we can't possibly have any of the state they're
+ * asking us to modify yet. */
+ if (atomic_read(&osb->vol_state) == VOLUME_INIT)
+ goto respond;
- status = (*send_sock)->ops->bind(*send_sock,
- (struct sockaddr *)&sin,
- sizeof(sin));
- if (status < 0) {
- LOG_ERROR_ARGS ("unable to bind socket, error=%d", status);
- goto bail;
- }
+ vote_response = -EINVAL;
+ /* If we get here, then the request is against an inode. */
+ inode = ocfs_ilookup(osb, blkno);
+ if (!inode)
+ goto respond;
- /* Create Receive Socket */
- status = sock_create(ocfs2_comm_ip_version_to_family(comm->ip_version),
- SOCK_DGRAM, IPPROTO_UDP,
- recv_sock);
- if (status < 0) {
- LOG_ERROR_ARGS ("unable to create socket, error=%d", status);
- goto bail;
+ OCFS_ASSERT(inode->i_generation == generation);
+
+ switch (request) {
+ case OCFS2_VOTE_REQ_DELETE:
+ vote_response = ocfs2_process_delete_request(inode);
+ break;
+ case OCFS2_VOTE_REQ_RENAME:
+ rename = 1;
+ /* fall through */
+ case OCFS2_VOTE_REQ_UNLINK:
+ ocfs2_process_dentry_request(inode, rename);
+ break;
+ default:
+ printk("ocfs2_process_vote: node %u, invalid request: %u\n",
+ node_num, request);
}
+respond:
+ /* Response struture is small so we just put it on the stack
+ * and stuff it inline. */
+ memset(&response, 0, sizeof(ocfs2_response_msg));
+ response.r_hdr.h_response_id = hdr->h_response_id;
+ response.r_hdr.h_blkno = hdr->h_blkno;
+ response.r_hdr.h_generation = hdr->h_generation;
+ response.r_hdr.h_node_num = htonl(osb->node_num);
+ response.r_response = htonl(vote_response);
- /* Bind Receive Socket */
- memset(&sin, 0, sizeof(sin));
- sin.sin_family = ocfs2_comm_ip_version_to_family(comm->ip_version);
- sin.sin_addr.s_addr = htonl(INADDR_ANY);
- sin.sin_port = comm->ip_port;
-
- status = (*recv_sock)->ops->bind(*recv_sock,
- (struct sockaddr *)&sin,
- sizeof(sin));
- if (status < 0) {
- LOG_ERROR_ARGS ("unable to bind socket, error=%d", status);
- goto bail;
+ remote_node = nm_get_node_by_num(node_num);
+ if (!remote_node) {
+ LOG_ERROR_ARGS("Couldn't get inode for node %u!\n", node_num);
+ } else {
+ net_status = net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
+ osb->net_key,
+ &response,
+ sizeof(ocfs2_response_msg),
+ remote_node,
+ NULL);
+ if (net_status < 0)
+ LOG_ERROR_ARGS("message to node %u fails with error "
+ "%d!\n", node_num, net_status);
+ iput(remote_node);
}
-bail:
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_init_udp_sock */
+ if (inode)
+ iput(inode);
+}
-
-/*
- * ocfs_send_vote_reply()
- *
- */
-int ocfs_send_vote_reply (ocfs_super * osb, ocfs_dlm_msg * dlm_msg, __u32 vote_status)
+static void ocfs2_vote_thread_do_work(ocfs_super *osb)
{
- ocfs_dlm_req_master *req_master;
- ocfs_dlm_reply_master *reply_master;
- ocfs_dlm_msg *send_dlm_msg;
- ocfs_vote_obj *obj;
- int status = 0;
- __u8 *buf;
- __u32 msg_len, obj_len;
- ocfs_node_map vote_map;
+ unsigned long processed;
+ ocfs2_lock_res *lockres;
+ ocfs2_vote_work *work;
- LOG_ENTRY ();
+ spin_lock(&osb->vote_task_lock);
+ processed = osb->blocked_lock_count;
+ while (processed) {
+ OCFS_ASSERT(!list_empty(&osb->blocked_lock_list));
- ocfs_node_map_init(osb, &vote_map);
+ lockres = list_entry(osb->blocked_lock_list.next,
+ ocfs2_lock_res, l_blocked_list);
+ list_del_init(&lockres->l_blocked_list);
+ osb->blocked_lock_count--;
+ spin_unlock(&osb->vote_task_lock);
- req_master = (ocfs_dlm_req_master *) dlm_msg->msg_buf;
+ OCFS_ASSERT(processed);
+ processed--;
- msg_len = sizeof (ocfs_dlm_msg) + sizeof (ocfs_dlm_reply_master);
- obj_len = sizeof (ocfs_vote_obj) + sizeof (ocfs_dlm_reply_master);
+ ocfs2_process_blocked_lock(osb, lockres);
- obj = ocfs_alloc_vote_obj (osb, obj_len, 0, NULL);
- if (obj == NULL) {
- LOG_ERROR_STATUS (status = -ENOMEM);
- goto finally;
+ spin_lock(&osb->vote_task_lock);
}
- buf = (__u8 *)&(obj->m);
- send_dlm_msg = (ocfs_dlm_msg *)buf;
- reply_master = (ocfs_dlm_reply_master *) send_dlm_msg->msg_buf;
-
- ocfs_init_dlm_msg (osb, send_dlm_msg, msg_len, OCFS_VOTE_REPLY);
- reply_master->h.lock_id = req_master->lock_id;
- reply_master->status = vote_status;
- reply_master->h.lock_seq_num = req_master->lock_seq_num;
- reply_master->h.flags = req_master->flags;
+ while (osb->vote_count) {
+ OCFS_ASSERT(!list_empty(&osb->vote_list));
+ work = list_entry(osb->vote_list.next,
+ ocfs2_vote_work, w_list);
+ list_del(&work->w_list);
+ osb->vote_count--;
+ spin_unlock(&osb->vote_task_lock);
- ocfs_node_map_set_bit(&vote_map, dlm_msg->src_node);
- ocfs_node_map_set(&obj->req_vote_map, &vote_map);
+ ocfs2_process_vote(osb, &work->w_msg);
+ kfree(work);
- spin_lock(&osb->vote_obj_queue_lock);
- list_add_tail(&obj->list, &osb->vote_obj_queue);
- spin_unlock(&osb->vote_obj_queue_lock);
+ spin_lock(&osb->vote_task_lock);
+ }
+ spin_unlock(&osb->vote_task_lock);
+}
- ocfs_send_bcast (osb, &vote_map, send_dlm_msg);
- spin_lock (&obj->lock);
- obj->vote_state = VOTE_OBJ_STATE_SENT;
- spin_unlock (&obj->lock);
+static inline int ocfs2_vote_thread_has_work(ocfs_super *osb)
+{
+ if (list_empty(&osb->blocked_lock_list) &&
+ list_empty(&osb->vote_list))
+ return 0;
- // silly ;-)
- spin_lock (&obj->lock);
- obj->vote_state = VOTE_OBJ_STATE_DESTROYING;
- spin_unlock (&obj->lock);
+ return 1;
+}
- spin_lock(&osb->vote_obj_queue_lock);
- list_del(&obj->list);
- spin_unlock(&osb->vote_obj_queue_lock);
+int ocfs2_vote_thread(void *arg)
+{
+ int status = 0;
+ ocfs_super *osb = arg;
+ char proc[16];
-finally:
- ocfs_put_vote_obj (obj);
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_send_vote_reply */
+ sprintf (proc, "ocfs2vote-%d", osb->osb_id);
+ ocfs_daemonize (proc, strlen(proc), 0);
+ spin_lock(&osb->vote_task_lock);
+ osb->vote_task = current;
+ init_completion (&osb->vote_event_complete);
-/*
- * ocfs_check_ipc_msg()
- *
- */
-int ocfs_check_ipc_msg (__u8 * msg, __u32 msg_len)
-{
- int ret = 0;
- ocfs_dlm_msg *dlm_msg;
+ complete(&osb->vote_event_init);
- LOG_ENTRY ();
+ while (1) {
+ if (osb->vote_exit) {
+ if (!ocfs2_vote_thread_has_work(osb))
+ break;
+ /* don't want to sleep if we're supposed to quit. */
+ atomic_set(&osb->wake_vote_task, 1);
+ }
+ spin_unlock(&osb->vote_task_lock);
- dlm_msg = (ocfs_dlm_msg *) msg;
+ wait_event_interruptible(osb->vote_event,
+ atomic_read(&osb->wake_vote_task));
- if (dlm_msg == NULL) {
- LOG_TRACE_STR("Null netdlm message");
- goto bail;
+ atomic_set(&osb->wake_vote_task, 0);
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
+ printk("(%u) vote_thread: awoken\n", current->pid);
+#endif
+ ocfs2_vote_thread_do_work(osb);
+ spin_lock(&osb->vote_task_lock);
}
- if (msg_len < sizeof(ocfs_dlm_msg)) {
- LOG_TRACE_STR("Netdlm message too short");
- goto bail;
- }
+ osb->vote_task = NULL;
+ spin_unlock(&osb->vote_task_lock);
- /* Compute and Compare the checksum */
- if (dlm_msg->magic != OCFS_DLM_MSG_MAGIC) {
- LOG_TRACE_ARGS ("Magic number mismatch in netdlm message: "
- "0x%08x != 0x%08x\n",
- dlm_msg->magic, OCFS_DLM_MSG_MAGIC);
- goto bail;
- }
+ complete(&osb->vote_event_complete);
+ return status;
+}
- ret = 1;
-
-bail:
- LOG_EXIT_INT (ret);
- return ret;
-} /* ocfs_check_ipc_msg */
-
-
-int ocfs_lookup_obj_for_proc (ocfs_vote_obj *obj, ocfs_vote_obj_lookup_data *data)
+static ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(ocfs_super *osb,
+ unsigned int response_id)
{
- int status = -ENOENT;
- ocfs_dlm_msg *dlm_msg = NULL;
- ocfs_dlm_msg_hdr *request = NULL;
- ocfs_dlm_reply_master *reply = NULL;
- int *len = data->u.proc.len;
- int max = data->u.proc.max - *len;
- char *p = data->u.proc.page + *len;
- int ret = 0;
- char *reqstr=NULL, *gotstr=NULL;
+ ocfs2_net_wait_ctxt *w;
- /* just run thru everything to populate /proc */
- /* return -ENOENT to keep going */
- dlm_msg = &(obj->m);
-
- switch (dlm_msg->msg_type) {
- case OCFS_VOTE_REQUEST:
- request = (ocfs_dlm_msg_hdr *) dlm_msg->msg_buf;
-
- if (ocfs_node_map_stringify(&obj->req_vote_map, &reqstr) < 0)
- break;
- if (ocfs_node_map_stringify(&obj->got_vote_map, &gotstr) < 0)
- break;
- ret = snprintf(p, max, "REQST: %d %c %3d %s %21llu %21llu %08x | %s\n",
- obj->pid,
- vote_state_str[obj->vote_state],
- obj->vote_status,
- reqstr,
- request->lock_id,
- request->lock_seq_num,
- request->flags, gotstr);
- break;
- case OCFS_VOTE_REPLY:
- reply = (ocfs_dlm_reply_master *) dlm_msg->msg_buf;
- if (ocfs_node_map_stringify(&obj->req_vote_map, &reqstr) < 0)
- break;
- ret = snprintf(p, max, "REPLY: %d %c %3d %s %21llu %21llu %08x | %3d\n",
- obj->pid,
- vote_state_str[obj->vote_state],
- obj->vote_status,
- reqstr,
- reply->h.lock_id,
- reply->h.lock_seq_num,
- reply->h.flags,
- reply->status);
-
- break;
- case OCFS_INFO_DISMOUNT:
- ret = snprintf(p, max, "UNMNT: %d\n", obj->pid);
- break;
- default:
- ret = snprintf(p, max, "BAD!!: %d\n", obj->pid);
- break;
+ w = kmalloc(sizeof(*w), GFP_KERNEL);
+ if (!w) {
+ LOG_ERROR_STATUS(-ENOMEM);
+ goto bail;
}
- (*len) += ret;
- p[max-1] = '\0';
+ memset(w, 0, sizeof(*w));
- if (reqstr)
- kfree(reqstr);
- if (gotstr)
- kfree(gotstr);
- return status;
+ INIT_LIST_HEAD(&w->n_list);
+ init_waitqueue_head(&w->n_event);
+ ocfs_node_map_init(osb, &w->n_node_map);
+ w->n_response_id = response_id;
+bail:
+ return w;
}
-
-int ocfs_lookup_obj_by_lockid (ocfs_vote_obj *obj, ocfs_vote_obj_lookup_data *data)
+static unsigned int ocfs2_new_response_id(ocfs_super *osb)
{
- int status = 0;
- ocfs_dlm_msg *dlm_msg = NULL;
- ocfs_dlm_msg_hdr *req = NULL;
+ unsigned int ret;
- dlm_msg = &(obj->m);
- req = (ocfs_dlm_msg_hdr *) dlm_msg->msg_buf;
- if (dlm_msg->msg_type != OCFS_VOTE_REQUEST ||
- obj->vote_state == VOTE_OBJ_STATE_DESTROYING ||
- req->lock_id != data->u.s.lock_id) {
- status = -ENOENT;
- }
- return status;
+ spin_lock(&osb->net_response_lock);
+ ret = ++osb->net_response_ids;
+ spin_unlock(&osb->net_response_lock);
+
+ return ret;
}
-static int ocfs_lookup_obj_by_seq (ocfs_vote_obj *obj, ocfs_vote_obj_lookup_data *data)
+static void ocfs2_dequeue_net_wait_ctxt(ocfs_super *osb,
+ ocfs2_net_wait_ctxt *w)
{
- int status = -ENOENT;
- ocfs_dlm_msg *dlm_msg = NULL;
- ocfs_dlm_msg_hdr *req = NULL;
+ spin_lock(&osb->net_response_lock);
+ list_del(&w->n_list);
+ spin_unlock(&osb->net_response_lock);
+}
- if (obj->seq_num == data->u.s.seq_num) {
- status = 0;
- dlm_msg = &(obj->m);
- req = (ocfs_dlm_msg_hdr *) dlm_msg->msg_buf;
- // error if there is a non-request with a matching seqnum, or
- // a vote object that is in too early or too late a state, or
- // a vote object with the right seqnum but wrong lockid
- if (dlm_msg->msg_type != OCFS_VOTE_REQUEST ||
- obj->vote_state == VOTE_OBJ_STATE_DESTROYING ||
- obj->vote_state == VOTE_OBJ_STATE_UNSENT ||
- req->lock_id != data->u.s.lock_id) {
- LOG_ERROR_ARGS("bad message: vote_state=%d type=%d "
- "lockid=%llu expected=%llu\n",
- obj->vote_state, dlm_msg->msg_type,
- req->lock_id, data->u.s.lock_id);
- status = -EINVAL;
- }
- }
- return status;
+static void ocfs2_queue_net_wait_ctxt(ocfs_super *osb,
+ ocfs2_net_wait_ctxt *w)
+{
+ spin_lock(&osb->net_response_lock);
+ list_add_tail(&w->n_list,
+ &osb->net_response_list);
+ spin_unlock(&osb->net_response_lock);
}
-/*
- * returns an ocfs_vote_obj with a ref on it or NULL
- */
-int ocfs_lookup_vote_request_obj (ocfs_super *osb, ocfs_vote_obj_lookup_data *data)
+#define OCFS2_RESPONSE_WAIT_JIFFIES (30 * HZ)
+static int ocfs2_wait_on_vote_responses(ocfs_super *osb,
+ ocfs2_net_wait_ctxt *w)
{
- int status = -ENOENT;
- struct list_head *iter;
- ocfs_vote_obj *obj = NULL;
+ int status = 0;
+ signed long timeout = OCFS2_RESPONSE_WAIT_JIFFIES;
+ DECLARE_WAITQUEUE(wait, current);
- spin_lock(&osb->vote_obj_queue_lock);
+ add_wait_queue(&w->n_event, &wait);
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
- list_for_each (iter, &osb->vote_obj_queue) {
- obj = list_entry (iter, ocfs_vote_obj, list);
- ocfs_get_vote_obj (obj);
- spin_lock(&obj->lock);
- status = data->func(obj, data);
- spin_unlock(&obj->lock);
- if (status < 0) {
- ocfs_put_vote_obj (obj);
- obj = NULL;
- }
- if (status != -ENOENT)
+ if (ocfs_node_map_is_empty(osb, &w->n_node_map))
break;
- obj = NULL;
+
+ if (!signal_pending(current)) {
+ timeout = schedule_timeout(timeout);
+ if (!timeout) {
+ status = -ETIMEDOUT;
+ break;
+ }
+ continue;
+ }
+ status = -ERESTARTSYS;
+ break;
}
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&w->n_event, &wait);
- spin_unlock(&osb->vote_obj_queue_lock);
-
- // return the obj, or drop the ref
- if (data->ret)
- *(data->ret) = obj;
- else if (obj)
- ocfs_put_vote_obj (obj);
return status;
}
-
-/*
- * ocfs_comm_process_vote_reply()
- *
- */
-int ocfs_comm_process_vote_reply (ocfs_super * osb, ocfs_dlm_msg * dlm_msg)
+static int ocfs2_broadcast_vote(ocfs_super *osb,
+ ocfs2_vote_msg *request,
+ unsigned int response_id)
{
- int status = 0;
- ocfs_dlm_reply_master *reply;
- ocfs_dlm_msg_hdr *reply_msg;
- ocfs_vote_reply_ctxt ctxt;
- ocfs_vote_obj *obj = NULL;
- ocfs_vote_obj_lookup_data data;
+ int status, i, remote_err;
+ ocfs2_net_wait_ctxt *w = NULL;
+ struct inode *remote_node;
- LOG_ENTRY ();
-
- down (&(osb->comm_lock));
-
- reply = (ocfs_dlm_reply_master *) dlm_msg->msg_buf;
- reply_msg = &(reply->h);
-
- /* find the original request object for this reply */
- data.u.s.seq_num = reply_msg->lock_seq_num;
- data.u.s.lock_id = reply_msg->lock_id;
- data.func = ocfs_lookup_obj_by_seq;
- data.ret = &obj;
- status = ocfs_lookup_vote_request_obj (osb, &data);
- if (status < 0 || obj==NULL) {
- LOG_ERROR_STATUS (status);
+ w = ocfs2_new_net_wait_ctxt(osb, response_id);
+ if (!w) {
+ status = -ENOMEM;
+ LOG_ERROR_STATUS(status);
goto bail;
}
- spin_lock(&obj->lock);
- if (obj->vote_state != VOTE_OBJ_STATE_SENT &&
- obj->vote_state != VOTE_OBJ_STATE_PARTIAL_REPLY) {
- LOG_ERROR_ARGS("bad vote reply state=%d, node=%u, lockid=%llu, seq=%llu, vote=%d\n",
- obj->vote_state, dlm_msg->src_node,
- reply_msg->lock_id,
- reply_msg->lock_seq_num, reply->status);
- status = -EINVAL;
- goto unlock;
- }
+ /* we're pretty much ready to go at this point, and this fills
+ * in n_response which we need anyway... */
+ ocfs2_queue_net_wait_ctxt(osb, w);
- LOG_TRACE_ARGS("node=%u, lockid=%llu, seq=%llu, vote=%d\n",
- dlm_msg->src_node, reply_msg->lock_id,
- reply_msg->lock_seq_num, reply->status);
-
- ctxt.got_vote_map = &(obj->got_vote_map);
- ctxt.status = &(obj->vote_status);
- ctxt.flags = reply_msg->flags;
- ctxt.reply = reply;
+ i = ocfs_node_map_iterate(osb, &osb->mounted_map, 0);
+ while (i != OCFS_INVALID_NODE_NUM) {
+ if (i != osb->node_num) {
+ ocfs_node_map_set_bit(osb, &w->n_node_map, i);
- ocfs_process_one_vote_reply(osb, &ctxt, dlm_msg->src_node);
+ remote_node = nm_get_node_by_num(i);
+ if (!remote_node) {
+ status = -EINVAL;
+ goto bail;
+ }
- if (ocfs_node_map_is_equal(&obj->got_vote_map, &obj->req_vote_map))
- obj->vote_state = VOTE_OBJ_STATE_FULL_REPLY;
- else
- obj->vote_state = VOTE_OBJ_STATE_PARTIAL_REPLY;
-
-unlock:
- // wake if complete or error
- if (obj->vote_status < 0 || status < 0 ||
- obj->vote_state == VOTE_OBJ_STATE_FULL_REPLY) {
- atomic_set (&obj->voted_event_woken, 1);
- wake_up (&obj->voted_event);
+ remote_err = 0;
+ status = net_send_message(OCFS2_MESSAGE_TYPE_VOTE,
+ osb->net_key,
+ request,
+ sizeof(*request),
+ remote_node,
+ &remote_err);
+ iput(remote_node);
+ if (status == -ETIMEDOUT) {
+ printk("ocfs2: remote node %d timed out!\n",
+ i);
+ status = -EAGAIN;
+ goto bail;
+ }
+ if (remote_err < 0) {
+ status = remote_err;
+ printk("ocfs2: remote error %d on node %d!\n",
+ remote_err, i);
+ goto bail;
+ }
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+ }
+ i = ocfs_node_map_iterate(osb, &osb->mounted_map, i);
}
- spin_unlock(&obj->lock);
- ocfs_put_vote_obj (obj);
-bail:
- up (&(osb->comm_lock));
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_comm_process_vote_reply */
+ status = ocfs2_wait_on_vote_responses(osb, w);
+ if (status < 0) {
+ if (status != -EINTR)
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
-/*
- * ocfs_dlm_recv_msg()
- *
- */
-void ocfs_dlm_recv_msg (void *val)
-{
- ocfs_recv_ctxt *recv_ctxt;
- __u8 *dlm_packet;
-
- LOG_ENTRY ();
-
- recv_ctxt = (ocfs_recv_ctxt *) val;
- dlm_packet = (__u8 *) recv_ctxt->msg;
-
- if (recv_ctxt->status >= 0) {
- if (ocfs_check_ipc_msg (dlm_packet, recv_ctxt->msg_len))
- ocfs_comm_process_msg (dlm_packet);
+ ocfs2_dequeue_net_wait_ctxt(osb, w);
+ status = w->n_response;
+bail:
+ if (w) {
+ ocfs2_dequeue_net_wait_ctxt(osb, w);
+ kfree(w);
}
- kfree(recv_ctxt);
+ return status;
+}
- LOG_EXIT ();
- return;
-} /* ocfs_dlm_recv_msg */
-
-/*
- * ocfs_comm_process_msg()
- *
- */
-int ocfs_comm_process_msg (__u8 * msg)
+static int ocfs2_do_request_vote(ocfs_super *osb,
+ u64 blkno,
+ unsigned int generation,
+ enum ocfs2_vote_request type)
{
- int status = 0;
- ocfs_super *osb = NULL;
- ocfs_dlm_msg *dlm_msg;
- ocfs_dlm_req_master *req_master;
- struct list_head *iter_osb, *temp_iter;
- __s16 src_node;
+ int status;
+ unsigned int response_id;
+ ocfs2_vote_msg *request = NULL;
+ ocfs2_msg_hdr *hdr;
- LOG_ENTRY ();
+ OCFS_ASSERT(type == OCFS2_VOTE_REQ_DELETE ||
+ type == OCFS2_VOTE_REQ_UNLINK ||
+ type == OCFS2_VOTE_REQ_RENAME);
- dlm_msg = (ocfs_dlm_msg *) msg;
-
- down (&(OcfsGlobalCtxt.global_res));
- list_for_each_safe (iter_osb, temp_iter, &(OcfsGlobalCtxt.osb_next)) {
- osb = list_entry (iter_osb, ocfs_super, osb_next);
- if (!memcmp (osb->uuid, dlm_msg->vol_id,
- MAX_VOL_ID_LENGTH))
- break;
- osb = NULL;
- }
- up (&(OcfsGlobalCtxt.global_res));
-
- if (osb == NULL) {
- LOG_ERROR_STR("Ignoring netdlm message with invalid volume id");
+ request = kmalloc(sizeof(*request), GFP_KERNEL);
+ if (!request) {
+ status = -ENOMEM;
+ LOG_ERROR_STATUS(status);
goto bail;
}
+ memset(request, 0, sizeof(*request));
+ hdr = &request->v_hdr;
- if (dlm_msg->src_node >= osb->max_nodes) {
- LOG_ERROR_ARGS ("Invalid source node in netdlm message: %d\n",
- dlm_msg->src_node);
- goto bail;
- }
+ response_id = ocfs2_new_response_id(osb);
- if (!ocfs_node_map_test_bit(&osb->publ_map, dlm_msg->src_node)) {
- LOG_TRACE_STR("Ignoring netdlm message from dead node");
+ hdr->h_response_id = htonl(response_id);
+ hdr->h_request = htonl(type);
+ hdr->h_blkno = cpu_to_be64(blkno);
+ hdr->h_generation = htonl(generation);
+ hdr->h_node_num = htonl((unsigned int) osb->node_num);
+
+ status = ocfs2_broadcast_vote(osb, request, response_id);
+ if (status < 0) {
+ if (status != -EINTR)
+ LOG_ERROR_STATUS(status);
goto bail;
}
- switch (dlm_msg->msg_type) {
- case OCFS_VOTE_REQUEST:
- status = ocfs_process_vote (osb, dlm_msg);
- break;
-
- case OCFS_VOTE_REPLY:
- ocfs_comm_process_vote_reply (osb, dlm_msg);
- break;
-
- case OCFS_INFO_DISMOUNT:
- src_node = dlm_msg->src_node;
- req_master = (ocfs_dlm_req_master *) dlm_msg->msg_buf;
- printk ("ocfs2: Received dismount message for device (%u,%u) "
- "from %s (node %d)\n", MAJOR(osb->sb->s_dev),
- MINOR(osb->sb->s_dev), osb->node_cfg_info[src_node]->node_name,
- src_node);
- atomic_set (&(osb->vol_node_map[src_node].dismount), 1);
- break;
-
- default:
- break;
- }
-
bail:
- LOG_EXIT_STATUS (status);
+ if (request)
+ kfree(request);
+
return status;
-} /* ocfs_comm_process_msg */
+}
-
-
-/*
- * ocfs_send_dismount_msg()
- *
- */
-int ocfs_send_dismount_msg (ocfs_super * osb)
+static int ocfs2_request_vote(struct inode *inode,
+ enum ocfs2_vote_request type)
{
- int status = 0;
- ocfs_dlm_msg *dlm_msg = NULL;
- ocfs_dlm_msg_hdr *req;
- ocfs_vote_obj *obj;
- __u32 msg_len, obj_len;
- ocfs_node_map map;
+ int status;
+ ocfs_super *osb = OCFS2_SB(inode->i_sb);
- LOG_ENTRY_ARGS ("(osb=0x%p)\n", osb);
+ if (ocfs_inode_is_new(inode))
+ return 0;
- ocfs_node_map_dup(osb, &map, &osb->publ_map);
- ocfs_node_map_clear_bit(&map, osb->node_num);
+ status = -EAGAIN;
+ while (status == -EAGAIN) {
+ if (signal_pending(current))
+ return -EINTR;
- msg_len = sizeof (ocfs_dlm_msg) + sizeof (ocfs_dlm_req_master);
- obj_len = sizeof (ocfs_vote_obj) + sizeof (ocfs_dlm_req_master);
+ status = ocfs2_super_lock(osb, 0);
+ if (status < 0) {
+ if (status != -EINTR)
+ LOG_ERROR_STATUS(status);
+ break;
+ }
- obj = ocfs_alloc_vote_obj (osb, obj_len, 0, NULL);
- if (obj == NULL) {
- LOG_ERROR_STATUS (status = -ENOMEM);
- goto finally;
- }
- dlm_msg = &(obj->m);
- req = (ocfs_dlm_msg_hdr *) dlm_msg->msg_buf;
- ocfs_init_dlm_msg (osb, dlm_msg, msg_len, OCFS_INFO_DISMOUNT);
- req->lock_id = 0;
- req->flags = 0;
- req->lock_seq_num = 0;
+ status = 0;
+ if (!ocfs_node_map_is_only(osb, &osb->mounted_map,
+ osb->node_num))
+ status = ocfs2_do_request_vote(osb,
+ OCFS_I(inode)->ip_blkno,
+ inode->i_generation,
+ type);
- spin_lock(&osb->vote_obj_queue_lock);
- list_add_tail(&obj->list, &osb->vote_obj_queue);
- spin_unlock(&osb->vote_obj_queue_lock);
-
- ocfs_send_bcast (osb, &map, dlm_msg);
- spin_lock (&obj->lock);
- obj->vote_state = VOTE_OBJ_STATE_SENT;
- spin_unlock (&obj->lock);
-
- // silly ;-)
- spin_lock (&obj->lock);
- obj->vote_state = VOTE_OBJ_STATE_DESTROYING;
- spin_unlock (&obj->lock);
-
- spin_lock(&osb->vote_obj_queue_lock);
- list_del(&obj->list);
- spin_unlock(&osb->vote_obj_queue_lock);
-
-finally:
- ocfs_put_vote_obj (obj);
- LOG_EXIT_STATUS (status);
+ ocfs2_super_unlock(osb, 0);
+ }
return status;
-} /* ocfs_send_dismount_msg */
-
-/*
- * ocfs_init_dlm_msg()
- *
- */
-static void ocfs_init_dlm_msg (ocfs_super * osb, ocfs_dlm_msg * dlm_msg, __u32 msg_len, __u32 type)
+}
+
+int ocfs2_request_delete_vote(struct inode *inode)
{
- LOG_ENTRY ();
+ return ocfs2_request_vote(inode, OCFS2_VOTE_REQ_DELETE);
+}
- dlm_msg->magic = OCFS_DLM_MSG_MAGIC;
- dlm_msg->msg_len = msg_len;
- dlm_msg->src_node = osb->node_num;
- dlm_msg->msg_type = type;
- memcpy (dlm_msg->vol_id, osb->uuid, MAX_VOL_ID_LENGTH);
-
- LOG_EXIT ();
- return;
-} /* ocfs_init_dlm_msg */
-
-
-static ocfs_vote_obj * ocfs_alloc_vote_obj (ocfs_super *osb, int bytes, __u32 reqlock, ocfs_node_map *votemap)
+int ocfs2_request_unlink_vote(struct inode *inode)
{
- ocfs_vote_obj *obj = NULL;
+ return ocfs2_request_vote(inode, OCFS2_VOTE_REQ_UNLINK);
+}
- obj = ocfs_malloc (bytes);
- if (obj == NULL)
- return NULL;
-
- memset(obj, 0, bytes);
- obj->vote_state = VOTE_OBJ_STATE_UNSENT;
- spin_lock_init (&obj->lock);
- atomic_set(&obj->refcount, 1);
- atomic_set(&obj->voted_event_woken, 0);
- init_waitqueue_head (&obj->voted_event);
- INIT_LIST_HEAD (&obj->list);
-
- if (votemap)
- ocfs_node_map_dup(osb, &obj->req_vote_map, votemap);
- else
- ocfs_node_map_init(osb, &obj->req_vote_map);
- ocfs_node_map_init(osb, &obj->got_vote_map);
-
- obj->seq_num = 0ULL;
- obj->req_lock_type = reqlock;
- obj->vote_status = 0;
- obj->pid = current->pid;
-
- return obj;
+int ocfs2_request_rename_vote(struct inode *inode)
+{
+ return ocfs2_request_vote(inode, OCFS2_VOTE_REQ_RENAME);
}
-#define OCFS_DLM_NET_TIMEOUT (30000) // 30 seconds
-
-/*
- * ocfs_send_dlm_request_msg()
- * inode is definitely non NULL
- */
-int ocfs_send_dlm_request_msg (ocfs_super * osb, __u64 lock_id, __u32 lock_type, __u32 flags, ocfs_node_map *votemap, struct inode *inode, __u32 num_ident, int *vote_status)
+int ocfs2_request_mount_vote(ocfs_super *osb)
{
- int status = 0;
- ocfs_dlm_msg *dlm_msg = NULL;
- ocfs_dlm_msg_hdr *req;
- ocfs_vote_obj *obj;
- __u32 msg_len, obj_len;
+ int status;
- LOG_ENTRY_ARGS ("(osb=0x%p, id=%llu, ty=%u, fl=%u)\n",
- osb, lock_id, lock_type, flags);
+ status = -EAGAIN;
+ while (status == -EAGAIN) {
+ if (signal_pending(current))
+ return -EINTR;
- msg_len = sizeof (ocfs_dlm_msg) + sizeof (ocfs_dlm_req_master);
- obj_len = sizeof (ocfs_vote_obj) + sizeof (ocfs_dlm_req_master);
+ if (ocfs_node_map_is_only(osb, &osb->mounted_map,
+ osb->node_num))
+ return 0;
- obj = ocfs_alloc_vote_obj (osb, obj_len, lock_type, votemap);
- if (obj == NULL) {
- LOG_ERROR_STATUS (status = -ENOMEM);
- goto finally;
+ status = ocfs2_do_request_vote(osb, 0ULL, 0,
+ OCFS2_VOTE_REQ_MOUNT);
}
- dlm_msg = &(obj->m);
- req = (ocfs_dlm_msg_hdr *) dlm_msg->msg_buf;
- ocfs_init_dlm_msg (osb, dlm_msg, msg_len, OCFS_VOTE_REQUEST);
-
- spin_lock (&OcfsGlobalCtxt.comm_seq_lock);
- req->lock_seq_num = ++OcfsGlobalCtxt.comm_seq_num;
- obj->seq_num = req->lock_seq_num;
- spin_unlock (&OcfsGlobalCtxt.comm_seq_lock);
-
- req->lock_id = lock_id;
- req->flags = flags;
- req->num_ident = num_ident;
-
-#ifdef VERBOSE_LOCKING_TRACE
- printk("ocfs_send_dlm_request_msg: inode=%p, lockid = %llu\n",
- inode, lock_id);
-#endif
-
- spin_lock(&osb->vote_obj_queue_lock);
- list_add_tail(&obj->list, &osb->vote_obj_queue);
- spin_unlock(&osb->vote_obj_queue_lock);
-
- ocfs_send_bcast (osb, votemap, dlm_msg);
- spin_lock (&obj->lock);
- obj->vote_state = VOTE_OBJ_STATE_SENT;
- spin_unlock (&obj->lock);
- status = ocfs_wait_uninterruptible(obj->voted_event,
- atomic_read (&obj->voted_event_woken),
- OCFS_DLM_NET_TIMEOUT);
-
- spin_lock (&obj->lock);
- if (obj->vote_status >= 0 && obj->vote_state == VOTE_OBJ_STATE_FULL_REPLY) {
- LOG_TRACE_ARGS ("OK vote, lockid=%llu\n", lock_id);
- } else if ((obj->vote_status != -EAGAIN && obj->vote_status != -EBUSY) || obj->vote_state != VOTE_OBJ_STATE_FULL_REPLY) {
-#warning "should we even be erroring here at all!"
- LOG_ERROR_ARGS("inode %llu, vote_status=%d, vote_state=%d, "
- "lockid=%llu, flags = 0x%x, asked type = %u "
- "master = %d, state = 0x%lx, type = %u\n",
- OCFS_I(inode)->ip_blkno, obj->vote_status,
- obj->vote_state, lock_id, flags, lock_type,
- GET_INODE_LOCKRES(inode)->master_node_num,
- GET_INODE_LOCKRES(inode)->readonly_state,
- GET_INODE_LOCKRES(inode)->lock_type);
- }
- *vote_status = obj->vote_status;
- obj->vote_state = VOTE_OBJ_STATE_DESTROYING;
- ocfs_node_map_clear_bits(votemap, &obj->got_vote_map);
- spin_unlock (&obj->lock);
-
- spin_lock(&osb->vote_obj_queue_lock);
- list_del(&obj->list);
- spin_unlock(&osb->vote_obj_queue_lock);
-
-
- ocfs_compute_dlm_stats (status, *vote_status,
- &(OcfsGlobalCtxt.net_reqst_stats));
-
- ocfs_compute_dlm_stats (status, *vote_status,
- &(osb->net_reqst_stats));
-finally:
- ocfs_put_vote_obj (obj);
- LOG_EXIT_STATUS (status);
return status;
-} /* ocfs_send_dlm_request_msg */
+}
-
-void ocfs_process_one_vote_reply(ocfs_super *osb, ocfs_vote_reply_ctxt *ctxt, __u32 node_num)
+int ocfs2_request_umount_vote(ocfs_super *osb)
{
int status;
- int reply_status;
- reply_status = ctxt->reply->status;
+ status = -EAGAIN;
+ while (status == -EAGAIN) {
+ if (signal_pending(current))
+ return -EINTR;
- status = 0;
+ if (ocfs_node_map_is_only(osb, &osb->mounted_map,
+ osb->node_num))
+ return 0;
- switch (reply_status) {
- case FLAG_VOTE_NODE:
- ocfs_node_map_set_bit(ctxt->got_vote_map, node_num);
- break;
- case FLAG_VOTE_OIN_ALREADY_INUSE:
- ocfs_node_map_set_bit(ctxt->got_vote_map, node_num);
- status = -EINVAL;
- if (ctxt->flags & FLAG_FILE_DELETE)
- status = -EBUSY;
- break;
- case FLAG_VOTE_OIN_UPDATED:
- status = 0;
- ocfs_node_map_set_bit(ctxt->got_vote_map, node_num);
- break;
- case FLAG_VOTE_UPDATE_RETRY:
- ocfs_node_map_set_bit(ctxt->got_vote_map, node_num);
- status = -EAGAIN;
- break;
- case FLAG_VOTE_FILE_DEL:
-#warning "don't we need to set the node map bit here?"
- status = -ENOENT;
- break;
+ status = ocfs2_do_request_vote(osb, 0ULL, 0,
+ OCFS2_VOTE_REQ_UMOUNT);
}
- *(ctxt->status) = status;
+ return status;
}
-/* special case -1 for now
- * TODO: should *really* make sure the calling func never passes -1!! */
-void ocfs_node_map_init(ocfs_super *osb, ocfs_node_map *map)
+/* TODO: This should eventually be a hash table! */
+static ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(ocfs_super *osb,
+ u32 response_id)
{
- map->num_nodes = osb->max_nodes;
- memset(map->map, 0, BITS_TO_LONGS(OCFS_NODE_MAP_MAX_NODES) *
- sizeof(unsigned long));
-}
+ struct list_head *p;
+ ocfs2_net_wait_ctxt *w = NULL;
-void ocfs_node_map_set_bit(ocfs_node_map *map, int bit)
-{
- if (bit==-1)
- return;
- OCFS_ASSERT(bit < map->num_nodes);
- set_bit(bit, map->map);
-}
-
-void ocfs_node_map_clear_bit(ocfs_node_map *map, int bit)
-{
- if (bit==-1)
- return;
- OCFS_ASSERT(bit < map->num_nodes);
- clear_bit(bit, map->map);
-}
-
-// clear all the bits in "target" which are set in "mask"
-void ocfs_node_map_clear_bits(ocfs_node_map *target, ocfs_node_map *mask)
-{
- int bit, prev=0;
- while (1) {
- bit = find_next_bit (mask->map, mask->num_nodes, prev);
- if (bit >= mask->num_nodes)
+ list_for_each(p, &osb->net_response_list) {
+ w = list_entry(p, ocfs2_net_wait_ctxt, n_list);
+ if (response_id == w->n_response_id)
break;
- ocfs_node_map_clear_bit(target, bit);
- prev = bit+1;
+ w = NULL;
}
-}
-// set all the bits in "target" which are set in "mask"
-void ocfs_node_map_set_bits(ocfs_node_map *target, ocfs_node_map *mask)
-{
- int bit, prev=0;
- while (1) {
- bit = find_next_bit (mask->map, mask->num_nodes, prev);
- if (bit >= mask->num_nodes)
- break;
- ocfs_node_map_set_bit(target, bit);
- prev = bit+1;
- }
+ return w;
}
-int ocfs_node_map_test_bit(ocfs_node_map *map, int bit)
+static int ocfs2_handle_response_message(net_msg *msg,
+ u32 len,
+ void *data)
{
- if (bit >= map->num_nodes) {
- LOG_ERROR_ARGS("bit=%d map->num_nodes=%d\n", bit, map->num_nodes);
- BUG();
- }
- return test_bit(bit, map->map);
-}
+ unsigned int response_id, node_num;
+ int response_status;
+ ocfs_super *osb = data;
+ ocfs2_response_msg *resp;
+ ocfs2_net_wait_ctxt * w;
-static int ocfs_node_map_stringify(ocfs_node_map *map, char **str)
-{
- int i, n;
- char *s;
+ resp = (ocfs2_response_msg *) msg->buf;
- OCFS_ASSERT(map->num_nodes > 0);
+ response_id = ntohl(resp->r_hdr.h_response_id);
+ node_num = ntohl(resp->r_hdr.h_node_num);
+ response_status = ntohl(resp->r_response);
- *str = kmalloc( strlen("123 ") * map->num_nodes, GFP_KERNEL);
- if (!(*str))
- return -ENOMEM;
+ printk("recieved response message:\n");
+ printk("h_response_id = %u\n", ntohl(response_id));
+ printk("h_request = %u\n", ntohl(resp->r_hdr.h_request));
+ printk("h_blkno = %llu\n", be64_to_cpu(resp->r_hdr.h_blkno));
+ printk("h_generation = %u\n", ntohl(resp->r_hdr.h_generation));
+ printk("h_node_num = %u\n", node_num);
+ printk("r_response = %d\n", response_status);
- memset(*str, 0, strlen("123 ") * map->num_nodes);
+ spin_lock(&osb->net_response_lock);
+ w = __ocfs2_find_net_wait_ctxt(osb, response_id);
+ if (!w) {
+ printk("request not found!\n");
+ goto bail;
+ }
- s = *str;
- for (i=0; i<map->num_nodes; i++) {
- if (ocfs_node_map_test_bit(map, i)) {
- n = sprintf(s, "%3d ", i);
- if (n != strlen("123 ")) {
- kfree(*str);
- return -ENOMEM;
- }
- s += n;
- }
+ if (response_status && (!w->n_response)) {
+ /* we only really need one negative response so don't
+ * set it twice. */
+ w->n_response = response_status;
}
+
+ ocfs_node_map_clear_bit(osb, &w->n_node_map, node_num);
+ if (ocfs_node_map_is_empty(osb, &w->n_node_map))
+ wake_up(&w->n_event);
+bail:
+ spin_unlock(&osb->net_response_lock);
+
return 0;
}
-int ocfs_node_map_is_empty(ocfs_node_map *map)
+static int ocfs2_handle_vote_message(net_msg *msg,
+ u32 len,
+ void *data)
{
- int bit;
- OCFS_ASSERT(map->num_nodes > 0);
- bit = find_next_bit(map->map, map->num_nodes, 0);
- if (bit < map->num_nodes)
- return 0;
- return 1;
-}
+ int status;
+ ocfs_super *osb = data;
+ ocfs2_vote_work *work;
-int ocfs_node_map_is_equal(ocfs_node_map *map1, ocfs_node_map *map2)
-{
- int num_longs, i;
-
- OCFS_ASSERT(map1->num_nodes == map2->num_nodes);
- OCFS_ASSERT(map1->num_nodes > 0);
-
- num_longs = BITS_TO_LONGS(map1->num_nodes);
- for (i=0; i<num_longs; i++) {
- if (map1->map[i] != map2->map[i])
- return 0;
+ work = kmalloc(sizeof(ocfs2_vote_work), GFP_KERNEL);
+ if (!work) {
+ status = -ENOMEM;
+ LOG_ERROR_STATUS(status);
+ goto bail;
}
- return 1;
-}
+ INIT_LIST_HEAD(&work->w_list);
+ memcpy(&work->w_msg, msg->buf, sizeof(ocfs2_vote_msg));
-void ocfs_node_map_and(ocfs_node_map *target, ocfs_node_map *mask)
-{
- int num_longs, i;
+ printk("scheduling vote request:\n");
+ printk("h_response_id = %u\n", work->w_msg.v_hdr.h_response_id);
+ printk("h_request = %u\n", work->w_msg.v_hdr.h_request);
+ printk("h_blkno = %llu\n", work->w_msg.v_hdr.h_blkno);
+ printk("h_generation = %u\n", work->w_msg.v_hdr.h_generation);
+ printk("h_node_num = %u\n", work->w_msg.v_hdr.h_node_num);
- OCFS_ASSERT(target->num_nodes == mask->num_nodes);
- OCFS_ASSERT(target->num_nodes > 0);
-
- num_longs = BITS_TO_LONGS(target->num_nodes);
- for (i=0; i<num_longs; i++)
- target->map[i] &= mask->map[i];
-}
+ spin_lock(&osb->vote_task_lock);
+ list_add_tail(&work->w_list, &osb->vote_list);
+ osb->vote_count++;
+ spin_unlock(&osb->vote_task_lock);
-void ocfs_node_map_set(ocfs_node_map *target, ocfs_node_map *from)
-{
- int num_longs, i;
+ ocfs2_kick_vote_thread(osb);
- OCFS_ASSERT(target->num_nodes == from->num_nodes);
- OCFS_ASSERT(target->num_nodes > 0);
-
- num_longs = BITS_TO_LONGS(target->num_nodes);
- for (i=0; i<num_longs; i++)
- target->map[i] = from->map[i];
+ status = 0;
+bail:
+ return status;
}
-
-void ocfs_node_map_dup(ocfs_super *osb, ocfs_node_map *target, ocfs_node_map *from)
+int ocfs2_register_net_handlers(ocfs_super *osb)
{
- OCFS_ASSERT(from->num_nodes > 0);
- ocfs_node_map_init(osb, target);
- ocfs_node_map_set(target, from);
-}
+ int status;
+ int i = MAX_VOL_ID_LENGTH - sizeof(osb->net_key);
-/* returns 1 if bit is the only bit set in target, 0 otherwise */
-int ocfs_node_map_is_only(ocfs_super *osb, ocfs_node_map *target, int bit)
-{
- ocfs_node_map temp;
- int ret;
+ memcpy(&osb->net_key, &osb->uuid[i], sizeof(osb->net_key));
+ osb->net_response_buf = osb->net_vote_buf = NULL;
+ osb->net_response_ids = 0;
+ spin_lock_init(&osb->net_response_lock);
+ INIT_LIST_HEAD(&osb->net_response_list);
- ocfs_node_map_dup(osb, &temp, target);
- ocfs_node_map_clear_bit(&temp, bit);
- ret = ocfs_node_map_is_empty(&temp);
- return ret;
-}
+ osb->net_response_buf = kmalloc(sizeof(ocfs2_response_msg),
+ GFP_KERNEL);
+ if (!osb->net_response_buf) {
+ status = -ENOMEM;
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
-/*
-** All structures have a type, and a size associated with it.
-** The type serves to identify the structure. The size is used for
-** consistency checking ...
-*/
-void ocfs_publish_map_set(ocfs_node_map *pubmap, int num)
-{
- ocfs_node_map_set_bit(pubmap, num);
-}
+ osb->net_vote_buf = kmalloc(sizeof(ocfs2_vote_msg),
+ GFP_KERNEL);
+ if (!osb->net_vote_buf) {
+ status = -ENOMEM;
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
-void ocfs_publish_map_clear(ocfs_node_map *pubmap, int num)
-{
- ocfs_node_map_clear_bit(pubmap, num);
-}
+ status = net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE,
+ osb->net_key,
+ 0,
+ sizeof(ocfs2_response_msg),
+ ocfs2_handle_response_message,
+ osb,
+ osb->net_response_buf);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
-/* update the recovery map here */
-void ocfs_recovery_map_set(ocfs_super *osb, int num)
-{
- spin_lock(&osb->recovery_map_lock);
- ocfs_node_map_set_bit(&osb->recovery_map, num);
- spin_unlock(&osb->recovery_map_lock);
+ status = net_register_handler(OCFS2_MESSAGE_TYPE_VOTE,
+ osb->net_key,
+ 0,
+ sizeof(ocfs2_vote_msg),
+ ocfs2_handle_vote_message,
+ osb,
+ osb->net_vote_buf);
+ if (status < 0) {
+ /* TODO: net_unregister here! */
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+bail:
+ if (status < 0) {
+ if (osb->net_response_buf)
+ kfree(osb->net_response_buf);
+ if (osb->net_vote_buf)
+ kfree(osb->net_vote_buf);
+ osb->net_response_buf = osb->net_vote_buf = NULL;
+ /* 0 indicates we never registered anything */
+ osb->net_key = 0;
+ }
+ return status;
}
-void ocfs_recovery_map_clear(ocfs_super *osb, int num)
+void ocfs2_unregister_net_handlers(ocfs_super *osb)
{
- spin_lock(&osb->recovery_map_lock);
- ocfs_node_map_clear_bit(&osb->recovery_map, num);
- spin_unlock(&osb->recovery_map_lock);
-}
+ if (!osb->net_key)
+ return;
-int ocfs_node_is_recovering(ocfs_super *osb, int num)
-{
- if (num == -1)
- return 0;
- return ocfs_node_map_test_bit(&osb->recovery_map, num);
-}
+ /* TODO: net_unregister here! */
+ /* TODO: net_unregister here! */
-int ocfs_node_is_alive(ocfs_node_map *pubmap, int index)
-{
- if (index == -1)
- return 0;
- return ocfs_node_map_test_bit(pubmap, index);
-}
+ if (!list_empty(&osb->net_response_list))
+ printk("ocfs2: net response list not empty!\n");
+ kfree(osb->net_response_buf);
+ kfree(osb->net_vote_buf);
+}
Modified: trunk/src/vote.h
===================================================================
--- trunk/src/vote.h 2004-12-04 02:54:01 UTC (rev 1692)
+++ trunk/src/vote.h 2004-12-06 21:45:32 UTC (rev 1693)
@@ -3,7 +3,7 @@
*
* vote.h
*
- * Function prototypes
+ * description here
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
@@ -23,53 +23,23 @@
* Boston, MA 021110-1307, USA.
*/
-#ifndef OCFS2_VOTE_H
-#define OCFS2_VOTE_H
-int ocfs_init_udp_sock(struct socket **send_sock,
- struct socket **recv_sock);
-int ocfs_lookup_obj_for_proc(ocfs_vote_obj *obj,
- ocfs_vote_obj_lookup_data *data);
-int ocfs_lookup_obj_by_lockid(ocfs_vote_obj *obj,
- ocfs_vote_obj_lookup_data *data);
-int ocfs_lookup_vote_request_obj(ocfs_super *osb,
- ocfs_vote_obj_lookup_data *data);
-void ocfs_process_one_vote_reply(ocfs_super *osb,
- ocfs_vote_reply_ctxt *ctxt,
- __u32 node_num);
-int ocfs_recv_udp_msg(ocfs_recv_ctxt *recv_ctxt);
-int ocfs_send_dismount_msg(ocfs_super *osb);
-int ocfs_send_dlm_request_msg (ocfs_super * osb, __u64 lock_id,
- __u32 lock_type, __u32 flags,
- ocfs_node_map *votemap,
- struct inode *inode, __u32 num_ident,
- int *vote_status);
-int ocfs_send_vote_reply(ocfs_super *osb, ocfs_dlm_msg *dlm_msg,
- __u32 vote_status);
-int ocfs_lookup_vote_request_obj (ocfs_super *osb,
- ocfs_vote_obj_lookup_data *data);
+#ifndef VOTE_H
+#define VOTE_H
-void ocfs_node_map_init(ocfs_super *osb, ocfs_node_map *map);
-void ocfs_node_map_set_bit(ocfs_node_map *map, int bit);
-void ocfs_node_map_clear_bit(ocfs_node_map *map, int bit);
-// clear all the bits in "target" which are set in "mask"
-void ocfs_node_map_clear_bits(ocfs_node_map *target, ocfs_node_map *mask);
-// set all the bits in "target" which are set in "mask"
-void ocfs_node_map_set_bits(ocfs_node_map *target, ocfs_node_map *mask);
-int ocfs_node_map_test_bit(ocfs_node_map *map, int bit);
-int ocfs_node_map_is_empty(ocfs_node_map *map);
-int ocfs_node_map_is_equal(ocfs_node_map *map1, ocfs_node_map *map2);
-void ocfs_node_map_and(ocfs_node_map *target, ocfs_node_map *mask);
-void ocfs_node_map_set(ocfs_node_map *target, ocfs_node_map *from);
-void ocfs_node_map_dup(ocfs_super *osb, ocfs_node_map *target, ocfs_node_map *from);
-/* returns 1 if bit is the only bit set in target, 0 otherwise */
-int ocfs_node_map_is_only(ocfs_super *osb, ocfs_node_map *target, int bit);
+int ocfs2_vote_thread(void *arg);
+static inline void ocfs2_kick_vote_thread(ocfs_super *osb)
+{
+ atomic_set(&osb->wake_vote_task, 1);
+ wake_up(&osb->vote_event);
+}
-int ocfs_node_is_recovering(ocfs_super *osb, int num);
-int ocfs_node_is_alive(ocfs_node_map *pubmap, int index);
-void ocfs_publish_map_set(ocfs_node_map *pubmap, int num);
-void ocfs_publish_map_clear(ocfs_node_map *pubmap, int num);
-void ocfs_recovery_map_set(ocfs_super *osb, int num);
-void ocfs_recovery_map_clear(ocfs_super *osb, int num);
+int ocfs2_request_delete_vote(struct inode *inode);
+int ocfs2_request_unlink_vote(struct inode *inode);
+int ocfs2_request_rename_vote(struct inode *inode);
+int ocfs2_request_mount_vote(ocfs_super *osb);
+int ocfs2_request_umount_vote(ocfs_super *osb);
+int ocfs2_register_net_handlers(ocfs_super *osb);
+void ocfs2_unregister_net_handlers(ocfs_super *osb);
-#endif /* OCFS2_VOTE_H */
+#endif
More information about the Ocfs2-commits
mailing list