[Ocfs2-commits] rev 6 - in trunk: . inc
svn-commits at oss.oracle.com
svn-commits at oss.oracle.com
Thu Dec 4 17:52:50 CST 2003
Author: manish
Date: 2003-12-04 17:52:47 -0600 (Thu, 04 Dec 2003)
New Revision: 6
Modified:
trunk/Config.make
trunk/Makefile
trunk/TODO
trunk/alloc.c
trunk/dir.c
trunk/dlm.c
trunk/file.c
trunk/heartbeat.c
trunk/inc/journal.h
trunk/inc/ocfs.h
trunk/inc/ocfsio.h
trunk/inc/proto.h
trunk/inode.c
trunk/journal.c
trunk/nm.c
trunk/osb.c
trunk/super.c
Log:
sync
Modified: trunk/Config.make
===================================================================
--- trunk/Config.make 2003-11-21 23:31:21 UTC (rev 5)
+++ trunk/Config.make 2003-12-04 23:52:47 UTC (rev 6)
@@ -63,3 +63,6 @@
OCFS_MEMDEBUG =
OCFS_TRACE = yes
OCFS_PROCESSOR = i686
+#This should be defined for all 2.4 kernels <= 2.4.21, except RHEL3 and
+#RHAS 2.1 (update 3 and later), which do not exhibit the journal_create bug.
+#USE_JOURNAL_CREATE_REPLACEMENT = yes
Modified: trunk/Makefile
===================================================================
--- trunk/Makefile 2003-11-21 23:31:21 UTC (rev 5)
+++ trunk/Makefile 2003-12-04 23:52:47 UTC (rev 6)
@@ -147,6 +147,10 @@
DEFINES += -DHAVE_NPTL
endif
+ifdef USE_JOURNAL_CREATE_REPLACEMENT
+DEFINES += -DUSE_JOURNAL_CREATE_REPLACEMENT
+endif
+
DEFINES += -DDEBUG_LOCK_BUFFER
ifeq ($(KVER),vmware)
Modified: trunk/TODO
===================================================================
--- trunk/TODO 2003-11-21 23:31:21 UTC (rev 5)
+++ trunk/TODO 2003-12-04 23:52:47 UTC (rev 6)
@@ -1,9 +1,5 @@
-* We need to recover previously dead nodes during our startup.
+* fix dlm issue with stale locks
-* Fix all places in locking where we blindly take a lock on timeout without
- checking whether a node is in recovery. Hearbeat thread needs to monitor
- recovery processes.
-
* fsck must be able to replay the journal
* merge and turn on the data alloc file
Modified: trunk/alloc.c
===================================================================
--- trunk/alloc.c 2003-11-21 23:31:21 UTC (rev 5)
+++ trunk/alloc.c 2003-12-04 23:52:47 UTC (rev 6)
@@ -235,19 +235,9 @@
}
}
- /* Get all locks */
+ /* Get all the locks we need. do global bitmap last to
+ * preserve lock ordering with extend/create */
- if (free_vol_bits != NULL) {
- status = ocfs_acquire_lock (osb, OCFS_BITMAP_LOCK_OFFSET,
- OCFS_DLM_EXCLUSIVE_LOCK,
- FLAG_FILE_CREATE, &vol_lockres, &globalbh, NULL);
- if (status < 0) {
- if (status != -EINTR)
- LOG_ERROR_STATUS (status);
- goto finally;
- }
- }
-
lock_id = (OCFS_FILE_DIR_ALLOC_BITMAP * osb->sect_size) +
osb->vol_layout.root_int_off;
for (i = 0; i < OCFS_MAXIMUM_NODES; i++, lock_id += osb->sect_size) {
@@ -280,6 +270,18 @@
}
}
+ if (free_vol_bits != NULL) {
+ status = ocfs_acquire_lock (osb, OCFS_BITMAP_LOCK_OFFSET,
+ OCFS_DLM_EXCLUSIVE_LOCK,
+ FLAG_FILE_CREATE, &vol_lockres, &globalbh, NULL);
+ if (status < 0) {
+ if (status != -EINTR)
+ LOG_ERROR_STATUS (status);
+ goto finally;
+ }
+ }
+
+
/* free vol block */
if (free_vol_bits != NULL)
ocfs_free_vol_block (osb, free_vol_bits, -1, DISK_ALLOC_VOLUME);
Modified: trunk/dir.c
===================================================================
--- trunk/dir.c 2003-11-21 23:31:21 UTC (rev 5)
+++ trunk/dir.c 2003-12-04 23:52:47 UTC (rev 6)
@@ -388,11 +388,21 @@
ocfs_file_entry *fe;
int status;
bool bRet = false;
- ocfs_dir_node * DirNode = NULL;
+ ocfs_dir_node * DirNode = NULL, *tmp;
LOG_ENTRY ();
- DirNode = (ocfs_dir_node *)OCFS_BH_GET_DATA(bhs[0]);
+ DirNode = (ocfs_dir_node *) ocfs_malloc(512);
+ if (DirNode == NULL) {
+ LOG_ERROR_STR("Out of memory");
+ bRet = false;
+ goto bail;
+ }
+
+ tmp = (ocfs_dir_node *)OCFS_BH_GET_DATA(bhs[0]);
+ memcpy(DirNode, tmp, 512);
+ OCFS_BH_PUT_DATA(bhs[0]);
+
if (OFile != NULL)
start = OFile->curr_byte_off;
else
@@ -410,9 +420,7 @@
bool found = true;
if (SearchName != NULL) {
- OCFS_BH_PUT_DATA(bhs[0]);
found = ocfs_find_index (osb, bhs, SearchName, (int *) &i);
- DirNode = (ocfs_dir_node *)OCFS_BH_GET_DATA(bhs[0]);
}
if (found) {
@@ -446,8 +454,6 @@
__u64 next = DirNode->next_node_ptr;
/* dump what we've got */
- OCFS_BH_PUT_DATA(bhs[0]);
-
for (i=0; i<256; i++) {
if (bhs[i]) {
brelse(bhs[i]);
@@ -460,7 +466,11 @@
bRet = false;
goto bail;
}
- DirNode = (ocfs_dir_node *)OCFS_BH_GET_DATA(bhs[0]);
+
+ tmp = (ocfs_dir_node *)OCFS_BH_GET_DATA(bhs[0]);
+ memcpy(DirNode, tmp, 512);
+ OCFS_BH_PUT_DATA(bhs[0]);
+
if (!IS_VALID_DIR_NODE (DirNode)) {
bRet = false;
goto bail;
@@ -480,7 +490,7 @@
bail:
if (DirNode)
- OCFS_BH_PUT_DATA(bhs[0]);
+ ocfs_safefree(DirNode);
LOG_EXIT_ULONG (bRet);
return bRet;
@@ -737,6 +747,7 @@
/* Should status be updated here? */
if (DirNode->num_ent_used >= osb->max_dir_node_ent) {
+ OCFS_BH_PUT_DATA(bhs[0]);
status = -ENOSPC;
LOG_TRACE_STR("DirNode->num_ent_used >= osb->max_dir_node_ent");
goto bail;
@@ -1020,6 +1031,7 @@
bool dirtyall = false;
void *buf;
__u64 locknode_off, dir_next_node, new_head_del = 0;
+ __u64 locknode_head_del;
__u8 dir_num_ent_used;
LOG_ENTRY ();
@@ -1058,13 +1070,20 @@
/* route the new file entry to the proper dir_off */
LockNode = (ocfs_dir_node *)OCFS_BH_GET_DATA(lock_bh);
locknode_off = LockNode->node_disk_off;
-
- if (LockNode->head_del_ent_node != INVALID_NODE_POINTER)
- dir_off = LockNode->head_del_ent_node;
+ locknode_head_del = LockNode->head_del_ent_node;
+
+ LOG_TRACE_ARGS("ocfs_insert_file: head_del=%u.%u, free_node=%u.%u, locknode=%u.%u\n",
+ HILO(LockNode->head_del_ent_node), HILO(LockNode->free_node_ptr),
+ HILO(locknode_off));
+
+ if (locknode_head_del != INVALID_NODE_POINTER)
+ dir_off = locknode_head_del;
else if (LockNode->free_node_ptr != INVALID_NODE_POINTER)
dir_off = LockNode->free_node_ptr;
else
dir_off = locknode_off;
+
+ LOG_TRACE_ARGS("ocfs_insert_file: dir_off selected was %u.%u\n", HILO(dir_off));
parent_is_lock_node = (dir_off == locknode_off);
OCFS_BH_PUT_DATA(lock_bh);
@@ -1087,8 +1106,11 @@
OCFS_BH_PUT_DATA(bhs[0]);
DirNode = NULL;
- if (dir_off == locknode_off || dir_num_ent_used < osb->max_dir_node_ent)
+ if (dir_off != locknode_head_del || dir_num_ent_used < osb->max_dir_node_ent) {
+ LOG_TRACE_ARGS("ocfs_insert_file: going to got_dirnode, numentused=%d\n",
+ dir_num_ent_used);
goto got_dirnode;
+ }
/* we read the head_del_ent_node or the free_node_ptr, but */
/* hit the old BUG. there are no free slots at dir_off. */
@@ -1103,6 +1125,7 @@
/* start from locknode, travel along next_node_ptr */
dir_off = locknode_off;
while (1) {
+ LOG_TRACE_ARGS("ocfs_insert_file: now checking %u.%u\n", dir_off);
status = ocfs_read_bhs (osb, dir_off, osb->vol_layout.dir_node_size,
bhs, OCFS_BH_CACHED, dir_inode);
if (status < 0) {
@@ -1117,9 +1140,12 @@
DirNode = NULL;
if (dir_num_ent_used < osb->max_dir_node_ent) {
+ LOG_TRACE_ARGS("ocfs_insert_file: num_ent_used for %u.%u is good (%d)\n", dir_off,
+ dir_num_ent_used);
new_head_del = dir_off;
break;
}
+ LOG_TRACE_ARGS("ocfs_insert_file: next_node pointer for %u.%u is %u.%u\n", dir_off, dir_next_node);
dir_off = dir_next_node;
if (dir_off == INVALID_NODE_POINTER) {
new_head_del = INVALID_NODE_POINTER;
@@ -1164,6 +1190,7 @@
/* and insert in that. */
/* We should not find this entry already inserted */
if (dir_num_ent_used < osb->max_dir_node_ent) {
+ LOG_TRACE_ARGS("ocfs_insert_file: CASE 1\n");
status = ocfs_insert_dir_node (osb, bhs, InsertEntry, lock_bh,
&indexOffset, insert_bh, handle, dir_inode, file_inode);
if (status < 0) {
@@ -1178,6 +1205,7 @@
struct buffer_head **newbhs = NULL;
__u64 new_disk_off;
+ LOG_TRACE_ARGS("ocfs_insert_file: CASE 2\n");
newbhs = (struct buffer_head **)ocfs_malloc(length);
if (newbhs == NULL) {
LOG_ERROR_STATUS (status = -ENOMEM);
@@ -1187,6 +1215,7 @@
if (dir_next_node != INVALID_NODE_POINTER) {
/* already allocated a new block */
+ LOG_TRACE_ARGS("ocfs_insert_file: CASE 2A\n");
status = ocfs_read_bhs(osb, dir_next_node,
osb->vol_layout.dir_node_size,
newbhs, OCFS_BH_COND_CACHED,
@@ -1208,6 +1237,7 @@
} else {
/* Allocate a new dir node */
__u64 fileOffset = 0;
+ LOG_TRACE_ARGS("ocfs_insert_file: CASE 2B\n");
status = ocfs_alloc_node_block(osb,
osb->vol_layout.dir_node_size,
Modified: trunk/dlm.c
===================================================================
--- trunk/dlm.c 2003-11-21 23:31:21 UTC (rev 5)
+++ trunk/dlm.c 2003-12-04 23:52:47 UTC (rev 6)
@@ -33,6 +33,7 @@
/* Tracing */
#define OCFS_DEBUG_CONTEXT OCFS_DEBUG_CONTEXT_DLM
+static void ocfs_stupid_loop(ocfs_super *osb, bool do_other_stupid_things);
static int ocfs_insert_cache_link (ocfs_super * osb, ocfs_lock_res * lockres);
static int ocfs_update_lock_state (ocfs_super * osb, ocfs_lock_res * lockres, __u32 flags, bool *disk_vote, struct inode *inode);
static int ocfs_send_dlm_request_msg (ocfs_super * osb, __u64 lock_id, __u32 lock_type, __u32 flags, ocfs_lock_res * lockres, __u64 vote_map);
@@ -145,6 +146,50 @@
return status;
} /* ocfs_update_lock_state */
+static void ocfs_stupid_loop(ocfs_super *osb, bool do_other_stupid_things)
+{
+ int i = 0;
+
+ LOG_ENTRY_ARGS("do_other_stupid_things = %s\n", do_other_stupid_things ? "true" : "false");
+
+ if((osb->trans_in_progress) && (osb->needs_flush))
+ {
+ osb->trans_in_progress = false;
+ atomic_set (&osb->flush_event_woken, 1);
+ wake_up (&osb->flush_event);
+ while ((osb->needs_flush) && (i < 30))
+ {
+ if (do_other_stupid_things) {
+ ocfs_down_sem(&(osb->voting_lock), true);
+
+ if (osb->in_voting == SKIPPED_HEARTBEAT)
+ {
+ osb->in_voting = DOING_HEARTBEAT;
+ ocfs_up_sem(&(osb->voting_lock));
+
+ down (&(osb->publish_lock));
+ ocfs_nm_heart_beat(osb, HEARTBEAT_METHOD_DISK, false);
+ up (&(osb->publish_lock));
+ }
+ else
+ {
+ ocfs_up_sem(&(osb->voting_lock));
+ }
+ }
+ ocfs_sleep(1000);
+ i++;
+ }
+
+ osb->trans_in_progress = true;
+ if (osb->needs_flush)
+ LOG_ERROR_STR("CHANGE TO TRACE >>> Trans and needs flush both are set");
+ }
+
+ LOG_EXIT();
+ return;
+}
+
+
/*
* ocfs_disk_request_vote()
*
@@ -203,7 +248,7 @@
largestseqno = pubsect->publ_seq_num;
if (pubsect->dirty) {
OCFS_BH_PUT_DATA(bhs[i]);
- if (!IS_NODE_ALIVE (pubmap, i, numnodes) &&
+ if (!IS_NODE_ALIVE (pubmap, i, numnodes) ||
TEST_NODE_IN_RECOVERY(osb, i)) {
LOG_TRACE_STR("Node is in recovery, trying"
" again.");
@@ -215,41 +260,8 @@
wait += OCFS_NM_HEARTBEAT_TIME;
LOG_TRACE_ARGS ("wait: %d\n", wait);
ocfs_sleep (wait);
- if((osb->trans_in_progress) && (osb->needs_flush))
- {
- int i = 0;
-
- osb->trans_in_progress = false;
- atomic_set (&osb->flush_event_woken, 1);
- wake_up (&osb->flush_event);
- while ((osb->needs_flush) && (i < 30))
- {
- ocfs_down_sem(&(osb->voting_lock), true);
-
- if (osb->in_voting == SKIPPED_HEARTBEAT)
- {
- osb->in_voting = DOING_HEARTBEAT;
- ocfs_up_sem(&(osb->voting_lock));
- down (&(osb->publish_lock));
- ocfs_nm_heart_beat(osb, HEARTBEAT_METHOD_DISK, false);
- up (&(osb->publish_lock));
- }
- else
- {
- ocfs_up_sem(&(osb->voting_lock));
- }
- ocfs_sleep(1000);
- i++;
- }
-
- osb->trans_in_progress = false;
- if (osb->needs_flush)
- {
- printk("Trans and needs flush both are set");
- }
- }
-
+ ocfs_stupid_loop(osb, true);
}
status = -EAGAIN;
goto finally;
@@ -717,22 +729,9 @@
status = 0;
goto bail;
}
+
+ ocfs_stupid_loop(osb, false);
- if (osb->trans_in_progress && osb->needs_flush) {
- int i=0;
- osb->trans_in_progress = false;
- atomic_set (&osb->flush_event_woken, 1);
- wake_up (&osb->flush_event);
- while (osb->needs_flush && i<30) {
- ocfs_sleep(1000);
- i++;
- }
- osb->trans_in_progress = true;
-
- if (osb->needs_flush)
- printk("hey! it still needs flush!\n");
- }
-
timewaited += WAIT_FOR_VOTE_INCREMENT;
}
@@ -1212,6 +1211,8 @@
lockres->writer_node_num = OCFS_INVALID_NODE_NUM;
lockres->reader_node_num = OCFS_INVALID_NODE_NUM;
+ lockres->lock_holders = 0;
+
LOG_EXIT ();
return;
} /* ocfs_init_lockres */
@@ -1478,19 +1479,23 @@
fast_path = false;
truncate_extend = (flags & (FLAG_FILE_EXTEND | FLAG_FILE_TRUNCATE));
local_lock = (lockres->master_node_num == osb->node_num);
- wait_on_recovery = !IS_NODE_ALIVE(osb->publ_map, lockres->master_node_num, OCFS_MAXIMUM_NODES) && TEST_NODE_IN_RECOVERY(osb, lockres->master_node_num);
- become_master = (wait_on_recovery || lockres->master_node_num == OCFS_INVALID_NODE_NUM);
+ wait_on_recovery = TEST_NODE_IN_RECOVERY(osb, lockres->master_node_num);
+ become_master = (osb->node_num != lockres->master_node_num)
+ && (wait_on_recovery ||
+ lockres->master_node_num == OCFS_INVALID_NODE_NUM ||
+ !IS_NODE_ALIVE(osb->publ_map, lockres->master_node_num,
+ OCFS_MAXIMUM_NODES));
wait_for_release = (lockres->lock_type > OCFS_DLM_SHARED_LOCK);
wait_for_release = false;
- get_x = (flags & (FLAG_FILE_DELETE | FLAG_FILE_RENAME) && !become_master &&
- (local_lock || !wait_for_release));
- if (local_lock && !(get_x | truncate_extend))
+ get_x = (flags & (FLAG_FILE_DELETE | FLAG_FILE_RENAME)
+ && !become_master && (local_lock || !wait_for_release));
+ if (local_lock && !(get_x || truncate_extend))
fast_path = true;
if (local_lock && truncate_extend) {
printk("local_lock but an extend or truncate request! will do a master_request.\n");
become_master = true;
}
- if (!(fast_path | become_master | get_x | wait_for_release))
+ if (!(fast_path || become_master || get_x || wait_for_release))
master_request = true;
/* possible locking paths: */
@@ -1542,17 +1547,8 @@
goto finally;
}
- if ((osb->trans_in_progress) && (osb->needs_flush)) {
- int i = 0;
- osb->trans_in_progress = false;
- atomic_set (&osb->flush_event_woken, 1);
- wake_up (&osb->flush_event);
- while ((osb->needs_flush) && (i < 30)) {
- ocfs_sleep (1000);
- i++;
- }
- osb->trans_in_progress = true;
- }
+ ocfs_stupid_loop(osb, false);
+
updated = false;
goto again;
}
@@ -1563,10 +1559,8 @@
/* so why, if get_x and the make lock master do the same thing,
* does the make lock master path need to rewrite the stuff to disk
* but the get_x path doesn't ???? */
- if (get_x) {
- ocfs_release_lockres (lockres);
- goto finally;
- }
+ if (get_x)
+ goto skip_lock_write;
keep_exclusive = false;
goto got_lock;
}
@@ -1655,6 +1649,9 @@
OCFS_BH_PUT_DATA(*b);
status = 0;
}
+
+skip_lock_write:
+ lockres->lock_holders++;
ocfs_release_lockres (lockres);
finally:
@@ -1693,8 +1690,6 @@
LOG_ENTRY_ARGS ("(0x%08x, %u.%u, %u, %u, 0x%08x)\n", osb, HI (lock_id),
LO (lock_id), lock_type, flags, lockres);
- ocfs_acquire_lockres (lockres);
-
if (bh != NULL)
b = &bh;
else
@@ -1861,7 +1856,6 @@
if (tmpbh)
brelse(tmpbh);
- ocfs_release_lockres (lockres);
LOG_EXIT_STATUS (status);
return status;
} /* ocfs_disk_release_lock */
@@ -1886,7 +1880,6 @@
if (lockres->lock_type == OCFS_DLM_SHARED_LOCK)
lockres->lock_type = OCFS_DLM_NO_LOCK;
}
- ocfs_release_lockres (lockres);
status = 0;
goto finally;
}
@@ -1902,7 +1895,6 @@
(lockres->master_node_num == osb->node_num) &&
!(flags & FLAG_FILE_DELETE)) {
status = 0;
- ocfs_release_lockres (lockres);
goto finally;
}
@@ -1910,8 +1902,6 @@
if (flags & (FLAG_FILE_DELETE | FLAG_FILE_RELEASE_MASTER))
lockres->master_node_num = OCFS_INVALID_NODE_NUM;
- ocfs_release_lockres (lockres);
-
status = ocfs_disk_release_lock (osb, lock_id, lock_type, flags,
lockres, bh, inode);
if (status < 0) {
@@ -1920,6 +1910,8 @@
}
finally:
+ lockres->lock_holders--;
+ ocfs_release_lockres (lockres);
LOG_EXIT_STATUS (status);
return (status);
} /* ocfs_release_lock */
@@ -2074,23 +2066,7 @@
status = -EAGAIN;
while (status == -EAGAIN) {
if (retry)
- {
- if ((osb->trans_in_progress) && (osb->needs_flush))
- {
- int i = 0;
-
- osb->trans_in_progress = false;
- atomic_set (&osb->flush_event_woken, 1);
- wake_up (&osb->flush_event);
- while ((osb->needs_flush) && (i < 30))
- {
- ocfs_sleep(1000);
- i++;
- }
-
- osb->trans_in_progress = true;
- }
- }
+ ocfs_stupid_loop(osb, false);
if (!IS_NODE_ALIVE (osb->publ_map, lockres->master_node_num,
OCFS_MAXIMUM_NODES)) {
Modified: trunk/file.c
===================================================================
--- trunk/file.c 2003-11-21 23:31:21 UTC (rev 5)
+++ trunk/file.c 2003-12-04 23:52:47 UTC (rev 6)
@@ -683,7 +683,7 @@
}
}
- bail:
+bail:
if (saAcquired) {
OCFS_CLEAR_FLAG(oin->oin_flags, OCFS_OIN_OPEN_FOR_WRITE);
}
@@ -839,6 +839,20 @@
}
fe = (ocfs_file_entry *)OCFS_BH_GET_DATA(bh);
+
+ /* the file entry might have changed underneath us (while
+ * waiting on the lock). make sure the size is still a valid
+ * one. This really ought to check for other things too, like
+ * a valid bit, etc. */
+ if (file_size > fe->file_size) {
+ LOG_TRACE_ARGS("asked to truncate file with size (%u.%u) "
+ "to size (%u.%u)!\n", HILO(fe->file_size),
+ HILO(file_size));
+ OCFS_BH_PUT_DATA(bh);
+ status = -EINVAL;
+ LOG_ERROR_STATUS(status);
+ goto leave;
+ }
fe->file_size = file_size;
fe->alloc_size = new_alloc_size;
Modified: trunk/heartbeat.c
===================================================================
--- trunk/heartbeat.c 2003-11-21 23:31:21 UTC (rev 5)
+++ trunk/heartbeat.c 2003-12-04 23:52:47 UTC (rev 6)
@@ -145,6 +145,18 @@
node_map->time[i] = publish->time;
node_map->scan_rate[i] = publish->hbm[i];
node_map->scan_time[i] = curr_time;
+ if (publish->mounted && i != osb->node_num) {
+ printk("ocfs: Adding %s (node %d) to "
+ "clustered device (%u,%u)\n",
+ osb->node_cfg_info[i]->node_name, i,
+ MAJOR(osb->sb->s_dev),
+ MINOR(osb->sb->s_dev));
+ node_map->miss_cnt[i] = 0;
+ node_map->time[i] = publish->time;
+ UPDATE_PUBLISH_MAP(osb->publ_map, i,
+ OCFS_PUBLISH_SET,
+ num_nodes);
+ }
OCFS_BH_PUT_DATA(bhs[i]);
}
goto bail; /* exit */
@@ -166,6 +178,9 @@
if (atomic_read (&(node_map->dismount[i]))) {
node_map->miss_cnt[i] = MISS_COUNT_VALUE;
atomic_set (&(node_map->dismount[i]), 0);
+ UPDATE_PUBLISH_MAP (osb->publ_map, i,
+ OCFS_PUBLISH_CLEAR,
+ num_nodes);
} else
(node_map->miss_cnt[i])++;
if (node_map->miss_cnt[i] > MISS_COUNT_VALUE) {
Modified: trunk/inc/journal.h
===================================================================
--- trunk/inc/journal.h 2003-11-21 23:31:21 UTC (rev 5)
+++ trunk/inc/journal.h 2003-12-04 23:52:47 UTC (rev 6)
@@ -45,7 +45,8 @@
struct _ocfs_journal {
enum {
OCFS_JOURNAL_FREE = 0,
- OCFS_JOURNAL_LOADED
+ OCFS_JOURNAL_LOADED,
+ OCFS_JOURNAL_CREATE /* only used during journal_create */
} state; /* Journals current state */
journal_t *k_journal; /* The kernels journal type */
@@ -78,6 +79,11 @@
struct list_head commited; /* doubly linked list of all
* commited handles awaiting
* checkpointing. */
+#define OCFS_JOURNAL_CREATE_MAX_BMAPS 1000
+ __u32 bmaps; /* only used during
+ * journal_create. see
+ * ocfs_journal_create
+ * for an explanation */
};
typedef struct _ocfs_journal_lock ocfs_journal_lock;
@@ -230,7 +236,7 @@
#define ocfs_take_trans_lock(osb) \
do { \
down(&osb->trans_lock); \
- osb->trans_in_progress = false; \
+ osb->trans_in_progress = true; \
} while (0)
#define ocfs_release_trans_lock(osb) \
Modified: trunk/inc/ocfs.h
===================================================================
--- trunk/inc/ocfs.h 2003-11-21 23:31:21 UTC (rev 5)
+++ trunk/inc/ocfs.h 2003-12-04 23:52:47 UTC (rev 6)
@@ -1729,6 +1729,7 @@
__u64 last_read_time;
__u32 writer_node_num;
__u32 reader_node_num;
+ __u32 lock_holders;
};
struct _ocfs_inode
@@ -2488,7 +2489,7 @@
if (timeo) \
ocfs_clear_timeout(&__to); \
\
-} while(0) \
+} while(0)
#define ocfs_wait(wq, condition, timeout) \
({ \
Modified: trunk/inc/ocfsio.h
===================================================================
--- trunk/inc/ocfsio.h 2003-11-21 23:31:21 UTC (rev 5)
+++ trunk/inc/ocfsio.h 2003-12-04 23:52:47 UTC (rev 6)
@@ -106,7 +106,7 @@
struct super_block *sb;
ocfs_blockdev dev;
struct buffer_head *bh;
-
+
if (osb == NULL || osb->sb == NULL || bhs == NULL) {
printk("ocfs: osb == NULL || osb->sb == NULL || bhs == NULL\n");
status = -EINVAL;
@@ -159,9 +159,11 @@
#endif
mark_buffer_dirty(bh);
if (flags & OCFS_BH_CACHED && inode && !TEST_BH_SEQNUM(inode,bh)) {
+#ifdef VERBOSE_BH_SEQNUM_TRACE
printk("(write) bh (%u) seqnum (%u) does not match inode (%u)\n",
bh->b_blocknr, (bh->b_state & STATE_BIT_MASK) >> 19,
atomic_read(GET_INODE_CLEAN_SEQ(inode)));
+#endif
flags &= ~OCFS_BH_CACHED;
}
unlock_buffer(bh);
@@ -171,7 +173,7 @@
if (!(flags & OCFS_BH_CACHED))
ll_rw_block (WRITE, nr, bhs);
- for (i = 0 ; i < nr ; i++) {
+ for (i = (nr-1) ; i >= 0; i--) {
bh = bhs[i];
if (!(flags & OCFS_BH_CACHED))
wait_on_buffer(bh);
@@ -225,19 +227,21 @@
printk("ocfs: Asking me to read blocknum = %u even though "
"bh->blocknr == %u\n", blocknum, (*bh)->b_blocknr);
- if (!(flags & OCFS_BH_CACHED) && buffer_jbd(*bh)) {
- printk("ocfs: trying to sync read a jbd managed bh "
- "(blocknr = %u)\n", (*bh)->b_blocknr);
- return(status);
- }
-
if (flags & OCFS_BH_CACHED && inode && !TEST_BH_SEQNUM(inode, *bh)) {
+#ifdef VERBOSE_BH_SEQNUM_TRACE
printk("(read) bh (%u) seqnum (%u) does not match inode (%u)\n",
(*bh)->b_blocknr, ((*bh)->b_state & STATE_BIT_MASK) >> 19,
atomic_read(GET_INODE_CLEAN_SEQ(inode)));
+#endif
flags &= ~OCFS_BH_CACHED;
}
+ if (!(flags & OCFS_BH_CACHED) && buffer_jbd(*bh)) {
+ printk("ocfs: trying to sync read a jbd managed bh "
+ "(blocknr = %u)\n", (*bh)->b_blocknr);
+ return(status);
+ }
+
if (!(flags & OCFS_BH_CACHED)) {
LOCK_BUFFER_STR(*bh);
VERBOSE_LOCK_BUFFER_STR(*bh);
@@ -273,7 +277,7 @@
{
int status = 0;
struct super_block *sb;
- int nr, i;
+ int nr, i, ignore_cache;
__u64 blocknum;
ocfs_blockdev dev;
struct buffer_head *bh;
@@ -322,26 +326,32 @@
}
}
bh = bhs[i];
+
+ ignore_cache = 0;
- if (!(flags & OCFS_BH_CACHED) && buffer_jbd(bh)) {
- printk("ocfs: trying to sync read a jbd managed bh "
- "(blocknr = %u)\n", bh->b_blocknr);
- if (!buffer_uptodate(bh)) {
- printk("ocfs: jbd buffer is not uptodate!\n");
- status = -EINVAL;
- goto done;
- }
- continue;
- }
-
if (flags & OCFS_BH_CACHED && inode && !TEST_BH_SEQNUM(inode,bh)) {
+#ifdef VERBOSE_BH_SEQNUM_TRACE
printk("(read) bh (%u) seqnum (%u) does not match inode (%u)\n",
bh->b_blocknr, (bh->b_state & STATE_BIT_MASK) >> 19,
atomic_read(GET_INODE_CLEAN_SEQ(inode)));
- flags &= ~OCFS_BH_CACHED;
+#endif
+ ignore_cache = 1;
}
- if (!(flags & OCFS_BH_CACHED)) {
+ if (!(flags & OCFS_BH_CACHED) || ignore_cache) {
+ if (buffer_jbd(bh)) {
+ printk("ocfs: trying to sync read a jbd "
+ "managed bh (blocknr = %u)\n",
+ bh->b_blocknr);
+ if (!buffer_uptodate(bh)) {
+ printk("ocfs: jbd buffer is not "
+ "uptodate!\n");
+ status = -EINVAL;
+ goto done;
+ }
+ continue;
+ }
+
LOCK_BUFFER_STR(bh);
VERBOSE_LOCK_BUFFER_STR(bh);
lock_buffer(bh);
@@ -354,19 +364,19 @@
unlock_buffer(bh);
VERBOSE_UNLOCK_BUFFER_STR(bh);
}
- }
+ }
status = 0;
ll_rw_block(READ, nr, bhs);
- for (i = 0; i < nr ; i++) {
+ for (i = (nr-1); i >= 0; i--) {
bh = bhs[i];
wait_on_buffer(bh);
if (inode)
SET_BH_SEQNUM(inode, bh);
//buffer_insert_inode_clean_queue(bh, inode);
}
-
+
done:
return status;
}
Modified: trunk/inc/proto.h
===================================================================
--- trunk/inc/proto.h 2003-11-21 23:31:21 UTC (rev 5)
+++ trunk/inc/proto.h 2003-12-04 23:52:47 UTC (rev 6)
@@ -193,7 +193,6 @@
void ocfs_recover_oin_locks(ocfs_super *osb, __u32 node_num);
int ocfs_process_vote (ocfs_super * osb, ocfs_vote_request_ctxt *ctxt);
-int ocfs_flush_data (ocfs_inode * oin);
int ocfs_find_update_res (ocfs_super * osb, __u64 lock_id, ocfs_lock_res ** lockres, struct buffer_head **bh, __u32 * updated, __u32 timeout, struct inode *inode);
Modified: trunk/inode.c
===================================================================
--- trunk/inode.c 2003-11-21 23:31:21 UTC (rev 5)
+++ trunk/inode.c 2003-12-04 23:52:47 UTC (rev 6)
@@ -978,6 +978,20 @@
goto bail;
}
+ /* To understand this block, see ocfs_journal_create */
+ if (osb->journal.state == OCFS_JOURNAL_CREATE) {
+ if (osb->journal.bmaps >= OCFS_JOURNAL_CREATE_MAX_BMAPS) {
+ LOG_TRACE_ARGS("%d iterations in journal_create, "
+ "yielding\n",
+ OCFS_JOURNAL_CREATE_MAX_BMAPS);
+ osb->journal.bmaps = 0;
+
+ yield();
+ } else {
+ osb->journal.bmaps++;
+ }
+ }
+
vbo = (__s64) block << inode->i_sb->s_blocksize_bits;
len = 1;
err = ocfs_lookup_file_allocation(osb, oin, vbo, &lbo, len,
Modified: trunk/journal.c
===================================================================
--- trunk/journal.c 2003-11-21 23:31:21 UTC (rev 5)
+++ trunk/journal.c 2003-12-04 23:52:47 UTC (rev 6)
@@ -38,8 +38,10 @@
ocfs_inode *oin);
static int ocfs_recover_vol(struct _ocfs_super *osb, int node_num);
static int __ocfs_recovery_thread(void *arg);
+#ifdef USE_JOURNAL_CREATE_REPLACEMENT
+static int ocfs_replacement_journal_create(journal_t *journal);
+#endif
-
/* DO NOT EVER CALL THIS FUNCTION WITH A LOCKED BUFFER HEAD! */
ocfs_journal_handle * ocfs_start_trans(ocfs_super *osb, int max_buffs)
{
@@ -767,12 +769,13 @@
journal = &osb->journal;
inode = journal->k_inode;
- /* need to inc inode use count as journal_destroy will iput. */
- atomic_inc(&inode->i_count);
if (journal->state != OCFS_JOURNAL_LOADED)
goto done;
+ /* need to inc inode use count as journal_destroy will iput. */
+ atomic_inc(&inode->i_count);
+
num_running_trans = atomic_read(&(osb->journal.num_trans));
if (num_running_trans > 0)
LOG_ERROR_ARGS("Shutting down journal but there are %d " \
@@ -822,9 +825,9 @@
}
/*
- journal is formatted and loaded - so no need to call journal_load.
-*/
-
+ * journal is formatted and loaded - so no need to call journal_load.
+ * obviously, you MUST call this *after* ocfs_journal_init
+ */
int ocfs_journal_create(ocfs_journal *journal)
{
int status = 0;
@@ -839,16 +842,30 @@
osb = journal->osb;
sb = osb->sb;
+ /* The call to journal create will NOT result in our process
+ * going to sleep after the 1st bmap. On *really* slow, single
+ * CPU machines this can monopolize the CPU and prevent the
+ * heartbeat thread from timestamping our publish
+ * sector. Other nodes might think we're dead, and then try to
+ * recover us which could cause Very Bad Things. The
+ * (hackiferous) solution is to set our state, and in
+ * ocfs_bmap, call "yield" every 1000 times to give the
+ * heartbeat a chance. */
+ journal->state = OCFS_JOURNAL_CREATE;
+ journal->bmaps = 0;
+
+#ifdef USE_JOURNAL_CREATE_REPLACEMENT
+ status = ocfs_replacement_journal_create(journal->k_journal);
+#else
status = journal_create(journal->k_journal);
+#endif
if (status < 0) {
LOG_ERROR_STR("Failed to create new journal!");
goto done;
}
- /* set the mounted flag -- we've loaded the journal */
- status = ocfs_journal_set_mounted(osb, journal->node_num);
- if (status < 0)
- LOG_ERROR_STR("Could not set mounted flag!");
+ journal->state = OCFS_JOURNAL_LOADED;
+ journal->bmaps = 0;
done:
LOG_EXIT_STATUS(status);
@@ -881,11 +898,6 @@
journal_clear_err(journal->k_journal);
}
- /* set the mounted flag -- we've loaded the journal */
- status = ocfs_journal_set_mounted(osb, journal->node_num);
- if (status < 0)
- LOG_ERROR_STR("Could not set mounted flag!");
-
done:
LOG_EXIT_STATUS(status);
return(status);
@@ -1243,7 +1255,7 @@
status = 0;
goto clear_node;
}
- printk("ocfs: Recovering node %d from (%u,%u)\n", node_num,
+ printk("ocfs: Recovering node %d from device (%u,%u)\n", node_num,
MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
/* Absolutely need to make sure that the node is using the
@@ -1502,3 +1514,120 @@
LOG_EXIT_STATUS (status);
return status;
} /* ocfs_commit_cache */
+
+#ifdef USE_JOURNAL_CREATE_REPLACEMENT
+/*
+ * This code is here because a bug in journal_create exists which will
+ * cause us to hang at mount time. This bug exists in all 2.4 kernels
+ * before 2.4.22. RHEL3 and RHAS 2.1 (update 3 and later) do not
+ * exhibit this bug either.
+ *
+ * The fix is a one liner, and including this file is a hack which
+ * should be removed at the first convenient moment.
+ *
+ * Unfortunately, we *must* be able to call journal_create at mount
+ * time, so we have to keep this here for those unpatched kernels.
+ *
+ * Taken from the JBD implementation in 2.4.23
+ */
+
+#warning "using replacement code for journal_create"
+
+static void journal_fail_superblock (journal_t *journal)
+{
+ struct buffer_head *bh = journal->j_sb_buffer;
+ brelse(bh);
+ journal->j_sb_buffer = NULL;
+}
+
+static int ocfs_replacement_journal_create(journal_t *journal)
+{
+ unsigned long blocknr;
+ struct buffer_head *bh;
+ journal_superblock_t *sb;
+ int i, err = 0;
+ unsigned int first, last;
+
+ LOG_ENTRY();
+
+ if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) {
+ printk (KERN_ERR "Journal length (%d blocks) too short.\n",
+ journal->j_maxlen);
+ journal_fail_superblock(journal);
+ err = -EINVAL;
+ goto bail;
+ }
+
+ if (journal->j_inode == NULL) {
+ /*
+ * We don't know what block to start at!
+ */
+ printk(KERN_EMERG "%s: creation of journal on external "
+ "device!\n", __FUNCTION__);
+ BUG();
+ }
+
+ /* Zero out the entire journal on disk. We cannot afford to
+ have any blocks on disk beginning with JFS_MAGIC_NUMBER. */
+ jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
+ for (i = 0; i < journal->j_maxlen; i++) {
+ err = journal_bmap(journal, i, &blocknr);
+ if (err)
+ goto bail;
+ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ wait_on_buffer(bh);
+ memset (bh->b_data, 0, journal->j_blocksize);
+ BUFFER_TRACE(bh, "marking dirty");
+ mark_buffer_dirty(bh);
+ BUFFER_TRACE(bh, "marking uptodate");
+ mark_buffer_uptodate(bh, 1);
+ brelse(bh);
+ }
+
+ fsync_no_super(journal->j_dev);
+ jbd_debug(1, "JBD: journal cleared.\n");
+
+ /* OK, fill in the initial static fields in the new superblock */
+ sb = journal->j_superblock;
+
+ sb->s_header.h_magic = htonl(JFS_MAGIC_NUMBER);
+ sb->s_header.h_blocktype = htonl(JFS_SUPERBLOCK_V2);
+
+ sb->s_blocksize = htonl(journal->j_blocksize);
+ sb->s_maxlen = htonl(journal->j_maxlen);
+ sb->s_first = htonl(1);
+
+ journal->j_transaction_sequence = 1;
+
+ journal->j_flags &= ~JFS_ABORT;
+ journal->j_format_version = 2;
+
+ /* copied out of journal_reset */
+ first = ntohl(sb->s_first);
+ last = ntohl(sb->s_maxlen);
+
+ journal->j_first = first;
+ journal->j_last = last;
+
+ journal->j_head = first;
+ journal->j_tail = first;
+ journal->j_free = last - first;
+
+ journal->j_tail_sequence = journal->j_transaction_sequence;
+ journal->j_commit_sequence = journal->j_transaction_sequence - 1;
+ journal->j_commit_request = journal->j_commit_sequence;
+
+ journal->j_max_transaction_buffers = journal->j_maxlen / 4;
+
+ /* Add the dynamic fields and write it to disk. */
+ journal_update_superblock(journal, 1);
+
+bail:
+ if (!err) {
+ LOG_TRACE_STR("Calling journal_load...");
+ err = journal_load(journal);
+ }
+ LOG_EXIT_STATUS(err);
+ return(err);
+}
+#endif
Modified: trunk/nm.c
===================================================================
--- trunk/nm.c 2003-11-21 23:31:21 UTC (rev 5)
+++ trunk/nm.c 2003-12-04 23:52:47 UTC (rev 6)
@@ -184,22 +184,15 @@
goto finally;
if (osb->vol_state == VOLUME_MOUNTED) {
- if (osb->needs_flush) {
- i = 0;
- while ((osb->trans_in_progress) && (i < 10)) {
- ocfs_sleep (100);
- i++;
- }
-
- if ((down_trylock(&osb->trans_lock) == 0)
- && (osb->trans_in_progress == false)) {
+ if (osb->needs_flush && down_trylock(&osb->trans_lock) == 0) {
+ if (osb->trans_in_progress == false) {
osb->num_nm_thread_iter = 0;
status = ocfs_commit_cache(osb, false);
if (status < 0)
LOG_ERROR_STATUS (status);
osb->needs_flush = false;
- up(&osb->trans_lock);
}
+ up(&osb->trans_lock);
}
}
@@ -356,41 +349,6 @@
} /* ocfs_volume_thread */
/*
- * ocfs_flush_data()
- *
- */
-int ocfs_flush_data (ocfs_inode * oin)
-{
- int status = 0;
-
- LOG_ENTRY ();
-/* Removed to disable caching */
-#if 0
- if (oin->oin_flags & OCFS_OIN_DIRECTORY)
- goto bail;
-
- ocfs_down_sem (&(oin->main_res), true);
-
- oin->cache_enabled = false;
- ocfs_sleep(100);
-
- /* Grab and release PagingIo to serialize ourselves with the lazy writer. */
- /* This will work to ensure that all IO has completed on the cached */
- /* data and we will succesfully tear away the cache section. */
- ocfs_down_sem (&(oin->paging_io_res), true);
- ocfs_up_sem (&(oin->paging_io_res));
-
- fsync_inode_buffers (oin->inode);
-
- ocfs_up_sem (&(oin->main_res));
-
- bail:
-#endif
- LOG_EXIT_STATUS (status);
- return status;
-} /* ocfs_flush_data */
-
-/*
* ocfs_disk_update_resource()
*
* @osb: ocfs super block for the volume
@@ -909,8 +867,13 @@
else
osb->commit_cache_exec = false;
+ if (inode) {
+ fsync_inode_buffers(inode);
+#if LINUX_VERSION_CODE >= LinuxVersionCode(2,4,18)
+ fsync_inode_data_buffers(inode);
+#endif
+ }
if (oin != NULL) {
- ocfs_flush_data (oin);
lockres->lock_type = lockres->lock_state = OCFS_DLM_NO_LOCK;
lockres->cache_lock_held = false;
}
Modified: trunk/osb.c
===================================================================
--- trunk/osb.c 2003-11-21 23:31:21 UTC (rev 5)
+++ trunk/osb.c 2003-12-04 23:52:47 UTC (rev 6)
@@ -132,7 +132,6 @@
/* read the whole cluster bitmap off disk, even though we only
* need the beginning of it. */
status = ocfs_read_bhs(osb, vol_layout->bitmap_off, ONE_MEGA_BYTE, osb->cluster_bitmap.chunk, 0, NULL);
-// status = ocfs_read_bhs(osb, vol_layout->bitmap_off, (OCFS_ALIGN(osb->cluster_bitmap.validbits, OCFS_BITS_IN_CHUNK) / 8), osb->cluster_bitmap.chunk, 0);
if (status < 0) {
LOG_ERROR_STATUS (status);
goto bail;
@@ -227,7 +226,7 @@
/* skip the frees which happen on error only */
goto finally;
- bail:
+bail:
if (osb->root_sect_node.buckets)
ocfs_hash_destroy (&(osb->root_sect_node), NULL);
ocfs_safefree (osb->data_prealloc);
@@ -235,7 +234,7 @@
ocfs_safefree (osb->log_prealloc);
ocfs_safefree (osb->cfg_bhs);
- finally:
+finally:
if (publish) {
if (publish_bh) {
OCFS_BH_PUT_DATA(publish_bh);
@@ -363,10 +362,9 @@
printk("OCFS: Old journal type found, converting to new" \
"style. You will no longer be able to mount " \
"with old versions of ocfs.\n");
-
+
/* do our "journal_create" */
status = ocfs_journal_create(&osb->journal);
-
if (status < 0) {
LOG_ERROR_STR("Could not create journal!");
goto finally;
@@ -379,11 +377,6 @@
LOG_ERROR_STR("Could not update node config!");
goto finally;
}
- status = ocfs_journal_wipe(&osb->journal, 1);
- if (status < 0) {
- LOG_ERROR_STR("Could not clear journal blocks!");
- goto finally;
- }
goto skip_load;
}
@@ -398,7 +391,7 @@
/* will play back anything left in the journal. */
ocfs_journal_load(&osb->journal);
-
+
skip_load:
/* 'mounted' flag in publish sector should not be set until
* after we successfully load the journal. */
Modified: trunk/super.c
===================================================================
--- trunk/super.c 2003-11-21 23:31:21 UTC (rev 5)
+++ trunk/super.c 2003-12-04 23:52:47 UTC (rev 6)
@@ -1050,9 +1050,6 @@
}
#endif
- LOG_TRACE_ARGS ("osb=0x%08x rootoin=0x%08x offset=%u.%u\n", osb,
- rootoin, rootoin->file_disk_off);
-
ocfs_journal_shutdown(osb);
ocfs_sync_blockdev(sb);
ocfs_release_oin (rootoin, true);
More information about the Ocfs2-commits
mailing list