[Ocfs2-commits] mfasheh commits r2425 - in trunk/fs/ocfs2: . cluster
svn-commits at oss.oracle.com
svn-commits at oss.oracle.com
Tue Jun 28 16:07:40 CDT 2005
Author: mfasheh
Signed-off-by: jlbec
Date: 2005-06-23 21:30:36 -0500 (Thu, 23 Jun 2005)
New Revision: 2425
Added:
trunk/fs/ocfs2/uptodate.c
trunk/fs/ocfs2/uptodate.h
Removed:
trunk/fs/ocfs2/seqnum.c
trunk/fs/ocfs2/seqnum.h
Modified:
trunk/fs/ocfs2/Makefile
trunk/fs/ocfs2/alloc.c
trunk/fs/ocfs2/buffer_head_io.c
trunk/fs/ocfs2/cluster/masklog.c
trunk/fs/ocfs2/cluster/masklog.h
trunk/fs/ocfs2/dir.c
trunk/fs/ocfs2/dlmglue.c
trunk/fs/ocfs2/inode.c
trunk/fs/ocfs2/inode.h
trunk/fs/ocfs2/namei.c
trunk/fs/ocfs2/ocfs2.h
trunk/fs/ocfs2/suballoc.c
trunk/fs/ocfs2/super.c
Log:
* New metadata caching scheme. We were running out of bits on b_state with
which to hold any useful information so this will take its place.
Signed-off-by: jlbec
Modified: trunk/fs/ocfs2/Makefile
===================================================================
--- trunk/fs/ocfs2/Makefile 2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/Makefile 2005-06-24 02:30:36 UTC (rev 2425)
@@ -53,12 +53,12 @@
mmap.c \
namei.c \
proc.c \
- seqnum.c \
slot_map.c \
suballoc.c \
super.c \
symlink.c \
sysfile.c \
+ uptodate.c \
ver.c \
vote.c
@@ -82,12 +82,12 @@
mmap.h \
namei.h \
proc.h \
- seqnum.h \
slot_map.h \
suballoc.h \
super.h \
symlink.h \
sysfile.h \
+ uptodate.h \
ver.h \
vote.h
Modified: trunk/fs/ocfs2/alloc.c
===================================================================
--- trunk/fs/ocfs2/alloc.c 2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/alloc.c 2005-06-24 02:30:36 UTC (rev 2425)
@@ -39,11 +39,11 @@
#include "inode.h"
#include "journal.h"
#include "localalloc.h"
-#include "seqnum.h"
#include "suballoc.h"
#include "sysfile.h"
#include "file.h"
#include "super.h"
+#include "uptodate.h"
#include "buffer_head_io.h"
@@ -181,8 +181,7 @@
mlog_errno(status);
goto bail;
}
- set_buffer_uptodate(bhs[i]);
- ocfs2_set_bh_sequence(inode, bhs[i]);
+ ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
status = ocfs2_journal_access(handle, inode, bhs[i],
OCFS2_JOURNAL_ACCESS_CREATE);
@@ -1612,6 +1611,9 @@
if (!el->l_next_free_rec) {
mlog(0, "deleting this extent block.\n");
+
+ ocfs2_remove_from_cache(inode, eb_bh);
+
OCFS2_ASSERT(!eb->h_suballoc_slot);
OCFS2_ASSERT(!el->l_recs[0].e_clusters);
OCFS2_ASSERT(!el->l_recs[0].e_cpos);
Modified: trunk/fs/ocfs2/buffer_head_io.c
===================================================================
--- trunk/fs/ocfs2/buffer_head_io.c 2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/buffer_head_io.c 2005-06-24 02:30:36 UTC (rev 2425)
@@ -35,7 +35,7 @@
#include "alloc.h"
#include "inode.h"
#include "journal.h"
-#include "seqnum.h"
+#include "uptodate.h"
#include "buffer_head_io.h"
@@ -112,12 +112,10 @@
for (i = (nr - 1) ; i >= 0; i--) {
bh = bhs[i];
- wait_on_buffer(bh);
-
if (inode)
- ocfs2_set_bh_sequence(inode, bh);
- else
- ocfs2_clear_bh_sequence(bh);
+ ocfs2_set_buffer_uptodate(inode, bh);
+
+ wait_on_buffer(bh);
}
if (inode)
up(&OCFS2_I(inode)->ip_io_sem);
@@ -161,6 +159,9 @@
sb = osb->sb;
+ if (flags & OCFS2_BH_CACHED && !inode)
+ flags &= ~OCFS2_BH_CACHED;
+
if (inode)
down(&OCFS2_I(inode)->ip_io_sem);
for (i = 0 ; i < nr ; i++) {
@@ -177,18 +178,17 @@
bh = bhs[i];
ignore_cache = 0;
- if (flags & OCFS2_BH_CACHED && inode &&
- !ocfs2_test_bh_sequence(inode, bh)) {
- mlog(ML_SEQNUM, "bh (%llu) seqnum does not match "
- "inode %"MLFu64"\n",
+ if (flags & OCFS2_BH_CACHED &&
+ !ocfs2_buffer_uptodate(inode, bh)) {
+ mlog(ML_UPTODATE,
+ "bh (%llu), inode %"MLFu64" not uptodate\n",
(unsigned long long)bh->b_blocknr,
OCFS2_I(inode)->ip_blkno);
ignore_cache = 1;
}
- if ((flags & OCFS2_BH_CACHED) && (!buffer_uptodate(bh)))
- ignore_cache = 1;
-
+ /* XXX: Can we ever get this and *not* have the cached
+ * flag set? */
if (buffer_jbd(bh)) {
if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
mlog(ML_BH_IO, "trying to sync read a jbd "
@@ -234,6 +234,9 @@
for (i = (nr - 1); i >= 0; i--) {
bh = bhs[i];
+ if (inode)
+ ocfs2_set_buffer_uptodate(inode, bh);
+
/* We know this can't have changed as we hold the
* inode sem. Avoid doing any work on the bh as the
* journal has it now. */
@@ -241,11 +244,6 @@
continue;
wait_on_buffer(bh);
-
- if (inode)
- ocfs2_set_bh_sequence(inode, bh);
- else
- ocfs2_clear_bh_sequence(bh);
}
if (inode)
up(&OCFS2_I(inode)->ip_io_sem);
Modified: trunk/fs/ocfs2/cluster/masklog.c
===================================================================
--- trunk/fs/ocfs2/cluster/masklog.c 2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/cluster/masklog.c 2005-06-24 02:30:36 UTC (rev 2425)
@@ -204,7 +204,7 @@
set_a_string(EXTENT_MAP);
set_a_string(DLM_GLUE);
set_a_string(BH_IO);
- set_a_string(SEQNUM);
+ set_a_string(UPTODATE);
set_a_string(NAMEI);
set_a_string(INODE);
set_a_string(VOTE);
Modified: trunk/fs/ocfs2/cluster/masklog.h
===================================================================
--- trunk/fs/ocfs2/cluster/masklog.h 2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/cluster/masklog.h 2005-06-24 02:30:36 UTC (rev 2425)
@@ -101,7 +101,7 @@
#define ML_EXTENT_MAP 0x0000000000040000ULL /* ocfs2 extent map caching */
#define ML_DLM_GLUE 0x0000000000080000ULL /* ocfs2 dlm glue layer */
#define ML_BH_IO 0x0000000000100000ULL /* ocfs2 buffer I/O */
-#define ML_SEQNUM 0x0000000000200000ULL /* ocfs2 caching sequence #'s */
+#define ML_UPTODATE 0x0000000000200000ULL /* ocfs2 buffer up-to-date (metadata cache) tracking */
#define ML_NAMEI 0x0000000000400000ULL /* ocfs2 directory / namespace */
#define ML_INODE 0x0000000000800000ULL /* ocfs2 inode manipulation */
#define ML_VOTE 0x0000000001000000ULL /* ocfs2 node messaging */
Modified: trunk/fs/ocfs2/dir.c
===================================================================
--- trunk/fs/ocfs2/dir.c 2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/dir.c 2005-06-24 02:30:36 UTC (rev 2425)
@@ -54,8 +54,8 @@
#include "inode.h"
#include "journal.h"
#include "namei.h"
-#include "seqnum.h"
#include "suballoc.h"
+#include "uptodate.h"
#include "buffer_head_io.h"
@@ -464,8 +464,8 @@
goto bail;
}
- set_buffer_uptodate(new_bh);
- ocfs2_set_bh_sequence(dir, new_bh);
+ ocfs2_set_new_buffer_uptodate(dir, new_bh);
+
status = ocfs2_journal_access(handle, dir, new_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
Modified: trunk/fs/ocfs2/dlmglue.c
===================================================================
--- trunk/fs/ocfs2/dlmglue.c 2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/dlmglue.c 2005-06-24 02:30:36 UTC (rev 2425)
@@ -49,8 +49,8 @@
#include "heartbeat.h"
#include "inode.h"
#include "journal.h"
-#include "seqnum.h"
#include "slot_map.h"
+#include "uptodate.h"
#include "vote.h"
#include "buffer_head_io.h"
@@ -171,8 +171,6 @@
static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level);
-static inline void ocfs2_handle_meta_convert_action(struct inode *inode,
- struct ocfs2_lock_res *lockres);
static void ocfs2_schedule_blocked_lock(ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
static void ocfs2_schedule_blocked_inode_lock(struct inode *inode,
@@ -519,19 +517,7 @@
lockres->l_level = lockres->l_requested;
lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
- mlog_exit_void();
-}
-static inline void ocfs2_handle_meta_convert_action(struct inode *inode,
- struct ocfs2_lock_res *lockres)
-{
- mlog_entry_void();
-
- /* generic_handle_convert_action will set the refresh flag for us. */
- if (lockres->l_level == LKM_NLMODE)
- ocfs2_inc_inode_sequence(inode);
- ocfs2_generic_handle_convert_action(lockres);
-
mlog_exit_void();
}
@@ -584,19 +570,11 @@
switch(lockres->l_action) {
case OCFS2_AST_ATTACH:
- if (lockres->l_type == OCFS2_LOCK_TYPE_META &&
- lockres->l_requested > LKM_NLMODE &&
- !(lockres->l_flags & OCFS2_LOCK_LOCAL))
- ocfs2_inc_inode_sequence(inode);
-
ocfs2_generic_handle_attach_action(lockres);
lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
break;
case OCFS2_AST_CONVERT:
- if (lockres->l_type == OCFS2_LOCK_TYPE_META)
- ocfs2_handle_meta_convert_action(inode, lockres);
- else
- ocfs2_generic_handle_convert_action(lockres);
+ ocfs2_generic_handle_convert_action(lockres);
break;
case OCFS2_AST_DOWNCONVERT:
ocfs2_generic_handle_downconvert_action(lockres);
@@ -1480,6 +1458,10 @@
if (!ocfs2_should_refresh_lock_res(lockres))
goto bail;
+ /* This will discard any caching information we might have had
+ * for the inode metadata. */
+ ocfs2_metadata_cache_purge(inode);
+
if (ocfs2_lvb_is_trustable(lockres)) {
/* yay, fastpath! */
ocfs2_meta_lvb_get_trunc_clusters(lockres,
Modified: trunk/fs/ocfs2/inode.c
===================================================================
--- trunk/fs/ocfs2/inode.c 2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/inode.c 2005-06-24 02:30:36 UTC (rev 2425)
@@ -44,11 +44,11 @@
#include "inode.h"
#include "journal.h"
#include "namei.h"
-#include "seqnum.h"
#include "suballoc.h"
#include "super.h"
#include "symlink.h"
#include "sysfile.h"
+#include "uptodate.h"
#include "vote.h"
#include "buffer_head_io.h"
@@ -232,7 +232,7 @@
inode->i_mode = fe->i_mode;
inode->i_uid = fe->i_uid;
inode->i_gid = fe->i_gid;
- inode->i_blksize = (u32)osb->s_clustersize; // sb->s_blocksize;
+ inode->i_blksize = (u32)osb->s_clustersize;
/* Fast symlinks will have i_size but no allocated clusters. */
if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
@@ -664,6 +664,8 @@
goto bail_unblock;
}
+ ocfs2_remove_from_cache(inode, fe_bh);
+
status = ocfs2_free_dinode(handle, inode_alloc_inode,
inode_alloc_bh, fe);
if (status < 0) {
@@ -737,6 +739,16 @@
ocfs2_lock_res_free(&oi->ip_meta_lockres);
ocfs2_lock_res_free(&oi->ip_data_lockres);
+ ocfs2_metadata_cache_purge(inode);
+
+ mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached,
+ "Clear inode of %"MLFu64", inode has %u cache items\n",
+ oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached);
+
+ mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
+ "Clear inode of %"MLFu64", inode has a bad flag\n",
+ oi->ip_blkno);
+
mlog_bug_on_msg(spin_is_locked(&oi->ip_lock),
"Clear inode of %"MLFu64", inode is locked\n",
oi->ip_blkno);
@@ -765,8 +777,8 @@
"Clear inode of %"MLFu64" has non empty handle pointer\n",
oi->ip_blkno);
- oi->ip_clean_buffer_seq = OCFS2_CLEAN_SEQ_CLEAR;
- oi->ip_flags = 0;
+ /* Clear all other flags. */
+ oi->ip_flags = OCFS2_INODE_CACHE_INLINE;
oi->ip_created_trans = 0;
oi->ip_last_trans = 0;
oi->ip_dir_start_lookup = 0;
Modified: trunk/fs/ocfs2/inode.h
===================================================================
--- trunk/fs/ocfs2/inode.h 2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/inode.h 2005-06-24 02:30:36 UTC (rev 2425)
@@ -55,7 +55,6 @@
struct list_head ip_handle_list;
ocfs2_journal_handle *ip_handle;
- unsigned long ip_clean_buffer_seq;
u32 ip_flags; /* see below */
/* protected by recovery_lock. */
@@ -69,6 +68,8 @@
/* last transaction we were a part of. */
unsigned long ip_last_trans;
+ struct ocfs2_caching_info ip_metadata_cache;
+
struct inode vfs_inode;
};
@@ -83,8 +84,7 @@
#define OCFS2_INODE_DELETED 0x00000008
/* Another node is deleting, so our delete is a nop */
#define OCFS2_INODE_SKIP_DELETE 0x00000010
-/*
- * Has the inode been orphaned on another node?
+/* Has the inode been orphaned on another node?
*
* This hints to ocfs2_drop_inode that it should clear i_nlink before
* continuing.
@@ -101,6 +101,8 @@
#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020
/* Does someone have the file open O_DIRECT */
#define OCFS2_INODE_OPEN_DIRECT 0x00000040
+/* Indicates that the metadata cache should be used as an array. */
+#define OCFS2_INODE_CACHE_INLINE 0x00000080
static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
{
Modified: trunk/fs/ocfs2/namei.c
===================================================================
--- trunk/fs/ocfs2/namei.c 2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/namei.c 2005-06-24 02:30:36 UTC (rev 2425)
@@ -55,10 +55,10 @@
#include "inode.h"
#include "journal.h"
#include "namei.h"
-#include "seqnum.h"
#include "suballoc.h"
#include "symlink.h"
#include "sysfile.h"
+#include "uptodate.h"
#include "vote.h"
#include "buffer_head_io.h"
@@ -280,8 +280,8 @@
goto bail;
}
- set_buffer_uptodate(new_bh);
- ocfs2_set_bh_sequence(inode, new_bh);
+ ocfs2_set_new_buffer_uptodate(inode, new_bh);
+
status = ocfs2_journal_access(handle, inode, new_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
@@ -552,11 +552,8 @@
mlog_errno(status);
goto leave;
}
- set_buffer_uptodate(*new_fe_bh);
+ ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
- ocfs2_new_inode_sequence(inode);
- ocfs2_set_bh_sequence(inode, *new_fe_bh);
-
status = ocfs2_journal_access(handle, inode, *new_fe_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
@@ -1526,9 +1523,8 @@
mlog_errno(status);
goto bail;
}
+ ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]);
- set_buffer_uptodate(bhs[virtual]);
- ocfs2_set_bh_sequence(inode, bhs[virtual]);
status = ocfs2_journal_access(handle, inode, bhs[virtual],
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
Modified: trunk/fs/ocfs2/ocfs2.h
===================================================================
--- trunk/fs/ocfs2/ocfs2.h 2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/ocfs2.h 2005-06-24 02:30:36 UTC (rev 2425)
@@ -47,6 +47,21 @@
struct rb_root em_extents;
};
+/* Most user visible OCFS2 inodes will have very few pieces of
+ * metadata, but larger files (including bitmaps, etc) must be taken
+ * into account when designing an access scheme. We allow a small
+ * number of inlined blocks to be stored in an array and grow the
+ * structure into an rb tree when necessary. */
+#define OCFS2_INODE_MAX_CACHE_ARRAY 2 /* number of slots in ci_array */
+
+struct ocfs2_caching_info {
+ unsigned int ci_num_cached; /* blocks currently tracked, in either form */
+ union {
+ sector_t ci_array[OCFS2_INODE_MAX_CACHE_ARRAY]; /* searched while OCFS2_INODE_CACHE_INLINE is set in ip_flags */
+ struct rb_root ci_tree; /* searched when OCFS2_INODE_CACHE_INLINE is clear */
+ } ci_cache;
+};
+
/* this limits us to 256 nodes
* if we need more, we can do a kmalloc for the map */
#define OCFS2_NODE_MAP_MAX_NODES 256
@@ -211,7 +226,6 @@
wait_queue_head_t checkpoint_event;
atomic_t needs_checkpoint;
struct _ocfs2_journal *journal;
- unsigned long osb_clean_buffer_seq;
enum ocfs2_local_alloc_state local_alloc_state;
struct buffer_head *local_alloc_bh;
Deleted: trunk/fs/ocfs2/seqnum.c
===================================================================
--- trunk/fs/ocfs2/seqnum.c 2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/seqnum.c 2005-06-24 02:30:36 UTC (rev 2425)
@@ -1,186 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * seqnum.c
- *
- * Caching sequence number handling
- *
- * Standard buffer head caching flags (uptodate, etc) are insufficient
- * in a clustered environment - a buffer may be marked up to date on
- * our local node but could have been modified by another cluster
- * member. As a result an additional (and performant) caching scheme
- * is required. OCFS2 uses sequence numbers, stored on the inode and
- * buffer heads to test whether a buffer needs to be read from disk -
- * when a new cluster lock is aquired on an inode, it's sequence
- * number is incremented. Additionally, a buffer undergoing
- * modification is automatically marked by jbd and we can make liberal
- * use of buffer_jbd to shorcut many checks -- if it's in the journal,
- * then it *must* be up to date as we do not allow a metadata block to
- * be modified by multiple nodes at a time.
- *
- * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/fs.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-#include <linux/buffer_head.h>
-
-#define MLOG_MASK_PREFIX ML_SEQNUM
-
-#include <cluster/masklog.h>
-
-#include "ocfs2.h"
-
-#include "inode.h"
-#include "seqnum.h"
-
-/* Sequence numbers are stored in 3 places:
- *
- * bh->b_state - Set from inode sequence. If it differs from the inode
- * sequence, then the buffer is considered out of date.
- *
- * ip_clean_buffer_seq - The "master" number, used to compare against
- * buffer heads.
- *
- * osb_clean_buffer_seq - A globally incrementing value, we set inode
- * sequence numbers to it.
- */
-
-/* Number of BH bits used up through JBD. The number of unused bits
- * determine the maximum size of our sequence numbers. */
-#define OCFS2_USED_BH_BITS 22
-#define OCFS2_MAX_BH_BITS (8 * sizeof(((struct buffer_head *) 0)->b_state))
-#define OCFS2_STATE_BH_BITS (OCFS2_MAX_BH_BITS - OCFS2_USED_BH_BITS)
-
-#define OCFS2_MAX_SEQUENCE (1UL << OCFS2_STATE_BH_BITS)
-#define OCFS2_MAX_SEQUENCE_MASK ((1UL << OCFS2_STATE_BH_BITS) - 1)
-#define OCFS2_SEQUENCE_MASK ((~0UL) << OCFS2_USED_BH_BITS)
-
-static spinlock_t ocfs2_clean_buffer_lock = SPIN_LOCK_UNLOCKED;
-
-static void __ocfs2_inc_inode_sequence(struct inode *inode,
- int inc_global)
-{
- ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
-
- assert_spin_locked(&ocfs2_clean_buffer_lock);
-
- /* Though it's not currently likely, we want to handle the
- * case where the first inode to come through here is a newly
- * created one - hence the check for OCFS2_CLEAN_SEQ_CLEAR */
- if (inc_global ||
- osb->osb_clean_buffer_seq == OCFS2_CLEAN_SEQ_CLEAR) {
- osb->osb_clean_buffer_seq++;
-
- /* We are careful not to wrap the osb sequence */
- if (osb->osb_clean_buffer_seq >= OCFS2_MAX_SEQUENCE)
- osb->osb_clean_buffer_seq = OCFS2_CLEAN_SEQ_START;
- }
-
- /* Set this from the current osb sequence number. This helps
- * prevent inode sequence numbers from repeating themselves
- * too soon. */
- oi->ip_clean_buffer_seq = osb->osb_clean_buffer_seq;
-
- mlog(0, "(%u) Inode %"MLFu64", seq: %lu\n", current->pid,
- OCFS2_I(inode)->ip_blkno, oi->ip_clean_buffer_seq);
-}
-
-void ocfs2_inc_inode_sequence(struct inode *inode)
-{
- spin_lock(&ocfs2_clean_buffer_lock);
- __ocfs2_inc_inode_sequence(inode, 1);
- spin_unlock(&ocfs2_clean_buffer_lock);
-}
-
-/* Called only on newly created inodes. */
-void ocfs2_new_inode_sequence(struct inode *inode)
-{
- BUG_ON(OCFS2_I(inode)->ip_clean_buffer_seq);
-
- spin_lock(&ocfs2_clean_buffer_lock);
- /* We don't increment the global sequence number on brand new
- * inodes -- they won't have any existing metadata buffers
- * which might be old. */
- __ocfs2_inc_inode_sequence(inode, 0);
- spin_unlock(&ocfs2_clean_buffer_lock);
-}
-
-/* Completely point in time, which is fine for buffer_head comparsions
- * as they should be done under cluster lock in which case the
- * sequence won't be incrememnted. */
-static inline unsigned int ocfs2_get_inode_sequence(struct inode *inode)
-{
- unsigned int ret;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
-
- spin_lock(&ocfs2_clean_buffer_lock);
- if (oi->ip_clean_buffer_seq >= OCFS2_MAX_SEQUENCE)
- mlog(ML_ERROR,
- "Inode %"MLFu64" has bad sequence: %lu\n",
- oi->ip_blkno, oi->ip_clean_buffer_seq);
-
- ret = oi->ip_clean_buffer_seq & OCFS2_MAX_SEQUENCE_MASK;
- spin_unlock(&ocfs2_clean_buffer_lock);
-
- return ret;
-}
-
-/* The sequence numbers on buffer_head are protected by inodes
- * ip_io_sem - this also guards access to that buffer by the journal
- * (via ocfs2_journal_access), so we don't change the buffer state
- * bits at the same time as JBD, which will cause many hard to find
- * problems. Once the buffer has been passed to the journal, we can
- * test that condition via buffer_jbd and avoid changing state
- * bits.
- */
-
-void ocfs2_clear_bh_sequence(struct buffer_head *bh)
-{
- unsigned int prev = bh->b_state & OCFS2_SEQUENCE_MASK;
- bh->b_state &= ~prev;
-}
-
-/* For a newly created inode (e.g., one that's in read_locked_inode)
- * it's sequence number will be zero (uninitialized), until a cluster
- * lock is later acquired. That's ok though because we never want to
- * trust buffers for unlocked inodes.
- *
- * This function is also called against newly allocated inode
- * metadata */
-void ocfs2_set_bh_sequence(struct inode *inode,
- struct buffer_head *bh)
-{
- unsigned int seq =
- ocfs2_get_inode_sequence(inode) << OCFS2_USED_BH_BITS;
-
- ocfs2_clear_bh_sequence(bh);
- bh->b_state |= seq;
-}
-
-int ocfs2_test_bh_sequence(struct inode *inode,
- struct buffer_head *bh)
-{
- unsigned int seq =
- (bh->b_state & OCFS2_SEQUENCE_MASK) >> OCFS2_USED_BH_BITS;
-
- return seq == ocfs2_get_inode_sequence(inode);
-}
Deleted: trunk/fs/ocfs2/seqnum.h
===================================================================
--- trunk/fs/ocfs2/seqnum.h 2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/seqnum.h 2005-06-24 02:30:36 UTC (rev 2425)
@@ -1,41 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * seqnum.h
- *
- * Caching sequence number handling
- *
- * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef OCFS2_SEQNUM_H
-#define OCFS2_SEQNUM_H
-
-#define OCFS2_CLEAN_SEQ_CLEAR 0
-#define OCFS2_CLEAN_SEQ_START 1
-
-void ocfs2_inc_inode_sequence(struct inode *inode);
-void ocfs2_new_inode_sequence(struct inode *inode);
-
-void ocfs2_clear_bh_sequence(struct buffer_head *bh);
-void ocfs2_set_bh_sequence(struct inode *inode,
- struct buffer_head *bh);
-int ocfs2_test_bh_sequence(struct inode *inode,
- struct buffer_head *bh);
-
-#endif /* OCFS2_SEQNUM_H */
Modified: trunk/fs/ocfs2/suballoc.c
===================================================================
--- trunk/fs/ocfs2/suballoc.c 2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/suballoc.c 2005-06-24 02:30:36 UTC (rev 2425)
@@ -39,9 +39,9 @@
#include "inode.h"
#include "journal.h"
#include "localalloc.h"
-#include "seqnum.h"
#include "suballoc.h"
#include "sysfile.h"
+#include "uptodate.h"
#include "buffer_head_io.h"
@@ -274,8 +274,7 @@
mlog_errno(status);
goto bail;
}
- set_buffer_uptodate(bg_bh);
- ocfs2_set_bh_sequence(alloc_inode, bg_bh);
+ ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
status = ocfs2_block_group_fill(handle,
alloc_inode,
Modified: trunk/fs/ocfs2/super.c
===================================================================
--- trunk/fs/ocfs2/super.c 2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/super.c 2005-06-24 02:30:36 UTC (rev 2425)
@@ -57,10 +57,10 @@
#include "journal.h"
#include "localalloc.h"
#include "proc.h"
-#include "seqnum.h"
#include "slot_map.h"
#include "super.h"
#include "sysfile.h"
+#include "uptodate.h"
#include "ver.h"
#include "vote.h"
@@ -509,7 +509,12 @@
if (init_ocfs2_extent_maps())
return -ENOMEM;
- /* Initialize the memory slabs for oin and file entry */
+ status = init_ocfs2_uptodate_cache();
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
status = ocfs2_initialize_mem_caches();
if (status < 0) {
mlog_errno(status);
@@ -532,6 +537,7 @@
leave:
if (status < 0) {
ocfs2_free_mem_caches();
+ exit_ocfs2_uptodate_cache();
exit_ocfs2_extent_maps();
}
@@ -561,6 +567,8 @@
exit_ocfs2_extent_maps();
+ exit_ocfs2_uptodate_cache();
+
mlog_exit_void();
}
@@ -640,7 +648,6 @@
if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
SLAB_CTOR_CONSTRUCTOR) {
oi->ip_flags = 0;
- oi->ip_clean_buffer_seq = OCFS2_CLEAN_SEQ_CLEAR;
oi->ip_open_count = 0;
spin_lock_init(&oi->ip_lock);
ocfs2_extent_map_init(&oi->vfs_inode);
@@ -661,6 +668,8 @@
ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
ocfs2_lock_res_init_once(&oi->ip_data_lockres);
+ ocfs2_metadata_cache_init(&oi->vfs_inode);
+
inode_init_once(&oi->vfs_inode);
}
}
@@ -1108,7 +1117,6 @@
init_waitqueue_head(&osb->checkpoint_event);
atomic_set(&osb->needs_checkpoint, 0);
- osb->osb_clean_buffer_seq = OCFS2_CLEAN_SEQ_CLEAR;
osb->node_num = O2NM_INVALID_NODE_NUM;
osb->slot_num = OCFS2_INVALID_SLOT;
Added: trunk/fs/ocfs2/uptodate.c
===================================================================
--- trunk/fs/ocfs2/uptodate.c 2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/uptodate.c 2005-06-24 02:30:36 UTC (rev 2425)
@@ -0,0 +1,543 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * uptodate.c
+ *
+ * Tracking the up-to-date-ness of a local buffer_head with respect to
+ * the cluster.
+ *
+ * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Standard buffer head caching flags (uptodate, etc) are insufficient
+ * in a clustered environment - a buffer may be marked up to date on
+ * our local node but could have been modified by another cluster
+ * member. As a result an additional (and performant) caching scheme
+ * is required. A further requirement is that we consume as little
+ * memory as possible - we never pin buffer_head structures in order
+ * to cache them.
+ *
+ * We track the existence of up to date buffers on the inodes which
+ * are associated with them. Because we don't want to pin
+ * buffer_heads, this is only a (strong) hint and several other checks
+ * are made in the I/O path to ensure that we don't use a stale or
+ * invalid buffer without going to disk:
+ * - buffer_jbd is used liberally - if a bh is in the journal on
+ * this node then it *must* be up to date.
+ * - the standard buffer_uptodate() macro is used to detect buffers
+ * which may be invalid (even if we have an up to date tracking
+ * item for them)
+ *
+ * For a full understanding of how this code works together, one
+ * should read the callers in dlmglue.c, the I/O functions in
+ * buffer_head_io.c and ocfs2_journal_access in journal.c
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/buffer_head.h>
+#include <linux/rbtree.h>
+
+#define MLOG_MASK_PREFIX ML_UPTODATE
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "inode.h"
+#include "uptodate.h"
+
+struct ocfs2_meta_cache_item {
+ struct rb_node c_node; /* linkage into ci_cache.ci_tree */
+ sector_t c_block; /* block number; the key the tree is ordered by */
+};
+
+static kmem_cache_t *ocfs2_uptodate_cachep = NULL; /* slab cache that struct ocfs2_meta_cache_item objects are freed back to */
+/* Put an inode's metadata cache into its empty state: flag it as
+ * inline (array) mode and zero the count of tracked blocks. Only
+ * ip_flags and ci_num_cached are written; the ci_cache union itself
+ * is not touched. No lock is taken here -- ocfs2_metadata_cache_purge
+ * calls this with ip_lock already held. */
+void ocfs2_metadata_cache_init(struct inode *inode)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+
+ oi->ip_flags |= OCFS2_INODE_CACHE_INLINE;
+ ci->ci_num_cached = 0;
+}
+
+/* Erase every node from 'root' and free each item back to
+ * ocfs2_uptodate_cachep. Returns the number of items freed.
+ *
+ * No lock taken here as 'root' is not expected to be visible to other
+ * processes. */
+static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
+{
+ unsigned int purged = 0;
+ struct rb_node *node;
+ struct ocfs2_meta_cache_item *item;
+
+ while ((node = rb_last(root)) != NULL) { /* repeat until the tree is empty */
+ item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);
+
+ mlog(0, "Purge item %llu\n",
+ (unsigned long long) item->c_block);
+
+ rb_erase(&item->c_node, root); /* unlink before freeing */
+ kmem_cache_free(ocfs2_uptodate_cachep, item);
+
+ purged++;
+ }
+ return purged;
+}
+
+/* Called from locking and called from ocfs2_clear_inode. Dump the
+ * cache for a given inode.
+ *
+ * Under ip_lock the cache is reset to its empty inline state; if it
+ * was in tree form the old root is copied out first and its items
+ * are freed after the lock has been dropped.
+ *
+ * This function is a few more lines longer than necessary due to some
+ * accounting done here, but I think it's worth tracking down those
+ * bugs sooner -- Mark */
+void ocfs2_metadata_cache_purge(struct inode *inode)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ unsigned int tree, to_purge, purged;
+ struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+ struct rb_root root = RB_ROOT;
+
+ spin_lock(&oi->ip_lock);
+ tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
+ to_purge = ci->ci_num_cached;
+
+ /* 'tree' is nonzero when the cache is *not* inline, so a true
+ * value must select the "tree" label. */
+ mlog(0, "Purge %u %s items from Inode %"MLFu64"\n", to_purge,
+ tree ? "tree" : "array", oi->ip_blkno);
+
+ /* If we're a tree, save off the root so that we can safely
+ * initialize the cache. We do the work to free tree members
+ * without the spinlock. */
+ if (tree)
+ root = ci->ci_cache.ci_tree;
+
+ ocfs2_metadata_cache_init(inode);
+ spin_unlock(&oi->ip_lock);
+
+ /* For the array case 'root' is still RB_ROOT (empty), so this
+ * frees nothing and returns 0. */
+ purged = ocfs2_purge_copied_metadata_tree(&root);
+ /* If possible, track the number wiped so that we can more
+ * easily detect counting errors. Unfortunately, this is only
+ * meaningful for trees. */
+ if (tree && purged != to_purge)
+ mlog(ML_ERROR, "Inode %"MLFu64", count = %u, purged = %u\n",
+ oi->ip_blkno, to_purge, purged);
+}
+
+/* Linear scan of the first ci_num_cached entries of ci_array for
+ * 'item'.
+ *
+ * Returns the index in the cache array, -1 if not found.
+ * Requires ip_lock. */
+static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci,
+ sector_t item)
+{
+ int i;
+
+ for (i = 0; i < ci->ci_num_cached; i++) {
+ if (item == ci->ci_cache.ci_array[i])
+ return i;
+ }
+
+ return -1;
+}
+
+/* Walk ci_tree from its root, going left or right by comparing
+ * 'block' against each node's c_block.
+ *
+ * Returns the cache item if found, otherwise NULL.
+ * Requires ip_lock. */
+static struct ocfs2_meta_cache_item *
+ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
+ sector_t block)
+{
+ struct rb_node * n = ci->ci_cache.ci_tree.rb_node;
+ struct ocfs2_meta_cache_item *item = NULL;
+
+ while (n) {
+ item = rb_entry(n, struct ocfs2_meta_cache_item, c_node);
+
+ if (block < item->c_block)
+ n = n->rb_left;
+ else if (block > item->c_block)
+ n = n->rb_right;
+ else
+ return item;
+ }
+
+ return NULL; /* ran off a leaf: no node carries 'block' */
+}
+/* Returns nonzero if bh->b_blocknr is tracked in oi's metadata
+ * cache, zero otherwise. Takes ip_lock around the lookup and
+ * searches the array or the tree depending on
+ * OCFS2_INODE_CACHE_INLINE; the lock is released before the result
+ * is returned. */
+static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
+ struct buffer_head *bh)
+{
+ int index = -1;
+ struct ocfs2_meta_cache_item *item = NULL;
+
+ spin_lock(&oi->ip_lock);
+
+ mlog(0, "Inode %"MLFu64", query block %llu (inline = %u)\n",
+ oi->ip_blkno, (unsigned long long) bh->b_blocknr,
+ !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE));
+
+ if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE)
+ index = ocfs2_search_cache_array(&oi->ip_metadata_cache,
+ bh->b_blocknr);
+ else
+ item = ocfs2_search_cache_tree(&oi->ip_metadata_cache,
+ bh->b_blocknr);
+
+ spin_unlock(&oi->ip_lock);
+
+ mlog(0, "index = %d, item = %p\n", index, item);
+
+ return (index != -1) || (item != NULL); /* only one of the two searches ran; the other keeps its "not found" initial value */
+}
+
+/* Returns 0 if bh is not marked uptodate, 1 if it is uptodate and
+ * in the journal (buffer_jbd), and otherwise the result of looking
+ * the block up in the inode's metadata cache.
+ *
+ * Warning: even if it returns true, this does *not* guarantee that
+ * the block is stored in our inode metadata cache. */
+int ocfs2_buffer_uptodate(struct inode *inode,
+ struct buffer_head *bh)
+{
+ /* Doesn't matter if the bh is in our cache or not -- if it's
+ * not marked uptodate then we know it can't have correct
+ * data. */
+ if (!buffer_uptodate(bh))
+ return 0;
+
+ /* OCFS2 does not allow multiple nodes to be changing the same
+ * block at the same time. */
+ if (buffer_jbd(bh))
+ return 1;
+
+ /* Ok, locally the buffer is marked as up to date, now search
+ * our cache to see if we can trust that. */
+ return ocfs2_buffer_cached(OCFS2_I(inode), bh);
+}
+
+/*
+ * Store 'block' in the next free slot of the inline array and bump
+ * ci_num_cached.  The array must not already be full; that is
+ * enforced with BUG_ON.  Requires ip_lock.
+ */
+static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
+				     sector_t block)
+{
+	BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY);
+
+	mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
+	     ci->ci_num_cached);
+
+	ci->ci_cache.ci_array[ci->ci_num_cached++] = block;
+}
+
+/* Link 'new' into the rb-tree form of the cache, keyed by
+ * new->c_block, and increment ci_num_cached.
+ *
+ * By now the caller should have checked that the item does *not*
+ * exist in the tree; an equal key found during the walk is logged as
+ * an error and hits BUG().
+ * Requires ip_lock. */
+static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
+ struct ocfs2_meta_cache_item *new)
+{
+ sector_t block = new->c_block;
+ struct rb_node *parent = NULL;
+ struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
+ struct ocfs2_meta_cache_item *tmp;
+
+ mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block,
+ ci->ci_num_cached);
+
+ /* Descend to the empty child link where 'new' belongs,
+ * remembering the node that will become its parent. */
+ while(*p) {
+ parent = *p;
+
+ tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node);
+
+ if (block < tmp->c_block)
+ p = &(*p)->rb_left;
+ else if (block > tmp->c_block)
+ p = &(*p)->rb_right;
+ else {
+ /* This should never happen! */
+ mlog(ML_ERROR, "Duplicate block %llu cached!\n",
+ (unsigned long long) block);
+ BUG();
+ }
+ }
+
+ /* Attach at the link found above, then hand the node to the
+ * rbtree code to fix up the tree. */
+ rb_link_node(&new->c_node, parent, p);
+ rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree);
+ ci->ci_num_cached++;
+}
+
+/*
+ * Tell the insertion paths whether a new block can simply be appended
+ * to the inline array: the cache must still be flagged inline and the
+ * array must have a free slot.  Returns 1 if so, 0 otherwise.
+ * Asserts that ip_lock is held.
+ */
+static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi,
+					     struct ocfs2_caching_info *ci)
+{
+	assert_spin_locked(&oi->ip_lock);
+
+	if (!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE))
+		return 0;
+
+	return ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY;
+}
+
+/* Switch the inode's metadata cache from the inline array to an
+ * rb-tree, re-inserting every block number that was in the array.
+ *
+ * The cache must still be flagged inline, the array must be
+ * completely full, and ip_lock must be held; all three are checked
+ * on entry.
+ *
+ * tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the
+ * pointers in tree after we use them - this allows caller to detect
+ * when to free in case of error. */
+static void ocfs2_expand_cache(struct ocfs2_inode_info *oi,
+ struct ocfs2_meta_cache_item **tree)
+{
+ int i;
+ struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+
+ mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY,
+ "Inode %"MLFu64", num cached = %u, should be %u\n",
+ oi->ip_blkno, ci->ci_num_cached,
+ OCFS2_INODE_MAX_CACHE_ARRAY);
+ mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
+ "Inode %"MLFu64" not marked as inline anymore!\n",
+ oi->ip_blkno);
+ assert_spin_locked(&oi->ip_lock);
+
+ /* Be careful to initialize the tree members *first* because
+ * once the ci_tree is used, the array is junk... */
+ for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
+ tree[i]->c_block = ci->ci_cache.ci_array[i];
+
+ /* From here on the cache is a (so far empty) tree. */
+ oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE;
+ ci->ci_cache.ci_tree = RB_ROOT;
+ /* this will be set again by __ocfs2_insert_cache_tree */
+ ci->ci_num_cached = 0;
+
+ /* Ownership of each item passes to the tree; clearing the
+ * caller's pointer keeps the caller from freeing it. */
+ for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
+ __ocfs2_insert_cache_tree(ci, tree[i]);
+ tree[i] = NULL;
+ }
+
+ mlog(0, "Expanded %"MLFu64" to a tree cache: flags 0x%x, num = %u\n",
+ oi->ip_blkno, oi->ip_flags, ci->ci_num_cached);
+}
+
+/* Slow path function - memory allocation is necessary. See the
+ * comment above ocfs2_set_buffer_uptodate for more information.
+ *
+ * oi: inode whose metadata cache receives the block.
+ * block: block number to record.
+ * expand_tree: nonzero when the caller found the inline array full;
+ * OCFS2_INODE_MAX_CACHE_ARRAY extra items are then allocated up
+ * front so the array contents can be moved into a tree.
+ *
+ * Every allocation is done before ip_lock is taken. If one fails,
+ * -ENOMEM is logged and the block is simply not cached. */
+static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
+ sector_t block,
+ int expand_tree)
+{
+ int i;
+ struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+ struct ocfs2_meta_cache_item *new = NULL;
+ struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] =
+ { NULL, };
+
+ mlog(0, "Inode %"MLFu64", block %llu, expand = %d\n",
+ oi->ip_blkno, (unsigned long long) block, expand_tree);
+
+ /* Item for the block being inserted. */
+ new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_KERNEL);
+ if (!new) {
+ mlog_errno(-ENOMEM);
+ return;
+ }
+ new->c_block = block;
+
+ if (expand_tree) {
+ /* Do *not* allocate an array here - the removal code
+ * has no way of tracking that. */
+ for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
+ tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
+ GFP_KERNEL);
+ if (!tree[i]) {
+ mlog_errno(-ENOMEM);
+ goto out_free;
+ }
+
+ /* These are initialized in ocfs2_expand_cache! */
+ }
+ }
+
+ /* The cache may have changed while we were allocating without
+ * the lock, so re-evaluate its state under ip_lock. */
+ spin_lock(&oi->ip_lock);
+ if (ocfs2_insert_can_use_array(oi, ci)) {
+ mlog(0, "Someone cleared the tree underneath us\n");
+ /* Ok, items were removed from the cache in between
+ * locks. Detect this and revert back to the fast path */
+ ocfs2_append_cache_array(ci, block);
+ spin_unlock(&oi->ip_lock);
+ goto out_free;
+ }
+
+ if (expand_tree)
+ ocfs2_expand_cache(oi, tree);
+
+ __ocfs2_insert_cache_tree(ci, new);
+ spin_unlock(&oi->ip_lock);
+
+ /* 'new' is now linked into the tree - keep out_free from
+ * releasing it. */
+ new = NULL;
+out_free:
+ if (new)
+ kmem_cache_free(ocfs2_uptodate_cachep, new);
+
+ /* If these were used, then ocfs2_expand_cache re-set them to
+ * NULL for us. */
+ if (tree[0]) {
+ for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
+ if (tree[i])
+ kmem_cache_free(ocfs2_uptodate_cachep,
+ tree[i]);
+ }
+}
+
+/* Record bh's block number in the inode's metadata cache, unless it
+ * is already there.
+ *
+ * Item insertion is guarded by ip_io_sem, so the insertion path takes
+ * advantage of this by not rechecking for a duplicate insert during
+ * the slow case. Additionally, if the cache needs to be bumped up to
+ * a tree, the code will not recheck after acquiring the lock --
+ * multiple paths cannot be expanding to a tree at the same time.
+ *
+ * The slow path takes into account that items can be removed
+ * (including the whole tree wiped and reset) when this process is out
+ * allocating memory. In those cases, it reverts back to the fast
+ * path.
+ *
+ * Note that this function may actually fail to insert the block if
+ * memory cannot be allocated. This is not fatal however (but may
+ * result in a performance penalty) */
+void ocfs2_set_buffer_uptodate(struct inode *inode,
+ struct buffer_head *bh)
+{
+ int expand;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+
+ /* The block may very well exist in our cache already, so avoid
+ * doing any more work in that case. */
+ if (ocfs2_buffer_cached(oi, bh))
+ return;
+
+ mlog(0, "Inode %"MLFu64", inserting block %llu\n", oi->ip_blkno,
+ (unsigned long long) bh->b_blocknr);
+
+ /* No need to recheck under spinlock - insertion is guarded by
+ * ip_io_sem */
+ spin_lock(&oi->ip_lock);
+ if (ocfs2_insert_can_use_array(oi, ci)) {
+ /* Fast case - it's an array and there's a free
+ * spot. */
+ ocfs2_append_cache_array(ci, bh->b_blocknr);
+ spin_unlock(&oi->ip_lock);
+ return;
+ }
+
+ /* The array could not be used. If the cache is still flagged
+ * inline then the array is full; otherwise it is already a
+ * tree and only the new item needs allocating. */
+ expand = 0;
+ if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
+ /* We need to bump things up to a tree. */
+ expand = 1;
+ }
+ spin_unlock(&oi->ip_lock);
+
+ /* Slow path: allocates with ip_lock dropped. */
+ __ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand);
+}
+
+/* Called against a newly allocated buffer. Most likely nobody should
+ * be able to read this sort of metadata while it's still being
+ * allocated, but this is careful to take ip_io_sem anyway.
+ *
+ * Marks bh uptodate and records its block in the inode's metadata
+ * cache. Unlike ocfs2_set_buffer_uptodate, this takes ip_io_sem
+ * itself and treats an already-cached block as a bug. */
+void ocfs2_set_new_buffer_uptodate(struct inode *inode,
+ struct buffer_head *bh)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+ /* This should definitely *not* exist in our cache */
+ BUG_ON(ocfs2_buffer_cached(oi, bh));
+
+ set_buffer_uptodate(bh);
+
+ /* Serialize the insertion against other inserters. */
+ down(&oi->ip_io_sem);
+ ocfs2_set_buffer_uptodate(inode, bh);
+ up(&oi->ip_io_sem);
+}
+
+/*
+ * Drop the entry at 'index' from the inline block array and close the
+ * gap by sliding the entries after it down one slot.
+ *
+ * 'index' must name a valid entry (0 <= index < ci_num_cached);
+ * anything else trips a BUG_ON.  Requires ip_lock.
+ */
+static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
+					int index)
+{
+	sector_t *array = ci->ci_cache.ci_array;
+	int bytes;
+
+	BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY);
+	BUG_ON(index >= ci->ci_num_cached);
+	BUG_ON(!ci->ci_num_cached);
+
+	/* The closing parenthesis was missing from this message. */
+	mlog(0, "remove index %d (num_cached = %u)\n", index,
+	     ci->ci_num_cached);
+
+	ci->ci_num_cached--;
+
+	/* don't need to copy if the array is now empty, or if we
+	 * removed at the tail */
+	if (ci->ci_num_cached && index < ci->ci_num_cached) {
+		/* ci_num_cached was already decremented, so this is
+		 * exactly the number of entries after 'index'. */
+		bytes = sizeof(sector_t) * (ci->ci_num_cached - index);
+		memmove(&array[index], &array[index + 1], bytes);
+	}
+}
+
+/* Unlink 'item' from the rb-tree form of the cache and decrement
+ * ci_num_cached. The item itself is not freed here; that is left to
+ * the caller.
+ * Requires ip_lock. */
+static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
+ struct ocfs2_meta_cache_item *item)
+{
+ mlog(0, "remove block %llu from tree\n",
+ (unsigned long long) item->c_block);
+
+ rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
+ ci->ci_num_cached--;
+}
+
+/*
+ * Forget bh's block number, if the inode's metadata cache holds it.
+ *
+ * Called when we remove a chunk of metadata from an inode.  A cache
+ * that has already grown into a tree stays a tree: we don't bother
+ * reverting to the inlined array when a remove moves us back under
+ * the limit.  A tree item is unlinked under ip_lock and freed only
+ * after the lock has been dropped.
+ */
+void ocfs2_remove_from_cache(struct inode *inode,
+			     struct buffer_head *bh)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+	struct ocfs2_meta_cache_item *victim = NULL;
+	sector_t block = bh->b_blocknr;
+	int pos;
+
+	spin_lock(&oi->ip_lock);
+	mlog(0, "Inode %"MLFu64", remove %llu, items = %u, array = %u\n",
+	     oi->ip_blkno, (unsigned long long) block, ci->ci_num_cached,
+	     oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
+
+	if (!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE)) {
+		victim = ocfs2_search_cache_tree(ci, block);
+		if (victim != NULL)
+			ocfs2_remove_metadata_tree(ci, victim);
+	} else {
+		pos = ocfs2_search_cache_array(ci, block);
+		if (pos != -1)
+			ocfs2_remove_metadata_array(ci, pos);
+	}
+	spin_unlock(&oi->ip_lock);
+
+	if (victim != NULL)
+		kmem_cache_free(ocfs2_uptodate_cachep, victim);
+}
+
+/* Create the "ocfs2_uptodate" slab cache from which
+ * struct ocfs2_meta_cache_item objects are allocated.
+ * Returns 0 on success, -ENOMEM if the cache cannot be created. */
+int __init init_ocfs2_uptodate_cache(void)
+{
+ ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
+ sizeof(struct ocfs2_meta_cache_item),
+ 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if (!ocfs2_uptodate_cachep)
+ return -ENOMEM;
+
+ mlog(0, "%u inlined cache items per inode.\n",
+ OCFS2_INODE_MAX_CACHE_ARRAY);
+
+ return 0;
+}
+
+/* Destroy the slab cache set up by init_ocfs2_uptodate_cache. Does
+ * nothing if the cache pointer is NULL. */
+void __exit exit_ocfs2_uptodate_cache(void)
+{
+ if (ocfs2_uptodate_cachep)
+ kmem_cache_destroy(ocfs2_uptodate_cachep);
+}
Added: trunk/fs/ocfs2/uptodate.h
===================================================================
--- trunk/fs/ocfs2/uptodate.h 2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/uptodate.h 2005-06-24 02:30:36 UTC (rev 2425)
@@ -0,0 +1,44 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * uptodate.h
+ *
+ * Cluster uptodate tracking
+ *
+ * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_UPTODATE_H
+#define OCFS2_UPTODATE_H
+
+/* Create / destroy the slab cache that cache items are allocated
+ * from. init returns 0 on success or -ENOMEM. */
+int __init init_ocfs2_uptodate_cache(void);
+void __exit exit_ocfs2_uptodate_cache(void);
+
+void ocfs2_metadata_cache_init(struct inode *inode);
+void ocfs2_metadata_cache_purge(struct inode *inode);
+
+/* Nonzero when bh is marked uptodate and is either a jbd buffer or
+ * has its block number recorded in the inode's metadata cache. */
+int ocfs2_buffer_uptodate(struct inode *inode,
+ struct buffer_head *bh);
+/* Record bh's block number in the inode's metadata cache. May fail
+ * to insert, without reporting it, if memory cannot be allocated.
+ * The implementation relies on insertion being guarded by
+ * ip_io_sem. */
+void ocfs2_set_buffer_uptodate(struct inode *inode,
+ struct buffer_head *bh);
+/* For newly allocated buffers: marks bh uptodate and caches it,
+ * taking ip_io_sem itself. BUGs if the block is already cached. */
+void ocfs2_set_new_buffer_uptodate(struct inode *inode,
+ struct buffer_head *bh);
+/* Drop bh's block number from the inode's metadata cache, if it is
+ * there. */
+void ocfs2_remove_from_cache(struct inode *inode,
+ struct buffer_head *bh);
+
+#endif /* OCFS2_UPTODATE_H */
More information about the Ocfs2-commits
mailing list