[Ocfs2-commits] mfasheh commits r1479 - branches/dlm-changes/src

svn-commits at oss.oracle.com svn-commits at oss.oracle.com
Fri Sep 24 17:43:57 CDT 2004


Author: mfasheh
Date: 2004-09-24 17:43:55 -0500 (Fri, 24 Sep 2004)
New Revision: 1479

Added:
   branches/dlm-changes/src/suballoc.c
   branches/dlm-changes/src/suballoc.h
Modified:
   branches/dlm-changes/src/ocfs2_fs.h
Log:
* Commit our new metadata suballocation scheme. This bears close
  resemblance to a certain Linux FS...
  Things are still under testing, delete hasn't been written yet, and
  two features (the self optimizing code and the zeroing code) haven't
  been turned on yet.



Modified: branches/dlm-changes/src/ocfs2_fs.h
===================================================================
--- branches/dlm-changes/src/ocfs2_fs.h	2004-09-20 21:51:09 UTC (rev 1478)
+++ branches/dlm-changes/src/ocfs2_fs.h	2004-09-24 22:43:55 UTC (rev 1479)
@@ -56,6 +56,7 @@
 #define OCFS2_SUPER_BLOCK_SIGNATURE	"OCFSV2"
 #define OCFS2_INODE_SIGNATURE		"INODE01"
 #define OCFS2_EXTENT_BLOCK_SIGNATURE	"EXBLK01"
+#define OCFS2_GROUP_DESC_SIGNATURE      "GROUP01"
 
 /* Compatibility flags */
 #define OCFS2_HAS_COMPAT_FEATURE(sb,mask)			\
@@ -96,7 +97,7 @@
 #define OCFS2_BITMAP_FL		(0x00000080)	/* Allocation bitmap */
 #define OCFS2_JOURNAL_FL	(0x00000100)	/* Node journal */
 #define OCFS2_DLM_FL		(0x00000200)	/* DLM area */
-	
+#define OCFS2_SUBALLOC_FL	(0x00000400)	/* Suballocator File */
 
 /* Limit of space in ocfs2_dir_entry */
 #define OCFS2_MAX_FILENAME_LENGTH       255
@@ -225,6 +226,12 @@
 /*10*/
 } ocfs2_extent_rec;	
 
+/*
+ * On disk record describing one chain of block groups; an array of
+ * these lives in ocfs2_chain_list.
+ */
+typedef struct _ocfs2_chain_rec {
+	__u32 c_free;       /* number of free bits in this chain. */
+	__u32 c_total;      /* number of bits (used and free) in this chain. */
+	__u64 c_blkno;      /* Physical disk offset (blocks) of 1st group */
+} ocfs2_chain_rec;
+
 /*
  * On disk extent list for OCFS2 (node in the tree).  Note that this
  * is contained inside ocfs2_dinode or ocfs2_extent_block, so the
@@ -244,17 +251,26 @@
 /*10*/	ocfs2_extent_rec l_recs[0];	/* Extent records */
 } ocfs2_extent_list;
 
+/*
+ * On disk chain list for OCFS2.  This is contained inside
+ * ocfs2_dinode (id2.i_chain); cl_recs runs to the end of the block.
+ */
+typedef struct _ocfs2_chain_list {
+	__u16 cl_cpg;		/* Clusters per Block Group */
+	__u16 cl_bpc;		/* Bits per cluster */
+	__u16 cl_count;		/* Number of chain records; presumably
+				   the capacity of cl_recs - TODO confirm */
+	__u16 cl_next_free_rec;	/* Index of the first chain record
+				   not yet in use */
+	__u64 cl_reserved1;
+	ocfs2_chain_rec cl_recs[0];	/* Chain records */
+} ocfs2_chain_list;
+
 /*
  * On disk extent block (indirect block) for OCFS2
  */
 typedef struct _ocfs2_extent_block
 {
 /*00*/	__u8 h_signature[8];		/* Signature for verification */
-	__u64 h_suballoc_blkno;		/* Node suballocator offset,
-					   in blocks */
+	__u64 h_reserved1;
 /*10*/	__s16 h_suballoc_node;		/* Node suballocator this
 					   extent_header belongs to */
-	__u16 h_reserved1;
+	__u16 h_suballoc_bit;		/* Bit offset in suballocater
+					   block group */
 	__u32 h_reserved2;
 	__u64 h_blkno;			/* Offset on disk, in blocks */
 /*20*/	__u64 h_parent_blk;		/* Offset on disk, in blocks,
@@ -338,11 +354,11 @@
 typedef struct _ocfs2_dinode {
 /*00*/	__u8 i_signature[8];		/* Signature for validation */
 	__u32 i_generation;		/* Generation number */
-	__u16 i_reserved1;
 	__s16 i_suballoc_node;		/* Node suballocater this inode
 					   belongs to */
-/*10*/	__u64 i_suballoc_blkno;		/* Node suballocator offset,
-       					   in blocks */
+	__u16 i_suballoc_bit;		/* Bit offset in suballocater
+					   block group */
+/*10*/	__u64 i_reserved1;
 /*18*/	ocfs2_disk_lock i_disk_lock;	/* Lock structure */
 /*48*/	__u32 i_uid;			/* Owner UID */
 	__u32 i_gid;			/* Owning GID */
@@ -376,7 +392,8 @@
 	} id1;				/* Inode type dependant 1 */
 /*C0*/	union {
 		ocfs2_super_block i_super;
-                ocfs2_local_alloc i_lab;
+		ocfs2_local_alloc i_lab;
+		ocfs2_chain_list  i_chain;
 		ocfs2_extent_list i_list;
 	} id2;
 /* Actual on-disk size is one block */
@@ -394,8 +411,24 @@
 /* Actual on-disk length specified by rec_len */
 };
 
+/*
+ * On disk allocator group structure for OCFS2.  The descriptor sits in
+ * the first block of its group; bg_bitmap fills the rest of that block.
+ */
+typedef struct _ocfs2_group_desc
+{
+/*00*/	__u8    bg_signature[8];        /* Signature for validation */
+	__u16   bg_size;                /* Size of included bitmap in bytes. */
+	__u16   bg_bits;                /* Bits represented by this group. */
+	__u16	bg_free_bits_count;     /* Free bits count */
+	__u16   bg_chain;               /* What chain I am in. */
+	__u32   bg_generation;          /* Generation of the owning dinode */
+	__u64   bg_next_group;          /* Next group in my list, in blocks */
+	__u64   bg_parent_dinode;       /* dinode which owns me, in blocks */
+	__u64   bg_blkno;               /* Offset on disk, in blocks */
+	__u64   bg_reserved2[2];
+	__u8    bg_bitmap[0];           /* Allocation bitmap, bg_size bytes */
+} ocfs2_group_desc;
 
-
 #ifdef __KERNEL__
 static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
 {
@@ -407,6 +440,16 @@
 	return size / sizeof(struct _ocfs2_extent_rec);
 }
 
+/*
+ * Number of chain records that fit in a one-block dinode after the
+ * fixed part of the chain list.
+ */
+static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
+{
+	int space = sb->s_blocksize -
+		offsetof(struct _ocfs2_dinode, id2.i_chain.cl_recs);
+
+	return space / sizeof(struct _ocfs2_chain_rec);
+}
+
 static inline int ocfs2_extent_recs_per_eb(struct super_block *sb)
 {
 	int size;
@@ -426,6 +469,16 @@
 
 	return size;
 }
+
+/*
+ * Bytes available for the bitmap in a one-block group descriptor,
+ * i.e. everything from bg_bitmap to the end of the block.
+ */
+static inline int ocfs2_group_bitmap_size(struct super_block *sb)
+{
+	int bytes = sb->s_blocksize -
+		offsetof(struct _ocfs2_group_desc, bg_bitmap);
+
+	return bytes;
+}
 #else
 static inline int ocfs2_extent_recs_per_inode(int blocksize)
 {
@@ -437,6 +490,16 @@
 	return size / sizeof(struct _ocfs2_extent_rec);
 }
 
+/*
+ * Number of chain records that fit in a one-block dinode after the
+ * fixed part of the chain list (userspace variant, takes the block
+ * size directly).
+ */
+static inline int ocfs2_chain_recs_per_inode(int blocksize)
+{
+	int space = blocksize -
+		offsetof(struct _ocfs2_dinode, id2.i_chain.cl_recs);
+
+	return space / sizeof(struct _ocfs2_chain_rec);
+}
+
 static inline int ocfs2_extent_recs_per_eb(int blocksize)
 {
 	int size;
@@ -456,6 +519,16 @@
 
 	return size;
 }
+
+/*
+ * Bytes available for the bitmap in a one-block group descriptor
+ * (userspace variant, takes the block size directly).
+ */
+static inline int ocfs2_group_bitmap_size(int blocksize)
+{
+	int bytes = blocksize -
+		offsetof(struct _ocfs2_group_desc, bg_bitmap);
+
+	return bytes;
+}
 #endif  /* __KERNEL__ */
 
 

Added: branches/dlm-changes/src/suballoc.c
===================================================================
--- branches/dlm-changes/src/suballoc.c	2004-09-20 21:51:09 UTC (rev 1478)
+++ branches/dlm-changes/src/suballoc.c	2004-09-24 22:43:55 UTC (rev 1479)
@@ -0,0 +1,1008 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * suballoc.c
+ *
+ * metadata alloc and free
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Mark Fasheh, Kurt Hackel, Joel Becker, Sunil Mushran, 
+ *          Wim Coekaerts, Manish Singh
+ */
+
+#include "ocfs_compat.h"
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#include "ocfs_log.h"
+#include "ocfs.h"
+
+#include "alloc.h"
+#include "dlm.h"
+#include "util.h"
+#include "suballoc.h"
+#include "sysfile.h"
+
+#include "ocfs_journal.h"
+#include "buffer_head_io.h"
+
+#define OCFS_DEBUG_CONTEXT    OCFS_DEBUG_CONTEXT_CHAINALLOC
+
+static inline void debug_bg(ocfs2_group_desc *bg);
+static inline void debug_suballoc_inode(ocfs2_dinode *fe);
+static inline u16 ocfs2_find_victim_chain(ocfs2_chain_list *cl);
+static int ocfs2_block_group_fill(ocfs_journal_handle *handle, 
+				  struct inode *alloc_inode,
+				  struct buffer_head *bg_bh,
+				  u64 group_blkno,
+				  u16 my_chain,
+				  ocfs2_chain_list *cl);
+static int ocfs_block_group_alloc(ocfs_super *osb, 
+				  struct inode *alloc_inode,
+				  struct buffer_head *bh);
+static int ocfs_reserve_suballoc_bits(ocfs_super *osb, 
+				      ocfs_journal_handle *handle,
+				      ocfs2_alloc_context *ac);
+static int ocfs_claim_suballoc_bits(ocfs_super *osb,
+				    ocfs_journal_handle *handle,
+				    ocfs2_alloc_context *ac,
+				    u32 bits_wanted,
+				    u16 *bit_off,
+				    unsigned int *num_bits,
+				    u64 *bg_blkno);
+static int ocfs_block_group_find_clear_bits(ocfs_super *osb, 
+					    ocfs2_group_desc *bg,
+					    unsigned int bits_wanted, 
+					    u16 *bit_off,
+					    u16 *bits_found);
+static inline int ocfs_block_group_set_bits(ocfs_journal_handle *handle,
+					    ocfs2_group_desc *bg, 
+					    struct buffer_head *group_bh,
+					    unsigned int bit_off, 
+					    unsigned int num_bits);
+
+/*
+ * Initialize a brand new block group descriptor in bg_bh and hand it
+ * to the journal.
+ *
+ * handle:      running transaction used for the journal calls
+ * alloc_inode: suballocator inode the group will belong to
+ * bg_bh:       buffer for the descriptor block; must map group_blkno
+ * group_blkno: block number of the descriptor
+ * my_chain:    index in cl->cl_recs of the chain the group joins
+ * cl:          chain list of the suballocator
+ *
+ * The new group is linked in front of the chain: its bg_next_group is
+ * the chain's current head. The caller updates the chain record.
+ *
+ * Returns the status of the journal calls. A failing
+ * ocfs_journal_dirty() is logged and returned, but the debug output
+ * below it is still printed.
+ */
+static int ocfs2_block_group_fill(ocfs_journal_handle *handle, 
+				  struct inode *alloc_inode,
+				  struct buffer_head *bg_bh,
+				  u64 group_blkno,
+				  u16 my_chain,
+				  ocfs2_chain_list *cl)
+{
+	int status = 0;
+	ocfs2_group_desc *bg = (ocfs2_group_desc *) bg_bh->b_data;
+	struct super_block * sb = alloc_inode->i_sb;
+
+	LOG_ENTRY();
+
+	OCFS_ASSERT(((unsigned long long) bg_bh->b_blocknr) == group_blkno);
+
+	/* the block is never read from disk, its whole content is
+	 * written below */
+	set_buffer_uptodate(bg_bh);
+	SET_BH_SEQNUM(alloc_inode, bg_bh);
+	status = ocfs_journal_access(handle, 
+				     bg_bh, 
+				     OCFS_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	memset(bg, 0, sb->s_blocksize);
+	strcpy (bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
+	bg->bg_generation = cpu_to_le32(alloc_inode->i_generation);
+	bg->bg_size = ocfs2_group_bitmap_size(sb);
+	bg->bg_bits = (u32) cl->cl_cpg * (u32) cl->cl_bpc;
+	bg->bg_chain = my_chain;
+	/* push onto the front of the chain */
+	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
+	bg->bg_parent_dinode = OCFS_I(alloc_inode)->ip_blkno;
+	bg->bg_blkno = group_blkno;
+	/* set the 1st bit in the bitmap to account for the descriptor block */
+	set_bit(0, bg->bg_bitmap);
+	bg->bg_free_bits_count = bg->bg_bits - 1;
+
+	status = ocfs_journal_dirty(handle, bg_bh);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+#warning "we need to zero out the other blocks in the group! (only inode alloc?)"
+	printk("filled new block group:\n");
+	debug_bg(bg);
+
+bail:
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+
+/*
+ * Return the index of the chain record with the lowest c_total. All
+ * cl_count records are considered; ties go to the lowest index.
+ */
+static inline u16 ocfs2_find_smallest_chain(ocfs2_chain_list *cl)
+{
+	u16 idx, smallest = 0;
+
+	for (idx = 0; idx < cl->cl_count; idx++) {
+		if (cl->cl_recs[idx].c_total < cl->cl_recs[smallest].c_total)
+			smallest = idx;
+	}
+
+	return smallest;
+}
+#ifdef OCFS_BG_ZERO
+/*
+ * Start writing zeros to every block of a new group except its first
+ * block (the one that holds the group descriptor).
+ *
+ * bit_off:  start of the group, converted to a block number with
+ *           ocfs_clusters_to_blocks()
+ * clusters: size of the group
+ *
+ * Returns an array holding one locked, submitted buffer_head per
+ * zeroed block, to be passed to ocfs_block_group_zero_wait(), or NULL
+ * on failure (the error is logged here). On failure no buffer
+ * reference is left behind.
+ */
+static struct buffer_head **ocfs_block_group_zero_start(ocfs_super *osb,
+						       u32 bit_off,
+						       u16 clusters)
+{
+	struct buffer_head **bhs = NULL;
+	unsigned int blocks, i;
+	u64 blkno;
+
+	LOG_ENTRY();
+
+	/* one block less: the descriptor block is not zeroed */
+	blocks = ocfs_clusters_to_blocks(osb->sb, (u32) clusters) - (u64) 1;
+	bhs = kmalloc(blocks * sizeof(struct buffer_head *), GFP_KERNEL);
+	if (!bhs) {
+		LOG_ERROR_STATUS(-ENOMEM);
+		goto bail;
+	}
+	memset(bhs, 0, blocks * sizeof(struct buffer_head *));
+
+	blkno = ocfs_clusters_to_blocks(osb->sb, bit_off) + (u64) 1;
+	for (i = 0; i < blocks; i++) {
+		bhs[i] = sb_getblk(osb->sb, ((u64) i + blkno));
+		if (!bhs[i]) {
+			/* Don't leak the buffers we already submitted:
+			 * let their I/O finish, then drop our
+			 * references. */
+			while (i--) {
+				wait_on_buffer(bhs[i]);
+				brelse(bhs[i]);
+			}
+			kfree(bhs);
+			bhs = NULL;
+			LOG_ERROR_STATUS(-EIO);
+			goto bail;
+		}
+		lock_buffer(bhs[i]);
+		OCFS_ASSERT(!buffer_jbd(bhs[i]));
+
+		memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
+		set_buffer_uptodate(bhs[i]);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)		
+		clear_buffer_dirty(bhs[i]);
+#else
+		mark_buffer_clean(bhs[i]);
+#endif
+
+		bhs[i]->b_end_io = ocfs_end_buffer_io_sync;
+		submit_bh(WRITE, bhs[i]);
+	}
+bail:
+	LOG_EXIT();
+	return(bhs);
+}
+
+/*
+ * Wait for the writes started by ocfs_block_group_zero_start() and
+ * release its buffers.
+ *
+ * bhs:      array returned by ocfs_block_group_zero_start()
+ * clusters: same group size that was passed to the start function, so
+ *           that the same number of buffers is computed here
+ *
+ * The array itself is freed as well.
+ */
+static void ocfs_block_group_zero_wait(ocfs_super *osb, 
+				       struct buffer_head **bhs,
+				       u16 clusters)
+{
+	unsigned int blocks = 
+		ocfs_clusters_to_blocks(osb->sb, (u32) clusters) - (u64) 1;
+
+	/* walk the array back to front; also correct for blocks == 0 */
+	while (blocks--) {
+		wait_on_buffer(bhs[blocks]);
+		brelse(bhs[blocks]);
+	}
+
+	kfree(bhs);
+	return;
+}
+
+
+#endif
+/*
+ * Grow a suballocator by one block group.
+ *
+ * alloc_inode: the suballocator inode
+ * bh:          buffer holding its dinode
+ *
+ * Reserves and claims cl_cpg bits through ocfs_reserve_bits() /
+ * ocfs_claim_bits(), fills in a group descriptor at the start of the
+ * claimed space, links the group to the chain with the lowest c_total
+ * and updates the bit counts and sizes on the dinode and the inode.
+ * All of it runs in a transaction of its own which is committed before
+ * returning.
+ *
+ * We expect the block group allocator to already be locked.
+ *
+ * Returns 0 on success, a negative status otherwise.
+ */
+static int ocfs_block_group_alloc(ocfs_super *osb, 
+				  struct inode *alloc_inode,
+				  struct buffer_head *bh)
+{
+	int status, credits;
+	ocfs2_dinode *fe = (ocfs2_dinode *) bh->b_data;
+	ocfs2_chain_list *cl;
+	ocfs2_alloc_context *ac = NULL;
+	ocfs_journal_handle *handle = NULL;
+	u32 bit_off, num_bits;
+	u16 alloc_rec;
+	u64 bg_blkno;
+	struct buffer_head *bg_bh = NULL;
+	ocfs2_group_desc *bg;
+#ifdef OCFS_BG_ZERO
+	struct buffer_head **zero_bhs = NULL;
+#endif
+
+	LOG_ENTRY();
+
+	handle = ocfs_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	cl = &fe->id2.i_chain;
+	status = ocfs_reserve_bits(osb, 
+				   handle, 
+				   cl->cl_cpg, 
+				   &ac);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	debug_suballoc_inode(fe);
+
+	credits = ocfs_calc_group_alloc_credits(osb->sb, cl->cl_cpg);
+	printk("allocate new block group, requires %d credits\n", credits);
+	handle = ocfs_start_trans(osb, handle, credits);
+	if (!handle) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	ocfs_handle_set_always_commits(handle, 1);
+
+	status = ocfs_claim_bits(osb, 
+				 handle, 
+				 ac, 
+				 cl->cl_cpg, 
+				 &bit_off, 
+				 &num_bits);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+#ifdef OCFS_BG_ZERO
+	/* ocfs_block_group_zero_start() hands back the array of
+	 * submitted buffers, or NULL after logging the exact error. */
+	zero_bhs = ocfs_block_group_zero_start(osb, bit_off, cl->cl_cpg);
+	if (!zero_bhs) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+#endif
+	alloc_rec = ocfs2_find_smallest_chain(cl);
+
+	/* setup the group */
+	bg_blkno = ocfs_clusters_to_blocks(osb->sb, bit_off);
+	printk("new descriptor, record %u, at block %llu\n", 
+	       alloc_rec, bg_blkno);
+
+	bg_bh = sb_getblk(osb->sb, bg_blkno);
+	if (!bg_bh) {
+		status = -EIO;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = ocfs2_block_group_fill(handle, 
+					alloc_inode, 
+					bg_bh, 
+					bg_blkno,
+					alloc_rec, 
+					cl);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	bg = (ocfs2_group_desc *) bg_bh->b_data;
+
+	status = ocfs_journal_access(handle, bh, OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	/* the new group becomes the head of its chain */
+	cl->cl_recs[alloc_rec].c_free  += bg->bg_free_bits_count;
+	cl->cl_recs[alloc_rec].c_total += bg->bg_bits;
+	cl->cl_recs[alloc_rec].c_blkno  = bg_blkno;
+	if (cl->cl_next_free_rec < cl->cl_count)
+		cl->cl_next_free_rec++;
+
+	fe->id1.bitmap1.i_used  += (bg->bg_bits - bg->bg_free_bits_count);
+	fe->id1.bitmap1.i_total += bg->bg_bits;
+	fe->i_clusters += cl->cl_cpg;
+
+	status = ocfs_journal_dirty(handle, bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	down (&(OCFS_I(alloc_inode)->ip_sem));
+	OCFS_I(alloc_inode)->ip_alloc_size = 
+		(u64)fe->i_clusters << osb->s_clustersize_bits;
+	fe->i_size = OCFS_I(alloc_inode)->ip_alloc_size;
+	up (&(OCFS_I(alloc_inode)->ip_sem));
+	alloc_inode->i_size = fe->i_size;
+	alloc_inode->i_blocks = (alloc_inode->i_size + osb->sb->s_blocksize - 1) >> osb->sb->s_blocksize_bits;
+
+	printk("allocation succes, new block group allocator:\n");
+	debug_suballoc_inode(fe);
+	status = 0;
+bail:
+#ifdef OCFS_BG_ZERO
+	/* zero_bhs is only ever set after cl was initialized */
+	if (zero_bhs)
+		ocfs_block_group_zero_wait(osb, zero_bhs, cl->cl_cpg);
+#endif
+	if (handle)
+		ocfs_commit_trans(handle);
+
+	if (ac)
+		ocfs_free_alloc_context(ac);
+
+	if (bg_bh)
+		brelse(bg_bh);
+
+	LOG_EXIT_STATUS(status);
+	return status;
+}
+
+/*
+ * Lock the suballocator in ac->ac_inode and make sure it has at least
+ * ac->ac_bits_wanted free bits, adding a block group if it does not.
+ *
+ * handle: must not have been started yet; the lock and the inode are
+ *         attached to it
+ * ac:     allocation context; on success ac->ac_bh holds a reference
+ *         to the buffer of the suballocator dinode
+ *
+ * Returns 0 on success, a negative status otherwise (-EINTR from the
+ * lock call is passed on without being logged).
+ */
+static int ocfs_reserve_suballoc_bits(ocfs_super *osb, 
+				      ocfs_journal_handle *handle,
+				      ocfs2_alloc_context *ac)
+{
+	int status;
+	u32 bits_wanted = ac->ac_bits_wanted;
+	struct inode *alloc_inode = ac->ac_inode;
+	struct buffer_head *bh = NULL;
+	ocfs2_dinode *fe;
+
+	LOG_ENTRY();
+
+	OCFS_ASSERT(!(handle->flags & OCFS_HANDLE_STARTED));
+
+	status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 
+				   0, &bh, alloc_inode);
+	if (status < 0) {
+		if (status != -EINTR)
+			LOG_ERROR_STATUS (status);
+		goto bail;
+	}
+	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
+			     0, alloc_inode);
+	ocfs_handle_add_inode(handle, alloc_inode);
+
+	fe = (ocfs2_dinode *) bh->b_data;
+	OCFS_ASSERT(IS_VALID_FILE_ENTRY(fe));
+	OCFS_ASSERT(fe->i_flags & OCFS2_SUBALLOC_FL);
+
+	/* not enough free bits left: add one more block group */
+	if (bits_wanted > (le32_to_cpu(fe->id1.bitmap1.i_total) - 
+			   le32_to_cpu(fe->id1.bitmap1.i_used))) {
+		status = ocfs_block_group_alloc(osb, alloc_inode, bh);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+		/* You should never ask for this much metadata */
+		OCFS_ASSERT(bits_wanted <= 
+			    (le32_to_cpu(fe->id1.bitmap1.i_total) 
+			     - le32_to_cpu(fe->id1.bitmap1.i_used)));
+	}
+
+	/* extra reference for ac->ac_bh; ours is dropped at bail */
+	get_bh(bh);
+	ac->ac_bh = bh;
+	status = 0;
+bail:
+	if (bh)
+		brelse(bh);
+
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+
+/*
+ * Reserve space in the extent (metadata) suballocator for extending
+ * the file described by fe.
+ *
+ * handle: transaction handle, not started yet
+ * inode:  inode being extended; not used here and left untouched
+ * fe:     its dinode, used to size the reservation
+ * ac:     out; new allocation context on success, NULL on failure
+ *
+ * Twice the amount reported by ocfs2_extend_meta_needed() is reserved.
+ *
+ * Returns 0 on success, a negative status otherwise.
+ */
+int ocfs_reserve_new_metadata(ocfs_super *osb, 
+			      ocfs_journal_handle *handle,
+			      struct inode *inode,
+			      ocfs2_dinode *fe,
+			      ocfs2_alloc_context **ac)
+{
+	int status;
+	struct inode *alloc_inode = NULL;
+
+	*ac = kmalloc(sizeof(ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	memset(*ac, 0, sizeof(ocfs2_alloc_context));
+	/* Our file data alloc path is such a mess that I really feel
+	 * comfortable just always over-reserving here. */
+	(*ac)->ac_bits_wanted = 2 * ocfs2_extend_meta_needed(fe);
+	(*ac)->ac_handle = handle;
+	(*ac)->ac_which = OCFS_AC_USE_META;
+
+#ifndef OCFS_USE_ALL_METADATA_SUBALLOCATORS
+	alloc_inode = ocfs_get_system_file_inode(osb, EXTENT_ALLOC_BITMAP_SYSTEM_INODE, 0);
+#else
+	alloc_inode = ocfs_get_system_file_inode(osb, EXTENT_ALLOC_BITMAP_SYSTEM_INODE, osb->node_num);
+#endif
+	/* fail when the system inode could NOT be found */
+	if (!alloc_inode) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	(*ac)->ac_inode = igrab(alloc_inode);
+
+	status = ocfs_reserve_suballoc_bits(osb, handle, (*ac));
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	/* Drop the reference taken by the lookup above. The caller's
+	 * inode is not ours to put. */
+	if (alloc_inode)
+		iput(alloc_inode);
+
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+
+/*
+ * Reserve a single bit in this node's inode suballocator.
+ *
+ * handle: transaction handle, not started yet
+ * ac:     out; new allocation context on success, NULL on failure
+ *
+ * Returns 0 on success, a negative status otherwise.
+ */
+int ocfs_reserve_new_inode(ocfs_super *osb, 
+			   ocfs_journal_handle *handle,
+			   ocfs2_alloc_context **ac)
+{
+	int status;
+	struct inode *alloc_inode = NULL;
+	ocfs2_alloc_context *ctx;
+
+	ctx = kmalloc(sizeof(ocfs2_alloc_context), GFP_KERNEL);
+	*ac = ctx;
+	if (!ctx) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	memset(ctx, 0, sizeof(ocfs2_alloc_context));
+	ctx->ac_bits_wanted = 1;
+	ctx->ac_handle = handle;
+	ctx->ac_which = OCFS_AC_USE_INODE;
+
+	alloc_inode = ocfs_get_system_file_inode(osb, INODE_ALLOC_BITMAP_SYSTEM_INODE, osb->node_num);
+	if (!alloc_inode) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	/* the context keeps its own reference to the allocator inode */
+	ctx->ac_inode = igrab(alloc_inode);
+
+	status = ocfs_reserve_suballoc_bits(osb, handle, ctx);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if ((status < 0) && ctx) {
+		ocfs_free_alloc_context(ctx);
+		*ac = NULL;
+	}
+
+	/* drop the reference taken by the lookup */
+	if (alloc_inode)
+		iput(alloc_inode);
+
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+
+/*
+ * Look for a run of clear bits in the bitmap of a block group.
+ *
+ * The scan stops at the first run of bits_wanted contiguous clear
+ * bits. If the group has no run that long, the longest run seen is
+ * handed out instead (the earliest one if several have that length),
+ * so for bits_wanted >= 1 the result is never longer than asked for.
+ *
+ * bit_off:    out; first bit of the run
+ * bits_found: out; length of the run
+ *
+ * Returns 0 on success or -ENOSPC if no bit below bg->bg_bits is
+ * clear; the out parameters are left alone in that case.
+ */
+static int ocfs_block_group_find_clear_bits(ocfs_super *osb, 
+					    ocfs2_group_desc *bg,
+					    unsigned int bits_wanted, 
+					    u16 *bit_off,
+					    u16 *bits_found)
+{
+	void *bitmap = bg->bg_bitmap;
+	u16 best_offset = 0, best_size = 0;
+	unsigned int offset, start = 0, found = 0;
+	int status = 0;
+
+	OCFS_ASSERT(IS_VALID_GROUP_DESC(bg));
+
+	/* find_next_zero_bit() reports "no more zero bits" by returning
+	 * a value that is not below the size passed in. */
+	while ((offset = find_next_zero_bit(bitmap, 
+					    bg->bg_bits, 
+					    start)) < bg->bg_bits) {
+		if (offset == start)
+			found++;	/* the current run goes on */
+		else
+			found = 1;	/* got a zero after some ones */
+		start = offset + 1;
+
+		/* Record every run, including one that is only a
+		 * single bit long so far. */
+		if (found > best_size) {
+			best_size = found;
+			best_offset = start - found;
+		}
+
+		/* we got everything we needed */
+		if (found == bits_wanted)
+			break;
+	}
+
+	/* A run that reached bits_wanted is by construction also the
+	 * best one recorded, so one test covers both outcomes. */
+	if (best_size) {
+		*bit_off = best_offset;
+		*bits_found = best_size;
+	} else {
+		status = -ENOSPC;
+		LOG_ERROR_STATUS(status);
+	}
+
+	return(status);
+}
+
+/*
+ * Mark num_bits bits starting at bit_off as used in a block group and
+ * lower its free bit count, all under the journal.
+ *
+ * bg:       descriptor of the group; lives in group_bh
+ * group_bh: buffer holding the descriptor
+ *
+ * Returns the status of the journal calls.
+ */
+static inline int ocfs_block_group_set_bits(ocfs_journal_handle *handle,
+					    ocfs2_group_desc *bg, 
+					    struct buffer_head *group_bh,
+					    unsigned int bit_off, 
+					    unsigned int num_bits)
+{
+	int status;
+	unsigned int bit;
+	unsigned int end = bit_off + num_bits;
+	void *bitmap = bg->bg_bitmap;
+
+	LOG_ENTRY();
+
+	OCFS_ASSERT(IS_VALID_GROUP_DESC(bg));
+	OCFS_ASSERT(bg->bg_free_bits_count >= num_bits);
+
+	printk("block_group_set_bits: off = %u, num = %u\n", bit_off, 
+	       num_bits);
+	debug_bg(bg);
+
+	status = ocfs_journal_access(handle, 
+				     group_bh, 
+				     OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	bg->bg_free_bits_count -= num_bits;
+
+	for (bit = bit_off; bit != end; bit++)
+		set_bit(bit, bitmap);
+
+	status = ocfs_journal_dirty(handle, 
+				    group_bh);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+
+bail:
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+
+/*
+ * Pick the chain to allocate from: among the chains in use (below
+ * cl_next_free_rec) the one with the most free bits, the lowest index
+ * winning ties. At least one chain must be in use.
+ */
+static inline u16 ocfs2_find_victim_chain(ocfs2_chain_list *cl)
+{
+	u16 idx, victim = 0;
+
+	OCFS_ASSERT(cl->cl_next_free_rec);
+
+	for (idx = 0; idx < cl->cl_next_free_rec; idx++) {
+		if (cl->cl_recs[idx].c_free > cl->cl_recs[victim].c_free)
+			victim = idx;
+	}
+
+	OCFS_ASSERT(victim < cl->cl_next_free_rec);
+	return victim;
+}
+
+#ifdef OCFS2_OPTIMIZE_SUBALLOCATORS
+/*
+ * Move a block group to the head of its chain.
+ *
+ * fe_bh:      buffer of the suballocator dinode
+ * bg_bh:      buffer of the group to move
+ * prev_bg_bh: buffer of the group currently pointing at it
+ * chain:      index of the chain record in the dinode
+ *
+ * Three pointers change: the previous group skips over bg, bg points
+ * at the old head, and the chain record points at bg. If any journal
+ * call fails, all three in-memory pointers are put back to the values
+ * they had on entry.
+ *
+ * Returns 0 on success, a negative status otherwise.
+ */
+static int ocfs_relink_block_group(ocfs_journal_handle *handle,
+				   struct buffer_head *fe_bh,
+				   struct buffer_head *bg_bh,
+				   struct buffer_head *prev_bg_bh,
+				   u16 chain)
+{
+	int status;
+	/* there is a really tiny chance the journal calls could fail,
+	 * but we wouldn't want inconsistent blocks in *any* case. */
+	u64 fe_ptr, bg_ptr, prev_bg_ptr;
+	ocfs2_dinode *fe = (ocfs2_dinode *) fe_bh->b_data;
+	ocfs2_group_desc *bg = (ocfs2_group_desc *) bg_bh->b_data;
+	ocfs2_group_desc *prev_bg = (ocfs2_group_desc *) prev_bg_bh->b_data;
+
+	OCFS_ASSERT(IS_VALID_FILE_ENTRY(fe));
+	OCFS_ASSERT(IS_VALID_GROUP_DESC(bg));
+	OCFS_ASSERT(IS_VALID_GROUP_DESC(prev_bg));
+
+	/* remember the current links for the rollback at bail */
+	fe_ptr = fe->id2.i_chain.cl_recs[chain].c_blkno;
+	bg_ptr = bg->bg_next_group;
+	prev_bg_ptr = prev_bg->bg_next_group;
+
+	status = ocfs_journal_access(handle, prev_bg_bh, 
+				     OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	/* unlink bg from its current position */
+	prev_bg->bg_next_group = bg->bg_next_group;
+
+	status = ocfs_journal_dirty(handle, prev_bg_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = ocfs_journal_access(handle, bg_bh, OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	/* bg now points at the old head of the chain */
+	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
+
+	status = ocfs_journal_dirty(handle, bg_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = ocfs_journal_access(handle, fe_bh, OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	/* and becomes the new head */
+	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; 
+
+	status = ocfs_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if (status < 0) {
+		fe->id2.i_chain.cl_recs[chain].c_blkno = fe_ptr;
+		bg->bg_next_group = bg_ptr;
+		prev_bg->bg_next_group = prev_bg_ptr;
+	}
+
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+
+/* A group counts as reasonably empty when at least half (rounded
+ * down) of its bits are free. */
+static inline int ocfs_block_group_reasonably_empty(ocfs2_group_desc *bg)
+{
+	u16 half = bg->bg_bits / 2;
+
+	return bg->bg_free_bits_count >= half;
+}
+#endif
+/*
+ * Claim bits from the suballocator that was reserved in ac.
+ *
+ * will give out up to bits_wanted contiguous bits.
+ *
+ * handle:      running transaction; must be the one stored in ac
+ * ac:          context set up by one of the reserve functions
+ * bits_wanted: upper bound on the bits to claim
+ * bit_off:     out; first claimed bit, relative to its block group
+ * num_bits:    out; number of bits claimed, at least 1
+ * bg_blkno:    out; block number of the group the bits came from
+ *
+ * The chain with the most free bits is walked until a group with any
+ * free bit is found. The counts on the dinode, the chain record and
+ * the group are updated under the journal.
+ *
+ * Returns 0 on success, a negative status otherwise.
+ */
+static int ocfs_claim_suballoc_bits(ocfs_super *osb,
+				    ocfs_journal_handle *handle,
+				    ocfs2_alloc_context *ac,
+				    u32 bits_wanted,
+				    u16 *bit_off,
+				    unsigned int *num_bits,
+				    u64 *bg_blkno)
+{
+	int status, groups_read;
+	struct inode *alloc_inode = ac->ac_inode;
+	struct buffer_head *group_bh = NULL;
+	struct buffer_head *prev_group_bh = NULL;
+	ocfs2_chain_list *cl;
+	ocfs2_dinode *fe;
+	ocfs2_group_desc *bg;
+	u16 chain, tmp_bits;
+	u64 next_group;
+
+	LOG_ENTRY();
+
+	OCFS_ASSERT(ac->ac_bits_given < ac->ac_bits_wanted);
+	OCFS_ASSERT(ac->ac_handle == handle);
+	OCFS_ASSERT(bits_wanted <= (ac->ac_bits_wanted - ac->ac_bits_given));
+	OCFS_ASSERT(ac->ac_bh);
+
+	fe = (ocfs2_dinode *) ac->ac_bh->b_data;
+	OCFS_ASSERT(IS_VALID_FILE_ENTRY(fe));
+	OCFS_ASSERT(fe->id1.bitmap1.i_used < fe->id1.bitmap1.i_total);
+
+	cl = (ocfs2_chain_list *) &fe->id2.i_chain;
+
+	chain = ocfs2_find_victim_chain(cl);
+
+	printk("trying to alloc %u bits from chain %u, inode %llu\n",
+	       bits_wanted, chain, OCFS_I(alloc_inode)->ip_blkno);
+
+	status = ocfs_read_bh(osb, 
+			      cl->cl_recs[chain].c_blkno << osb->sb->s_blocksize_bits, 
+			      &group_bh, 
+			      OCFS_BH_CACHED, 
+			      alloc_inode);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	bg = (ocfs2_group_desc *) group_bh->b_data;
+	OCFS_ASSERT(IS_VALID_GROUP_DESC(bg));
+
+	/* for now, the chain search is a bit simplistic. We just use
+	 * the 1st group with any empty bits. */
+	groups_read = 1;
+	while (!bg->bg_free_bits_count) {
+		/* TODO: Self optimizing block groups. Here's how:
+		 * Keep track of previous block descriptor read. When
+		 * we find a target, if we have read more than X
+		 * number of descriptors, and the target is reasonably
+		 * empty, relink him to top of his chain.
+		 *
+		 * prev_bg->bg_next_group = bg->bg_next_group;
+		 * bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
+		 * fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; 
+		 *
+		 * We've read 0 extra blocks and only send one more to
+		 * the transaction, yet the next guy to search has a
+		 * much easier time.
+		 */
+		OCFS_ASSERT(bg->bg_next_group);
+
+		if (prev_group_bh) {
+			brelse(prev_group_bh);
+			prev_group_bh = NULL;
+		}
+		next_group = bg->bg_next_group;
+		/* keep the group we are leaving around, it is the
+		 * predecessor of the one read next */
+		prev_group_bh = group_bh;
+		group_bh = NULL;
+		status = ocfs_read_bh(osb, 
+				      next_group << osb->sb->s_blocksize_bits, 
+				      &group_bh, 
+				      OCFS_BH_CACHED, 
+				      alloc_inode);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+		groups_read++;
+		bg = (ocfs2_group_desc *) group_bh->b_data;
+		OCFS_ASSERT(IS_VALID_GROUP_DESC(bg));
+	}
+
+#ifdef OCFS2_OPTIMIZE_SUBALLOCATORS
+#define OCFS2_BG_RELINK_TRIGGER 3
+	/* groups_read above the trigger means the loop ran at least
+	 * once, so prev_group_bh is set here. */
+	if ((groups_read > OCFS2_BG_RELINK_TRIGGER) &&
+	    ocfs_block_group_reasonably_empty(bg)) {
+		status = ocfs_relink_block_group(handle, ac->ac_bh, group_bh, 
+						 prev_group_bh, chain);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+	}
+#endif
+
+	status = ocfs_block_group_find_clear_bits(osb, 
+						  bg, 
+						  bits_wanted, 
+						  bit_off,
+						  &tmp_bits);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	*num_bits = tmp_bits;
+
+	OCFS_ASSERT(*num_bits);
+
+	/* we found some. set the info on dinode, chainlist and then
+	 * the group */
+	status = ocfs_journal_access(handle, 
+				     ac->ac_bh, 
+				     OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	fe->id1.bitmap1.i_used += *num_bits;
+	cl->cl_recs[chain].c_free -= *num_bits;
+
+	status = ocfs_journal_dirty(handle, 
+				    ac->ac_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = ocfs_block_group_set_bits(handle, 
+					   bg, 
+					   group_bh, 
+					   *bit_off, 
+					   *num_bits);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	printk("Allocated %u bits from suballocator %llu\n", *num_bits, 
+	       fe->i_blkno);
+	debug_suballoc_inode(fe);
+
+	*bg_blkno = bg->bg_blkno;
+bail:
+	if (group_bh)
+		brelse(group_bh);
+	if (prev_group_bh)
+		brelse(prev_group_bh);
+
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+
+/*
+ * Claim up to bits_wanted contiguous bits from a metadata reservation.
+ *
+ * ac:                 context from ocfs_reserve_new_metadata()
+ * suballoc_bit_start: out; first claimed bit within its block group
+ * num_bits:           out; number of bits actually claimed
+ * blkno_start:        out; block number of the group plus
+ *                     *suballoc_bit_start
+ *
+ * Returns 0 on success, a negative status otherwise.
+ */
+int ocfs_claim_metadata(ocfs_super *osb,
+			ocfs_journal_handle *handle,
+			ocfs2_alloc_context *ac,
+			u32 bits_wanted,
+			u16 *suballoc_bit_start,
+			unsigned int *num_bits,
+			u64 *blkno_start)
+{
+	int status;
+	u64 group_blkno;
+
+	OCFS_ASSERT(ac);
+	OCFS_ASSERT(ac->ac_bits_wanted >= (ac->ac_bits_given + bits_wanted));
+	OCFS_ASSERT(ac->ac_which == OCFS_AC_USE_META);
+	OCFS_ASSERT(ac->ac_handle == handle);
+
+	status = ocfs_claim_suballoc_bits(osb, handle, ac, bits_wanted,
+					  suballoc_bit_start, num_bits,
+					  &group_blkno);
+	if (status >= 0) {
+		*blkno_start = group_blkno + (u64) *suballoc_bit_start;
+		ac->ac_bits_given += (*num_bits);
+		status = 0;
+	} else {
+		LOG_ERROR_STATUS(status);
+	}
+
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+
+/*
+ * Claim the single bit of an inode reservation.
+ *
+ * ac:           context from ocfs_reserve_new_inode(), nothing given
+ *               out of it yet
+ * suballoc_bit: out; claimed bit within its block group
+ * fe_blkno:     out; block number of the group plus *suballoc_bit
+ *
+ * Returns 0 on success, a negative status otherwise.
+ */
+int ocfs_claim_new_inode(ocfs_super *osb, 
+			 ocfs_journal_handle *handle,
+			 ocfs2_alloc_context *ac,
+			 u16 *suballoc_bit,
+			 u64 *fe_blkno)
+{
+	int status;
+	unsigned int claimed;
+	u64 group_blkno;
+
+	LOG_ENTRY();
+
+	OCFS_ASSERT(ac);
+	OCFS_ASSERT(ac->ac_bits_given == 0);
+	OCFS_ASSERT(ac->ac_bits_wanted == 1);
+	OCFS_ASSERT(ac->ac_which == OCFS_AC_USE_INODE);
+	OCFS_ASSERT(ac->ac_handle == handle);
+
+	status = ocfs_claim_suballoc_bits(osb, handle, ac, 1,
+					  suballoc_bit, &claimed,
+					  &group_blkno);
+	if (status >= 0) {
+		OCFS_ASSERT(claimed == 1);
+
+#warning "is this cast right?"
+		*fe_blkno = group_blkno + (u64) (*suballoc_bit);
+		ac->ac_bits_given++;
+		status = 0;
+	} else {
+		LOG_ERROR_STATUS(status);
+	}
+
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+
+#if 0
+/*
+ * Placeholder for giving bits back to a suballocator. Compiled out
+ * (#if 0): so far it only validates the dinode in alloc_bh and
+ * returns 0; nothing is freed.
+ */
+int ocfs_free_suballoc_bits(ocfs_super *osb, 
+			    ocfs_journal_handle *handle, 
+			    struct inode *alloc_inode,
+			    struct buffer_head *alloc_bh) 
+{
+	int status = 0;
+	ocfs2_dinode *fe = (ocfs2_dinode *) alloc_bh->b_data;
+
+	OCFS_ASSERT(IS_VALID_FILE_ENTRY(fe));
+
+	//need to figure out the rest of the api here.
+	//don't forget to update bit counts on fe, chain record and
+	//block group.
+
+	return status;
+}
+#endif
+/*
+ * Debugging aid: print the header fields of a block group descriptor
+ * with printk(). The bitmap and the reserved fields are not printed.
+ */
+static inline void debug_bg(ocfs2_group_desc *bg)
+{
+	printk("Block Group:\n");
+
+	/* identity and size */
+	printk("bg_signature:       %s\n", bg->bg_signature);
+	printk("bg_size:            %u\n", bg->bg_size);
+	printk("bg_bits:            %u\n", bg->bg_bits);
+	printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
+
+	/* position within the suballocator */
+	printk("bg_chain:           %u\n", bg->bg_chain);
+	printk("bg_generation:      %u\n", bg->bg_generation);
+	printk("bg_next_group:      %llu\n", bg->bg_next_group);
+	printk("bg_parent_dinode:   %llu\n", bg->bg_parent_dinode);
+	printk("bg_blkno:           %llu\n", bg->bg_blkno);
+}
+
+/*
+ * Debugging aid: print the allocator related fields of a suballocator
+ * dinode with printk(), followed by every chain record in use (those
+ * below cl_next_free_rec).
+ */
+static inline void debug_suballoc_inode(ocfs2_dinode *fe)
+{
+	ocfs2_chain_list *cl = &fe->id2.i_chain;
+	int i;
+
+	printk("Suballoc Inode %llu:\n", fe->i_blkno);
+	printk("i_signature:                  %s\n", fe->i_signature);
+	printk("i_size:                       %llu\n", fe->i_size);
+	printk("i_clusters:                   %u\n", fe->i_clusters);
+	printk("i_generation:                 %u\n", fe->i_generation);
+	printk("id1.bitmap1.i_used:           %u\n", fe->id1.bitmap1.i_used);
+	printk("id1.bitmap1.i_total:          %u\n", fe->id1.bitmap1.i_total);
+	printk("id2.i_chain.cl_cpg:           %u\n", cl->cl_cpg);
+	printk("id2.i_chain.cl_bpc:           %u\n", cl->cl_bpc);
+	printk("id2.i_chain.cl_count:         %u\n", cl->cl_count);
+	printk("id2.i_chain.cl_next_free_rec: %u\n", cl->cl_next_free_rec);
+
+	for (i = 0; i < cl->cl_next_free_rec; i++) {
+		ocfs2_chain_rec *rec = &cl->cl_recs[i];
+
+		printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i, rec->c_free);
+		printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i, rec->c_total);
+		printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i, rec->c_blkno);
+	}
+}

Added: branches/dlm-changes/src/suballoc.h
===================================================================
--- branches/dlm-changes/src/suballoc.h	2004-09-20 21:51:09 UTC (rev 1478)
+++ branches/dlm-changes/src/suballoc.h	2004-09-24 22:43:55 UTC (rev 1479)
@@ -0,0 +1,53 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * suballoc.h
+ *
+ * Defines sub allocator api
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Mark Fasheh, Kurt Hackel, Joel Becker, Sunil Mushran, 
+ *	    Manish Singh, Wim Coekaerts
+ */
+
+#ifndef _CHAINALLOC_H_
+#define _CHAINALLOC_H_
+
+/*
+ * Allocation is done in two steps: a reserve call builds an
+ * ocfs2_alloc_context before the transaction is started, a claim call
+ * then hands out bits from it inside the running transaction. All
+ * functions return 0 on success and a negative status on failure.
+ */
+
+/* Reserve metadata bits for extending the file described by fe. On
+ * failure *ac is set to NULL. */
+int ocfs_reserve_new_metadata(ocfs_super *osb, 
+			      ocfs_journal_handle *handle,
+			      struct inode *inode,
+			      ocfs2_dinode *fe,
+			      ocfs2_alloc_context **ac);
+/* Reserve one bit in the inode suballocator. On failure *ac is set to
+ * NULL. */
+int ocfs_reserve_new_inode(ocfs_super *osb, 
+			   ocfs_journal_handle *handle,
+			   ocfs2_alloc_context **ac);
+/* Claim the bit reserved by ocfs_reserve_new_inode(); returns its
+ * offset in the block group and the resulting block number. */
+int ocfs_claim_new_inode(ocfs_super *osb, 
+			 ocfs_journal_handle *handle,
+			 ocfs2_alloc_context *ac,
+			 u16 *suballoc_bit,
+			 u64 *fe_blkno);
+/* Claim up to bits_wanted contiguous bits from a metadata
+ * reservation; *num_bits tells how many were actually handed out. */
+int ocfs_claim_metadata(ocfs_super *osb,
+			ocfs_journal_handle *handle,
+			ocfs2_alloc_context *ac,
+			u32 bits_wanted,
+			u16 *suballoc_bit_start,
+			u32 *num_bits,
+			u64 *blkno_start);
+
+#endif /* _CHAINALLOC_H_ */



More information about the Ocfs2-commits mailing list