[Ocfs2-commits] mfasheh commits r1479 - branches/dlm-changes/src
svn-commits at oss.oracle.com
svn-commits at oss.oracle.com
Fri Sep 24 17:43:57 CDT 2004
Author: mfasheh
Date: 2004-09-24 17:43:55 -0500 (Fri, 24 Sep 2004)
New Revision: 1479
Added:
branches/dlm-changes/src/suballoc.c
branches/dlm-changes/src/suballoc.h
Modified:
branches/dlm-changes/src/ocfs2_fs.h
Log:
* Commit our new metadata suballocation scheme. This bears close
resemblance to a certain Linux FS...
Things are still under testing, delete hasn't been written yet, and
two features (the self optimizing code and the zeroing code) haven't
been turned on yet.
Modified: branches/dlm-changes/src/ocfs2_fs.h
===================================================================
--- branches/dlm-changes/src/ocfs2_fs.h 2004-09-20 21:51:09 UTC (rev 1478)
+++ branches/dlm-changes/src/ocfs2_fs.h 2004-09-24 22:43:55 UTC (rev 1479)
@@ -56,6 +56,7 @@
#define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2"
#define OCFS2_INODE_SIGNATURE "INODE01"
#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01"
+#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
/* Compatibility flags */
#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -96,7 +97,7 @@
#define OCFS2_BITMAP_FL (0x00000080) /* Allocation bitmap */
#define OCFS2_JOURNAL_FL (0x00000100) /* Node journal */
#define OCFS2_DLM_FL (0x00000200) /* DLM area */
-
+#define OCFS2_SUBALLOC_FL (0x00000400) /* Suballocator File */
/* Limit of space in ocfs2_dir_entry */
#define OCFS2_MAX_FILENAME_LENGTH 255
@@ -225,6 +226,12 @@
/*10*/
} ocfs2_extent_rec;
+typedef struct _ocfs2_chain_rec { /* On-disk record for one chain of block groups; stored in ocfs2_chain_list.cl_recs */
+ __u32 c_free; /* Number of free bits in this chain. */
+ __u32 c_total; /* Total bits in this chain; grows by bg_bits each time a group is added */
+ __u64 c_blkno; /* Physical disk offset (blocks) of 1st group */
+} ocfs2_chain_rec;
+
/*
* On disk extent list for OCFS2 (node in the tree). Note that this
* is contained inside ocfs2_dinode or ocfs2_extent_block, so the
@@ -244,17 +251,26 @@
/*10*/ ocfs2_extent_rec l_recs[0]; /* Extent records */
} ocfs2_extent_list;
+typedef struct _ocfs2_chain_list { /* Chain list header; lives in the id2 union of a suballocator dinode */
+ __u16 cl_cpg; /* Clusters per Block Group */
+ __u16 cl_bpc; /* Bits per cluster */
+ __u16 cl_count; /* Upper bound used when scanning cl_recs -- presumably the number of record slots; TODO confirm */
+ __u16 cl_next_free_rec; /* Number of records considered in use; bumped (up to cl_count) when a group is added */
+ __u64 cl_reserved1;
+ ocfs2_chain_rec cl_recs[0]; /* Chain records; occupy the remainder of the inode block */
+} ocfs2_chain_list;
+
/*
* On disk extent block (indirect block) for OCFS2
*/
typedef struct _ocfs2_extent_block
{
/*00*/ __u8 h_signature[8]; /* Signature for verification */
- __u64 h_suballoc_blkno; /* Node suballocator offset,
- in blocks */
+ __u64 h_reserved1;
/*10*/ __s16 h_suballoc_node; /* Node suballocator this
extent_header belongs to */
- __u16 h_reserved1;
+ __u16 h_suballoc_bit; /* Bit offset in suballocater
+ block group */
__u32 h_reserved2;
__u64 h_blkno; /* Offset on disk, in blocks */
/*20*/ __u64 h_parent_blk; /* Offset on disk, in blocks,
@@ -338,11 +354,11 @@
typedef struct _ocfs2_dinode {
/*00*/ __u8 i_signature[8]; /* Signature for validation */
__u32 i_generation; /* Generation number */
- __u16 i_reserved1;
__s16 i_suballoc_node; /* Node suballocater this inode
belongs to */
-/*10*/ __u64 i_suballoc_blkno; /* Node suballocator offset,
- in blocks */
+ __u16 i_suballoc_bit; /* Bit offset in suballocater
+ block group */
+/*10*/ __u64 i_reserved1;
/*18*/ ocfs2_disk_lock i_disk_lock; /* Lock structure */
/*48*/ __u32 i_uid; /* Owner UID */
__u32 i_gid; /* Owning GID */
@@ -376,7 +392,8 @@
} id1; /* Inode type dependant 1 */
/*C0*/ union {
ocfs2_super_block i_super;
- ocfs2_local_alloc i_lab;
+ ocfs2_local_alloc i_lab;
+ ocfs2_chain_list i_chain;
ocfs2_extent_list i_list;
} id2;
/* Actual on-disk size is one block */
@@ -394,8 +411,24 @@
/* Actual on-disk length specified by rec_len */
};
+/*
+ * On disk allocator group structure for OCFS2
+ *
+ * The descriptor is placed at the start of a block and bg_bitmap takes
+ * up whatever is left of that block (see ocfs2_group_bitmap_size()).
+ * Groups belonging to the same chain are singly linked through
+ * bg_next_group.
+ */
+typedef struct _ocfs2_group_desc
+{
+/*00*/ __u8 bg_signature[8]; /* Signature for validation */
+ __u16 bg_size; /* Size of included bitmap in bytes. */
+ __u16 bg_bits; /* Bits represented by this group. */
+ __u16 bg_free_bits_count; /* Free bits count */
+ __u16 bg_chain; /* What chain I am in. */
+ __u32 bg_generation; /* Filled from the owning allocator inode's i_generation */
+ __u64 bg_next_group; /* Next group in my list, in blocks */
+ __u64 bg_parent_dinode; /* dinode which owns me, in blocks */
+ __u64 bg_blkno; /* Offset on disk, in blocks */
+ __u64 bg_reserved2[2];
+ __u8 bg_bitmap[0]; /* Allocation bitmap; bit 0 is set at creation to cover the descriptor block */
+} ocfs2_group_desc;
-
#ifdef __KERNEL__
static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
{
@@ -407,6 +440,16 @@
return size / sizeof(struct _ocfs2_extent_rec);
}
+/*
+ * Number of ocfs2_chain_rec entries that fit in one inode block of
+ * the given super block, after the part of the dinode that precedes
+ * the cl_recs array.
+ */
+static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
+{
+	int avail = sb->s_blocksize -
+		offsetof(struct _ocfs2_dinode, id2.i_chain.cl_recs);
+
+	return avail / sizeof(struct _ocfs2_chain_rec);
+}
+
static inline int ocfs2_extent_recs_per_eb(struct super_block *sb)
{
int size;
@@ -426,6 +469,16 @@
return size;
}
+
+/*
+ * Size in bytes of the bitmap held in a group descriptor block: the
+ * block size minus everything that precedes bg_bitmap.
+ */
+static inline int ocfs2_group_bitmap_size(struct super_block *sb)
+{
+	int bitmap_bytes = sb->s_blocksize -
+		offsetof(struct _ocfs2_group_desc, bg_bitmap);
+
+	return bitmap_bytes;
+}
#else
static inline int ocfs2_extent_recs_per_inode(int blocksize)
{
@@ -437,6 +490,16 @@
return size / sizeof(struct _ocfs2_extent_rec);
}
+/*
+ * Userspace variant: number of ocfs2_chain_rec entries that fit in
+ * one inode block of 'blocksize' bytes, after the part of the dinode
+ * that precedes the cl_recs array.
+ */
+static inline int ocfs2_chain_recs_per_inode(int blocksize)
+{
+	int avail = blocksize -
+		offsetof(struct _ocfs2_dinode, id2.i_chain.cl_recs);
+
+	return avail / sizeof(struct _ocfs2_chain_rec);
+}
+
static inline int ocfs2_extent_recs_per_eb(int blocksize)
{
int size;
@@ -456,6 +519,16 @@
return size;
}
+
+/*
+ * Userspace variant: size in bytes of the bitmap held in a group
+ * descriptor block of 'blocksize' bytes.
+ */
+static inline int ocfs2_group_bitmap_size(int blocksize)
+{
+	int bitmap_bytes = blocksize -
+		offsetof(struct _ocfs2_group_desc, bg_bitmap);
+
+	return bitmap_bytes;
+}
#endif /* __KERNEL__ */
Added: branches/dlm-changes/src/suballoc.c
===================================================================
--- branches/dlm-changes/src/suballoc.c 2004-09-20 21:51:09 UTC (rev 1478)
+++ branches/dlm-changes/src/suballoc.c 2004-09-24 22:43:55 UTC (rev 1479)
@@ -0,0 +1,1008 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * suballoc.c
+ *
+ * metadata alloc and free
+ *
+ * Copyright (C) 2002, 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Mark Fasheh, Kurt Hackel, Joel Becker, Sunil Mushran,
+ * Wim Coekaerts, Manish Singh
+ */
+
+#include "ocfs_compat.h"
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#include "ocfs_log.h"
+#include "ocfs.h"
+
+#include "alloc.h"
+#include "dlm.h"
+#include "util.h"
+#include "suballoc.h"
+#include "sysfile.h"
+
+#include "ocfs_journal.h"
+#include "buffer_head_io.h"
+
+#define OCFS_DEBUG_CONTEXT OCFS_DEBUG_CONTEXT_CHAINALLOC
+
+static inline void debug_bg(ocfs2_group_desc *bg);
+static inline void debug_suballoc_inode(ocfs2_dinode *fe);
+static inline u16 ocfs2_find_victim_chain(ocfs2_chain_list *cl);
+static int ocfs2_block_group_fill(ocfs_journal_handle *handle,
+ struct inode *alloc_inode,
+ struct buffer_head *bg_bh,
+ u64 group_blkno,
+ u16 my_chain,
+ ocfs2_chain_list *cl);
+static int ocfs_block_group_alloc(ocfs_super *osb,
+ struct inode *alloc_inode,
+ struct buffer_head *bh);
+static int ocfs_reserve_suballoc_bits(ocfs_super *osb,
+ ocfs_journal_handle *handle,
+ ocfs2_alloc_context *ac);
+static int ocfs_claim_suballoc_bits(ocfs_super *osb,
+ ocfs_journal_handle *handle,
+ ocfs2_alloc_context *ac,
+ u32 bits_wanted,
+ u16 *bit_off,
+ unsigned int *num_bits,
+ u64 *bg_blkno);
+static int ocfs_block_group_find_clear_bits(ocfs_super *osb,
+ ocfs2_group_desc *bg,
+ unsigned int bits_wanted,
+ u16 *bit_off,
+ u16 *bits_found);
+static inline int ocfs_block_group_set_bits(ocfs_journal_handle *handle,
+ ocfs2_group_desc *bg,
+ struct buffer_head *group_bh,
+ unsigned int bit_off,
+ unsigned int num_bits);
+/*
+ * Format the buffer bg_bh as a brand new block group descriptor.
+ *
+ * handle:      journal handle the buffer is added to (CREATE access)
+ * alloc_inode: suballocator inode that will own the group
+ * bg_bh:       buffer for the descriptor; must be block group_blkno
+ * group_blkno: block number of the descriptor
+ * my_chain:    index into cl->cl_recs of the chain the group joins
+ * cl:          chain list of the owning allocator
+ *
+ * The whole block is zeroed, the descriptor fields are filled in, and
+ * bit 0 of the bitmap is set for the descriptor block itself.  The
+ * chain record in cl is only read here, not updated.
+ *
+ * Returns the status of the last journal call (0 on success).
+ */
+static int ocfs2_block_group_fill(ocfs_journal_handle *handle,
+ struct inode *alloc_inode,
+ struct buffer_head *bg_bh,
+ u64 group_blkno,
+ u16 my_chain,
+ ocfs2_chain_list *cl)
+{
+ int status = 0;
+ ocfs2_group_desc *bg = (ocfs2_group_desc *) bg_bh->b_data;
+ struct super_block * sb = alloc_inode->i_sb;
+
+ LOG_ENTRY();
+
+ OCFS_ASSERT(((unsigned long long) bg_bh->b_blocknr) == group_blkno);
+
+ set_buffer_uptodate(bg_bh); /* contents are about to be written from scratch, nothing is read from disk */
+ SET_BH_SEQNUM(alloc_inode, bg_bh);
+ status = ocfs_journal_access(handle,
+ bg_bh,
+ OCFS_JOURNAL_ACCESS_CREATE);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+ memset(bg, 0, sb->s_blocksize);
+ strcpy (bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); /* 7 characters plus NUL exactly fill bg_signature[8] */
+ bg->bg_generation = cpu_to_le32(alloc_inode->i_generation);
+ bg->bg_size = ocfs2_group_bitmap_size(sb);
+ bg->bg_bits = (u32) cl->cl_cpg * (u32) cl->cl_bpc; /* NOTE(review): 32-bit product stored in a __u16 field -- confirm it cannot exceed 65535 */
+ bg->bg_chain = my_chain;
+ bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; /* new group points at the chain's current first group */
+ bg->bg_parent_dinode = OCFS_I(alloc_inode)->ip_blkno;
+ bg->bg_blkno = group_blkno;
+ /* set the 1st bit in the bitmap to account for the descriptor block */
+ set_bit(0, bg->bg_bitmap);
+ bg->bg_free_bits_count = bg->bg_bits - 1;
+
+ status = ocfs_journal_dirty(handle, bg_bh);
+ if (status < 0)
+ LOG_ERROR_STATUS(status); /* only logged here; the failing status is still what gets returned */
+#warning "we need to zero out the other blocks in the group! (only inode alloc?)"
+ printk("filled new block group:\n");
+ debug_bg(bg);
+
+bail:
+ LOG_EXIT_STATUS(status);
+ return(status);
+}
+
+/*
+ * Return the index of the chain record with the lowest c_total among
+ * the first cl_count records.  Ties go to the lowest index, and 0 is
+ * returned when cl_count is 0.
+ */
+static inline u16 ocfs2_find_smallest_chain(ocfs2_chain_list *cl)
+{
+	u16 i;
+	u16 smallest = 0;
+
+	for (i = 0; i < cl->cl_count; i++) {
+		if (cl->cl_recs[i].c_total < cl->cl_recs[smallest].c_total)
+			smallest = i;
+	}
+
+	return smallest;
+}
+#ifdef OCFS_BG_ZERO
+/*
+ * Start zeroing every block of a new group except the first one (the
+ * descriptor block).
+ *
+ * bit_off:  cluster offset of the group
+ * clusters: size of the group in clusters
+ *
+ * Each block is zeroed in memory and submitted for write without
+ * waiting.  On success the array of submitted buffers is returned; the
+ * caller hands it to ocfs_block_group_zero_wait(), which waits for the
+ * I/O and frees everything.  On failure NULL is returned and nothing
+ * is left for the caller to clean up.
+ */
+static struct buffer_head **ocfs_block_group_zero_start(ocfs_super *osb,
+							 u32 bit_off,
+							 u16 clusters)
+{
+	struct buffer_head **bhs = NULL;
+	unsigned int blocks;
+	unsigned int i;
+	u64 blkno;
+
+	LOG_ENTRY();
+
+	/* one less than the group size: the descriptor block is skipped */
+	blocks = ocfs_clusters_to_blocks(osb->sb, (u32) clusters) - (u64) 1;
+	bhs = kmalloc(blocks * sizeof(struct buffer_head *), GFP_KERNEL);
+	if (!bhs) {
+		LOG_ERROR_STATUS(-ENOMEM);
+		goto bail;
+	}
+	memset(bhs, 0, blocks * sizeof(struct buffer_head *));
+
+	blkno = ocfs_clusters_to_blocks(osb->sb, bit_off) + (u64) 1;
+	for (i = 0; i < blocks; i++) {
+		bhs[i] = sb_getblk(osb->sb, ((u64) i + blkno));
+		if (!bhs[i]) {
+			/* Buffers 0..i-1 have already been submitted.
+			 * Wait for them and drop our references before
+			 * freeing the array, the same way
+			 * ocfs_block_group_zero_wait() does, so they
+			 * are not leaked while still in flight. */
+			while (i--) {
+				wait_on_buffer(bhs[i]);
+				brelse(bhs[i]);
+			}
+			kfree(bhs);
+			bhs = NULL;
+			LOG_ERROR_STATUS(-EIO);
+			goto bail;
+		}
+		lock_buffer(bhs[i]);
+		OCFS_ASSERT(!buffer_jbd(bhs[i]));
+
+		memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
+		set_buffer_uptodate(bhs[i]);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+		clear_buffer_dirty(bhs[i]);
+#else
+		mark_buffer_clean(bhs[i]);
+#endif
+
+		bhs[i]->b_end_io = ocfs_end_buffer_io_sync;
+		submit_bh(WRITE, bhs[i]);
+	}
+bail:
+	LOG_EXIT();
+	return(bhs);
+}
+
+/*
+ * Wait for the writes started by ocfs_block_group_zero_start() and
+ * release the buffers and the array holding them.
+ *
+ * bhs:      array returned by ocfs_block_group_zero_start()
+ * clusters: the same cluster count that was passed to the start call;
+ *           the number of buffers is recomputed from it
+ */
+static void ocfs_block_group_zero_wait(ocfs_super *osb,
+				       struct buffer_head **bhs,
+				       u16 clusters)
+{
+	unsigned int blocks =
+		ocfs_clusters_to_blocks(osb->sb, (u32) clusters) - (u64) 1;
+
+	/* Walk the array from the end.  Testing before decrementing
+	 * keeps a zero-length array from wrapping the counter. */
+	while (blocks--) {
+		wait_on_buffer(bhs[blocks]);
+		brelse(bhs[blocks]);
+	}
+
+	kfree(bhs);
+}
+
+
+#endif
+/*
+ * Add one block group to a suballocator.
+ *
+ * alloc_inode: the suballocator inode
+ * bh:          buffer holding that inode's on-disk dinode
+ *
+ * cl_cpg clusters are reserved and claimed through ocfs_reserve_bits()
+ * and ocfs_claim_bits(), the first block of that space is formatted as
+ * a group descriptor, and the group is put at the head of the chain
+ * with the smallest c_total.  The dinode's chain record, bitmap
+ * counters, i_clusters and i_size are updated, as are the in-memory
+ * inode sizes.  Everything runs in a transaction started here on a
+ * handle private to this function.
+ *
+ * We expect the block group allocator to already be locked.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int ocfs_block_group_alloc(ocfs_super *osb,
+				  struct inode *alloc_inode,
+				  struct buffer_head *bh)
+{
+	int status, credits;
+	ocfs2_dinode *fe = (ocfs2_dinode *) bh->b_data;
+	ocfs2_chain_list *cl = &fe->id2.i_chain;
+	ocfs2_alloc_context *ac = NULL;
+	ocfs_journal_handle *handle = NULL;
+	u32 bit_off, num_bits;
+	u16 alloc_rec;
+	u64 bg_blkno;
+	struct buffer_head *bg_bh = NULL;
+	ocfs2_group_desc *bg;
+#ifdef OCFS_BG_ZERO
+	struct buffer_head **zero_bhs = NULL;
+#endif
+
+	LOG_ENTRY();
+
+	handle = ocfs_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = ocfs_reserve_bits(osb,
+				   handle,
+				   cl->cl_cpg,
+				   &ac);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	debug_suballoc_inode(fe);
+
+	credits = ocfs_calc_group_alloc_credits(osb->sb, cl->cl_cpg);
+	printk("allocate new block group, requires %d credits\n", credits);
+	handle = ocfs_start_trans(osb, handle, credits);
+	if (!handle) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	ocfs_handle_set_always_commits(handle, 1);
+
+	status = ocfs_claim_bits(osb,
+				 handle,
+				 ac,
+				 cl->cl_cpg,
+				 &bit_off,
+				 &num_bits);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+#ifdef OCFS_BG_ZERO
+	/* ocfs_block_group_zero_start() takes three arguments and
+	 * returns the buffer array, or NULL on failure.  It logs the
+	 * precise cause itself, so only a generic error is reported
+	 * here. */
+	zero_bhs = ocfs_block_group_zero_start(osb, bit_off, cl->cl_cpg);
+	if (!zero_bhs) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+#endif
+	alloc_rec = ocfs2_find_smallest_chain(cl);
+
+	/* setup the group */
+	bg_blkno = ocfs_clusters_to_blocks(osb->sb, bit_off);
+	printk("new descriptor, record %u, at block %llu\n",
+	       alloc_rec, bg_blkno);
+
+	bg_bh = sb_getblk(osb->sb, bg_blkno);
+	if (!bg_bh) {
+		status = -EIO;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = ocfs2_block_group_fill(handle,
+					alloc_inode,
+					bg_bh,
+					bg_blkno,
+					alloc_rec,
+					cl);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	bg = (ocfs2_group_desc *) bg_bh->b_data;
+
+	status = ocfs_journal_access(handle, bh, OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	/* The new group becomes the head of its chain; the fill step
+	 * above already pointed it at the previous head. */
+	cl->cl_recs[alloc_rec].c_free += bg->bg_free_bits_count;
+	cl->cl_recs[alloc_rec].c_total += bg->bg_bits;
+	cl->cl_recs[alloc_rec].c_blkno = bg_blkno;
+	if (cl->cl_next_free_rec < cl->cl_count)
+		cl->cl_next_free_rec++;
+
+	fe->id1.bitmap1.i_used += (bg->bg_bits - bg->bg_free_bits_count);
+	fe->id1.bitmap1.i_total += bg->bg_bits;
+	fe->i_clusters += cl->cl_cpg;
+
+	status = ocfs_journal_dirty(handle, bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	down (&(OCFS_I(alloc_inode)->ip_sem));
+	OCFS_I(alloc_inode)->ip_alloc_size =
+		(u64)fe->i_clusters << osb->s_clustersize_bits;
+	fe->i_size = OCFS_I(alloc_inode)->ip_alloc_size;
+	up (&(OCFS_I(alloc_inode)->ip_sem));
+	alloc_inode->i_size = fe->i_size;
+	alloc_inode->i_blocks = (alloc_inode->i_size + osb->sb->s_blocksize - 1) >> osb->sb->s_blocksize_bits;
+
+	printk("allocation succes, new block group allocator:\n");
+	debug_suballoc_inode(fe);
+	status = 0;
+bail:
+#ifdef OCFS_BG_ZERO
+	if (zero_bhs)
+		ocfs_block_group_zero_wait(osb, zero_bhs, cl->cl_cpg);
+#endif
+	/* NOTE(review): this also runs when the handle was allocated
+	 * but the transaction was never started -- confirm that
+	 * ocfs_commit_trans() copes with that. */
+	if (handle)
+		ocfs_commit_trans(handle);
+
+	if (ac)
+		ocfs_free_alloc_context(ac);
+
+	if (bg_bh)
+		brelse(bg_bh);
+
+	LOG_EXIT_STATUS(status);
+	return status;
+}
+/*
+ * Lock a suballocator and make sure it can satisfy a reservation.
+ *
+ * ac->ac_inode is the suballocator inode and ac->ac_bits_wanted the
+ * number of bits the caller intends to claim.  The inode is locked in
+ * exclusive mode, and the lock and inode are attached to handle, which
+ * must not have been started yet.  If fewer than ac_bits_wanted bits
+ * are free, one more block group is allocated.
+ *
+ * On success ac->ac_bh holds its own reference to the allocator's
+ * dinode buffer and 0 is returned; otherwise a negative error code.
+ */
+static int ocfs_reserve_suballoc_bits(ocfs_super *osb,
+ ocfs_journal_handle *handle,
+ ocfs2_alloc_context *ac)
+{
+ int status;
+ u32 bits_wanted = ac->ac_bits_wanted;
+ struct inode *alloc_inode = ac->ac_inode;
+ struct buffer_head *bh = NULL;
+ ocfs2_dinode *fe;
+
+ LOG_ENTRY();
+
+ OCFS_ASSERT(!(handle->flags & OCFS_HANDLE_STARTED));
+
+ status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE,
+ 0, &bh, alloc_inode);
+ if (status < 0) {
+ if (status != -EINTR) /* -EINTR is not logged as an error */
+ LOG_ERROR_STATUS (status);
+ goto bail;
+ }
+ ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE,
+ 0, alloc_inode);
+ ocfs_handle_add_inode(handle, alloc_inode);
+
+ fe = (ocfs2_dinode *) bh->b_data;
+ OCFS_ASSERT(IS_VALID_FILE_ENTRY(fe));
+ OCFS_ASSERT(fe->i_flags & OCFS2_SUBALLOC_FL);
+
+ if (bits_wanted > (le32_to_cpu(fe->id1.bitmap1.i_total) -
+ le32_to_cpu(fe->id1.bitmap1.i_used))) {
+ status = ocfs_block_group_alloc(osb, alloc_inode, bh); /* not enough free bits: add a single new group */
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+ /* You should never ask for this much metadata */
+ OCFS_ASSERT(bits_wanted <=
+ (le32_to_cpu(fe->id1.bitmap1.i_total)
+ - le32_to_cpu(fe->id1.bitmap1.i_used)));
+ }
+
+ get_bh(bh); /* extra reference kept by ac->ac_bh; the brelse below drops only the local one */
+ ac->ac_bh = bh;
+ status = 0;
+bail:
+ if (bh)
+ brelse(bh);
+
+ LOG_EXIT_STATUS(status);
+ return(status);
+}
+
+/*
+ * Reserve space for the metadata blocks needed to extend a file.
+ *
+ * inode: the file being extended (not used by this function)
+ * fe:    its on-disk dinode, used to size the reservation
+ * ac:    out parameter; on success receives a new allocation context
+ *        set up for OCFS_AC_USE_META, to be passed to
+ *        ocfs_claim_metadata().  Set to NULL on failure.
+ *
+ * The reservation is made against the extent allocator system file
+ * and is twice what ocfs2_extend_meta_needed() reports.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int ocfs_reserve_new_metadata(ocfs_super *osb,
+			      ocfs_journal_handle *handle,
+			      struct inode *inode,
+			      ocfs2_dinode *fe,
+			      ocfs2_alloc_context **ac)
+{
+	int status;
+	struct inode *alloc_inode = NULL;
+
+	*ac = kmalloc(sizeof(ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	memset(*ac, 0, sizeof(ocfs2_alloc_context));
+	/* Our file data alloc path is such a mess that I really feel
+	 * comfortable just always over-reserving here. */
+	(*ac)->ac_bits_wanted = 2 * ocfs2_extend_meta_needed(fe);
+	(*ac)->ac_handle = handle;
+	(*ac)->ac_which = OCFS_AC_USE_META;
+
+#ifndef OCFS_USE_ALL_METADATA_SUBALLOCATORS
+	alloc_inode = ocfs_get_system_file_inode(osb, EXTENT_ALLOC_BITMAP_SYSTEM_INODE, 0);
+#else
+	alloc_inode = ocfs_get_system_file_inode(osb, EXTENT_ALLOC_BITMAP_SYSTEM_INODE, osb->node_num);
+#endif
+	/* The lookup failed when it returns NULL.  The test used to be
+	 * inverted, which failed every successful lookup and let a
+	 * NULL inode through to igrab(). */
+	if (!alloc_inode) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	(*ac)->ac_inode = igrab(alloc_inode);
+
+	status = ocfs_reserve_suballoc_bits(osb, handle, (*ac));
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	/* Drop the reference obtained by the lookup above, as
+	 * ocfs_reserve_new_inode() does; the context keeps the one
+	 * taken with igrab().  The caller's 'inode' must not be put
+	 * here: this function never took a reference on it. */
+	if (alloc_inode)
+		iput(alloc_inode);
+
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+/*
+ * Reserve space for one new inode.
+ *
+ * ac: out parameter; on success receives a new allocation context set
+ *     up for OCFS_AC_USE_INODE with a one-bit reservation, to be
+ *     passed to ocfs_claim_new_inode().  Set to NULL on failure.
+ *
+ * The reservation is made against this node's inode allocator system
+ * file (looked up with osb->node_num).
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int ocfs_reserve_new_inode(ocfs_super *osb,
+ ocfs_journal_handle *handle,
+ ocfs2_alloc_context **ac)
+{
+ int status;
+ struct inode *alloc_inode = NULL;
+
+ *ac = kmalloc(sizeof(ocfs2_alloc_context), GFP_KERNEL);
+ if (!(*ac)) {
+ status = -ENOMEM;
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+ memset(*ac, 0, sizeof(ocfs2_alloc_context));
+ (*ac)->ac_bits_wanted = 1;
+ (*ac)->ac_handle = handle;
+ (*ac)->ac_which = OCFS_AC_USE_INODE;
+
+ alloc_inode = ocfs_get_system_file_inode(osb, INODE_ALLOC_BITMAP_SYSTEM_INODE, osb->node_num);
+ if (!alloc_inode) {
+ status = -ENOMEM;
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+ (*ac)->ac_inode = igrab(alloc_inode); /* the context gets its own reference; the lookup's one is put at bail */
+
+ status = ocfs_reserve_suballoc_bits(osb, handle, *ac);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+ status = 0;
+bail:
+ if ((status < 0) && *ac) {
+ ocfs_free_alloc_context(*ac);
+ *ac = NULL;
+ }
+
+ if (alloc_inode)
+ iput(alloc_inode);
+
+ LOG_EXIT_STATUS(status);
+ return(status);
+}
+
+/*
+ * Look for a run of clear bits in a block group's bitmap.
+ *
+ * bits_wanted: preferred length of the run
+ * bit_off:     out, first bit of the run that was chosen
+ * bits_found:  out, length of that run (never more than bits_wanted)
+ *
+ * The scan stops at the first run that reaches bits_wanted.  If there
+ * is none, the longest run seen is handed back instead (the first one
+ * of that length).  Nothing is modified in the bitmap.
+ *
+ * Returns 0 when at least one clear bit was found, -ENOSPC otherwise.
+ */
+static int ocfs_block_group_find_clear_bits(ocfs_super *osb,
+					    ocfs2_group_desc *bg,
+					    unsigned int bits_wanted,
+					    u16 *bit_off,
+					    u16 *bits_found)
+{
+	void *bitmap;
+	u16 best_offset, best_size;
+	int offset, start, found, status = 0;
+
+	OCFS_ASSERT(IS_VALID_GROUP_DESC(bg));
+
+	found = start = best_offset = best_size = 0;
+	bitmap = bg->bg_bitmap;
+
+	/* 'start' is the next bit to examine and 'found' the length of
+	 * the run of clear bits ending just before it. */
+	while (start < bg->bg_bits) {
+		offset = find_next_zero_bit(bitmap, bg->bg_bits, start);
+		/* no clear bit left is reported as a value >= the size */
+		if (offset >= bg->bg_bits)
+			break;
+
+		if (offset == start) {
+			/* we found a zero */
+			found++;
+		} else {
+			/* got a zero after some ones */
+			found = 1;
+		}
+		start = offset + 1;
+
+		/* Record the run on every step, so that a run which
+		 * begins after set bits counts even while it is only
+		 * one bit long. */
+		if (found > best_size) {
+			best_size = found;
+			best_offset = start - found;
+		}
+
+		/* we got everything we needed */
+		if (found == bits_wanted)
+			break;
+	}
+
+	/* Whether anything was found is told by best_size: an offset
+	 * of 0 is a legitimate position for a run. */
+	if (best_size) {
+		*bit_off = best_offset;
+		*bits_found = best_size;
+	} else {
+		status = -ENOSPC;
+		LOG_ERROR_STATUS(status);
+	}
+
+	return(status);
+}
+
+/*
+ * Mark num_bits bits, starting at bit_off, as allocated in a block
+ * group and lower its free bit count accordingly.  group_bh is the
+ * buffer holding bg; it is journalled for write under handle.
+ *
+ * Returns 0 on success or the negative status of the failing journal
+ * call.
+ */
+static inline int ocfs_block_group_set_bits(ocfs_journal_handle *handle,
+					    ocfs2_group_desc *bg,
+					    struct buffer_head *group_bh,
+					    unsigned int bit_off,
+					    unsigned int num_bits)
+{
+	int status;
+	unsigned int i;
+	void *bitmap = bg->bg_bitmap;
+
+	LOG_ENTRY();
+
+	OCFS_ASSERT(IS_VALID_GROUP_DESC(bg));
+	OCFS_ASSERT(bg->bg_free_bits_count >= num_bits);
+
+	printk("block_group_set_bits: off = %u, num = %u\n", bit_off,
+	       num_bits);
+	debug_bg(bg);
+
+	status = ocfs_journal_access(handle, group_bh,
+				     OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	bg->bg_free_bits_count -= num_bits;
+
+	for (i = 0; i < num_bits; i++)
+		set_bit(bit_off + i, bitmap);
+
+	status = ocfs_journal_dirty(handle, group_bh);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+
+bail:
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+
+/*
+ * Pick the chain to allocate from: the record with the largest c_free
+ * among the first cl_next_free_rec records.  Ties go to the lowest
+ * index.  At least one record must be in use.
+ */
+static inline u16 ocfs2_find_victim_chain(ocfs2_chain_list *cl)
+{
+	u16 i;
+	u16 victim = 0;
+
+	OCFS_ASSERT(cl->cl_next_free_rec);
+
+	for (i = 0; i < cl->cl_next_free_rec; i++) {
+		if (cl->cl_recs[i].c_free > cl->cl_recs[victim].c_free)
+			victim = i;
+	}
+
+	OCFS_ASSERT(victim < cl->cl_next_free_rec);
+	return victim;
+}
+
+#ifdef OCFS2_OPTIMIZE_SUBALLOCATORS
+static int ocfs_relink_block_group(ocfs_journal_handle *handle,
+ struct buffer_head *fe_bh,
+ struct buffer_head *bg_bh,
+ struct buffer_head *prev_bg_bh,
+ u16 chain)
+{
+ int status;
+ /* Move the group in bg_bh to the front of chain number 'chain'.
+ * fe_bh holds the allocator dinode and prev_bg_bh the group that
+ * currently points at bg.  Three pointers change: prev_bg skips
+ * over bg, bg points at the old head of the chain, and the chain
+ * record points at bg.  Returns 0 or a negative journal status.
+ *
+ * there is a really tiny chance the journal calls could fail,
+ * but we wouldn't want inconsistent blocks in *any* case.
+ * The three original pointers are therefore saved here and put
+ * back at bail if anything failed. */
+ u64 fe_ptr, bg_ptr, prev_bg_ptr;
+ ocfs2_dinode *fe = (ocfs2_dinode *) fe_bh->b_data;
+ ocfs2_group_desc *bg = (ocfs2_group_desc *) bg_bh->b_data;
+ ocfs2_group_desc *prev_bg = (ocfs2_group_desc *) prev_bg_bh->b_data;
+
+ OCFS_ASSERT(IS_VALID_FILE_ENTRY(fe));
+ OCFS_ASSERT(IS_VALID_GROUP_DESC(bg));
+ OCFS_ASSERT(IS_VALID_GROUP_DESC(prev_bg));
+
+ fe_ptr = fe->id2.i_chain.cl_recs[chain].c_blkno;
+ bg_ptr = bg->bg_next_group;
+ prev_bg_ptr = prev_bg->bg_next_group;
+
+ status = ocfs_journal_access(handle, prev_bg_bh,
+ OCFS_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+ prev_bg->bg_next_group = bg->bg_next_group; /* unlink bg from its current position */
+
+ status = ocfs_journal_dirty(handle, prev_bg_bh);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+ status = ocfs_journal_access(handle, bg_bh, OCFS_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+ bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; /* bg now precedes the old head */
+
+ status = ocfs_journal_dirty(handle, bg_bh);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+ status = ocfs_journal_access(handle, fe_bh, OCFS_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+ fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; /* chain record now starts at bg */
+
+ status = ocfs_journal_dirty(handle, fe_bh);
+ if (status < 0) {
+ LOG_ERROR_STATUS(status);
+ goto bail;
+ }
+
+ status = 0;
+bail:
+ if (status < 0) {
+ fe->id2.i_chain.cl_recs[chain].c_blkno = fe_ptr;
+ bg->bg_next_group = bg_ptr;
+ prev_bg->bg_next_group = prev_bg_ptr;
+ }
+
+ LOG_EXIT_STATUS(status);
+ return(status);
+}
+
+/* Non-zero when at least half (rounded down) of the group's bits are free. */
+static inline int ocfs_block_group_reasonably_empty(ocfs2_group_desc *bg)
+{
+	int half = bg->bg_bits / 2;
+
+	return bg->bg_free_bits_count >= half;
+}
+#endif
+/*
+ * Claim bits from a suballocator that was reserved beforehand.
+ *
+ * will give out up to bits_wanted contiguous bits.
+ *
+ * ac:          context from a reserve call; ac->ac_bh is the allocator
+ *              dinode buffer and ac->ac_handle must equal handle
+ * bits_wanted: upper bound on the bits to hand out
+ * bit_off:     out, offset of the first claimed bit within its group
+ * num_bits:    out, number of bits actually claimed
+ * bg_blkno:    out, block number of the group the bits came from
+ *
+ * The chain with the most free bits is chosen and walked until a
+ * group with any free bit is found.  The dinode counters, the chain
+ * record and the group bitmap are all updated under handle.
+ * ac->ac_bits_given is left for the caller to update.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int ocfs_claim_suballoc_bits(ocfs_super *osb,
+				    ocfs_journal_handle *handle,
+				    ocfs2_alloc_context *ac,
+				    u32 bits_wanted,
+				    u16 *bit_off,
+				    unsigned int *num_bits,
+				    u64 *bg_blkno)
+{
+	int status, groups_read;
+	struct inode *alloc_inode = ac->ac_inode;
+	struct buffer_head *group_bh = NULL;
+	struct buffer_head *prev_group_bh = NULL;
+	ocfs2_chain_list *cl;
+	ocfs2_dinode *fe;
+	ocfs2_group_desc *bg;
+	u16 chain, tmp_bits;
+	u64 next_group;
+
+	LOG_ENTRY();
+
+	OCFS_ASSERT(ac->ac_bits_given < ac->ac_bits_wanted);
+	OCFS_ASSERT(ac->ac_handle == handle);
+	OCFS_ASSERT(bits_wanted <= (ac->ac_bits_wanted - ac->ac_bits_given));
+	OCFS_ASSERT(ac->ac_bh);
+
+	fe = (ocfs2_dinode *) ac->ac_bh->b_data;
+	OCFS_ASSERT(IS_VALID_FILE_ENTRY(fe));
+	OCFS_ASSERT(fe->id1.bitmap1.i_used < fe->id1.bitmap1.i_total);
+
+	cl = (ocfs2_chain_list *) &fe->id2.i_chain;
+
+	chain = ocfs2_find_victim_chain(cl);
+
+	printk("trying to alloc %u bits from chain %u, inode %llu\n",
+	       bits_wanted, chain, OCFS_I(alloc_inode)->ip_blkno);
+
+	status = ocfs_read_bh(osb,
+			      cl->cl_recs[chain].c_blkno << osb->sb->s_blocksize_bits,
+			      &group_bh,
+			      OCFS_BH_CACHED,
+			      alloc_inode);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	bg = (ocfs2_group_desc *) group_bh->b_data;
+	OCFS_ASSERT(IS_VALID_GROUP_DESC(bg));
+
+	/* for now, the chain search is a bit simplistic. We just use
+	 * the 1st group with any empty bits. */
+	groups_read = 1;
+	while (!bg->bg_free_bits_count) {
+		/* TODO: Self optimizing block groups. Here's how:
+		 * Keep track of previous block descriptor read. When
+		 * we find a target, if we have read more than X
+		 * number of descriptors, and the target is reasonably
+		 * empty, relink him to top of his chain.
+		 *
+		 * prev_bg->bg_next_group = bg->bg_next_group;
+		 * bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
+		 * fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
+		 *
+		 * We've read 0 extra blocks and only send one more to
+		 * the transaction, yet the next guy to search has a
+		 * much easier time.
+		 */
+		OCFS_ASSERT(bg->bg_next_group);
+
+		if (prev_group_bh) {
+			brelse(prev_group_bh);
+			prev_group_bh = NULL;
+		}
+		next_group = bg->bg_next_group;
+		/* the group just examined is kept as "previous" rather
+		 * than released, so it is available for relinking */
+		prev_group_bh = group_bh;
+		group_bh = NULL;
+		status = ocfs_read_bh(osb,
+				      next_group << osb->sb->s_blocksize_bits,
+				      &group_bh,
+				      OCFS_BH_CACHED,
+				      alloc_inode);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+		groups_read++;
+		bg = (ocfs2_group_desc *) group_bh->b_data;
+		OCFS_ASSERT(IS_VALID_GROUP_DESC(bg));
+	}
+
+#ifdef OCFS2_OPTIMIZE_SUBALLOCATORS
+#define OCFS2_BG_RELINK_TRIGGER 3
+	/* Relink only after a long walk, and only a group that is
+	 * reasonably empty.  Reading more than the trigger number of
+	 * groups implies the loop above ran, so prev_group_bh is set. */
+	if ((groups_read > OCFS2_BG_RELINK_TRIGGER) &&
+	    ocfs_block_group_reasonably_empty(bg)) {
+		status = ocfs_relink_block_group(handle, ac->ac_bh, group_bh,
+						 prev_group_bh, chain);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+	}
+#endif
+
+	status = ocfs_block_group_find_clear_bits(osb,
+						  bg,
+						  bits_wanted,
+						  bit_off,
+						  &tmp_bits);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	*num_bits = tmp_bits;
+
+	OCFS_ASSERT(*num_bits);
+
+	/* we found some. set the info on dinode, chainlist and then
+	 * the group */
+	status = ocfs_journal_access(handle,
+				     ac->ac_bh,
+				     OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	fe->id1.bitmap1.i_used += *num_bits;
+	cl->cl_recs[chain].c_free -= *num_bits;
+
+	status = ocfs_journal_dirty(handle,
+				    ac->ac_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = ocfs_block_group_set_bits(handle,
+					   bg,
+					   group_bh,
+					   *bit_off,
+					   *num_bits);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	printk("Allocated %u bits from suballocator %llu\n", *num_bits,
+	       fe->i_blkno);
+	debug_suballoc_inode(fe);
+
+	*bg_blkno = bg->bg_blkno;
+bail:
+	if (group_bh)
+		brelse(group_bh);
+	if (prev_group_bh)
+		brelse(prev_group_bh);
+
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+
+/*
+ * Claim up to bits_wanted metadata blocks against a reservation made
+ * with OCFS_AC_USE_META.
+ *
+ * suballoc_bit_start: out, bit offset of the first block in its group
+ * num_bits:           out, number of blocks actually claimed
+ * blkno_start:        out, group block number plus the bit offset
+ *
+ * ac->ac_bits_given is advanced by the number of bits claimed.
+ * Returns 0 on success or a negative error code.
+ */
+int ocfs_claim_metadata(ocfs_super *osb,
+			ocfs_journal_handle *handle,
+			ocfs2_alloc_context *ac,
+			u32 bits_wanted,
+			u16 *suballoc_bit_start,
+			unsigned int *num_bits,
+			u64 *blkno_start)
+{
+	int status;
+	u64 group_blkno;
+
+	OCFS_ASSERT(ac);
+	OCFS_ASSERT(ac->ac_bits_wanted >= (ac->ac_bits_given + bits_wanted));
+	OCFS_ASSERT(ac->ac_which == OCFS_AC_USE_META);
+	OCFS_ASSERT(ac->ac_handle == handle);
+
+	status = ocfs_claim_suballoc_bits(osb, handle, ac, bits_wanted,
+					  suballoc_bit_start, num_bits,
+					  &group_blkno);
+	if (status >= 0) {
+		*blkno_start = group_blkno + (u64) *suballoc_bit_start;
+		ac->ac_bits_given += (*num_bits);
+		status = 0;
+	} else {
+		LOG_ERROR_STATUS(status);
+	}
+
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+
+/*
+ * Claim the single inode block reserved with OCFS_AC_USE_INODE.
+ *
+ * suballoc_bit: out, bit offset of the inode within its group
+ * fe_blkno:     out, group block number plus that bit offset
+ *
+ * Nothing may have been claimed from ac before.  ac->ac_bits_given is
+ * incremented on success.  Returns 0 or a negative error code.
+ */
+int ocfs_claim_new_inode(ocfs_super *osb,
+			 ocfs_journal_handle *handle,
+			 ocfs2_alloc_context *ac,
+			 u16 *suballoc_bit,
+			 u64 *fe_blkno)
+{
+	int status;
+	unsigned int claimed;
+	u64 group_blkno;
+
+	LOG_ENTRY();
+
+	OCFS_ASSERT(ac);
+	OCFS_ASSERT(ac->ac_bits_given == 0);
+	OCFS_ASSERT(ac->ac_bits_wanted == 1);
+	OCFS_ASSERT(ac->ac_which == OCFS_AC_USE_INODE);
+	OCFS_ASSERT(ac->ac_handle == handle);
+
+	status = ocfs_claim_suballoc_bits(osb, handle, ac, 1,
+					  suballoc_bit, &claimed,
+					  &group_blkno);
+	if (status >= 0) {
+		OCFS_ASSERT(claimed == 1);
+
+#warning "is this cast right?"
+		*fe_blkno = group_blkno + (u64) (*suballoc_bit);
+		ac->ac_bits_given++;
+		status = 0;
+	} else {
+		LOG_ERROR_STATUS(status);
+	}
+
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+
+#if 0
+int ocfs_free_suballoc_bits(ocfs_super *osb,
+ ocfs_journal_handle *handle,
+ struct inode *alloc_inode,
+ struct buffer_head *alloc_bh)
+{
+ int status = 0; /* never changed below, so the stub always returns 0 */
+ ocfs2_dinode *fe = (ocfs2_dinode *) alloc_bh->b_data; /* alloc_bh holds the allocator's dinode */
+
+ OCFS_ASSERT(IS_VALID_FILE_ENTRY(fe));
+
+ // Unfinished stub (compiled out by the surrounding #if 0): the rest of the free API is still to be designed.
+ // When it is written, it must update the bit counts on fe, on the chain record and
+ // on the block group.
+
+ return status;
+}
+#endif
+/*
+ * Dump the header fields of a block group descriptor with printk.
+ * Debugging aid only; nothing is modified.
+ */
+static inline void debug_bg(ocfs2_group_desc *bg)
+{
+	printk("Block Group:\n");
+	/* The signature comes straight from disk and fills a fixed
+	 * array, so bound the print rather than trusting it to be
+	 * NUL terminated. */
+	printk("bg_signature: %.*s\n", (int) sizeof(bg->bg_signature),
+	       bg->bg_signature);
+	printk("bg_size: %u\n", bg->bg_size);
+	printk("bg_bits: %u\n", bg->bg_bits);
+	printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
+	printk("bg_chain: %u\n", bg->bg_chain);
+	printk("bg_generation: %u\n", bg->bg_generation);
+	/* 64-bit fields are cast so the argument matches %llu on
+	 * every architecture */
+	printk("bg_next_group: %llu\n",
+	       (unsigned long long) bg->bg_next_group);
+	printk("bg_parent_dinode: %llu\n",
+	       (unsigned long long) bg->bg_parent_dinode);
+	printk("bg_blkno: %llu\n", (unsigned long long) bg->bg_blkno);
+}
+
+/*
+ * Dump a suballocator dinode with printk: its size and bitmap
+ * counters, the chain list header, and every chain record below
+ * cl_next_free_rec.  Debugging aid only; nothing is modified.
+ */
+static inline void debug_suballoc_inode(ocfs2_dinode *fe)
+{
+	int i;
+	ocfs2_chain_list *cl = &fe->id2.i_chain;
+
+	/* values printed with %llu are cast so the argument matches the
+	 * format on every architecture */
+	printk("Suballoc Inode %llu:\n", (unsigned long long) fe->i_blkno);
+	/* The signature comes straight from disk and fills a fixed
+	 * array, so bound the print rather than trusting it to be
+	 * NUL terminated. */
+	printk("i_signature: %.*s\n", (int) sizeof(fe->i_signature),
+	       fe->i_signature);
+	printk("i_size: %llu\n", (unsigned long long) fe->i_size);
+	printk("i_clusters: %u\n", fe->i_clusters);
+	printk("i_generation: %u\n", fe->i_generation);
+	printk("id1.bitmap1.i_used: %u\n", fe->id1.bitmap1.i_used);
+	printk("id1.bitmap1.i_total: %u\n", fe->id1.bitmap1.i_total);
+	printk("id2.i_chain.cl_cpg: %u\n", cl->cl_cpg);
+	printk("id2.i_chain.cl_bpc: %u\n", cl->cl_bpc);
+	printk("id2.i_chain.cl_count: %u\n", cl->cl_count);
+	printk("id2.i_chain.cl_next_free_rec: %u\n",
+	       cl->cl_next_free_rec);
+	for (i = 0; i < cl->cl_next_free_rec; i++) {
+		printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i,
+		       cl->cl_recs[i].c_free);
+		printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
+		       cl->cl_recs[i].c_total);
+		printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
+		       (unsigned long long) cl->cl_recs[i].c_blkno);
+	}
+}
Added: branches/dlm-changes/src/suballoc.h
===================================================================
--- branches/dlm-changes/src/suballoc.h 2004-09-20 21:51:09 UTC (rev 1478)
+++ branches/dlm-changes/src/suballoc.h 2004-09-24 22:43:55 UTC (rev 1479)
@@ -0,0 +1,53 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * suballoc.h
+ *
+ * Defines sub allocator api
+ *
+ * Copyright (C) 2003, 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Mark Fasheh, Kurt Hackel, Joel Becker, Sunil Mushran,
+ * Manish Singh, Wim Coekaerts
+ */
+
+#ifndef _CHAINALLOC_H_
+#define _CHAINALLOC_H_
+
+int ocfs_reserve_new_metadata(ocfs_super *osb,
+ ocfs_journal_handle *handle,
+ struct inode *inode,
+ ocfs2_dinode *fe,
+ ocfs2_alloc_context **ac);
+int ocfs_reserve_new_inode(ocfs_super *osb,
+ ocfs_journal_handle *handle,
+ ocfs2_alloc_context **ac);
+int ocfs_claim_new_inode(ocfs_super *osb,
+ ocfs_journal_handle *handle,
+ ocfs2_alloc_context *ac,
+ u16 *suballoc_bit,
+ u64 *fe_blkno);
+int ocfs_claim_metadata(ocfs_super *osb,
+ ocfs_journal_handle *handle,
+ ocfs2_alloc_context *ac,
+ u32 bits_wanted,
+ u16 *suballoc_bit_start,
+ u32 *num_bits,
+ u64 *blkno_start);
+
+#endif /* _CHAINALLOC_H_ */
More information about the Ocfs2-commits
mailing list