[Ocfs2-devel] [PATCH] split inode.c

Christoph Hellwig hch at lst.de
Sat Aug 14 01:27:16 CDT 2004


two new files:

 - aops.c implementing ocfs_aops
 - 24io.c implementing 2.4 specific direct I/O and AIO code


Index: src/Makefile
===================================================================
--- src/Makefile	(revision 1355)
+++ src/Makefile	(working copy)
@@ -54,7 +54,9 @@
 endif
 
 CFILES = \
+	24io.c			\
 	alloc.c			\
+	aops.c			\
 	bitmap.c		\
 	buffer_head_io.c	\
 	dcache.c		\
Index: src/inode.c
===================================================================
--- src/inode.c	(revision 1355)
+++ src/inode.c	(working copy)
@@ -58,17 +58,8 @@
 
 #define OCFS_DEBUG_CONTEXT    OCFS_DEBUG_CONTEXT_INODE
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
-# include <linux/iobuf.h>
-# if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,18)
-#  define free_kiovec_sz(nr, buf, bh)     free_kiovec(nr, buf)
-#  define alloc_kiovec_sz(nr, buf, bh)    alloc_kiovec(nr, buf)
-# endif
-#endif /* for 2.6 - no more kiovec, kiobuf structures - vfs handles
-	* this for us (direct i/o) */
-
-
 extern struct semaphore recovery_list_sem;
+extern struct address_space_operations ocfs_aops;
 
 typedef struct _ocfs_find_inode_args
 {
@@ -78,47 +69,14 @@
 }
 ocfs_find_inode_args;
 
-static int ocfs_readpage (struct file *file, struct page *page);
-static int ocfs_prepare_write (struct file *file, struct page *page, unsigned from, unsigned to);
-static int ocfs_commit_write (struct file *file, struct page *page, unsigned from, unsigned to);
-static int ocfs_get_block (struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create);
-static int ocfs_symlink_get_block (struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create);
-
 static int ocfs_read_locked_inode(struct inode *inode, ocfs_find_inode_args *args);
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
-static sector_t ocfs_bmap(struct address_space *mapping, sector_t block);
-static int ocfs_writepage (struct page *page, struct writeback_control *wbc);
-static ssize_t ocfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs);
 static int ocfs_init_locked_inode(struct inode * inode, void * opaque);
 static int ocfs_find_actor (struct inode *inode, void *opaque);
 #else /* 2.4 kernel */
 static int ocfs_find_inode (struct inode *inode, unsigned long ino, void *opaque);
-static int ocfs_bmap(struct address_space *mapping, long block);
-static int ocfs_writepage (struct page *page);
-static int ocfs_get_block2 (struct inode *inode, long iblock, long *oblock, int len);
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20) || defined(SUSE)
-static int ocfs_direct_IO (int rw, struct file *filp, struct kiobuf *iobuf, unsigned long blocknr, int blocksize);
-#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,10)
-static int ocfs_direct_IO (int rw, struct inode *inode, struct kiobuf *iobuf, unsigned long blocknr, int blocksize);
 #endif
-#endif
 
-static struct address_space_operations ocfs_aops = {
-	.readpage = ocfs_readpage,
-	.writepage = ocfs_writepage,
-	.prepare_write = ocfs_prepare_write,
-	.bmap = ocfs_bmap,
-	.commit_write = ocfs_commit_write,
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,10)
-/*
- * On a 2.4 system, we are only adding this here as a dummy basically, 
- * just need open with O_DIRECT to succeed, we still call ocfs_rw_direct().
- * For a 2.6 system, this is the way a filesystem provides direct-io support. 
- */
-	.direct_IO = ocfs_direct_IO
-#endif
-};
-
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
 /* 
  * ocfs_ilookup()
@@ -813,281 +771,7 @@
 	return;
 }				/* ocfs_clear_inode */
 
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
-inline void __mark_dirty(struct buffer_head *bh)
-{
-	set_buffer_flushtime(bh);
-	refile_buffer(bh);
-}
-
-static int __block_commit_write(struct inode *inode, struct page *page,
-		unsigned from, unsigned to)
-{
-	unsigned block_start, block_end;
-	int partial = 0, need_balance_dirty = 0;
-	unsigned blocksize;
-	struct buffer_head *bh, *head;
-
-	blocksize = 1 << inode->i_blkbits;
-
-	for(bh = head = page->buffers, block_start = 0;
-	    bh != head || !block_start;
-	    block_start=block_end, bh = bh->b_this_page) {
-		block_end = block_start + blocksize;
-		if (block_end <= from || block_start >= to) {
-			if (!buffer_uptodate(bh))
-				partial = 1;
-		} else {
-			set_bit(BH_Uptodate, &bh->b_state);
-			if (!atomic_set_buffer_dirty(bh)) {
-				__mark_dirty(bh);
-				buffer_insert_inode_data_queue(bh, inode);
-				need_balance_dirty = 1;
-			}
-		}
-	}
-
-	if (need_balance_dirty)
-		balance_dirty();
-	/*
-	 * is this a partial write that happened to make all buffers
-	 * uptodate then we can optimize away a bogus readpage() for
-	 * the next read(). Here we 'discover' wether the page went
-	 * uptodate as a result of this (potentially partial) write.
-	 */
-	if (!partial)
-		SetPageUptodate(page);
-	return 0;
-}
-
-static int ocfs2_cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, loff_t *bytes)
-{
-	struct address_space *mapping = page->mapping;
-	struct inode *inode = mapping->host;
-	struct page *new_page;
-	unsigned long pgpos;
-	long status;
-	unsigned zerofrom;
-	unsigned blocksize = 1 << inode->i_blkbits;
-	char *kaddr;
-
-	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
-		status = -ENOMEM;
-		new_page = grab_cache_page(mapping, pgpos);
-		if (!new_page)
-			goto out;
-		/* we might sleep */
-		if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
-			unlock_page(new_page);
-			page_cache_release(new_page);
-			continue;
-		}
-		zerofrom = *bytes & ~PAGE_CACHE_MASK;
-		if (zerofrom & (blocksize-1)) {
-			*bytes |= (blocksize-1);
-			(*bytes)++;
-		}
-		status = block_prepare_write(new_page, zerofrom,
-					     PAGE_CACHE_SIZE, get_block);
-		if (status)
-			goto out_unmap;
-		kaddr = page_address(new_page);
-		memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
-		flush_dcache_page(new_page);
-		__block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
-		kunmap(new_page);
-		unlock_page(new_page);
-		page_cache_release(new_page);
-	}
-
-	if (page->index < pgpos) {
-		/* completely inside the area */
-		zerofrom = offset;
-	} else {
-		/* page covers the boundary, find the boundary offset */
-		zerofrom = *bytes & ~PAGE_CACHE_MASK;
-
-		/* if we will expand the thing last block will be filled */
-		if (to > zerofrom && (zerofrom & (blocksize-1))) {
-			*bytes |= (blocksize-1);
-			(*bytes)++;
-		}
-
-		/* starting below the boundary? Nothing to zero out */
-		if (offset <= zerofrom)
-			zerofrom = offset;
-	}
-	status = block_prepare_write(page, zerofrom, to, get_block);
-	if (status)
-		goto out1;
-	kaddr = page_address(page);
-	if (zerofrom < offset) {
-		memset(kaddr+zerofrom, 0, offset-zerofrom);
-		flush_dcache_page(page);
-		__block_commit_write(inode, page, zerofrom, offset);
-	}
-	return 0;
-out1:
-	ClearPageUptodate(page);
-	kunmap(page);
-	return status;
-
-out_unmap:
-	ClearPageUptodate(new_page);
-	kunmap(new_page);
-	UnlockPage(new_page);
-	page_cache_release(new_page);
-out:
-	return status;
-}
-
-/* Mark's favorite hack */
-#undef cont_prepare_write
-#define cont_prepare_write ocfs2_cont_prepare_write
-#endif  /* < 2.6.0 */
-
 /*
- * ocfs_prepare_write()
- *
- */
-static int ocfs_prepare_write (struct file *file, struct page *page, unsigned from, unsigned to)
-{
-	int ret;
-	struct inode *inode = page->mapping->host;
-
-	LOG_SET_CONTEXT(PREPARE_WRITE);
-
-	LOG_ENTRY_ARGS ("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
-
-	if (!inode)
-		BUG();
-
-	ret = cont_prepare_write(page, from, to, ocfs_get_block,
-		&(OCFS_I(page->mapping->host)->ip_mmu_private));
-
-	LOG_EXIT_INT (ret);
-
-	LOG_CLEAR_CONTEXT();
-	return ret;
-}				/* ocfs_prepare_write */
-
-/*
- * ocfs_commit_write()
- *
- */
-static int ocfs_commit_write (struct file *file, struct page *page, unsigned from, unsigned to)
-{
-	int ret;
-
-	LOG_SET_CONTEXT(COMMIT_WRITE);
-
-	LOG_ENTRY_ARGS ("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
-
-	ret = generic_commit_write (file, page, from, to);
-
-	LOG_EXIT_INT (ret);
-
-	LOG_CLEAR_CONTEXT();
-	return ret;
-}				/* ocfs_commit_write */
-
-/*
- * ocfs_symlink_get_block()
- *  
- */
-static int ocfs_symlink_get_block (struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
-{
-	int err = -EIO;
-	int status;
-	ocfs2_dinode *fe = NULL;
-	struct buffer_head *bh = NULL;
-	struct buffer_head *buffer_cache_bh = NULL;
-	ocfs_super *osb = OCFS_SB(inode->i_sb);
-	void *kaddr;
-
-	LOG_ENTRY_ARGS ("(0x%p, %llu, 0x%p, %d)\n", inode,
-			(unsigned long long)iblock, bh_result, create);
-
-	if (!inode) {
-		LOG_ERROR_STR ("bad inode");
-		goto bail;
-	}
-
-	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
-		LOG_ERROR_ARGS ("block offset > PATH_MAX: %llu",
-				(unsigned long long)iblock);
-		goto bail;
-	}
-
-	status = ocfs_read_bh(OCFS_SB(inode->i_sb),
-			      OCFS_I(inode)->ip_blkno << inode->i_sb->s_blocksize_bits,
-			      &bh,
-			      OCFS_BH_CACHED, inode);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto bail;
-	}
-	fe = (ocfs2_dinode *) bh->b_data;
-
-	if (!IS_VALID_FILE_ENTRY(fe)) {
-		LOG_ERROR_ARGS("Invalid fe at blkno %llu",
-			       OCFS_I(inode)->ip_blkno);
-		goto bail;
-	}
-
-	if ((u64)iblock >= ocfs_clusters_to_blocks(inode->i_sb,
-					      fe->i_clusters)) {
-		LOG_ERROR_ARGS ("block offset is outside the allocated size: %llu",
-		     (unsigned long long)iblock);
-		goto bail;
-	}
-
-	/* We don't use the page cache to create symlink data, so if
-	 * need be, copy it over from the buffer cache. */
-	if (!buffer_uptodate(bh_result) && !ocfs_inode_is_new(osb, inode)) {
-		buffer_cache_bh = sb_getblk(osb->sb, 
-					    fe->id2.i_list.l_recs[0].e_blkno + iblock);
-		if (!buffer_cache_bh) {
-			LOG_ERROR_STR("couldn't getblock for symlink!");
-			goto bail;
-		}
-
-		/* we haven't locked out transactions, so a commit
-		 * could've happened. Since we've got a reference on
-		 * the bh, even if it commits while we're doing the
-		 * copy, the data is still good. */
-		if (buffer_jbd(buffer_cache_bh) 
-		    && !ocfs_inode_is_new(osb, inode)) {
-			kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
-			if (!kaddr) {
-				LOG_ERROR_ARGS("couldn't kmap!\n");
-				goto bail;
-			}
-			memcpy(kaddr + (bh_result->b_size * iblock), 
-			       buffer_cache_bh->b_data, 
-			       bh_result->b_size);
-			kunmap_atomic(kaddr, KM_USER0);
-			set_buffer_uptodate(bh_result);
-		}
-		brelse(buffer_cache_bh);
-	}
-
-	map_bh(bh_result, inode->i_sb,
-	       fe->id2.i_list.l_recs[0].e_blkno + iblock);
-
-	err = 0;
-
-bail:
-	if (bh)
-		brelse(bh);
-
-	LOG_EXIT_INT (err);
-	return err;
-}				/* ocfs_symlink_get_block */
-
-
-/*
  * TODO: this should probably be merged into ocfs_get_block
  * 
  * However, you now need to pay attention to the cont_prepare_write()
@@ -1194,834 +878,6 @@
 }
 
 /*
- * ocfs_get_block()
- *
- */
-static int ocfs_get_block (struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
-{
-	int err = -EIO;
-	__s64 vbo = 0;
-	__s64 lbo = 0;
-	__u32 len;
-	int open_direct;
-
-	LOG_ENTRY_ARGS ("(0x%p, %llu, 0x%p, %d)\n", inode,
-			(unsigned long long)iblock, bh_result, create);
-
-	if (!inode) {
-		LOG_ERROR_STR ("bad inode");
-		goto bail;
-	}
-
-	if (OCFS_I(inode)->ip_flags & OCFS_INODE_SYSTEM_FILE) {
-		printk("get_block on system inode 0x%p (%lu)\n",
-		       inode, inode->i_ino);
-	}
-
-	open_direct = OCFS_I(inode)->ip_open_flags & OCFS_OIN_OPEN_FOR_DIRECTIO;
-
-	if (S_ISLNK (inode->i_mode)) {
-		/* this always does I/O for some reason. */
-		down_read(&OCFS_I(inode)->ip_io_sem);
-		err = ocfs_symlink_get_block (inode, iblock, bh_result, 
-					      create);
-		up_read(&OCFS_I(inode)->ip_io_sem);
-		goto bail;
-	}
-
-	vbo = (__s64) iblock << inode->i_sb->s_blocksize_bits;
-
-#if 0
-	if (!INODE_JOURNAL(inode) && vbo >= OCFS_I(inode)->ip_alloc_size) {
-		int vbo_pad;
-		
-		vbo_pad = inode->i_sb->s_blocksize;
-		vbo_pad -= vbo & (s64)(inode->i_sb->s_blocksize - 1);
-
-		LOG_TRACE_STR("Extending allocation");
-		LOG_ERROR_ARGS("extending inode %lu in get_block!!\n", 
-			       inode->i_ino);
-		down_write(&OCFS_I(inode)->ip_io_sem);
-		err = ocfs_extend_file(osb, vbo + vbo_pad, 
-				       NULL, inode, NULL, 0, NULL);
-		up_write(&OCFS_I(inode)->ip_io_sem);
-		if (err < 0) {
-			err = -ENOSPC;
-			LOG_ERROR_STATUS (err);
-			goto bail;
-		}
-	}
-#else
-	if (vbo >= OCFS_I(inode)->ip_alloc_size) {
-		err = -EIO;
-		LOG_ERROR_ARGS("Trying to extend in ocfs_get_block() (inode %llu, blkno %llu, vbo %llu, alloc %llu)\n", OCFS_I(inode)->ip_blkno, (u64)iblock, (u64)vbo, OCFS_I(inode)->ip_alloc_size);
-		goto bail;
-	}
-#endif
-
-	len = inode->i_sb->s_blocksize;
-	if (!open_direct)
-		down_read(&OCFS_I(inode)->ip_extend_sem);
-	err = ocfs_lookup_file_allocation(OCFS2_SB(inode->i_sb),
-					  vbo, &lbo, len, NULL, 
-					  inode, open_direct);
-	if (!open_direct)
-		up_read(&OCFS_I(inode)->ip_extend_sem);
-	if (err < 0) {
-		LOG_ERROR_ARGS ("vbo=%lld lbo=%lld len=%u", vbo, lbo, len);
-		goto bail;
-	}
-
-	map_bh(bh_result, inode->i_sb, lbo >> inode->i_sb->s_blocksize_bits);
-
-	err = 0;
-
-	if (bh_result->b_blocknr == 0) {
-		err = -EIO;
-		LOG_ERROR_ARGS ("vbo=%lld lbo=%lld len=%u, blkno=(%llu)\n",
-				vbo, lbo, len, 
-				OCFS_I(inode)->ip_blkno);
-	}
-
-	if (vbo < OCFS_I(inode)->ip_mmu_private)
-		goto bail;
-	if (!create)
-		goto bail;
-	if (vbo != OCFS_I(inode)->ip_mmu_private) {
-		LOG_ERROR_ARGS("Uh-oh, vbo = %lld, i_size = %llu, mmu = %llu, "
-			       "inode = %llu\n",
-			       vbo, inode->i_size, 
-			       OCFS_I(inode)->ip_mmu_private,
-			       OCFS_I(inode)->ip_blkno);
-		BUG();
-		err = -EIO;
-		goto bail;
-	}
-
-	bh_result->b_state |= (1UL << BH_New);
-	OCFS_I(inode)->ip_mmu_private += inode->i_sb->s_blocksize;
-
-bail:
-	if (err < 0)
-		err = -EIO;
-
-	LOG_EXIT_INT (err);
-	return err;
-}				/* ocfs_get_block */
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
-static sector_t ocfs_bmap(struct address_space *mapping, sector_t block) 
-#else
-static int ocfs_bmap(struct address_space *mapping, long block) 
-#endif
-{
-	int disk_block = 0;
-	ocfs_super *osb = OCFS_SB(mapping->host->i_sb);
-	__s64 vbo = 0;
-	__s64 lbo = 0;
-	__u32 len;
-	int err = 0, status;
-	struct inode *inode = mapping->host;
-
-	LOG_SET_CONTEXT(BMAP);
-
-	LOG_ENTRY_ARGS("(block = %llu)\n", (unsigned long long)block);
-
-	if (!inode) {
-		LOG_ERROR_STR ("bmap: bad inode");
-		err = -EINVAL;
-		LOG_ERROR_STATUS(err);
-		goto bail;
-	}
-
-	if (!INODE_JOURNAL(inode)) {
-		LOG_ERROR_STR("bmap is only for journal inodes!");
-		err = -EINVAL;
-		LOG_ERROR_STATUS(err);
-		goto bail;
-	}
-
-	vbo = (__s64) block << inode->i_sb->s_blocksize_bits;
-	len = osb->sb->s_blocksize;
-	err = ocfs_lookup_file_allocation(osb, vbo, &lbo, len, NULL, 
-					   inode, 1);
-	if (err < 0) {
-		LOG_ERROR_ARGS ("vbo=%lld lbo=%lld len=%u", vbo,
-				lbo, len);
-		LOG_ERROR_STATUS(err);
-		goto bail;
-	}
-
-	disk_block = lbo >> inode->i_sb->s_blocksize_bits;
-
-bail:
-	status = err ? err : disk_block;
-	LOG_EXIT_STATUS(status);
-
-	LOG_CLEAR_CONTEXT();
-	return(status);
-}
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
-/*
- * ocfs_get_block2()
- *
- */
-static int ocfs_get_block2 (struct inode *inode, long iblock, long *oblock, int len)
-{
-	int err = -EIO;
-	ocfs_super *osb;
-	__s64 vbo = 0;
-	__s64 lbo = 0;
-
-	LOG_ENTRY_ARGS ("(0x%p, %ld)\n", inode, iblock);
-
-	if (!inode) {
-		LOG_ERROR_STR ("bad inode");
-		err = -1;
-		goto bail;
-	}
-
-	osb = OCFS_SB(inode->i_sb);
-
-	vbo = (__s64) iblock << osb->s_sectsize_bits;
-	err = ocfs_lookup_file_allocation(osb, vbo, &lbo, len, NULL, 
-					   inode, 1);
-	if (err < 0) {
-		LOG_ERROR_STATUS (err);
-		err = -1;
-		goto bail;
-	}
-
-	err = 0;
-
-	*oblock = lbo >> osb->s_sectsize_bits;
-	if (*oblock == 0) {
-		err = -EIO;
-		LOG_ERROR_ARGS ("vbo=%lld lbo=%lld len=%u, blkno=(%llu)\n",
-				vbo, lbo, len, 
-				OCFS_I(inode)->ip_blkno);
-	}
-
-bail:
-	if (err < 0)
-		err = -EIO;
-	LOG_EXIT_INT (err);
-	return err;
-}				/* ocfs_get_block2 */
-#endif
-
-/*
- * ocfs_readpage()
- *
- */
-static int ocfs_readpage (struct file *file, struct page *page)
-{
-	int ret;
-
-	LOG_SET_CONTEXT(READPAGE);
-
-	LOG_ENTRY_ARGS ("(0x%p, %lu)\n", file, (page ? page->index : 0));
-
-	ret = block_read_full_page (page, ocfs_get_block);
-	if (ret < 0)
-		goto bail;
-
-bail:
-	LOG_EXIT_INT (ret);
-
-	LOG_CLEAR_CONTEXT();
-	return ret;
-}				/* ocfs_readpage */
-
-/*
- * ocfs_writepage()
- *
- */
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
-static int ocfs_writepage (struct page *page, struct writeback_control *wbc)
-{
-	int ret;
-
-	LOG_SET_CONTEXT(WRITEPAGE);
-
-	LOG_ENTRY_ARGS ("(0x%p)\n", page);
-
-	ret = block_write_full_page (page, ocfs_get_block, wbc);
-
-	LOG_EXIT_INT (ret);
-
-	LOG_CLEAR_CONTEXT();
-	return ret;
-}				/* ocfs_writepage */
-#else
-static int ocfs_writepage (struct page *page)
-{
-	int ret;
-
-	LOG_SET_CONTEXT(WRITEPAGE);
-
-	LOG_ENTRY_ARGS ("(0x%p)\n", page);
-
-	ret = block_write_full_page (page, ocfs_get_block);
-
-	LOG_EXIT_INT (ret);
-
-	LOG_CLEAR_CONTEXT();
-	return ret;
-}				/* ocfs_writepage */
-#endif
-
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
-/*
- * TODO: Make this into a generic get_blocks function.
- *
- * From do_direct_io in direct-io.c:
- *  "So what we do is to permit the ->get_blocks function to populate 
- *   bh.b_size with the size of IO which is permitted at this offset and 
- *   this i_blkbits."
- *
- * This function is called directly from get_more_blocks in direct-io.c.
- *
- * We should probably have this data in the oin for the inode.
- * Otherwise, we might want to look at ocfs_rw_direct, 
- *  ocfs_lookup_file_allocation and ocfs_get_block
- *
- * called like this: dio->get_blocks(dio->inode, fs_startblk,
- * 					fs_count, map_bh, dio->rw == WRITE);
- */
-static int ocfs_direct_IO_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks, struct buffer_head *bh_result, int create)
-{
-	int ret = -1;
-	int status;
-	ocfs_super *osb = NULL;
-	__s64 vbo; /* file offset */
-	__s64 lbo; /* logical (disk) offset */
-	__s64 vbo_max; /* file offset, max_blocks from iblock */
-	int set_new = 0; /* flag */
-	__u64 new_size; /* In bytes, the size of the contiguous block */
-	unsigned char blocksize_bits;
-
-	if (!inode || !bh_result) {
-		LOG_ERROR_STR("ocfs_direct_IO_get_blocks: inode or bh_result is null");
-		return -EIO;
-	}
-
-	osb = inode->i_sb->s_fs_info;
-	blocksize_bits = inode->i_sb->s_blocksize_bits;
-	/* make sure we're up to date... */
-	if (atomic_read(&OCFS_I(inode)->ip_needs_verification)) {
-		LOG_TRACE_STR ("ocfs_direct_IO_get_blocks: verify oin.");
-		status = ocfs_verify_update_inode (osb, inode, 0);
-		if (status < 0) {
-			LOG_TRACE_STR ("ocfs_verify_update_inode failed");
-			ret = -EIO;
-			goto bail;
-		}
-	}
-
-	/* This function won't even be called if the request isn't all
-	 * nicely aligned and of the right size, so there's no need
-	 * for us to check any of that. */
-
-	vbo = (__s64) iblock << blocksize_bits;
-	vbo_max = vbo + ((__s64) max_blocks << blocksize_bits);
-
-	/* NOTE: create flag is set when we ?may? have to allocate some
-	   blocks for the file. */
-	if (create && vbo_max > OCFS_I(inode)->ip_alloc_size) {
-		/* WARNING: How much do we really want to extend the file? */
-		status = ocfs_extend_file(osb, vbo_max,
-					  NULL, inode, NULL, 0, NULL);
-		if (status < 0) {
-			status = -ENOSPC;
-			LOG_ERROR_STR("ocfs_direct_IO_get_blocks: failed to extend the file!");
-			goto bail;
-		}
-		set_new = 1;
-	}
-
-	/* This figure out the size of the next contiguous block, and
-	 * our logical offset */	
-	/* TODO: Try our damndest to give sizes in multiples of PAGE_SIZE */
-	status = ocfs_lookup_file_allocation(osb, vbo, &lbo, max_blocks << blocksize_bits, 
-					     &new_size, inode, 1);
-
-	/* Do whatever we need to the buffer_head */
-	if (set_new) {
-		set_buffer_new(bh_result);
-		/* Do we really want to set bh_result->b_blocknr here too? */
-		bh_result->b_blocknr = lbo >> blocksize_bits;
-	} else {
-		clear_buffer_new(bh_result);
-		/* is the last argument here correct? */
-		map_bh(bh_result, inode->i_sb, lbo >> blocksize_bits);
-	}
-
-	/* make sure we don't map more than max_blocks blocks here as
-	   that's all the kernel will handle at this point. */
-	if (new_size > (__u64)max_blocks << blocksize_bits)
-		new_size = (__u64)max_blocks << blocksize_bits;
-	bh_result->b_size = new_size;
-
-	ret = 0;
-bail:
-	return ret;
-}
-
-/*
- * ocfs_direct_IO()
- * used to be: 
- * static int ocfs_direct_IO (int rw,
- *	       struct inode *inode,
- *	       struct kiobuf *iobuf, unsigned long blocknr, int blocksize)
- *
- * now:         
-  static int ocfs_direct_IO(int rw, struct kiocb *iocb,
-			const struct iovec *iov, loff_t offset,
-			unsigned long nr_segs)
- * int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
- *                  loff_t offset, unsigned long nr_segs);
- */
-static ssize_t ocfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
-	int ret;
-
-	LOG_SET_CONTEXT(DIRECT_IO);
-
-	LOG_ENTRY ();
-
-	/* blockdev_direct_IO checks alignment for us, using */
-	ret = blockdev_direct_IO (rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ocfs_direct_IO_get_blocks, NULL);
-
-	LOG_EXIT_INT (ret);
-
-	LOG_CLEAR_CONTEXT();
-	return ret;
-}				/* ocfs_direct_IO */
-
-#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,10)
-/*
- * ocfs_direct_IO()
- *
- * we are not using this function anymore, in fact
- * we should never get here any more
- * so let's just BUG(), hint from sct at redhat.com
- */
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20) || defined(SUSE)
-static int ocfs_direct_IO (int rw, struct file *filp, struct kiobuf *iobuf, unsigned long blocknr, int blocksize)
-{
-	BUG();
-	return 0;
-}				/* ocfs_direct_IO */
-#else
-static int ocfs_direct_IO (int rw, struct inode *inode, struct kiobuf *iobuf, unsigned long blocknr, int blocksize)
-{
-	BUG();
-	return 0;
-}				/* ocfs_direct_IO */
-#endif
-#endif  /* version >= 2.4.10 */
-
-#if defined(SUSE) && LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20)
-#define OCFS_KIO_BLOCKS(_iobuf)  ((_iobuf)->kio_blocks)
-#else
-#define OCFS_KIO_BLOCKS(_iobuf)  ((_iobuf)->blocks)
-#endif
-
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,10)
-#define KERNEL_NO_F_IOBUF 1
-#elif defined(SUSE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20)
-#define KERNEL_NO_F_IOBUF 1
-#endif
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
-/*
- * ocfs_rw_direct()
- *
- */
-ssize_t ocfs_rw_direct (int rw, struct file *filp, char *buf, size_t size, loff_t * offp)
-{
-#ifdef KERNEL_NO_F_IOBUF
-	struct kiobuf *iobuf;
-#else
-	struct kiobuf *iobuf = filp->f_iobuf;
-	int new_iobuf = 0;
-#endif
-	int err = 0;
-	unsigned long blocknr, blocks, myiosize;
-	size_t transferred;
-	int iosize, clustersize;
-	int i;
-	struct inode *inode = filp->f_dentry->d_inode;
-	int max_sectors;
-	int nbhs;
-	int sector_size, sector_bits, sector_mask, sectors_per_page;
-	int ret = 0;
-	int large_io = 0;
-	int inuse = 0;
-	unsigned long blocks_end_cluster = 0;
-	loff_t saved_off;
-	size_t saved_size;
-	unsigned long firstlogic;
-	long firstphys;
-	long nextphys;
-	unsigned long nextlogic = 0;
-	unsigned long totalioblocks = 0;
-
-	saved_off = *offp;
-	saved_size = size;
-	
-	/* FIXME: Need to differentiate between sectors and blocksize */
-	sector_bits = OCFS_SB(inode->i_sb)->s_sectsize_bits;
-	sector_size = 1 << OCFS_SB(inode->i_sb)->s_sectsize_bits;
-	sector_mask = sector_size - 1;
-	sectors_per_page = PAGE_SIZE / sector_size;
-	/* max sectors is 1024 in 2.4.9
-	 * max data is 512kb  
-	 */
-
-	err = -EINVAL;
-	if (size == 0) {
-		printk("direct write of 0 byte\n");
-		return 0;
-	}
-
-	if (rw == READ) {
-	   if (inode->i_size <= *offp) /* read past end of file */
-	      return 0;
-	   if  (size > (inode->i_size - *offp))
-	      size = inode->i_size - *offp;
-	}
-
-	/* make sure aligned to either PAGE_SIZE or sect_size IO */
-#ifndef LARGEIOS
-	if ((*offp & sector_mask) || (size & sector_mask)) 
-	   /* if not, then fail, we need either to do dio */
-	   return err;
-
-	max_sectors = KIO_MAX_SECTORS;
-	large_io = 0;
-#endif
-#ifdef LARGEIOS	
-	if ((*offp & ~PAGE_MASK) || (size & ~PAGE_MASK)) {
-		/* if it's not PAGE_SIZE, then sect_size */
-		 if ((*offp & sector_mask) || (size & sector_mask))
-			 /* if not, then fail, we need either to do dio */
-			 return err;
-		 max_sectors = KIO_MAX_SECTORS; /* for 2.4.9 - 1024 */
-	} /* ok we 're PAGE_SIZE aligned, lets see if the buffer is */
-	else {
-		if (!((unsigned long) buf & ~PAGE_MASK)) {
-			/* yippie we are .. we can do PAGE_SIZE size io's */
-			large_io = 1;
-			/* for 2.4.9 */
-			max_sectors = KIO_MAX_SECTORS / sectors_per_page;
-		} else {
-			max_sectors = KIO_MAX_SECTORS;
-			large_io = 0;
-		}
-
-	}	
-#endif
-	/* find out how far we are to the end of our cluster */
-
-	err = 0;
-	if (size)
-		err = -ENXIO;
-
-	/* Split the IO into KIO_MAX_SECTORS chunks, mapping and */
-	/* unmapping the single kiobuf as we go to perform each chunk of IO. */
-
-	transferred = 0;
-	blocknr = *offp >> sector_bits;
-	clustersize = inode->i_blksize >> sector_bits;
-	myiosize = size >> sector_bits;
-	blocks_end_cluster = clustersize - (blocknr % clustersize);
-	firstlogic = blocknr;
-	totalioblocks = 0;
-
-	ret = ocfs_get_block2 (inode, blocknr, &firstphys, sector_size);
-	if (ret == -1) {
-		err = 0;
-		goto out;
-	}
-	while (myiosize > 0) {
-	    if (blocks_end_cluster + 1 > myiosize) {
-		totalioblocks += myiosize;
-		myiosize = 0;
-		goto doio;
-	    } else {
-		totalioblocks += blocks_end_cluster;
-		myiosize -= blocks_end_cluster;
-		nextlogic = firstlogic + blocks_end_cluster;
-	    }
-again:
-	    ret = ocfs_get_block2 (inode, nextlogic, &nextphys, sector_size);
-	    if (ret == -1) {
-		err = 0;
-		goto out;
-	    }
-	    if (nextphys == (firstphys + totalioblocks)) {
-		// merge ok
-		blocks_end_cluster = clustersize - (nextlogic % clustersize);
-		if (blocks_end_cluster + 1 > myiosize) {
-		   totalioblocks += myiosize;
-		   myiosize = 0;
-		} else {
-		   totalioblocks += blocks_end_cluster;
-		   myiosize -= blocks_end_cluster;
-		   nextlogic = nextlogic + blocks_end_cluster;
-		   goto again;
-		}
-	    }
-doio:
-		size = totalioblocks << sector_bits;
-		if (large_io)
-			nbhs = (size >> PAGE_SHIFT);
-		else
-			nbhs = (size >> sector_bits);
-		if (nbhs > max_sectors)
-			nbhs = max_sectors;
-
-#ifdef KERNEL_NO_F_IOBUF
-		err = alloc_kiovec_sz (1, &iobuf, &nbhs);
-		if (err)
-			goto out;
-#else
-		if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
-			/*
-			* A parallel read/write is using the preallocated iobuf
-			* so just run slow and allocate a new one.
-			*/
-			err = alloc_kiovec_sz (1, &iobuf, &nbhs);
-			if (err)
-				goto out;
-			new_iobuf = 1;
-		} else
-			new_iobuf = 0;
-				
-#endif
-		inuse = 1;
-
-		totalioblocks = 0;
-		while (size > 0) {
-			if (large_io) {
-				blocks = size >> PAGE_SHIFT;
-				if (blocks > max_sectors)
-					blocks = max_sectors;
-				iosize = blocks << PAGE_SHIFT;
-			} else {
-				blocks = size >> sector_bits;
-				if (blocks > max_sectors)
-					blocks = max_sectors;
-				iosize = blocks << sector_bits;
-			}
-			if (!blocks)
-				break;
-			err = map_user_kiobuf (rw, iobuf, (unsigned long) buf, iosize);
-			if (err)
-				break;
-			/* get the blocknr depending on io size for all blocks */
-			/* since we are awlays within the extent we only need to get the first block */
-			OCFS_KIO_BLOCKS(iobuf)[0] = firstphys + totalioblocks;
-
-			if (large_io) {
-				blocknr += sectors_per_page;
-				OCFS_KIO_BLOCKS(iobuf)[0] = OCFS_KIO_BLOCKS(iobuf)[0] / sectors_per_page;
-			} else {
-				blocknr++;
-			}
-
-			for (i = 1; i < blocks; i++) {
-				if (large_io) {
-					blocknr += sectors_per_page;
-				} else {
-					blocknr++;
-				}
-				OCFS_KIO_BLOCKS(iobuf)[i] = OCFS_KIO_BLOCKS(iobuf)[0] + i;
-			}
-			err = brw_kiovec (rw, 1, &iobuf, inode->i_dev, OCFS_KIO_BLOCKS(iobuf),
-					large_io ? PAGE_SIZE : sector_size);
-#ifdef SUSE
-			if (rw == READ &&  err > 0)
-				mark_dirty_kiobuf(iobuf, err);
-#endif
-			if (err >= 0) {
-				transferred += err;
-				size -= err;
-				buf += err;
-				if (large_io) {
-					totalioblocks +=
-					    (blocks * sectors_per_page);
-				} else {
-					totalioblocks += blocks;
-				}
-			} else {
-				printk( "ocfs_rw_direct : brw_kiovec() %d\n", err);	
-				break;
-			}
-			unmap_kiobuf (iobuf);
-			if (err != iosize)
-				break;
-		}
-#ifdef KERNEL_NO_F_IOBUF
-		free_kiovec_sz(1, &iobuf, &nbhs);
-#else
-		if (!new_iobuf)
-			clear_bit(0, &filp->f_iobuf_lock);
-		else
-			free_kiovec_sz(1, &iobuf, &nbhs);
-#endif
-		inuse = 0;
-		totalioblocks = 0;
-		firstlogic = nextlogic;
-		firstphys = nextphys;
-	}
-	if (transferred) {
-		*offp += transferred;
-		err = transferred;
-	}
-
-out:
-#ifdef KERNEL_NO_F_IOBUF
-	if (inuse)
-	   free_kiovec_sz (1, &iobuf, &nbhs);
-#else
-	if (inuse) {
-		if (!new_iobuf)
-			clear_bit(0, &filp->f_iobuf_lock);
-		else
-			free_kiovec_sz(1, &iobuf, &nbhs);
-	}
-#endif
-	return err;
-}				/* ocfs_rw_direct */
-#endif /* 2.4.x kernel */
-
-#ifdef AIO_ENABLED
-static int ocfs_kvec_rw(struct file *filp, int rw, kvec_cb_t cb,
-		size_t size, loff_t pos)
-{
-	int	     err = 0;
-	int max_sectors = 25000;
-	struct inode *inode = filp->f_dentry->d_inode;
-	unsigned long blocknr, blocks, iosize,myiosize;
-	long firstphys;
-	int clustersize;
-	unsigned long blocks_end_cluster = 0;
- 
-	/* FIXME: Need to differentiate betwen sectors and blocksize */
-	int sector_bits = OCFS_SB(inode->i_sb)->s_sectsize_bits;
-	int sector_size = 1 << OCFS_SB(inode->i_sb)->s_sectsize_bits;
-	int sector_mask = sector_size - 1;
-
-	int ret;
-	unsigned long firstlogic;
-	long nextphys;
-	unsigned long nextlogic = 0;
-	unsigned long totalioblocks = 0;
-
-	if (!size || (pos == inode->i_size)) {
-		cb.fn(cb.data, cb.vec, err);
-		return err;
-	}
-
-	err = -ENXIO;
-	if (pos >= inode->i_size) {
-		return err;
-	}
-
-	err = -EINVAL;
-	if ((pos < 0) || (pos & sector_mask) || (size & sector_mask)) {
-		return err;
-	}
-
-	blocknr = pos >> sector_bits;
-
-	blocks = size >> sector_bits;;
-	if (blocks > max_sectors)
-		blocks = max_sectors;
-	if (!blocks) {
-		err = -ENXIO;
-		return err;;
-	}
-
-	iosize = blocks << sector_bits;
-	clustersize = inode->i_blksize >> sector_bits;
-	blocks_end_cluster = clustersize - (blocknr % clustersize);
-	myiosize = size >> sector_bits;
-	firstlogic = blocknr;
-	totalioblocks = 0;
-
-	err = ocfs_get_block2(inode, blocknr, &firstphys, sector_size);
-	if ( err == -1 ) {
-		err = 0;
-		return err;
-	}
-		if (blocks_end_cluster + 1 > myiosize) {
-			totalioblocks += myiosize;
-			myiosize = 0;
-			goto doio;
-		} else {
-			totalioblocks += blocks_end_cluster;
-			myiosize -= blocks_end_cluster;
-			nextlogic = firstlogic + blocks_end_cluster;
-		}
-again:
-		ret = ocfs_get_block2 (inode, nextlogic, &nextphys, sector_size);
-		if (ret == -1) {
-			err = 0;
-			return err;
-		}
-	    if (nextphys == (firstphys + totalioblocks)) {
-		blocks_end_cluster = clustersize - (nextlogic % clustersize);
-		if (blocks_end_cluster + 1 > myiosize) {
-		   totalioblocks += myiosize;
-		   myiosize = 0;
-		} else {
-		   totalioblocks += blocks_end_cluster;
-		   myiosize -= blocks_end_cluster;
-		   nextlogic = nextlogic + blocks_end_cluster;
-		   goto again;
-		}
-	    }
-doio:
-	blocks = totalioblocks;
-	err = brw_kvec_async(rw, cb, inode->i_dev, blocks, firstphys, sector_bits);
-	return err;
-
-}
-
-int ocfs_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) 
-{
-	int ret;
-
-	LOG_SET_CONTEXT(KVEC_READ);
-
-	ret = ocfs_kvec_rw(file, READ, cb, size, pos);
-
-	LOG_CLEAR_CONTEXT();
-	return ret;
-}
-
-int ocfs_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) 
-{
-	int ret;
-
-	LOG_SET_CONTEXT(KVEC_WRITE);
-
-	ret = ocfs_kvec_rw(file, WRITE, cb, size, pos);
-
-	LOG_CLEAR_CONTEXT();
-	return ret;
-}
-#endif
-
-/*
  * ocfs_inode_revalidate()
  *
  * In 2.4, this is called only from stat.c always without i_sem before
Index: src/aops.c
===================================================================
--- src/aops.c	(revision 0)
+++ src/aops.c	(revision 0)
@@ -0,0 +1,677 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel, Mark Fasheh, Sunil Mushran, Wim Coekaerts,
+ *	    Manish Singh, Neeraj Goyal, Suchit Kaura
+ */
+#include "ocfs_compat.h"
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <asm/byteorder.h>
+
+#include "ocfs_log.h"
+#include "ocfs.h"
+
+#include "alloc.h"
+#include "buffer_head_io.h"
+#include "file.h"
+#include "inode.h"
+#include "ocfs_journal.h"
+
+
+#define OCFS_DEBUG_CONTEXT    OCFS_DEBUG_CONTEXT_INODE
+
+static int ocfs_symlink_get_block(struct inode *inode, sector_t iblock,
+		struct buffer_head *bh_result, int create)
+{
+	int err = -EIO;
+	int status;
+	ocfs2_dinode *fe = NULL;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *buffer_cache_bh = NULL;
+	ocfs_super *osb = OCFS_SB(inode->i_sb);
+	void *kaddr;
+
+	LOG_ENTRY_ARGS("(0x%p, %llu, 0x%p, %d)\n", inode,
+			(unsigned long long)iblock, bh_result, create);
+
+	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
+		LOG_ERROR_ARGS ("block offset > PATH_MAX: %llu",
+				(unsigned long long)iblock);
+		goto bail;
+	}
+
+	status = ocfs_read_bh(OCFS_SB(inode->i_sb),
+			      OCFS_I(inode)->ip_blkno <<
+			      	inode->i_sb->s_blocksize_bits,
+			      &bh,
+			      OCFS_BH_CACHED, inode);
+	if (status < 0) {
+		LOG_ERROR_STATUS (status);
+		goto bail;
+	}
+	fe = (ocfs2_dinode *) bh->b_data;
+
+	if (!IS_VALID_FILE_ENTRY(fe)) {
+		LOG_ERROR_ARGS("Invalid fe at blkno %llu",
+			       OCFS_I(inode)->ip_blkno);
+		goto bail;
+	}
+
+	if ((u64)iblock >= ocfs_clusters_to_blocks(inode->i_sb,
+					      fe->i_clusters)) {
+		LOG_ERROR_ARGS ("block offset is outside the allocated size: %llu",
+		     (unsigned long long)iblock);
+		goto bail;
+	}
+
+	/* We don't use the page cache to create symlink data, so if
+	 * need be, copy it over from the buffer cache. */
+	if (!buffer_uptodate(bh_result) && !ocfs_inode_is_new(osb, inode)) {
+		buffer_cache_bh = sb_getblk(osb->sb, 
+					    fe->id2.i_list.l_recs[0].e_blkno + iblock);
+		if (!buffer_cache_bh) {
+			LOG_ERROR_STR("couldn't getblock for symlink!");
+			goto bail;
+		}
+
+		/* we haven't locked out transactions, so a commit
+		 * could've happened. Since we've got a reference on
+		 * the bh, even if it commits while we're doing the
+		 * copy, the data is still good. */
+		if (buffer_jbd(buffer_cache_bh) 
+		    && !ocfs_inode_is_new(osb, inode)) {
+			kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
+			if (!kaddr) {
+				LOG_ERROR_ARGS("couldn't kmap!\n");
+				goto bail;
+			}
+			memcpy(kaddr + (bh_result->b_size * iblock), 
+			       buffer_cache_bh->b_data, 
+			       bh_result->b_size);
+			kunmap_atomic(kaddr, KM_USER0);
+			set_buffer_uptodate(bh_result);
+		}
+		brelse(buffer_cache_bh);
+	}
+
+	map_bh(bh_result, inode->i_sb,
+	       fe->id2.i_list.l_recs[0].e_blkno + iblock);
+
+	err = 0;
+
+bail:
+	if (bh)
+		brelse(bh);
+
+	LOG_EXIT_INT (err);
+	return err;
+}
+
+static int ocfs_get_block(struct inode *inode, sector_t iblock,
+		struct buffer_head *bh_result, int create)
+{
+	int err = -EIO;
+	__s64 vbo = 0;
+	__s64 lbo = 0;
+	__u32 len;
+	int open_direct;
+
+	LOG_ENTRY_ARGS("(0x%p, %llu, 0x%p, %d)\n", inode,
+			(unsigned long long)iblock, bh_result, create);
+
+	if (OCFS_I(inode)->ip_flags & OCFS_INODE_SYSTEM_FILE) {
+		printk("get_block on system inode 0x%p (%lu)\n",
+		       inode, inode->i_ino);
+	}
+
+	open_direct = OCFS_I(inode)->ip_open_flags & OCFS_OIN_OPEN_FOR_DIRECTIO;
+
+	if (S_ISLNK(inode->i_mode)) {
+		/* this always does I/O for some reason. */
+		down_read(&OCFS_I(inode)->ip_io_sem);
+		err = ocfs_symlink_get_block (inode, iblock, bh_result, 
+					      create);
+		up_read(&OCFS_I(inode)->ip_io_sem);
+		goto bail;
+	}
+
+	vbo = (__s64) iblock << inode->i_sb->s_blocksize_bits;
+
+#if 0
+	if (!INODE_JOURNAL(inode) && vbo >= OCFS_I(inode)->ip_alloc_size) {
+		int vbo_pad;
+		
+		vbo_pad = inode->i_sb->s_blocksize;
+		vbo_pad -= vbo & (s64)(inode->i_sb->s_blocksize - 1);
+
+		LOG_TRACE_STR("Extending allocation");
+		LOG_ERROR_ARGS("extending inode %lu in get_block!!\n", 
+			       inode->i_ino);
+		down_write(&OCFS_I(inode)->ip_io_sem);
+		err = ocfs_extend_file(osb, vbo + vbo_pad, 
+				       NULL, inode, NULL, 0, NULL);
+		up_write(&OCFS_I(inode)->ip_io_sem);
+		if (err < 0) {
+			err = -ENOSPC;
+			LOG_ERROR_STATUS (err);
+			goto bail;
+		}
+	}
+#else
+	if (vbo >= OCFS_I(inode)->ip_alloc_size) {
+		err = -EIO;
+		LOG_ERROR_ARGS("Trying to extend in ocfs_get_block() "
+			"(inode %llu, blkno %llu, vbo %llu, alloc %llu)\n",
+			OCFS_I(inode)->ip_blkno, (u64)iblock, (u64)vbo,
+			OCFS_I(inode)->ip_alloc_size);
+		goto bail;
+	}
+#endif
+
+	len = inode->i_sb->s_blocksize;
+	if (!open_direct)
+		down_read(&OCFS_I(inode)->ip_extend_sem);
+	err = ocfs_lookup_file_allocation(OCFS2_SB(inode->i_sb),
+					  vbo, &lbo, len, NULL, 
+					  inode, open_direct);
+	if (!open_direct)
+		up_read(&OCFS_I(inode)->ip_extend_sem);
+
+	if (err < 0) {
+		LOG_ERROR_ARGS("vbo=%lld lbo=%lld len=%u", vbo, lbo, len);
+		goto bail;
+	}
+
+	map_bh(bh_result, inode->i_sb, lbo >> inode->i_sb->s_blocksize_bits);
+
+	err = 0;
+
+	if (bh_result->b_blocknr == 0) {
+		err = -EIO;
+		LOG_ERROR_ARGS ("vbo=%lld lbo=%lld len=%u, blkno=(%llu)\n",
+				vbo, lbo, len, 
+				OCFS_I(inode)->ip_blkno);
+	}
+
+	if (vbo < OCFS_I(inode)->ip_mmu_private)
+		goto bail;
+	if (!create)
+		goto bail;
+	if (vbo != OCFS_I(inode)->ip_mmu_private) {
+		LOG_ERROR_ARGS("Uh-oh, vbo = %lld, i_size = %llu, mmu = %llu, "
+			       "inode = %llu\n",
+			       vbo, inode->i_size, 
+			       OCFS_I(inode)->ip_mmu_private,
+			       OCFS_I(inode)->ip_blkno);
+		BUG();
+		err = -EIO;
+		goto bail;
+	}
+
+	bh_result->b_state |= (1UL << BH_New);
+	OCFS_I(inode)->ip_mmu_private += inode->i_sb->s_blocksize;
+
+bail:
+	if (err < 0)
+		err = -EIO;
+
+	LOG_EXIT_INT (err);
+	return err;
+}
+
+static int ocfs_readpage(struct file *file, struct page *page)
+{
+	int ret;
+
+	LOG_SET_CONTEXT(READPAGE);
+	LOG_ENTRY_ARGS("(0x%p, %lu)\n", file, (page ? page->index : 0));
+
+	ret = block_read_full_page(page, ocfs_get_block);
+
+	LOG_EXIT_INT(ret);
+	LOG_CLEAR_CONTEXT();
+	return ret;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+static int ocfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int ret;
+
+	LOG_SET_CONTEXT(WRITEPAGE);
+	LOG_ENTRY_ARGS("(0x%p)\n", page);
+
+	ret = block_write_full_page(page, ocfs_get_block, wbc);
+
+	LOG_EXIT_INT(ret);
+	LOG_CLEAR_CONTEXT();
+	return ret;
+}
+#else
+static int ocfs_writepage(struct page *page)
+{
+	int ret;
+
+	LOG_SET_CONTEXT(WRITEPAGE);
+	LOG_ENTRY_ARGS("(0x%p)\n", page);
+
+	ret = block_write_full_page(page, ocfs_get_block);
+
+	LOG_EXIT_INT(ret);
+	LOG_CLEAR_CONTEXT();
+	return ret;
+}
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+inline void __mark_dirty(struct buffer_head *bh)
+{
+	set_buffer_flushtime(bh);
+	refile_buffer(bh);
+}
+
+static int __block_commit_write(struct inode *inode, struct page *page,
+		unsigned from, unsigned to)
+{
+	unsigned block_start, block_end;
+	int partial = 0, need_balance_dirty = 0;
+	unsigned blocksize;
+	struct buffer_head *bh, *head;
+
+	blocksize = 1 << inode->i_blkbits;
+
+	for(bh = head = page->buffers, block_start = 0;
+	    bh != head || !block_start;
+	    block_start=block_end, bh = bh->b_this_page) {
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (!buffer_uptodate(bh))
+				partial = 1;
+		} else {
+			set_bit(BH_Uptodate, &bh->b_state);
+			if (!atomic_set_buffer_dirty(bh)) {
+				__mark_dirty(bh);
+				buffer_insert_inode_data_queue(bh, inode);
+				need_balance_dirty = 1;
+			}
+		}
+	}
+
+	if (need_balance_dirty)
+		balance_dirty();
+	/*
+	 * is this a partial write that happened to make all buffers
+	 * uptodate then we can optimize away a bogus readpage() for
+	 * the next read(). Here we 'discover' whether the page went
+	 * uptodate as a result of this (potentially partial) write.
+	 */
+	if (!partial)
+		SetPageUptodate(page);
+	return 0;
+}
+
+static int ocfs2_cont_prepare_write(struct page *page, unsigned offset,
+		unsigned to, get_block_t *get_block, loff_t *bytes)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	struct page *new_page;
+	unsigned long pgpos;
+	long status;
+	unsigned zerofrom;
+	unsigned blocksize = 1 << inode->i_blkbits;
+	char *kaddr;
+
+	while (page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
+		status = -ENOMEM;
+		new_page = grab_cache_page(mapping, pgpos);
+		if (!new_page)
+			goto out;
+		/* we might sleep */
+		if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
+			unlock_page(new_page);
+			page_cache_release(new_page);
+			continue;
+		}
+		zerofrom = *bytes & ~PAGE_CACHE_MASK;
+		if (zerofrom & (blocksize-1)) {
+			*bytes |= (blocksize-1);
+			(*bytes)++;
+		}
+		status = block_prepare_write(new_page, zerofrom,
+					     PAGE_CACHE_SIZE, get_block);
+		if (status)
+			goto out_unmap;
+		kaddr = page_address(new_page);
+		memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
+		flush_dcache_page(new_page);
+		__block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
+		kunmap(new_page);
+		unlock_page(new_page);
+		page_cache_release(new_page);
+	}
+
+	if (page->index < pgpos) {
+		/* completely inside the area */
+		zerofrom = offset;
+	} else {
+		/* page covers the boundary, find the boundary offset */
+		zerofrom = *bytes & ~PAGE_CACHE_MASK;
+
+		/* if we will expand the thing last block will be filled */
+		if (to > zerofrom && (zerofrom & (blocksize-1))) {
+			*bytes |= (blocksize-1);
+			(*bytes)++;
+		}
+
+		/* starting below the boundary? Nothing to zero out */
+		if (offset <= zerofrom)
+			zerofrom = offset;
+	}
+	status = block_prepare_write(page, zerofrom, to, get_block);
+	if (status)
+		goto out1;
+	kaddr = page_address(page);
+	if (zerofrom < offset) {
+		memset(kaddr+zerofrom, 0, offset-zerofrom);
+		flush_dcache_page(page);
+		__block_commit_write(inode, page, zerofrom, offset);
+	}
+	return 0;
+out1:
+	ClearPageUptodate(page);
+	kunmap(page);
+	return status;
+
+out_unmap:
+	ClearPageUptodate(new_page);
+	kunmap(new_page);
+	UnlockPage(new_page);
+	page_cache_release(new_page);
+out:
+	return status;
+}
+
+/* Mark's favorite hack */
+#undef cont_prepare_write
+#define cont_prepare_write ocfs2_cont_prepare_write
+#endif  /* < 2.6.0 */
+
+/*
+ * ocfs_prepare_write()
+ *
+ */
+static int ocfs_prepare_write(struct file *file, struct page *page,
+		unsigned from, unsigned to)
+{
+	int ret;
+
+	LOG_SET_CONTEXT(PREPARE_WRITE);
+	LOG_ENTRY_ARGS("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
+
+	ret = cont_prepare_write(page, from, to, ocfs_get_block,
+		&(OCFS_I(page->mapping->host)->ip_mmu_private));
+
+	LOG_EXIT_INT(ret);
+	LOG_CLEAR_CONTEXT();
+	return ret;
+}
+
+/*
+ * ocfs_commit_write()
+ *
+ */
+static int ocfs_commit_write(struct file *file, struct page *page,
+		unsigned from, unsigned to)
+{
+	int ret;
+
+	LOG_SET_CONTEXT(COMMIT_WRITE);
+	LOG_ENTRY_ARGS("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
+
+	ret = generic_commit_write(file, page, from, to);
+
+	LOG_EXIT_INT(ret);
+	LOG_CLEAR_CONTEXT();
+	return ret;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+static sector_t ocfs_bmap(struct address_space *mapping, sector_t block) 
+#else
+static int ocfs_bmap(struct address_space *mapping, long block) 
+#endif
+{
+	int disk_block = 0;
+	ocfs_super *osb = OCFS_SB(mapping->host->i_sb);
+	__s64 vbo = 0;
+	__s64 lbo = 0;
+	__u32 len;
+	int err = 0, status;
+	struct inode *inode = mapping->host;
+
+	LOG_SET_CONTEXT(BMAP);
+	LOG_ENTRY_ARGS("(block = %llu)\n", (unsigned long long)block);
+
+	if (!INODE_JOURNAL(inode)) {
+		LOG_ERROR_STR("bmap is only for journal inodes!");
+		err = -EINVAL;
+		LOG_ERROR_STATUS(err);
+		goto bail;
+	}
+
+	vbo = (__s64) block << inode->i_sb->s_blocksize_bits;
+	len = osb->sb->s_blocksize;
+	err = ocfs_lookup_file_allocation(osb, vbo, &lbo, len, NULL, 
+					   inode, 1);
+	if (err < 0) {
+		LOG_ERROR_ARGS ("vbo=%lld lbo=%lld len=%u", vbo,
+				lbo, len);
+		LOG_ERROR_STATUS(err);
+		goto bail;
+	}
+
+	disk_block = lbo >> inode->i_sb->s_blocksize_bits;
+
+bail:
+	status = err ? err : disk_block;
+
+	LOG_EXIT_STATUS(status);
+	LOG_CLEAR_CONTEXT();
+	return status;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+/*
+ * TODO: Make this into a generic get_blocks function.
+ *
+ * From do_direct_io in direct-io.c:
+ *  "So what we do is to permit the ->get_blocks function to populate 
+ *   bh.b_size with the size of IO which is permitted at this offset and 
+ *   this i_blkbits."
+ *
+ * This function is called directly from get_more_blocks in direct-io.c.
+ *
+ * We should probably have this data in the oin for the inode.
+ * Otherwise, we might want to look at ocfs_rw_direct, 
+ *  ocfs_lookup_file_allocation and ocfs_get_block
+ *
+ * called like this: dio->get_blocks(dio->inode, fs_startblk,
+ * 					fs_count, map_bh, dio->rw == WRITE);
+ */
+static int ocfs_direct_IO_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks, struct buffer_head *bh_result, int create)
+{
+	int ret = -1;
+	int status;
+	ocfs_super *osb = NULL;
+	__s64 vbo; /* file offset */
+	__s64 lbo; /* logical (disk) offset */
+	__s64 vbo_max; /* file offset, max_blocks from iblock */
+	int set_new = 0; /* flag */
+	__u64 new_size; /* In bytes, the size of the contiguous block */
+	unsigned char blocksize_bits;
+
+	if (!inode || !bh_result) {
+		LOG_ERROR_STR("ocfs_direct_IO_get_blocks: inode or bh_result is null");
+		return -EIO;
+	}
+
+	osb = inode->i_sb->s_fs_info;
+	blocksize_bits = inode->i_sb->s_blocksize_bits;
+	/* make sure we're up to date... */
+	if (atomic_read(&OCFS_I(inode)->ip_needs_verification)) {
+		LOG_TRACE_STR ("ocfs_direct_IO_get_blocks: verify oin.");
+		status = ocfs_verify_update_inode (osb, inode, 0);
+		if (status < 0) {
+			LOG_TRACE_STR ("ocfs_verify_update_inode failed");
+			ret = -EIO;
+			goto bail;
+		}
+	}
+
+	/* This function won't even be called if the request isn't all
+	 * nicely aligned and of the right size, so there's no need
+	 * for us to check any of that. */
+
+	vbo = (__s64) iblock << blocksize_bits;
+	vbo_max = vbo + ((__s64) max_blocks << blocksize_bits);
+
+	/* NOTE: create flag is set when we ?may? have to allocate some
+	   blocks for the file. */
+	if (create && vbo_max > OCFS_I(inode)->ip_alloc_size) {
+		/* WARNING: How much do we really want to extend the file? */
+		status = ocfs_extend_file(osb, vbo_max,
+					  NULL, inode, NULL, 0, NULL);
+		if (status < 0) {
+			status = -ENOSPC;
+			LOG_ERROR_STR("ocfs_direct_IO_get_blocks: failed to extend the file!");
+			goto bail;
+		}
+		set_new = 1;
+	}
+
+	/* This figures out the size of the next contiguous block, and
+	 * our logical offset */	
+	/* TODO: Try our damndest to give sizes in multiples of PAGE_SIZE */
+	status = ocfs_lookup_file_allocation(osb, vbo, &lbo, max_blocks << blocksize_bits, 
+					     &new_size, inode, 1);
+
+	/* Do whatever we need to the buffer_head */
+	if (set_new) {
+		set_buffer_new(bh_result);
+		/* Do we really want to set bh_result->b_blocknr here too? */
+		bh_result->b_blocknr = lbo >> blocksize_bits;
+	} else {
+		clear_buffer_new(bh_result);
+		/* is the last argument here correct? */
+		map_bh(bh_result, inode->i_sb, lbo >> blocksize_bits);
+	}
+
+	/* make sure we don't map more than max_blocks blocks here as
+	   that's all the kernel will handle at this point. */
+	if (new_size > (__u64)max_blocks << blocksize_bits)
+		new_size = (__u64)max_blocks << blocksize_bits;
+	bh_result->b_size = new_size;
+
+	ret = 0;
+bail:
+	return ret;
+}
+
+/*
+ * ocfs_direct_IO()
+ * used to be: 
+ * static int ocfs_direct_IO (int rw,
+ *	       struct inode *inode,
+ *	       struct kiobuf *iobuf, unsigned long blocknr, int blocksize)
+ *
+ * now:         
+  static int ocfs_direct_IO(int rw, struct kiocb *iocb,
+			const struct iovec *iov, loff_t offset,
+			unsigned long nr_segs)
+ * int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
+ *                  loff_t offset, unsigned long nr_segs);
+ */
+static ssize_t ocfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
+	int ret;
+
+	LOG_SET_CONTEXT(DIRECT_IO);
+
+	LOG_ENTRY ();
+
+	/* blockdev_direct_IO checks alignment for us, using */
+	ret = blockdev_direct_IO (rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ocfs_direct_IO_get_blocks, NULL);
+
+	LOG_EXIT_INT (ret);
+
+	LOG_CLEAR_CONTEXT();
+	return ret;
+}				/* ocfs_direct_IO */
+
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,10)
+/*
+ * ocfs_direct_IO()
+ *
+ * we are not using this function anymore, in fact
+ * we should never get here any more
+ * so let's just BUG(), hint from sct at redhat.com
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20) || defined(SUSE)
+static int ocfs_direct_IO (int rw, struct file *filp, struct kiobuf *iobuf, unsigned long blocknr, int blocksize)
+{
+	BUG();
+	return 0;
+}				/* ocfs_direct_IO */
+#else
+static int ocfs_direct_IO (int rw, struct inode *inode, struct kiobuf *iobuf, unsigned long blocknr, int blocksize)
+{
+	BUG();
+	return 0;
+}				/* ocfs_direct_IO */
+#endif
+#endif  /* version >= 2.4.10 */
+
+
+struct address_space_operations ocfs_aops = {
+	.readpage	= ocfs_readpage,
+	.writepage	= ocfs_writepage,
+	.prepare_write	= ocfs_prepare_write,
+	.commit_write	= ocfs_commit_write,
+	.bmap		= ocfs_bmap,
+
+	/*
+	 * On a 2.4 system, we are only adding this here as a dummy basically.
+	 * Just need open with O_DIRECT to succeed, we still call
+	 * ocfs_rw_direct().
+	 *
+	 * For a 2.6 system, this is the way a filesystem provides
+	 * direct-io support. 
+	 */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,10)
+	.direct_IO	= ocfs_direct_IO
+#endif
+};
Index: src/24io.c
===================================================================
--- src/24io.c	(revision 0)
+++ src/24io.c	(revision 0)
@@ -0,0 +1,481 @@
+
+#include <linux/version.h>
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+
+#define KERNEL_NO_F_IOBUF
+#include "ocfs_compat.h"
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/iobuf.h>
+
+#include <asm/byteorder.h>
+
+#include "ocfs_log.h"
+#include "ocfs.h"
+
+#include "alloc.h"
+#include "dlm.h"
+#include "extmap.h"
+#include "file.h"
+#include "inode.h"
+#include "lockres.h"
+#include "namei.h"
+#include "super.h"
+#include "symlink.h"
+#include "sysfile.h"
+#include "util.h"
+#include "vote.h"
+
+#include "ocfs_journal.h"
+#include "buffer_head_io.h"
+
+#define OCFS_DEBUG_CONTEXT    OCFS_DEBUG_CONTEXT_INODE
+
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,18)
+# define free_kiovec_sz(nr, buf, bh)     free_kiovec(nr, buf)
+# define alloc_kiovec_sz(nr, buf, bh)    alloc_kiovec(nr, buf)
+#endif
+
+#if defined(SUSE) && LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20)
+#define OCFS_KIO_BLOCKS(_iobuf)  ((_iobuf)->kio_blocks)
+#else
+#define OCFS_KIO_BLOCKS(_iobuf)  ((_iobuf)->blocks)
+#endif
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,10)
+#define KERNEL_NO_F_IOBUF 1
+#elif defined(SUSE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20)
+#define KERNEL_NO_F_IOBUF 1
+#endif
+
+static int ocfs_get_block2 (struct inode *inode, long iblock, long *oblock, int len)
+{
+	int err = -EIO;
+	ocfs_super *osb;
+	__s64 vbo = 0;
+	__s64 lbo = 0;
+
+	LOG_ENTRY_ARGS ("(0x%p, %ld)\n", inode, iblock);
+
+	if (!inode) {
+		LOG_ERROR_STR ("bad inode");
+		err = -1;
+		goto bail;
+	}
+
+	osb = OCFS_SB(inode->i_sb);
+
+	vbo = (__s64) iblock << osb->s_sectsize_bits;
+	err = ocfs_lookup_file_allocation(osb, vbo, &lbo, len, NULL, 
+					   inode, 1);
+	if (err < 0) {
+		LOG_ERROR_STATUS (err);
+		err = -1;
+		goto bail;
+	}
+
+	err = 0;
+
+	*oblock = lbo >> osb->s_sectsize_bits;
+	if (*oblock == 0) {
+		err = -EIO;
+		LOG_ERROR_ARGS ("vbo=%lld lbo=%lld len=%u, blkno=(%llu)\n",
+				vbo, lbo, len, 
+				OCFS_I(inode)->ip_blkno);
+	}
+
+bail:
+	if (err < 0)
+		err = -EIO;
+	LOG_EXIT_INT (err);
+	return err;
+}				/* ocfs_get_block2 */
+
+/*
+ * ocfs_rw_direct()
+ *
+ */
+ssize_t ocfs_rw_direct (int rw, struct file *filp, char *buf, size_t size, loff_t * offp)
+{
+#ifdef KERNEL_NO_F_IOBUF
+	struct kiobuf *iobuf;
+#else
+	struct kiobuf *iobuf = filp->f_iobuf;
+	int new_iobuf = 0;
+#endif
+	int err = 0;
+	unsigned long blocknr, blocks, myiosize;
+	size_t transferred;
+	int iosize, clustersize;
+	int i;
+	struct inode *inode = filp->f_dentry->d_inode;
+	int max_sectors;
+	int nbhs;
+	int sector_size, sector_bits, sector_mask, sectors_per_page;
+	int ret = 0;
+	int large_io = 0;
+	int inuse = 0;
+	unsigned long blocks_end_cluster = 0;
+	loff_t saved_off;
+	size_t saved_size;
+	unsigned long firstlogic;
+	long firstphys;
+	long nextphys;
+	unsigned long nextlogic = 0;
+	unsigned long totalioblocks = 0;
+
+	saved_off = *offp;
+	saved_size = size;
+	
+	/* FIXME: Need to differentiate between sectors and blocksize */
+	sector_bits = OCFS_SB(inode->i_sb)->s_sectsize_bits;
+	sector_size = 1 << OCFS_SB(inode->i_sb)->s_sectsize_bits;
+	sector_mask = sector_size - 1;
+	sectors_per_page = PAGE_SIZE / sector_size;
+	/* max sectors is 1024 in 2.4.9
+	 * max data is 512kb  
+	 */
+
+	err = -EINVAL;
+	if (size == 0) {
+		printk("direct write of 0 byte\n");
+		return 0;
+	}
+
+	if (rw == READ) {
+	   if (inode->i_size <= *offp) /* read past end of file */
+	      return 0;
+	   if  (size > (inode->i_size - *offp))
+	      size = inode->i_size - *offp;
+	}
+
+	/* make sure aligned to either PAGE_SIZE or sect_size IO */
+#ifndef LARGEIOS
+	if ((*offp & sector_mask) || (size & sector_mask)) 
+	   /* if not, then fail, we need either to do dio */
+	   return err;
+
+	max_sectors = KIO_MAX_SECTORS;
+	large_io = 0;
+#endif
+#ifdef LARGEIOS	
+	if ((*offp & ~PAGE_MASK) || (size & ~PAGE_MASK)) {
+		/* if it's not PAGE_SIZE, then sect_size */
+		 if ((*offp & sector_mask) || (size & sector_mask))
+			 /* if not, then fail, we need either to do dio */
+			 return err;
+		 max_sectors = KIO_MAX_SECTORS; /* for 2.4.9 - 1024 */
+	} /* ok we 're PAGE_SIZE aligned, lets see if the buffer is */
+	else {
+		if (!((unsigned long) buf & ~PAGE_MASK)) {
+			/* yippie we are .. we can do PAGE_SIZE size io's */
+			large_io = 1;
+			/* for 2.4.9 */
+			max_sectors = KIO_MAX_SECTORS / sectors_per_page;
+		} else {
+			max_sectors = KIO_MAX_SECTORS;
+			large_io = 0;
+		}
+
+	}	
+#endif
+	/* find out how far we are to the end of our cluster */
+
+	err = 0;
+	if (size)
+		err = -ENXIO;
+
+	/* Split the IO into KIO_MAX_SECTORS chunks, mapping and */
+	/* unmapping the single kiobuf as we go to perform each chunk of IO. */
+
+	transferred = 0;
+	blocknr = *offp >> sector_bits;
+	clustersize = inode->i_blksize >> sector_bits;
+	myiosize = size >> sector_bits;
+	blocks_end_cluster = clustersize - (blocknr % clustersize);
+	firstlogic = blocknr;
+	totalioblocks = 0;
+
+	ret = ocfs_get_block2 (inode, blocknr, &firstphys, sector_size);
+	if (ret == -1) {
+		err = 0;
+		goto out;
+	}
+	while (myiosize > 0) {
+	    if (blocks_end_cluster + 1 > myiosize) {
+		totalioblocks += myiosize;
+		myiosize = 0;
+		goto doio;
+	    } else {
+		totalioblocks += blocks_end_cluster;
+		myiosize -= blocks_end_cluster;
+		nextlogic = firstlogic + blocks_end_cluster;
+	    }
+again:
+	    ret = ocfs_get_block2 (inode, nextlogic, &nextphys, sector_size);
+	    if (ret == -1) {
+		err = 0;
+		goto out;
+	    }
+	    if (nextphys == (firstphys + totalioblocks)) {
+		// merge ok
+		blocks_end_cluster = clustersize - (nextlogic % clustersize);
+		if (blocks_end_cluster + 1 > myiosize) {
+		   totalioblocks += myiosize;
+		   myiosize = 0;
+		} else {
+		   totalioblocks += blocks_end_cluster;
+		   myiosize -= blocks_end_cluster;
+		   nextlogic = nextlogic + blocks_end_cluster;
+		   goto again;
+		}
+	    }
+doio:
+		size = totalioblocks << sector_bits;
+		if (large_io)
+			nbhs = (size >> PAGE_SHIFT);
+		else
+			nbhs = (size >> sector_bits);
+		if (nbhs > max_sectors)
+			nbhs = max_sectors;
+
+#ifdef KERNEL_NO_F_IOBUF
+		err = alloc_kiovec_sz (1, &iobuf, &nbhs);
+		if (err)
+			goto out;
+#else
+		if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
+			/*
+			* A parallel read/write is using the preallocated iobuf
+			* so just run slow and allocate a new one.
+			*/
+			err = alloc_kiovec_sz (1, &iobuf, &nbhs);
+			if (err)
+				goto out;
+			new_iobuf = 1;
+		} else
+			new_iobuf = 0;
+				
+#endif
+		inuse = 1;
+
+		totalioblocks = 0;
+		while (size > 0) {
+			if (large_io) {
+				blocks = size >> PAGE_SHIFT;
+				if (blocks > max_sectors)
+					blocks = max_sectors;
+				iosize = blocks << PAGE_SHIFT;
+			} else {
+				blocks = size >> sector_bits;
+				if (blocks > max_sectors)
+					blocks = max_sectors;
+				iosize = blocks << sector_bits;
+			}
+			if (!blocks)
+				break;
+			err = map_user_kiobuf (rw, iobuf, (unsigned long) buf, iosize);
+			if (err)
+				break;
+			/* get the blocknr depending on io size for all blocks */
+			/* since we are always within the extent we only need to get the first block */
+			OCFS_KIO_BLOCKS(iobuf)[0] = firstphys + totalioblocks;
+
+			if (large_io) {
+				blocknr += sectors_per_page;
+				OCFS_KIO_BLOCKS(iobuf)[0] = OCFS_KIO_BLOCKS(iobuf)[0] / sectors_per_page;
+			} else {
+				blocknr++;
+			}
+
+			for (i = 1; i < blocks; i++) {
+				if (large_io) {
+					blocknr += sectors_per_page;
+				} else {
+					blocknr++;
+				}
+				OCFS_KIO_BLOCKS(iobuf)[i] = OCFS_KIO_BLOCKS(iobuf)[0] + i;
+			}
+			err = brw_kiovec (rw, 1, &iobuf, inode->i_dev, OCFS_KIO_BLOCKS(iobuf),
+					large_io ? PAGE_SIZE : sector_size);
+#ifdef SUSE
+			if (rw == READ &&  err > 0)
+				mark_dirty_kiobuf(iobuf, err);
+#endif
+			if (err >= 0) {
+				transferred += err;
+				size -= err;
+				buf += err;
+				if (large_io) {
+					totalioblocks +=
+					    (blocks * sectors_per_page);
+				} else {
+					totalioblocks += blocks;
+				}
+			} else {
+				printk( "ocfs_rw_direct : brw_kiovec() %d\n", err);	
+				break;
+			}
+			unmap_kiobuf (iobuf);
+			if (err != iosize)
+				break;
+		}
+#ifdef KERNEL_NO_F_IOBUF
+		free_kiovec_sz(1, &iobuf, &nbhs);
+#else
+		if (!new_iobuf)
+			clear_bit(0, &filp->f_iobuf_lock);
+		else
+			free_kiovec_sz(1, &iobuf, &nbhs);
+#endif
+		inuse = 0;
+		totalioblocks = 0;
+		firstlogic = nextlogic;
+		firstphys = nextphys;
+	}
+	if (transferred) {
+		*offp += transferred;
+		err = transferred;
+	}
+
+out:
+#ifdef KERNEL_NO_F_IOBUF
+	if (inuse)
+	   free_kiovec_sz (1, &iobuf, &nbhs);
+#else
+	if (inuse) {
+		if (!new_iobuf)
+			clear_bit(0, &filp->f_iobuf_lock);
+		else
+			free_kiovec_sz(1, &iobuf, &nbhs);
+	}
+#endif
+	return err;
+}				/* ocfs_rw_direct */
+
+#ifdef AIO_ENABLED
+static int ocfs_kvec_rw(struct file *filp, int rw, kvec_cb_t cb,
+		size_t size, loff_t pos)
+{
+	int	     err = 0;
+	int max_sectors = 25000;
+	struct inode *inode = filp->f_dentry->d_inode;
+	unsigned long blocknr, blocks, iosize,myiosize;
+	long firstphys;
+	int clustersize;
+	unsigned long blocks_end_cluster = 0;
+ 
+	/* FIXME: Need to differentiate between sectors and blocksize */
+	int sector_bits = OCFS_SB(inode->i_sb)->s_sectsize_bits;
+	int sector_size = 1 << OCFS_SB(inode->i_sb)->s_sectsize_bits;
+	int sector_mask = sector_size - 1;
+
+	int ret;
+	unsigned long firstlogic;
+	long nextphys;
+	unsigned long nextlogic = 0;
+	unsigned long totalioblocks = 0;
+
+	if (!size || (pos == inode->i_size)) {
+		cb.fn(cb.data, cb.vec, err);
+		return err;
+	}
+
+	err = -ENXIO;
+	if (pos >= inode->i_size) {
+		return err;
+	}
+
+	err = -EINVAL;
+	if ((pos < 0) || (pos & sector_mask) || (size & sector_mask)) {
+		return err;
+	}
+
+	blocknr = pos >> sector_bits;
+
+	blocks = size >> sector_bits;
+	if (blocks > max_sectors)
+		blocks = max_sectors;
+	if (!blocks) {
+		err = -ENXIO;
+		return err;
+	}
+
+	iosize = blocks << sector_bits;
+	clustersize = inode->i_blksize >> sector_bits;
+	blocks_end_cluster = clustersize - (blocknr % clustersize);
+	myiosize = size >> sector_bits;
+	firstlogic = blocknr;
+	totalioblocks = 0;
+
+	err = ocfs_get_block2(inode, blocknr, &firstphys, sector_size);
+	if ( err == -1 ) {
+		err = 0;
+		return err;
+	}
+		if (blocks_end_cluster + 1 > myiosize) {
+			totalioblocks += myiosize;
+			myiosize = 0;
+			goto doio;
+		} else {
+			totalioblocks += blocks_end_cluster;
+			myiosize -= blocks_end_cluster;
+			nextlogic = firstlogic + blocks_end_cluster;
+		}
+again:
+		ret = ocfs_get_block2 (inode, nextlogic, &nextphys, sector_size);
+		if (ret == -1) {
+			err = 0;
+			return err;
+		}
+	    if (nextphys == (firstphys + totalioblocks)) {
+		blocks_end_cluster = clustersize - (nextlogic % clustersize);
+		if (blocks_end_cluster + 1 > myiosize) {
+		   totalioblocks += myiosize;
+		   myiosize = 0;
+		} else {
+		   totalioblocks += blocks_end_cluster;
+		   myiosize -= blocks_end_cluster;
+		   nextlogic = nextlogic + blocks_end_cluster;
+		   goto again;
+		}
+	    }
+doio:
+	blocks = totalioblocks;
+	err = brw_kvec_async(rw, cb, inode->i_dev, blocks, firstphys, sector_bits);
+	return err;
+
+}
+
+int ocfs_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) 
+{
+	int ret;
+
+	LOG_SET_CONTEXT(KVEC_READ);
+
+	ret = ocfs_kvec_rw(file, READ, cb, size, pos);
+
+	LOG_CLEAR_CONTEXT();
+	return ret;
+}
+
+int ocfs_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) 
+{
+	int ret;
+
+	LOG_SET_CONTEXT(KVEC_WRITE);
+
+	ret = ocfs_kvec_rw(file, WRITE, cb, size, pos);
+
+	LOG_CLEAR_CONTEXT();
+	return ret;
+}
+
+#endif /* aio */
+#endif /* 2.6 */


More information about the Ocfs2-devel mailing list