[Ocfs2-commits] zab commits r2141 - in trunk: . commit-tests commit-tests/tests commit-tests/tests/aio-stress commit-tests/tests/basic-dual-data commit-tests/tests/fsx

Wed Apr 13 16:04:20 CDT 2005

Author: zab
Signed-off-by: mfasheh
Signed-off-by: khackel
Date: 2005-04-13 16:04:19 -0500 (Wed, 13 Apr 2005)
New Revision: 2141

Added:
   trunk/commit-tests/
   trunk/commit-tests/README
   trunk/commit-tests/run-tests.sh
   trunk/commit-tests/tests/
   trunk/commit-tests/tests/Makefile
   trunk/commit-tests/tests/aio-stress/
   trunk/commit-tests/tests/aio-stress/Makefile
   trunk/commit-tests/tests/aio-stress/aio-stress.c
   trunk/commit-tests/tests/aio-stress/test.sh
   trunk/commit-tests/tests/basic-dual-data/
   trunk/commit-tests/tests/basic-dual-data/test.sh
   trunk/commit-tests/tests/fsx/
   trunk/commit-tests/tests/fsx/Makefile
   trunk/commit-tests/tests/fsx/fsx-linux.c
   trunk/commit-tests/tests/fsx/test.sh
Log:
Add a test harness that will let us build-up quick regression tests as
we fix bugs.

Signed-off-by: mfasheh
Signed-off-by: khackel



Added: trunk/commit-tests/README
===================================================================

--- trunk/commit-tests/README	2005-04-13 20:58:03 UTC (rev 2140)
+++ trunk/commit-tests/README	2005-04-13 21:04:19 UTC (rev 2141)
@@ -0,0 +1,81 @@
+
+These tests are intended to be a way to ensure basic functionality without
+investing a huge amount of resources.  They are not stress tests, nor
+are they exhaustive functionality tests.  They are a simple means to
+add tests that are developed in the course of investigating a bug.  If you
+can test something with C and shell, you can fold it under this harness.
+
+It looks something like this:
+
+	run-tests.sh
+	tests/
+		$test-name/
+			test.sh
+			.. various test-related files ..
+
+Running the tests involves building the binaries needed for the tests and then
+running the harness while specifying a local mount and optional remote mounts:
+
+	$ cd commit-tests/tests && make
+	$ sh commit-tests/run-tests.sh -m /mnt/ocfs2 -r remote_host:/mnt/ocfs2
+
+run-tests.sh starts by mounting the given -m mount with the help of fstab.  The
+-r argument indicates a remote hostname and mountpoint on that host that will
+be used in the test.  Multiple -r arguments can be given.  Tests explicitly
+know to work with remote hosts and may or may not use all of them.  The scripts
+only run mount and unmount so cluster services and fstab should be
+pre-configured before running these scripts.
+
+The run-tests.sh script will locate and run each test.  After running them all
+it will output a summary of the results by listing all the tests which failed
+or which were invalid for whatever reason.  Everything is output to stdout so
+teeing to a logfile is not a bad idea.
+
+Writing a new test involves creating a directory for it and providing the
+test.sh script that does the actual work.  The test directory may want to tie
+into the build system by providing a Makefile so that test.sh can run binaries,
+etc.
+
+The test.sh is given some environment variables by the harness.
+
+	CT_FUNCTIONS
+		a file to source to get at some helper shell functions
+	CT_DIR
+		a directory in the filesystem that the test should create
+		its files in
+	CT_TMP
+		a temporary directory for the test that is not within
+		the testing file system
+	CT_TST
+		the tests/$foo directory for the test in the repository
+	CT_NUM_REMOTES
+		the number of remote nodes given with -r
+	CT_REMOTE_DIR_[nr]
+		The directory on remote node [nr] that CT_DIR can be reached
+		through.  So "ls $CT_DIR" and "remote 0 ls $CT_REMOTE_DIR_0"
+		should produce identical output.
+
+The functions that are given by ". $CT_FUNCTIONS" are as follows:
+
+	out [message]
+		output a message prefixed by some pretty formatting
+	invalid [message]
+		output a message and exit the test with the return code
+		that tells the harness that the test could not run 
+	fail [message]
+		output a message and exit the test with the return code
+		that tells the harness that the test failed
+	pass [message]
+		output a message and exit the test with the return code
+		that tells the harness that the test passed
+	needs_remotes [nr]
+		fails with invalid if the given number of remote nodes
+		aren't available at the time the test is run
+	remote [nr] [command]
+		runs the given command on the 0-based remote node number.  The
+		caller must have verified that there are > [nr] nodes available
+		by running needs_remotes earlier in the test.
+
+This is simply the first pass; there is a lot more we could do:
+
+	- teach run-tests to mount and unmount around each test, checking
+		for leaks
+	- multiple mounts per node

Added: trunk/commit-tests/run-tests.sh
===================================================================
--- trunk/commit-tests/run-tests.sh	2005-04-13 20:58:03 UTC (rev 2140)
+++ trunk/commit-tests/run-tests.sh	2005-04-13 21:04:19 UTC (rev 2141)
@@ -0,0 +1,216 @@
+#!/bin/bash
+
+#
+#	xxx
+#		move failed rundirs somewhere for inspection?
+
+# first some tiny helpers
+out() {
+	# every harness message carries the commit-tests "[ct]" tag
+	echo "[ct] $*"
+}
+die() {
+	out $@", exiting"
+	exit 1
+}
+fs_type() {
+	fstype=$(awk '($2 == "'$2'") {print $3}' $1)
+	[ -z "$fstype" ] && fstype="unknown"
+	echo $fstype
+}
+
+# print a short description of the command line options
+usage() {
+	echo "	-m mnt		a local mount point in fstab (required)"
+	echo "	-r host:mnt	a remote host and its mount point (repeatable)"
+}
+
+num_remotes=0
+# the leading ':' puts getopts in silent mode: a missing argument shows
+# up as ':' and an unknown option as '?', with the offending option
+# letter in OPTARG both times
+while getopts ":m:r:" opt; do
+	case "$opt" in
+		m)
+			mnt="$OPTARG"
+			;;
+		r)
+			remotes[$num_remotes]="$OPTARG"
+			num_remotes=$((num_remotes + 1))
+			;;
+		:)
+			usage
+			die "option -$OPTARG requires an argument"
+			;;
+		\?)
+			usage
+			die "unknown option -$OPTARG"
+			;;
+	esac
+done
+
+[ -z "$mnt" ] && die "an ocfs2 mount must be specified with -m"
+
+for rem in $(seq 0 $((num_remotes - 1)) ); do
+	host=$(echo ${remotes[$rem]} | cut -d: -f 1) \
+		|| die "bad remote argument: '${remotes[$rem]}'"
+	mount=$(echo ${remotes[$rem]} | cut -d: -f 2) \
+		|| die "bad remote argument: '${remotes[$rem]}'"
+
+	[ -z "$host" ]  && \
+		die "couldn't find host in remote argument: '${remotes[$rem]}'"
+	[ -z "$mount" ]  && \
+		die "couldn't find mount in remote argument: '${remotes[$rem]}'"
+
+	# XXX should check these better :/
+
+	hosts[$rem]=$host
+	mounts[$rem]=$mount
+done
+
+# mount the local file system by its fstab entry and make sure that
+# what ended up mounted there really is ocfs2
+out mounting "$mnt"
+mount "$mnt" || die "mount $mnt failed"
+
+# quoted so that an unexpected multi-word answer is a plain mismatch
+# rather than a [ syntax error that silently skips the die
+[ "$(fs_type /proc/mounts "$mnt")" != "ocfs2" ] && \
+	die "/proc/mounts doesn't think $mnt is an ocfs2 mount"
+
+# run the test from inside the tree
+ctdir=$(dirname $(readlink /proc/$$/fd/255)) || die "couldn't find ctdir"
+cd $ctdir || die "couldn't cd to $ctdir"
+
+# some directories that we create per run and then clean
+# up at the end
+tmpdir_template="/tmp/.ct-tmp-XXXXXX"
+tmpdir=$(mktemp -d "$tmpdir_template") || \
+	die "couldn't create tmp dir from template $tmpdir_template"
+cleanup() {
+	# only remove something whose name has the length of a name
+	# generated from our template
+	[ ${#tmpdir} == ${#tmpdir_template} ] && [ -d "$tmpdir" ] && \
+		rm -rf "$tmpdir"
+	# run dir is torn down as part of the tests
+}
+# the trap goes in before anything else can fail so that the tmp dir
+# is removed on every exit path from here on
+trap cleanup EXIT
+
+rundir_template="$mnt/.ct-run-XXXXXX"
+rundir=$(mktemp -d "$rundir_template") || \
+	die "couldn't create run dir from template $rundir_template"
+
+for rem in $(seq 0 $((num_remotes - 1)) ); do
+	out "mounting $mount on $host"
+	ssh ${hosts[$rem]} mount ${mounts[$rem]} || \
+		die "couldn't mount $mount on $host"
+
+	ssh ${hosts[$rem]} cat /proc/mounts > $tmpdir/mounts-$host || \
+		die "couldn't get /proc/mounts from $host"
+
+	if [ $(fs_type $tmpdir/mounts-$host ${mounts[$rem]}) != "ocfs2" ]; then
+		die "mount ${mounts[$rem]} on $host doesn't seem to be ocfs2"
+	fi
+done
+
+num_run=0
+num_invalid=0
+num_passed=0
+num_failed=0
+
+# run every tests/<name>/test.sh in turn.  each test gets its own
+# directories, a freshly generated file of helper functions, and reports
+# its result through its exit code: 1 invalid, 2 passed, 3 failed.
+#
+# the find output is word-split on purpose; feeding the loop through
+# stdin instead would let the ssh commands run by tests swallow the
+# remaining test names.
+for script in $(find tests -mindepth 2 -maxdepth 2 -name test.sh -type f); do
+	d=$(dirname "$script")
+	test=$(basename "$d")
+	out running test "$test"
+
+	# export variables that tests will use
+	export CT_FUNCTIONS="$tmpdir/functions"
+	export CT_DIR="$rundir/$test"
+	export CT_TMP="$tmpdir/$test"
+	export CT_TST="$d"
+	export CT_NUM_REMOTES="$num_remotes"
+	for ((rem = 0; rem < num_remotes; rem++)); do
+		# CT_DIR as seen through the remote node's mount point
+		export "CT_REMOTE_DIR_$rem=${mounts[$rem]}${rundir#"$mnt"}/$test"
+	done
+
+	mkdir "$CT_DIR" || die "couldn't make dir $CT_DIR"
+	mkdir "$CT_TMP" || die "couldn't make dir $CT_TMP"
+
+	# the helpers a test gets from ". $CT_FUNCTIONS".  \$ is left for
+	# the test's shell to expand, everything else is filled in now.
+	cat > "$CT_FUNCTIONS" <<-EOF || die "couldn't create test $test's functions"
+		out() {
+			echo "[ct-$test] \$*"
+		}
+		invalid() {
+			echo "[ct-$test] Invalid: \$*"
+			exit 1
+		}
+		pass() {
+			echo "[ct-$test] Pass: \$*"
+			exit 2
+		}
+		fail() {
+			echo "[ct-$test] Failed: \$*"
+			exit 3
+		}
+		needs_remotes() {
+			if [ "\$1" -gt "\$CT_NUM_REMOTES" ]; then
+				invalid "needs \$1 remote nodes"
+			fi
+		}
+		remote() {
+			local num=\$1
+			shift
+			case "\$num" in
+	EOF
+
+	# one case arm per remote node that was given on the command line
+	for ((rem = 0; rem < num_remotes; rem++)); do
+		cat >> "$CT_FUNCTIONS" <<-EOF
+				$rem)
+					ssh ${hosts[$rem]} "\$@"
+					;;
+		EOF
+	done
+
+	# the default arm has to be a bare *; an escaped one would only
+	# ever match a literal asterisk.  with no remotes at all this is
+	# the only arm, so any use of remote is reported as invalid.
+	cat >> "$CT_FUNCTIONS" <<-EOF || die "couldn't create test $test's functions"
+				*)
+					invalid "tried to use remote \$num"
+					;;
+			esac
+		}
+	EOF
+
+	sh "$script"
+	testret="$?"
+
+	num_run=$((num_run + 1))
+
+	case "$testret" in
+		1)
+			invalid="$invalid $test"
+			num_invalid=$((num_invalid + 1))
+			;;
+		2)
+			num_passed=$((num_passed + 1))
+			;;
+		3)
+			failed="$failed $test"
+			num_failed=$((num_failed + 1))
+			;;
+		*)
+			out "test $test is buggy, it returned $testret" \
+			    "instead of returning an explicit error code "
+			# count it as failed so the summary can't claim that
+			# everything passed when a test never reported a result
+			failed="$failed $test"
+			num_failed=$((num_failed + 1))
+			;;
+	esac
+done
+
+# summarize the run; the exit status is 1 if any test was invalid or
+# failed and 0 otherwise
+ret=0
+
+out "$num_run tests run"
+
+if [ "$num_invalid" != 0 ]; then
+	out $num_invalid invalid tests: $invalid
+	ret=1
+fi
+if [ "$num_failed" != 0 ]; then
+	out $num_failed failed tests: $failed
+	ret=1
+fi
+if [ "$ret" == 0 ]; then
+	out all tests passed
+fi
+
+out "removing our in-filesystem run dirs"
+# same sanity check as for the tmp dir: the name must have the length
+# of a name generated from the template before it is removed
+if [ ${#rundir} == ${#rundir_template} ] && [ -d "$rundir" ]; then
+	rm -rf "$rundir" || die "couldn't remove $rundir"
+fi
+
+for ((rem = 0; rem < num_remotes; rem++)); do
+	# look these up per remote so the messages name the node that
+	# is actually being unmounted
+	host="${hosts[$rem]}"
+	mount="${mounts[$rem]}"
+
+	out "unmounting $mount on $host"
+	ssh "$host" umount "$mount" || \
+		die "couldn't unmount $mount on $host"
+done
+
+out "unmounting $mnt"
+umount "$mnt" || die "couldn't unmount $mnt"
+
+exit $ret

Added: trunk/commit-tests/tests/Makefile
===================================================================
--- trunk/commit-tests/tests/Makefile	2005-04-13 20:58:03 UTC (rev 2140)
+++ trunk/commit-tests/tests/Makefile	2005-04-13 21:04:19 UTC (rev 2141)
@@ -0,0 +1,7 @@
+# top of the source tree, relative to commit-tests/tests
+TOPDIR = ../..
+
+include $(TOPDIR)/Preamble.make
+
+# the test directories that carry a Makefile of their own
+# NOTE(review): presumably Postamble.make recurses into these -- confirm
+SUBDIRS = fsx aio-stress
+
+include $(TOPDIR)/Postamble.make


Property changes on: trunk/commit-tests/tests/aio-stress
___________________________________________________________________
Name: svn:ignore
   + *.o
.*.d
aio-stress


Added: trunk/commit-tests/tests/aio-stress/Makefile
===================================================================
--- trunk/commit-tests/tests/aio-stress/Makefile	2005-04-13 20:58:03 UTC (rev 2140)
+++ trunk/commit-tests/tests/aio-stress/Makefile	2005-04-13 21:04:19 UTC (rev 2141)
@@ -0,0 +1,14 @@
+# top of the source tree, relative to commit-tests/tests/aio-stress
+TOPDIR = ../../..
+
+include $(TOPDIR)/Preamble.make
+
+# NOTE(review): SBIN_PROGRAMS and LINK are presumably consumed/defined by
+# Preamble.make and Postamble.make -- confirm there
+SBIN_PROGRAMS = aio-stress
+
+CFILES = aio-stress.c
+
+# one object per C file
+OBJS = $(subst .c,.o,$(CFILES))
+
+# aio-stress uses both pthreads and libaio
+aio-stress: $(OBJS)
+	$(LINK) -lpthread -laio
+
+include $(TOPDIR)/Postamble.make

Added: trunk/commit-tests/tests/aio-stress/aio-stress.c
===================================================================
--- trunk/commit-tests/tests/aio-stress/aio-stress.c	2005-04-13 20:58:03 UTC (rev 2140)
+++ trunk/commit-tests/tests/aio-stress/aio-stress.c	2005-04-13 21:04:19 UTC (rev 2141)
@@ -0,0 +1,1452 @@
+/*
+ * aio-stress
+ *
+ * will open or create each file on the command line, and start a series
+ * of aio to it.  
+ *
+ * aio is done in a rotating loop.  first file1 gets 8 requests, then
+ * file2, then file3 etc.  As each file finishes writing, it is switched
+ * to reads
+ *
+ * io buffers are aligned in case you want to do raw io
+ *
+ * compile with gcc -Wall -laio -lpthread -o aio-stress aio-stress.c
+ *
+ * run aio-stress -h to see the options
+ *
+ * Please mail Chris Mason (mason at suse.com) with bug reports or patches
+ */
+#define _FILE_OFFSET_BITS 64
+#define PROG_VERSION "0.20"
+#define NEW_GETEVENTS
+
+#include <stdio.h>
+#include <errno.h>
+#include <assert.h>
+#include <stdlib.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <libaio.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <pthread.h>
+
+#define IO_FREE 0
+#define IO_PENDING 1
+
+#ifndef O_DIRECT
+#define O_DIRECT         040000 /* direct disk access hint */
+#endif
+
+enum {
+    WRITE,
+    READ,
+    RWRITE,
+    RREAD,
+    LAST_STAGE,
+};
+
+#define USE_MALLOC 0
+#define USE_SHM 1
+#define USE_SHMFS 2
+
+/* 
+ * various globals, these are effectively read only by the time the threads
+ * are started
+ */
+long stages = 0;
+unsigned long page_size_mask;
+int o_direct = 0;
+int o_sync = 0;
+int latency_stats = 0;
+int completion_latency_stats = 0;
+int io_iter = 8;
+int max_io_submit = 0;
+long rec_len = 64 * 1024;
+int depth = 64;
+int num_threads = 1;
+int num_contexts = 1;
+off_t context_offset = 2 * 1024 * 1024;
+int fsync_stages = 1;
+int use_shm = 0;
+int shm_id;
+char *unaligned_buffer = NULL;
+char *aligned_buffer = NULL;
+int padded_reclen = 0;
+int stonewall = 1;
+int verify = 0;
+char *verify_buf = NULL;
+
+struct io_unit;
+struct thread_info;
+
+/* pthread mutexes and other globals for keeping the threads in sync */
+pthread_cond_t stage_cond = PTHREAD_COND_INITIALIZER;
+pthread_mutex_t stage_mutex = PTHREAD_MUTEX_INITIALIZER;
+int threads_ending = 0;
+int threads_starting = 0;
+struct timeval global_stage_start_time;
+struct thread_info *global_thread_info;
+
+/* 
+ * latencies during io_submit are measured, these are the 
+ * granularities for deviations 
+ */
+#define DEVIATIONS 6
+int deviations[DEVIATIONS] = { 100, 250, 500, 1000, 5000, 10000 };
+/*
+ * running latency statistics.  samples are added by calc_latency(),
+ * which converts them to milliseconds, and the whole struct is zeroed
+ * again once print_lat() has reported it.
+ */
+struct io_latency {
+    /* largest sample seen */
+    double max;
+    /* smallest sample seen; zero is treated as "not set yet" */
+    double min;
+    /* number of samples */
+    double total_io;
+    /* sum of all samples */
+    double total_lat;
+    /*
+     * deviations[i] counts the samples that were below the global
+     * deviations[i] threshold but not below any earlier threshold
+     */
+    double deviations[DEVIATIONS]; 
+};
+
+/* container for a series of operations to a file */
+struct io_oper {
+    /* already open file descriptor, valid for whatever operation you want */
+    int fd;
+
+    /* starting byte of the operation */
+    off_t start;
+
+    /* ending byte of the operation */
+    off_t end;
+
+    /* size of the read/write buffer */
+    int reclen;
+
+    /* max number of pending requests before a wait is triggered */
+    int depth;
+
+    /* current number of pending requests */
+    int num_pending;
+
+    /* last error, zero if there were none */
+    int last_err;
+
+    /* total number of errors hit. */
+    int num_err;
+
+    /* read,write, random, etc */
+    int rw;
+
+    /* number of ios that will get sent to aio */
+    int total_ios;
+
+    /* number of ios we've already sent */
+    int started_ios;
+
+    /* last offset used in an io operation */
+    off_t last_offset;
+
+    /* stonewalled = 1 when we got cut off before submitting all our ios */
+    int stonewalled;
+
+    /* list management */
+    struct io_oper *next;
+    struct io_oper *prev;
+
+    /*
+     * taken by build_oper() when the first io of a stage is built,
+     * print_time() measures the stage's runtime from it
+     */
+    struct timeval start_time;
+
+    /* name used for this file in messages, not copied by create_oper() */
+    char *file_name;
+};
+
+/* a single io, and all the tracking needed for it */
+struct io_unit {
+    /*
+     * note, iocb must go first!  pointers to the iocb are cast straight
+     * back to the io_unit that contains them.
+     */
+    struct iocb iocb;
+
+    /* pointer to parent io operation struct */
+    struct io_oper *io_oper;
+
+    /* aligned buffer */
+    char *buf;
+
+    /* size of the aligned buffer (record size) */
+    int buf_size;
+
+    /* state of this io unit (free, pending, done) */
+    int busy;
+
+    /* result of last operation */
+    long res;
+
+    /* next unit on the owning thread's free_ious list */
+    struct io_unit *next;
+
+    struct timeval io_start_time;		/* time of io_submit */
+};
+
+/* everything one worker thread needs to drive its share of the io */
+struct thread_info {
+    /* aio context that this thread's ios are submitted to and reaped from */
+    io_context_t io_ctx;
+    /* NOTE(review): presumably the pthread running this struct -- confirm */
+    pthread_t tid;
+
+    /* allocated array of io_unit structs */
+    struct io_unit *ios;
+
+    /* list of io units available for io */
+    struct io_unit *free_ious;
+
+    /* number of io units in the ios array */
+    int num_global_ios;
+
+    /* number of io units in flight */
+    int num_global_pending;
+
+    /* preallocated array of iocb pointers, only used in run_active */
+    struct iocb **iocbs;
+
+    /* preallocated array of events */
+    struct io_event *events;
+
+    /* size of the events array */
+    int num_global_events;
+
+    /* latency stats for io_submit */
+    struct io_latency io_submit_latency;
+
+    /* list of operations still in progress, and of those finished */
+    struct io_oper *active_opers;
+    struct io_oper *finished_opers;
+
+    /* number of files this thread is doing io on */
+    int num_files;
+
+    /* how much io this thread did in the last stage */
+    double stage_mb_trans;
+
+    /* latency completion stats i/o time from io_submit until io_getevents */
+    struct io_latency io_completion_latency;
+};
+
+/*
+ * elapsed time from start_tv to stop_tv in seconds, as a double.
+ * a stop time that lies before the start time gives zero.
+ */
+static double time_since(struct timeval *start_tv, struct timeval *stop_tv)
+{
+    double secs = stop_tv->tv_sec - start_tv->tv_sec;
+    double usecs = stop_tv->tv_usec - start_tv->tv_usec;
+    double elapsed;
+
+    /* borrow a second when the microsecond field went backwards */
+    if (secs > 0 && usecs < 0) {
+        secs -= 1;
+        usecs += 1000000;
+    }
+
+    elapsed = secs + usecs / (double)1000000;
+    return elapsed < 0 ? 0 : elapsed;
+}
+
+/*
+ * seconds elapsed between start_tv and the current time of day
+ */
+static double time_since_now(struct timeval *start_tv)
+{
+    struct timeval now;
+
+    gettimeofday(&now, NULL);
+    return time_since(start_tv, &now);
+}
+
+/*
+ * fold one start/stop interval, converted to milliseconds, into the
+ * running statistics in lat
+ */
+static void calc_latency(struct timeval *start_tv, struct timeval *stop_tv,
+			struct io_latency *lat)
+{
+    double ms = time_since(start_tv, stop_tv) * 1000;
+    int bucket = 0;
+
+    if (ms > lat->max)
+        lat->max = ms;
+    /* a min of zero means nothing has been recorded yet */
+    if (lat->min == 0 || ms < lat->min)
+        lat->min = ms;
+
+    lat->total_io++;
+    lat->total_lat += ms;
+
+    /* only the first threshold the sample fits under is credited */
+    while (bucket < DEVIATIONS) {
+        if (ms < deviations[bucket]) {
+            lat->deviations[bucket]++;
+            break;
+        }
+        bucket++;
+    }
+}
+
+/*
+ * append oper to the circular, doubly linked list headed by *list.
+ * an empty list is started with oper as its only element.
+ */
+static void oper_list_add(struct io_oper *oper, struct io_oper **list)
+{
+    struct io_oper *head = *list;
+    struct io_oper *tail;
+
+    if (head == NULL) {
+        oper->next = oper;
+        oper->prev = oper;
+        *list = oper;
+        return;
+    }
+
+    /* the new element goes between the old tail and the head */
+    tail = head->prev;
+    oper->prev = tail;
+    oper->next = head;
+    tail->next = oper;
+    head->prev = oper;
+}
+
+/*
+ * unlink oper from the circular list headed by *list.  the list
+ * becomes empty when it only held one element, and the head moves on
+ * to the next element when the head itself is removed.
+ */
+static void oper_list_del(struct io_oper *oper, struct io_oper **list)
+{
+    struct io_oper *head = *list;
+
+    /* a single element list points back at itself in both directions */
+    if (head->next == head && head->prev == head) {
+        *list = NULL;
+        return;
+    }
+
+    oper->prev->next = oper->next;
+    oper->next->prev = oper->prev;
+    if (head == oper)
+        *list = oper->next;
+}
+
+/*
+ * worker func to check error fields in the io unit.
+ *
+ * returns -1, after recording the result as the parent oper's last
+ * error, when the io didn't transfer exactly buf_size bytes.  returns 0
+ * otherwise.  when verify is set the data of a completed READ is
+ * compared against verify_buf and the differing bytes are reported,
+ * but a mismatch does not change the return value.
+ */
+static int check_finished_io(struct io_unit *io) {
+    int i;
+    if (io->res != io->buf_size) {
+        /*
+         * res is signed.  only a negative result is handed to strerror,
+         * a short transfer has no errno to describe it.
+         */
+        fprintf(stderr, "io err %ld (%s) op %d, size %d\n",
+                io->res, io->res < 0 ? strerror(-io->res) : "short io",
+                io->iocb.aio_lio_opcode, io->buf_size);
+        io->io_oper->last_err = io->res;
+        io->io_oper->num_err++;
+        return -1;
+    }
+    if (verify && io->io_oper->rw == READ) {
+        if (memcmp(io->buf, verify_buf, io->io_oper->reclen)) {
+            fprintf(stderr, "verify error, file %s offset %lld contents (offset:bad:good):\n",
+                    io->io_oper->file_name,
+                    (long long)io->iocb.u.c.offset);
+
+            for (i = 0 ; i < io->io_oper->reclen ; i++) {
+                if (io->buf[i] != verify_buf[i]) {
+                    fprintf(stderr, "%d:%c:%c ", i, io->buf[i], verify_buf[i]);
+                }
+            }
+            fprintf(stderr, "\n");
+        }
+
+    }
+    return 0;
+}
+
+/*
+ * claim an io unit for oper: marks it pending and clears its result.
+ * returns -1 without touching the unit if it is already pending.
+ */
+static int grab_iou(struct io_unit *io, struct io_oper *oper) {
+    if (io->busy != IO_PENDING) {
+        io->io_oper = oper;
+        io->res = 0;
+        io->busy = IO_PENDING;
+        return 0;
+    }
+    return -1;
+}
+
+/* printable name for a stage number, "unknown" for anything else */
+char *stage_name(int rw) {
+    if (rw == WRITE)
+        return "write";
+    if (rw == READ)
+        return "read";
+    if (rw == RWRITE)
+        return "random write";
+    if (rw == RREAD)
+        return "random read";
+    return "unknown";
+}
+
+/* megabytes covered by the ios started so far in oper's current stage */
+static inline double oper_mb_trans(struct io_oper *oper) {
+    double bytes = (double)oper->started_ios * (double)oper->reclen;
+
+    return bytes / (double)(1024 * 1024);
+}
+
+/*
+ * report, on stderr, how much io oper's current stage started and the
+ * throughput that works out to since the stage's start time
+ */
+static void print_time(struct io_oper *oper) {
+    double elapsed = time_since_now(&oper->start_time);
+    double megs = oper_mb_trans(oper);
+    double rate = megs / elapsed;
+
+    fprintf(stderr, "%s on %s (%.2f MB/s) %.2f MB in %.2fs\n", 
+            stage_name(oper->rw), oper->file_name, rate, megs, elapsed);
+}
+
+/*
+ * print the statistics gathered in lat to stderr, labelled with str,
+ * and then reset lat so that the next stage starts from scratch.
+ *
+ * each bucket is printed as "<count> < <threshold>"; samples that
+ * didn't fit under any threshold are printed as a trailing "< <count>".
+ */
+static void print_lat(char *str, struct io_latency *lat) {
+    /* nothing recorded means there is no average to divide out */
+    double avg = lat->total_io ? lat->total_lat / lat->total_io : 0;
+    int i;
+    double total_counted = 0;
+    fprintf(stderr, "%s min %.2f avg %.2f max %.2f\n\t", 
+            str, lat->min, avg, lat->max);
+
+    for (i = 0 ; i < DEVIATIONS ; i++) {
+        fprintf(stderr, " %.0f < %d", lat->deviations[i], deviations[i]);
+        total_counted += lat->deviations[i];
+    }
+    if (total_counted && lat->total_io - total_counted)
+        fprintf(stderr, " < %.0f", lat->total_io - total_counted);
+    fprintf(stderr, "\n");
+    memset(lat, 0, sizeof(*lat));
+}
+
+/* print and reset the thread's io_submit latency statistics */
+static void print_latency(struct thread_info *t)
+{
+    print_lat("latency", &t->io_submit_latency);
+}
+
+/* print and reset the thread's completion latency statistics */
+static void print_completion_latency(struct thread_info *t)
+{
+    print_lat("completion latency", &t->io_completion_latency);
+}
+
+/*
+ * book-keeping for one completed io: records its completion latency
+ * and result, hands the io unit back to the thread's free list, drops
+ * the pending counts and checks the result.  the timing line for the
+ * parent operation is printed once its last pending io has come back
+ * and it either started all of its ios or was stonewalled.
+ */
+void finish_io(struct thread_info *t, struct io_unit *io, long result,
+		struct timeval *tv_now) {
+    struct io_oper *parent = io->io_oper;
+    int all_started;
+
+    calc_latency(&io->io_start_time, tv_now, &t->io_completion_latency);
+
+    io->res = result;
+    io->busy = IO_FREE;
+
+    /* push the unit onto the front of the free list */
+    io->next = t->free_ious;
+    t->free_ious = io;
+
+    parent->num_pending--;
+    t->num_global_pending--;
+    check_finished_io(io);
+
+    if (parent->num_pending != 0)
+        return;
+
+    all_started = (parent->started_ios == parent->total_ios);
+    if (all_started || parent->stonewalled)
+        print_time(parent);
+}
+
+/*
+ * reap completed ios for this thread and run finish_io() on each.
+ * asks for at least io_iter completions, or for as many as are in
+ * flight if that is fewer.  returns what io_getevents returned.
+ */
+int read_some_events(struct thread_info *t) {
+    struct timeval now;
+    int want = io_iter;
+    int got;
+    int idx;
+
+    if (t->num_global_pending < io_iter)
+        want = t->num_global_pending;
+
+#ifdef NEW_GETEVENTS
+    got = io_getevents(t->io_ctx, want, t->num_global_events, t->events,NULL);
+#else
+    got = io_getevents(t->io_ctx, t->num_global_events, t->events, NULL);
+#endif
+    if (got <= 0)
+        return got;
+
+    /* one timestamp serves every event reaped in this batch */
+    gettimeofday(&now, NULL);
+    for (idx = 0 ; idx < got ; idx++) {
+        struct io_event *ev = &t->events[idx];
+        struct io_unit *unit = (struct io_unit *)((unsigned long)ev->obj);
+
+        finish_io(t, unit, ev->res, &now);
+    }
+    return got;
+}
+
+/*
+ * hand out a free io unit, claimed for oper.  when the free list is
+ * empty, completions are reaped until a unit comes back.  returns null
+ * once reaping produces nothing.
+ */
+static struct io_unit *find_iou(struct thread_info *t, struct io_oper *oper)
+{
+    for (;;) {
+        struct io_unit *unit = t->free_ious;
+
+        if (unit != NULL) {
+            t->free_ious = unit->next;
+            if (grab_iou(unit, oper)) {
+                fprintf(stderr, "io unit on free list but not free\n");
+                abort();
+            }
+            return unit;
+        }
+
+        if (read_some_events(t) <= 0)
+            break;
+    }
+
+    fprintf(stderr, "no free ious after read_some_events\n");
+    return NULL;
+}
+
+/*
+ * wait for all pending requests for this io operation to finish.
+ *
+ * events are reaped one at a time from the thread's context, so ios
+ * that belong to other operations may be finished along the way.  the
+ * number of errors the operation has hit is reported on stderr.
+ * always returns 0.
+ */
+static int io_oper_wait(struct thread_info *t, struct io_oper *oper) {
+    struct io_event event;
+    struct io_unit *event_io;
+
+    if (oper->num_pending == 0)
+        goto done;
+
+    /* this func is not speed sensitive, no need to go wild reading
+     * more than one event at a time
+     */
+#ifdef NEW_GETEVENTS
+    while(io_getevents(t->io_ctx, 1, 1, &event, NULL) > 0) {
+#else
+    while(io_getevents(t->io_ctx, 1, &event, NULL) > 0) {
+#endif
+        struct timeval tv_now;
+        event_io = (struct io_unit *)((unsigned long)event.obj); 
+
+        gettimeofday(&tv_now, NULL);
+        finish_io(t, event_io, event.res, &tv_now);
+
+        if (oper->num_pending == 0)
+            break;
+    }
+done:
+    if (oper->num_err) {
+        /* both are signed ints, last_err holds the failed io's result */
+        fprintf(stderr, "%d errors on oper, last %d\n", 
+                oper->num_err, oper->last_err);
+    }
+    return 0;
+}
+
+/*
+ * pick the offset for the next io of a random stage.  the result is
+ * oper->start plus a random number of whole megabytes plus a random,
+ * page aligned number of bytes.
+ */
+off_t random_byte_offset(struct io_oper *oper) {
+    off_t num;
+    off_t rand_byte = oper->start;
+    off_t range;
+    off_t offset = 1;
+
+    /* size of the operation's region in whole megabytes */
+    range = (oper->end - oper->start) / (1024 * 1024);
+    /* pages bigger than a megabyte take that many megabytes off the range */
+    if ((page_size_mask+1) > (1024 * 1024))
+        offset = (page_size_mask+1) / (1024 * 1024);
+    if (range < offset)
+        range = 0;
+    else
+        range -= offset;
+
+    /* find a random mb offset */
+    /* note that the 1 + means at least one megabyte is always added */
+    num = 1 + (int)((double)range * rand() / (RAND_MAX + 1.0 ));
+    rand_byte += num * 1024 * 1024;
+    
+    /* find a random byte offset */
+    num = 1 + (int)((double)(1024 * 1024) * rand() / (RAND_MAX + 1.0));
+
+    /* page align */
+    num = (num + page_size_mask) & ~page_size_mask;
+    rand_byte += num;
+
+    /* step back by one record if the io would run past the end */
+    if (rand_byte + oper->reclen > oper->end) {
+	rand_byte -= oper->reclen;
+    }
+    return rand_byte;
+}
+
+/*
+ * get an io unit for oper and prepare its iocb for the operation's
+ * current stage.  the sequential stages carry on from last_offset and
+ * advance it by one record, the random stages pick a fresh offset and
+ * remember it in last_offset.  nothing is submitted here.
+ *
+ * returns null if no io unit could be found
+ */
+static struct io_unit *build_iocb(struct thread_info *t, struct io_oper *oper)
+{
+    struct io_unit *unit = find_iou(t, oper);
+    off_t where;
+
+    if (unit == NULL) {
+        fprintf(stderr, "unable to find io unit\n");
+        return NULL;
+    }
+
+    /* work out where the io goes first ... */
+    switch(oper->rw) {
+    case WRITE:
+    case READ:
+        where = oper->last_offset;
+        oper->last_offset += oper->reclen;
+        break;
+    case RREAD:
+    case RWRITE:
+        where = random_byte_offset(oper);
+        oper->last_offset = where;
+        break;
+    default:
+        /* any other stage leaves the iocb unprepared */
+        return unit;
+    }
+
+    /* ... and then which direction it goes in */
+    if (oper->rw == WRITE || oper->rw == RWRITE)
+        io_prep_pwrite(&unit->iocb, oper->fd, unit->buf, oper->reclen, where);
+    else
+        io_prep_pread(&unit->iocb, oper->fd, unit->buf, oper->reclen, where);
+
+    return unit;
+}
+
+/* 
+ * wait for any pending requests, and then close the operation's file
+ * descriptor and free the operation itself.  returns the last error
+ * the operation hit (zero means none)
+ */
+static int
+finish_oper(struct thread_info *t, struct io_oper *oper)
+{
+    /* same type as oper->last_err and as the return value */
+    int last_err;
+
+    io_oper_wait(t, oper);
+    /* has to be saved before oper is freed */
+    last_err = oper->last_err;
+    if (oper->num_pending > 0) {
+        fprintf(stderr, "oper num_pending is %d\n", oper->num_pending);
+    }
+    close(oper->fd);
+    free(oper);
+    return last_err;
+}
+
+/* 
+ * allocate a zeroed io operation covering bytes start to end of fd in
+ * records of reclen bytes, beginning in stage rw.  file_name is kept
+ * by reference.  returns null if the allocation fails
+ */
+static struct io_oper * 
+create_oper(int fd, int rw, off_t start, off_t end, int reclen, int depth,
+            int iter, char *file_name)
+{
+    struct io_oper *op = calloc(1, sizeof(*op));
+
+    if (op == NULL) {
+        fprintf(stderr, "unable to allocate io oper\n");
+        return NULL;
+    }
+
+    op->fd = fd;
+    op->file_name = file_name;
+    op->rw = rw;
+    op->depth = depth;
+    op->reclen = reclen;
+    op->start = start;
+    op->end = end;
+    op->last_offset = start;
+    /* only whole records count */
+    op->total_ios = (end - start) / reclen;
+
+    return op;
+}
+
+/*
+ * prepare up to num_ios iocbs for oper and store pointers to them in
+ * my_iocbs, without starting any io.  a num_ios of zero asks for all
+ * of the operation's ios, and the count is cut down to what the
+ * operation still has left to start.  the stage's start time is taken
+ * when nothing has been started yet.
+ *
+ * returns the number of iocbs prepared, or -1 if one couldn't be built
+ */
+int build_oper(struct thread_info *t, struct io_oper *oper, int num_ios, 
+               struct iocb **my_iocbs) 
+{
+    int want = num_ios;
+    int built;
+
+    if (oper->started_ios == 0)
+        gettimeofday(&oper->start_time, NULL);
+
+    if (want == 0)
+        want = oper->total_ios;
+    if (oper->started_ios + want > oper->total_ios)
+        want = oper->total_ios - oper->started_ios;
+
+    for (built = 0 ; built < want ; built++) {
+        struct io_unit *unit = build_iocb(t, oper);
+
+        if (unit == NULL)
+            return -1;
+        my_iocbs[built] = &unit->iocb;
+    }
+    return want;
+}
+
+/*
+ * account for nr freshly submitted iocbs: each one's parent operation
+ * gets one more pending and one more started io, and the io unit is
+ * stamped with the submit time
+ */
+static void update_iou_counters(struct iocb **my_iocbs, int nr,
+	struct timeval *tv_now) 
+{
+    struct iocb **cur = my_iocbs;
+    struct iocb **stop = my_iocbs + nr;
+
+    for ( ; cur < stop ; cur++) {
+        /* the iocb is the first member of its io_unit */
+        struct io_unit *unit = (struct io_unit *)(*cur);
+        struct io_oper *parent = unit->io_oper;
+
+        parent->num_pending++;
+        parent->started_ios++;
+        unit->io_start_time = *tv_now;
+    }
+}
+
+/*
+ * submits num_ios prepared iocbs from my_iocbs and records the latency
+ * of each io_submit call.  when only part of the batch is accepted, or
+ * the submit comes back with -EAGAIN, completions are reaped and the
+ * rest of the batch is submitted again.
+ *
+ * returns zero if all went well and -1 on any other submit error.
+ * aborts if a retry is needed but no completions could be reaped.
+ */
+int run_built(struct thread_info *t, int num_ios, struct iocb **my_iocbs) 
+{
+    int ret;
+    struct timeval start_time;
+    struct timeval stop_time;
+
+resubmit:
+    gettimeofday(&start_time, NULL);
+    ret = io_submit(t->io_ctx, num_ios, my_iocbs);
+    gettimeofday(&stop_time, NULL);
+    calc_latency(&start_time, &stop_time, &t->io_submit_latency);
+
+    if (ret != num_ios) {
+        /* some ios got through */
+        if (ret > 0) {
+            update_iou_counters(my_iocbs, ret, &stop_time);
+            my_iocbs += ret;
+            t->num_global_pending += ret;
+            num_ios -= ret;
+        }
+        /* 
+         * we've used all the requests allocated in aio_init, wait and
+         * retry
+         */
+        if (ret > 0 || ret == -EAGAIN) {
+            int old_ret = ret;
+            /*
+             * the assignment is parenthesised on its own so that ret
+             * holds what read_some_events returned rather than the
+             * outcome of the comparison
+             */
+            if ((ret = read_some_events(t)) > 0) {
+                goto resubmit;
+            } else {
+                fprintf(stderr, "ret was %d and now is %d\n", old_ret, ret);
+                abort();
+            }
+        }
+
+        fprintf(stderr, "ret %d (%s) on io_submit\n", ret, strerror(-ret));
+        return -1;
+    }
+    update_iou_counters(my_iocbs, ret, &stop_time);
+    t->num_global_pending += ret;
+    return 0;
+}
+
+/* 
+ * changes oper->rw to the next in a command sequence, or returns zero
+ * to say this operation is really, completely done for.
+ *
+ * the next stage is the first one after the current stage whose bit is
+ * set in the global stages mask.  an operation that has hit an error
+ * is never restarted.  returns 1 when the operation was reset for a
+ * new stage.
+ */
+static int restart_oper(struct io_oper *oper) {
+    /* zero doubles as "nothing found"; it is also WRITE, the first stage */
+    int new_rw  = 0;
+    if (oper->last_err)
+        return 0;
+
+    /* this switch falls through */
+    /* each case only picks a stage if no earlier case already did */
+    switch(oper->rw) {
+    case WRITE:
+	if (stages & (1 << READ))
+	    new_rw = READ;
+    case READ:
+	if (!new_rw && stages & (1 << RWRITE))
+	    new_rw = RWRITE;
+    case RWRITE:
+	if (!new_rw && stages & (1 << RREAD))
+	    new_rw = RREAD;
+    }
+
+    if (new_rw) {
+	/* the new stage starts over from the beginning of the region */
+	oper->started_ios = 0;
+	oper->last_offset = oper->start;
+	oper->stonewalled = 0;
+
+	/* 
+	 * we're restarting an operation with pending requests, so the
+	 * timing info won't be printed by finish_io.  Printing it here
+	 */
+	if (oper->num_pending)
+	    print_time(oper);
+
+	oper->rw = new_rw;
+	return 1;
+    } 
+    return 0;
+}
+
+/*
+ * decides whether oper may start its current stage yet, returning 1 if
+ * so and 0 if not.  only a sequential stage that hasn't started any
+ * io, on an operation that doesn't begin at byte zero, can be held
+ * back: it has to wait while its file is a regular file that is still
+ * smaller than the operation's starting offset.  exits if the file
+ * can't be stat'ed.
+ */
+static int oper_runnable(struct io_oper *oper) {
+    struct stat st;
+
+    /* already under way, or the first context in the file */
+    if (oper->started_ios != 0 || oper->start == 0)
+        return 1;
+
+    /* the random stages never wait */
+    if (oper->rw >= RWRITE)
+        return 1;
+
+    if (fstat(oper->fd, &st) < 0) {
+        perror("fstat");
+        exit(1);
+    }
+
+    return !(S_ISREG(st.st_mode) && st.st_size < oper->start);
+}
+
+/*
+ * runs through all the io operations on the active list, and starts
+ * a chunk of io on each.  If any io operations are completely finished,
+ * it either switches them to the next stage or puts them on the 
+ * finished list.
+ *
+ * this function stops after max_io_submit iocbs are sent down the 
+ * pipe, even if it has not yet touched all the operations on the 
+ * active list.  Any operations that have finished are moved onto
+ * the finished_opers list.
+ *
+ * always returns 0; the process exits if the submit itself fails.
+ */
+static int run_active_list(struct thread_info *t,
+			 int io_iter,
+			 int max_io_submit)
+{
+    struct io_oper *oper;
+    /* operations that have iocbs in the batch being put together */
+    struct io_oper *built_opers = NULL;
+    struct iocb **my_iocbs = t->iocbs;
+    int ret = 0;
+    int num_built = 0;
+
+    oper = t->active_opers;
+    while(oper) {
+	/* skip what can't run yet, giving up after one trip round the list */
+	if (!oper_runnable(oper)) {
+	    oper = oper->next;
+	    if (oper == t->active_opers)
+	        break;
+	    continue;
+	}
+	ret = build_oper(t, oper, io_iter, my_iocbs);
+	if (ret >= 0) {
+	    my_iocbs += ret;
+	    num_built += ret;
+	    /* park the operation so the scan doesn't come across it again */
+	    oper_list_del(oper, &t->active_opers);
+	    oper_list_add(oper, &built_opers);
+	    oper = t->active_opers;
+	    /* stop when another full chunk wouldn't fit into the batch */
+	    if (num_built + io_iter > max_io_submit)
+	        break;
+	} else
+	    break;
+    }
+    if (num_built) {
+	/* everything that was built goes down in a single batch */
+	ret = run_built(t, num_built, t->iocbs);
+	if (ret < 0) {
+	    fprintf(stderr, "error %d on run_built\n", ret);
+	    exit(1);
+	}
+	/*
+	 * put the parked operations back on the active list, unless all
+	 * of their ios have been started by now
+	 */
+	while(built_opers) {
+	    oper = built_opers;
+	    oper_list_del(oper, &built_opers);
+	    oper_list_add(oper, &t->active_opers);
+	    if (oper->started_ios == oper->total_ios) {
+		oper_list_del(oper, &t->active_opers);
+		oper_list_add(oper, &t->finished_opers);
+	    }
+	}
+    }
+    return 0;
+}
+
+/*
+ * Issue IPC_RMID on the SysV shared memory segment shm_id.  Does
+ * nothing unless use_shm is USE_SHM; a shmctl failure is only reported.
+ */
+void drop_shm() {
+    struct shmid_ds info;
+
+    if (use_shm == USE_SHM && shmctl(shm_id, IPC_RMID, &info) != 0)
+        perror("shmctl IPC_RMID");
+}
+
+/*
+ * Initialise the aio context *io_ctx with io_queue_init(n, ...).
+ * Exits with status 3 if the call fails.
+ */
+void aio_setup(io_context_t *io_ctx, int n)
+{
+    int err = io_queue_init(n, io_ctx);
+
+    if (err == 0)
+        return;
+
+    fprintf(stderr, "io_queue_setup(%d) returned %d (%s)\n",
+            n, err, strerror(-err));
+    exit(3);
+}
+
+/*
+ * Allocate the io unit, iocb pointer and event arrays for one thread.
+ *
+ * t             thread whose arrays are set up
+ * num_files     number of operations assigned to the thread
+ * depth         io units allocated per operation
+ * reclen        bytes of each buffer that are initialised
+ * max_io_submit number of iocb pointer slots to allocate
+ *
+ * Each io unit takes the next padded_reclen bytes of the global
+ * aligned_buffer.  Returns 0 on success, -1 on allocation failure (in
+ * which case nothing stays allocated and the pointers are reset).
+ */
+int setup_ious(struct thread_info *t, 
+              int num_files, int depth, 
+	      int reclen, int max_io_submit) {
+    int i;
+    size_t num_ios = (size_t)num_files * depth;
+
+    /*
+     * the error path below frees all three arrays, so make sure the
+     * ones not allocated yet do not hold stale values
+     */
+    t->iocbs = NULL;
+    t->events = NULL;
+
+    t->ios = calloc(num_ios, sizeof(*t->ios));
+    if (!t->ios) {
+	fprintf(stderr, "unable to allocate io units\n");
+	return -1;
+    }
+
+    for (i = 0 ; i < depth * num_files; i++) {
+	t->ios[i].buf = aligned_buffer;
+	aligned_buffer += padded_reclen;
+	t->ios[i].buf_size = reclen;
+	if (verify)
+	    memset(t->ios[i].buf, 'b', reclen);
+	else
+	    memset(t->ios[i].buf, 0, reclen);
+	/* push the unit on the thread's free list */
+	t->ios[i].next = t->free_ious;
+	t->free_ious = t->ios + i;
+    }
+    if (verify) {
+        /* the reference pattern is placed right after the io buffers */
+        verify_buf = aligned_buffer;
+        memset(verify_buf, 'b', reclen);
+    }
+
+    t->iocbs = calloc(max_io_submit, sizeof(struct iocb *));
+    if (!t->iocbs) {
+        fprintf(stderr, "unable to allocate iocbs\n");
+	goto free_buffers;
+    }
+
+    t->events = calloc(num_ios, sizeof(struct io_event));
+    if (!t->events) {
+        fprintf(stderr, "unable to allocate ram for events\n");
+	goto free_buffers;
+    }
+
+    t->num_global_ios = num_files * depth;
+    t->num_global_events = t->num_global_ios;
+    return 0;
+
+free_buffers:
+    /* free(NULL) is a no-op; clear the pointers so nothing dangles */
+    free(t->ios);
+    t->ios = NULL;
+    free(t->iocbs);
+    t->iocbs = NULL;
+    free(t->events);
+    t->events = NULL;
+    return -1;
+}
+
+/*
+ * The buffers used for file data are allocated as one big region, and
+ * each thread and operation then takes a piece of it.  This allows a
+ * single large malloc, SysV shm or /dev/shm mapping without having to
+ * find a special place in each thread to map the buffers to.
+ *
+ * The region is sized for num_files * depth records of padded_reclen
+ * bytes (reclen rounded up to the alignment), plus one extra record
+ * when verify is set.  On success unaligned_buffer holds the raw
+ * address, aligned_buffer the aligned one, and 0 is returned; on any
+ * failure -1 is returned.
+ */
+int setup_shared_mem(int num_threads, int num_files, int depth, 
+                     int reclen, int max_io_submit) 
+{
+    char *p = NULL;
+    size_t total_ram;
+    
+    padded_reclen = (reclen + page_size_mask) / (page_size_mask+1);
+    padded_reclen = padded_reclen * (page_size_mask+1);
+    /* do the multiplication in size_t so large setups do not overflow int */
+    total_ram = (size_t)num_files * depth * padded_reclen + num_threads;
+    if (verify)
+    	total_ram += padded_reclen;
+
+    if (use_shm == USE_MALLOC) {
+	/* over-allocate so the start can be rounded up to the alignment */
+	p = malloc(total_ram + page_size_mask);
+    } else if (use_shm == USE_SHM) {
+        shm_id = shmget(IPC_PRIVATE, total_ram, IPC_CREAT | 0700);
+	if (shm_id < 0) {
+	    perror("shmget");
+	    drop_shm();
+	    goto free_buffers;
+	}
+	p = shmat(shm_id, (char *)0x50000000, 0);
+        if ((long)p == -1) {
+	    perror("shmat");
+	    goto free_buffers;
+	}
+	/* won't really be dropped until we shmdt */
+	drop_shm();
+    } else if (use_shm == USE_SHMFS) {
+        char mmap_name[16]; /* /dev/shm/ + null + XXXXXX */    
+	int fd;
+
+	strcpy(mmap_name, "/dev/shm/XXXXXX");
+	fd = mkstemp(mmap_name);
+        if (fd < 0) {
+	    perror("mkstemp");
+	    goto free_buffers;
+	}
+	/* the name is not needed once the file is open */
+	unlink(mmap_name);
+	if (ftruncate(fd, total_ram) < 0) {
+	    /* mapping a file that could not be sized would fault later */
+	    perror("ftruncate");
+	    close(fd);
+	    goto free_buffers;
+	}
+	shm_id = fd;
+	p = mmap((char *)0x50000000, total_ram,
+	         PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+
+        if (p == MAP_FAILED) {
+	    perror("mmap");
+	    goto free_buffers;
+	}
+    }
+    if (!p) {
+        fprintf(stderr, "unable to allocate buffers\n");
+	goto free_buffers;
+    }
+    unaligned_buffer = p;
+    /*
+     * round the address up to the alignment; assigning through a cast
+     * on the left-hand side is not valid C, so cast the result instead
+     */
+    p = (char *)(((unsigned long)p + page_size_mask) & ~page_size_mask);
+    aligned_buffer = p;
+    return 0;
+
+free_buffers:
+    drop_shm();
+    if (unaligned_buffer)
+        free(unaligned_buffer);
+    return -1;
+}
+
+/*
+ * Runs through all the thread_info structs in global_thread_info and
+ * prints the combined throughput for the stage named this_stage, using
+ * the time elapsed since global_stage_start_time.  With stonewalling
+ * enabled the smallest per-thread transfer is printed too.  Nothing is
+ * printed when no data was transferred.
+ *
+ * t is the calling thread; it is not needed for the totals.
+ */
+void global_thread_throughput(struct thread_info *t, char *this_stage) {
+    int i;
+    double runtime = time_since_now(&global_stage_start_time);
+    double total_mb = 0;
+    double min_trans = 0;
+
+    (void)t;
+
+    for (i = 0 ; i < num_threads ; i++) {
+        double mb = global_thread_info[i].stage_mb_trans;
+
+        total_mb += mb;
+        /* take the minimum over every thread, not just the caller */
+	if (!min_trans || mb < min_trans)
+	    min_trans = mb;
+    }
+    if (total_mb) {
+	fprintf(stderr, "%s throughput (%.2f MB/s) ", this_stage,
+	        total_mb / runtime);
+	fprintf(stderr, "%.2f MB in %.2fs", total_mb, runtime);
+        if (stonewall)
+	    fprintf(stderr, " min transfer %.2fMB", min_trans);
+        fprintf(stderr, "\n");
+    }
+}
+
+
+/* this is the meat of the state machine.  There is a list of
+ * active operations structs, and as each one finishes the required
+ * io it is moved to a list of finished operations.  Once they have
+ * all finished whatever stage they were in, they are given the chance
+ * to restart and pick a different stage (read/write/random read etc)
+ *
+ * various timings are printed in between the stages, along with
+ * thread synchronization if there is more than one thread.
+ *
+ * Returns 0, or the last non-zero value returned by finish_oper() for
+ * any of the thread's operations.
+ */
+int worker(struct thread_info *t)
+{
+    struct io_oper *oper;
+    /* only overwritten when the thread has work, so give safe defaults */
+    char *this_stage = "unknown";
+    struct timeval stage_time = { 0, 0 };
+    int status = 0;
+    int ret;
+
+    aio_setup(&t->io_ctx, 512);
+
+restart:
+    /* barrier: wait until every thread is ready to start the stage */
+    if (num_threads > 1) {
+        pthread_mutex_lock(&stage_mutex);
+	threads_starting++;
+	if (threads_starting == num_threads) {
+	    threads_ending = 0;
+	    gettimeofday(&global_stage_start_time, NULL);
+	    pthread_cond_broadcast(&stage_cond);
+	}
+	while (threads_starting != num_threads)
+	    pthread_cond_wait(&stage_cond, &stage_mutex);
+        pthread_mutex_unlock(&stage_mutex);
+    }
+    if (t->active_opers) {
+        this_stage = stage_name(t->active_opers->rw);
+	gettimeofday(&stage_time, NULL);
+	t->stage_mb_trans = 0;
+    }
+
+    /* first we send everything through aio */
+    while(t->active_opers) {
+	if (stonewall && threads_ending) {
+	    /* another thread finished the stage: stop starting new io */
+	    oper = t->active_opers;
+	    oper->stonewalled = 1;
+	    oper_list_del(oper, &t->active_opers);
+	    oper_list_add(oper, &t->finished_opers);
+	} else {
+	    run_active_list(t, io_iter,  max_io_submit);
+        }
+    }
+    if (latency_stats)
+        print_latency(t);
+
+    if (completion_latency_stats)
+	print_completion_latency(t);
+
+    /* then we wait for all the operations to finish; the list is
+     * circular, and may be empty if the thread was given no work
+     */
+    oper = t->finished_opers;
+    if (oper) {
+	do {
+	    io_oper_wait(t, oper);
+	    oper = oper->next;
+	} while(oper != t->finished_opers);
+    }
+
+    /* then we do an fsync to get the timing for any future operations
+     * right, and check to see if any of these need to get restarted
+     */
+    oper = t->finished_opers;
+    while(oper) {
+	if (fsync_stages)
+            fsync(oper->fd);
+	t->stage_mb_trans += oper_mb_trans(oper);
+	if (restart_oper(oper)) {
+	    oper_list_del(oper, &t->finished_opers);
+	    oper_list_add(oper, &t->active_opers);
+	    oper = t->finished_opers;
+	    continue;
+	}
+	oper = oper->next;
+	if (oper == t->finished_opers)
+	    break;
+    } 
+
+    if (t->stage_mb_trans && t->num_files > 0) {
+        double seconds = time_since_now(&stage_time);
+	/* the pointer difference is a ptrdiff_t; %d needs an int */
+	fprintf(stderr, "thread %d %s totals (%.2f MB/s) %.2f MB in %.2fs\n", 
+	        (int)(t - global_thread_info), this_stage,
+		t->stage_mb_trans/seconds, 
+		t->stage_mb_trans, seconds);
+    }
+
+    /* barrier: wait until every thread has finished the stage */
+    if (num_threads > 1) {
+	pthread_mutex_lock(&stage_mutex);
+	threads_ending++;
+	if (threads_ending == num_threads) {
+	    threads_starting = 0;
+	    pthread_cond_broadcast(&stage_cond);
+	    global_thread_throughput(t, this_stage);
+	}
+	while(threads_ending != num_threads)
+	    pthread_cond_wait(&stage_cond, &stage_mutex);
+	pthread_mutex_unlock(&stage_mutex);
+    }
+    
+    /* someone got restarted, go back to the beginning */
+    if (t->active_opers) {
+        goto restart;
+    }
+
+    /* finally, free all the ram */
+    while(t->finished_opers) {
+	oper = t->finished_opers;
+	oper_list_del(oper, &t->finished_opers);
+	ret = finish_oper(t, oper);
+	/* keep a failure even if a later operation finishes cleanly */
+	if (ret)
+	    status = ret;
+    }
+
+    if (t->num_global_pending) {
+        fprintf(stderr, "global num pending is %d\n", t->num_global_pending);
+    }
+    io_queue_release(t->io_ctx);
+    
+    return status;
+}
+
+typedef void * (*start_routine)(void *);
+
+/*
+ * Start one worker() thread for each of the num_threads entries of t
+ * and wait for all of them.  Exits the program if a thread cannot be
+ * created or joined.  Returns 0 when every worker returned 0 and 1
+ * otherwise.
+ */
+int run_workers(struct thread_info *t, int num_threads)
+{
+    int ret;
+    int failed = 0;
+    void *thread_ret;
+    int i;
+
+    for(i = 0 ; i < num_threads ; i++) {
+        ret = pthread_create(&t[i].tid, NULL, (start_routine)worker, t + i);
+	if (ret) {
+	    /* pthread calls return the error number; errno is not set */
+	    fprintf(stderr, "pthread_create: %s\n", strerror(ret));
+	    exit(1);
+	}
+    }
+    for(i = 0 ; i < num_threads ; i++) {
+        /* pthread_join stores a full pointer, so it needs a void * */
+        thread_ret = NULL;
+        ret = pthread_join(t[i].tid, &thread_ret);
+        if (ret) {
+	    fprintf(stderr, "pthread_join: %s\n", strerror(ret));
+	    exit(1);
+	}
+	/* worker() returns an int through the cast start routine */
+	if ((int)(long)thread_ret)
+	    failed = 1;
+    }
+    return failed;
+}
+
+/*
+ * Convert a size argument such as "400", "400k", "64m" or "2G" into a
+ * byte count.
+ *
+ * size_arg  decimal number with an optional one-letter suffix
+ *           (b, k, m or g, either case); the string is not modified
+ * mult      multiplier applied when no recognised suffix is present
+ *
+ * Returns the number times the selected multiplier, or 0 for a NULL or
+ * empty argument.
+ */
+off_t parse_size(char *size_arg, off_t mult) {
+    size_t len;
+    long long num;
+
+    if (!size_arg)
+        return 0;
+    len = strlen(size_arg);
+    if (len == 0)
+        return 0;
+
+    switch(size_arg[len - 1]) {
+    case 'g':
+    case 'G':
+        mult = 1024 * 1024 * 1024;
+	break;
+    case 'm':
+    case 'M':
+        mult = 1024 * 1024;
+	break;
+    case 'k':
+    case 'K':
+        mult = 1024;
+	break;
+    case 'b':
+    case 'B':
+        mult = 1;
+	break;
+    }
+
+    /* strtoll stops at the suffix on its own, so no need to cut it off */
+    num = strtoll(size_arg, NULL, 10);
+    return mult * (off_t)num;
+}
+
+/*
+ * Print the command line synopsis, a line for each option accepted by
+ * main's getopt string, and the program version to stdout.
+ */
+void print_usage(void) {
+    printf("usage: aio-stress [-s size] [-r size] [-a size] [-d num] [-b num]\n");
+    printf("                  [-i num] [-t num] [-c num] [-C size] [-z size]\n");
+    printf("                  [-nxhvOS ] file1 [file2 ...]\n");
+    printf("\t-a size in KB at which to align buffers\n");
+    printf("\t-b max number of iocbs to give io_submit at once\n");
+    printf("\t-c number of io contexts per file\n");
+    printf("\t-C offset between contexts, default 2MB\n");
+    printf("\t-s size in MB of the test file(s), default 1024MB\n");
+    printf("\t-r record size in KB used for each io, default 64KB\n");
+    printf("\t-d number of pending aio requests for each file, default 64\n");
+    printf("\t-i number of ios per file sent before switching\n\t   to the next file, default 8\n");
+    printf("\t-O Use O_DIRECT (not available in 2.4 kernels),\n");
+    printf("\t-S Use O_SYNC for writes\n");
+    printf("\t-o add an operation to the list: write=0, read=1,\n"); 
+    printf("\t   random write=2, random read=3.\n");
+    printf("\t   repeat -o to specify multiple ops: -o 0 -o 1 etc.\n");
+    printf("\t-m shm use ipc shared memory for io buffers instead of malloc\n");
+    printf("\t-m shmfs mmap a file in /dev/shm for io buffers\n");
+    printf("\t-n no fsyncs between write stage and read stage\n");
+    printf("\t-l print io_submit latencies after each stage\n");
+    printf("\t-L print io completion latencies after each stage\n");
+    printf("\t-t number of threads to run\n");
+    printf("\t-v turn on verification\n");
+    printf("\t-x turn off thread stonewalling\n");
+    printf("\t-z starting offset in MB within the file(s), default 0\n");
+    printf("\t-h this message\n");
+    printf("\n\t   the size options (-a -s and -r) allow modifiers -s 400{k,m,g}\n");
+    printf("\t   translate to 400KB, 400MB and 400GB\n");
+    printf("version %s\n", PROG_VERSION);
+}
+
+/*
+ * Parse the command line, open every file once per context, spread the
+ * resulting operations round-robin over the threads, set up the io
+ * buffers and run the workers.
+ *
+ * Exits non-zero on a usage error, a setup failure or when a worker
+ * reports a failure; returns 0 otherwise.
+ */
+int main(int ac, char **av) 
+{
+    int rwfd;
+    int i;
+    int j;
+    int c;
+
+    off_t file_size = 1 * 1024 * 1024 * 1024;
+    int first_stage = WRITE;
+    struct io_oper *oper;
+    int status = 0;
+    int num_files = 0;
+    int open_fds = 0;
+    struct thread_info *t;
+    off_t starting_offset = 0;
+
+    page_size_mask = getpagesize() - 1;
+
+    while(1) {
+	c = getopt(ac, av, "a:b:c:C:m:s:r:d:i:o:t:lLnhOSxvz:");
+	if  (c < 0)
+	    break;
+
+        switch(c) {
+	case 'a':
+	    page_size_mask = parse_size(optarg, 1024);
+	    page_size_mask--;
+	    break;
+	case 'c':
+	    num_contexts = atoi(optarg);
+	    break;
+	case 'C':
+	    context_offset = parse_size(optarg, 1024 * 1024);
+	    /* without this break -C also overwrote max_io_submit */
+	    break;
+	case 'b':
+	    max_io_submit = atoi(optarg);
+	    break;
+	case 's':
+	    file_size = parse_size(optarg, 1024 * 1024);
+	    break;
+	case 'd':
+	    depth = atoi(optarg);
+	    break;
+	case 'r':
+	    rec_len = parse_size(optarg, 1024);
+	    break;
+	case 'i':
+	    io_iter = atoi(optarg);
+	    break;
+	case 'n':
+	    fsync_stages = 0;
+	    break;
+	case 'l':
+	    latency_stats = 1;
+	    break;
+	case 'L':
+	    completion_latency_stats = 1;
+	    break;
+	case 'm':
+	    if (!strcmp(optarg, "shm")) {
+		fprintf(stderr, "using ipc shm\n");
+	        use_shm = USE_SHM;
+	    } else if (!strcmp(optarg, "shmfs")) {
+	        fprintf(stderr, "using /dev/shm for buffers\n");
+		use_shm = USE_SHMFS;
+	    }
+	    break;
+	case 'o': 
+	    i = atoi(optarg);
+	    /* reject values that would shift outside the stage bitmask */
+	    if (i < 0 || i >= LAST_STAGE) {
+		fprintf(stderr, "invalid stage %d\n", i);
+		print_usage();
+		exit(1);
+	    }
+	    stages |= 1 << i;
+	    fprintf(stderr, "adding stage %s\n", stage_name(i));
+	    break;
+	case 'O':
+	    o_direct = O_DIRECT;
+	    break;
+	case 'z':
+	    starting_offset = parse_size(optarg, 1024 * 1024);
+	    break;
+	case 'S':
+	    o_sync = O_SYNC;
+	    break;
+	case 't':
+	    num_threads = atoi(optarg);
+	    break;
+	case 'x':
+	    stonewall = 0;
+	    break;
+	case 'v':
+	    verify = 1;
+	    break;
+	case 'h':
+	default:
+	    print_usage();
+	    exit(1);
+	}
+    }
+
+    /* 
+     * make sure we don't try to submit more ios than we have allocated
+     * memory for
+     */
+    if (depth < io_iter) {
+	io_iter = depth;
+        fprintf(stderr, "dropping io_iter to %d\n", io_iter);
+    }
+
+    if (optind >= ac) {
+	print_usage();
+	exit(1);
+    }
+
+    num_files = ac - optind;
+
+    if (num_threads > (num_files * num_contexts)) {
+        num_threads = num_files * num_contexts;
+	fprintf(stderr, "dropping thread count to the number of contexts %d\n", 
+	        num_threads);
+    }
+
+    /* the thread count is used as a divisor when handing out files */
+    if (num_threads < 1) {
+	fprintf(stderr, "thread count must be at least 1\n");
+	exit(1);
+    }
+
+    /*
+     * zeroed allocation: the operation lists and counters in each
+     * thread_info are read below before anything else initializes them
+     */
+    t = calloc(num_threads, sizeof(*t));
+    if (!t) {
+        perror("calloc");
+	exit(1);
+    }
+    global_thread_info = t;
+
+    /* by default, allow a huge number of iocbs to be sent towards
+     * io_submit
+     */
+    if (!max_io_submit)
+        max_io_submit = num_files * io_iter * num_contexts;
+
+    /*
+     * make sure we don't try to submit more ios than max_io_submit allows 
+     */
+    if (max_io_submit < io_iter) {
+        io_iter = max_io_submit;
+	fprintf(stderr, "dropping io_iter to %d\n", io_iter);
+    }
+
+    if (!stages) {
+        stages = (1 << WRITE) | (1 << READ) | (1 << RREAD) | (1 << RWRITE);
+    } else {
+        /* start with the lowest numbered stage that was requested */
+        for (i = 0 ; i < LAST_STAGE; i++) {
+	    if (stages & (1 << i)) {
+	        first_stage = i;
+		fprintf(stderr, "starting with %s\n", stage_name(i));
+		break;
+	    }
+	}
+    }
+
+    if (file_size < starting_offset + num_contexts * context_offset) {
+        fprintf(stderr, "file size %Lu too small for %d contexts\n", 
+	        file_size, num_contexts);
+	exit(1);
+    }
+
+    fprintf(stderr, "file size %LuMB, record size %luKB, depth %d, ios per iteration %d\n", file_size / (1024 * 1024), rec_len / 1024, depth, io_iter);
+    fprintf(stderr, "max io_submit %d, buffer alignment set to %luKB\n", 
+            max_io_submit, (page_size_mask + 1)/1024);
+    fprintf(stderr, "threads %d files %d contexts %d context offset %LuMB verification %s\n", 
+            num_threads, num_files, num_contexts, 
+	    context_offset / (1024 * 1024), verify ? "on" : "off");
+    fprintf(stderr, "starting from offset %Lu\n", starting_offset);
+    /* open all the files and do any required setup for them */
+    for (i = optind ; i < ac ; i++) {
+	int thread_index;
+	for (j = 0 ; j < num_contexts ; j++) {
+	    /* contexts are dealt out to the threads round-robin */
+	    thread_index = open_fds % num_threads;
+	    open_fds++;
+	    fprintf(stderr, "adding file %s thread %d\n", av[i], thread_index);
+
+	    rwfd = open(av[i], O_CREAT | O_RDWR | o_direct | o_sync, 0600);
+	    if (rwfd == -1) {
+		/* say which file failed and why instead of asserting */
+		perror(av[i]);
+		exit(1);
+	    }
+
+	    oper = create_oper(rwfd, first_stage, 
+			       starting_offset + (j * context_offset), 
+	                       starting_offset + ((j + 1) * context_offset),
+			       rec_len, depth, io_iter, av[i]);
+	    if (!oper) {
+		fprintf(stderr, "error in create_oper\n");
+		exit(-1);
+	    }
+	    oper_list_add(oper, &t[thread_index].active_opers);
+	    t[thread_index].num_files++;
+	}
+    }
+    if (setup_shared_mem(num_threads, num_files * num_contexts, 
+                         depth, rec_len, max_io_submit))
+    {
+        exit(1);
+    }
+    for (i = 0 ; i < num_threads ; i++) {
+	if (setup_ious(&t[i], t[i].num_files, depth, rec_len, max_io_submit))
+		exit(1);
+    }
+    /* take the status from either path so it is never left unset */
+    if (num_threads > 1)
+        status = run_workers(t, num_threads);
+    else
+	status = worker(t);
+
+    if (status) {
+	exit(1);
+    }
+    return status;
+}
+

Added: trunk/commit-tests/tests/aio-stress/test.sh
===================================================================
--- trunk/commit-tests/tests/aio-stress/test.sh	2005-04-13 20:58:03 UTC (rev 2140)
+++ trunk/commit-tests/tests/aio-stress/test.sh	2005-04-13 21:04:19 UTC (rev 2141)
@@ -0,0 +1,23 @@
+
+. "$CT_FUNCTIONS"
+
+PATH="$PATH:$CT_TST"
+
+# command -v is the portable way to ask whether a binary is in the path
+if ! command -v aio-stress > /dev/null 2>&1; then
+	invalid "couldn't find aio-stress binary in my path"
+fi
+
+# this aio-stress has a hack that tells the aio-stress instance to work with a
+# given region.  the first set of arguments here lets one node beat on the
+# first 2m of a 4m file while the second set of arguments tells another node to
+# work with the second 2m of that 4m file.  but the harness doesn't really
+# encourage that kind of test yet.
+# 	-O -s 4m -C 2m -r 32k -i 1024 -b 1024 /mnt/ocfs2/shared-aio
+#	-O -s 4m -z 2m -C 2m -r 32k -i 1024 -b 1024 /mnt/ocfs2/shared-aio
+
+out "256k aio/dio ops on a 64meg file"
+
+# quoted so a test directory containing spaces stays one argument
+aio-stress -O -s 64m -r 256k -i 1024 -b 1024 "$CT_DIR/aio-stress-file" || \
+	fail "aio-stress failed"
+
+pass

Added: trunk/commit-tests/tests/basic-dual-data/test.sh
===================================================================
--- trunk/commit-tests/tests/basic-dual-data/test.sh	2005-04-13 20:58:03 UTC (rev 2140)
+++ trunk/commit-tests/tests/basic-dual-data/test.sh	2005-04-13 21:04:19 UTC (rev 2141)
@@ -0,0 +1,61 @@
+
+. "$CT_FUNCTIONS"
+needs_remotes 1
+
+# a write on the local node has to be visible on the remote node ...
+echo "local" > "$CT_DIR/writeread" || fail "local write failed"
+
+contents=$(remote 0 cat "$CT_REMOTE_DIR_0/writeread") ||
+	fail "couldn't get remote writeread contents"
+
+if [ "$contents" != "local" ]; then
+	fail "remote '$contents' != 'local'"
+fi
+
+# ... and a write on the remote node has to be visible locally
+remote 0 "echo remote > $CT_REMOTE_DIR_0/writeread" || fail "remote write failed"
+
+contents=$(cat "$CT_DIR/writeread") ||
+	fail "couldn't get local writeread contents"
+
+if [ "$contents" != "remote" ]; then
+	fail "local '$contents' != 'remote'"
+fi
+
+
+ddstring="thequickbrownfoxjumpsoverthelazydog"
+
+# whichnode is either empty (run dd locally) or "remote 0"; it is left
+# unquoted below on purpose so that it splits into command words
+whichnode=""
+whichpath="$CT_DIR"
+
+# write the string one byte at a time, alternating between the nodes
+c=0
+while [ "$c" -lt "${#ddstring}" ]; do
+	echo "$ddstring" | \
+		$whichnode dd conv=notrunc "of=$whichpath/bytedd" \
+			count=1 bs=1 skip="$c" seek="$c" 2>/dev/null ||
+			fail "$whichnode dd failed on byte $c"
+
+	if [ -z "$whichnode" ]; then
+		whichnode="remote 0"
+		whichpath="$CT_REMOTE_DIR_0"
+	else
+		whichnode=""
+		whichpath="$CT_DIR"
+	fi
+	c=$((c + 1))
+done
+
+contents=$(cat "$CT_DIR/bytedd") ||
+	fail "couldn't get local bytedd contents"
+
+if [ "$contents" != "$ddstring" ]; then
+	fail "local '$contents' != '$ddstring'"
+fi
+
+contents=$(remote 0 cat "$CT_REMOTE_DIR_0/bytedd") ||
+	fail "couldn't get remote bytedd contents"
+
+if [ "$contents" != "$ddstring" ]; then
+	fail "remote '$contents' != '$ddstring'"
+fi
+
+pass contents ok


Property changes on: trunk/commit-tests/tests/fsx
___________________________________________________________________
Name: svn:ignore
   + *.o
.*.d
fsx


Added: trunk/commit-tests/tests/fsx/Makefile
===================================================================
--- trunk/commit-tests/tests/fsx/Makefile	2005-04-13 20:58:03 UTC (rev 2140)
+++ trunk/commit-tests/tests/fsx/Makefile	2005-04-13 21:04:19 UTC (rev 2141)
@@ -0,0 +1,14 @@
+TOPDIR = ../../..
+
+include $(TOPDIR)/Preamble.make
+
+SBIN_PROGRAMS = fsx
+
+CFILES = fsx-linux.c
+
+# patsubst rewrites only a trailing .c; subst would replace ".c"
+# wherever it appears inside a file name
+OBJS = $(patsubst %.c,%.o,$(CFILES))
+
+fsx: $(OBJS)
+	$(LINK)
+
+include $(TOPDIR)/Postamble.make

Added: trunk/commit-tests/tests/fsx/fsx-linux.c
===================================================================
--- trunk/commit-tests/tests/fsx/fsx-linux.c	2005-04-13 20:58:03 UTC (rev 2140)
+++ trunk/commit-tests/tests/fsx/fsx-linux.c	2005-04-13 21:04:19 UTC (rev 2141)
@@ -0,0 +1,1202 @@
+/*
+ *	Copyright (C) 1991, NeXT Computer, Inc.  All Rights Reserverd.
+ *
+ *	File:	fsx.c
+ *	Author:	Avadis Tevanian, Jr.
+ *
+ *	File system exerciser. 
+ *
+ *	Rewritten 8/98 by Conrad Minshall.
+ *
+ *	Small changes to work under Linux -- davej at suse.de
+ *
+ */
+
+#define _GNU_SOURCE
+#undef _XOPEN_SOURCE
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#ifdef _UWIN
+# include <sys/param.h>
+# include <limits.h>
+# include <time.h>
+# include <strings.h>
+# define MAP_FILE 0
+#else
+#ifndef linux
+# include <sys/dirent.h>
+#endif
+#endif
+#include <sys/file.h>
+#include <sys/mman.h>
+#include <limits.h>
+#include <err.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <time.h>
+#ifdef AIO
+#include <libaio.h>
+#endif
+
+#define NUMPRINTCOLUMNS 32	/* # columns of data to print on each line */
+
+/*
+ *	A log entry is an operation and a bunch of arguments.
+ */
+
+struct log_entry {
+	int	operation;
+	int	args[3];
+};
+
+#define	LOGSIZE	1000
+
+struct log_entry	oplog[LOGSIZE];	/* the log */
+int			logptr = 0;	/* current position in log */
+int			logcount = 0;	/* total ops */
+
+/*
+ *	Define operations
+ */
+
+#define	OP_READ		1
+#define OP_WRITE	2
+#define OP_TRUNCATE	3
+#define OP_CLOSEOPEN	4
+#define OP_MAPREAD	5
+#define OP_MAPWRITE	6
+#define OP_SKIPPED	7
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE       4096
+#endif
+#define PAGE_MASK       (PAGE_SIZE - 1)
+
+char	*original_buf;			/* a pointer to the original data */
+char	*good_buf;			/* a pointer to the correct data */
+char	*temp_buf;			/* a pointer to the current data */
+char	*fname;				/* name of our test file */
+int	fd;				/* fd for our test file */
+
+off_t		file_size = 0;
+off_t		biggest = 0;
+char		state[256];
+unsigned long	testcalls = 0;		/* calls to function "test" */
+
+unsigned long	simulatedopcount = 0;	/* -b flag */
+int	closeprob = 0;			/* -c flag */
+int	debug = 0;			/* -d flag */
+unsigned long	debugstart = 0;		/* -D flag */
+int	do_fsync = 0;			/* -f flag */
+unsigned long	maxfilelen = 256 * 1024;	/* -l flag */
+int	sizechecks = 1;			/* -n flag disables them */
+int	maxoplen = 64 * 1024;		/* -o flag */
+int	quiet = 0;			/* -q flag */
+unsigned long progressinterval = 0;	/* -p flag */
+int	readbdy = 1;			/* -r flag */
+int	style = 0;			/* -s flag */
+int	truncbdy = 1;			/* -t flag */
+int	writebdy = 1;			/* -w flag */
+long	monitorstart = -1;		/* -m flag */
+long	monitorend = -1;		/* -m flag */
+int	lite = 0;			/* -L flag */
+long	numops = -1;			/* -N flag */
+int	randomoplen = 1;		/* -O flag disables it */
+int	seed = 1;			/* -S flag */
+int     mapped_writes = 1;              /* -W flag disables */
+int 	mapped_reads = 1;		/* -R flag disables it */
+int	fsxgoodfd = 0;
+int	o_direct;			/* -Z */
+int	aio = 0;
+
+#ifdef AIO
+int aio_rw(int rw, int fd, char *buf, unsigned len, unsigned offset);
+#define READ 0
+#define WRITE 1
+#define fsxread(a,b,c,d)	aio_rw(READ, a,b,c,d)
+#define fsxwrite(a,b,c,d)	aio_rw(WRITE, a,b,c,d)
+#else
+#define fsxread(a,b,c,d)	read(a,b,c)
+#define fsxwrite(a,b,c,d)	write(a,b,c)
+#endif
+
+FILE *	fsxlogf = NULL;
+int badoff = -1;
+int closeopen = 0;
+
+/*
+ * Round ptr up to a multiple of align and then add offset bytes.  The
+ * mask arithmetic assumes align is a power of two.
+ */
+static void *round_up(void *ptr, unsigned long align, unsigned long offset)
+{
+	unsigned long addr = (unsigned long)ptr + align - 1;
+
+	addr &= ~(align - 1);
+	return (void *)(addr + offset);
+}
+
+/*
+ * printf-style output to stdout and, when it is open, to the fsx log
+ * file fsxlogf as well.
+ */
+void
+prt(char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vfprintf(stdout, fmt, args);
+	va_end(args);
+
+	if (fsxlogf) {
+		/*
+		 * a va_list is indeterminate once vfprintf has used it,
+		 * so start a fresh one for the second stream
+		 */
+		va_start(args, fmt);
+		vfprintf(fsxlogf, fmt, args);
+		va_end(args);
+	}
+}
+
+/*
+ * Print the message for the current errno through prt(), preceded by
+ * "prefix: " when prefix is not NULL.
+ */
+void
+prterr(char *prefix)
+{
+	/* never hand a NULL pointer to %s */
+	prt("%s%s%s\n", prefix ? prefix : "", prefix ? ": " : "",
+	    strerror(errno));
+}
+
+
+/*
+ * Record an operation and its three arguments in the next slot of the
+ * circular oplog.  The operation code is stored bit-inverted while
+ * closeopen is set.
+ */
+void
+log4(int operation, int arg0, int arg1, int arg2)
+{
+	struct log_entry *slot = &oplog[logptr];
+
+	slot->operation = closeopen ? ~operation : operation;
+	slot->args[0] = arg0;
+	slot->args[1] = arg1;
+	slot->args[2] = arg2;
+
+	logcount++;
+	/* wrap back to the first slot once the end is reached */
+	if (++logptr >= LOGSIZE)
+		logptr = 0;
+}
+
+
+/*
+ * Print the operation log, oldest entry first.  Once the circular log
+ * has wrapped, the dump starts at logptr and covers LOGSIZE entries.
+ * Entries whose range contains badoff are flagged.
+ *
+ * Side effects: an operation code stored inverted by log4() is restored
+ * in place, and the global closeopen is overwritten for each entry.
+ */
+void
+logdump(void)
+{
+	int	i, count, down;
+	struct log_entry	*lp;
+
+	prt("LOG DUMP (%d total operations):\n", logcount);
+	if (logcount < LOGSIZE) {
+		i = 0;
+		count = logcount;
+	} else {
+		i = logptr;
+		count = LOGSIZE;
+	}
+	for ( ; count > 0; count--) {
+		int opnum;
+
+		opnum = i+1 + (logcount/LOGSIZE)*LOGSIZE;
+		/*
+		 * after a wrap the slots from logptr onwards still hold
+		 * entries of the previous pass through the log
+		 */
+		if (logcount >= LOGSIZE && i >= logptr)
+			opnum -= LOGSIZE;
+		prt("%d(%d mod 256): ", opnum, opnum%256);
+		lp = &oplog[i];
+		/* a negative code marks an operation done after close/open */
+		if ((closeopen = lp->operation < 0))
+			lp->operation = ~ lp->operation;
+			
+		switch (lp->operation) {
+		case OP_MAPREAD:
+			prt("MAPREAD\t0x%x thru 0x%x\t(0x%x bytes)",
+			    lp->args[0], lp->args[0] + lp->args[1] - 1,
+			    lp->args[1]);
+			if (badoff >= lp->args[0] && badoff <
+						     lp->args[0] + lp->args[1])
+				prt("\t***RRRR***");
+			break;
+		case OP_MAPWRITE:
+			prt("MAPWRITE 0x%x thru 0x%x\t(0x%x bytes)",
+			    lp->args[0], lp->args[0] + lp->args[1] - 1,
+			    lp->args[1]);
+			if (badoff >= lp->args[0] && badoff <
+						     lp->args[0] + lp->args[1])
+				prt("\t******WWWW");
+			break;
+		case OP_READ:
+			prt("READ\t0x%x thru 0x%x\t(0x%x bytes)",
+			    lp->args[0], lp->args[0] + lp->args[1] - 1,
+			    lp->args[1]);
+			if (badoff >= lp->args[0] &&
+			    badoff < lp->args[0] + lp->args[1])
+				prt("\t***RRRR***");
+			break;
+		case OP_WRITE:
+			prt("WRITE\t0x%x thru 0x%x\t(0x%x bytes)",
+			    lp->args[0], lp->args[0] + lp->args[1] - 1,
+			    lp->args[1]);
+			/* args[2] is compared with the write's start and end */
+			if (lp->args[0] > lp->args[2])
+				prt(" HOLE");
+			else if (lp->args[0] + lp->args[1] > lp->args[2])
+				prt(" EXTEND");
+			if ((badoff >= lp->args[0] || badoff >=lp->args[2]) &&
+			    badoff < lp->args[0] + lp->args[1])
+				prt("\t***WWWW");
+			break;
+		case OP_TRUNCATE:
+			down = lp->args[0] < lp->args[1];
+			prt("TRUNCATE %s\tfrom 0x%x to 0x%x",
+			    down ? "DOWN" : "UP", lp->args[1], lp->args[0]);
+			/* flag badoff when it lies between the two sizes */
+			if (badoff >= lp->args[!down] &&
+			    badoff < lp->args[!!down])
+				prt("\t******WWWW");
+			break;
+		case OP_SKIPPED:
+			prt("SKIPPED (no operation)");
+			break;
+		default:
+			prt("BOGUS LOG ENTRY (operation code = %d)!",
+			    lp->operation);
+		}
+		if (closeopen)
+			prt("\n\t\tCLOSE/OPEN");
+		prt("\n");
+		i++;
+		if (i == LOGSIZE)
+			i = 0;
+	}
+}
+
+
+/*
+ * Write the first bufferlength bytes of buffer to the start of fd.
+ * Does nothing when fd is not positive or the length is zero.  With
+ * lite set, the length is capped at the current size of fd.  Seek and
+ * write problems are reported but are not fatal; a length above
+ * SSIZE_MAX exits with status 67.
+ */
+void
+save_buffer(char *buffer, off_t bufferlength, int fd)
+{
+	off_t ret;
+	ssize_t byteswritten;
+
+	if (fd <= 0 || bufferlength == 0)
+		return;
+
+	if (bufferlength > SSIZE_MAX) {
+		prt("fsx flaw: overflow in save_buffer\n");
+		exit(67);
+	}
+	if (lite) {
+		off_t size_by_seek = lseek(fd, (off_t)0, SEEK_END);
+		if (size_by_seek == (off_t)-1)
+			prterr("save_buffer: lseek eof");
+		else if (bufferlength > size_by_seek) {
+			/*
+			 * warnx: errno says nothing about this condition,
+			 * and the newline is added by warnx itself
+			 */
+			warnx("save_buffer: .fsxgood file too short... will save 0x%llx bytes instead of 0x%llx",
+			      (unsigned long long)size_by_seek,
+			      (unsigned long long)bufferlength);
+			bufferlength = size_by_seek;
+		}
+	}
+
+	ret = lseek(fd, (off_t)0, SEEK_SET);
+	if (ret == (off_t)-1)
+		prterr("save_buffer: lseek 0");
+	
+	byteswritten = write(fd, buffer, (size_t)bufferlength);
+	if (byteswritten != bufferlength) {
+		if (byteswritten == -1)
+			prterr("save_buffer write");
+		else
+			/* a short write does not set errno either */
+			warnx("save_buffer: short write, 0x%x bytes instead of 0x%llx",
+			      (unsigned)byteswritten,
+			      (unsigned long long)bufferlength);
+	}
+}
+
+
+/*
+ * Dump the operation log, save the expected file contents to the
+ * .fsxgood descriptor when one is open, and exit with the given status.
+ */
+void
+report_failure(int status)
+{
+	logdump();
+
+	if (!fsxgoodfd)
+		exit(status);
+
+	if (good_buf) {
+		save_buffer(good_buf, file_size, fsxgoodfd);
+		prt("Correct content saved for comparison\n");
+		prt("(maybe hexdump \"%s\" vs \"%s.fsxgood\")\n",
+		    fname, fname);
+	}
+	close(fsxgoodfd);
+	exit(status);
+}
+
+
+#define short_at(cp) ((unsigned short)((*((unsigned char *)(cp)) << 8) | \
+				        *(((unsigned char *)(cp)) + 1)))
+
+/*
+ * Compare size bytes just read into temp_buf with the expected data at
+ * good_buf + offset.  On a mismatch, print details for the first 16
+ * differing bytes, remember the last bad offset in badoff and call
+ * report_failure(110).
+ */
+void
+check_buffers(unsigned offset, unsigned size)
+{
+	unsigned char c, t;
+	unsigned i = 0;
+	unsigned n = 0;
+	unsigned op = 0;
+	unsigned bad = 0;
+
+	/* memcmp is the standard replacement for the legacy bcmp */
+	if (memcmp(good_buf + offset, temp_buf, size) != 0) {
+		prt("READ BAD DATA: offset = 0x%x, size = 0x%x, fname = %s\n",
+		    offset, size, fname);
+		prt("OFFSET\tGOOD\tBAD\tRANGE\n");
+		while (size > 0) {
+			c = good_buf[offset];
+			t = temp_buf[i];
+			if (c != t) {
+			        if (n < 16) {
+					bad = short_at(&temp_buf[i]);
+				        prt("0x%5x\t0x%04x\t0x%04x", offset,
+				            short_at(&good_buf[offset]), bad);
+					/* the byte at the odd file offset of the pair */
+					op = temp_buf[offset & 1 ? i+1 : i];
+				        prt("\t0x%5x\n", n);
+					if (op)
+						prt("operation# (mod 256) for "
+						  "the bad data may be %u\n",
+						((unsigned)op & 0xff));
+					else
+						prt("operation# (mod 256) for "
+						  "the bad data unknown, check"
+						  " HOLE and EXTEND ops\n");
+				}
+				n++;
+				badoff = offset;
+			}
+			offset++;
+			i++;
+			size--;
+		}
+		report_failure(110);
+	}
+}
+
+
+/*
+ * Check that the size reported by fstat and the size found by seeking
+ * to the end of the file both match the expected file_size; call
+ * report_failure(120) if either differs.
+ */
+void
+check_size(void)
+{
+	struct stat	statbuf;
+	off_t	size_by_seek;
+
+	if (fstat(fd, &statbuf)) {
+		prterr("check_size: fstat");
+		/* make sure the comparison below fails */
+		statbuf.st_size = -1;
+	}
+	/* SEEK_END is the standard name for the legacy L_XTND */
+	size_by_seek = lseek(fd, (off_t)0, SEEK_END);
+	if (file_size != statbuf.st_size || file_size != size_by_seek) {
+		prt("Size error: expected 0x%qx stat 0x%qx seek 0x%qx\n",
+		    (unsigned long long)file_size,
+		    (unsigned long long)statbuf.st_size,
+		    (unsigned long long)size_by_seek);
+		report_failure(120);
+	}
+}
+
+
+/*
+ * Check that ftruncate can extend the test file: truncate to 0, extend
+ * to 100000 bytes and confirm the size with fstat.  Exits with status
+ * 130 if that does not work or a call fails.  The file is left empty.
+ */
+void
+check_trunc_hack(void)
+{
+	struct stat statbuf;
+
+	/* statbuf is only valid when all three calls succeed */
+	if (ftruncate(fd, (off_t)0) ||
+	    ftruncate(fd, (off_t)100000) ||
+	    fstat(fd, &statbuf)) {
+		prterr("check_trunc_hack: ftruncate/fstat");
+		exit(130);
+	}
+	if (statbuf.st_size != (off_t)100000) {
+		prt("no extend on truncate! not posix!\n");
+		exit(130);
+	}
+	if (ftruncate(fd, 0)) {
+		prterr("check_trunc_hack: ftruncate");
+		exit(130);
+	}
+}
+
+
+/*
+ * Read size bytes at offset (rounded down to a readbdy boundary) into
+ * temp_buf and compare them with the expected data.  Zero-length reads
+ * and reads reaching past the end of the file are logged as skipped.
+ * While testcalls has not passed simulatedopcount the operation is only
+ * logged.  A failed seek or read ends the run via report_failure().
+ */
+void
+doread(unsigned offset, unsigned size)
+{
+	off_t pos;
+	unsigned nread;
+	int show = 0;
+
+	offset -= offset % readbdy;
+	if (o_direct)
+		size -= size % readbdy;
+
+	if (size == 0) {
+		if (!quiet && testcalls > simulatedopcount && !o_direct)
+			prt("skipping zero size read\n");
+		log4(OP_SKIPPED, OP_READ, offset, size);
+		return;
+	}
+	if (size + offset > file_size) {
+		if (!quiet && testcalls > simulatedopcount)
+			prt("skipping seek/read past end of file\n");
+		log4(OP_SKIPPED, OP_READ, offset, size);
+		return;
+	}
+
+	log4(OP_READ, offset, size, 0);
+
+	if (testcalls <= simulatedopcount)
+		return;
+
+	/* print on a progress interval, or in debug mode when the read
+	 * overlaps the monitored range (or no range is set) */
+	if (!quiet) {
+		if (progressinterval && testcalls % progressinterval == 0)
+			show = 1;
+		else if (debug &&
+			 (monitorstart == -1 ||
+			  (offset + size > monitorstart &&
+			   (monitorend == -1 || offset <= monitorend))))
+			show = 1;
+	}
+	if (show)
+		prt("%lu read\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
+		    offset, offset + size - 1, size);
+
+	pos = lseek(fd, (off_t)offset, SEEK_SET);
+	if (pos == (off_t)-1) {
+		prterr("doread: lseek");
+		report_failure(140);
+	}
+
+	nread = fsxread(fd, temp_buf, size, offset);
+	if (nread != size) {
+		if (nread == -1)
+			prterr("doread: read");
+		else
+			prt("short read: 0x%x bytes instead of 0x%x\n",
+			    nread, size);
+		report_failure(141);
+	}
+	check_buffers(offset, size);
+}
+
+
+/*
+ * Perform (or merely log) one read through a memory mapping and verify it.
+ *
+ * offset: requested file offset; rounded down to a multiple of readbdy.
+ * size:   requested length in bytes.
+ *
+ * Zero-length reads and reads that would run past file_size are logged as
+ * OP_SKIPPED and not issued.  While testcalls <= simulatedopcount the
+ * operation is logged but nothing is mapped.  Otherwise the range is
+ * mapped read-only, copied into temp_buf, unmapped, and handed to
+ * check_buffers(); mmap and munmap problems are passed to
+ * report_failure() (codes 190 and 191).
+ */
+void
+domapread(unsigned offset, unsigned size)
+{
+	unsigned pg_offset;
+	unsigned map_size;
+	char    *p;
+
+	offset -= offset % readbdy;
+	if (size == 0) {
+		if (!quiet && testcalls > simulatedopcount)
+			prt("skipping zero size read\n");
+		log4(OP_SKIPPED, OP_MAPREAD, offset, size);
+		return;
+	}
+	if (size + offset > file_size) {
+		if (!quiet && testcalls > simulatedopcount)
+			prt("skipping seek/read past end of file\n");
+		log4(OP_SKIPPED, OP_MAPREAD, offset, size);
+		return;
+	}
+
+	log4(OP_MAPREAD, offset, size, 0);
+
+	/* still in the simulated prefix of the run: log only, no I/O */
+	if (testcalls <= simulatedopcount)
+		return;
+
+	/* same progress/debug print condition as used for plain reads */
+	if (!quiet &&
+		((progressinterval && testcalls % progressinterval == 0) ||
+		       (debug &&
+		       (monitorstart == -1 ||
+			(offset + size > monitorstart &&
+			(monitorend == -1 || offset <= monitorend))))))
+		prt("%lu mapread\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
+		    offset, offset + size - 1, size);
+
+	/*
+	 * The mapping starts at (offset - pg_offset) and the data of
+	 * interest begins pg_offset bytes into it.
+	 * NOTE(review): this presumes PAGE_MASK (defined elsewhere) is
+	 * "page size - 1", making pg_offset the offset within the page --
+	 * confirm against its definition.
+	 */
+	pg_offset = offset & PAGE_MASK;
+	map_size  = pg_offset + size;
+
+	/* only the flags argument differs between the two mmap() variants */
+#ifdef linux
+	if ((p = (char *)mmap(0, map_size, PROT_READ, MAP_SHARED, fd,
+#else
+	if ((p = (char *)mmap(0, map_size, PROT_READ, MAP_FILE, fd,
+#endif
+			      (off_t)(offset - pg_offset))) == (char *)-1) {
+	        prterr("domapread: mmap");
+		report_failure(190);
+	}
+	memcpy(temp_buf, p + pg_offset, size);
+	if (munmap(p, map_size) != 0) {
+		prterr("domapread: munmap");
+		report_failure(191);
+	}
+
+	check_buffers(offset, size);
+}
+
+
+/*
+ * Fill good_buf[offset .. offset+size) with the pattern for the current
+ * operation: every byte gets (testcalls % 256), and bytes at odd offsets
+ * additionally have the matching byte of original_buf added in.
+ */
+void
+gendata(char *original_buf, char *good_buf, unsigned offset, unsigned size)
+{
+	unsigned	pos;
+	unsigned	end = offset + size;
+	char		stamp = testcalls % 256;
+
+	for (pos = offset; pos != end; pos++) {
+		if (pos & 1)
+			good_buf[pos] = stamp + original_buf[pos];
+		else
+			good_buf[pos] = stamp;
+	}
+}
+
+
+/*
+ * Perform (or merely simulate) one write operation.
+ *
+ * offset: requested file offset; rounded down to a multiple of writebdy.
+ * size:   requested length; when o_direct is set it is trimmed down to a
+ *         multiple of writebdy.
+ *
+ * The expected file image in good_buf and the tracked file_size are
+ * updated first, even for simulated operations.  Only when testcalls is
+ * past simulatedopcount is the data actually written with fsxwrite(),
+ * optionally followed by fsync() when do_fsync is set.  Problems are
+ * passed to report_failure() (codes 149-152).
+ */
+void
+dowrite(unsigned offset, unsigned size)
+{
+	off_t ret;
+	unsigned iret;
+
+	/* align the request to the configured write boundary */
+	offset -= offset % writebdy;
+	if (o_direct)
+		size -= size % writebdy;
+	if (size == 0) {
+		if (!quiet && testcalls > simulatedopcount && !o_direct)
+			prt("skipping zero size write\n");
+		log4(OP_SKIPPED, OP_WRITE, offset, size);
+		return;
+	}
+
+	log4(OP_WRITE, offset, size, file_size);
+
+	/* update the in-memory model of what the file should contain */
+	gendata(original_buf, good_buf, offset, size);
+	if (file_size < offset + size) {
+		/* a write starting past EOF leaves a hole: expect zeros */
+		if (file_size < offset)
+			bzero(good_buf + file_size, offset - file_size);
+		file_size = offset + size;
+		/* in lite mode the file size is not supposed to change */
+		if (lite) {
+			warn("Lite file size bug in fsx!");
+			report_failure(149);
+		}
+	}
+
+	/* still in the simulated prefix of the run: model only, no I/O */
+	if (testcalls <= simulatedopcount)
+		return;
+
+	/* same progress/debug print condition as used for reads */
+	if (!quiet &&
+		((progressinterval && testcalls % progressinterval == 0) ||
+		       (debug &&
+		       (monitorstart == -1 ||
+			(offset + size > monitorstart &&
+			(monitorend == -1 || offset <= monitorend))))))
+		prt("%lu write\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
+		    offset, offset + size - 1, size);
+	ret = lseek(fd, (off_t)offset, SEEK_SET);
+	if (ret == (off_t)-1) {
+		prterr("dowrite: lseek");
+		report_failure(150);
+	}
+	iret = fsxwrite(fd, good_buf + offset, size, offset);
+	if (iret != size) {
+		/* iret is unsigned, so -1 is converted to unsigned here */
+		if (iret == -1)
+			prterr("dowrite: write");
+		else
+			prt("short write: 0x%x bytes instead of 0x%x\n",
+			    iret, size);
+		report_failure(151);
+	}
+	if (do_fsync) {
+		if (fsync(fd)) {
+			prt("fsync() failed: %s\n", strerror(errno));
+			report_failure(152);
+		}
+	}
+}
+
+
+/*
+ * Perform (or merely simulate) one write through a shared memory mapping.
+ *
+ * offset: requested file offset; rounded down to a multiple of writebdy.
+ * size:   requested length in bytes.
+ *
+ * The expected file image in good_buf and the tracked file_size are
+ * updated first, even for simulated operations.  Only when testcalls is
+ * past simulatedopcount is the file touched: it is extended with
+ * ftruncate() if the write grows it, then the range is mapped, filled
+ * from good_buf, synced and unmapped.  Problems are passed to
+ * report_failure() (codes 200-204), except a failing ftruncate() which
+ * exits with status 201.
+ */
+void
+domapwrite(unsigned offset, unsigned size)
+{
+	unsigned pg_offset;
+	unsigned map_size;
+	off_t    cur_filesize;
+	char    *p;
+
+	offset -= offset % writebdy;
+	if (size == 0) {
+		if (!quiet && testcalls > simulatedopcount)
+			prt("skipping zero size write\n");
+		log4(OP_SKIPPED, OP_MAPWRITE, offset, size);
+		return;
+	}
+	/* remember the pre-write size to know whether to extend the file */
+	cur_filesize = file_size;
+
+	log4(OP_MAPWRITE, offset, size, 0);
+
+	/* update the in-memory model of what the file should contain */
+	gendata(original_buf, good_buf, offset, size);
+	if (file_size < offset + size) {
+		/* a write starting past EOF leaves a hole: expect zeros */
+		if (file_size < offset)
+			bzero(good_buf + file_size, offset - file_size);
+		file_size = offset + size;
+		/* in lite mode the file size is not supposed to change */
+		if (lite) {
+			warn("Lite file size bug in fsx!");
+			report_failure(200);
+		}
+	}
+
+	/* still in the simulated prefix of the run: model only, no I/O */
+	if (testcalls <= simulatedopcount)
+		return;
+
+	if (!quiet &&
+		((progressinterval && testcalls % progressinterval == 0) ||
+		       (debug &&
+		       (monitorstart == -1 ||
+			(offset + size > monitorstart &&
+			(monitorend == -1 || offset <= monitorend))))))
+		prt("%lu mapwrite\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
+		    offset, offset + size - 1, size);
+
+	/* storing through a mapping cannot grow the file; extend it first */
+	if (file_size > cur_filesize) {
+	        if (ftruncate(fd, file_size) == -1) {
+		        prterr("domapwrite: ftruncate");
+			exit(201);
+		}
+	}
+	/*
+	 * The mapping starts at (offset - pg_offset) and the data is stored
+	 * pg_offset bytes into it.
+	 * NOTE(review): this presumes PAGE_MASK (defined elsewhere) is
+	 * "page size - 1" -- confirm against its definition.
+	 */
+	pg_offset = offset & PAGE_MASK;
+	map_size  = pg_offset + size;
+
+	if ((p = (char *)mmap(0, map_size, PROT_READ | PROT_WRITE,
+			      MAP_FILE | MAP_SHARED, fd,
+			      (off_t)(offset - pg_offset))) == (char *)-1) {
+	        prterr("domapwrite: mmap");
+		report_failure(202);
+	}
+	memcpy(p + pg_offset, good_buf + offset, size);
+	/*
+	 * POSIX requires exactly one of MS_SYNC / MS_ASYNC; a flags value
+	 * of 0 is rejected on some systems.  MS_SYNC makes the call wait
+	 * until the mapped data has been written back.
+	 */
+	if (msync(p, map_size, MS_SYNC) != 0) {
+		prterr("domapwrite: msync");
+		report_failure(203);
+	}
+	if (munmap(p, map_size) != 0) {
+		prterr("domapwrite: munmap");
+		report_failure(204);
+	}
+}
+
+
+/*
+ * Perform (or merely simulate) truncating the file to "size" bytes.
+ *
+ * size is first rounded down to a multiple of truncbdy.  The tracked
+ * file_size is always updated, and when the file grows the new tail of
+ * good_buf is zeroed.  The real ftruncate() is only issued once testcalls
+ * is past simulatedopcount; a failure there goes to report_failure(160).
+ */
+void
+dotruncate(unsigned size)
+{
+	int	prev_size = file_size;
+	int	at_interval;
+	int	monitored;
+
+	size -= size % truncbdy;
+	if (size > biggest) {
+		biggest = size;
+		if (!quiet && testcalls > simulatedopcount)
+			prt("truncating to largest ever: 0x%x\n", size);
+	}
+
+	log4(OP_TRUNCATE, size, (unsigned)file_size, 0);
+
+	/* growing the file: the newly exposed range must read as zeros */
+	if (size > file_size)
+		bzero(good_buf + file_size, size - file_size);
+	file_size = size;
+
+	/* still in the simulated prefix of the run: model only, no I/O */
+	if (testcalls <= simulatedopcount)
+		return;
+
+	at_interval = progressinterval && testcalls % progressinterval == 0;
+	monitored = debug && (monitorstart == -1 || monitorend == -1 ||
+			      size <= monitorend);
+	if (at_interval || monitored)
+		prt("%lu trunc\tfrom 0x%x to 0x%x\n", testcalls, prev_size, size);
+
+	if (ftruncate(fd, (off_t)size) == -1) {
+		prt("ftruncate1: %x\n", size);
+		prterr("dotruncate: ftruncate");
+		report_failure(160);
+	}
+}
+
+
+/*
+ * Write the whole expected image, good_buf[0 .. file_size), to the start
+ * of the file and, unless running in lite mode, truncate the file to
+ * exactly file_size bytes.
+ *
+ * Seek, write and truncate problems are passed to report_failure()
+ * (codes 171, 172 and 173 respectively).
+ */
+void
+writefileimage()
+{
+	ssize_t iret;
+
+	if (lseek(fd, (off_t)0, SEEK_SET) == (off_t)-1) {
+		prterr("writefileimage: lseek");
+		report_failure(171);
+	}
+	iret = write(fd, good_buf, file_size);
+	if ((off_t)iret != file_size) {
+		if (iret == -1)
+			prterr("writefileimage: write");
+		else
+			/*
+			 * %x consumes an unsigned int; passing the ssize_t
+			 * directly would misread the variadic arguments on
+			 * platforms where ssize_t is wider than int.
+			 */
+			prt("short write: 0x%x bytes instead of 0x%qx\n",
+			    (unsigned)iret, (unsigned long long)file_size);
+		report_failure(172);
+	}
+	/* lite mode must never change the file size, so skip the truncate */
+	if (!lite && ftruncate(fd, file_size) == -1) {
+	        prt("ftruncate2: %qx\n", (unsigned long long)file_size);
+		prterr("writefileimage: ftruncate");
+		report_failure(173);
+	}
+}
+
+
+/*
+ * Close the test file and immediately reopen it read/write (keeping the
+ * o_direct flag).  Does nothing while testcalls <= simulatedopcount.
+ * A failing close() or open() goes to report_failure() (180 / 181).
+ */
+void
+docloseopen(void)
+{
+	if (testcalls > simulatedopcount) {
+		if (debug)
+			prt("%lu close/open\n", testcalls);
+
+		if (close(fd) != 0) {
+			prterr("docloseopen: close");
+			report_failure(180);
+		}
+
+		fd = open(fname, O_RDWR|o_direct, 0);
+		if (fd < 0) {
+			prterr("docloseopen: open");
+			report_failure(181);
+		}
+	}
+}
+
+
+/*
+ * Run one randomly chosen operation against the test file.
+ *
+ * A random value picks the operation (read, write, mapped read, truncate
+ * or mapped write) and further random values pick its offset and size.
+ * Afterwards the file size is optionally verified with check_size() and
+ * the file is optionally closed and reopened.  testcalls is incremented
+ * once per call.
+ */
+void
+test(void)
+{
+	unsigned long	offset;
+	unsigned long	size = maxoplen;
+	unsigned long	rv = random();
+	/* 3 base ops, plus truncate unless lite, plus mapwrite if enabled */
+	unsigned long	op = rv % (3 + !lite + mapped_writes);
+
+        /* turn off the map read if necessary */
+
+        if (op == 2 && !mapped_reads)
+            op = 0;
+
+	/*
+	 * The simulated prefix of the run has just ended: put the expected
+	 * image on disk before the first real operation.
+	 */
+	if (simulatedopcount > 0 && testcalls == simulatedopcount)
+		writefileimage();
+
+	testcalls++;
+
+	/*
+	 * Decide whether to close/reopen after this operation, reusing rv.
+	 * NOTE(review): presumably tuned so the chance is about 1 in
+	 * closeprob for a 31-bit random() value -- confirm.
+	 */
+	if (closeprob)
+		closeopen = (rv >> 3) < (1 << 28) / closeprob;
+
+	if (debugstart > 0 && testcalls >= debugstart)
+		debug = 1;
+
+	if (!quiet && testcalls < simulatedopcount && testcalls % 100000 == 0)
+		prt("%lu...\n", testcalls);
+
+	/*
+	 * READ:	op = 0
+	 * WRITE:	op = 1
+	 * MAPREAD:     op = 2
+	 * TRUNCATE:	op = 3
+	 * MAPWRITE:    op = 3 or 4
+	 */
+	if (lite ? 0 : op == 3 && (style & 1) == 0) /* vanilla truncate? */
+		dotruncate(random() % maxfilelen);
+	else {
+		if (randomoplen)
+			size = random() % (maxoplen+1);
+		/* truncate with (style & 1) set: new size bounded by oplen */
+		if (lite ? 0 : op == 3)
+			dotruncate(size);
+		else {
+			offset = random();
+			/* writes: mapwrite is op 3 in lite mode, else op 4 */
+			if (op == 1 || op == (lite ? 3 : 4)) {
+				/* writes may land anywhere below maxfilelen */
+				offset %= maxfilelen;
+				if (offset + size > maxfilelen)
+					size = maxfilelen - offset;
+				if (op != 1)
+					domapwrite(offset, size);
+				else
+					dowrite(offset, size);
+			} else {
+				/* reads are clamped to the current file */
+				if (file_size)
+					offset %= file_size;
+				else
+					offset = 0;
+				if (offset + size > file_size)
+					size = file_size - offset;
+				if (op != 0)
+					domapread(offset, size);
+				else
+					doread(offset, size);
+			}
+		}
+	}
+	if (sizechecks && testcalls > simulatedopcount)
+		check_size();
+	if (closeopen)
+		docloseopen();
+}
+
+
+/*
+ * Print the signal number (when non-zero) and the number of test calls
+ * made so far, then exit using sig as the exit status.  main() installs
+ * this function as a signal handler.
+ */
+void
+cleanup(int sig)
+{
+	if (sig != 0)
+		prt("signal %d\n", sig);
+
+	prt("testcalls = %lu\n", testcalls);
+	exit(sig);
+}
+
+
+/*
+ * Print the command line synopsis and per-option help on stdout, then
+ * exit with status 90.  Never returns.
+ *
+ * The text is kept in step with the getopt() string in main(): every
+ * accepted option, including -f and -R, is listed.
+ */
+void
+usage(void)
+{
+	fprintf(stdout, "usage: %s",
+		"fsx [-dfnqALORWZ] [-b opnum] [-c Prob] [-l flen] [-m start:end] [-o oplen] [-p progressinterval] [-r readbdy] [-s style] [-t truncbdy] [-w writebdy] [-D startingop] [-N numops] [-P dirpath] [-S seed] fname\n"
+		"\t-b opnum: beginning operation number (default 1)\n"
+		"\t-c P: 1 in P chance of file close+open at each op (default infinity)\n"
+		"\t-d: debug output for all operations\n"
+		"\t-f: fsync the file after each write\n"
+		"\t-l flen: the upper bound on file size (default 262144)\n"
+		"\t-m start:end: monitor (print debug output) specified byte range (default 0:infinity)\n"
+		"\t-n: no verifications of file size\n"
+		"\t-o oplen: the upper bound on operation size (default 65536)\n"
+		"\t-p progressinterval: debug output at specified operation interval\n"
+		"\t-q: quieter operation\n"
+		"\t-r readbdy: 4096 would make reads page aligned (default 1)\n"
+		"\t-s style: 1 gives smaller truncates (default 0)\n"
+		"\t-t truncbdy: 4096 would make truncates page aligned (default 1)\n"
+		"\t-w writebdy: 4096 would make writes page aligned (default 1)\n"
+		"\t-A: Use the AIO system calls\n"
+		"\t-D startingop: debug output starting at specified operation\n"
+		"\t-L: fsxLite - no file creations & no file size changes\n"
+		"\t-N numops: total # operations to do (default infinity)\n"
+		"\t-O: use oplen (see -o flag) for every op (default random)\n"
+		"\t-P dirpath: save .fsxlog and .fsxgood files in dirpath (default ./)\n"
+		"\t-S seed: for random # generator (default 1) 0 gets timestamp\n"
+		"\t-W: mapped write operations DISabled\n"
+		"\t-R: read() system calls only (mapped reads disabled)\n"
+		"\t-Z: O_DIRECT (use -R, -W, -r and -w too)\n"
+		"\tfname: this filename is REQUIRED (no default)\n");
+	exit(90);
+}
+
+
+/*
+ * Parse a number with an optional one-letter size suffix.
+ *
+ * s: text to parse; the base is auto-detected by strtol() (base 0).
+ * e: out parameter, set to the first character after the number and its
+ *    suffix, if a suffix was recognised.
+ *
+ * Recognised suffixes (either case): b = x512, k = x1024,
+ * m = x1024*1024, w = x4.  Returns the scaled value as an int.
+ */
+int
+getnum(char *s, char **e)
+{
+	int	value;
+	int	scale = 0;
+
+	*e = (char *) 0;
+	value = strtol(s, e, 0);
+	if (*e == (char *) 0)
+		return (value);
+
+	switch (**e) {
+	case 'b':
+	case 'B':
+		scale = 512;
+		break;
+	case 'k':
+	case 'K':
+		scale = 1024;
+		break;
+	case 'm':
+	case 'M':
+		scale = 1024*1024;
+		break;
+	case 'w':
+	case 'W':
+		scale = 4;
+		break;
+	}
+
+	/* a recognised suffix scales the value and is consumed */
+	if (scale != 0) {
+		value *= scale;
+		*e = *e + 1;
+	}
+	return (value);
+}
+
+#ifdef AIO
+
+#define QSZ     1024
+io_context_t	io_ctx;
+struct iocb 	iocb;
+
+/*
+ * Create the AIO context (io_ctx) with room for QSZ events.
+ *
+ * Returns 0 on success.  On failure a message is printed to stderr and
+ * -1 is returned.
+ */
+int aio_setup()
+{
+	int ret;
+	ret = io_queue_init(QSZ, &io_ctx);
+	if (ret != 0) {
+		/* libaio reports errors as negative errno values */
+		fprintf(stderr, "aio_setup: io_queue_init failed: %s\n",
+                        strerror(ret < 0 ? -ret : ret));
+                return(-1);
+        }
+        return(0);
+}
+
+/*
+ * Issue one read or write through the AIO interface and wait for it.
+ *
+ * rw:     READ for a read; any other value is treated as a write.
+ * fd:     file descriptor to operate on.
+ * buf:    data buffer.
+ * len:    number of bytes to transfer.
+ * offset: file offset of the transfer.
+ *
+ * A single iocb is submitted on io_ctx and the completion is awaited for
+ * up to 30 seconds.  Returns the event's result (the number of bytes
+ * transferred on success), or -1 if submission or event collection fails.
+ * A transfer length that differs from len is reported on stderr but
+ * still returned so that the caller can judge it.
+ */
+int
+__aio_rw(int rw, int fd, char *buf, unsigned len, unsigned offset)
+{
+	struct io_event event;
+	static struct timespec ts;
+	struct iocb *iocbs[] = { &iocb };
+	int ret;
+
+	if (rw == READ) {
+		io_prep_pread(&iocb, fd, buf, len, offset);
+	} else {
+		io_prep_pwrite(&iocb, fd, buf, len, offset);
+	}
+
+	ts.tv_sec = 30;
+	ts.tv_nsec = 0;
+	ret = io_submit(io_ctx, 1, iocbs);
+	if (ret != 1) {
+		/* libaio reports errors as negative errno values */
+		fprintf(stderr, "errcode=%d\n", ret);
+		fprintf(stderr, "aio_rw: io_submit failed: %s\n",
+				ret < 0 ? strerror(-ret) : "nothing submitted");
+		return(-1);
+	}
+
+	ret = io_getevents(io_ctx, 1, 1, &event, &ts);
+	if (ret != 1) {
+		/* 0 events means the 30 second timeout expired */
+		fprintf(stderr, "errcode=%d\n", ret);
+		fprintf(stderr, "aio_rw: io_getevents failed: %s\n",
+				 ret < 0 ? strerror(-ret) : "timed out");
+		return -1;
+	}
+	if (len != event.res) {
+		fprintf(stderr, "bad %s length: %lu instead of %u\n",
+				rw == READ ? "read" : "write",
+				event.res, len);
+	}
+	return event.res;
+}
+
+/*
+ * Transfer len bytes between buf and fd, either through the AIO path
+ * (when the aio flag is set) or with a plain read()/write() at the
+ * current file position.  rw == READ selects a read, anything else a
+ * write.  Returns whatever the chosen primitive returns.
+ */
+int aio_rw(int rw, int fd, char *buf, unsigned len, unsigned offset)
+{
+	if (aio)
+		return __aio_rw(rw, fd, buf, len, offset);
+
+	/* synchronous path: offset is unused, the caller has already seeked */
+	if (rw == READ)
+		return read(fd, buf, len);
+
+	return write(fd, buf, len);
+}
+
+#endif
+
+/*
+ * fsx entry point: parse the options, open the test file plus the
+ * ".fsxgood" and ".fsxlog" side files, allocate the comparison buffers and
+ * then call test() until numops operations have been run (forever when
+ * numops is -1).
+ *
+ * Exit status: 0 when every operation completed, 90 for usage errors
+ * (via usage()), 91-98 for setup failures; failures during the run are
+ * handled by report_failure().
+ */
+int
+main(int argc, char **argv)
+{
+	/*
+	 * Note: there is deliberately no local "style" here.  test() reads
+	 * the file-scope variable, so a local of the same name would
+	 * swallow the value given with -s.
+	 */
+	int	i, ch;
+	int	n;
+	size_t	len;
+	char	*endp;
+	char goodfile[1024];
+	char logfile[1024];
+
+	goodfile[0] = 0;
+	logfile[0] = 0;
+
+	setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
+
+	/* getopt() signals the end of the options with -1 */
+	while ((ch = getopt(argc, argv, "b:c:dfl:m:no:p:qr:s:t:w:AD:LN:OP:RS:WZ"))
+	       != -1)
+		switch (ch) {
+		case 'b':
+			simulatedopcount = getnum(optarg, &endp);
+			if (!quiet)
+				fprintf(stdout, "Will begin at operation %ld\n",
+					simulatedopcount);
+			if (simulatedopcount == 0)
+				usage();
+			simulatedopcount -= 1;
+			break;
+		case 'c':
+			closeprob = getnum(optarg, &endp);
+			if (!quiet)
+				fprintf(stdout,
+					"Chance of close/open is 1 in %d\n",
+					closeprob);
+			if (closeprob <= 0)
+				usage();
+			break;
+		case 'd':
+			debug = 1;
+			break;
+		case 'f':
+			do_fsync = 1;
+			break;
+		case 'l':
+			maxfilelen = getnum(optarg, &endp);
+			if (maxfilelen <= 0)
+				usage();
+			break;
+		case 'm':
+			monitorstart = getnum(optarg, &endp);
+			if (monitorstart < 0)
+				usage();
+			if (!endp || *endp++ != ':')
+				usage();
+			monitorend = getnum(endp, &endp);
+			if (monitorend < 0)
+				usage();
+			if (monitorend == 0)
+				monitorend = -1; /* aka infinity */
+			debug = 1;
+			/* do not fall into -n: monitoring keeps size checks */
+			break;
+		case 'n':
+			sizechecks = 0;
+			break;
+		case 'o':
+			maxoplen = getnum(optarg, &endp);
+			if (maxoplen <= 0)
+				usage();
+			break;
+		case 'p':
+			progressinterval = getnum(optarg, &endp);
+			if (progressinterval < 0)
+				usage();
+			break;
+		case 'q':
+			quiet = 1;
+			break;
+		case 'r':
+			readbdy = getnum(optarg, &endp);
+			if (readbdy <= 0)
+				usage();
+			break;
+		case 's':
+			style = getnum(optarg, &endp);
+			if (style < 0 || style > 1)
+				usage();
+			break;
+		case 't':
+			truncbdy = getnum(optarg, &endp);
+			if (truncbdy <= 0)
+				usage();
+			break;
+		case 'w':
+			writebdy = getnum(optarg, &endp);
+			if (writebdy <= 0)
+				usage();
+			break;
+		case 'A':
+		        aio = 1;
+			break;
+		case 'D':
+			debugstart = getnum(optarg, &endp);
+			if (debugstart < 1)
+				usage();
+			break;
+		case 'L':
+		        lite = 1;
+			break;
+		case 'N':
+			numops = getnum(optarg, &endp);
+			if (numops < 0)
+				usage();
+			break;
+		case 'O':
+			randomoplen = 0;
+			break;
+		case 'P':
+			/*
+			 * Both side files live in "<dirpath>/".  snprintf
+			 * always terminates and reports truncation, unlike
+			 * the strncpy/strcat pair it replaces.
+			 */
+			n = snprintf(goodfile, sizeof(goodfile), "%s/", optarg);
+			if (n < 0 || (size_t)n >= sizeof(goodfile)) {
+				fprintf(stderr, "-P dirpath is too long\n");
+				usage();
+			}
+			/* logfile has the same capacity as goodfile */
+			strcpy(logfile, goodfile);
+			break;
+                case 'R':
+                        mapped_reads = 0;
+                        break;
+		case 'S':
+                        seed = getnum(optarg, &endp);
+			if (seed == 0)
+				seed = time(0) % 10000;
+			if (!quiet)
+				fprintf(stdout, "Seed set to %d\n", seed);
+			if (seed < 0)
+				usage();
+			break;
+		case 'W':
+		        mapped_writes = 0;
+			if (!quiet)
+				fprintf(stdout, "mapped writes DISABLED\n");
+			break;
+		case 'Z':
+			o_direct = O_DIRECT;
+			break;
+		default:
+			usage();
+			/* NOTREACHED */
+		}
+	argc -= optind;
+	argv += optind;
+	if (argc != 1)
+		usage();
+	fname = argv[0];
+
+	signal(SIGHUP,	cleanup);
+	signal(SIGINT,	cleanup);
+	signal(SIGPIPE,	cleanup);
+	signal(SIGALRM,	cleanup);
+	signal(SIGTERM,	cleanup);
+	signal(SIGXCPU,	cleanup);
+	signal(SIGXFSZ,	cleanup);
+	signal(SIGVTALRM,	cleanup);
+	signal(SIGUSR1,	cleanup);
+	signal(SIGUSR2,	cleanup);
+
+	initstate(seed, state, 256);
+	setstate(state);
+	/* lite mode works on an existing file and must not recreate it */
+	fd = open(fname,
+		O_RDWR|(lite ? 0 : O_CREAT|O_TRUNC)|o_direct, 0666);
+	if (fd < 0) {
+		prterr(fname);
+		exit(91);
+	}
+
+	/*
+	 * Side file names are "<dirpath/>" + at most 256 characters of
+	 * fname + suffix; refuse names that do not fit the buffer.
+	 */
+	len = strlen(goodfile);
+	n = snprintf(goodfile + len, sizeof(goodfile) - len,
+		     "%.256s.fsxgood", fname);
+	if (n < 0 || (size_t)n >= sizeof(goodfile) - len) {
+		fprintf(stderr, "%s: .fsxgood file name is too long\n", fname);
+		exit(92);
+	}
+	fsxgoodfd = open(goodfile, O_RDWR|O_CREAT|O_TRUNC, 0666);
+	if (fsxgoodfd < 0) {
+		prterr(goodfile);
+		exit(92);
+	}
+	len = strlen(logfile);
+	n = snprintf(logfile + len, sizeof(logfile) - len,
+		     "%.256s.fsxlog", fname);
+	if (n < 0 || (size_t)n >= sizeof(logfile) - len) {
+		fprintf(stderr, "%s: .fsxlog file name is too long\n", fname);
+		exit(93);
+	}
+	fsxlogf = fopen(logfile, "w");
+	if (fsxlogf == NULL) {
+		prterr(logfile);
+		exit(93);
+	}
+
+#ifdef AIO
+	/* aio_setup() prints its own diagnostic; AIO ops cannot work without it */
+	if (aio && aio_setup() != 0)
+		exit(97);
+#endif
+
+	if (lite) {
+		off_t ret;
+		/* in lite mode the existing file size is the size limit */
+		file_size = maxfilelen = lseek(fd, (off_t)0, L_XTND);
+		if (file_size == (off_t)-1) {
+			prterr(fname);
+			warn("main: lseek eof");
+			exit(94);
+		}
+		ret = lseek(fd, (off_t)0, SEEK_SET);
+		if (ret == (off_t)-1) {
+			prterr(fname);
+			warn("main: lseek 0");
+			exit(95);
+		}
+	}
+	original_buf = (char *) malloc(maxfilelen);
+	/* malloc(0) may legitimately return NULL, so only fail for real sizes */
+	if (original_buf == NULL && maxfilelen > 0) {
+		prterr("main: malloc original_buf");
+		exit(96);
+	}
+	for (i = 0; i < maxfilelen; i++)
+		original_buf[i] = random() % 256;
+	/* the extra writebdy/readbdy bytes leave room for round_up() */
+	good_buf = (char *) malloc(maxfilelen + writebdy);
+	if (good_buf == NULL) {
+		prterr("main: malloc good_buf");
+		exit(96);
+	}
+	good_buf = round_up(good_buf, writebdy, 0);
+	bzero(good_buf, maxfilelen);
+	temp_buf = (char *) malloc(maxoplen + readbdy);
+	if (temp_buf == NULL) {
+		prterr("main: malloc temp_buf");
+		exit(96);
+	}
+	temp_buf = round_up(temp_buf, readbdy, 0);
+	bzero(temp_buf, maxoplen);
+	if (lite) {	/* zero entire existing file */
+		ssize_t written;
+
+		written = write(fd, good_buf, (size_t)maxfilelen);
+		if (written != maxfilelen) {
+			if (written == -1) {
+				prterr(fname);
+				warn("main: error on write");
+			} else
+				warn("main: short write, 0x%x bytes instead "
+					"of 0x%lx\n",
+					(unsigned)written,
+					maxfilelen);
+			exit(98);
+		}
+	} else
+		check_trunc_hack();
+
+	/* numops == -1 means run until interrupted */
+	while (numops == -1 || numops--)
+		test();
+
+	if (close(fd)) {
+		prterr("close");
+		report_failure(99);
+	}
+	prt("All operations completed A-OK!\n");
+
+	exit(0);
+	return 0;
+}

Added: trunk/commit-tests/tests/fsx/test.sh
===================================================================
--- trunk/commit-tests/tests/fsx/test.sh	2005-04-13 20:58:03 UTC (rev 2140)
+++ trunk/commit-tests/tests/fsx/test.sh	2005-04-13 21:04:19 UTC (rev 2141)
@@ -0,0 +1,28 @@
+# Run fsx against $CT_DIR/fsxfile: plain read/write first, then with AIO.
+. "$CT_FUNCTIONS"
+
+PATH="$PATH:$CT_TST"
+
+if ! command -v fsx > /dev/null 2>&1; then
+	invalid "couldn't find fsx binary in my path"
+fi
+# $args is expanded unquoted below on purpose so it splits into fsx options.
+args="-S $RANDOM -R -W -N 1000 -p 200 $CT_DIR/fsxfile"
+out "first regular read/write file io ($args)"
+fsx $args || fail "fsx returned $?"
+
+args="-A $args"
+out "this time including aio ($args)"
+fsx $args || fail "fsx returned $?"
+
+# can't do this yet because O_DIRECT extending doesn't zero
+# the sparse regions it creates.  fsx will read them and get garbage
+# and complain.
+#
+#pagesize=$(getconf PAGE_SIZE) || invalid "getconf PAGE_SIZE failed"
+#
+#args="-Z -r $pagesize -w $pagesize $args"
+#out "this time including page-sized DIO ($args)"
+#fsx $args || fail "fsx returned $?"
+
+pass "all combinations exited ok"