[Ocfs2-test-devel] [PATCH 3/3] Ocfs2-test: Add rest of multi-nodes testcases to multiple_run.sh.
tristan.ye
tristan.ye at oracle.com
Fri Jan 9 18:51:45 PST 2009
Marcos, Sunil,
Thanks for your explanations. That's really a good point, to strike a
balance between I/O workload and parallelism intensity. During
multi-node execution of the buildkernel test, we can limit the
workdir_nums on each node to 2 (two kernels being built per node) and
increase the parallelism of the build jobs (to more than 2) to expose
the races.
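
As a rough sketch of the idea (illustrative only: dir_nums is the
variable from the patch, while the cpu-count calculation and the make
invocation stand in for whatever run_buildkernel.py ends up driving on
each node):

    # Keep the on-disk contention modest: two kernel trees per node.
    declare -i dir_nums=2

    # Push the CPU-side parallelism instead, doubling the CPU count
    # for make -j as Sunil suggests.
    jobs=$(( $(grep -c '^processor' /proc/cpuinfo) * 2 ))

    # Hypothetical build step inside one work directory; the tree path
    # depends on the tarfile passed via -t.
    make -C ${MOUNT_POINT}/build_kernel_dir1/linux-2.6.28 -j ${jobs}
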
Regards,
Tristan
Sunil Mushran wrote:
> Marcos has a good point. Having lots of processes waiting on io is
> not an efficient way to test the fs. The trick is to find a balance...
> in which we get enough contention to expose races, but not more than
> what just leads to processes waiting on the ios to complete.
>
> So when I run build kernel by hand, I use -j to parallelize the build.
> Set to double the number of cpus.
>
> tristan.ye wrote:
>> Marcos E. Matsunaga wrote:
>>> Comments inline.
>>> Regards,
>>>
>>> Marcos Eduardo Matsunaga
>>>
>>> Oracle USA
>>> Linux Engineering
>>>
>>> “The statements and opinions expressed here are my own and do not
>>> necessarily represent those of Oracle Corporation.”
>>>
>>>
>>> Tristan Ye wrote:
>>>> After Marcos moved all the rest of the multi-node testcases from
>>>> lam-mpi to openmpi, we decided to add these tests to the multi-node
>>>> testing launcher (multiple_run.sh). They are:
>>>>
>>>> 1. write_torture
>>>>
>>>> 2. build_kernel
>>>>
>>>> 3. cross_delete
>>>>
>>>> 4. recovery_load
>>>>
>>>> 5. fork_writer
>>>>
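>>>> With the new -t switch (used by build_kernel, cross_delete and
>>>> recovery_load to locate the kernel tarball), a full run looks like
>>>> the usage example further down in the patch:
>>>>
>>>>     multiple_run.sh -n node1.us.oracle.com,node2.us.oracle.com \
>>>>         -d /dev/sdd1 -t /linux-2.6/linux-2.6.28.tgz /storage
>>>>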
>>>> Signed-off-by: Tristan Ye <tristan.ye at oracle.com>
>>>> ---
>>>>  programs/python_common/multiple_run.sh |  275 +++++++++++++++++++++++++++++++-
>>>>  1 files changed, 272 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/programs/python_common/multiple_run.sh b/programs/python_common/multiple_run.sh
>>>> index 7b1ac1e..3ed6f82 100755
>>>> --- a/programs/python_common/multiple_run.sh
>>>> +++ b/programs/python_common/multiple_run.sh
>>>> @@ -38,6 +38,7 @@ REMOTE_UMOUNT_BIN="${BINDIR}/remote_umount.py"
>>>>
>>>> NODE_LIST=
>>>> DEVICE_NAME=
>>>> +TAR_FILE=
>>>> MOUNT_POINT=
>>>>
>>>> ################################################################################
>>>>
>>>> @@ -45,13 +46,14 @@ MOUNT_POINT=
>>>> ################################################################################
>>>>
>>>> f_usage()
>>>> {
>>>> - echo "usage: `basename ${0}` <-n nodes> <-d device>
>>>> <mountpoint path>"
>>>> + echo "usage: `basename ${0}` <-n nodes> <-d device> <-t
>>>> tarfile> <mountpoint path>"
>>>> echo " -n nodelist,should be comma separated."
>>>> echo " -d device name used for ocfs2 volume."
>>>> + echo " -t full path of kernel tarfile for test"
>>>> echo " <mountpoint path> path of mountpoint where test
>>>> will be performed."
>>>> echo echo "Eaxamples:"
>>>> - echo " `basename ${0}` -n
>>>> node1.us.oracle.com,node2.us.oracle.com -d /dev/sdd1 /storage"
>>>> + echo " `basename ${0}` -n
>>>> node1.us.oracle.com,node2.us.oracle.com -d /dev/sdd1 -t
>>>> /linux-2.6/linux-2.6.28.tgz /storage"
>>>> exit 1;
>>>>
>>>> }
>>>> @@ -63,10 +65,11 @@ f_getoptions()
>>>> exit 1
>>>> fi
>>>>
>>>> - while getopts "n:d:h:" options; do
>>>> + while getopts "n:d:h:t:" options; do
>>>> case $options in
>>>> n ) NODE_LIST="$OPTARG";;
>>>> d ) DEVICE_NAME="$OPTARG";;
>>>> + t ) TAR_FILE="$OPTARG";;
>>>> h ) f_usage
>>>> exit 1;;
>>>> * ) f_usage
>>>> @@ -162,6 +165,272 @@ run_inline_test()
>>>> }
>>>>
>>>> +run_write_torture()
>>>> +{
>>>> +	LogMsg "write-torture-test"
>>>> +
>>>> +	local logdir=${O2TDIR}/log/write_torture_log
>>>> +	local logfile=${logdir}/write_torture_${DATE}.log
>>>> +
>>>> +	local workdir=${MOUNT_POINT}/write_torture_test
>>>> +	local testfile=write_torture_test_file
>>>> +
>>>> +	mkdir -p ${logdir}
>>>> +	chmod 777 ${logdir}
>>>> +	touch ${logfile}
>>>> +	chmod 777 ${logfile}
>>>> +
>>>> +	#force to umount volume from all nodes
>>>> +	${ECHO} "Try to umount volume from all nodes before test."|tee -a ${logfile}
>>>> +	${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>>> +
>>>> +	CLUSTERSIZE=32k
>>>> +	BLOCKSIZE=4k
>>>> +	SLOTS=4
>>>> +	LABEL=ocfs2-write-torture-test
>>>> +
>>>> +	${ECHO} "Format volume to launch new test"|tee -a ${logfile}
>>>> +	echo y|${MKFS_BIN} -C ${CLUSTERSIZE} -b ${BLOCKSIZE} -N ${SLOTS} -L ${LABEL} ${DEVICE_NAME} || {
>>>> +		${ECHO} "Can not format ${DEVICE_NAME}"
>>>> +		return 1
>>>> +	}
>>>> +
>>>> +	${ECHO} "Mount volume to all nodes"|tee -a ${logfile}
>>>> +	${REMOTE_MOUNT_BIN} -l ${LABEL} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>>> +
>>>> +	${SUDO} chown -R ${USERNAME}:${GROUPNAME} ${MOUNT_POINT}
>>>> +	${SUDO} chmod -R 777 ${MOUNT_POINT}
>>>> +
>>>> +	mkdir -p ${workdir}
>>>> +	chmod 777 ${workdir}
>>>> +
>>>> +	${BINDIR}/run_write_torture.py -b 512,8096 -c 10 -d ${workdir} -f ${testfile} -l ${logfile} -n ${NODE_LIST} -p ${SLOTS} -s 60
>>>>
>>> I usually run this test in a loop of 10 (-c 10) for 1800 seconds (-s
>>> 1600). Using -s 60 is really short.
>> I'll trust your experience on this test.
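>>
>> So the launcher could call it with the longer runtime along these
>> lines (a sketch; -s 1800 follows the runtime you describe, the other
>> flags are unchanged from the patch):
>>
>>     ${BINDIR}/run_write_torture.py -b 512,8096 -c 10 -d ${workdir} \
>>         -f ${testfile} -l ${logfile} -n ${NODE_LIST} -p ${SLOTS} -s 1800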
>>
>>>> +
>>>> +	LogRC $?
>>>> +
>>>> +	rm -rf ${workdir}
>>>> +
>>>> +	${ECHO} "Umount volume from all nodes after test."|tee -a ${logfile}
>>>> +	${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>>> +}
>>>> +
>>>> +run_build_kernel()
>>>> +{
>>>> +	LogMsg "build-kernel-test"
>>>> +
>>>> +	local logdir=${O2TDIR}/log/build_kernel_log
>>>> +	local logfile=${logdir}/build_kernel_${DATE}.log
>>>> +
>>>> +	declare -i dir_nums=4
>>>> +	declare -a workdir
>>>> +	local workdir_list=""
>>>> +
>>>> +	workdir[0]=${dir_nums}
>>>> +	for i in $(seq ${dir_nums});do
>>>> +		workdir[${i}]=${MOUNT_POINT}/build_kernel_dir${i}
>>>> +		if [ "${i}" == "1" ];then
>>>> +			workdir_list="${workdir[${i}]}"
>>>> +		else
>>>> +			workdir_list="${workdir_list},${workdir[${i}]}"
>>>> +		fi
>>>> +	done
>>>>
>>> Not sure this is necessary. You're just creating a lot of contention
>>> on the same disk, as it creates a process per directory on each
>>> node, extracting a tar file and building the kernel. It is probably
>>> going to end up with more I/O waits than runtime. Maybe we can use
>>> this script as a sanity check and let the really heavy loads be run
>>> manually. I also like the fact that, running manually, I don't
>>> re-format the partition on every run. I have found problems that
>>> only came up because I was using old partitions with lots of tests
>>> on them.
>> I think this test is worth keeping in our automatic multi-node
>> launcher, as we can comfortably tune the workload (by specifying a
>> different workdir_nums) from light to heavy, so it can serve as
>> either a sanity check or a stress puncher.
>>
>> For the re-format issue, you may be pointing me in the right
>> direction. I intended to keep the tests well separated by formatting
>> the volume for each test, and to make sure each test is set up
>> properly in case different testcases have different block size and
>> cluster size requirements.
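>>
>> If we want both behaviors, one option is to guard the mkfs step in
>> each run_* function with an environment switch, something like this
>> sketch (the KEEP_VOLUME variable is made up, not part of the patch):
>>
>>     # Skip the re-format when the caller wants to reuse an aged volume.
>>     if [ -z "${KEEP_VOLUME}" ]; then
>>         echo y|${MKFS_BIN} -C ${CLUSTERSIZE} -b ${BLOCKSIZE} \
>>             -N ${SLOTS} -L ${LABEL} ${DEVICE_NAME}
>>     fi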
>>
>>>> +
>>>> +	mkdir -p ${logdir}
>>>> +	chmod 777 ${logdir}
>>>> +	touch ${logfile}
>>>> +	chmod 777 ${logfile}
>>>> +
>>>> +	#force to umount volume from all nodes
>>>> +	${ECHO} "Try to umount volume from all nodes before test."|tee -a ${logfile}
>>>> +	${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>>> +
>>>> +	CLUSTERSIZE=32k
>>>> +	BLOCKSIZE=4k
>>>> +	SLOTS=4
>>>> +	LABEL=ocfs2-build-kernel-test
>>>> +
>>>> +	${ECHO} "Format volume to launch new test"|tee -a ${logfile}
>>>> +	echo y|${MKFS_BIN} -C ${CLUSTERSIZE} -b ${BLOCKSIZE} -N ${SLOTS} -L ${LABEL} ${DEVICE_NAME} || {
>>>> +		${ECHO} "Can not format ${DEVICE_NAME}"
>>>> +		return 1
>>>> +	}
>>>> +
>>>> +	${ECHO} "Mount volume to all nodes"|tee -a ${logfile}
>>>> +	${REMOTE_MOUNT_BIN} -l ${LABEL} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>>> +
>>>> +	${SUDO} chown -R ${USERNAME}:${GROUPNAME} ${MOUNT_POINT}
>>>> +	${SUDO} chmod -R 777 ${MOUNT_POINT}
>>>> +
>>>> +	for i in $(seq ${dir_nums});do
>>>> +		mkdir -p ${workdir[${i}]}
>>>> +		chmod -R 777 ${workdir[${i}]}
>>>> +	done
>>>> +
>>>> +	${BINDIR}/run_buildkernel.py -c 10 -u ${USERNAME} -d ${workdir_list} -l ${logfile} -n ${NODE_LIST} -t ${TAR_FILE}
>>>> +
>>>> +	LogRC $?
>>>> +
>>>> +	for i in $(seq ${dir_nums});do
>>>> +		rm -rf ${workdir[${i}]}
>>>> +	done
>>>> +
>>>> +	${ECHO} "Umount volume from all nodes after test."|tee -a ${logfile}
>>>> +	${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>>> +}
>>>> +
>>>> +run_cross_delete()
>>>> +{
>>>> +	LogMsg "cross-delete-test"
>>>> +
>>>> +	local logdir=${O2TDIR}/log/cross_delete_log
>>>> +	local logfile=${logdir}/cross_delete_${DATE}.log
>>>> +
>>>> +	declare -i dir_nums=4
>>>> +	declare -a workdir
>>>> +	local workdir_list=""
>>>> +
>>>> +	workdir[0]=${dir_nums}
>>>> +	for i in $(seq ${dir_nums});do
>>>> +		workdir[${i}]=${MOUNT_POINT}/cross_delete_dir${i}
>>>> +		if [ "${i}" == "1" ];then
>>>> +			workdir_list="${workdir[${i}]}"
>>>> +		else
>>>> +			workdir_list="${workdir_list},${workdir[${i}]}"
>>>> +		fi
>>>> +	done
>>>>
>>> I think the same is valid here as it was for run_buildkernel.
>>>> +
>>>> +	mkdir -p ${logdir}
>>>> +	chmod 777 ${logdir}
>>>> +	touch ${logfile}
>>>> +	chmod 777 ${logfile}
>>>> +
>>>> +	#force to umount volume from all nodes
>>>> +	${ECHO} "Try to umount volume from all nodes before test."|tee -a ${logfile}
>>>> +	${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>>> +
>>>> +	CLUSTERSIZE=32k
>>>> +	BLOCKSIZE=4k
>>>> +	SLOTS=4
>>>> +	LABEL=ocfs2-cross-delete-test
>>>> +
>>>> +	${ECHO} "Format volume to launch new test"|tee -a ${logfile}
>>>> +	echo y|${MKFS_BIN} -C ${CLUSTERSIZE} -b ${BLOCKSIZE} -N ${SLOTS} -L ${LABEL} ${DEVICE_NAME} || {
>>>> +		${ECHO} "Can not format ${DEVICE_NAME}"
>>>> +		return 1
>>>> +	}
>>>> +
>>>> +	${ECHO} "Mount volume to all nodes"|tee -a ${logfile}
>>>> +	${REMOTE_MOUNT_BIN} -l ${LABEL} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>>> +
>>>> +	${SUDO} chown -R ${USERNAME}:${GROUPNAME} ${MOUNT_POINT}
>>>> +	${SUDO} chmod -R 777 ${MOUNT_POINT}
>>>> +
>>>> +	for i in $(seq ${dir_nums});do
>>>> +		mkdir -p ${workdir[${i}]}
>>>> +		chmod -R 777 ${workdir[${i}]}
>>>> +	done
>>>> +
>>>> +	${BINDIR}/cross_delete.py -c 10 -d ${workdir_list} -l ${logfile} -n ${NODE_LIST} -t ${TAR_FILE}
>>>> +
>>>> +	LogRC $?
>>>> +
>>>> +	for i in $(seq ${dir_nums});do
>>>> +		rm -rf ${workdir[${i}]}
>>>> +	done
>>>> +
>>>> +	${ECHO} "Umount volume from all nodes after test."|tee -a ${logfile}
>>>> +	${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>>> +}
>>>> +
>>>> +run_recovery_load()
>>>>
>>> This job is interactive, as it will stop and prompt you to hit enter.
>>> That's the sync point where it is ready to have nodes crashed. It
>>> has to be run manually.
>>
>> I'll trust your experience on this :-), and will not include this
>> testcase in the automatic multi-node launcher.
>>
>>>> +{
>>>> +	LogMsg "recovery-load-test"
>>>> +
>>>> +	local logdir=${O2TDIR}/log/recovery_load_log
>>>> +	local logfile=${logdir}/recover_load_${DATE}.log
>>>> +
>>>> +	local workdir=${MOUNT_POINT}/recovery_load_test_dir
>>>> +
>>>> +	mkdir -p ${logdir}
>>>> +	chmod 777 ${logdir}
>>>> +	touch ${logfile}
>>>> +	chmod 777 ${logfile}
>>>> +
>>>> +	#force to umount volume from all nodes
>>>> +	${ECHO} "Try to umount volume from all nodes before test."|tee -a ${logfile}
>>>> +	${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>>> +
>>>> +	CLUSTERSIZE=32k
>>>> +	BLOCKSIZE=4k
>>>> +	SLOTS=4
>>>> +	LABEL=ocfs2-recovery-load-test
>>>> +
>>>> +	${ECHO} "Format volume to launch new test"|tee -a ${logfile}
>>>> +	echo y|${MKFS_BIN} -C ${CLUSTERSIZE} -b ${BLOCKSIZE} -N ${SLOTS} -L ${LABEL} ${DEVICE_NAME} || {
>>>> +		${ECHO} "Can not format ${DEVICE_NAME}"
>>>> +		return 1
>>>> +	}
>>>> +
>>>> +	${ECHO} "Mount volume to all nodes"|tee -a ${logfile}
>>>> +	${REMOTE_MOUNT_BIN} -l ${LABEL} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>>> +
>>>> +	${SUDO} chown -R ${USERNAME}:${GROUPNAME} ${MOUNT_POINT}
>>>> +	${SUDO} chmod -R 777 ${MOUNT_POINT}
>>>> +
>>>> +	mkdir -p ${workdir}
>>>> +	chmod 777 ${workdir}
>>>> +
>>>> +	${BINDIR}/recovery_load.py -d ${workdir} --extract --find -l ${logfile} -n ${NODE_LIST} -t ${TAR_FILE}
>>>> +
>>>> +	LogRC $?
>>>> +
>>>> +	${ECHO} "Umount volume from all nodes after test."|tee -a ${logfile}
>>>> +	${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>>> +}
>>>> +
>>>> +run_fork_writer()
>>>> +{
>>>> +	LogMsg "fork-writer-test"
>>>> +
>>>> +	local logdir=${O2TDIR}/log/fork_writer_log
>>>> +	local logfile=${logdir}/fork_writer_${DATE}.log
>>>> +
>>>> +	local testfile=${MOUNT_POINT}/fork_writer_test_file
>>>> +
>>>> +	mkdir -p ${logdir}
>>>> +	chmod 777 ${logdir}
>>>> +	touch ${logfile}
>>>> +	chmod 777 ${logfile}
>>>> +
>>>> +	#force to umount volume from all nodes
>>>> +	${ECHO} "Try to umount volume from all nodes before test."|tee -a ${logfile}
>>>> +	${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>>> +
>>>> +	CLUSTERSIZE=32k
>>>> +	BLOCKSIZE=4k
>>>> +	SLOTS=4
>>>> +	LABEL=ocfs2-fork-writer-test
>>>> +
>>>> +	${ECHO} "Format volume to launch new test"|tee -a ${logfile}
>>>> +	echo y|${MKFS_BIN} -C ${CLUSTERSIZE} -b ${BLOCKSIZE} -N ${SLOTS} -L ${LABEL} ${DEVICE_NAME} || {
>>>> +		${ECHO} "Can not format ${DEVICE_NAME}"
>>>> +		return 1
>>>> +	}
>>>> +
>>>> +	${ECHO} "Mount volume to all nodes"|tee -a ${logfile}
>>>> +	${REMOTE_MOUNT_BIN} -l ${LABEL} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>>> +
>>>> +	${SUDO} chown -R ${USERNAME}:${GROUPNAME} ${MOUNT_POINT}
>>>> +	${SUDO} chmod -R 777 ${MOUNT_POINT}
>>>> +
>>>> +	${BINDIR}/run_forkwriter.py -c 10 -f ${testfile} -l ${logfile} -n ${NODE_LIST} -p ${SLOTS} -s 50000
>>>> +
>>>> +	LogRC $?
>>>> +
>>>> +	${ECHO} "Umount volume from all nodes after test."|tee -a ${logfile}
>>>> +	${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>>> +}
>>>> +
>>>> run_write_append_truncate_test()
>>>> {
>>>> LogMsg "write-append-truncate-test"
>>>>
>>
>