[Ocfs2-test-devel] [PATCH 3/3] Ocfs2-test: Add rest of multi-nodes testcases to multiple_run.sh.

tristan.ye tristan.ye at oracle.com
Fri Jan 9 18:51:45 PST 2009


Marcos, Sunil,

Thanks for your explanations — that's really a good point about striking a 
balance between I/O workload and parallelism intensity. During 
multi-node execution of the buildkernel test, we can limit the 
workdir_nums of each node to 2 (2 kernels being built on each node), and 
increase the parallelism of the build jobs (larger than 2) to expose the 
races.

Regards,
Tristan

Sunil Mushran wrote:
> Marcos has a good point. Having lots of processes waiting on io is
> not an efficient way to test the fs. The trick is to find a balance...
> in which we get enough contention to expose races but not more that
> just leads to processes waiting on the ios to complete.
>
> So when I run build kernel by hand, I use -j to parallelize the build.
> Set to double the number of cpus.
>
> tristan.ye wrote:
>> Marcos E. Matsunaga wrote:
>>> Comments inline.
>>> Regards,
>>>
>>> Marcos Eduardo Matsunaga
>>>
>>> Oracle USA
>>> Linux Engineering
>>>
>>> “The statements and opinions expressed here are my own and do not
>>> necessarily represent those of Oracle Corporation.”
>>>  
>>>
>>> Tristan Ye wrote:
>>>> After marcos has moved all the rest of mult-nodes testcases from 
>>>> lam-mpi to openmpi,
>>>> we decide to add these tests into the multi-nodes testing 
>>>> launcher(multiple_run.sh),
>>>> they are:
>>>>
>>>> 1. write_torture
>>>>
>>>> 2. build_kernel
>>>>
>>>> 3. cross_delete
>>>>
>>>> 4. recovery_load
>>>>
>>>> 5. fork_writer
>>>>
>>>> Signed-off-by: Tristan Ye <tristan.ye at oracle.com>
>>>> ---
>>>>  programs/python_common/multiple_run.sh |  275 
>>>> +++++++++++++++++++++++++++++++-
>>>>  1 files changed, 272 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/programs/python_common/multiple_run.sh 
>>>> b/programs/python_common/multiple_run.sh
>>>> index 7b1ac1e..3ed6f82 100755
>>>> --- a/programs/python_common/multiple_run.sh
>>>> +++ b/programs/python_common/multiple_run.sh
>>>> @@ -38,6 +38,7 @@ REMOTE_UMOUNT_BIN="${BINDIR}/remote_umount.py"
>>>>  
>>>>  NODE_LIST=
>>>>  DEVICE_NAME=
>>>> +TAR_FILE=
>>>>  MOUNT_POINT=
>>>>  
>>>>  ################################################################################ 
>>>>
>>>> @@ -45,13 +46,14 @@ MOUNT_POINT=
>>>>  ################################################################################ 
>>>>
>>>>  f_usage()
>>>>  {
>>>> -    echo "usage: `basename ${0}` <-n nodes> <-d device> 
>>>> <mountpoint path>"
>>>> +    echo "usage: `basename ${0}` <-n nodes> <-d device> <-t 
>>>> tarfile> <mountpoint path>"
>>>>      echo "       -n nodelist,should be comma separated."
>>>>      echo "       -d device name used for ocfs2 volume."
>>>> +    echo "       -t full path of kernel tarfile for test"
>>>>      echo "       <mountpoint path> path of mountpoint where test 
>>>> will be performed."
>>>>      echo      echo "Eaxamples:"
>>>> -    echo "     `basename ${0}` -n 
>>>> node1.us.oracle.com,node2.us.oracle.com -d /dev/sdd1 /storage"
>>>> +    echo "     `basename ${0}` -n 
>>>> node1.us.oracle.com,node2.us.oracle.com -d /dev/sdd1 -t 
>>>> /linux-2.6/linux-2.6.28.tgz /storage"
>>>>      exit 1;
>>>>  
>>>>  }
>>>> @@ -63,10 +65,11 @@ f_getoptions()
>>>>                  exit 1
>>>>           fi
>>>>  
>>>> -         while getopts "n:d:h:" options; do
>>>> +         while getopts "n:d:h:t:" options; do
>>>>                  case $options in
>>>>                  n ) NODE_LIST="$OPTARG";;
>>>>                  d ) DEVICE_NAME="$OPTARG";;
>>>> +                t ) TAR_FILE="$OPTARG";;
>>>>                  h ) f_usage
>>>>                      exit 1;;
>>>>                  * ) f_usage
>>>> @@ -162,6 +165,272 @@ run_inline_test()
>>>>      }
>>>>  
>>>> +run_write_torture()
>>>> +{   +    LogMsg "write-torture-test"
>>>> +   +    local logdir=${O2TDIR}/log/write_torture_log
>>>> +    local logfile=${logdir}/write_torture_${DATE}.log
>>>> +   +    local workdir=${MOUNT_POINT}/write_torture_test
>>>> +    local testfile=write_torture_test_file
>>>> +   +    mkdir -p ${logdir}
>>>> +    chmod 777 ${logdir}
>>>> +    touch ${logfile}
>>>> +    chmod 777 ${logfile}
>>>> +   +    #force to umount volume from all nodes
>>>> +    ${ECHO} "Try to umount volume from all nodes before test."|tee 
>>>> -a ${logfile}
>>>> +    ${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n 
>>>> ${NODE_LIST}>>${logfile} 2>&1
>>>> +   +    CLUSTERSIZE=32k
>>>> +    BLOCKSIZE=4k
>>>> +    SLOTS=4
>>>> +    LABEL=ocfs2-write-torture-test
>>>> +   +    ${ECHO} "Format volume to launch new test"|tee -a ${logfile}
>>>> +    echo y|${MKFS_BIN} -C ${CLUSTERSIZE} -b ${BLOCKSIZE} -N 
>>>> ${SLOTS} -L ${LABEL} ${DEVICE_NAME} || {
>>>> +        ${ECHO} "Can not format ${DEVICE_NAME}"
>>>> +        return 1
>>>> +    }
>>>> +   +    ${ECHO} "Mount volume to all nodes"|tee -a ${logfile}
>>>> +    ${REMOTE_MOUNT_BIN} -l ${LABEL} -m ${MOUNT_POINT} -n 
>>>> ${NODE_LIST}>>${logfile} 2>&1
>>>> +   +    ${SUDO} chown -R ${USERNAME}:${GROUPNAME} ${MOUNT_POINT}
>>>> +    ${SUDO} chmod -R 777  ${MOUNT_POINT}
>>>> +
>>>> +    mkdir -p ${workdir}
>>>> +    chmod 777 ${workdir}
>>>> +   +    ${BINDIR}/run_write_torture.py -b 512,8096 -c 10 -d 
>>>> ${workdir} -f ${testfile} -l ${logfile} -n ${NODE_LIST} -p ${SLOTS} 
>>>> -s 60
>>>>   
>>> I usually run this test a loop of 10 (-c 10) for 1800 seconds (-s 
>>> 1600). Using -s 60 is really short.
>> I'm trusting you on the experience in this test.
>>
>>>> +   +    LogRC $?
>>>> +
>>>> +    rm -rf ${workdir}
>>>> +   +    ${ECHO} "Umount volume from all nodes after test."|tee -a 
>>>> ${logfile}
>>>> +    ${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n 
>>>> ${NODE_LIST}>>${logfile} 2>&1
>>>> +}
>>>> +
>>>> +run_build_kernel()
>>>> +{
>>>> +    LogMsg "build-kernel-test"
>>>> +   +    local logdir=${O2TDIR}/log/build_kernel_log
>>>> +    local logfile=${logdir}/build_kernel__${DATE}.log
>>>> +   +    declare -i dir_nums=4
>>>> +    declare -a workdir
>>>> +    local workdir_list=""
>>>> +
>>>> +    workdir[0]=${dir_nums}
>>>> +    for i in $(seq ${dir_nums});do
>>>> +        workdir[${i}]=${MOUNT_POINT}/build_kernel_dir${i}
>>>> +        if [ "${i}" == "1" ];then
>>>> +            workdir_list="${workdir[${i}]}"
>>>> +        else
>>>> +            workdir_list="${workdir_list},${workdir[${i}]}"
>>>> +        fi
>>>> +    done
>>>>   
>>> Not sure this is necessary. You're just creating a lot of contention 
>>> on the same disk as it is creating a process per directory on each 
>>> node, extracting a tar file and building the kernel. It is probably 
>>> going to end up with more I/O waits than runtime. Maybe we can use 
>>> this script as a sanity and let the really heavy loads to be run 
>>> manually. I also like the fact that running manually, I don't 
>>> re-format the partition on every run. I have found problems that 
>>> only came up because I was using old partitions with lots of tests 
>>> on it.
>> I think this test is worth being kept in our automatic multi-node 
>> launcher, as we can comfortably tune the workload (by specifying 
>> different workdir_nums) light or heavy to perform the test as a sanity 
>> checker or stress puncher anyway.
>>
>> For the re-format issue, you may point me in the right direction. I 
>> intended to keep tests separate enough by formatting the volume in 
>> each test, and to make sure the tests are prepared well in case a 
>> different testcase has different requirements for bs and cs size.
>>
>>>> +   +    mkdir -p ${logdir}
>>>> +    chmod 777 ${logdir}
>>>> +    touch ${logfile}
>>>> +    chmod 777 ${logfile}
>>>> +   +    #force to umount volume from all nodes
>>>> +    ${ECHO} "Try to umount volume from all nodes before test."|tee 
>>>> -a ${logfile}
>>>> +    ${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n 
>>>> ${NODE_LIST}>>${logfile} 2>&1
>>>> +   +    CLUSTERSIZE=32k
>>>> +    BLOCKSIZE=4k
>>>> +    SLOTS=4
>>>> +    LABEL=ocfs2-build-kernel-test
>>>> +   +    ${ECHO} "Format volume to launch new test"|tee -a ${logfile}
>>>> +    echo y|${MKFS_BIN} -C ${CLUSTERSIZE} -b ${BLOCKSIZE} -N 
>>>> ${SLOTS} -L ${LABEL} ${DEVICE_NAME} || {
>>>> +        ${ECHO} "Can not format ${DEVICE_NAME}"
>>>> +        return 1
>>>> +    }
>>>> +   +    ${ECHO} "Mount volume to all nodes"|tee -a ${logfile}
>>>> +    ${REMOTE_MOUNT_BIN} -l ${LABEL} -m ${MOUNT_POINT} -n 
>>>> ${NODE_LIST}>>${logfile} 2>&1
>>>> +   +    ${SUDO} chown -R ${USERNAME}:${GROUPNAME} ${MOUNT_POINT}
>>>> +    ${SUDO} chmod -R 777  ${MOUNT_POINT}
>>>> +
>>>> +    for i in $(seq ${dir_nums});do
>>>> +        mkdir -p ${workdir[${i}]}
>>>> +        chmod -R 777 ${workdir[${i}]}
>>>> +    done
>>>> +   +    ${BINDIR}/run_buildkernel.py -c 10 -u ${USERNAME} -d 
>>>> ${workdir_list} -l ${logfile} -n ${NODE_LIST} -t ${TAR_FILE}
>>>> +   +    LogRC $?
>>>> +
>>>> +    for i in $(seq ${dir_nums});do
>>>> +        rm -rf ${workdir[${i}]}
>>>> +    done
>>>> +   +    ${ECHO} "Umount volume from all nodes after test."|tee -a 
>>>> ${logfile}
>>>> +    ${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n 
>>>> ${NODE_LIST}>>${logfile} 2>&1
>>>> +}
>>>> +
>>>> +run_cross_delete()
>>>> +{
>>>> +    LogMsg "cross-delete-test"
>>>> +   +    local logdir=${O2TDIR}/log/cross_delete_log
>>>> +    local logfile=${logdir}/cross_delete_${DATE}.log
>>>> +
>>>> +    declare -i dir_nums=4
>>>> +    declare -a workdir
>>>> +    local workdir_list=""
>>>> +
>>>> +    workdir[0]=${dir_nums}
>>>> +    for i in $(seq ${dir_nums});do
>>>> +        workdir[${i}]=${MOUNT_POINT}/cross_delete_dir${i}
>>>> +        if [ "${i}" == "1" ];then
>>>> +            workdir_list="${workdir[${i}]}"
>>>> +        else
>>>> +            workdir_list="${workdir_list},${workdir[${i}]}"
>>>> +        fi
>>>> +    done
>>>>   
>>> I think the same is valid here as it was for run_buildkernel.
>>>> +   +    mkdir -p ${logdir}
>>>> +    chmod 777 ${logdir}
>>>> +    touch ${logfile}
>>>> +    chmod 777 ${logfile}
>>>> +   +    #force to umount volume from all nodes
>>>> +    ${ECHO} "Try to umount volume from all nodes before test."|tee 
>>>> -a ${logfile}
>>>> +    ${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n 
>>>> ${NODE_LIST}>>${logfile} 2>&1
>>>> +   +    CLUSTERSIZE=32k
>>>> +    BLOCKSIZE=4k
>>>> +    SLOTS=4
>>>> +    LABEL=ocfs2-cross-delete-test
>>>> +   +    ${ECHO} "Format volume to launch new test"|tee -a ${logfile}
>>>> +    echo y|${MKFS_BIN} -C ${CLUSTERSIZE} -b ${BLOCKSIZE} -N 
>>>> ${SLOTS} -L ${LABEL} ${DEVICE_NAME} || {
>>>> +        ${ECHO} "Can not format ${DEVICE_NAME}"
>>>> +        return 1
>>>> +    }
>>>> +   +    ${ECHO} "Mount volume to all nodes"|tee -a ${logfile}
>>>> +    ${REMOTE_MOUNT_BIN} -l ${LABEL} -m ${MOUNT_POINT} -n 
>>>> ${NODE_LIST}>>${logfile} 2>&1
>>>> +   +    ${SUDO} chown -R ${USERNAME}:${GROUPNAME} ${MOUNT_POINT}
>>>> +    ${SUDO} chmod -R 777  ${MOUNT_POINT}
>>>> +   +    for i in $(seq ${dir_nums});do
>>>> +        mkdir -p ${workdir[${i}]}
>>>> +        chmod -R 777 ${workdir[${i}]}
>>>> +    done
>>>> +   +    ${BINDIR}/cross_delete.py -c 10  -d ${workdir_list} -l 
>>>> ${logfile} -n ${NODE_LIST} -t ${TAR_FILE}
>>>> +   +    LogRC $?
>>>> +
>>>> +    for i in $(seq ${dir_nums});do
>>>> +        rm -rf ${workdir[${i}]}
>>>> +    done
>>>> +
>>>> +    ${ECHO} "Umount volume from all nodes after test."|tee -a 
>>>> ${logfile}
>>>> +    ${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n 
>>>> ${NODE_LIST}>>${logfile} 2>&1
>>>> +}
>>>> +
>>>> +run_recovery_load()
>>>>   
>>> This job is interactive as it will stop and prompt to hit enter. 
>>> That the sync point where it will be ready to have nodes crashed. It 
>>> has to run manually.
>>
>> Trust your experience on this:-), will not include this testcase in 
>> automatic multi-nodes launcher later.
>>
>>>> +{
>>>> +    LogMsg "recovery-load-test"
>>>> +   +    local logdir=${O2TDIR}/log/recovery_load_log
>>>> +    local logfile=${logdir}/recover_load_${DATE}.log
>>>> +   +    local workdir=${MOUNT_POINT}/recovery_load_test_dir
>>>> +   +    mkdir -p ${logdir}
>>>> +    chmod 777 ${logdir}
>>>> +    touch ${logfile}
>>>> +    chmod 777 ${logfile}
>>>> +   +    #force to umount volume from all nodes
>>>> +    ${ECHO} "Try to umount volume from all nodes before test."|tee 
>>>> -a ${logfile}
>>>> +    ${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n 
>>>> ${NODE_LIST}>>${logfile} 2>&1
>>>> +   +    CLUSTERSIZE=32k
>>>> +    BLOCKSIZE=4k
>>>> +    SLOTS=4
>>>> +    LABEL=ocfs2-recovery-load-test
>>>> +   +    ${ECHO} "Format volume to launch new test"|tee -a ${logfile}
>>>> +    echo y|${MKFS_BIN} -C ${CLUSTERSIZE} -b ${BLOCKSIZE} -N 
>>>> ${SLOTS} -L ${LABEL} ${DEVICE_NAME} || {
>>>> +        ${ECHO} "Can not format ${DEVICE_NAME}"
>>>> +        return 1
>>>> +    }
>>>> +   +    ${ECHO} "Mount volume to all nodes"|tee -a ${logfile}
>>>> +    ${REMOTE_MOUNT_BIN} -l ${LABEL} -m ${MOUNT_POINT} -n 
>>>> ${NODE_LIST}>>${logfile} 2>&1
>>>> +   +    ${SUDO} chown -R ${USERNAME}:${GROUPNAME} ${MOUNT_POINT}
>>>> +    ${SUDO} chmod -R 777  ${MOUNT_POINT}
>>>> +
>>>> +    mkdir -p ${workdir}
>>>> +    chmod 777 ${workdir}
>>>> +   +    ${BINDIR}/recovery_load.py -d ${workdir} --extract --find 
>>>> -l ${logfile} -n ${NODE_LIST} -t ${TAR_FILE}
>>>> +   +    LogRC $?
>>>> +   +    ${ECHO} "Umount volume from all nodes after test."|tee -a 
>>>> ${logfile}
>>>> +    ${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n 
>>>> ${NODE_LIST}>>${logfile} 2>&1
>>>> +}
>>>> +
>>>> +run_fork_writer()
>>>> +{
>>>> +    LogMsg "fork-writer-test"
>>>> +   +    local logdir=${O2TDIR}/log/fork_writer_log
>>>> +    local logfile=${logdir}/fork_writer_${DATE}.log
>>>> +   +    local testfile=${MOUNT_POINT}/fork_writer_test_file
>>>> +   +    mkdir -p ${logdir}
>>>> +    chmod 777 ${logdir}
>>>> +    touch ${logfile}
>>>> +    chmod 777 ${logfile}
>>>> +   +    #force to umount volume from all nodes
>>>> +    ${ECHO} "Try to umount volume from all nodes before test."|tee 
>>>> -a ${logfile}
>>>> +    ${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n 
>>>> ${NODE_LIST}>>${logfile} 2>&1
>>>> +   +    CLUSTERSIZE=32k
>>>> +    BLOCKSIZE=4k
>>>> +    SLOTS=4
>>>> +    LABEL=ocfs2-fork-writer-test
>>>> +   +    ${ECHO} "Format volume to launch new test"|tee -a ${logfile}
>>>> +    echo y|${MKFS_BIN} -C ${CLUSTERSIZE} -b ${BLOCKSIZE} -N 
>>>> ${SLOTS} -L ${LABEL} ${DEVICE_NAME} || {
>>>> +        ${ECHO} "Can not format ${DEVICE_NAME}"
>>>> +        return 1
>>>> +    }
>>>> +   +    ${ECHO} "Mount volume to all nodes"|tee -a ${logfile}
>>>> +    ${REMOTE_MOUNT_BIN} -l ${LABEL} -m ${MOUNT_POINT} -n 
>>>> ${NODE_LIST}>>${logfile} 2>&1
>>>> +   +    ${SUDO} chown -R ${USERNAME}:${GROUPNAME} ${MOUNT_POINT}
>>>> +    ${SUDO} chmod -R 777  ${MOUNT_POINT}
>>>> +   +    ${BINDIR}/run_forkwriter.py -c 10 -f ${testfile} -l 
>>>> ${logfile} -n ${NODE_LIST} -p ${SLOTS} -s 50000
>>>> +   +    LogRC $?
>>>> +   +    ${ECHO} "Umount volume from all nodes after test."|tee -a 
>>>> ${logfile}
>>>> +    ${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n 
>>>> ${NODE_LIST}>>${logfile} 2>&1
>>>> +}
>>>> +
>>>>  run_write_append_truncate_test()
>>>>  {
>>>>      LogMsg "write-append-truncate-test"
>>>>   
>>
>




More information about the Ocfs2-test-devel mailing list