[Ocfs2-test-devel] [PATCH 3/3] Ocfs2-test: Add rest of multi-nodes testcases to multiple_run.sh.

Sunil Mushran sunil.mushran at oracle.com
Fri Jan 9 18:26:55 PST 2009


Marcos has a good point. Having lots of processes waiting on I/O is
not an efficient way to test the fs. The trick is to find a balance...
enough contention to expose races, but not so much that it just leaves
processes waiting for their I/Os to complete.

So when I run the kernel build by hand, I use -j to parallelize the
build, set to double the number of cpus.
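For illustration, that -j choice can be computed rather than hard-coded; a minimal sketch (the getconf key is the portable way to count online cpus, and the make invocation is left commented out since it depends on the tree):

```shell
# Pick a parallelism level of twice the online cpu count, as suggested above.
JOBS=$(( 2 * $(getconf _NPROCESSORS_ONLN) ))
echo "would build with: make -j${JOBS}"
# make -j"${JOBS}" vmlinux    # actual build step, run inside the kernel tree
```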

tristan.ye wrote:
> Marcos E. Matsunaga wrote:
>> Comments inline.
>> Regards,
>>
>> Marcos Eduardo Matsunaga
>>
>> Oracle USA
>> Linux Engineering
>>
>> “The statements and opinions expressed here are my own and do not
>> necessarily represent those of Oracle Corporation.”
>>  
>>
>> Tristan Ye wrote:
>>> After Marcos moved the rest of the multi-node testcases from
>>> LAM/MPI to OpenMPI, we decided to add these tests to the multi-node
>>> test launcher (multiple_run.sh). They are:
>>>
>>> 1. write_torture
>>>
>>> 2. build_kernel
>>>
>>> 3. cross_delete
>>>
>>> 4. recovery_load
>>>
>>> 5. fork_writer
>>>
>>> Signed-off-by: Tristan Ye <tristan.ye at oracle.com>
>>> ---
>>>  programs/python_common/multiple_run.sh |  275 +++++++++++++++++++++++++++++++-
>>>  1 files changed, 272 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/programs/python_common/multiple_run.sh b/programs/python_common/multiple_run.sh
>>> index 7b1ac1e..3ed6f82 100755
>>> --- a/programs/python_common/multiple_run.sh
>>> +++ b/programs/python_common/multiple_run.sh
>>> @@ -38,6 +38,7 @@ REMOTE_UMOUNT_BIN="${BINDIR}/remote_umount.py"
>>>  
>>>  NODE_LIST=
>>>  DEVICE_NAME=
>>> +TAR_FILE=
>>>  MOUNT_POINT=
>>>  
>>>  ################################################################################ 
>>>
>>> @@ -45,13 +46,14 @@ MOUNT_POINT=
>>>  ################################################################################ 
>>>
>>>  f_usage()
>>>  {
>>> -    echo "usage: `basename ${0}` <-n nodes> <-d device> <mountpoint path>"
>>> +    echo "usage: `basename ${0}` <-n nodes> <-d device> <-t tarfile> <mountpoint path>"
>>>      echo "       -n nodelist,should be comma separated."
>>>      echo "       -d device name used for ocfs2 volume."
>>> +    echo "       -t full path of kernel tarfile for test"
>>>      echo "       <mountpoint path> path of mountpoint where test will be performed."
>>>      echo
>>>      echo "Examples:"
>>> -    echo "     `basename ${0}` -n node1.us.oracle.com,node2.us.oracle.com -d /dev/sdd1 /storage"
>>> +    echo "     `basename ${0}` -n node1.us.oracle.com,node2.us.oracle.com -d /dev/sdd1 -t /linux-2.6/linux-2.6.28.tgz /storage"
>>>      exit 1;
>>>  
>>>  }
>>> @@ -63,10 +65,11 @@ f_getoptions()
>>>                  exit 1
>>>           fi
>>>  
>>> -         while getopts "n:d:h:" options; do
>>> +         while getopts "n:d:h:t:" options; do
>>>                  case $options in
>>>                  n ) NODE_LIST="$OPTARG";;
>>>                  d ) DEVICE_NAME="$OPTARG";;
>>> +                t ) TAR_FILE="$OPTARG";;
>>>                  h ) f_usage
>>>                      exit 1;;
>>>                  * ) f_usage
>>> @@ -162,6 +165,272 @@ run_inline_test()
>>>     
>>>  }
>>>  
>>> +run_write_torture()
>>> +{   
>>> +    LogMsg "write-torture-test"
>>> +   
>>> +    local logdir=${O2TDIR}/log/write_torture_log
>>> +    local logfile=${logdir}/write_torture_${DATE}.log
>>> +   
>>> +    local workdir=${MOUNT_POINT}/write_torture_test
>>> +    local testfile=write_torture_test_file
>>> +   
>>> +    mkdir -p ${logdir}
>>> +    chmod 777 ${logdir}
>>> +    touch ${logfile}
>>> +    chmod 777 ${logfile}
>>> +   
>>> +    #force to umount volume from all nodes
>>> +    ${ECHO} "Try to umount volume from all nodes before test."|tee -a ${logfile}
>>> +    ${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>> +   
>>> +    CLUSTERSIZE=32k
>>> +    BLOCKSIZE=4k
>>> +    SLOTS=4
>>> +    LABEL=ocfs2-write-torture-test
>>> +   
>>> +    ${ECHO} "Format volume to launch new test"|tee -a ${logfile}
>>> +    echo y|${MKFS_BIN} -C ${CLUSTERSIZE} -b ${BLOCKSIZE} -N ${SLOTS} -L ${LABEL} ${DEVICE_NAME} || {
>>> +        ${ECHO} "Can not format ${DEVICE_NAME}"
>>> +        return 1
>>> +    }
>>> +   
>>> +    ${ECHO} "Mount volume to all nodes"|tee -a ${logfile}
>>> +    ${REMOTE_MOUNT_BIN} -l ${LABEL} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>> +   
>>> +    ${SUDO} chown -R ${USERNAME}:${GROUPNAME} ${MOUNT_POINT}
>>> +    ${SUDO} chmod -R 777  ${MOUNT_POINT}
>>> +
>>> +    mkdir -p ${workdir}
>>> +    chmod 777 ${workdir}
>>> +   
>>> +    ${BINDIR}/run_write_torture.py -b 512,8096 -c 10 -d ${workdir} -f ${testfile} -l ${logfile} -n ${NODE_LIST} -p ${SLOTS} -s 60
>>>   
>> I usually run this test in a loop of 10 (-c 10) for 1800 seconds (-s 
>> 1600). Using -s 60 is really short.
> I'll trust your experience on this test.
>
>>> +   
>>> +    LogRC $?
>>> +
>>> +    rm -rf ${workdir}
>>> +   
>>> +    ${ECHO} "Umount volume from all nodes after test."|tee -a ${logfile}
>>> +    ${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>> +}
>>> +
>>> +run_build_kernel()
>>> +{
>>> +    LogMsg "build-kernel-test"
>>> +   
>>> +    local logdir=${O2TDIR}/log/build_kernel_log
>>> +    local logfile=${logdir}/build_kernel__${DATE}.log
>>> +   
>>> +    declare -i dir_nums=4
>>> +    declare -a workdir
>>> +    local workdir_list=""
>>> +
>>> +    workdir[0]=${dir_nums}
>>> +    for i in $(seq ${dir_nums});do
>>> +        workdir[${i}]=${MOUNT_POINT}/build_kernel_dir${i}
>>> +        if [ "${i}" == "1" ];then
>>> +            workdir_list="${workdir[${i}]}"
>>> +        else
>>> +            workdir_list="${workdir_list},${workdir[${i}]}"
>>> +        fi
>>> +    done
>>>   
>> Not sure this is necessary. You're just creating a lot of contention
>> on the same disk, since it creates a process per directory on each
>> node, extracting a tar file and building the kernel. It is probably
>> going to end up with more I/O waits than runtime. Maybe we can use
>> this script as a sanity check and leave the really heavy loads to be
>> run manually. I also like the fact that, running manually, I don't
>> re-format the partition on every run. I have found problems that only
>> came up because I was using old partitions with lots of tests on them.
> I think this test is worth keeping in our automatic multi-node
> launcher, as we can comfortably tune the workload (by specifying a
> different dir_nums), light or heavy, to use the test as a sanity
> check or a stress puncher.
>
> For the re-format issue, you may point me in the right direction. I
> intended to keep the tests separate enough by formatting the volume in
> each test, and to make sure each test is prepared properly, since
> different testcases have different block size and cluster size
> requirements.
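To make that tuning concrete, the directory list could be driven by a variable instead of the hard-coded dir_nums=4; a hypothetical sketch (BUILD_DIRS is a made-up knob and /storage a stand-in mountpoint, neither is in the patch):

```shell
# Build a comma-separated workdir list whose length controls the load:
# 1 directory for a sanity run, more for a stress run.
dir_nums=${BUILD_DIRS:-1}              # hypothetical knob; the patch hard-codes 4
MOUNT_POINT=${MOUNT_POINT:-/storage}   # stand-in default for illustration
workdir_list=""
for i in $(seq "${dir_nums}"); do
    # ${var:+...} appends a comma only when the list is already non-empty
    workdir_list="${workdir_list:+${workdir_list},}${MOUNT_POINT}/build_kernel_dir${i}"
done
echo "${workdir_list}"
```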
>
>>> +   
>>> +    mkdir -p ${logdir}
>>> +    chmod 777 ${logdir}
>>> +    touch ${logfile}
>>> +    chmod 777 ${logfile}
>>> +   
>>> +    #force to umount volume from all nodes
>>> +    ${ECHO} "Try to umount volume from all nodes before test."|tee -a ${logfile}
>>> +    ${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>> +   
>>> +    CLUSTERSIZE=32k
>>> +    BLOCKSIZE=4k
>>> +    SLOTS=4
>>> +    LABEL=ocfs2-build-kernel-test
>>> +   
>>> +    ${ECHO} "Format volume to launch new test"|tee -a ${logfile}
>>> +    echo y|${MKFS_BIN} -C ${CLUSTERSIZE} -b ${BLOCKSIZE} -N ${SLOTS} -L ${LABEL} ${DEVICE_NAME} || {
>>> +        ${ECHO} "Can not format ${DEVICE_NAME}"
>>> +        return 1
>>> +    }
>>> +   
>>> +    ${ECHO} "Mount volume to all nodes"|tee -a ${logfile}
>>> +    ${REMOTE_MOUNT_BIN} -l ${LABEL} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>> +   
>>> +    ${SUDO} chown -R ${USERNAME}:${GROUPNAME} ${MOUNT_POINT}
>>> +    ${SUDO} chmod -R 777  ${MOUNT_POINT}
>>> +
>>> +    for i in $(seq ${dir_nums});do
>>> +        mkdir -p ${workdir[${i}]}
>>> +        chmod -R 777 ${workdir[${i}]}
>>> +    done
>>> +   
>>> +    ${BINDIR}/run_buildkernel.py -c 10 -u ${USERNAME} -d ${workdir_list} -l ${logfile} -n ${NODE_LIST} -t ${TAR_FILE}
>>> +   
>>> +    LogRC $?
>>> +
>>> +    for i in $(seq ${dir_nums});do
>>> +        rm -rf ${workdir[${i}]}
>>> +    done
>>> +   
>>> +    ${ECHO} "Umount volume from all nodes after test."|tee -a ${logfile}
>>> +    ${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>> +}
>>> +
>>> +run_cross_delete()
>>> +{
>>> +    LogMsg "cross-delete-test"
>>> +   
>>> +    local logdir=${O2TDIR}/log/cross_delete_log
>>> +    local logfile=${logdir}/cross_delete_${DATE}.log
>>> +
>>> +    declare -i dir_nums=4
>>> +    declare -a workdir
>>> +    local workdir_list=""
>>> +
>>> +    workdir[0]=${dir_nums}
>>> +    for i in $(seq ${dir_nums});do
>>> +        workdir[${i}]=${MOUNT_POINT}/cross_delete_dir${i}
>>> +        if [ "${i}" == "1" ];then
>>> +            workdir_list="${workdir[${i}]}"
>>> +        else
>>> +            workdir_list="${workdir_list},${workdir[${i}]}"
>>> +        fi
>>> +    done
>>>   
>> I think the same is valid here as it was for run_buildkernel.
>>> +   
>>> +    mkdir -p ${logdir}
>>> +    chmod 777 ${logdir}
>>> +    touch ${logfile}
>>> +    chmod 777 ${logfile}
>>> +   
>>> +    #force to umount volume from all nodes
>>> +    ${ECHO} "Try to umount volume from all nodes before test."|tee -a ${logfile}
>>> +    ${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>> +   
>>> +    CLUSTERSIZE=32k
>>> +    BLOCKSIZE=4k
>>> +    SLOTS=4
>>> +    LABEL=ocfs2-cross-delete-test
>>> +   
>>> +    ${ECHO} "Format volume to launch new test"|tee -a ${logfile}
>>> +    echo y|${MKFS_BIN} -C ${CLUSTERSIZE} -b ${BLOCKSIZE} -N ${SLOTS} -L ${LABEL} ${DEVICE_NAME} || {
>>> +        ${ECHO} "Can not format ${DEVICE_NAME}"
>>> +        return 1
>>> +    }
>>> +   
>>> +    ${ECHO} "Mount volume to all nodes"|tee -a ${logfile}
>>> +    ${REMOTE_MOUNT_BIN} -l ${LABEL} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>> +   
>>> +    ${SUDO} chown -R ${USERNAME}:${GROUPNAME} ${MOUNT_POINT}
>>> +    ${SUDO} chmod -R 777  ${MOUNT_POINT}
>>> +   
>>> +    for i in $(seq ${dir_nums});do
>>> +        mkdir -p ${workdir[${i}]}
>>> +        chmod -R 777 ${workdir[${i}]}
>>> +    done
>>> +   
>>> +    ${BINDIR}/cross_delete.py -c 10 -d ${workdir_list} -l ${logfile} -n ${NODE_LIST} -t ${TAR_FILE}
>>> +   
>>> +    LogRC $?
>>> +
>>> +    for i in $(seq ${dir_nums});do
>>> +        rm -rf ${workdir[${i}]}
>>> +    done
>>> +
>>> +    ${ECHO} "Umount volume from all nodes after test."|tee -a ${logfile}
>>> +    ${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>> +}
>>> +
>>> +run_recovery_load()
>>>   
>> This job is interactive, as it will stop and prompt you to hit enter.
>> That's the sync point where it is ready to have nodes crashed. It has
>> to be run manually.
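The sync point Marcos describes is just a blocking read on the terminal; a minimal sketch of the pattern (not the actual recovery_load.py code):

```shell
# Pause the load at a known moment so the operator can crash a node,
# then continue so the surviving nodes exercise the recovery path.
echo "Load is running. Press Enter when ready to crash a node..."
read -r _ || true     # blocks until the operator hits Enter (EOF also proceeds)
echo "Resuming; recovery will now be exercised."
```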
>
> I'll trust your experience on this :-), and will not include this
> testcase in the automatic multi-node launcher.
>
>>> +{
>>> +    LogMsg "recovery-load-test"
>>> +   
>>> +    local logdir=${O2TDIR}/log/recovery_load_log
>>> +    local logfile=${logdir}/recover_load_${DATE}.log
>>> +   
>>> +    local workdir=${MOUNT_POINT}/recovery_load_test_dir
>>> +   
>>> +    mkdir -p ${logdir}
>>> +    chmod 777 ${logdir}
>>> +    touch ${logfile}
>>> +    chmod 777 ${logfile}
>>> +   
>>> +    #force to umount volume from all nodes
>>> +    ${ECHO} "Try to umount volume from all nodes before test."|tee -a ${logfile}
>>> +    ${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>> +   
>>> +    CLUSTERSIZE=32k
>>> +    BLOCKSIZE=4k
>>> +    SLOTS=4
>>> +    LABEL=ocfs2-recovery-load-test
>>> +   
>>> +    ${ECHO} "Format volume to launch new test"|tee -a ${logfile}
>>> +    echo y|${MKFS_BIN} -C ${CLUSTERSIZE} -b ${BLOCKSIZE} -N ${SLOTS} -L ${LABEL} ${DEVICE_NAME} || {
>>> +        ${ECHO} "Can not format ${DEVICE_NAME}"
>>> +        return 1
>>> +    }
>>> +   
>>> +    ${ECHO} "Mount volume to all nodes"|tee -a ${logfile}
>>> +    ${REMOTE_MOUNT_BIN} -l ${LABEL} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>> +   
>>> +    ${SUDO} chown -R ${USERNAME}:${GROUPNAME} ${MOUNT_POINT}
>>> +    ${SUDO} chmod -R 777  ${MOUNT_POINT}
>>> +
>>> +    mkdir -p ${workdir}
>>> +    chmod 777 ${workdir}
>>> +   
>>> +    ${BINDIR}/recovery_load.py -d ${workdir} --extract --find -l ${logfile} -n ${NODE_LIST} -t ${TAR_FILE}
>>> +   
>>> +    LogRC $?
>>> +   
>>> +    ${ECHO} "Umount volume from all nodes after test."|tee -a ${logfile}
>>> +    ${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>> +}
>>> +
>>> +run_fork_writer()
>>> +{
>>> +    LogMsg "fork-writer-test"
>>> +   
>>> +    local logdir=${O2TDIR}/log/fork_writer_log
>>> +    local logfile=${logdir}/fork_writer_${DATE}.log
>>> +   
>>> +    local testfile=${MOUNT_POINT}/fork_writer_test_file
>>> +   
>>> +    mkdir -p ${logdir}
>>> +    chmod 777 ${logdir}
>>> +    touch ${logfile}
>>> +    chmod 777 ${logfile}
>>> +   
>>> +    #force to umount volume from all nodes
>>> +    ${ECHO} "Try to umount volume from all nodes before test."|tee -a ${logfile}
>>> +    ${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>> +   
>>> +    CLUSTERSIZE=32k
>>> +    BLOCKSIZE=4k
>>> +    SLOTS=4
>>> +    LABEL=ocfs2-fork-writer-test
>>> +   
>>> +    ${ECHO} "Format volume to launch new test"|tee -a ${logfile}
>>> +    echo y|${MKFS_BIN} -C ${CLUSTERSIZE} -b ${BLOCKSIZE} -N ${SLOTS} -L ${LABEL} ${DEVICE_NAME} || {
>>> +        ${ECHO} "Can not format ${DEVICE_NAME}"
>>> +        return 1
>>> +    }
>>> +   
>>> +    ${ECHO} "Mount volume to all nodes"|tee -a ${logfile}
>>> +    ${REMOTE_MOUNT_BIN} -l ${LABEL} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>> +   
>>> +    ${SUDO} chown -R ${USERNAME}:${GROUPNAME} ${MOUNT_POINT}
>>> +    ${SUDO} chmod -R 777  ${MOUNT_POINT}
>>> +   
>>> +    ${BINDIR}/run_forkwriter.py -c 10 -f ${testfile} -l ${logfile} -n ${NODE_LIST} -p ${SLOTS} -s 50000
>>> +   
>>> +    LogRC $?
>>> +   
>>> +    ${ECHO} "Umount volume from all nodes after test."|tee -a ${logfile}
>>> +    ${REMOTE_UMOUNT_BIN} -m ${MOUNT_POINT} -n ${NODE_LIST}>>${logfile} 2>&1
>>> +}
>>> +
>>>  run_write_append_truncate_test()
>>>  {
>>>      LogMsg "write-append-truncate-test"
>>>   
>



