<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
  </head>
  <body text="#000000" bgcolor="#FFFFFF">
    <p>Hi Junxiao,</p>
    <p>Thank you for review.<br>
    </p>
    <div class="moz-cite-prefix">在 2019/9/12 上午6:30, Junxiao Bi 写道:<br>
    </div>
    <blockquote type="cite"
      cite="mid:9df44b3c-5991-36b6-f788-2311015c8968@oracle.com">Hi
      Shuning,
      <br>
      <br>
      See comments inlined.
      <br>
      <br>
      On 9/11/19 2:58 AM, Shuning Zhang wrote:
      <br>
      <blockquote type="cite">When the extent tree is modified, it
        shuold be protected by
        <br>
        inode cluster lock and ip_alloc_sem.
        <br>
        <br>
        The extent tree is accessed and modified in the
        ocfs2_prepare_inode_for_write,
        <br>
        but isn't protected by ip_alloc_sem.
        <br>
        <br>
        The following is a case. The function ocfs2_fiemap is accessing
        <br>
        the extent tree, which is modified at the same time.
        <br>
        <br>
        [47145.974472] kernel BUG at fs/ocfs2/extent_map.c:475!
        <br>
        [47145.974480] invalid opcode: 0000 [#1] SMP
        <br>
        [47145.974489] Modules linked in: tun ocfs2 ocfs2_nodemanager
        configfs
        <br>
        ocfs2_stackglue xen_netback xen_blkback xen_gntalloc xen_gntdev
        xen_evtchn
        <br>
        xenfs xen_privcmd vfat fat bnx2fc fcoe libfcoe libfc
        scsi_transport_fc sunrpc
        <br>
        bridge 8021q mrp garp stp llc ib_iser rdma_cm ib_cm iw_cm ib_sa
        ib_mad
        <br>
        ib_core ib_addr dm_round_robin dm_multipath sg pcspkr raid1
        shpchp
        <br>
        ipmi_devintf ipmi_msghandler ext4 jbd2 mbcache2 sd_mod nvme
        nvme_core bnxt_en
        <br>
        xhci_pci xhci_hcd crc32c_intel be2iscsi bnx2i cnic uio cxgb4i
        cxgb4 cxgb3i
        <br>
        libcxgbi ipv6 cxgb3 mdio qla4xxx wmi dm_mirror dm_region_hash
        dm_log dm_mod
        <br>
        iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi iscsi_ibft
        <br>
        iscsi_boot_sysfs
        <br>
        [47145.974636] CPU: 16 PID: 14047 Comm: o2info Not tainted
        <br>
        4.1.12-124.23.1.el6uek.x86_64 #2
        <br>
        [47145.974646] Hardware name: Oracle Corporation ORACLE SERVER
        X7-2L/ASM, MB
        <br>
        MECH, X7-2L, BIOS 42040600 10/19/2018
        <br>
        [47145.974658] task: ffff88019487e200 ti: ffff88003daa4000
        task.ti:
        <br>
        ffff88003daa4000
        <br>
        [47145.974667] RIP: e030:[&lt;ffffffffa05e4840&gt;] 
        [&lt;ffffffffa05e4840&gt;]
        <br>
        ocfs2_get_clusters_nocache.isra.11+0x390/0x550 [ocfs2]
        <br>
        [47145.974708] RSP: e02b:ffff88003daa7d88  EFLAGS: 00010287
        <br>
        [47145.974713] RAX: 00000000000000de RBX: ffff8801d1104030 RCX:
        <br>
        ffff8801d1104e10
        <br>
        [47145.974719] RDX: 00000000000000de RSI: 000000000009ec40 RDI:
        <br>
        ffff8801d1104e24
        <br>
        [47145.974725] RBP: ffff88003daa7df8 R08: ffff88003daa7e38 R09:
        <br>
        0000000000000000
        <br>
        [47145.974732] R10: 000000000009ec3f R11: 0000000000000246 R12:
        <br>
        000000000009ec3f
        <br>
        [47145.974739] R13: ffff88004c419000 R14: 0000000000000002 R15:
        <br>
        ffff88003daa7e28
        <br>
        [47145.974754] FS:  00007fdbccc92720(0000)
        GS:ffff880358800000(0000)
        <br>
        knlGS:ffff880358800000
        <br>
        [47145.974764] CS:  e033 DS: 0000 ES: 0000 CR0: 0000000080050033
        <br>
        [47145.974772] CR2: 00007fd5dfcd8350 CR3: 0000000208677000 CR4:
        <br>
        0000000000042660
        <br>
        [47145.974785] Stack:
        <br>
        [47145.974790]  ffff88003daa7df8 00002000a05e249b
        ffff8801d1104000
        <br>
        ffff88003daa7e2c
        <br>
        [47145.974802]  ffff88003daa7e38 ffff88000cc484c0
        ffff880145f5b478
        <br>
        0000000000000000
        <br>
        [47145.974811]  0000000000002000 ffff88000cc484c0
        ffff88003daa7ea0
        <br>
        0000000000000000
        <br>
        [47145.974820] Call Trace:
        <br>
        [47145.974837]  [&lt;ffffffffa05e53e3&gt;]
        ocfs2_fiemap+0x1e3/0x430 [ocfs2]
        <br>
        [47145.974848]  [&lt;ffffffff816f644f&gt;] ?
        xen_hypervisor_callback+0x7f/0x120
        <br>
        [47145.974855]  [&lt;ffffffff816f6448&gt;] ?
        xen_hypervisor_callback+0x78/0x120
        <br>
        [47145.974861]  [&lt;ffffffff816f64a3&gt;] ?
        xen_hypervisor_callback+0xd3/0x120
        <br>
        [47145.974872]  [&lt;ffffffff81220905&gt;]
        do_vfs_ioctl+0x155/0x510
        <br>
        [47145.974878]  [&lt;ffffffff81220d41&gt;] SyS_ioctl+0x81/0xa0
        <br>
        [47145.974885]  [&lt;ffffffff816f1b9f&gt;] ?
        system_call_after_swapgs+0xe9/0x190
        <br>
        [47145.974891]  [&lt;ffffffff816f1b98&gt;] ?
        system_call_after_swapgs+0xe2/0x190
        <br>
        [47145.974899]  [&lt;ffffffff816f1b91&gt;] ?
        system_call_after_swapgs+0xdb/0x190
        <br>
        [47145.974905]  [&lt;ffffffff816f1c5e&gt;]
        system_call_fastpath+0x18/0xd8
        <br>
        [47145.974910] Code: 18 48 c7 c6 60 7f 65 a0 31 c0 bb e2 ff ff
        ff 48 8b 4a 40
        <br>
        48 8b 7a 28 48 c7 c2 78 2d 66 a0 e8 38 4f 05 00 e9 28 fe ff ff
        0f 1f 00 &lt;0f&gt;
        <br>
        0b 66 0f 1f 44 00 00 bb 86 ff ff ff e9 13 fe ff ff 66 0f 1f
        <br>
        [47145.975000] RIP  [&lt;ffffffffa05e4840&gt;]
        <br>
        ocfs2_get_clusters_nocache.isra.11+0x390/0x550 [ocfs2]
        <br>
        [47145.975018]  RSP &lt;ffff88003daa7d88&gt;
        <br>
        [47145.989999] ---[ end trace c8aa0c8180e869dc ]---
        <br>
        [47146.087579] Kernel panic - not syncing: Fatal exception
        <br>
        [47146.087691] Kernel Offset: disabled
        <br>
        <br>
        Signed-off-by: Shuning Zhang <a class="moz-txt-link-rfc2396E" href="mailto:sunny.s.zhang@oracle.com">&lt;sunny.s.zhang@oracle.com&gt;</a>
        <br>
        ---
        <br>
          fs/ocfs2/file.c | 79
        +++++++++++++++++++++++++++++++++++++++------------------
        <br>
          1 file changed, 54 insertions(+), 25 deletions(-)
        <br>
        <br>
        diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
        <br>
        index 4435df3..85cddd2 100644
        <br>
        --- a/fs/ocfs2/file.c
        <br>
        +++ b/fs/ocfs2/file.c
        <br>
        @@ -2093,29 +2093,19 @@ static int ocfs2_is_io_unaligned(struct
        inode *inode, size_t count, loff_t pos)
        <br>
          }
        <br>
            static int ocfs2_prepare_inode_for_refcount(struct inode
        *inode,
        <br>
        +                        struct buffer_head *di_bh,
        <br>
                                  struct file *file,
        <br>
        -                        loff_t pos, size_t count,
        <br>
        -                        int *meta_level)
        <br>
        +                        loff_t pos, size_t count)
        <br>
          {
        <br>
              int ret;
        <br>
        -    struct buffer_head *di_bh = NULL;
        <br>
              u32 cpos = pos &gt;&gt;
        OCFS2_SB(inode-&gt;i_sb)-&gt;s_clustersize_bits;
        <br>
              u32 clusters =
        <br>
                  ocfs2_clusters_for_bytes(inode-&gt;i_sb, pos + count)
        - cpos;
        <br>
          -    ret = ocfs2_inode_lock(inode, &amp;di_bh, 1);
        <br>
        -    if (ret) {
        <br>
        -        mlog_errno(ret);
        <br>
        -        goto out;
        <br>
        -    }
        <br>
        -
        <br>
        -    *meta_level = 1;
        <br>
        -
        <br>
              ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters,
        UINT_MAX);
        <br>
              if (ret)
        <br>
                  mlog_errno(ret);
        <br>
        -out:
        <br>
        -    brelse(di_bh);
        <br>
        +
        <br>
              return ret;
        <br>
          }
        <br>
      </blockquote>
      Since you moved out cluster from
      ocfs2_prepare_inode_for_refcount() , only ocfs2_refcount_cow left,
      can we go even further to remove it?
      <br>
      <br>
    </blockquote>
    That is a good idea. I will have a try.<br>
    <blockquote type="cite"
      cite="mid:9df44b3c-5991-36b6-f788-2311015c8968@oracle.com">
      <blockquote type="cite">  @@ -2123,6 +2113,7 @@ static int
        ocfs2_prepare_inode_for_write(struct file *file,
        <br>
                               loff_t pos, size_t count, int wait)
        <br>
          {
        <br>
              int ret = 0, meta_level = 0, overwrite_io = 0;
        <br>
        +    int write_sem = 0;
        <br>
              struct dentry *dentry = file-&gt;f_path.dentry;
        <br>
              struct inode *inode = d_inode(dentry);
        <br>
              struct buffer_head *di_bh = NULL;
        <br>
        @@ -2145,25 +2136,30 @@ static int
        ocfs2_prepare_inode_for_write(struct file *file,
        <br>
                      goto out;
        <br>
                  }
        <br>
          +        if (wait)
        <br>
        +            down_read(&amp;OCFS2_I(inode)-&gt;ip_alloc_sem);
        <br>
        +        else {
        <br>
        +            ret =
        down_read_trylock(&amp;OCFS2_I(inode)-&gt;ip_alloc_sem);
        <br>
        +            if (!ret) {
        <br>
        +                ret = -EAGAIN;
        <br>
        +                goto out_unlock;
        <br>
        +            }
        <br>
        +        }
        <br>
      </blockquote>
      <br>
      can we do it like the following?
      <br>
      <br>
      if(wait) {
      <br>
      <br>
          lock(inode_lock);
      <br>
      <br>
          lock(ip_alloc_sem);
      <br>
      <br>
      } else {
      <br>
      <br>
          try_lock();
      <br>
      <br>
      }
      <br>
      <br>
    </blockquote>
    <p><span class="tlid-translation translation" lang="en"><span
          title="" class="">Is that to make the code concise?</span></span></p>
    <p>If using this mode, the code will be more <span
        class="tlid-translation translation" lang="en"><span title=""
          class="">redundant</span></span><span
        class="tlid-translation-gender-indicator
        translation-gender-indicator">. That is because we should <br>
      </span></p>
    <p><span class="tlid-translation-gender-indicator
        translation-gender-indicator">check the retrun value of each
        function.</span></p>
    <p><br>
      <span class="tlid-translation-gender-indicator
        translation-gender-indicator">if(wait) {
        <br>
            ret = lock(inode_lock);
        <br>
           
        if (!ret) {</span></p>
    <p><span class="tlid-translation-gender-indicator
        translation-gender-indicator">    }<br>
          
        lock(ip_alloc_sem);
        <br>
        } else {
        <br>
            ret = try_lock(</span><span
        class="tlid-translation-gender-indicator
        translation-gender-indicator"><span
          class="tlid-translation-gender-indicator
          translation-gender-indicator">inode_lock</span>); <br>
      </span></p>
    <p><span class="tlid-translation-gender-indicator
        translation-gender-indicator"><span
          class="tlid-translation-gender-indicator
          translation-gender-indicator">   
          if (!ret) {</span>
      </span></p>
    <p><span class="tlid-translation-gender-indicator
        translation-gender-indicator">    }</span></p>
    <p><br>
      <span class="tlid-translation-gender-indicator
        translation-gender-indicator"><span
          class="tlid-translation-gender-indicator
          translation-gender-indicator">    ret = try_lock(</span></span><span
        class="tlid-translation-gender-indicator
        translation-gender-indicator"><span
          class="tlid-translation-gender-indicator
          translation-gender-indicator"><span
            class="tlid-translation-gender-indicator
            translation-gender-indicator">ip_alloc_sem</span>); <br>
        </span>
      </span></p>
    <p><span class="tlid-translation-gender-indicator
        translation-gender-indicator"><span
          class="tlid-translation-gender-indicator
          translation-gender-indicator">   
          if (!ret) {</span> </span></p>
    <p><span class="tlid-translation-gender-indicator
        translation-gender-indicator">    }</span></p>
    <p><span class="tlid-translation-gender-indicator
        translation-gender-indicator">
        }
      </span></p>
    <blockquote type="cite"
      cite="mid:9df44b3c-5991-36b6-f788-2311015c8968@oracle.com">
      <blockquote type="cite">+
        <br>
                  /*
        <br>
                   * Check if IO will overwrite allocated blocks in case
        <br>
                   * IOCB_NOWAIT flag is set.
        <br>
                   */
        <br>
                  if (!wait &amp;&amp; !overwrite_io) {
        <br>
                      overwrite_io = 1;
        <br>
        -            if
        (!down_read_trylock(&amp;OCFS2_I(inode)-&gt;ip_alloc_sem)) {
        <br>
        -                ret = -EAGAIN;
        <br>
        -                goto out_unlock;
        <br>
        -            }
        <br>
                        ret = ocfs2_overwrite_io(inode, di_bh, pos,
        count);
        <br>
                      brelse(di_bh);
        <br>
                      di_bh = NULL;
        <br>
        -            up_read(&amp;OCFS2_I(inode)-&gt;ip_alloc_sem);
        <br>
                      if (ret &lt; 0) {
        <br>
                          if (ret != -EAGAIN)
        <br>
                              mlog_errno(ret);
        <br>
        -                goto out_unlock;
        <br>
        +                goto out_up_sem;
        <br>
                      }
        <br>
                  }
        <br>
          @@ -2178,6 +2174,7 @@ static int
        ocfs2_prepare_inode_for_write(struct file *file,
        <br>
                   * set inode-&gt;i_size at the end of a write. */
        <br>
                  if (should_remove_suid(dentry)) {
        <br>
                      if (meta_level == 0) {
        <br>
        +                up_read(&amp;OCFS2_I(inode)-&gt;ip_alloc_sem);
        <br>
                          ocfs2_inode_unlock(inode, meta_level);
        <br>
                          meta_level = 1;
        <br>
                          continue;
        <br>
        @@ -2186,7 +2183,7 @@ static int
        ocfs2_prepare_inode_for_write(struct file *file,
        <br>
                      ret = ocfs2_write_remove_suid(inode);
        <br>
                      if (ret &lt; 0) {
        <br>
                          mlog_errno(ret);
        <br>
        -                goto out_unlock;
        <br>
        +                goto out_up_sem;
        <br>
                      }
        <br>
                  }
        <br>
          @@ -2194,24 +2191,56 @@ static int
        ocfs2_prepare_inode_for_write(struct file *file,
        <br>
                    ret = ocfs2_check_range_for_refcount(inode, pos,
        count);
        <br>
                  if (ret == 1) {
        <br>
        +            up_read(&amp;OCFS2_I(inode)-&gt;ip_alloc_sem);
        <br>
                      ocfs2_inode_unlock(inode, meta_level);
        <br>
        -            meta_level = -1;
        <br>
        +            brelse(di_bh);
        <br>
        +            di_bh = NULL;
        <br>
        +
        <br>
        +            if (wait)
        <br>
        +                ret = ocfs2_inode_lock(inode, &amp;di_bh, 1);
        <br>
        +            else
        <br>
        +                ret = ocfs2_try_inode_lock(inode, &amp;di_bh,
        1);
        <br>
      </blockquote>
      You seemed miss call brelse(di_bh)? That will cause buffer_head
      leak. ocfs2_assign_bh() get it.
      <br>
    </blockquote>
    The function brelse is at end of ocfs2_prepare_inode_for_write.<br>
    <blockquote type="cite"
      cite="mid:9df44b3c-5991-36b6-f788-2311015c8968@oracle.com">
      <blockquote type="cite">+            if (ret) {
        <br>
        +                if (ret != -EAGAIN)
        <br>
        +                    mlog_errno(ret);
        <br>
        +                goto out;
        <br>
        +            }
        <br>
        +
        <br>
        +            meta_level = 1;
        <br>
        +
        <br>
        +            if (wait)
        <br>
        +               
        down_write(&amp;OCFS2_I(inode)-&gt;ip_alloc_sem);
        <br>
        +            else {
        <br>
        +                ret =
        down_write_trylock(&amp;OCFS2_I(inode)-&gt;ip_alloc_sem);
        <br>
        +                if (!ret) {
        <br>
        +                    ret = -EAGAIN;
        <br>
        +                    goto out_unlock;
        <br>
        +                }
        <br>
        +            }
        <br>
      </blockquote>
      The same style issue like above. Maybe we can abstract it in a
      function?
      <br>
    </blockquote>
    Yes, there are some <span class="tlid-translation translation"
      lang="en"><span title="" class="">redundancy</span></span><span
      class="tlid-translation-gender-indicator
      translation-gender-indicator">. I will have a try.<br>
    </span>
    <blockquote type="cite"
      cite="mid:9df44b3c-5991-36b6-f788-2311015c8968@oracle.com">
      <blockquote type="cite">+
        <br>
        +            write_sem = 1;
        <br>
                        ret = ocfs2_prepare_inode_for_refcount(inode,
        <br>
        +                                   di_bh,
        <br>
                                             file,
        <br>
                                             pos,
        <br>
        -                                   count,
        <br>
        -                                   &amp;meta_level);
        <br>
        +                                   count);
        <br>
                  }
        <br>
                    if (ret &lt; 0) {
        <br>
        -            mlog_errno(ret);
        <br>
        -            goto out_unlock;
        <br>
        +            if (ret != -EAGAIN)
        <br>
        +                mlog_errno(ret);
        <br>
        +            goto out_up_sem;
        <br>
                  }
        <br>
                    break;
        <br>
              }
        <br>
          +out_up_sem:
        <br>
        +    if (write_sem)
        <br>
        +        up_write(&amp;OCFS2_I(inode)-&gt;ip_alloc_sem);
        <br>
        +    else
        <br>
        +        up_read(&amp;OCFS2_I(inode)-&gt;ip_alloc_sem);
        <br>
      </blockquote>
      <br>
      Where were inode cluster lock unlocked?
      <br>
      <br>
    </blockquote>
    <p>This lock is unlocked at the end of function. <br>
    </p>
    <p>2250         if (meta_level &gt;= 0)<br>
      2251                 ocfs2_inode_unlock(inode, meta_level);<br>
      <br>
    </p>
    <blockquote type="cite"
      cite="mid:9df44b3c-5991-36b6-f788-2311015c8968@oracle.com">Thanks,
      <br>
      <br>
      Junxiao.
      <br>
      <br>
      <blockquote type="cite">  out_unlock:
        <br>
             
        trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)-&gt;ip_blkno,
        <br>
                                  pos, count, wait);
        <br>
      </blockquote>
    </blockquote>
  </body>
</html>