From: Greg KH on
2.6.32-stable review patch. If anyone has any objections, please let us know.

------------------

From: Tao Ma <tao.ma(a)oracle.com>

commit 38a04e432768ec0b016f3c687b4de31ac111ae59 upstream.

ocfs2 refcount tree is stored as an extent tree while
the leaf ocfs2_refcount_rec points to a refcount block.

The following step can trip a kernel panic.
mkfs.ocfs2 -b 512 -C 1M --fs-features=refcount $DEVICE
mount -t ocfs2 $DEVICE $MNT_DIR
FILE_NAME=$RANDOM
FILE_NAME_1=$RANDOM
FILE_REF="${FILE_NAME}_ref"
FILE_REF_1="${FILE_NAME}_ref_1"
for((i=0;i<305;i++))
do
# /mnt/1048576 is a file with 1048576 sizes.
cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME
cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1
done
for((i=0;i<3;i++))
do
cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME
done

for((i=0;i<2;i++))
do
cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME
cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1
done

cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME

for((i=0;i<11;i++))
do
cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME
cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1
done
reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF
# write_f is a program which will write some bytes to a file at offset.
# write_f -f file_name -l offset -w write_bytes.
../write_f -f $MNT_DIR/$FILE_REF -l $[310*1048576] -w 4096
../write_f -f $MNT_DIR/$FILE_REF -l $[306*1048576] -w 4096
../write_f -f $MNT_DIR/$FILE_REF -l $[311*1048576] -w 4096
../write_f -f $MNT_DIR/$FILE_NAME -l $[310*1048576] -w 4096
../write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096
reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF_1
../write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096
#kernel panic here.

The reason is that if the ocfs2_extent_rec is the last record
in a leaf extent block, the old solution fails to find the
suitable end cpos. So this patch try to walk through the b-tree,
find the next sub root and get the c_pos the next sub-tree starts
from.

btw, I have runned tristan's test case against the patched kernel
for several days and this type of kernel panic never happens again.

Signed-off-by: Tao Ma <tao.ma(a)oracle.com>
Signed-off-by: Joel Becker <joel.becker(a)oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)suse.de>

---
fs/ocfs2/alloc.c | 10 ++--
fs/ocfs2/alloc.h | 5 ++
fs/ocfs2/refcounttree.c | 117 ++++++++++++++++++++++++++++++++++++++++++++----
3 files changed, 119 insertions(+), 13 deletions(-)

--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1765,9 +1765,9 @@ set_and_inc:
*
* The array index of the subtree root is passed back.
*/
-static int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
- struct ocfs2_path *left,
- struct ocfs2_path *right)
+int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
+ struct ocfs2_path *left,
+ struct ocfs2_path *right)
{
int i = 0;

@@ -2872,8 +2872,8 @@ out:
* This looks similar, but is subtly different to
* ocfs2_find_cpos_for_left_leaf().
*/
-static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
- struct ocfs2_path *path, u32 *cpos)
+int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
+ struct ocfs2_path *path, u32 *cpos)
{
int i, j, ret = 0;
u64 blkno;
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -317,4 +317,9 @@ int ocfs2_path_bh_journal_access(handle_
int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
handle_t *handle,
struct ocfs2_path *path);
+int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
+ struct ocfs2_path *path, u32 *cpos);
+int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
+ struct ocfs2_path *left,
+ struct ocfs2_path *right);
#endif /* OCFS2_ALLOC_H */
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -969,6 +969,103 @@ out:
}

/*
+ * Find the end range for a leaf refcount block indicated by
+ * el->l_recs[index].e_blkno.
+ */
+static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_extent_block *eb,
+ struct ocfs2_extent_list *el,
+ int index, u32 *cpos_end)
+{
+ int ret, i, subtree_root;
+ u32 cpos;
+ u64 blkno;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ struct ocfs2_path *left_path = NULL, *right_path = NULL;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_extent_list *tmp_el;
+
+ if (index < le16_to_cpu(el->l_next_free_rec) - 1) {
+ /*
+ * We have a extent rec after index, so just use the e_cpos
+ * of the next extent rec.
+ */
+ *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos);
+ return 0;
+ }
+
+ if (!eb || (eb && !eb->h_next_leaf_blk)) {
+ /*
+ * We are the last extent rec, so any high cpos should
+ * be stored in this leaf refcount block.
+ */
+ *cpos_end = UINT_MAX;
+ return 0;
+ }
+
+ /*
+ * If the extent block isn't the last one, we have to find
+ * the subtree root between this extent block and the next
+ * leaf extent block and get the corresponding e_cpos from
+ * the subroot. Otherwise we may corrupt the b-tree.
+ */
+ ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+
+ left_path = ocfs2_new_path_from_et(&et);
+ if (!left_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos);
+ ret = ocfs2_find_path(ci, left_path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ right_path = ocfs2_new_path_from_path(left_path);
+ if (!right_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(ci, right_path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ subtree_root = ocfs2_find_subtree_root(&et, left_path,
+ right_path);
+
+ tmp_el = left_path->p_node[subtree_root].el;
+ blkno = left_path->p_node[subtree_root+1].bh->b_blocknr;
+ for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) {
+ if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) {
+ *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos);
+ break;
+ }
+ }
+
+ BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec));
+
+out:
+ ocfs2_free_path(left_path);
+ ocfs2_free_path(right_path);
+ return ret;
+}
+
+/*
* Given a cpos and len, try to find the refcount record which contains cpos.
* 1. If cpos can be found in one refcount record, return the record.
* 2. If cpos can't be found, return a fake record which start from cpos
@@ -983,10 +1080,10 @@ static int ocfs2_get_refcount_rec(struct
struct buffer_head **ret_bh)
{
int ret = 0, i, found;
- u32 low_cpos;
+ u32 low_cpos, uninitialized_var(cpos_end);
struct ocfs2_extent_list *el;
- struct ocfs2_extent_rec *tmp, *rec = NULL;
- struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_rec *rec = NULL;
+ struct ocfs2_extent_block *eb = NULL;
struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
struct ocfs2_refcount_block *rb =
@@ -1034,12 +1131,16 @@ static int ocfs2_get_refcount_rec(struct
}
}

- /* adjust len when we have ocfs2_extent_rec after it. */
- if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) {
- tmp = &el->l_recs[i+1];
+ if (found) {
+ ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh,
+ eb, el, i, &cpos_end);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }

- if (le32_to_cpu(tmp->e_cpos) < cpos + len)
- len = le32_to_cpu(tmp->e_cpos) - cpos;
+ if (cpos_end < low_cpos + len)
+ len = cpos_end - low_cpos;
}

ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/