libceph, ceph: get and handle cluster maps with addrvecs

In preparation for msgr2, make the cluster send us maps with addrvecs
including both LEGACY and MSGR2 addrs instead of a single LEGACY addr.
This means advertising support for SERVER_NAUTILUS and also some older
features: SERVER_MIMIC, MONENC and MONNAMES.

MONNAMES and MONENC are actually pre-argonaut, we just never updated
ceph_monmap_decode() for them.  Decoding is unconditional, see commit
23c625ce30 ("libceph: assume argonaut on the server side").

SERVER_MIMIC doesn't bear any meaning for the kernel client.

Since ceph_decode_entity_addrvec() is guarded by encoding version
checks (and in msgr2 case it is guarded implicitly by the fact that
server is speaking msgr2), we assume MSG_ADDR2 for it.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
This commit is contained in:
Ilya Dryomov 2020-10-30 13:30:51 +01:00
parent 8921f25116
commit a5cbd5fc22
10 changed files with 222 additions and 72 deletions

View file

@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/ceph/decode.h>
@ -82,3 +83,58 @@ bad:
}
EXPORT_SYMBOL(ceph_decode_entity_addr);
/*
* Return addr of desired type (MSGR2 or LEGACY) or error.
* Make sure there is only one match.
*
* Assume encoding with MSG_ADDR2.
*/
int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2,
struct ceph_entity_addr *addr)
{
__le32 my_type = msgr2 ? CEPH_ENTITY_ADDR_TYPE_MSGR2 :
CEPH_ENTITY_ADDR_TYPE_LEGACY;
struct ceph_entity_addr tmp_addr;
int addr_cnt;
bool found;
u8 marker;
int ret;
int i;
ceph_decode_8_safe(p, end, marker, e_inval);
if (marker != 2) {
pr_err("bad addrvec marker %d\n", marker);
return -EINVAL;
}
ceph_decode_32_safe(p, end, addr_cnt, e_inval);
found = false;
for (i = 0; i < addr_cnt; i++) {
ret = ceph_decode_entity_addr(p, end, &tmp_addr);
if (ret)
return ret;
if (tmp_addr.type == my_type) {
if (found) {
pr_err("another match of type %d in addrvec\n",
le32_to_cpu(my_type));
return -EINVAL;
}
memcpy(addr, &tmp_addr, sizeof(*addr));
found = true;
}
}
if (!found && addr_cnt != 0) {
pr_err("no match of type %d in addrvec\n",
le32_to_cpu(my_type));
return -ENOENT;
}
return 0;
e_inval:
return -EINVAL;
}
EXPORT_SYMBOL(ceph_decode_entity_addrvec);

View file

@ -36,57 +36,122 @@ static const struct ceph_connection_operations mon_con_ops;
static int __validate_auth(struct ceph_mon_client *monc);
static int decode_mon_info(void **p, void *end, bool msgr2,
struct ceph_entity_addr *addr)
{
void *mon_info_end;
u32 struct_len;
u8 struct_v;
int ret;
ret = ceph_start_decoding(p, end, 1, "mon_info_t", &struct_v,
&struct_len);
if (ret)
return ret;
mon_info_end = *p + struct_len;
ceph_decode_skip_string(p, end, e_inval); /* skip mon name */
ret = ceph_decode_entity_addrvec(p, end, msgr2, addr);
if (ret)
return ret;
*p = mon_info_end;
return 0;
e_inval:
return -EINVAL;
}
/*
* Decode a monmap blob (e.g., during mount).
*
* Assume MonMap v3 (i.e. encoding with MONNAMES and MONENC).
*/
static struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
static struct ceph_monmap *ceph_monmap_decode(void **p, void *end, bool msgr2)
{
struct ceph_monmap *m = NULL;
int i, err = -EINVAL;
struct ceph_monmap *monmap = NULL;
struct ceph_fsid fsid;
u32 epoch, num_mon;
u32 len;
u32 struct_len;
int blob_len;
int num_mon;
u8 struct_v;
u32 epoch;
int ret;
int i;
ceph_decode_32_safe(&p, end, len, bad);
ceph_decode_need(&p, end, len, bad);
ceph_decode_32_safe(p, end, blob_len, e_inval);
ceph_decode_need(p, end, blob_len, e_inval);
dout("monmap_decode %p %p len %d (%d)\n", p, end, len, (int)(end-p));
p += sizeof(u16); /* skip version */
ret = ceph_start_decoding(p, end, 6, "monmap", &struct_v, &struct_len);
if (ret)
goto fail;
ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
ceph_decode_copy(&p, &fsid, sizeof(fsid));
epoch = ceph_decode_32(&p);
dout("%s struct_v %d\n", __func__, struct_v);
ceph_decode_copy_safe(p, end, &fsid, sizeof(fsid), e_inval);
ceph_decode_32_safe(p, end, epoch, e_inval);
if (struct_v >= 6) {
u32 feat_struct_len;
u8 feat_struct_v;
num_mon = ceph_decode_32(&p);
*p += sizeof(struct ceph_timespec); /* skip last_changed */
*p += sizeof(struct ceph_timespec); /* skip created */
if (num_mon > CEPH_MAX_MON)
goto bad;
m = kmalloc(struct_size(m, mon_inst, num_mon), GFP_NOFS);
if (m == NULL)
return ERR_PTR(-ENOMEM);
m->fsid = fsid;
m->epoch = epoch;
m->num_mon = num_mon;
for (i = 0; i < num_mon; ++i) {
struct ceph_entity_inst *inst = &m->mon_inst[i];
ret = ceph_start_decoding(p, end, 1, "mon_feature_t",
&feat_struct_v, &feat_struct_len);
if (ret)
goto fail;
/* copy name portion */
ceph_decode_copy_safe(&p, end, &inst->name,
sizeof(inst->name), bad);
err = ceph_decode_entity_addr(&p, end, &inst->addr);
if (err)
goto bad;
*p += feat_struct_len; /* skip persistent_features */
ret = ceph_start_decoding(p, end, 1, "mon_feature_t",
&feat_struct_v, &feat_struct_len);
if (ret)
goto fail;
*p += feat_struct_len; /* skip optional_features */
}
dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
m->num_mon);
for (i = 0; i < m->num_mon; i++)
dout("monmap_decode mon%d is %s\n", i,
ceph_pr_addr(&m->mon_inst[i].addr));
return m;
bad:
dout("monmap_decode failed with %d\n", err);
kfree(m);
return ERR_PTR(err);
ceph_decode_32_safe(p, end, num_mon, e_inval);
dout("%s fsid %pU epoch %u num_mon %d\n", __func__, &fsid, epoch,
num_mon);
if (num_mon > CEPH_MAX_MON)
goto e_inval;
monmap = kmalloc(struct_size(monmap, mon_inst, num_mon), GFP_NOIO);
if (!monmap) {
ret = -ENOMEM;
goto fail;
}
monmap->fsid = fsid;
monmap->epoch = epoch;
monmap->num_mon = num_mon;
/* legacy_mon_addr map or mon_info map */
for (i = 0; i < num_mon; i++) {
struct ceph_entity_inst *inst = &monmap->mon_inst[i];
ceph_decode_skip_string(p, end, e_inval); /* skip mon name */
inst->name.type = CEPH_ENTITY_TYPE_MON;
inst->name.num = cpu_to_le64(i);
if (struct_v >= 6)
ret = decode_mon_info(p, end, msgr2, &inst->addr);
else
ret = ceph_decode_entity_addr(p, end, &inst->addr);
if (ret)
goto fail;
dout("%s mon%d addr %s\n", __func__, i,
ceph_pr_addr(&inst->addr));
}
return monmap;
e_inval:
ret = -EINVAL;
fail:
kfree(monmap);
return ERR_PTR(ret);
}
/*
@ -476,7 +541,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
p = msg->front.iov_base;
end = p + msg->front.iov_len;
monmap = ceph_monmap_decode(p, end);
monmap = ceph_monmap_decode(&p, end, false);
if (IS_ERR(monmap)) {
pr_err("problem decoding monmap, %d\n",
(int)PTR_ERR(monmap));

View file

@ -3918,9 +3918,9 @@ static int handle_one_map(struct ceph_osd_client *osdc,
set_pool_was_full(osdc);
if (incremental)
newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
newmap = osdmap_apply_incremental(&p, end, false, osdc->osdmap);
else
newmap = ceph_osdmap_decode(&p, end);
newmap = ceph_osdmap_decode(&p, end, false);
if (IS_ERR(newmap))
return PTR_ERR(newmap);

View file

@ -1647,7 +1647,8 @@ static int decode_old_pg_upmap_items(void **p, void *end,
/*
* decode a full map.
*/
static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
static int osdmap_decode(void **p, void *end, bool msgr2,
struct ceph_osdmap *map)
{
u8 struct_v;
u32 epoch = 0;
@ -1718,9 +1719,16 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
goto e_inval;
for (i = 0; i < map->max_osd; i++) {
err = ceph_decode_entity_addr(p, end, &map->osd_addr[i]);
struct ceph_entity_addr *addr = &map->osd_addr[i];
if (struct_v >= 8)
err = ceph_decode_entity_addrvec(p, end, msgr2, addr);
else
err = ceph_decode_entity_addr(p, end, addr);
if (err)
goto bad;
dout("%s osd%d addr %s\n", __func__, i, ceph_pr_addr(addr));
}
/* pg_temp */
@ -1790,7 +1798,7 @@ bad:
/*
* Allocate and decode a full map.
*/
struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2)
{
struct ceph_osdmap *map;
int ret;
@ -1799,7 +1807,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
if (!map)
return ERR_PTR(-ENOMEM);
ret = osdmap_decode(p, end, map);
ret = osdmap_decode(p, end, msgr2, map);
if (ret) {
ceph_osdmap_destroy(map);
return ERR_PTR(ret);
@ -1817,12 +1825,13 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
* new_state: { osd=6, xorstate=EXISTS } # clear osd_state
*/
static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
struct ceph_osdmap *map)
bool msgr2, struct ceph_osdmap *map)
{
void *new_up_client;
void *new_state;
void *new_weight_end;
u32 len;
int ret;
int i;
new_up_client = *p;
@ -1831,8 +1840,12 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
struct ceph_entity_addr addr;
ceph_decode_skip_32(p, end, e_inval);
if (ceph_decode_entity_addr(p, end, &addr))
goto e_inval;
if (struct_v >= 7)
ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
else
ret = ceph_decode_entity_addr(p, end, &addr);
if (ret)
return ret;
}
new_state = *p;
@ -1874,7 +1887,6 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
while (len--) {
s32 osd;
u32 xorstate;
int ret;
osd = ceph_decode_32(p);
if (struct_v >= 5)
@ -1910,8 +1922,15 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
osd = ceph_decode_32(p);
BUG_ON(osd >= map->max_osd);
if (ceph_decode_entity_addr(p, end, &addr))
goto e_inval;
if (struct_v >= 7)
ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
else
ret = ceph_decode_entity_addr(p, end, &addr);
if (ret)
return ret;
dout("%s osd%d addr %s\n", __func__, osd, ceph_pr_addr(&addr));
pr_info("osd%d up\n", osd);
map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
map->osd_addr[osd] = addr;
@ -1927,7 +1946,7 @@ e_inval:
/*
* decode and apply an incremental map update.
*/
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2,
struct ceph_osdmap *map)
{
struct ceph_fsid fsid;
@ -1962,7 +1981,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
if (len > 0) {
dout("apply_incremental full map len %d, %p to %p\n",
len, *p, end);
return ceph_osdmap_decode(p, min(*p+len, end));
return ceph_osdmap_decode(p, min(*p+len, end), msgr2);
}
/* new crush? */
@ -2014,7 +2033,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
}
/* new_up_client, new_state, new_weight */
err = decode_new_up_state_weight(p, end, struct_v, map);
err = decode_new_up_state_weight(p, end, struct_v, msgr2, map);
if (err)
goto bad;