mirror of
https://github.com/Fishwaldo/Star64_linux.git
synced 2025-06-29 01:51:39 +00:00
ceph: factor out libceph from Ceph file system
This factors out protocol and low-level storage parts of ceph into a separate libceph module living in net/ceph and include/linux/ceph. This is mostly a matter of moving files around. However, a few key pieces of the interface change as well: - ceph_client becomes ceph_fs_client and ceph_client, where the latter captures the mon and osd clients, and the fs_client gets the mds client and file system specific pieces. - Mount option parsing and debugfs setup is correspondingly broken into two pieces. - The mon client gets a generic handler callback for otherwise unknown messages (mds map, in this case). - The basic supported/required feature bits can be expanded (and are by ceph_fs_client). No functional change, aside from some subtle error handling cases that got cleaned up in the refactoring process. Signed-off-by: Sage Weil <sage@newdream.net>
This commit is contained in:
parent
ae1533b62b
commit
3d14c5d2b6
73 changed files with 2590 additions and 1862 deletions
92
include/linux/ceph/auth.h
Normal file
92
include/linux/ceph/auth.h
Normal file
|
@ -0,0 +1,92 @@
|
|||
#ifndef _FS_CEPH_AUTH_H
|
||||
#define _FS_CEPH_AUTH_H
|
||||
|
||||
#include <linux/ceph/types.h>
|
||||
#include <linux/ceph/buffer.h>
|
||||
|
||||
/*
|
||||
* Abstract interface for communicating with the authentication module.
|
||||
* There is some handshake that takes place between us and the monitor
|
||||
* to acquire the necessary keys. These are used to generate an
|
||||
* 'authorizer' that we use when connecting to a service (mds, osd).
|
||||
*/
|
||||
|
||||
struct ceph_auth_client;
|
||||
struct ceph_authorizer;
|
||||
|
||||
struct ceph_auth_client_ops {
|
||||
const char *name;
|
||||
|
||||
/*
|
||||
* true if we are authenticated and can connect to
|
||||
* services.
|
||||
*/
|
||||
int (*is_authenticated)(struct ceph_auth_client *ac);
|
||||
|
||||
/*
|
||||
* true if we should (re)authenticate, e.g., when our tickets
|
||||
* are getting old and crusty.
|
||||
*/
|
||||
int (*should_authenticate)(struct ceph_auth_client *ac);
|
||||
|
||||
/*
|
||||
* build requests and process replies during monitor
|
||||
* handshake. if handle_reply returns -EAGAIN, we build
|
||||
* another request.
|
||||
*/
|
||||
int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
|
||||
int (*handle_reply)(struct ceph_auth_client *ac, int result,
|
||||
void *buf, void *end);
|
||||
|
||||
/*
|
||||
* Create authorizer for connecting to a service, and verify
|
||||
* the response to authenticate the service.
|
||||
*/
|
||||
int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
|
||||
struct ceph_authorizer **a,
|
||||
void **buf, size_t *len,
|
||||
void **reply_buf, size_t *reply_len);
|
||||
int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
|
||||
struct ceph_authorizer *a, size_t len);
|
||||
void (*destroy_authorizer)(struct ceph_auth_client *ac,
|
||||
struct ceph_authorizer *a);
|
||||
void (*invalidate_authorizer)(struct ceph_auth_client *ac,
|
||||
int peer_type);
|
||||
|
||||
/* reset when we (re)connect to a monitor */
|
||||
void (*reset)(struct ceph_auth_client *ac);
|
||||
|
||||
void (*destroy)(struct ceph_auth_client *ac);
|
||||
};
|
||||
|
||||
struct ceph_auth_client {
|
||||
u32 protocol; /* CEPH_AUTH_* */
|
||||
void *private; /* for use by protocol implementation */
|
||||
const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
|
||||
|
||||
bool negotiating; /* true if negotiating protocol */
|
||||
const char *name; /* entity name */
|
||||
u64 global_id; /* our unique id in system */
|
||||
const char *secret; /* our secret key */
|
||||
unsigned want_keys; /* which services we want */
|
||||
};
|
||||
|
||||
extern struct ceph_auth_client *ceph_auth_init(const char *name,
|
||||
const char *secret);
|
||||
extern void ceph_auth_destroy(struct ceph_auth_client *ac);
|
||||
|
||||
extern void ceph_auth_reset(struct ceph_auth_client *ac);
|
||||
|
||||
extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
|
||||
void *buf, size_t len);
|
||||
extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
|
||||
void *buf, size_t len,
|
||||
void *reply_buf, size_t reply_len);
|
||||
extern int ceph_entity_name_encode(const char *name, void **p, void *end);
|
||||
|
||||
extern int ceph_build_auth(struct ceph_auth_client *ac,
|
||||
void *msg_buf, size_t msg_len);
|
||||
|
||||
extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
|
||||
|
||||
#endif
|
39
include/linux/ceph/buffer.h
Normal file
39
include/linux/ceph/buffer.h
Normal file
|
@ -0,0 +1,39 @@
|
|||
#ifndef __FS_CEPH_BUFFER_H
|
||||
#define __FS_CEPH_BUFFER_H
|
||||
|
||||
#include <linux/kref.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/uio.h>
|
||||
|
||||
/*
|
||||
* a simple reference counted buffer.
|
||||
*
|
||||
* use kmalloc for small sizes (<= one page), vmalloc for larger
|
||||
* sizes.
|
||||
*/
|
||||
struct ceph_buffer {
|
||||
struct kref kref;
|
||||
struct kvec vec;
|
||||
size_t alloc_len;
|
||||
bool is_vmalloc;
|
||||
};
|
||||
|
||||
extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
|
||||
extern void ceph_buffer_release(struct kref *kref);
|
||||
|
||||
static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
|
||||
{
|
||||
kref_get(&b->kref);
|
||||
return b;
|
||||
}
|
||||
|
||||
static inline void ceph_buffer_put(struct ceph_buffer *b)
|
||||
{
|
||||
kref_put(&b->kref, ceph_buffer_release);
|
||||
}
|
||||
|
||||
extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
|
||||
|
||||
#endif
|
38
include/linux/ceph/ceph_debug.h
Normal file
38
include/linux/ceph/ceph_debug.h
Normal file
|
@ -0,0 +1,38 @@
|
|||
#ifndef _FS_CEPH_DEBUG_H
|
||||
#define _FS_CEPH_DEBUG_H
|
||||
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
|
||||
#ifdef CONFIG_CEPH_LIB_PRETTYDEBUG
|
||||
|
||||
/*
|
||||
* wrap pr_debug to include a filename:lineno prefix on each line.
|
||||
* this incurs some overhead (kernel size and execution time) due to
|
||||
* the extra function call at each call site.
|
||||
*/
|
||||
|
||||
# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
|
||||
extern const char *ceph_file_part(const char *s, int len);
|
||||
# define dout(fmt, ...) \
|
||||
pr_debug("%.*s %12.12s:%-4d : " fmt, \
|
||||
8 - (int)sizeof(KBUILD_MODNAME), " ", \
|
||||
ceph_file_part(__FILE__, sizeof(__FILE__)), \
|
||||
__LINE__, ##__VA_ARGS__)
|
||||
# else
|
||||
/* faux printk call just to see any compiler warnings. */
|
||||
# define dout(fmt, ...) do { \
|
||||
if (0) \
|
||||
printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
|
||||
} while (0)
|
||||
# endif
|
||||
|
||||
#else
|
||||
|
||||
/*
|
||||
* or, just wrap pr_debug
|
||||
*/
|
||||
# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
109
include/linux/ceph/ceph_frag.h
Normal file
109
include/linux/ceph/ceph_frag.h
Normal file
|
@ -0,0 +1,109 @@
|
|||
#ifndef FS_CEPH_FRAG_H
|
||||
#define FS_CEPH_FRAG_H
|
||||
|
||||
/*
|
||||
* "Frags" are a way to describe a subset of a 32-bit number space,
|
||||
* using a mask and a value to match against that mask. Any given frag
|
||||
* (subset of the number space) can be partitioned into 2^n sub-frags.
|
||||
*
|
||||
* Frags are encoded into a 32-bit word:
|
||||
* 8 upper bits = "bits"
|
||||
* 24 lower bits = "value"
|
||||
* (We could go to 5+27 bits, but who cares.)
|
||||
*
|
||||
* We use the _most_ significant bits of the 24 bit value. This makes
|
||||
* values logically sort.
|
||||
*
|
||||
* Unfortunately, because the "bits" field is still in the high bits, we
|
||||
* can't sort encoded frags numerically. However, it does allow you
|
||||
* to feed encoded frags as values into frag_contains_value.
|
||||
*/
|
||||
/*
 * Encode a frag from a bit count @b and a value @v.  Only the top @b
 * bits of the 24-bit value field are kept; the rest are masked off.
 * The shift count (24-b) is only in range for b <= 24.
 */
static inline __u32 ceph_frag_make(__u32 b, __u32 v)
{
	return (b << 24) |
		(v & (0xffffffu << (24-b)) & 0xffffffu);
}

/* Number of significant bits: the upper 8 bits of the encoded frag. */
static inline __u32 ceph_frag_bits(__u32 f)
{
	return f >> 24;
}

/* Value field: the lower 24 bits of the encoded frag. */
static inline __u32 ceph_frag_value(__u32 f)
{
	return f & 0xffffffu;
}

/* 24-bit mask with the top ceph_frag_bits(f) bits set. */
static inline __u32 ceph_frag_mask(__u32 f)
{
	return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
}

/* Number of low bits of the value field that the mask does not cover. */
static inline __u32 ceph_frag_mask_shift(__u32 f)
{
	return 24 - ceph_frag_bits(f);
}

/* Nonzero if @v, masked by @f's mask, equals @f's value. */
static inline int ceph_frag_contains_value(__u32 f, __u32 v)
{
	return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
}

/* Nonzero if @sub is at least as specific as @f and lies inside it. */
static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
{
	/* is sub as specific as us, and contained by us? */
	return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
	       (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
}

/*
 * The frag one level up: one bit fewer, with the lowest significant
 * value bit cleared.  Must not be called on a 0-bit frag: bits - 1
 * would wrap around.
 */
static inline __u32 ceph_frag_parent(__u32 f)
{
	return ceph_frag_make(ceph_frag_bits(f) - 1,
			 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
}

/* Nonzero if @f has a parent and its lowest significant bit is clear. */
static inline int ceph_frag_is_left_child(__u32 f)
{
	return ceph_frag_bits(f) > 0 &&
		(ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
}

/*
 * Nonzero if @f has a parent and its lowest significant bit is set.
 *
 * The masked result is either 0 or the bit itself (0x1000000 >> bits),
 * so it has to be tested against 0; comparing it with 1 only matched
 * 24-bit frags.
 */
static inline int ceph_frag_is_right_child(__u32 f)
{
	return ceph_frag_bits(f) > 0 &&
		(ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) != 0;
}

/* Same depth, lowest significant bit flipped. */
static inline __u32 ceph_frag_sibling(__u32 f)
{
	return ceph_frag_make(ceph_frag_bits(f),
		      ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
}

/* One bit deeper, new bit clear. */
static inline __u32 ceph_frag_left_child(__u32 f)
{
	return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
}

/* One bit deeper, new bit set. */
static inline __u32 ceph_frag_right_child(__u32 f)
{
	return ceph_frag_make(ceph_frag_bits(f)+1,
	      ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
}

/* Child number @i of @f after splitting @f by @by additional bits. */
static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
{
	int newbits = ceph_frag_bits(f) + by;
	return ceph_frag_make(newbits,
			 ceph_frag_value(f) | (i << (24 - newbits)));
}

/* Nonzero if the value field is 0. */
static inline int ceph_frag_is_leftmost(__u32 f)
{
	return ceph_frag_value(f) == 0;
}

/* Nonzero if every significant bit of the value is set. */
static inline int ceph_frag_is_rightmost(__u32 f)
{
	return ceph_frag_value(f) == ceph_frag_mask(f);
}

/*
 * The frag that follows @f at the same depth.  Stepping past the
 * rightmost frag carries out of the 24-bit value field, which
 * ceph_frag_make() masks off, giving value 0 again.
 */
static inline __u32 ceph_frag_next(__u32 f)
{
	return ceph_frag_make(ceph_frag_bits(f),
		      ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
}
|
||||
|
||||
/*
|
||||
* comparator to sort frags logically, as when traversing the
|
||||
* number space in ascending order...
|
||||
*/
|
||||
int ceph_frag_compare(__u32 a, __u32 b);
|
||||
|
||||
#endif
|
728
include/linux/ceph/ceph_fs.h
Normal file
728
include/linux/ceph/ceph_fs.h
Normal file
|
@ -0,0 +1,728 @@
|
|||
/*
|
||||
* ceph_fs.h - Ceph constants and data types to share between kernel and
|
||||
* user space.
|
||||
*
|
||||
* Most types in this file are defined as little-endian, and are
|
||||
* primarily intended to describe data structures that pass over the
|
||||
* wire or that are stored on disk.
|
||||
*
|
||||
* LGPL2
|
||||
*/
|
||||
|
||||
#ifndef CEPH_FS_H
|
||||
#define CEPH_FS_H
|
||||
|
||||
#include "msgr.h"
|
||||
#include "rados.h"
|
||||
|
||||
/*
|
||||
* subprotocol versions. when specific messages types or high-level
|
||||
* protocols change, bump the affected components. we version
|
||||
* internal cluster protocols separately from the public,
|
||||
* client-facing protocol.
|
||||
*/
|
||||
#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
|
||||
#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
|
||||
#define CEPH_MON_PROTOCOL 5 /* cluster internal */
|
||||
#define CEPH_OSDC_PROTOCOL 24 /* server/client */
|
||||
#define CEPH_MDSC_PROTOCOL 32 /* server/client */
|
||||
#define CEPH_MONC_PROTOCOL 15 /* server/client */
|
||||
|
||||
|
||||
#define CEPH_INO_ROOT 1
|
||||
#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
|
||||
|
||||
/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
|
||||
#define CEPH_MAX_MON 31
|
||||
|
||||
|
||||
/*
|
||||
* feature bits
|
||||
*/
|
||||
#define CEPH_FEATURE_UID (1<<0)
|
||||
#define CEPH_FEATURE_NOSRCADDR (1<<1)
|
||||
#define CEPH_FEATURE_MONCLOCKCHECK (1<<2)
|
||||
#define CEPH_FEATURE_FLOCK (1<<3)
|
||||
|
||||
|
||||
/*
|
||||
* ceph_file_layout - describe data layout for a file/inode
|
||||
*/
|
||||
struct ceph_file_layout {
|
||||
/* file -> object mapping */
|
||||
__le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
|
||||
of page size. */
|
||||
__le32 fl_stripe_count; /* over this many objects */
|
||||
__le32 fl_object_size; /* until objects are this big, then move to
|
||||
new objects */
|
||||
__le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
|
||||
|
||||
/* pg -> disk layout */
|
||||
__le32 fl_object_stripe_unit; /* for per-object parity, if any */
|
||||
|
||||
/* object -> pg layout */
|
||||
__le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
|
||||
__le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
|
||||
} __attribute__ ((packed));
|
||||
|
||||
#define CEPH_MIN_STRIPE_UNIT 65536
|
||||
|
||||
int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
|
||||
|
||||
|
||||
/* crypto algorithms */
|
||||
#define CEPH_CRYPTO_NONE 0x0
|
||||
#define CEPH_CRYPTO_AES 0x1
|
||||
|
||||
#define CEPH_AES_IV "cephsageyudagreg"
|
||||
|
||||
/* security/authentication protocols */
|
||||
#define CEPH_AUTH_UNKNOWN 0x0
|
||||
#define CEPH_AUTH_NONE 0x1
|
||||
#define CEPH_AUTH_CEPHX 0x2
|
||||
|
||||
#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
|
||||
|
||||
|
||||
/*********************************************
|
||||
* message layer
|
||||
*/
|
||||
|
||||
/*
|
||||
* message types
|
||||
*/
|
||||
|
||||
/* misc */
|
||||
#define CEPH_MSG_SHUTDOWN 1
|
||||
#define CEPH_MSG_PING 2
|
||||
|
||||
/* client <-> monitor */
|
||||
#define CEPH_MSG_MON_MAP 4
|
||||
#define CEPH_MSG_MON_GET_MAP 5
|
||||
#define CEPH_MSG_STATFS 13
|
||||
#define CEPH_MSG_STATFS_REPLY 14
|
||||
#define CEPH_MSG_MON_SUBSCRIBE 15
|
||||
#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
|
||||
#define CEPH_MSG_AUTH 17
|
||||
#define CEPH_MSG_AUTH_REPLY 18
|
||||
|
||||
/* client <-> mds */
|
||||
#define CEPH_MSG_MDS_MAP 21
|
||||
|
||||
#define CEPH_MSG_CLIENT_SESSION 22
|
||||
#define CEPH_MSG_CLIENT_RECONNECT 23
|
||||
|
||||
#define CEPH_MSG_CLIENT_REQUEST 24
|
||||
#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
|
||||
#define CEPH_MSG_CLIENT_REPLY 26
|
||||
#define CEPH_MSG_CLIENT_CAPS 0x310
|
||||
#define CEPH_MSG_CLIENT_LEASE 0x311
|
||||
#define CEPH_MSG_CLIENT_SNAP 0x312
|
||||
#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
|
||||
|
||||
/* pool ops */
|
||||
#define CEPH_MSG_POOLOP_REPLY 48
|
||||
#define CEPH_MSG_POOLOP 49
|
||||
|
||||
|
||||
/* osd */
|
||||
#define CEPH_MSG_OSD_MAP 41
|
||||
#define CEPH_MSG_OSD_OP 42
|
||||
#define CEPH_MSG_OSD_OPREPLY 43
|
||||
|
||||
/* pool operations */
|
||||
enum {
|
||||
POOL_OP_CREATE = 0x01,
|
||||
POOL_OP_DELETE = 0x02,
|
||||
POOL_OP_AUID_CHANGE = 0x03,
|
||||
POOL_OP_CREATE_SNAP = 0x11,
|
||||
POOL_OP_DELETE_SNAP = 0x12,
|
||||
POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
|
||||
POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
|
||||
};
|
||||
|
||||
struct ceph_mon_request_header {
|
||||
__le64 have_version;
|
||||
__le16 session_mon;
|
||||
__le64 session_mon_tid;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
struct ceph_mon_statfs {
|
||||
struct ceph_mon_request_header monhdr;
|
||||
struct ceph_fsid fsid;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
struct ceph_statfs {
|
||||
__le64 kb, kb_used, kb_avail;
|
||||
__le64 num_objects;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
struct ceph_mon_statfs_reply {
|
||||
struct ceph_fsid fsid;
|
||||
__le64 version;
|
||||
struct ceph_statfs st;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
const char *ceph_pool_op_name(int op);
|
||||
|
||||
struct ceph_mon_poolop {
|
||||
struct ceph_mon_request_header monhdr;
|
||||
struct ceph_fsid fsid;
|
||||
__le32 pool;
|
||||
__le32 op;
|
||||
__le64 auid;
|
||||
__le64 snapid;
|
||||
__le32 name_len;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
struct ceph_mon_poolop_reply {
|
||||
struct ceph_mon_request_header monhdr;
|
||||
struct ceph_fsid fsid;
|
||||
__le32 reply_code;
|
||||
__le32 epoch;
|
||||
char has_data;
|
||||
char data[0];
|
||||
} __attribute__ ((packed));
|
||||
|
||||
struct ceph_mon_unmanaged_snap {
|
||||
__le64 snapid;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
struct ceph_osd_getmap {
|
||||
struct ceph_mon_request_header monhdr;
|
||||
struct ceph_fsid fsid;
|
||||
__le32 start;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
struct ceph_mds_getmap {
|
||||
struct ceph_mon_request_header monhdr;
|
||||
struct ceph_fsid fsid;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
struct ceph_client_mount {
|
||||
struct ceph_mon_request_header monhdr;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
struct ceph_mon_subscribe_item {
|
||||
__le64 have_version; __le64 have;
|
||||
__u8 onetime;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
struct ceph_mon_subscribe_ack {
|
||||
__le32 duration; /* seconds */
|
||||
struct ceph_fsid fsid;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
/*
|
||||
* mds states
|
||||
* > 0 -> in
|
||||
* <= 0 -> out
|
||||
*/
|
||||
#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
|
||||
#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
|
||||
empty log. */
|
||||
#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
|
||||
#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
|
||||
#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
|
||||
#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
|
||||
#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
|
||||
|
||||
#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
|
||||
#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
|
||||
operations (import, rename, etc.) */
|
||||
#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
|
||||
#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
|
||||
#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
|
||||
#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
|
||||
#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
|
||||
|
||||
extern const char *ceph_mds_state_name(int s);
|
||||
|
||||
|
||||
/*
|
||||
* metadata lock types.
|
||||
* - these are bitmasks.. we can compose them
|
||||
* - they also define the lock ordering by the MDS
|
||||
* - a few of these are internal to the mds
|
||||
*/
|
||||
#define CEPH_LOCK_DVERSION 1
|
||||
#define CEPH_LOCK_DN 2
|
||||
#define CEPH_LOCK_ISNAP 16
|
||||
#define CEPH_LOCK_IVERSION 32 /* mds internal */
|
||||
#define CEPH_LOCK_IFILE 64
|
||||
#define CEPH_LOCK_IAUTH 128
|
||||
#define CEPH_LOCK_ILINK 256
|
||||
#define CEPH_LOCK_IDFT 512 /* dir frag tree */
|
||||
#define CEPH_LOCK_INEST 1024 /* mds internal */
|
||||
#define CEPH_LOCK_IXATTR 2048
|
||||
#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */
|
||||
#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
|
||||
|
||||
/* client_session ops */
|
||||
enum {
|
||||
CEPH_SESSION_REQUEST_OPEN,
|
||||
CEPH_SESSION_OPEN,
|
||||
CEPH_SESSION_REQUEST_CLOSE,
|
||||
CEPH_SESSION_CLOSE,
|
||||
CEPH_SESSION_REQUEST_RENEWCAPS,
|
||||
CEPH_SESSION_RENEWCAPS,
|
||||
CEPH_SESSION_STALE,
|
||||
CEPH_SESSION_RECALL_STATE,
|
||||
};
|
||||
|
||||
extern const char *ceph_session_op_name(int op);
|
||||
|
||||
struct ceph_mds_session_head {
|
||||
__le32 op;
|
||||
__le64 seq;
|
||||
struct ceph_timespec stamp;
|
||||
__le32 max_caps, max_leases;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
/* client_request */
|
||||
/*
|
||||
* metadata ops.
|
||||
* & 0x001000 -> write op
|
||||
* & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
|
||||
* & 0x100000 -> use weird ino/path trace
|
||||
*/
|
||||
#define CEPH_MDS_OP_WRITE 0x001000
|
||||
enum {
|
||||
CEPH_MDS_OP_LOOKUP = 0x00100,
|
||||
CEPH_MDS_OP_GETATTR = 0x00101,
|
||||
CEPH_MDS_OP_LOOKUPHASH = 0x00102,
|
||||
CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
|
||||
|
||||
CEPH_MDS_OP_SETXATTR = 0x01105,
|
||||
CEPH_MDS_OP_RMXATTR = 0x01106,
|
||||
CEPH_MDS_OP_SETLAYOUT = 0x01107,
|
||||
CEPH_MDS_OP_SETATTR = 0x01108,
|
||||
CEPH_MDS_OP_SETFILELOCK= 0x01109,
|
||||
CEPH_MDS_OP_GETFILELOCK= 0x00110,
|
||||
|
||||
CEPH_MDS_OP_MKNOD = 0x01201,
|
||||
CEPH_MDS_OP_LINK = 0x01202,
|
||||
CEPH_MDS_OP_UNLINK = 0x01203,
|
||||
CEPH_MDS_OP_RENAME = 0x01204,
|
||||
CEPH_MDS_OP_MKDIR = 0x01220,
|
||||
CEPH_MDS_OP_RMDIR = 0x01221,
|
||||
CEPH_MDS_OP_SYMLINK = 0x01222,
|
||||
|
||||
CEPH_MDS_OP_CREATE = 0x01301,
|
||||
CEPH_MDS_OP_OPEN = 0x00302,
|
||||
CEPH_MDS_OP_READDIR = 0x00305,
|
||||
|
||||
CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
|
||||
CEPH_MDS_OP_MKSNAP = 0x01400,
|
||||
CEPH_MDS_OP_RMSNAP = 0x01401,
|
||||
CEPH_MDS_OP_LSSNAP = 0x00402,
|
||||
};
|
||||
|
||||
extern const char *ceph_mds_op_name(int op);
|
||||
|
||||
|
||||
#define CEPH_SETATTR_MODE 1
|
||||
#define CEPH_SETATTR_UID 2
|
||||
#define CEPH_SETATTR_GID 4
|
||||
#define CEPH_SETATTR_MTIME 8
|
||||
#define CEPH_SETATTR_ATIME 16
|
||||
#define CEPH_SETATTR_SIZE 32
|
||||
#define CEPH_SETATTR_CTIME 64
|
||||
|
||||
union ceph_mds_request_args {
|
||||
struct {
|
||||
__le32 mask; /* CEPH_CAP_* */
|
||||
} __attribute__ ((packed)) getattr;
|
||||
struct {
|
||||
__le32 mode;
|
||||
__le32 uid;
|
||||
__le32 gid;
|
||||
struct ceph_timespec mtime;
|
||||
struct ceph_timespec atime;
|
||||
__le64 size, old_size; /* old_size needed by truncate */
|
||||
__le32 mask; /* CEPH_SETATTR_* */
|
||||
} __attribute__ ((packed)) setattr;
|
||||
struct {
|
||||
__le32 frag; /* which dir fragment */
|
||||
__le32 max_entries; /* how many dentries to grab */
|
||||
__le32 max_bytes;
|
||||
} __attribute__ ((packed)) readdir;
|
||||
struct {
|
||||
__le32 mode;
|
||||
__le32 rdev;
|
||||
} __attribute__ ((packed)) mknod;
|
||||
struct {
|
||||
__le32 mode;
|
||||
} __attribute__ ((packed)) mkdir;
|
||||
struct {
|
||||
__le32 flags;
|
||||
__le32 mode;
|
||||
__le32 stripe_unit; /* layout for newly created file */
|
||||
__le32 stripe_count; /* ... */
|
||||
__le32 object_size;
|
||||
__le32 file_replication;
|
||||
__le32 preferred;
|
||||
} __attribute__ ((packed)) open;
|
||||
struct {
|
||||
__le32 flags;
|
||||
} __attribute__ ((packed)) setxattr;
|
||||
struct {
|
||||
struct ceph_file_layout layout;
|
||||
} __attribute__ ((packed)) setlayout;
|
||||
struct {
|
||||
__u8 rule; /* currently fcntl or flock */
|
||||
__u8 type; /* shared, exclusive, remove*/
|
||||
__le64 pid; /* process id requesting the lock */
|
||||
__le64 pid_namespace;
|
||||
__le64 start; /* initial location to lock */
|
||||
__le64 length; /* num bytes to lock from start */
|
||||
__u8 wait; /* will caller wait for lock to become available? */
|
||||
} __attribute__ ((packed)) filelock_change;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
|
||||
#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
|
||||
|
||||
struct ceph_mds_request_head {
|
||||
__le64 oldest_client_tid;
|
||||
__le32 mdsmap_epoch; /* on client */
|
||||
__le32 flags; /* CEPH_MDS_FLAG_* */
|
||||
__u8 num_retry, num_fwd; /* count retry, fwd attempts */
|
||||
__le16 num_releases; /* # include cap/lease release records */
|
||||
__le32 op; /* mds op code */
|
||||
__le32 caller_uid, caller_gid;
|
||||
__le64 ino; /* use this ino for openc, mkdir, mknod,
|
||||
etc. (if replaying) */
|
||||
union ceph_mds_request_args args;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
/* cap/lease release record */
|
||||
struct ceph_mds_request_release {
|
||||
__le64 ino, cap_id; /* ino and unique cap id */
|
||||
__le32 caps, wanted; /* new issued, wanted */
|
||||
__le32 seq, issue_seq, mseq;
|
||||
__le32 dname_seq; /* if releasing a dentry lease, a */
|
||||
__le32 dname_len; /* string follows. */
|
||||
} __attribute__ ((packed));
|
||||
|
||||
/* client reply */
|
||||
struct ceph_mds_reply_head {
|
||||
__le32 op;
|
||||
__le32 result;
|
||||
__le32 mdsmap_epoch;
|
||||
__u8 safe; /* true if committed to disk */
|
||||
__u8 is_dentry, is_target; /* true if dentry, target inode records
|
||||
are included with reply */
|
||||
} __attribute__ ((packed));
|
||||
|
||||
/* one for each node split */
|
||||
struct ceph_frag_tree_split {
|
||||
__le32 frag; /* this frag splits... */
|
||||
__le32 by; /* ...by this many bits */
|
||||
} __attribute__ ((packed));
|
||||
|
||||
struct ceph_frag_tree_head {
|
||||
__le32 nsplits; /* num ceph_frag_tree_split records */
|
||||
struct ceph_frag_tree_split splits[];
|
||||
} __attribute__ ((packed));
|
||||
|
||||
/* capability issue, for bundling with mds reply */
|
||||
struct ceph_mds_reply_cap {
|
||||
__le32 caps, wanted; /* caps issued, wanted */
|
||||
__le64 cap_id;
|
||||
__le32 seq, mseq;
|
||||
__le64 realm; /* snap realm */
|
||||
__u8 flags; /* CEPH_CAP_FLAG_* */
|
||||
} __attribute__ ((packed));
|
||||
|
||||
#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
|
||||
|
||||
/* inode record, for bundling with mds reply */
|
||||
struct ceph_mds_reply_inode {
|
||||
__le64 ino;
|
||||
__le64 snapid;
|
||||
__le32 rdev;
|
||||
__le64 version; /* inode version */
|
||||
__le64 xattr_version; /* version for xattr blob */
|
||||
struct ceph_mds_reply_cap cap; /* caps issued for this inode */
|
||||
struct ceph_file_layout layout;
|
||||
struct ceph_timespec ctime, mtime, atime;
|
||||
__le32 time_warp_seq;
|
||||
__le64 size, max_size, truncate_size;
|
||||
__le32 truncate_seq;
|
||||
__le32 mode, uid, gid;
|
||||
__le32 nlink;
|
||||
__le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
|
||||
struct ceph_timespec rctime;
|
||||
struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
|
||||
} __attribute__ ((packed));
|
||||
/* followed by frag array, then symlink string, then xattr blob */
|
||||
|
||||
/* reply_lease follows dname, and reply_inode */
|
||||
struct ceph_mds_reply_lease {
|
||||
__le16 mask; /* lease type(s) */
|
||||
__le32 duration_ms; /* lease duration */
|
||||
__le32 seq;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
struct ceph_mds_reply_dirfrag {
|
||||
__le32 frag; /* fragment */
|
||||
__le32 auth; /* auth mds, if this is a delegation point */
|
||||
__le32 ndist; /* number of mds' this is replicated on */
|
||||
__le32 dist[];
|
||||
} __attribute__ ((packed));
|
||||
|
||||
#define CEPH_LOCK_FCNTL 1
|
||||
#define CEPH_LOCK_FLOCK 2
|
||||
|
||||
#define CEPH_LOCK_SHARED 1
|
||||
#define CEPH_LOCK_EXCL 2
|
||||
#define CEPH_LOCK_UNLOCK 4
|
||||
|
||||
struct ceph_filelock {
|
||||
__le64 start;/* file offset to start lock at */
|
||||
__le64 length; /* num bytes to lock; 0 for all following start */
|
||||
__le64 client; /* which client holds the lock */
|
||||
__le64 pid; /* process id holding the lock on the client */
|
||||
__le64 pid_namespace;
|
||||
__u8 type; /* shared lock, exclusive lock, or unlock */
|
||||
} __attribute__ ((packed));
|
||||
|
||||
|
||||
/* file access modes */
|
||||
#define CEPH_FILE_MODE_PIN 0
|
||||
#define CEPH_FILE_MODE_RD 1
|
||||
#define CEPH_FILE_MODE_WR 2
|
||||
#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
|
||||
#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
|
||||
#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
|
||||
|
||||
int ceph_flags_to_mode(int flags);
|
||||
|
||||
|
||||
/* capability bits */
|
||||
#define CEPH_CAP_PIN         1  /* no specific capabilities beyond the pin */

/*
 * generic cap bits; each is shifted left by one of the per-lock shifts
 * below to form the composed CEPH_CAP_<lock>_<bit> values.
 */
#define CEPH_CAP_GSHARED     1  /* client can read */
#define CEPH_CAP_GEXCL       2  /* client can read and update */
#define CEPH_CAP_GCACHE      4  /* (file) client can cache reads */
#define CEPH_CAP_GRD         8  /* (file) client can read */
#define CEPH_CAP_GWR        16  /* (file) client can write */
#define CEPH_CAP_GBUFFER    32  /* (file) client can buffer writes */
#define CEPH_CAP_GWREXTEND  64  /* (file) client can extend EOF */
#define CEPH_CAP_GLAZYIO   128  /* (file) client can perform lazy io */

/* per-lock shift */
#define CEPH_CAP_SAUTH      2
#define CEPH_CAP_SLINK      4
#define CEPH_CAP_SXATTR     6
#define CEPH_CAP_SFILE      8
#define CEPH_CAP_SFLOCK    20

#define CEPH_CAP_BITS       22

/* composed values */
#define CEPH_CAP_AUTH_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SAUTH)
#define CEPH_CAP_AUTH_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SAUTH)
#define CEPH_CAP_LINK_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SLINK)
#define CEPH_CAP_LINK_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SLINK)
#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED  << CEPH_CAP_SXATTR)
#define CEPH_CAP_XATTR_EXCL    (CEPH_CAP_GEXCL     << CEPH_CAP_SXATTR)
/*
 * Shift generic cap bits into the file lock's position.  The argument
 * is parenthesized so that an expression such as
 * CEPH_CAP_FILE(CEPH_CAP_GRD | CEPH_CAP_GWR) shifts the whole mask and
 * not just its last operand.
 */
#define CEPH_CAP_FILE(x)    ((x) << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_SHARED   (CEPH_CAP_GSHARED   << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_CACHE    (CEPH_CAP_GCACHE    << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_RD       (CEPH_CAP_GRD       << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_WR       (CEPH_CAP_GWR       << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_BUFFER   (CEPH_CAP_GBUFFER   << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_LAZYIO   (CEPH_CAP_GLAZYIO   << CEPH_CAP_SFILE)
#define CEPH_CAP_FLOCK_SHARED  (CEPH_CAP_GSHARED   << CEPH_CAP_SFLOCK)
#define CEPH_CAP_FLOCK_EXCL    (CEPH_CAP_GEXCL     << CEPH_CAP_SFLOCK)
|
||||
|
||||
|
||||
/* cap masks (for getattr) */
|
||||
#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
|
||||
#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
|
||||
#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
|
||||
#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
|
||||
#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
|
||||
#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
|
||||
#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
|
||||
#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
|
||||
#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
|
||||
#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
|
||||
#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
|
||||
#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
|
||||
#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
|
||||
CEPH_CAP_AUTH_SHARED | \
|
||||
CEPH_CAP_LINK_SHARED | \
|
||||
CEPH_CAP_FILE_SHARED | \
|
||||
CEPH_CAP_XATTR_SHARED)
|
||||
|
||||
#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
|
||||
CEPH_CAP_LINK_SHARED | \
|
||||
CEPH_CAP_XATTR_SHARED | \
|
||||
CEPH_CAP_FILE_SHARED)
|
||||
#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
|
||||
CEPH_CAP_FILE_CACHE)
|
||||
|
||||
#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
|
||||
CEPH_CAP_LINK_EXCL | \
|
||||
CEPH_CAP_XATTR_EXCL | \
|
||||
CEPH_CAP_FILE_EXCL)
|
||||
#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
|
||||
CEPH_CAP_FILE_EXCL)
|
||||
#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
|
||||
#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
|
||||
CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
|
||||
CEPH_CAP_PIN)
|
||||
|
||||
#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
|
||||
CEPH_LOCK_IXATTR)
|
||||
|
||||
int ceph_caps_for_mode(int mode);
|
||||
|
||||
enum {
|
||||
CEPH_CAP_OP_GRANT, /* mds->client grant */
|
||||
CEPH_CAP_OP_REVOKE, /* mds->client revoke */
|
||||
CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
|
||||
CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
|
||||
CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
|
||||
CEPH_CAP_OP_UPDATE, /* client->mds update */
|
||||
CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
|
||||
CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
|
||||
CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
|
||||
CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
|
||||
CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
|
||||
CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
|
||||
CEPH_CAP_OP_RENEW, /* client->mds renewal request */
|
||||
};
|
||||
|
||||
extern const char *ceph_cap_op_name(int op);
|
||||
|
||||
/*
|
||||
* caps message, used for capability callbacks, acks, requests, etc.
|
||||
*/
|
||||
struct ceph_mds_caps {
|
||||
__le32 op; /* CEPH_CAP_OP_* */
|
||||
__le64 ino, realm;
|
||||
__le64 cap_id;
|
||||
__le32 seq, issue_seq;
|
||||
__le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
|
||||
__le32 migrate_seq;
|
||||
__le64 snap_follows;
|
||||
__le32 snap_trace_len;
|
||||
|
||||
/* authlock */
|
||||
__le32 uid, gid, mode;
|
||||
|
||||
/* linklock */
|
||||
__le32 nlink;
|
||||
|
||||
/* xattrlock */
|
||||
__le32 xattr_len;
|
||||
__le64 xattr_version;
|
||||
|
||||
/* filelock */
|
||||
__le64 size, max_size, truncate_size;
|
||||
__le32 truncate_seq;
|
||||
struct ceph_timespec mtime, atime, ctime;
|
||||
struct ceph_file_layout layout;
|
||||
__le32 time_warp_seq;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
/* cap release msg head */
|
||||
struct ceph_mds_cap_release {
|
||||
__le32 num; /* number of cap_items that follow */
|
||||
} __attribute__ ((packed));
|
||||
|
||||
struct ceph_mds_cap_item {
|
||||
__le64 ino;
|
||||
__le64 cap_id;
|
||||
__le32 migrate_seq, seq;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
|
||||
#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
|
||||
#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
|
||||
#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
|
||||
|
||||
extern const char *ceph_lease_op_name(int o);
|
||||
|
||||
/* lease msg header */
|
||||
struct ceph_mds_lease {
|
||||
__u8 action; /* CEPH_MDS_LEASE_* */
|
||||
__le16 mask; /* which lease */
|
||||
__le64 ino;
|
||||
__le64 first, last; /* snap range */
|
||||
__le32 seq;
|
||||
__le32 duration_ms; /* duration of renewal */
|
||||
} __attribute__ ((packed));
|
||||
/* followed by a __le32+string for dname */
|
||||
|
||||
/* client reconnect */
|
||||
struct ceph_mds_cap_reconnect {
|
||||
__le64 cap_id;
|
||||
__le32 wanted;
|
||||
__le32 issued;
|
||||
__le64 snaprealm;
|
||||
__le64 pathbase; /* base ino for our path to this ino */
|
||||
__le32 flock_len; /* size of flock state blob, if any */
|
||||
} __attribute__ ((packed));
|
||||
/* followed by flock blob */
|
||||
|
||||
struct ceph_mds_cap_reconnect_v1 {
|
||||
__le64 cap_id;
|
||||
__le32 wanted;
|
||||
__le32 issued;
|
||||
__le64 size;
|
||||
struct ceph_timespec mtime, atime;
|
||||
__le64 snaprealm;
|
||||
__le64 pathbase; /* base ino for our path to this ino */
|
||||
} __attribute__ ((packed));
|
||||
|
||||
struct ceph_mds_snaprealm_reconnect {
|
||||
__le64 ino; /* snap realm base */
|
||||
__le64 seq; /* snap seq for this snap realm */
|
||||
__le64 parent; /* parent realm */
|
||||
} __attribute__ ((packed));
|
||||
|
||||
/*
|
||||
* snaps
|
||||
*/
|
||||
enum {
|
||||
CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
|
||||
CEPH_SNAP_OP_CREATE,
|
||||
CEPH_SNAP_OP_DESTROY,
|
||||
CEPH_SNAP_OP_SPLIT,
|
||||
};
|
||||
|
||||
extern const char *ceph_snap_op_name(int o);
|
||||
|
||||
/* snap msg header */
|
||||
struct ceph_mds_snap_head {
|
||||
__le32 op; /* CEPH_SNAP_OP_* */
|
||||
__le64 split; /* ino to split off, if any */
|
||||
__le32 num_split_inos; /* # inos belonging to new child realm */
|
||||
__le32 num_split_realms; /* # child realms under new child realm */
|
||||
__le32 trace_len; /* size of snap trace blob */
|
||||
} __attribute__ ((packed));
|
||||
/* followed by split ino list, then split realms, then the trace blob */
|
||||
|
||||
/*
|
||||
* encode info about a snaprealm, as viewed by a client
|
||||
*/
|
||||
struct ceph_mds_snap_realm {
|
||||
__le64 ino; /* ino */
|
||||
__le64 created; /* snap: when created */
|
||||
__le64 parent; /* ino: parent realm */
|
||||
__le64 parent_since; /* snap: same parent since */
|
||||
__le64 seq; /* snap: version */
|
||||
__le32 num_snaps;
|
||||
__le32 num_prior_parent_snaps;
|
||||
} __attribute__ ((packed));
|
||||
/* followed by my snap list, then prior parent snap list */
|
||||
|
||||
#endif
|
13
include/linux/ceph/ceph_hash.h
Normal file
13
include/linux/ceph/ceph_hash.h
Normal file
|
@ -0,0 +1,13 @@
|
|||
#ifndef FS_CEPH_HASH_H
|
||||
#define FS_CEPH_HASH_H
|
||||
|
||||
#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
|
||||
#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
|
||||
|
||||
extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
|
||||
extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
|
||||
|
||||
extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
|
||||
extern const char *ceph_str_hash_name(int type);
|
||||
|
||||
#endif
|
33
include/linux/ceph/debugfs.h
Normal file
33
include/linux/ceph/debugfs.h
Normal file
|
@ -0,0 +1,33 @@
|
|||
#ifndef _FS_CEPH_DEBUGFS_H
|
||||
#define _FS_CEPH_DEBUGFS_H
|
||||
|
||||
#include "ceph_debug.h"
|
||||
#include "types.h"
|
||||
|
||||
/*
 * Define name##_open() and name##_fops for a single-record seq_file whose
 * show callback is 'name'.  inode->i_private is handed to single_open() as
 * the seq_file private data, so nothing is dereferenced when single_open()
 * fails and leaves file->private_data unset.
 */
#define CEPH_DEFINE_SHOW_FUNC(name)					\
static int name##_open(struct inode *inode, struct file *file)		\
{									\
	return single_open(file, name, inode->i_private);		\
}									\
									\
static const struct file_operations name##_fops = {			\
	.open		= name##_open,					\
	.read		= seq_read,					\
	.llseek		= seq_lseek,					\
	.release	= single_release,				\
};
|
||||
|
||||
/* debugfs.c */
|
||||
extern int ceph_debugfs_init(void);
|
||||
extern void ceph_debugfs_cleanup(void);
|
||||
extern int ceph_debugfs_client_init(struct ceph_client *client);
|
||||
extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
|
||||
|
||||
#endif
|
||||
|
201
include/linux/ceph/decode.h
Normal file
201
include/linux/ceph/decode.h
Normal file
|
@ -0,0 +1,201 @@
|
|||
#ifndef __CEPH_DECODE_H
|
||||
#define __CEPH_DECODE_H
|
||||
|
||||
#include <asm/unaligned.h>
|
||||
#include <linux/time.h>
|
||||
|
||||
#include "types.h"
|
||||
|
||||
/*
|
||||
* in all cases,
|
||||
* void **p pointer to position pointer
|
||||
* void *end pointer to end of buffer (last byte + 1)
|
||||
*/
|
||||
|
||||
static inline u64 ceph_decode_64(void **p)
|
||||
{
|
||||
u64 v = get_unaligned_le64(*p);
|
||||
*p += sizeof(u64);
|
||||
return v;
|
||||
}
|
||||
static inline u32 ceph_decode_32(void **p)
|
||||
{
|
||||
u32 v = get_unaligned_le32(*p);
|
||||
*p += sizeof(u32);
|
||||
return v;
|
||||
}
|
||||
static inline u16 ceph_decode_16(void **p)
|
||||
{
|
||||
u16 v = get_unaligned_le16(*p);
|
||||
*p += sizeof(u16);
|
||||
return v;
|
||||
}
|
||||
static inline u8 ceph_decode_8(void **p)
|
||||
{
|
||||
u8 v = *(u8 *)*p;
|
||||
(*p)++;
|
||||
return v;
|
||||
}
|
||||
static inline void ceph_decode_copy(void **p, void *pv, size_t n)
|
||||
{
|
||||
memcpy(pv, *p, n);
|
||||
*p += n;
|
||||
}
|
||||
|
||||
/*
|
||||
* bounds check input.
|
||||
*/
|
||||
/*
 * Jump to 'bad' unless at least n bytes remain between *p and end.
 *
 * The request is compared against the remaining byte count instead of
 * computing *p + n, which could wrap around for a very large n and let an
 * out-of-bounds request slip through.
 */
#define ceph_decode_need(p, end, n, bad)				\
	do {								\
		const char *__dn_pos = *(p);				\
		const char *__dn_end = (end);				\
		if (unlikely(__dn_end < __dn_pos ||			\
			     (n) > (size_t)(__dn_end - __dn_pos)))	\
			goto bad;					\
	} while (0)
|
||||
|
||||
#define ceph_decode_64_safe(p, end, v, bad) \
|
||||
do { \
|
||||
ceph_decode_need(p, end, sizeof(u64), bad); \
|
||||
v = ceph_decode_64(p); \
|
||||
} while (0)
|
||||
#define ceph_decode_32_safe(p, end, v, bad) \
|
||||
do { \
|
||||
ceph_decode_need(p, end, sizeof(u32), bad); \
|
||||
v = ceph_decode_32(p); \
|
||||
} while (0)
|
||||
#define ceph_decode_16_safe(p, end, v, bad) \
|
||||
do { \
|
||||
ceph_decode_need(p, end, sizeof(u16), bad); \
|
||||
v = ceph_decode_16(p); \
|
||||
} while (0)
|
||||
#define ceph_decode_8_safe(p, end, v, bad) \
|
||||
do { \
|
||||
ceph_decode_need(p, end, sizeof(u8), bad); \
|
||||
v = ceph_decode_8(p); \
|
||||
} while (0)
|
||||
|
||||
#define ceph_decode_copy_safe(p, end, pv, n, bad) \
|
||||
do { \
|
||||
ceph_decode_need(p, end, n, bad); \
|
||||
ceph_decode_copy(p, pv, n); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
* struct ceph_timespec <-> struct timespec
|
||||
*/
|
||||
static inline void ceph_decode_timespec(struct timespec *ts,
|
||||
const struct ceph_timespec *tv)
|
||||
{
|
||||
ts->tv_sec = le32_to_cpu(tv->tv_sec);
|
||||
ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
|
||||
}
|
||||
static inline void ceph_encode_timespec(struct ceph_timespec *tv,
|
||||
const struct timespec *ts)
|
||||
{
|
||||
tv->tv_sec = cpu_to_le32(ts->tv_sec);
|
||||
tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
|
||||
}
|
||||
|
||||
/*
|
||||
* sockaddr_storage <-> ceph_sockaddr
|
||||
*/
|
||||
static inline void ceph_encode_addr(struct ceph_entity_addr *a)
|
||||
{
|
||||
__be16 ss_family = htons(a->in_addr.ss_family);
|
||||
a->in_addr.ss_family = *(__u16 *)&ss_family;
|
||||
}
|
||||
static inline void ceph_decode_addr(struct ceph_entity_addr *a)
|
||||
{
|
||||
__be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
|
||||
a->in_addr.ss_family = ntohs(ss_family);
|
||||
WARN_ON(a->in_addr.ss_family == 512);
|
||||
}
|
||||
|
||||
/*
|
||||
* encoders
|
||||
*/
|
||||
static inline void ceph_encode_64(void **p, u64 v)
|
||||
{
|
||||
put_unaligned_le64(v, (__le64 *)*p);
|
||||
*p += sizeof(u64);
|
||||
}
|
||||
static inline void ceph_encode_32(void **p, u32 v)
|
||||
{
|
||||
put_unaligned_le32(v, (__le32 *)*p);
|
||||
*p += sizeof(u32);
|
||||
}
|
||||
static inline void ceph_encode_16(void **p, u16 v)
|
||||
{
|
||||
put_unaligned_le16(v, (__le16 *)*p);
|
||||
*p += sizeof(u16);
|
||||
}
|
||||
static inline void ceph_encode_8(void **p, u8 v)
|
||||
{
|
||||
*(u8 *)*p = v;
|
||||
(*p)++;
|
||||
}
|
||||
static inline void ceph_encode_copy(void **p, const void *s, int len)
|
||||
{
|
||||
memcpy(*p, s, len);
|
||||
*p += len;
|
||||
}
|
||||
|
||||
/*
|
||||
* filepath, string encoders
|
||||
*/
|
||||
static inline void ceph_encode_filepath(void **p, void *end,
|
||||
u64 ino, const char *path)
|
||||
{
|
||||
u32 len = path ? strlen(path) : 0;
|
||||
BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
|
||||
ceph_encode_8(p, 1);
|
||||
ceph_encode_64(p, ino);
|
||||
ceph_encode_32(p, len);
|
||||
if (len)
|
||||
memcpy(*p, path, len);
|
||||
*p += len;
|
||||
}
|
||||
|
||||
static inline void ceph_encode_string(void **p, void *end,
|
||||
const char *s, u32 len)
|
||||
{
|
||||
BUG_ON(*p + sizeof(len) + len > end);
|
||||
ceph_encode_32(p, len);
|
||||
if (len)
|
||||
memcpy(*p, s, len);
|
||||
*p += len;
|
||||
}
|
||||
|
||||
/*
 * Jump to 'bad' unless at least n bytes of room remain between *p and end.
 *
 * The request is compared against the remaining byte count instead of
 * computing *p + n, which could wrap around for a very large n and let an
 * out-of-bounds request slip through.
 */
#define ceph_encode_need(p, end, n, bad)				\
	do {								\
		const char *__en_pos = *(p);				\
		const char *__en_end = (end);				\
		if (unlikely(__en_end < __en_pos ||			\
			     (n) > (size_t)(__en_end - __en_pos)))	\
			goto bad;					\
	} while (0)
|
||||
|
||||
#define ceph_encode_64_safe(p, end, v, bad) \
|
||||
do { \
|
||||
ceph_encode_need(p, end, sizeof(u64), bad); \
|
||||
ceph_encode_64(p, v); \
|
||||
} while (0)
|
||||
#define ceph_encode_32_safe(p, end, v, bad) \
|
||||
do { \
|
||||
ceph_encode_need(p, end, sizeof(u32), bad); \
|
||||
ceph_encode_32(p, v); \
|
||||
} while (0)
|
||||
#define ceph_encode_16_safe(p, end, v, bad) \
|
||||
do { \
|
||||
ceph_encode_need(p, end, sizeof(u16), bad); \
|
||||
ceph_encode_16(p, v); \
|
||||
} while (0)
|
||||
|
||||
#define ceph_encode_copy_safe(p, end, pv, n, bad) \
|
||||
do { \
|
||||
ceph_encode_need(p, end, n, bad); \
|
||||
ceph_encode_copy(p, pv, n); \
|
||||
} while (0)
|
||||
/*
 * Bounds-checked ceph_encode_string(): jump to 'bad' unless the buffer has
 * room for both the 32-bit length prefix and the n string bytes.  Checking
 * only n would let a short buffer reach the BUG_ON in ceph_encode_string().
 */
#define ceph_encode_string_safe(p, end, s, n, bad)			\
	do {								\
		ceph_encode_need(p, end, sizeof(u32) + (n), bad);	\
		ceph_encode_string(p, end, s, n);			\
	} while (0)
|
||||
|
||||
|
||||
#endif
|
249
include/linux/ceph/libceph.h
Normal file
249
include/linux/ceph/libceph.h
Normal file
|
@ -0,0 +1,249 @@
|
|||
#ifndef _FS_CEPH_LIBCEPH_H
|
||||
#define _FS_CEPH_LIBCEPH_H
|
||||
|
||||
#include "ceph_debug.h"
|
||||
|
||||
#include <asm/unaligned.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/completion.h>
|
||||
#include <linux/exportfs.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/mempool.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/wait.h>
|
||||
#include <linux/writeback.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#include "types.h"
|
||||
#include "messenger.h"
|
||||
#include "msgpool.h"
|
||||
#include "mon_client.h"
|
||||
#include "osd_client.h"
|
||||
#include "ceph_fs.h"
|
||||
|
||||
/*
|
||||
* Supported features
|
||||
*/
|
||||
#define CEPH_FEATURE_SUPPORTED_DEFAULT CEPH_FEATURE_NOSRCADDR
|
||||
#define CEPH_FEATURE_REQUIRED_DEFAULT CEPH_FEATURE_NOSRCADDR
|
||||
|
||||
/*
|
||||
* mount options
|
||||
*/
|
||||
#define CEPH_OPT_FSID (1<<0)
|
||||
#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
|
||||
#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
|
||||
#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */
|
||||
|
||||
/* default option flags: none set (no trailing ';' so it is usable in expressions) */
#define CEPH_OPT_DEFAULT (0)
|
||||
|
||||
/*
 * Set the CEPH_OPT_<opt> flag in the client's options.  Expands to a single
 * parenthesized expression with no trailing semicolon, so it can be used as
 * the body of an if/else without breaking the statement.
 */
#define ceph_set_opt(client, opt) \
	((client)->options->flags |= CEPH_OPT_##opt)
|
||||
#define ceph_test_opt(client, opt) \
|
||||
(!!((client)->options->flags & CEPH_OPT_##opt))
|
||||
|
||||
struct ceph_options {
|
||||
int flags;
|
||||
struct ceph_fsid fsid;
|
||||
struct ceph_entity_addr my_addr;
|
||||
int mount_timeout;
|
||||
int osd_idle_ttl;
|
||||
int osd_timeout;
|
||||
int osd_keepalive_timeout;
|
||||
|
||||
/*
|
||||
* any type that can't be simply compared or doesn't need
|
||||
* to be compared should go beyond this point,
|
||||
* ceph_compare_options() should be updated accordingly
|
||||
*/
|
||||
|
||||
struct ceph_entity_addr *mon_addr; /* should be the first
|
||||
pointer type of args */
|
||||
int num_mon;
|
||||
char *name;
|
||||
char *secret;
|
||||
};
|
||||
|
||||
/*
|
||||
* defaults
|
||||
*/
|
||||
#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
|
||||
#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
|
||||
#define CEPH_OSD_KEEPALIVE_DEFAULT 5
|
||||
#define CEPH_OSD_IDLE_TTL_DEFAULT 60
|
||||
#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
|
||||
|
||||
#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
|
||||
#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
|
||||
|
||||
#define CEPH_AUTH_NAME_DEFAULT "guest"
|
||||
|
||||
/*
|
||||
* Delay telling the MDS we no longer want caps, in case we reopen
|
||||
* the file. Delay a minimum amount of time, even if we send a cap
|
||||
* message for some other reason. Otherwise, take the opportunity to
|
||||
* update the mds to avoid sending another message later.
|
||||
*/
|
||||
#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
|
||||
#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
|
||||
|
||||
#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
|
||||
|
||||
/* mount state */
|
||||
enum {
|
||||
CEPH_MOUNT_MOUNTING,
|
||||
CEPH_MOUNT_MOUNTED,
|
||||
CEPH_MOUNT_UNMOUNTING,
|
||||
CEPH_MOUNT_UNMOUNTED,
|
||||
CEPH_MOUNT_SHUTDOWN,
|
||||
};
|
||||
|
||||
/*
|
||||
* subtract jiffies
|
||||
*/
|
||||
static inline unsigned long time_sub(unsigned long a, unsigned long b)
|
||||
{
|
||||
BUG_ON(time_after(b, a));
|
||||
return (long)a - (long)b;
|
||||
}
|
||||
|
||||
struct ceph_mds_client;
|
||||
|
||||
/*
|
||||
* per client state
|
||||
*
|
||||
* possibly shared by multiple mount points, if they are
|
||||
* mounting the same ceph filesystem/cluster.
|
||||
*/
|
||||
struct ceph_client {
|
||||
struct ceph_fsid fsid;
|
||||
bool have_fsid;
|
||||
|
||||
void *private;
|
||||
|
||||
struct ceph_options *options;
|
||||
|
||||
struct mutex mount_mutex; /* serialize mount attempts */
|
||||
wait_queue_head_t auth_wq;
|
||||
int auth_err;
|
||||
|
||||
int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *);
|
||||
|
||||
u32 supported_features;
|
||||
u32 required_features;
|
||||
|
||||
struct ceph_messenger *msgr; /* messenger instance */
|
||||
struct ceph_mon_client monc;
|
||||
struct ceph_osd_client osdc;
|
||||
|
||||
#ifdef CONFIG_DEBUG_FS
|
||||
struct dentry *debugfs_dir;
|
||||
struct dentry *debugfs_monmap;
|
||||
struct dentry *debugfs_osdmap;
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* snapshots
|
||||
*/
|
||||
|
||||
/*
|
||||
* A "snap context" is the set of existing snapshots when we
|
||||
* write data. It is used by the OSD to guide its COW behavior.
|
||||
*
|
||||
* The ceph_snap_context is refcounted, and attached to each dirty
|
||||
* page, indicating which context the dirty data belonged when it was
|
||||
* dirtied.
|
||||
*/
|
||||
struct ceph_snap_context {
|
||||
atomic_t nref;
|
||||
u64 seq;
|
||||
int num_snaps;
|
||||
u64 snaps[];
|
||||
};
|
||||
|
||||
static inline struct ceph_snap_context *
|
||||
ceph_get_snap_context(struct ceph_snap_context *sc)
|
||||
{
|
||||
/*
|
||||
printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
|
||||
atomic_read(&sc->nref)+1);
|
||||
*/
|
||||
if (sc)
|
||||
atomic_inc(&sc->nref);
|
||||
return sc;
|
||||
}
|
||||
|
||||
static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
|
||||
{
|
||||
if (!sc)
|
||||
return;
|
||||
/*
|
||||
printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
|
||||
atomic_read(&sc->nref)-1);
|
||||
*/
|
||||
if (atomic_dec_and_test(&sc->nref)) {
|
||||
/*printk(" deleting snap_context %p\n", sc);*/
|
||||
kfree(sc);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* calculate the number of pages a given length and offset map onto,
|
||||
* if we align the data.
|
||||
*/
|
||||
static inline int calc_pages_for(u64 off, u64 len)
|
||||
{
|
||||
return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
|
||||
(off >> PAGE_CACHE_SHIFT);
|
||||
}
|
||||
|
||||
/* ceph_common.c */
|
||||
extern const char *ceph_msg_type_name(int type);
|
||||
extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
|
||||
extern struct kmem_cache *ceph_inode_cachep;
|
||||
extern struct kmem_cache *ceph_cap_cachep;
|
||||
extern struct kmem_cache *ceph_dentry_cachep;
|
||||
extern struct kmem_cache *ceph_file_cachep;
|
||||
|
||||
extern int ceph_parse_options(struct ceph_options **popt, char *options,
|
||||
const char *dev_name, const char *dev_name_end,
|
||||
int (*parse_extra_token)(char *c, void *private),
|
||||
void *private);
|
||||
extern void ceph_destroy_options(struct ceph_options *opt);
|
||||
extern int ceph_compare_options(struct ceph_options *new_opt,
|
||||
struct ceph_client *client);
|
||||
extern struct ceph_client *ceph_create_client(struct ceph_options *opt,
|
||||
void *private);
|
||||
extern u64 ceph_client_id(struct ceph_client *client);
|
||||
extern void ceph_destroy_client(struct ceph_client *client);
|
||||
extern int __ceph_open_session(struct ceph_client *client,
|
||||
unsigned long started);
|
||||
extern int ceph_open_session(struct ceph_client *client);
|
||||
|
||||
/* pagevec.c */
|
||||
extern void ceph_release_page_vector(struct page **pages, int num_pages);
|
||||
|
||||
extern struct page **ceph_get_direct_page_vector(const char __user *data,
|
||||
int num_pages,
|
||||
loff_t off, size_t len);
|
||||
extern void ceph_put_page_vector(struct page **pages, int num_pages);
|
||||
extern void ceph_release_page_vector(struct page **pages, int num_pages);
|
||||
extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
|
||||
extern int ceph_copy_user_to_page_vector(struct page **pages,
|
||||
const char __user *data,
|
||||
loff_t off, size_t len);
|
||||
extern int ceph_copy_to_page_vector(struct page **pages,
|
||||
const char *data,
|
||||
loff_t off, size_t len);
|
||||
extern int ceph_copy_from_page_vector(struct page **pages,
|
||||
char *data,
|
||||
loff_t off, size_t len);
|
||||
extern int ceph_copy_page_vector_to_user(struct page **pages, char __user *data,
|
||||
loff_t off, size_t len);
|
||||
extern void ceph_zero_page_vector_range(int off, int len, struct page **pages);
|
||||
|
||||
|
||||
#endif /* _FS_CEPH_LIBCEPH_H */
|
62
include/linux/ceph/mdsmap.h
Normal file
62
include/linux/ceph/mdsmap.h
Normal file
|
@ -0,0 +1,62 @@
|
|||
#ifndef _FS_CEPH_MDSMAP_H
|
||||
#define _FS_CEPH_MDSMAP_H
|
||||
|
||||
#include "types.h"
|
||||
|
||||
/*
|
||||
* mds map - describe servers in the mds cluster.
|
||||
*
|
||||
* we limit fields to those the client actually cares about
|
||||
*/
|
||||
struct ceph_mds_info {
|
||||
u64 global_id;
|
||||
struct ceph_entity_addr addr;
|
||||
s32 state;
|
||||
int num_export_targets;
|
||||
bool laggy;
|
||||
u32 *export_targets;
|
||||
};
|
||||
|
||||
struct ceph_mdsmap {
|
||||
u32 m_epoch, m_client_epoch, m_last_failure;
|
||||
u32 m_root;
|
||||
u32 m_session_timeout; /* seconds */
|
||||
u32 m_session_autoclose; /* seconds */
|
||||
u64 m_max_file_size;
|
||||
u32 m_max_mds; /* size of m_addr, m_state arrays */
|
||||
struct ceph_mds_info *m_info;
|
||||
|
||||
/* which object pools file data can be stored in */
|
||||
int m_num_data_pg_pools;
|
||||
u32 *m_data_pg_pools;
|
||||
u32 m_cas_pg_pool;
|
||||
};
|
||||
|
||||
static inline struct ceph_entity_addr *
|
||||
ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
|
||||
{
|
||||
if (w >= m->m_max_mds)
|
||||
return NULL;
|
||||
return &m->m_info[w].addr;
|
||||
}
|
||||
|
||||
static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
|
||||
{
|
||||
BUG_ON(w < 0);
|
||||
if (w >= m->m_max_mds)
|
||||
return CEPH_MDS_STATE_DNE;
|
||||
return m->m_info[w].state;
|
||||
}
|
||||
|
||||
static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
|
||||
{
|
||||
if (w >= 0 && w < m->m_max_mds)
|
||||
return m->m_info[w].laggy;
|
||||
return false;
|
||||
}
|
||||
|
||||
extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
|
||||
extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
|
||||
extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
|
||||
|
||||
#endif
|
261
include/linux/ceph/messenger.h
Normal file
261
include/linux/ceph/messenger.h
Normal file
|
@ -0,0 +1,261 @@
|
|||
#ifndef __FS_CEPH_MESSENGER_H
|
||||
#define __FS_CEPH_MESSENGER_H
|
||||
|
||||
#include <linux/kref.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/net.h>
|
||||
#include <linux/radix-tree.h>
|
||||
#include <linux/uio.h>
|
||||
#include <linux/version.h>
|
||||
#include <linux/workqueue.h>
|
||||
|
||||
#include "types.h"
|
||||
#include "buffer.h"
|
||||
|
||||
struct ceph_msg;
|
||||
struct ceph_connection;
|
||||
|
||||
extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */
|
||||
|
||||
/*
|
||||
* Ceph defines these callbacks for handling connection events.
|
||||
*/
|
||||
struct ceph_connection_operations {
|
||||
struct ceph_connection *(*get)(struct ceph_connection *);
|
||||
void (*put)(struct ceph_connection *);
|
||||
|
||||
/* handle an incoming message. */
|
||||
void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
|
||||
|
||||
/* authorize an outgoing connection */
|
||||
int (*get_authorizer) (struct ceph_connection *con,
|
||||
void **buf, int *len, int *proto,
|
||||
void **reply_buf, int *reply_len, int force_new);
|
||||
int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
|
||||
int (*invalidate_authorizer)(struct ceph_connection *con);
|
||||
|
||||
/* protocol version mismatch */
|
||||
void (*bad_proto) (struct ceph_connection *con);
|
||||
|
||||
/* there was some error on the socket (disconnect, whatever) */
|
||||
void (*fault) (struct ceph_connection *con);
|
||||
|
||||
/* a remote host has terminated a message exchange session, and messages
|
||||
* we sent (or they tried to send us) may be lost. */
|
||||
void (*peer_reset) (struct ceph_connection *con);
|
||||
|
||||
struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
|
||||
struct ceph_msg_header *hdr,
|
||||
int *skip);
|
||||
};
|
||||
|
||||
/* use format string %s%d */
|
||||
#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
|
||||
|
||||
struct ceph_messenger {
|
||||
struct ceph_entity_inst inst; /* my name+address */
|
||||
struct ceph_entity_addr my_enc_addr;
|
||||
struct page *zero_page; /* used in certain error cases */
|
||||
|
||||
bool nocrc;
|
||||
|
||||
/*
|
||||
* the global_seq counts connections i (attempt to) initiate
|
||||
* in order to disambiguate certain connect race conditions.
|
||||
*/
|
||||
u32 global_seq;
|
||||
spinlock_t global_seq_lock;
|
||||
|
||||
u32 supported_features;
|
||||
u32 required_features;
|
||||
};
|
||||
|
||||
/*
|
||||
* a single message. it contains a header (src, dest, message type, etc.),
|
||||
* footer (crc values, mainly), a "front" message body, and possibly a
|
||||
* data payload (stored in some number of pages).
|
||||
*/
|
||||
struct ceph_msg {
|
||||
struct ceph_msg_header hdr; /* header */
|
||||
struct ceph_msg_footer footer; /* footer */
|
||||
struct kvec front; /* unaligned blobs of message */
|
||||
struct ceph_buffer *middle;
|
||||
struct page **pages; /* data payload. NOT OWNER. */
|
||||
unsigned nr_pages; /* size of page array */
|
||||
struct ceph_pagelist *pagelist; /* instead of pages */
|
||||
struct list_head list_head;
|
||||
struct kref kref;
|
||||
struct bio *bio; /* instead of pages/pagelist */
|
||||
struct bio *bio_iter; /* bio iterator */
|
||||
int bio_seg; /* current bio segment */
|
||||
struct ceph_pagelist *trail; /* the trailing part of the data */
|
||||
bool front_is_vmalloc;
|
||||
bool more_to_follow;
|
||||
bool needs_out_seq;
|
||||
int front_max;
|
||||
|
||||
struct ceph_msgpool *pool;
|
||||
};
|
||||
|
||||
struct ceph_msg_pos {
|
||||
int page, page_pos; /* which page; offset in page */
|
||||
int data_pos; /* offset in data payload */
|
||||
int did_page_crc; /* true if we've calculated crc for current page */
|
||||
};
|
||||
|
||||
/* ceph connection fault delay defaults, for exponential backoff */
|
||||
#define BASE_DELAY_INTERVAL (HZ/2)
|
||||
#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
|
||||
|
||||
/*
|
||||
* ceph_connection state bit flags
|
||||
*
|
||||
* QUEUED and BUSY are used together to ensure that only a single
|
||||
* thread is currently opening, reading or writing data to the socket.
|
||||
*/
|
||||
#define LOSSYTX 0 /* we can close channel or drop messages on errors */
|
||||
#define CONNECTING 1
|
||||
#define NEGOTIATING 2
|
||||
#define KEEPALIVE_PENDING 3
|
||||
#define WRITE_PENDING 4 /* we have data ready to send */
|
||||
#define QUEUED 5 /* there is work queued on this connection */
|
||||
#define BUSY 6 /* work is being done */
|
||||
#define STANDBY 8 /* no outgoing messages, socket closed. we keep
|
||||
* the ceph_connection around to maintain shared
|
||||
* state with the peer. */
|
||||
#define CLOSED 10 /* we've closed the connection */
|
||||
#define SOCK_CLOSED 11 /* socket state changed to closed */
|
||||
#define OPENING 13 /* open connection w/ (possibly new) peer */
|
||||
#define DEAD 14 /* dead, about to kfree */
|
||||
|
||||
/*
|
||||
* A single connection with another host.
|
||||
*
|
||||
* We maintain a queue of outgoing messages, and some session state to
|
||||
* ensure that we can preserve the lossless, ordered delivery of
|
||||
* messages in the case of a TCP disconnect.
|
||||
*/
|
||||
struct ceph_connection {
|
||||
void *private;
|
||||
atomic_t nref;
|
||||
|
||||
const struct ceph_connection_operations *ops;
|
||||
|
||||
struct ceph_messenger *msgr;
|
||||
struct socket *sock;
|
||||
unsigned long state; /* connection state (see flags above) */
|
||||
const char *error_msg; /* error message, if any */
|
||||
|
||||
struct ceph_entity_addr peer_addr; /* peer address */
|
||||
struct ceph_entity_name peer_name; /* peer name */
|
||||
struct ceph_entity_addr peer_addr_for_me;
|
||||
unsigned peer_features;
|
||||
u32 connect_seq; /* identify the most recent connection
|
||||
attempt for this connection, client */
|
||||
u32 peer_global_seq; /* peer's global seq for this connection */
|
||||
|
||||
int auth_retry; /* true if we need a newer authorizer */
|
||||
void *auth_reply_buf; /* where to put the authorizer reply */
|
||||
int auth_reply_buf_len;
|
||||
|
||||
struct mutex mutex;
|
||||
|
||||
/* out queue */
|
||||
struct list_head out_queue;
|
||||
struct list_head out_sent; /* sending or sent but unacked */
|
||||
u64 out_seq; /* last message queued for send */
|
||||
bool out_keepalive_pending;
|
||||
|
||||
u64 in_seq, in_seq_acked; /* last message received, acked */
|
||||
|
||||
/* connection negotiation temps */
|
||||
char in_banner[CEPH_BANNER_MAX_LEN];
|
||||
union {
|
||||
struct { /* outgoing connection */
|
||||
struct ceph_msg_connect out_connect;
|
||||
struct ceph_msg_connect_reply in_reply;
|
||||
};
|
||||
struct { /* incoming */
|
||||
struct ceph_msg_connect in_connect;
|
||||
struct ceph_msg_connect_reply out_reply;
|
||||
};
|
||||
};
|
||||
struct ceph_entity_addr actual_peer_addr;
|
||||
|
||||
/* message out temps */
|
||||
struct ceph_msg *out_msg; /* sending message (== tail of
|
||||
out_sent) */
|
||||
bool out_msg_done;
|
||||
struct ceph_msg_pos out_msg_pos;
|
||||
|
||||
struct kvec out_kvec[8], /* sending header/footer data */
|
||||
*out_kvec_cur;
|
||||
int out_kvec_left; /* kvec's left in out_kvec */
|
||||
int out_skip; /* skip this many bytes */
|
||||
int out_kvec_bytes; /* total bytes left */
|
||||
bool out_kvec_is_msg; /* kvec refers to out_msg */
|
||||
int out_more; /* there is more data after the kvecs */
|
||||
__le64 out_temp_ack; /* for writing an ack */
|
||||
|
||||
/* message in temps */
|
||||
struct ceph_msg_header in_hdr;
|
||||
struct ceph_msg *in_msg;
|
||||
struct ceph_msg_pos in_msg_pos;
|
||||
u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
|
||||
|
||||
char in_tag; /* protocol control byte */
|
||||
int in_base_pos; /* bytes read */
|
||||
__le64 in_temp_ack; /* for reading an ack */
|
||||
|
||||
struct delayed_work work; /* send|recv work */
|
||||
unsigned long delay; /* current delay interval */
|
||||
};
|
||||
|
||||
|
||||
extern const char *ceph_pr_addr(const struct sockaddr_storage *ss);
|
||||
extern int ceph_parse_ips(const char *c, const char *end,
|
||||
struct ceph_entity_addr *addr,
|
||||
int max_count, int *count);
|
||||
|
||||
|
||||
extern int ceph_msgr_init(void);
|
||||
extern void ceph_msgr_exit(void);
|
||||
extern void ceph_msgr_flush(void);
|
||||
|
||||
extern struct ceph_messenger *ceph_messenger_create(
|
||||
struct ceph_entity_addr *myaddr,
|
||||
u32 features, u32 required);
|
||||
extern void ceph_messenger_destroy(struct ceph_messenger *);
|
||||
|
||||
extern void ceph_con_init(struct ceph_messenger *msgr,
|
||||
struct ceph_connection *con);
|
||||
extern void ceph_con_open(struct ceph_connection *con,
|
||||
struct ceph_entity_addr *addr);
|
||||
extern bool ceph_con_opened(struct ceph_connection *con);
|
||||
extern void ceph_con_close(struct ceph_connection *con);
|
||||
extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
|
||||
extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
|
||||
extern void ceph_con_revoke_message(struct ceph_connection *con,
|
||||
struct ceph_msg *msg);
|
||||
extern void ceph_con_keepalive(struct ceph_connection *con);
|
||||
extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
|
||||
extern void ceph_con_put(struct ceph_connection *con);
|
||||
|
||||
extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
|
||||
extern void ceph_msg_kfree(struct ceph_msg *m);
|
||||
|
||||
|
||||
static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
|
||||
{
|
||||
kref_get(&msg->kref);
|
||||
return msg;
|
||||
}
|
||||
extern void ceph_msg_last_put(struct kref *kref);
|
||||
static inline void ceph_msg_put(struct ceph_msg *msg)
|
||||
{
|
||||
kref_put(&msg->kref, ceph_msg_last_put);
|
||||
}
|
||||
|
||||
extern void ceph_msg_dump(struct ceph_msg *msg);
|
||||
|
||||
#endif
|
122
include/linux/ceph/mon_client.h
Normal file
122
include/linux/ceph/mon_client.h
Normal file
|
@ -0,0 +1,122 @@
|
|||
#ifndef _FS_CEPH_MON_CLIENT_H
|
||||
#define _FS_CEPH_MON_CLIENT_H
|
||||
|
||||
#include <linux/completion.h>
|
||||
#include <linux/kref.h>
|
||||
#include <linux/rbtree.h>
|
||||
|
||||
#include "messenger.h"
|
||||
|
||||
struct ceph_client;
|
||||
struct ceph_mount_args;
|
||||
struct ceph_auth_client;
|
||||
|
||||
/*
|
||||
* The monitor map enumerates the set of all monitors.
|
||||
*/
|
||||
struct ceph_monmap {
|
||||
struct ceph_fsid fsid;
|
||||
u32 epoch;
|
||||
u32 num_mon;
|
||||
struct ceph_entity_inst mon_inst[0];
|
||||
};
|
||||
|
||||
struct ceph_mon_client;
|
||||
struct ceph_mon_generic_request;
|
||||
|
||||
|
||||
/*
|
||||
* Generic mechanism for resending monitor requests.
|
||||
*/
|
||||
typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
|
||||
int newmon);
|
||||
|
||||
/* a pending monitor request */
|
||||
struct ceph_mon_request {
|
||||
struct ceph_mon_client *monc;
|
||||
struct delayed_work delayed_work;
|
||||
unsigned long delay;
|
||||
ceph_monc_request_func_t do_request;
|
||||
};
|
||||
|
||||
/*
|
||||
* ceph_mon_generic_request is being used for the statfs and poolop requests
|
||||
* which are being done a bit differently because we need to get data back
|
||||
* to the caller
|
||||
*/
|
||||
struct ceph_mon_generic_request {
|
||||
struct kref kref;
|
||||
u64 tid;
|
||||
struct rb_node node;
|
||||
int result;
|
||||
void *buf;
|
||||
int buf_len;
|
||||
struct completion completion;
|
||||
struct ceph_msg *request; /* original request */
|
||||
struct ceph_msg *reply; /* and reply */
|
||||
};
|
||||
|
||||
struct ceph_mon_client {
|
||||
struct ceph_client *client;
|
||||
struct ceph_monmap *monmap;
|
||||
|
||||
struct mutex mutex;
|
||||
struct delayed_work delayed_work;
|
||||
|
||||
struct ceph_auth_client *auth;
|
||||
struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
|
||||
int pending_auth;
|
||||
|
||||
bool hunting;
|
||||
int cur_mon; /* last monitor i contacted */
|
||||
unsigned long sub_sent, sub_renew_after;
|
||||
struct ceph_connection *con;
|
||||
bool have_fsid;
|
||||
|
||||
/* pending generic requests */
|
||||
struct rb_root generic_request_tree;
|
||||
int num_generic_requests;
|
||||
u64 last_tid;
|
||||
|
||||
/* mds/osd map */
|
||||
int want_mdsmap;
|
||||
int want_next_osdmap; /* 1 = want, 2 = want+asked */
|
||||
u32 have_osdmap, have_mdsmap;
|
||||
|
||||
#ifdef CONFIG_DEBUG_FS
|
||||
struct dentry *debugfs_file;
|
||||
#endif
|
||||
};
|
||||
|
||||
extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
|
||||
extern int ceph_monmap_contains(struct ceph_monmap *m,
|
||||
struct ceph_entity_addr *addr);
|
||||
|
||||
extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
|
||||
extern void ceph_monc_stop(struct ceph_mon_client *monc);
|
||||
|
||||
/*
|
||||
* The model here is to indicate that we need a new map of at least
|
||||
* epoch @want, and also call in when we receive a map. We will
|
||||
* periodically rerequest the map from the monitor cluster until we
|
||||
* get what we want.
|
||||
*/
|
||||
extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
|
||||
extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
|
||||
|
||||
extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
|
||||
|
||||
extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
|
||||
struct ceph_statfs *buf);
|
||||
|
||||
extern int ceph_monc_open_session(struct ceph_mon_client *monc);
|
||||
|
||||
extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
|
||||
|
||||
extern int ceph_monc_create_snapid(struct ceph_mon_client *monc,
|
||||
u32 pool, u64 *snapid);
|
||||
|
||||
extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
|
||||
u32 pool, u64 snapid);
|
||||
|
||||
#endif
|
25
include/linux/ceph/msgpool.h
Normal file
25
include/linux/ceph/msgpool.h
Normal file
|
@ -0,0 +1,25 @@
|
|||
#ifndef _FS_CEPH_MSGPOOL
|
||||
#define _FS_CEPH_MSGPOOL
|
||||
|
||||
#include <linux/mempool.h>
|
||||
#include "messenger.h"
|
||||
|
||||
/*
|
||||
* we use memory pools for preallocating messages we may receive, to
|
||||
* avoid unexpected OOM conditions.
|
||||
*/
|
||||
struct ceph_msgpool {
|
||||
const char *name;
|
||||
mempool_t *pool;
|
||||
int front_len; /* preallocated payload size */
|
||||
};
|
||||
|
||||
extern int ceph_msgpool_init(struct ceph_msgpool *pool,
|
||||
int front_len, int size, bool blocking,
|
||||
const char *name);
|
||||
extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
|
||||
extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
|
||||
int front_len);
|
||||
extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
|
||||
|
||||
#endif
|
175
include/linux/ceph/msgr.h
Normal file
175
include/linux/ceph/msgr.h
Normal file
|
@ -0,0 +1,175 @@
|
|||
#ifndef CEPH_MSGR_H
|
||||
#define CEPH_MSGR_H
|
||||
|
||||
/*
|
||||
* Data types for message passing layer used by Ceph.
|
||||
*/
|
||||
|
||||
#define CEPH_MON_PORT 6789 /* default monitor port */
|
||||
|
||||
/*
|
||||
* client-side processes will try to bind to ports in this
|
||||
* range, simply for the benefit of tools like nmap or wireshark
|
||||
* that would like to identify the protocol.
|
||||
*/
|
||||
#define CEPH_PORT_FIRST 6789
|
||||
#define CEPH_PORT_START 6800 /* non-monitors start here */
|
||||
#define CEPH_PORT_LAST 6900
|
||||
|
||||
/*
|
||||
* tcp connection banner. include a protocol version, and adjust it
|
||||
* whenever the wire protocol changes. try to keep this string length
|
||||
* constant.
|
||||
*/
|
||||
#define CEPH_BANNER "ceph v027"
|
||||
#define CEPH_BANNER_MAX_LEN 30
|
||||
|
||||
|
||||
/*
|
||||
* Rollover-safe type and comparator for 32-bit sequence numbers.
|
||||
* Comparator returns a negative, zero, or positive value (not strictly -1, 0, or 1).
|
||||
*/
|
||||
typedef __u32 ceph_seq_t;
|
||||
|
||||
/*
 * Compare two 32-bit sequence numbers, tolerating wraparound.
 *
 * The subtraction is done on the unsigned operands, where wraparound is
 * well defined, and only the result is reinterpreted as signed.  Casting
 * each operand to signed first and then subtracting can overflow signed
 * arithmetic (e.g. a = 0x80000000, b = 1), which is undefined behavior.
 *
 * Returns a negative value if a precedes b, 0 if they are equal, and a
 * positive value if a follows b; the magnitude is the wrapped distance.
 */
static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
{
	return (__s32)(a - b);
}
|
||||
|
||||
|
||||
/*
|
||||
* entity_name -- logical name for a process participating in the
|
||||
* network, e.g. 'mds0' or 'osd3'.
|
||||
*/
|
||||
struct ceph_entity_name {
|
||||
__u8 type; /* CEPH_ENTITY_TYPE_* */
|
||||
__le64 num;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
#define CEPH_ENTITY_TYPE_MON 0x01
|
||||
#define CEPH_ENTITY_TYPE_MDS 0x02
|
||||
#define CEPH_ENTITY_TYPE_OSD 0x04
|
||||
#define CEPH_ENTITY_TYPE_CLIENT 0x08
|
||||
#define CEPH_ENTITY_TYPE_AUTH 0x20
|
||||
|
||||
#define CEPH_ENTITY_TYPE_ANY 0xFF
|
||||
|
||||
extern const char *ceph_entity_type_name(int type);
|
||||
|
||||
/*
|
||||
* entity_addr -- network address
|
||||
*/
|
||||
struct ceph_entity_addr {
|
||||
__le32 type;
|
||||
__le32 nonce; /* unique id for process (e.g. pid) */
|
||||
struct sockaddr_storage in_addr;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
struct ceph_entity_inst {
|
||||
struct ceph_entity_name name;
|
||||
struct ceph_entity_addr addr;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
|
||||
/* used by message exchange protocol */
|
||||
#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
|
||||
#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
|
||||
#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
|
||||
incoming connection */
|
||||
#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
|
||||
with higher cseq */
|
||||
#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
|
||||
with higher gseq */
|
||||
#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
|
||||
#define CEPH_MSGR_TAG_MSG 7 /* message */
|
||||
#define CEPH_MSGR_TAG_ACK 8 /* message ack */
|
||||
#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
|
||||
#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
|
||||
#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
|
||||
#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
|
||||
|
||||
|
||||
/*
|
||||
* connection negotiation
|
||||
*/
|
||||
struct ceph_msg_connect {
|
||||
__le64 features; /* supported feature bits */
|
||||
__le32 host_type; /* CEPH_ENTITY_TYPE_* */
|
||||
__le32 global_seq; /* count connections initiated by this host */
|
||||
__le32 connect_seq; /* count connections initiated in this session */
|
||||
__le32 protocol_version;
|
||||
__le32 authorizer_protocol;
|
||||
__le32 authorizer_len;
|
||||
__u8 flags; /* CEPH_MSG_CONNECT_* */
|
||||
} __attribute__ ((packed));
|
||||
|
||||
struct ceph_msg_connect_reply {
|
||||
__u8 tag;
|
||||
__le64 features; /* feature bits for this session */
|
||||
__le32 global_seq;
|
||||
__le32 connect_seq;
|
||||
__le32 protocol_version;
|
||||
__le32 authorizer_len;
|
||||
__u8 flags;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
|
||||
|
||||
|
||||
/*
|
||||
* message header
|
||||
*/
|
||||
struct ceph_msg_header_old {
|
||||
__le64 seq; /* message seq# for this session */
|
||||
__le64 tid; /* transaction id */
|
||||
__le16 type; /* message type */
|
||||
__le16 priority; /* priority. higher value == higher priority */
|
||||
__le16 version; /* version of message encoding */
|
||||
|
||||
__le32 front_len; /* bytes in main payload */
|
||||
__le32 middle_len;/* bytes in middle payload */
|
||||
__le32 data_len; /* bytes of data payload */
|
||||
__le16 data_off; /* sender: include full offset;
|
||||
receiver: mask against ~PAGE_MASK */
|
||||
|
||||
struct ceph_entity_inst src, orig_src;
|
||||
__le32 reserved;
|
||||
__le32 crc; /* header crc32c */
|
||||
} __attribute__ ((packed));
|
||||
|
||||
struct ceph_msg_header {
|
||||
__le64 seq; /* message seq# for this session */
|
||||
__le64 tid; /* transaction id */
|
||||
__le16 type; /* message type */
|
||||
__le16 priority; /* priority. higher value == higher priority */
|
||||
__le16 version; /* version of message encoding */
|
||||
|
||||
__le32 front_len; /* bytes in main payload */
|
||||
__le32 middle_len;/* bytes in middle payload */
|
||||
__le32 data_len; /* bytes of data payload */
|
||||
__le16 data_off; /* sender: include full offset;
|
||||
receiver: mask against ~PAGE_MASK */
|
||||
|
||||
struct ceph_entity_name src;
|
||||
__le32 reserved;
|
||||
__le32 crc; /* header crc32c */
|
||||
} __attribute__ ((packed));
|
||||
|
||||
#define CEPH_MSG_PRIO_LOW 64
|
||||
#define CEPH_MSG_PRIO_DEFAULT 127
|
||||
#define CEPH_MSG_PRIO_HIGH 196
|
||||
#define CEPH_MSG_PRIO_HIGHEST 255
|
||||
|
||||
/*
|
||||
* follows data payload
|
||||
*/
|
||||
struct ceph_msg_footer {
|
||||
__le32 front_crc, middle_crc, data_crc;
|
||||
__u8 flags;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
|
||||
#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
|
||||
|
||||
|
||||
#endif
|
234
include/linux/ceph/osd_client.h
Normal file
234
include/linux/ceph/osd_client.h
Normal file
|
@ -0,0 +1,234 @@
|
|||
#ifndef _FS_CEPH_OSD_CLIENT_H
|
||||
#define _FS_CEPH_OSD_CLIENT_H
|
||||
|
||||
#include <linux/completion.h>
|
||||
#include <linux/kref.h>
|
||||
#include <linux/mempool.h>
|
||||
#include <linux/rbtree.h>
|
||||
|
||||
#include "types.h"
|
||||
#include "osdmap.h"
|
||||
#include "messenger.h"
|
||||
|
||||
struct ceph_msg;
|
||||
struct ceph_snap_context;
|
||||
struct ceph_osd_request;
|
||||
struct ceph_osd_client;
|
||||
struct ceph_authorizer;
|
||||
struct ceph_pagelist;
|
||||
|
||||
/*
|
||||
* completion callback for async writepages
|
||||
*/
|
||||
typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
|
||||
struct ceph_msg *);
|
||||
|
||||
/* a given osd we're communicating with */
|
||||
struct ceph_osd {
|
||||
atomic_t o_ref;
|
||||
struct ceph_osd_client *o_osdc;
|
||||
int o_osd;
|
||||
int o_incarnation;
|
||||
struct rb_node o_node;
|
||||
struct ceph_connection o_con;
|
||||
struct list_head o_requests;
|
||||
struct list_head o_osd_lru;
|
||||
struct ceph_authorizer *o_authorizer;
|
||||
void *o_authorizer_buf, *o_authorizer_reply_buf;
|
||||
size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
|
||||
unsigned long lru_ttl;
|
||||
int o_marked_for_keepalive;
|
||||
struct list_head o_keepalive_item;
|
||||
};
|
||||
|
||||
/* an in-flight request */
|
||||
struct ceph_osd_request {
|
||||
u64 r_tid; /* unique for this client */
|
||||
struct rb_node r_node;
|
||||
struct list_head r_req_lru_item;
|
||||
struct list_head r_osd_item;
|
||||
struct ceph_osd *r_osd;
|
||||
struct ceph_pg r_pgid;
|
||||
int r_pg_osds[CEPH_PG_MAX_SIZE];
|
||||
int r_num_pg_osds;
|
||||
|
||||
struct ceph_connection *r_con_filling_msg;
|
||||
|
||||
struct ceph_msg *r_request, *r_reply;
|
||||
int r_result;
|
||||
int r_flags; /* any additional flags for the osd */
|
||||
u32 r_sent; /* >0 if r_request is sending/sent */
|
||||
int r_got_reply;
|
||||
|
||||
struct ceph_osd_client *r_osdc;
|
||||
struct kref r_kref;
|
||||
bool r_mempool;
|
||||
struct completion r_completion, r_safe_completion;
|
||||
ceph_osdc_callback_t r_callback, r_safe_callback;
|
||||
struct ceph_eversion r_reassert_version;
|
||||
struct list_head r_unsafe_item;
|
||||
|
||||
struct inode *r_inode; /* for use by callbacks */
|
||||
void *r_priv; /* ditto */
|
||||
|
||||
char r_oid[40]; /* object name */
|
||||
int r_oid_len;
|
||||
unsigned long r_stamp; /* send OR check time */
|
||||
bool r_resend; /* msg send failed, needs retry */
|
||||
|
||||
struct ceph_file_layout r_file_layout;
|
||||
struct ceph_snap_context *r_snapc; /* snap context for writes */
|
||||
unsigned r_num_pages; /* size of page array (follows) */
|
||||
struct page **r_pages; /* pages for data payload */
|
||||
int r_pages_from_pool;
|
||||
int r_own_pages; /* if true, i own page list */
|
||||
#ifdef CONFIG_BLOCK
|
||||
struct bio *r_bio; /* instead of pages */
|
||||
#endif
|
||||
|
||||
struct ceph_pagelist *r_trail; /* trailing part of the data */
|
||||
};
|
||||
|
||||
struct ceph_osd_client {
|
||||
struct ceph_client *client;
|
||||
|
||||
struct ceph_osdmap *osdmap; /* current map */
|
||||
struct rw_semaphore map_sem;
|
||||
struct completion map_waiters;
|
||||
u64 last_requested_map;
|
||||
|
||||
struct mutex request_mutex;
|
||||
struct rb_root osds; /* osds */
|
||||
struct list_head osd_lru; /* idle osds */
|
||||
u64 timeout_tid; /* tid of timeout triggering rq */
|
||||
u64 last_tid; /* tid of last request */
|
||||
struct rb_root requests; /* pending requests */
|
||||
struct list_head req_lru; /* pending requests lru */
|
||||
int num_requests;
|
||||
struct delayed_work timeout_work;
|
||||
struct delayed_work osds_timeout_work;
|
||||
#ifdef CONFIG_DEBUG_FS
|
||||
struct dentry *debugfs_file;
|
||||
#endif
|
||||
|
||||
mempool_t *req_mempool;
|
||||
|
||||
struct ceph_msgpool msgpool_op;
|
||||
struct ceph_msgpool msgpool_op_reply;
|
||||
};
|
||||
|
||||
struct ceph_osd_req_op {
|
||||
u16 op; /* CEPH_OSD_OP_* */
|
||||
u32 flags; /* CEPH_OSD_FLAG_* */
|
||||
union {
|
||||
struct {
|
||||
u64 offset, length;
|
||||
u64 truncate_size;
|
||||
u32 truncate_seq;
|
||||
} extent;
|
||||
struct {
|
||||
const char *name;
|
||||
u32 name_len;
|
||||
const char *val;
|
||||
u32 value_len;
|
||||
__u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
|
||||
__u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
|
||||
} xattr;
|
||||
struct {
|
||||
const char *class_name;
|
||||
__u8 class_len;
|
||||
const char *method_name;
|
||||
__u8 method_len;
|
||||
__u8 argc;
|
||||
const char *indata;
|
||||
u32 indata_len;
|
||||
} cls;
|
||||
struct {
|
||||
u64 cookie, count;
|
||||
} pgls;
|
||||
struct {
|
||||
u64 snapid;
|
||||
} snap;
|
||||
};
|
||||
u32 payload_len;
|
||||
};
|
||||
|
||||
extern int ceph_osdc_init(struct ceph_osd_client *osdc,
|
||||
struct ceph_client *client);
|
||||
extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
|
||||
|
||||
extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
|
||||
struct ceph_msg *msg);
|
||||
extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
|
||||
struct ceph_msg *msg);
|
||||
|
||||
extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
|
||||
struct ceph_file_layout *layout,
|
||||
u64 snapid,
|
||||
u64 off, u64 *plen, u64 *bno,
|
||||
struct ceph_osd_request *req,
|
||||
struct ceph_osd_req_op *op);
|
||||
|
||||
extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
|
||||
int flags,
|
||||
struct ceph_snap_context *snapc,
|
||||
struct ceph_osd_req_op *ops,
|
||||
bool use_mempool,
|
||||
gfp_t gfp_flags,
|
||||
struct page **pages,
|
||||
struct bio *bio);
|
||||
|
||||
extern void ceph_osdc_build_request(struct ceph_osd_request *req,
|
||||
u64 off, u64 *plen,
|
||||
struct ceph_osd_req_op *src_ops,
|
||||
struct ceph_snap_context *snapc,
|
||||
struct timespec *mtime,
|
||||
const char *oid,
|
||||
int oid_len);
|
||||
|
||||
extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
|
||||
struct ceph_file_layout *layout,
|
||||
struct ceph_vino vino,
|
||||
u64 offset, u64 *len, int op, int flags,
|
||||
struct ceph_snap_context *snapc,
|
||||
int do_sync, u32 truncate_seq,
|
||||
u64 truncate_size,
|
||||
struct timespec *mtime,
|
||||
bool use_mempool, int num_reply);
|
||||
|
||||
static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
|
||||
{
|
||||
kref_get(&req->r_kref);
|
||||
}
|
||||
extern void ceph_osdc_release_request(struct kref *kref);
|
||||
static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
|
||||
{
|
||||
kref_put(&req->r_kref, ceph_osdc_release_request);
|
||||
}
|
||||
|
||||
extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
|
||||
struct ceph_osd_request *req,
|
||||
bool nofail);
|
||||
extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
|
||||
struct ceph_osd_request *req);
|
||||
extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
|
||||
|
||||
extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
|
||||
struct ceph_vino vino,
|
||||
struct ceph_file_layout *layout,
|
||||
u64 off, u64 *plen,
|
||||
u32 truncate_seq, u64 truncate_size,
|
||||
struct page **pages, int nr_pages);
|
||||
|
||||
extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
|
||||
struct ceph_vino vino,
|
||||
struct ceph_file_layout *layout,
|
||||
struct ceph_snap_context *sc,
|
||||
u64 off, u64 len,
|
||||
u32 truncate_seq, u64 truncate_size,
|
||||
struct timespec *mtime,
|
||||
struct page **pages, int nr_pages,
|
||||
int flags, int do_sync, bool nofail);
|
||||
|
||||
#endif
|
||||
|
130
include/linux/ceph/osdmap.h
Normal file
130
include/linux/ceph/osdmap.h
Normal file
|
@ -0,0 +1,130 @@
|
|||
#ifndef _FS_CEPH_OSDMAP_H
|
||||
#define _FS_CEPH_OSDMAP_H
|
||||
|
||||
#include <linux/rbtree.h>
|
||||
#include "types.h"
|
||||
#include "ceph_fs.h"
|
||||
#include <linux/crush/crush.h>
|
||||
|
||||
/*
|
||||
* The osd map describes the current membership of the osd cluster and
|
||||
* specifies the mapping of objects to placement groups and placement
|
||||
* groups to (sets of) osds. That is, it completely specifies the
|
||||
* (desired) distribution of all data objects in the system at some
|
||||
* point in time.
|
||||
*
|
||||
* Each map version is identified by an epoch, which increases monotonically.
|
||||
*
|
||||
* The map can be updated either via an incremental map (diff) describing
|
||||
* the change between two successive epochs, or as a fully encoded map.
|
||||
*/
|
||||
struct ceph_pg_pool_info {
|
||||
struct rb_node node;
|
||||
int id;
|
||||
struct ceph_pg_pool v;
|
||||
int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
|
||||
char *name;
|
||||
};
|
||||
|
||||
struct ceph_pg_mapping {
|
||||
struct rb_node node;
|
||||
struct ceph_pg pgid;
|
||||
int len;
|
||||
int osds[];
|
||||
};
|
||||
|
||||
/* An osd map as held in memory; osdmap_decode() and osdmap_apply_incremental() return this type. */
struct ceph_osdmap {
|
||||
struct ceph_fsid fsid;
|
||||
u32 epoch;
|
||||
u32 mkfs_epoch;
|
||||
struct ceph_timespec created, modified;
|
||||
|
||||
u32 flags; /* CEPH_OSDMAP_* */
|
||||
|
||||
u32 max_osd; /* size of osd_state, osd_weight, osd_addr arrays */
|
||||
u8 *osd_state; /* CEPH_OSD_* */
|
||||
u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
|
||||
struct ceph_entity_addr *osd_addr;
|
||||
|
||||
struct rb_root pg_temp;
|
||||
struct rb_root pg_pools;
|
||||
u32 pool_max;
|
||||
|
||||
/* the CRUSH map specifies the mapping of placement groups to
|
||||
* the list of osds that store+replicate them. */
|
||||
struct crush_map *crush;
|
||||
};
|
||||
|
||||
/*
|
||||
* file layout helpers
|
||||
*/
|
||||
#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
|
||||
#define ceph_file_layout_stripe_count(l) \
|
||||
((__s32)le32_to_cpu((l).fl_stripe_count))
|
||||
#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
|
||||
#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
|
||||
#define ceph_file_layout_object_su(l) \
|
||||
((__s32)le32_to_cpu((l).fl_object_stripe_unit))
|
||||
#define ceph_file_layout_pg_preferred(l) \
|
||||
((__s32)le32_to_cpu((l).fl_pg_preferred))
|
||||
#define ceph_file_layout_pg_pool(l) \
|
||||
((__s32)le32_to_cpu((l).fl_pg_pool))
|
||||
|
||||
static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
|
||||
{
|
||||
return le32_to_cpu(l->fl_stripe_unit) *
|
||||
le32_to_cpu(l->fl_stripe_count);
|
||||
}
|
||||
|
||||
/* "period" == bytes before i start on a new set of objects */
|
||||
static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
|
||||
{
|
||||
return le32_to_cpu(l->fl_object_size) *
|
||||
le32_to_cpu(l->fl_stripe_count);
|
||||
}
|
||||
|
||||
|
||||
static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
|
||||
{
|
||||
return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
|
||||
}
|
||||
|
||||
static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
|
||||
{
|
||||
return map && (map->flags & flag);
|
||||
}
|
||||
|
||||
extern char *ceph_osdmap_state_str(char *str, int len, int state);
|
||||
|
||||
static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
|
||||
int osd)
|
||||
{
|
||||
if (osd >= map->max_osd)
|
||||
return NULL;
|
||||
return &map->osd_addr[osd];
|
||||
}
|
||||
|
||||
extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
|
||||
extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
|
||||
struct ceph_osdmap *map,
|
||||
struct ceph_messenger *msgr);
|
||||
extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
|
||||
|
||||
/* calculate mapping of a file extent to an object */
|
||||
extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
|
||||
u64 off, u64 *plen,
|
||||
u64 *bno, u64 *oxoff, u64 *oxlen);
|
||||
|
||||
/* calculate mapping of object to a placement group */
|
||||
extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
|
||||
const char *oid,
|
||||
struct ceph_file_layout *fl,
|
||||
struct ceph_osdmap *osdmap);
|
||||
extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
|
||||
int *acting);
|
||||
extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
|
||||
struct ceph_pg pgid);
|
||||
|
||||
extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
|
||||
|
||||
#endif
|
54
include/linux/ceph/pagelist.h
Normal file
54
include/linux/ceph/pagelist.h
Normal file
|
@ -0,0 +1,54 @@
|
|||
#ifndef __FS_CEPH_PAGELIST_H
|
||||
#define __FS_CEPH_PAGELIST_H
|
||||
|
||||
#include <linux/list.h>
|
||||
|
||||
struct ceph_pagelist {
|
||||
struct list_head head;
|
||||
void *mapped_tail;
|
||||
size_t length;
|
||||
size_t room;
|
||||
};
|
||||
|
||||
static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
|
||||
{
|
||||
INIT_LIST_HEAD(&pl->head);
|
||||
pl->mapped_tail = NULL;
|
||||
pl->length = 0;
|
||||
pl->room = 0;
|
||||
}
|
||||
extern int ceph_pagelist_release(struct ceph_pagelist *pl);
|
||||
|
||||
extern int ceph_pagelist_append(struct ceph_pagelist *pl, const void *d, size_t l);
|
||||
|
||||
/*
 * Append @v to @pl as a 64-bit little-endian value.  Returns whatever
 * ceph_pagelist_append() returns.
 */
static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
{
	const __le64 wire = cpu_to_le64(v);

	return ceph_pagelist_append(pl, &wire, sizeof(wire));
}
|
||||
/*
 * Append @v to @pl as a 32-bit little-endian value.  Returns whatever
 * ceph_pagelist_append() returns.
 */
static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
{
	const __le32 wire = cpu_to_le32(v);

	return ceph_pagelist_append(pl, &wire, sizeof(wire));
}
|
||||
/*
 * Append @v to @pl as a 16-bit little-endian value.  Returns whatever
 * ceph_pagelist_append() returns.
 */
static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
{
	const __le16 wire = cpu_to_le16(v);

	return ceph_pagelist_append(pl, &wire, sizeof(wire));
}
|
||||
/*
 * Append the single byte @v to @pl.  Returns whatever
 * ceph_pagelist_append() returns.
 */
static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
{
	return ceph_pagelist_append(pl, &v, sizeof(v));
}
|
||||
/*
 * Append a length-prefixed string to @pl: a 32-bit little-endian length
 * (written by ceph_pagelist_encode_32) followed by @len bytes taken from
 * @s.  No terminating NUL is written, and nothing beyond the prefix is
 * appended when @len is 0.
 *
 * @s is only read and is forwarded to ceph_pagelist_append(), which takes
 * a const pointer, so it is const-qualified here; callers holding
 * read-only strings need no cast, and existing callers passing a plain
 * char * are unaffected.
 *
 * Note that @len is a size_t but is narrowed to u32 for the prefix.
 *
 * Returns the nonzero result of the prefix write if that fails; otherwise
 * the result of appending the payload (0 when @len is 0).
 */
static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
					      const char *s, size_t len)
{
	int ret = ceph_pagelist_encode_32(pl, len);

	if (ret)
		return ret;
	if (len)
		return ceph_pagelist_append(pl, s, len);
	return 0;
}
|
||||
|
||||
#endif
|
405
include/linux/ceph/rados.h
Normal file
405
include/linux/ceph/rados.h
Normal file
|
@ -0,0 +1,405 @@
|
|||
#ifndef CEPH_RADOS_H
|
||||
#define CEPH_RADOS_H
|
||||
|
||||
/*
|
||||
* Data types for the Ceph distributed object storage layer RADOS
|
||||
* (Reliable Autonomic Distributed Object Store).
|
||||
*/
|
||||
|
||||
#include "msgr.h"
|
||||
|
||||
/*
|
||||
* osdmap encoding versions
|
||||
*/
|
||||
#define CEPH_OSDMAP_INC_VERSION 5
|
||||
#define CEPH_OSDMAP_INC_VERSION_EXT 5
|
||||
#define CEPH_OSDMAP_VERSION 5
|
||||
#define CEPH_OSDMAP_VERSION_EXT 5
|
||||
|
||||
/*
|
||||
* fs id
|
||||
*/
|
||||
struct ceph_fsid {
	unsigned char fsid[16];
};

/*
 * Byte-wise comparison of two fsids with memcmp() semantics: 0 when all
 * 16 bytes match, otherwise negative or positive according to the first
 * differing byte.
 */
static inline int ceph_fsid_compare(const struct ceph_fsid *a,
				    const struct ceph_fsid *b)
{
	return memcmp(a->fsid, b->fsid, sizeof(a->fsid));
}
|
||||
|
||||
/*
|
||||
* ino, object, etc.
|
||||
*/
|
||||
typedef __le64 ceph_snapid_t;
|
||||
#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
|
||||
#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
|
||||
#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
|
||||
|
||||
struct ceph_timespec {
|
||||
__le32 tv_sec;
|
||||
__le32 tv_nsec;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
|
||||
/*
|
||||
* object layout - how objects are mapped into PGs
|
||||
*/
|
||||
#define CEPH_OBJECT_LAYOUT_HASH 1
|
||||
#define CEPH_OBJECT_LAYOUT_LINEAR 2
|
||||
#define CEPH_OBJECT_LAYOUT_HASHINO 3
|
||||
|
||||
/*
|
||||
* pg layout -- how PGs are mapped onto (sets of) OSDs
|
||||
*/
|
||||
#define CEPH_PG_LAYOUT_CRUSH 0
|
||||
#define CEPH_PG_LAYOUT_HASH 1
|
||||
#define CEPH_PG_LAYOUT_LINEAR 2
|
||||
#define CEPH_PG_LAYOUT_HYBRID 3
|
||||
|
||||
#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
|
||||
|
||||
/*
|
||||
* placement group.
|
||||
* we encode this into one __le64.
|
||||
*/
|
||||
struct ceph_pg {
|
||||
__le16 preferred; /* preferred primary osd */
|
||||
__le16 ps; /* placement seed */
|
||||
__le32 pool; /* object pool */
|
||||
} __attribute__ ((packed));
|
||||
|
||||
/*
|
||||
* pg_pool is a set of pgs storing a pool of objects
|
||||
*
|
||||
* pg_num -- base number of pseudorandomly placed pgs
|
||||
*
|
||||
* pgp_num -- effective number when calculating pg placement. this
|
||||
* is used for pg_num increases. new pgs result in data being "split"
|
||||
* into new pgs. for this to proceed smoothly, new pgs are initially
|
||||
* colocated with their parents; that is, pgp_num doesn't increase
|
||||
* until the new pgs have successfully split. only _then_ are the new
|
||||
* pgs placed independently.
|
||||
*
|
||||
* lpg_num -- localized pg count (per device). replicas are randomly
|
||||
* selected.
|
||||
*
|
||||
* lpgp_num -- as above.
|
||||
*/
|
||||
#define CEPH_PG_TYPE_REP 1
|
||||
#define CEPH_PG_TYPE_RAID4 2
|
||||
#define CEPH_PG_POOL_VERSION 2
|
||||
struct ceph_pg_pool {
|
||||
__u8 type; /* CEPH_PG_TYPE_* */
|
||||
__u8 size; /* number of osds in each pg */
|
||||
__u8 crush_ruleset; /* crush placement rule */
|
||||
__u8 object_hash; /* hash mapping object name to ps */
|
||||
__le32 pg_num, pgp_num; /* number of pg's */
|
||||
__le32 lpg_num, lpgp_num; /* number of localized pg's */
|
||||
__le32 last_change; /* most recent epoch changed */
|
||||
__le64 snap_seq; /* seq for per-pool snapshot */
|
||||
__le32 snap_epoch; /* epoch of last snap */
|
||||
__le32 num_snaps;
|
||||
__le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
|
||||
__le64 auid; /* who owns the pg */
|
||||
} __attribute__ ((packed));
|
||||
|
||||
/*
|
||||
* stable_mod func is used to control number of placement groups.
|
||||
* similar to straight-up modulo, but produces a stable mapping as b
|
||||
* increases over time. b is the number of bins, and bmask is the
|
||||
* containing power of 2 minus 1.
|
||||
*
|
||||
* b <= bmask and bmask=(2**n)-1
|
||||
* e.g., b=12 -> bmask=15, b=123 -> bmask=127
|
||||
*/
|
||||
/*
 * Map @x into one of @b bins.  @x is first masked with @bmask; if that
 * lands on a valid bin (< @b) it is the answer, otherwise @x is masked
 * with the next smaller mask (@bmask >> 1) instead.
 */
static inline int ceph_stable_mod(int x, int b, int bmask)
{
	const int bin = x & bmask;

	return bin < b ? bin : (x & (bmask >> 1));
}
|
||||
|
||||
/*
|
||||
* object layout - how a given object should be stored.
|
||||
*/
|
||||
struct ceph_object_layout {
|
||||
struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
|
||||
__le32 ol_stripe_unit; /* for per-object parity, if any */
|
||||
} __attribute__ ((packed));
|
||||
|
||||
/*
|
||||
* compound epoch+version, used by storage layer to serialize mutations
|
||||
*/
|
||||
struct ceph_eversion {
|
||||
__le32 epoch;
|
||||
__le64 version;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
/*
|
||||
* osd map bits
|
||||
*/
|
||||
|
||||
/* status bits */
|
||||
#define CEPH_OSD_EXISTS 1
|
||||
#define CEPH_OSD_UP 2
|
||||
|
||||
/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
|
||||
#define CEPH_OSD_IN 0x10000
|
||||
#define CEPH_OSD_OUT 0
|
||||
|
||||
|
||||
/*
|
||||
* osd map flag bits
|
||||
*/
|
||||
#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
|
||||
#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
|
||||
#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
|
||||
#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
|
||||
#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
|
||||
|
||||
/*
|
||||
* osd ops
|
||||
*/
|
||||
#define CEPH_OSD_OP_MODE 0xf000
|
||||
#define CEPH_OSD_OP_MODE_RD 0x1000
|
||||
#define CEPH_OSD_OP_MODE_WR 0x2000
|
||||
#define CEPH_OSD_OP_MODE_RMW 0x3000
|
||||
#define CEPH_OSD_OP_MODE_SUB 0x4000
|
||||
|
||||
/*
 * mask selecting the type field (bits 8-11) of an opcode; the
 * ceph_osd_op_type_*() helpers compare the masked field for equality
 * against one of the codes below.
 */
#define CEPH_OSD_OP_TYPE 0x0f00
|
||||
#define CEPH_OSD_OP_TYPE_LOCK 0x0100
|
||||
#define CEPH_OSD_OP_TYPE_DATA 0x0200
|
||||
#define CEPH_OSD_OP_TYPE_ATTR 0x0300
|
||||
#define CEPH_OSD_OP_TYPE_EXEC 0x0400
|
||||
#define CEPH_OSD_OP_TYPE_PG 0x0500
|
||||
|
||||
/*
 * osd opcodes.  Each value is built by OR-ing a CEPH_OSD_OP_MODE_* value,
 * a CEPH_OSD_OP_TYPE_* value (absent for the subops) and a small index.
 */
enum {
|
||||
/** data **/
|
||||
/* read */
|
||||
CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
|
||||
CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
|
||||
|
||||
/* fancy read */
|
||||
CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
|
||||
|
||||
/* write */
|
||||
CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
|
||||
CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
|
||||
CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
|
||||
CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
|
||||
CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
|
||||
|
||||
/* fancy write */
|
||||
CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
|
||||
CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
|
||||
CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
|
||||
CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
|
||||
|
||||
/* TMAPUP is the only opcode in this enum that uses the RMW mode */
CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
|
||||
CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
|
||||
CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
|
||||
|
||||
CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
|
||||
CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14,
|
||||
|
||||
/** attrs **/
|
||||
/* read */
|
||||
CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
|
||||
CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
|
||||
CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
|
||||
|
||||
/* write */
|
||||
CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
|
||||
CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
|
||||
CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
|
||||
CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
|
||||
|
||||
/** subop **/
|
||||
CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
|
||||
CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
|
||||
CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
|
||||
CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
|
||||
CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
|
||||
|
||||
/** lock **/
|
||||
/* every lock op, including RDLOCK/RDUNLOCK, is encoded with MODE_WR */
CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
|
||||
CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
|
||||
CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
|
||||
CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
|
||||
CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
|
||||
CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
|
||||
|
||||
/** exec **/
|
||||
CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
|
||||
|
||||
/** pg **/
|
||||
CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
|
||||
};
|
||||
|
||||
static inline int ceph_osd_op_type_lock(int op)
|
||||
{
|
||||
return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
|
||||
}
|
||||
static inline int ceph_osd_op_type_data(int op)
|
||||
{
|
||||
return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
|
||||
}
|
||||
static inline int ceph_osd_op_type_attr(int op)
|
||||
{
|
||||
return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
|
||||
}
|
||||
static inline int ceph_osd_op_type_exec(int op)
|
||||
{
|
||||
return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
|
||||
}
|
||||
static inline int ceph_osd_op_type_pg(int op)
|
||||
{
|
||||
return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
|
||||
}
|
||||
|
||||
static inline int ceph_osd_op_mode_subop(int op)
|
||||
{
|
||||
return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
|
||||
}
|
||||
static inline int ceph_osd_op_mode_read(int op)
|
||||
{
|
||||
return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
|
||||
}
|
||||
static inline int ceph_osd_op_mode_modify(int op)
|
||||
{
|
||||
return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
|
||||
}
|
||||
|
||||
/*
|
||||
* note that the following tmap stuff is also defined in the ceph librados.h
|
||||
* any modification here needs to be updated there
|
||||
*/
|
||||
#define CEPH_OSD_TMAP_HDR 'h'
|
||||
#define CEPH_OSD_TMAP_SET 's'
|
||||
#define CEPH_OSD_TMAP_RM 'r'
|
||||
|
||||
extern const char *ceph_osd_op_name(int op);
|
||||
|
||||
|
||||
/*
|
||||
* osd op flags
|
||||
*
|
||||
* An op may be READ, WRITE, or READ|WRITE.
|
||||
*/
|
||||
/* each value is a distinct power of two, so flags can be OR-ed together */
enum {
|
||||
CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
|
||||
CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
|
||||
CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
|
||||
CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
|
||||
CEPH_OSD_FLAG_READ = 16, /* op may read */
|
||||
CEPH_OSD_FLAG_WRITE = 32, /* op may write */
|
||||
CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAPC if snapc is out of order */
|
||||
CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
|
||||
CEPH_OSD_FLAG_BALANCE_READS = 256,
|
||||
CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
|
||||
CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
|
||||
CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
|
||||
CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */
|
||||
};
|
||||
|
||||
/*
 * NOTE(review): a separate namespace from the CEPH_OSD_FLAG_* values;
 * presumably per-op flags -- confirm which field carries them.
 */
enum {
|
||||
CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
|
||||
};
|
||||
|
||||
#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
|
||||
#define EBLACKLISTED ESHUTDOWN /* blacklisted */
|
||||
|
||||
/* xattr comparison */
|
||||
/* comparison operators; values for the cmp_op field of ceph_osd_op.xattr */
enum {
|
||||
CEPH_OSD_CMPXATTR_OP_NOP = 0,
|
||||
CEPH_OSD_CMPXATTR_OP_EQ = 1,
|
||||
CEPH_OSD_CMPXATTR_OP_NE = 2,
|
||||
CEPH_OSD_CMPXATTR_OP_GT = 3,
|
||||
CEPH_OSD_CMPXATTR_OP_GTE = 4,
|
||||
CEPH_OSD_CMPXATTR_OP_LT = 5,
|
||||
CEPH_OSD_CMPXATTR_OP_LTE = 6
|
||||
};
|
||||
|
||||
/*
 * values for the cmp_mode field of ceph_osd_op.xattr
 * NOTE(review): presumably selects whether the xattr value is compared as
 * a string or as a u64 -- confirm against the OSD implementation.
 */
enum {
|
||||
CEPH_OSD_CMPXATTR_MODE_STRING = 1,
|
||||
CEPH_OSD_CMPXATTR_MODE_U64 = 2
|
||||
};
|
||||
|
||||
/*
|
||||
* an individual object operation. each may be accompanied by some data
|
||||
* payload
|
||||
*/
|
||||
/*
 * Packed, as is every nested struct, so no padding is inserted anywhere.
 * An array of these trails ceph_osd_request_head and ceph_osd_reply_head.
 */
struct ceph_osd_op {
|
||||
__le16 op; /* CEPH_OSD_OP_* */
|
||||
__le32 flags; /* CEPH_OSD_FLAG_* */
|
||||
/*
 * op-specific arguments.  The union is anonymous, so its members are
 * accessed directly on the op (e.g. op.extent, op.xattr).
 * NOTE(review): presumably extent goes with data ops, xattr with attr
 * ops, cls with exec ops and pgls with pg ops -- confirm against users.
 */
union {
|
||||
struct {
|
||||
__le64 offset, length;
|
||||
__le64 truncate_size;
|
||||
__le32 truncate_seq;
|
||||
} __attribute__ ((packed)) extent;
|
||||
struct {
|
||||
__le32 name_len;
|
||||
__le32 value_len;
|
||||
__u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
|
||||
__u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
|
||||
} __attribute__ ((packed)) xattr;
|
||||
struct {
|
||||
__u8 class_len;
|
||||
__u8 method_len;
|
||||
__u8 argc;
|
||||
__le32 indata_len;
|
||||
} __attribute__ ((packed)) cls;
|
||||
struct {
|
||||
__le64 cookie, count;
|
||||
} __attribute__ ((packed)) pgls;
|
||||
struct {
|
||||
__le64 snapid;
|
||||
} __attribute__ ((packed)) snap;
|
||||
};
|
||||
/* NOTE(review): presumably the length of this op's data payload; confirm */
__le32 payload_len;
|
||||
} __attribute__ ((packed));
|
||||
|
||||
/*
|
||||
* osd request message header. each request may include multiple
|
||||
* ceph_osd_op object operations.
|
||||
*/
|
||||
/* Packed header; ops[] is a flexible array member and must stay last. */
struct ceph_osd_request_head {
|
||||
__le32 client_inc; /* client incarnation */
|
||||
struct ceph_object_layout layout; /* pgid */
|
||||
__le32 osdmap_epoch; /* client's osdmap epoch */
|
||||
|
||||
/* NOTE(review): presumably CEPH_OSD_FLAG_* bits; confirm */
__le32 flags;
|
||||
|
||||
struct ceph_timespec mtime; /* for mutations only */
|
||||
struct ceph_eversion reassert_version; /* if we are replaying op */
|
||||
|
||||
__le32 object_len; /* length of object name */
|
||||
|
||||
__le64 snapid; /* snapid to read */
|
||||
__le64 snap_seq; /* writer's snap context */
|
||||
/* NOTE(review): presumably the count of trailing snaps (see ops[]); confirm */
__le32 num_snaps;
|
||||
|
||||
/* NOTE(review): presumably the number of entries in ops[]; confirm */
__le16 num_ops;
|
||||
struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
|
||||
} __attribute__ ((packed));
|
||||
|
||||
struct ceph_osd_reply_head {
|
||||
__le32 client_inc; /* client incarnation */
|
||||
__le32 flags;
|
||||
struct ceph_object_layout layout;
|
||||
__le32 osdmap_epoch;
|
||||
struct ceph_eversion reassert_version; /* for replaying uncommitted */
|
||||
|
||||
__le32 result; /* result code */
|
||||
|
||||
__le32 object_len; /* length of object name */
|
||||
__le32 num_ops;
|
||||
struct ceph_osd_op ops[0]; /* ops[], object */
|
||||
} __attribute__ ((packed));
|
||||
|
||||
|
||||
#endif
|
29
include/linux/ceph/types.h
Normal file
29
include/linux/ceph/types.h
Normal file
|
@ -0,0 +1,29 @@
|
|||
#ifndef _FS_CEPH_TYPES_H
|
||||
#define _FS_CEPH_TYPES_H
|
||||
|
||||
/* needed before including ceph_fs.h */
|
||||
#include <linux/in.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/fcntl.h>
|
||||
#include <linux/string.h>
|
||||
|
||||
#include "ceph_fs.h"
|
||||
#include "ceph_frag.h"
|
||||
#include "ceph_hash.h"
|
||||
|
||||
/*
|
||||
* Identify inodes by both their ino AND snapshot id (a u64).
|
||||
*/
|
||||
struct ceph_vino {
|
||||
u64 ino; /* inode number */
|
||||
u64 snap; /* snapshot id */
|
||||
};
|
||||
|
||||
|
||||
/* context for the caps reservation mechanism */
|
||||
struct ceph_cap_reservation {
|
||||
/* NOTE(review): presumably the number of caps currently reserved; confirm */
int count;
|
||||
};
|
||||
|
||||
|
||||
#endif
|
Loading…
Add table
Add a link
Reference in a new issue