Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit f24e9980 authored by Sage Weil's avatar Sage Weil
Browse files

ceph: OSD client



The OSD client is responsible for reading and writing data from/to the
object storage pool.  This includes determining where objects are
stored in the cluster, and ensuring that requests are retried or
redirected in the event of a node failure or data migration.

If an OSD does not respond before a timeout expires, keepalive
messages are sent across the lossless, ordered communications channel
to ensure that any break in the TCP is discovered.  If the session
does reset, a reconnection is attempted and affected requests are
resent (by the message transport layer).

Signed-off-by: default avatarSage Weil <sage@newdream.net>
parent 2f2dc053
Loading
Loading
Loading
Loading

fs/ceph/osd_client.c

0 → 100644
+1294 −0

File added.

Preview size limit exceeded, changes collapsed.

fs/ceph/osd_client.h

0 → 100644
+144 −0
Original line number Original line Diff line number Diff line
#ifndef _FS_CEPH_OSD_CLIENT_H
#define _FS_CEPH_OSD_CLIENT_H

#include <linux/completion.h>
#include <linux/mempool.h>
#include <linux/rbtree.h>

#include "types.h"
#include "osdmap.h"
#include "messenger.h"

struct ceph_msg;
struct ceph_snap_context;
struct ceph_osd_request;
struct ceph_osd_client;

/*
 * completion callback for async writepages
 */
typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
				     struct ceph_msg *);

/* a given osd we're communicating with */
struct ceph_osd {
	atomic_t o_ref;
	struct ceph_osd_client *o_osdc;
	int o_osd;
	int o_incarnation;
	struct rb_node o_node;
	struct ceph_connection o_con;
	struct list_head o_requests;
};

/* an in-flight request */
struct ceph_osd_request {
	u64             r_tid;              /* unique for this client */
	struct rb_node  r_node;
	struct list_head r_osd_item;
	struct ceph_osd *r_osd;

	struct ceph_msg  *r_request, *r_reply;
	int               r_result;
	int               r_flags;     /* any additional flags for the osd */
	u32               r_sent;      /* >0 if r_request is sending/sent */
	int r_prepared_pages, r_got_reply;

	struct ceph_osd_client *r_osdc;
	atomic_t          r_ref;
	bool              r_mempool;
	struct completion r_completion, r_safe_completion;
	ceph_osdc_callback_t r_callback, r_safe_callback;
	struct ceph_eversion r_reassert_version;
	struct list_head  r_unsafe_item;

	struct inode *r_inode;         	      /* for use by callbacks */
	struct writeback_control *r_wbc;      /* ditto */

	char              r_oid[40];          /* object name */
	int               r_oid_len;
	unsigned long     r_timeout_stamp;
	bool              r_resend;           /* msg send failed, needs retry */

	struct ceph_file_layout r_file_layout;
	struct ceph_snap_context *r_snapc;    /* snap context for writes */
	unsigned          r_num_pages;        /* size of page array (follows) */
	struct page     **r_pages;            /* pages for data payload */
	int               r_pages_from_pool;
	int               r_own_pages;        /* if true, i own page list */
};

struct ceph_osd_client {
	struct ceph_client     *client;

	struct ceph_osdmap     *osdmap;       /* current map */
	struct rw_semaphore    map_sem;
	struct completion      map_waiters;
	u64                    last_requested_map;

	struct mutex           request_mutex;
	struct rb_root         osds;          /* osds */
	u64                    timeout_tid;   /* tid of timeout triggering rq */
	u64                    last_tid;      /* tid of last request */
	struct rb_root         requests;      /* pending requests */
	int                    num_requests;
	struct delayed_work    timeout_work;
	struct dentry 	       *debugfs_file;

	mempool_t              *req_mempool;

	struct ceph_msgpool   msgpool_op;
	struct ceph_msgpool   msgpool_op_reply;
};

extern int ceph_osdc_init(struct ceph_osd_client *osdc,
			  struct ceph_client *client);
extern void ceph_osdc_stop(struct ceph_osd_client *osdc);

extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
				   struct ceph_msg *msg);
extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
				 struct ceph_msg *msg);

extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
				      struct ceph_file_layout *layout,
				      struct ceph_vino vino,
				      u64 offset, u64 *len, int op, int flags,
				      struct ceph_snap_context *snapc,
				      int do_sync, u32 truncate_seq,
				      u64 truncate_size,
				      struct timespec *mtime,
				      bool use_mempool, int num_reply);

static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
{
	atomic_inc(&req->r_ref);
}
extern void ceph_osdc_put_request(struct ceph_osd_request *req);

extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
				   struct ceph_osd_request *req,
				   bool nofail);
extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
				  struct ceph_osd_request *req);
extern void ceph_osdc_sync(struct ceph_osd_client *osdc);

extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
			       struct ceph_vino vino,
			       struct ceph_file_layout *layout,
			       u64 off, u64 *plen,
			       u32 truncate_seq, u64 truncate_size,
			       struct page **pages, int nr_pages);

extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
				struct ceph_vino vino,
				struct ceph_file_layout *layout,
				struct ceph_snap_context *sc,
				u64 off, u64 len,
				u32 truncate_seq, u64 truncate_size,
				struct timespec *mtime,
				struct page **pages, int nr_pages,
				int flags, int do_sync, bool nofail);

#endif

fs/ceph/osdmap.c

0 → 100644
+875 −0

File added.

Preview size limit exceeded, changes collapsed.

fs/ceph/osdmap.h

0 → 100644
+123 −0
Original line number Original line Diff line number Diff line
#ifndef _FS_CEPH_OSDMAP_H
#define _FS_CEPH_OSDMAP_H

#include <linux/rbtree.h>
#include "types.h"
#include "ceph_fs.h"
#include "crush/crush.h"

/*
 * The osd map describes the current membership of the osd cluster and
 * specifies the mapping of objects to placement groups and placement
 * groups to (sets of) osds.  That is, it completely specifies the
 * (desired) distribution of all data objects in the system at some
 * point in time.
 *
 * Each map version is identified by an epoch, which increases monotonically.
 *
 * The map can be updated either via an incremental map (diff) describing
 * the change between two successive epochs, or as a fully encoded map.
 */
struct ceph_pg_pool_info {
	struct ceph_pg_pool v;
	int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
};

struct ceph_pg_mapping {
	struct rb_node node;
	u64 pgid;
	int len;
	int osds[];
};

struct ceph_osdmap {
	struct ceph_fsid fsid;
	u32 epoch;
	u32 mkfs_epoch;
	struct ceph_timespec created, modified;

	u32 flags;         /* CEPH_OSDMAP_* */

	u32 max_osd;       /* size of osd_state, _offload, _addr arrays */
	u8 *osd_state;     /* CEPH_OSD_* */
	u32 *osd_weight;   /* 0 = failed, 0x10000 = 100% normal */
	struct ceph_entity_addr *osd_addr;

	struct rb_root pg_temp;

	u32 num_pools;
	struct ceph_pg_pool_info *pg_pool;

	/* the CRUSH map specifies the mapping of placement groups to
	 * the list of osds that store+replicate them. */
	struct crush_map *crush;
};

/*
 * file layout helpers
 */
#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
#define ceph_file_layout_stripe_count(l) \
	((__s32)le32_to_cpu((l).fl_stripe_count))
#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
#define ceph_file_layout_object_su(l) \
	((__s32)le32_to_cpu((l).fl_object_stripe_unit))
#define ceph_file_layout_pg_preferred(l) \
	((__s32)le32_to_cpu((l).fl_pg_preferred))
#define ceph_file_layout_pg_pool(l) \
	((__s32)le32_to_cpu((l).fl_pg_pool))

static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
{
	return le32_to_cpu(l->fl_stripe_unit) *
		le32_to_cpu(l->fl_stripe_count);
}

/* "period" == bytes before i start on a new set of objects */
static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
{
	return le32_to_cpu(l->fl_object_size) *
		le32_to_cpu(l->fl_stripe_count);
}


static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
{
	return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
}

static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
{
	return map && (map->flags & flag);
}

extern char *ceph_osdmap_state_str(char *str, int len, int state);

static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
						     int osd)
{
	if (osd >= map->max_osd)
		return NULL;
	return &map->osd_addr[osd];
}

extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
					    struct ceph_osdmap *map,
					    struct ceph_messenger *msgr);
extern void ceph_osdmap_destroy(struct ceph_osdmap *map);

/* calculate mapping of a file extent to an object */
extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
					  u64 off, u64 *plen,
					  u64 *bno, u64 *oxoff, u64 *oxlen);

/* calculate mapping of object to a placement group */
extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
				   const char *oid,
				   struct ceph_file_layout *fl,
				   struct ceph_osdmap *osdmap);
extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, union ceph_pg pgid);

#endif