123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588 |
- /*
- * Copyright (C) 2010-2012 by Dell Inc. All rights reserved.
- * Copyright (C) 2011-2013 Red Hat, Inc.
- *
- * This file is released under the GPL.
- *
- * dm-switch is a device-mapper target that maps IO to underlying block
- * devices efficiently when there are a large number of fixed-sized
- * address regions but there is no simple pattern to allow for a compact
- * mapping representation such as dm-stripe.
- */
- #include <linux/device-mapper.h>
- #include <linux/module.h>
- #include <linux/init.h>
- #include <linux/vmalloc.h>
- #define DM_MSG_PREFIX "switch"
- /*
- * One region_table_slot_t holds <region_entries_per_slot> region table
- * entries each of which is <region_table_entry_bits> in size.
- */
- typedef unsigned long region_table_slot_t;
- /*
- * A device with the offset to its start sector.
- */
- struct switch_path {
- struct dm_dev *dmdev;
- sector_t start;
- };
- /*
- * Context block for a dm switch device.
- */
- struct switch_ctx {
- struct dm_target *ti;
- unsigned nr_paths; /* Number of paths in path_list. */
- unsigned region_size; /* Region size in 512-byte sectors */
- unsigned long nr_regions; /* Number of regions making up the device */
- signed char region_size_bits; /* log2 of region_size or -1 */
- unsigned char region_table_entry_bits; /* Number of bits in one region table entry */
- unsigned char region_entries_per_slot; /* Number of entries in one region table slot */
- signed char region_entries_per_slot_bits; /* log2 of region_entries_per_slot or -1 */
- region_table_slot_t *region_table; /* Region table */
- /*
- * Array of dm devices to switch between.
- */
- struct switch_path path_list[0];
- };
- static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
- unsigned region_size)
- {
- struct switch_ctx *sctx;
- sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path),
- GFP_KERNEL);
- if (!sctx)
- return NULL;
- sctx->ti = ti;
- sctx->region_size = region_size;
- ti->private = sctx;
- return sctx;
- }
- static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
- {
- struct switch_ctx *sctx = ti->private;
- sector_t nr_regions = ti->len;
- sector_t nr_slots;
- if (!(sctx->region_size & (sctx->region_size - 1)))
- sctx->region_size_bits = __ffs(sctx->region_size);
- else
- sctx->region_size_bits = -1;
- sctx->region_table_entry_bits = 1;
- while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
- (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
- sctx->region_table_entry_bits++;
- sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
- if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
- sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
- else
- sctx->region_entries_per_slot_bits = -1;
- if (sector_div(nr_regions, sctx->region_size))
- nr_regions++;
- if (nr_regions >= ULONG_MAX) {
- ti->error = "Region table too large";
- return -EINVAL;
- }
- sctx->nr_regions = nr_regions;
- nr_slots = nr_regions;
- if (sector_div(nr_slots, sctx->region_entries_per_slot))
- nr_slots++;
- if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
- ti->error = "Region table too large";
- return -EINVAL;
- }
- sctx->region_table = vmalloc(nr_slots * sizeof(region_table_slot_t));
- if (!sctx->region_table) {
- ti->error = "Cannot allocate region table";
- return -ENOMEM;
- }
- return 0;
- }
- static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
- unsigned long *region_index, unsigned *bit)
- {
- if (sctx->region_entries_per_slot_bits >= 0) {
- *region_index = region_nr >> sctx->region_entries_per_slot_bits;
- *bit = region_nr & (sctx->region_entries_per_slot - 1);
- } else {
- *region_index = region_nr / sctx->region_entries_per_slot;
- *bit = region_nr % sctx->region_entries_per_slot;
- }
- *bit *= sctx->region_table_entry_bits;
- }
- static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr)
- {
- unsigned long region_index;
- unsigned bit;
- switch_get_position(sctx, region_nr, ®ion_index, &bit);
- return (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
- ((1 << sctx->region_table_entry_bits) - 1);
- }
- /*
- * Find which path to use at given offset.
- */
- static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
- {
- unsigned path_nr;
- sector_t p;
- p = offset;
- if (sctx->region_size_bits >= 0)
- p >>= sctx->region_size_bits;
- else
- sector_div(p, sctx->region_size);
- path_nr = switch_region_table_read(sctx, p);
- /* This can only happen if the processor uses non-atomic stores. */
- if (unlikely(path_nr >= sctx->nr_paths))
- path_nr = 0;
- return path_nr;
- }
- static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
- unsigned value)
- {
- unsigned long region_index;
- unsigned bit;
- region_table_slot_t pte;
- switch_get_position(sctx, region_nr, ®ion_index, &bit);
- pte = sctx->region_table[region_index];
- pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
- pte |= (region_table_slot_t)value << bit;
- sctx->region_table[region_index] = pte;
- }
- /*
- * Fill the region table with an initial round robin pattern.
- */
- static void initialise_region_table(struct switch_ctx *sctx)
- {
- unsigned path_nr = 0;
- unsigned long region_nr;
- for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
- switch_region_table_write(sctx, region_nr, path_nr);
- if (++path_nr >= sctx->nr_paths)
- path_nr = 0;
- }
- }
- static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
- {
- struct switch_ctx *sctx = ti->private;
- unsigned long long start;
- int r;
- r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
- &sctx->path_list[sctx->nr_paths].dmdev);
- if (r) {
- ti->error = "Device lookup failed";
- return r;
- }
- if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
- ti->error = "Invalid device starting offset";
- dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
- return -EINVAL;
- }
- sctx->path_list[sctx->nr_paths].start = start;
- sctx->nr_paths++;
- return 0;
- }
- /*
- * Destructor: Don't free the dm_target, just the ti->private data (if any).
- */
- static void switch_dtr(struct dm_target *ti)
- {
- struct switch_ctx *sctx = ti->private;
- while (sctx->nr_paths--)
- dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
- vfree(sctx->region_table);
- kfree(sctx);
- }
- /*
- * Constructor arguments:
- * <num_paths> <region_size> <num_optional_args> [<optional_args>...]
- * [<dev_path> <offset>]+
- *
- * Optional args are to allow for future extension: currently this
- * parameter must be 0.
- */
- static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
- {
- static struct dm_arg _args[] = {
- {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
- {1, UINT_MAX, "Invalid region size"},
- {0, 0, "Invalid number of optional args"},
- };
- struct switch_ctx *sctx;
- struct dm_arg_set as;
- unsigned nr_paths, region_size, nr_optional_args;
- int r;
- as.argc = argc;
- as.argv = argv;
- r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
- if (r)
- return -EINVAL;
- r = dm_read_arg(_args + 1, &as, ®ion_size, &ti->error);
- if (r)
- return r;
- r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
- if (r)
- return r;
- /* parse optional arguments here, if we add any */
- if (as.argc != nr_paths * 2) {
- ti->error = "Incorrect number of path arguments";
- return -EINVAL;
- }
- sctx = alloc_switch_ctx(ti, nr_paths, region_size);
- if (!sctx) {
- ti->error = "Cannot allocate redirection context";
- return -ENOMEM;
- }
- r = dm_set_target_max_io_len(ti, region_size);
- if (r)
- goto error;
- while (as.argc) {
- r = parse_path(&as, ti);
- if (r)
- goto error;
- }
- r = alloc_region_table(ti, nr_paths);
- if (r)
- goto error;
- initialise_region_table(sctx);
- /* For UNMAP, sending the request down any path is sufficient */
- ti->num_discard_bios = 1;
- return 0;
- error:
- switch_dtr(ti);
- return r;
- }
- static int switch_map(struct dm_target *ti, struct bio *bio)
- {
- struct switch_ctx *sctx = ti->private;
- sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
- unsigned path_nr = switch_get_path_nr(sctx, offset);
- bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev;
- bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset;
- return DM_MAPIO_REMAPPED;
- }
- /*
- * We need to parse hex numbers in the message as quickly as possible.
- *
- * This table-based hex parser improves performance.
- * It improves a time to load 1000000 entries compared to the condition-based
- * parser.
- * table-based parser condition-based parser
- * PA-RISC 0.29s 0.31s
- * Opteron 0.0495s 0.0498s
- */
- static const unsigned char hex_table[256] = {
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
- 255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
- };
- static __always_inline unsigned long parse_hex(const char **string)
- {
- unsigned char d;
- unsigned long r = 0;
- while ((d = hex_table[(unsigned char)**string]) < 16) {
- r = (r << 4) | d;
- (*string)++;
- }
- return r;
- }
- static int process_set_region_mappings(struct switch_ctx *sctx,
- unsigned argc, char **argv)
- {
- unsigned i;
- unsigned long region_index = 0;
- for (i = 1; i < argc; i++) {
- unsigned long path_nr;
- const char *string = argv[i];
- if ((*string & 0xdf) == 'R') {
- unsigned long cycle_length, num_write;
- string++;
- if (unlikely(*string == ',')) {
- DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
- return -EINVAL;
- }
- cycle_length = parse_hex(&string);
- if (unlikely(*string != ',')) {
- DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
- return -EINVAL;
- }
- string++;
- if (unlikely(!*string)) {
- DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
- return -EINVAL;
- }
- num_write = parse_hex(&string);
- if (unlikely(*string)) {
- DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
- return -EINVAL;
- }
- if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) {
- DMWARN("invalid set_region_mappings cycle length: %lu > %lu",
- cycle_length - 1, region_index);
- return -EINVAL;
- }
- if (unlikely(region_index + num_write < region_index) ||
- unlikely(region_index + num_write >= sctx->nr_regions)) {
- DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu",
- region_index, num_write, sctx->nr_regions);
- return -EINVAL;
- }
- while (num_write--) {
- region_index++;
- path_nr = switch_region_table_read(sctx, region_index - cycle_length);
- switch_region_table_write(sctx, region_index, path_nr);
- }
- continue;
- }
- if (*string == ':')
- region_index++;
- else {
- region_index = parse_hex(&string);
- if (unlikely(*string != ':')) {
- DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
- return -EINVAL;
- }
- }
- string++;
- if (unlikely(!*string)) {
- DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
- return -EINVAL;
- }
- path_nr = parse_hex(&string);
- if (unlikely(*string)) {
- DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
- return -EINVAL;
- }
- if (unlikely(region_index >= sctx->nr_regions)) {
- DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
- return -EINVAL;
- }
- if (unlikely(path_nr >= sctx->nr_paths)) {
- DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
- return -EINVAL;
- }
- switch_region_table_write(sctx, region_index, path_nr);
- }
- return 0;
- }
- /*
- * Messages are processed one-at-a-time.
- *
- * Only set_region_mappings is supported.
- */
- static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
- {
- static DEFINE_MUTEX(message_mutex);
- struct switch_ctx *sctx = ti->private;
- int r = -EINVAL;
- mutex_lock(&message_mutex);
- if (!strcasecmp(argv[0], "set_region_mappings"))
- r = process_set_region_mappings(sctx, argc, argv);
- else
- DMWARN("Unrecognised message received.");
- mutex_unlock(&message_mutex);
- return r;
- }
- static void switch_status(struct dm_target *ti, status_type_t type,
- unsigned status_flags, char *result, unsigned maxlen)
- {
- struct switch_ctx *sctx = ti->private;
- unsigned sz = 0;
- int path_nr;
- switch (type) {
- case STATUSTYPE_INFO:
- result[0] = '\0';
- break;
- case STATUSTYPE_TABLE:
- DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
- for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
- DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
- (unsigned long long)sctx->path_list[path_nr].start);
- break;
- }
- }
- /*
- * Switch ioctl:
- *
- * Passthrough all ioctls to the path for sector 0
- */
- static int switch_prepare_ioctl(struct dm_target *ti,
- struct block_device **bdev, fmode_t *mode)
- {
- struct switch_ctx *sctx = ti->private;
- unsigned path_nr;
- path_nr = switch_get_path_nr(sctx, 0);
- *bdev = sctx->path_list[path_nr].dmdev->bdev;
- *mode = sctx->path_list[path_nr].dmdev->mode;
- /*
- * Only pass ioctls through if the device sizes match exactly.
- */
- if (ti->len + sctx->path_list[path_nr].start !=
- i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
- return 1;
- return 0;
- }
- static int switch_iterate_devices(struct dm_target *ti,
- iterate_devices_callout_fn fn, void *data)
- {
- struct switch_ctx *sctx = ti->private;
- int path_nr;
- int r;
- for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
- r = fn(ti, sctx->path_list[path_nr].dmdev,
- sctx->path_list[path_nr].start, ti->len, data);
- if (r)
- return r;
- }
- return 0;
- }
- static struct target_type switch_target = {
- .name = "switch",
- .version = {1, 1, 0},
- .module = THIS_MODULE,
- .ctr = switch_ctr,
- .dtr = switch_dtr,
- .map = switch_map,
- .message = switch_message,
- .status = switch_status,
- .prepare_ioctl = switch_prepare_ioctl,
- .iterate_devices = switch_iterate_devices,
- };
- static int __init dm_switch_init(void)
- {
- int r;
- r = dm_register_target(&switch_target);
- if (r < 0)
- DMERR("dm_register_target() failed %d", r);
- return r;
- }
- static void __exit dm_switch_exit(void)
- {
- dm_unregister_target(&switch_target);
- }
- module_init(dm_switch_init);
- module_exit(dm_switch_exit);
- MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
- MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
- MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
- MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
- MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
- MODULE_LICENSE("GPL");
|