From d77312aeb21d93c3547f20b8cc09916daa928acc Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Tue, 25 Apr 2017 18:30:45 +0000 Subject: Revert "[RFC]cgroup: Change from CAP_SYS_NICE to CAP_SYS_RESOURCE for cgroup migration permissions" This reverts commit 64f0245f312a849acc83b7c6cdcc8f7c1a14cca3. Needs to be reverted since system_server no longer has CAP_SYS_RESOURCE. Change-Id: Ic500cffb14b80a3e2a5dd41fda0b0b781b5245d6 --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 253fdeabf667..8ab133d96435 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2689,7 +2689,7 @@ static int cgroup_procs_write_permission(struct task_struct *task, if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && !uid_eq(cred->euid, tcred->suid) && - !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) + !ns_capable(tcred->user_ns, CAP_SYS_NICE)) ret = -EACCES; if (!ret && cgroup_on_dfl(dst_cgrp)) { -- cgit v1.2.3 From 6a6a7657c231e947233c43ae0522bbd4edf0139e Mon Sep 17 00:00:00 2001 From: David Zeuthen Date: Tue, 24 Jan 2017 13:02:35 -0500 Subject: ANDROID: Update init/do_mounts_dm.c to the latest ChromiumOS version. This is needed for AVB integration work. Bug: 31796270 Test: Manually tested (other arch). Change-Id: I32fd37c1578c6414e3e6ff277d16ad94df7886b8 Signed-off-by: David Zeuthen --- include/linux/device-mapper.h | 7 + init/do_mounts_dm.c | 556 +++++++++++++++++++++++------------------- 2 files changed, 307 insertions(+), 256 deletions(-) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index b874d5b61ffc..f3f87db34429 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -415,6 +415,13 @@ union map_info *dm_get_rq_mapinfo(struct request *rq); struct queue_limits *dm_get_queue_limits(struct mapped_device *md); +void dm_lock_md_type(struct mapped_device *md); +void dm_unlock_md_type(struct mapped_device *md); +void dm_set_md_type(struct mapped_device *md, unsigned type); +unsigned dm_get_md_type(struct mapped_device *md); +int dm_setup_md_queue(struct mapped_device *md); +unsigned dm_table_get_type(struct dm_table *t); + /* * Geometry functions. */ diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c index ecda58df9a19..bce1c2fbb915 100644 --- a/init/do_mounts_dm.c +++ b/init/do_mounts_dm.c @@ -5,13 +5,17 @@ * * This file is released under the GPL. */ +#include +#include #include #include #include +#include #include "do_mounts.h" -#include "../drivers/md/dm.h" +#define DM_MAX_DEVICES 256 +#define DM_MAX_TARGETS 256 #define DM_MAX_NAME 32 #define DM_MAX_UUID 129 #define DM_NO_UUID "none" @@ -19,14 +23,47 @@ #define DM_MSG_PREFIX "init" /* Separators used for parsing the dm= argument. */ -#define DM_FIELD_SEP ' ' -#define DM_LINE_SEP ',' +#define DM_FIELD_SEP " " +#define DM_LINE_SEP "," +#define DM_ANY_SEP DM_FIELD_SEP DM_LINE_SEP /* * When the device-mapper and any targets are compiled into the kernel - * (not a module), one target may be created and used as the root device at - * boot time with the parameters given with the boot line dm=... - * The code for that is here. + * (not a module), one or more device-mappers may be created and used + * as the root device at boot time with the parameters given with the + * boot line dm=... + * + * Multiple device-mappers can be stacked specifing the number of + * devices. A device can have multiple targets if the the number of + * targets is specified. 
+ * + * TODO(taysom:defect 32847) + * In the future, the field will be mandatory. + * + * ::= [] + + * ::= "," + + * ::= [] + * ::= "," + * ::= "ro" | "rw" + * ::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | "none" + * ::= "verity" | "bootcache" | ... + * + * Example: + * 2 vboot none ro 1, + * 0 1768000 bootcache + * device=aa55b119-2a47-8c45-946a-5ac57765011f+1 + * signature=76e9be054b15884a9fa85973e9cb274c93afadb6 + * cache_start=1768000 max_blocks=100000 size_limit=23 max_trace=20000, + * vroot none ro 1, + * 0 1740800 verity payload=254:0 hashtree=254:0 hashstart=1740800 alg=sha1 + * root_hexdigest=76e9be054b15884a9fa85973e9cb274c93afadb6 + * salt=5b3549d54d6c7a3837b9b81ed72e49463a64c03680c47835bef94d768e5646fe + * + * Notes: + * 1. uuid is a label for the device and we set it to "none". + * 2. The field will be optional initially and assumed to be 1. + * Once all the scripts that set these fields have been set, it will + * be made mandatory. */ struct dm_setup_target { @@ -38,381 +75,388 @@ struct dm_setup_target { struct dm_setup_target *next; }; -static struct { +struct dm_device { int minor; int ro; char name[DM_MAX_NAME]; char uuid[DM_MAX_UUID]; - char *targets; + unsigned long num_targets; struct dm_setup_target *target; int target_count; + struct dm_device *next; +}; + +struct dm_option { + char *start; + char *next; + size_t len; + char delim; +}; + +static struct { + unsigned long num_devices; + char *str; } dm_setup_args __initdata; static __initdata int dm_early_setup; -static size_t __init get_dm_option(char *str, char **next, char sep) +static int __init get_dm_option(struct dm_option *opt, const char *accept) { - size_t len = 0; - char *endp = NULL; + char *str = opt->next; + char *endp; if (!str) return 0; - endp = strchr(str, sep); + str = skip_spaces(str); + opt->start = str; + endp = strpbrk(str, accept); if (!endp) { /* act like strchrnul */ - len = strlen(str); - endp = str + len; + opt->len = strlen(str); + endp = str + opt->len; } else { - len = endp - str; + opt->len = endp - str; } - - if (endp == str) - return 0; - - if (!next) - return len; - + opt->delim = *endp; if (*endp == 0) { /* Don't advance past the nul. 
*/ - *next = endp; + opt->next = endp; } else { - *next = endp + 1; + opt->next = endp + 1; } - return len; -} - -static int __init dm_setup_args_init(void) -{ - dm_setup_args.minor = 0; - dm_setup_args.ro = 0; - dm_setup_args.target = NULL; - dm_setup_args.target_count = 0; - return 0; + return opt->len != 0; } -static int __init dm_setup_cleanup(void) +static int __init dm_setup_cleanup(struct dm_device *devices) { - struct dm_setup_target *target = dm_setup_args.target; - struct dm_setup_target *old_target = NULL; - while (target) { - kfree(target->type); - kfree(target->params); - old_target = target; - target = target->next; - kfree(old_target); - dm_setup_args.target_count--; + struct dm_device *dev = devices; + + while (dev) { + struct dm_device *old_dev = dev; + struct dm_setup_target *target = dev->target; + while (target) { + struct dm_setup_target *old_target = target; + kfree(target->type); + kfree(target->params); + target = target->next; + kfree(old_target); + dev->target_count--; + } + BUG_ON(dev->target_count); + dev = dev->next; + kfree(old_dev); } - BUG_ON(dm_setup_args.target_count); return 0; } -static char * __init dm_setup_parse_device_args(char *str) +static char * __init dm_parse_device(struct dm_device *dev, char *str) { - char *next = NULL; - size_t len = 0; + struct dm_option opt; + size_t len; /* Grab the logical name of the device to be exported to udev */ - len = get_dm_option(str, &next, DM_FIELD_SEP); - if (!len) { + opt.next = str; + if (!get_dm_option(&opt, DM_FIELD_SEP)) { DMERR("failed to parse device name"); goto parse_fail; } - len = min(len + 1, sizeof(dm_setup_args.name)); - strlcpy(dm_setup_args.name, str, len); /* includes nul */ - str = skip_spaces(next); + len = min(opt.len + 1, sizeof(dev->name)); + strlcpy(dev->name, opt.start, len); /* includes nul */ /* Grab the UUID value or "none" */ - len = get_dm_option(str, &next, DM_FIELD_SEP); - if (!len) { + if (!get_dm_option(&opt, DM_FIELD_SEP)) { DMERR("failed to parse device uuid"); goto parse_fail; } - len = min(len + 1, sizeof(dm_setup_args.uuid)); - strlcpy(dm_setup_args.uuid, str, len); - str = skip_spaces(next); + len = min(opt.len + 1, sizeof(dev->uuid)); + strlcpy(dev->uuid, opt.start, len); /* Determine if the table/device will be read only or read-write */ - if (!strncmp("ro,", str, 3)) { - dm_setup_args.ro = 1; - } else if (!strncmp("rw,", str, 3)) { - dm_setup_args.ro = 0; + get_dm_option(&opt, DM_ANY_SEP); + if (!strncmp("ro", opt.start, opt.len)) { + dev->ro = 1; + } else if (!strncmp("rw", opt.start, opt.len)) { + dev->ro = 0; } else { DMERR("failed to parse table mode"); goto parse_fail; } - str = skip_spaces(str + 3); - return str; + /* Optional number field */ + /* XXX: The field will be mandatory in the next round */ + if (opt.delim == DM_FIELD_SEP[0]) { + if (!get_dm_option(&opt, DM_LINE_SEP)) + return NULL; + dev->num_targets = simple_strtoul(opt.start, NULL, 10); + } else { + dev->num_targets = 1; + } + if (dev->num_targets > DM_MAX_TARGETS) { + DMERR("too many targets %lu > %d", + dev->num_targets, DM_MAX_TARGETS); + } + return opt.next; parse_fail: return NULL; } -static void __init dm_substitute_devices(char *str, size_t str_len) +static char * __init dm_parse_targets(struct dm_device *dev, char *str) { - char *candidate = str; - char *candidate_end = str; - char old_char; - size_t len = 0; - dev_t dev; - - if (str_len < 3) - return; - - while (str && *str) { - candidate = strchr(str, '/'); - if (!candidate) - break; - - /* Avoid embedded slashes */ - if (candidate != str 
&& *(candidate - 1) != DM_FIELD_SEP) { - str = strchr(candidate, DM_FIELD_SEP); - continue; - } - - len = get_dm_option(candidate, &candidate_end, DM_FIELD_SEP); - str = skip_spaces(candidate_end); - if (len < 3 || len > 37) /* name_to_dev_t max; maj:mix min */ - continue; - - /* Temporarily terminate with a nul */ - if (*candidate_end) - candidate_end--; - old_char = *candidate_end; - *candidate_end = '\0'; - - DMDEBUG("converting candidate device '%s' to dev_t", candidate); - /* Use the boot-time specific device naming */ - dev = name_to_dev_t(candidate); - *candidate_end = old_char; - - DMDEBUG(" -> %u", dev); - /* No suitable replacement found */ - if (!dev) - continue; - - /* Rewrite the /dev/path as a major:minor */ - len = snprintf(candidate, len, "%u:%u", MAJOR(dev), MINOR(dev)); - if (!len) { - DMERR("error substituting device major/minor."); - break; - } - candidate += len; - /* Pad out with spaces (fixing our nul) */ - while (candidate < candidate_end) - *(candidate++) = DM_FIELD_SEP; - } -} - -static int __init dm_setup_parse_targets(char *str) -{ - char *next = NULL; - size_t len = 0; - struct dm_setup_target **target = NULL; + struct dm_option opt; + struct dm_setup_target **target = &dev->target; + unsigned long num_targets = dev->num_targets; + unsigned long i; /* Targets are defined as per the table format but with a * comma as a newline separator. */ - target = &dm_setup_args.target; - while (str && *str) { + opt.next = str; + for (i = 0; i < num_targets; i++) { *target = kzalloc(sizeof(struct dm_setup_target), GFP_KERNEL); if (!*target) { - DMERR("failed to allocate memory for target %d", - dm_setup_args.target_count); + DMERR("failed to allocate memory for target %s<%ld>", + dev->name, i); goto parse_fail; } - dm_setup_args.target_count++; + dev->target_count++; - (*target)->begin = simple_strtoull(str, &next, 10); - if (!next || *next != DM_FIELD_SEP) { - DMERR("failed to parse starting sector for target %d", - dm_setup_args.target_count - 1); + if (!get_dm_option(&opt, DM_FIELD_SEP)) { + DMERR("failed to parse starting sector" + " for target %s<%ld>", dev->name, i); goto parse_fail; } - str = skip_spaces(next + 1); + (*target)->begin = simple_strtoull(opt.start, NULL, 10); - (*target)->length = simple_strtoull(str, &next, 10); - if (!next || *next != DM_FIELD_SEP) { - DMERR("failed to parse length for target %d", - dm_setup_args.target_count - 1); + if (!get_dm_option(&opt, DM_FIELD_SEP)) { + DMERR("failed to parse length for target %s<%ld>", + dev->name, i); goto parse_fail; } - str = skip_spaces(next + 1); - - len = get_dm_option(str, &next, DM_FIELD_SEP); - if (!len || - !((*target)->type = kstrndup(str, len, GFP_KERNEL))) { - DMERR("failed to parse type for target %d", - dm_setup_args.target_count - 1); + (*target)->length = simple_strtoull(opt.start, NULL, 10); + + if (get_dm_option(&opt, DM_FIELD_SEP)) + (*target)->type = kstrndup(opt.start, opt.len, + GFP_KERNEL); + if (!((*target)->type)) { + DMERR("failed to parse type for target %s<%ld>", + dev->name, i); goto parse_fail; } - str = skip_spaces(next); - - len = get_dm_option(str, &next, DM_LINE_SEP); - if (!len || - !((*target)->params = kstrndup(str, len, GFP_KERNEL))) { - DMERR("failed to parse params for target %d", - dm_setup_args.target_count - 1); + if (get_dm_option(&opt, DM_LINE_SEP)) + (*target)->params = kstrndup(opt.start, opt.len, + GFP_KERNEL); + if (!((*target)->params)) { + DMERR("failed to parse params for target %s<%ld>", + dev->name, i); goto parse_fail; } - str = skip_spaces(next); - - /* 
Before moving on, walk through the copied target and - * attempt to replace all /dev/xxx with the major:minor number. - * It may not be possible to resolve them traditionally at - * boot-time. */ - dm_substitute_devices((*target)->params, len); - target = &((*target)->next); } - DMDEBUG("parsed %d targets", dm_setup_args.target_count); + DMDEBUG("parsed %d targets", dev->target_count); - return 0; + return opt.next; parse_fail: - return 1; + return NULL; +} + +static struct dm_device * __init dm_parse_args(void) +{ + struct dm_device *devices = NULL; + struct dm_device **tail = &devices; + struct dm_device *dev; + char *str = dm_setup_args.str; + unsigned long num_devices = dm_setup_args.num_devices; + unsigned long i; + + if (!str) + return NULL; + for (i = 0; i < num_devices; i++) { + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) { + DMERR("failed to allocated memory for dev"); + goto error; + } + *tail = dev; + tail = &dev->next; + /* + * devices are given minor numbers 0 - n-1 + * in the order they are found in the arg + * string. + */ + dev->minor = i; + str = dm_parse_device(dev, str); + if (!str) /* NULL indicates error in parsing, bail */ + goto error; + + str = dm_parse_targets(dev, str); + if (!str) + goto error; + } + return devices; +error: + dm_setup_cleanup(devices); + return NULL; } /* * Parse the command-line parameters given our kernel, but do not * actually try to invoke the DM device now; that is handled by - * dm_setup_drive after the low-level disk drivers have initialised. - * dm format is as follows: - * dm="name uuid fmode,[table line 1],[table line 2],..." - * May be used with root=/dev/dm-0 as it always uses the first dm minor. + * dm_setup_drives after the low-level disk drivers have initialised. + * dm format is described at the top of the file. + * + * Because dm minor numbers are assigned in assending order starting with 0, + * You can assume the first device is /dev/dm-0, the next device is /dev/dm-1, + * and so forth. */ - static int __init dm_setup(char *str) { - dm_setup_args_init(); + struct dm_option opt; + unsigned long num_devices; - str = dm_setup_parse_device_args(str); if (!str) { DMDEBUG("str is NULL"); goto parse_fail; } - - /* Target parsing is delayed until we have dynamic memory */ - dm_setup_args.targets = str; - - printk(KERN_INFO "dm: will configure '%s' on dm-%d\n", - dm_setup_args.name, dm_setup_args.minor); - + opt.next = str; + if (!get_dm_option(&opt, DM_FIELD_SEP)) + goto parse_fail; + if (isdigit(opt.start[0])) { /* XXX: Optional number field */ + num_devices = simple_strtoul(opt.start, NULL, 10); + str = opt.next; + } else { + num_devices = 1; + /* Don't advance str */ + } + if (num_devices > DM_MAX_DEVICES) { + DMDEBUG("too many devices %lu > %d", + num_devices, DM_MAX_DEVICES); + } + dm_setup_args.str = str; + dm_setup_args.num_devices = num_devices; + DMINFO("will configure %lu devices", num_devices); dm_early_setup = 1; return 1; parse_fail: - printk(KERN_WARNING "dm: Invalid arguments supplied to dm=.\n"); + DMWARN("Invalid arguments supplied to dm=."); return 0; } - -static void __init dm_setup_drive(void) +static void __init dm_setup_drives(void) { struct mapped_device *md = NULL; struct dm_table *table = NULL; struct dm_setup_target *target; - char *uuid = dm_setup_args.uuid; + struct dm_device *dev; + char *uuid; fmode_t fmode = FMODE_READ; + struct dm_device *devices; - /* Finish parsing the targets. 
*/ - if (dm_setup_parse_targets(dm_setup_args.targets)) - goto parse_fail; - - if (dm_create(dm_setup_args.minor, &md)) { - DMDEBUG("failed to create the device"); - goto dm_create_fail; - } - DMDEBUG("created device '%s'", dm_device_name(md)); - - /* In addition to flagging the table below, the disk must be - * set explicitly ro/rw. */ - set_disk_ro(dm_disk(md), dm_setup_args.ro); + devices = dm_parse_args(); - if (!dm_setup_args.ro) - fmode |= FMODE_WRITE; - if (dm_table_create(&table, fmode, dm_setup_args.target_count, md)) { - DMDEBUG("failed to create the table"); - goto dm_table_create_fail; - } + for (dev = devices; dev; dev = dev->next) { + if (dm_create(dev->minor, &md)) { + DMDEBUG("failed to create the device"); + goto dm_create_fail; + } + DMDEBUG("created device '%s'", dm_device_name(md)); + + /* + * In addition to flagging the table below, the disk must be + * set explicitly ro/rw. + */ + set_disk_ro(dm_disk(md), dev->ro); + + if (!dev->ro) + fmode |= FMODE_WRITE; + if (dm_table_create(&table, fmode, dev->target_count, md)) { + DMDEBUG("failed to create the table"); + goto dm_table_create_fail; + } - dm_lock_md_type(md); - target = dm_setup_args.target; - while (target) { - DMINFO("adding target '%llu %llu %s %s'", - (unsigned long long) target->begin, - (unsigned long long) target->length, target->type, - target->params); - if (dm_table_add_target(table, target->type, target->begin, - target->length, target->params)) { - DMDEBUG("failed to add the target to the table"); - goto add_target_fail; + dm_lock_md_type(md); + + for (target = dev->target; target; target = target->next) { + DMINFO("adding target '%llu %llu %s %s'", + (unsigned long long) target->begin, + (unsigned long long) target->length, + target->type, target->params); + if (dm_table_add_target(table, target->type, + target->begin, + target->length, + target->params)) { + DMDEBUG("failed to add the target" + " to the table"); + goto add_target_fail; + } + } + if (dm_table_complete(table)) { + DMDEBUG("failed to complete the table"); + goto table_complete_fail; } - target = target->next; - } - if (dm_table_complete(table)) { - DMDEBUG("failed to complete the table"); - goto table_complete_fail; - } + /* Suspend the device so that we can bind it to the table. */ + if (dm_suspend(md, 0)) { + DMDEBUG("failed to suspend the device pre-bind"); + goto suspend_fail; + } - if (dm_get_md_type(md) == DM_TYPE_NONE) { + /* Initial table load: acquire type of table. */ dm_set_md_type(md, dm_table_get_type(table)); + + /* Setup md->queue to reflect md's type. */ if (dm_setup_md_queue(md)) { DMWARN("unable to set up device queue for new table."); goto setup_md_queue_fail; } - } else if (dm_get_md_type(md) != dm_table_get_type(table)) { - DMWARN("can't change device type after initial table load."); - goto setup_md_queue_fail; - } - - /* Suspend the device so that we can bind it to the table. */ - if (dm_suspend(md, 0)) { - DMDEBUG("failed to suspend the device pre-bind"); - goto suspend_fail; - } - /* Bind the table to the device. This is the only way to associate - * md->map with the table and set the disk capacity directly. */ - if (dm_swap_table(md, table)) { /* should return NULL. */ - DMDEBUG("failed to bind the device to the table"); - goto table_bind_fail; - } + /* + * Bind the table to the device. This is the only way + * to associate md->map with the table and set the disk + * capacity directly. + */ + if (dm_swap_table(md, table)) { /* should return NULL. 
*/ + DMDEBUG("failed to bind the device to the table"); + goto table_bind_fail; + } - /* Finally, resume and the device should be ready. */ - if (dm_resume(md)) { - DMDEBUG("failed to resume the device"); - goto resume_fail; - } + /* Finally, resume and the device should be ready. */ + if (dm_resume(md)) { + DMDEBUG("failed to resume the device"); + goto resume_fail; + } - /* Export the dm device via the ioctl interface */ - if (!strcmp(DM_NO_UUID, dm_setup_args.uuid)) - uuid = NULL; - if (dm_ioctl_export(md, dm_setup_args.name, uuid)) { - DMDEBUG("failed to export device with given name and uuid"); - goto export_fail; - } - printk(KERN_INFO "dm: dm-%d is ready\n", dm_setup_args.minor); + /* Export the dm device via the ioctl interface */ + if (!strcmp(DM_NO_UUID, dev->uuid)) + uuid = NULL; + if (dm_ioctl_export(md, dev->name, uuid)) { + DMDEBUG("failed to export device with given" + " name and uuid"); + goto export_fail; + } - dm_unlock_md_type(md); - dm_setup_cleanup(); + dm_unlock_md_type(md); + + DMINFO("dm-%d is ready", dev->minor); + } + dm_setup_cleanup(devices); return; export_fail: resume_fail: table_bind_fail: -suspend_fail: setup_md_queue_fail: +suspend_fail: table_complete_fail: add_target_fail: dm_unlock_md_type(md); dm_table_create_fail: dm_put(md); dm_create_fail: - dm_setup_cleanup(); -parse_fail: - printk(KERN_WARNING "dm: starting dm-%d (%s) failed\n", - dm_setup_args.minor, dm_setup_args.name); + DMWARN("starting dm-%d (%s) failed", + dev->minor, dev->name); + dm_setup_cleanup(devices); } __setup("dm=", dm_setup); @@ -421,6 +465,6 @@ void __init dm_run_setup(void) { if (!dm_early_setup) return; - printk(KERN_INFO "dm: attempting early device configuration.\n"); - dm_setup_drive(); + DMINFO("attempting early device configuration."); + dm_setup_drives(); } -- cgit v1.2.3 From 8d6f006d608c3b03652fb919e496945f2d4d4f1d Mon Sep 17 00:00:00 2001 From: David Zeuthen Date: Tue, 24 Jan 2017 13:17:01 -0500 Subject: ANDROID: AVB error handler to invalidate vbmeta partition. If androidboot.vbmeta.device is set and points to a device with vbmeta magic, this header will be overwritten upon an irrecoverable dm-verity error. The side-effect of this is that the slot will fail to verify on next reboot, effectively triggering the boot loader to fallback to another slot. This work both if the vbmeta struct is at the start of a partition or if there's an AVB footer at the end. This code is based on drivers/md/dm-verity-chromeos.c from ChromiumOS. Example: [ 0.000000] Kernel command line: rootfstype=ext4 init=/init console=ttyS0,115200 androidboot.console=ttyS0 androidboot.hardware=uefi_x86_64 enforcing=0 androidboot.selinux=permissive androidboot.debuggable=1 buildvariant=eng dm="1 vroot none ro 1,0 2080496 verity 1 PARTUUID=6779df46-78f6-4c69-bf53-59bb1fbf126b PARTUUID=6779df46-78f6-4c69-bf53-59bb1fbf126b 4096 4096 260062 260062 sha1 4f76354c86e430e27426d584a726f2fbffecae32 7e4085342d634065269631ac9a199e1a43f4632c 1 ignore_zero_blocks" root=0xfd00 androidboot.vbmeta.device=PARTUUID=b865935d-38fb-4c4e-b8b4-70dc67321552 androidboot.slot_suffix=_a androidboot.vbmeta.device_state=unlocked androidboot.vbmeta.hash_alg=sha256 androidboot.vbmeta.size=3200 androidboot.vbmeta.digest=14fe41c2b3696c31b7ad5eae7877d7d188995e1ab122c604aaaf4785850b91f7 skip_initramfs [...] [ 0.612802] device-mapper: verity-avb: AVB error handler initialized with vbmeta device: PARTUUID=b865935d-38fb-4c4e-b8b4-70dc67321552 [...] [ 1.213804] device-mapper: init: attempting early device configuration. 
[ 1.214752] device-mapper: init: adding target '0 2080496 verity 1 PARTUUID=6779df46-78f6-4c69-bf53-59bb1fbf126b PARTUUID=6779df46-78f6-4c69-bf53-59bb1fbf126b 4096 4096 260062 260062 sha1 4f76354c86e430e27426d584a726f2fbffecae32 7e4085342d634065269631ac9a199e1a43f4632c 1 ignore_zero_blocks' [ 1.217643] device-mapper: init: dm-0 is ready [ 1.226694] device-mapper: verity: 8:6: data block 0 is corrupted [ 1.227666] device-mapper: verity-avb: AVB error handler called for PARTUUID=b865935d-38fb-4c4e-b8b4-70dc67321552 [ 1.234308] device-mapper: verity-avb: invalidate_vbmeta: found vbmeta partition [ 1.235848] device-mapper: verity-avb: invalidate_vbmeta: completed. [...] Bug: 31622239 Test: Manually tested (other arch). Change-Id: Idf6be32d6a3d28e15de9302aa26ad6a516d663aa Signed-off-by: David Zeuthen --- drivers/md/Kconfig | 11 +++ drivers/md/Makefile | 4 + drivers/md/dm-verity-avb.c | 217 ++++++++++++++++++++++++++++++++++++++++++ drivers/md/dm-verity-target.c | 6 +- drivers/md/dm-verity.h | 1 + 5 files changed, 238 insertions(+), 1 deletion(-) create mode 100644 drivers/md/dm-verity-avb.c diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 9eb08a43cd27..6f2fde5d98e7 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -515,6 +515,17 @@ config DM_LOG_WRITES If unsure, say N. +config DM_VERITY_AVB + tristate "Support AVB specific verity error behavior" + depends on DM_VERITY + ---help--- + Enables Android Verified Boot platform-specific error + behavior. In particular, it will modify the vbmeta partition + specified on the kernel command-line when non-transient error + occurs (followed by a panic). + + If unsure, say N. + config DM_ANDROID_VERITY bool "Android verity target support" depends on DM_VERITY=y diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 32b5d0a90d60..c22cc74c9fa8 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -69,3 +69,7 @@ endif ifeq ($(CONFIG_DM_VERITY_FEC),y) dm-verity-objs += dm-verity-fec.o endif + +ifeq ($(CONFIG_DM_VERITY_AVB),y) +dm-verity-objs += dm-verity-avb.o +endif diff --git a/drivers/md/dm-verity-avb.c b/drivers/md/dm-verity-avb.c new file mode 100644 index 000000000000..88487346c4c6 --- /dev/null +++ b/drivers/md/dm-verity-avb.c @@ -0,0 +1,217 @@ +/* + * Copyright (C) 2017 Google. + * + * This file is released under the GPLv2. + * + * Based on drivers/md/dm-verity-chromeos.c + */ + +#include +#include +#include + +#define DM_MSG_PREFIX "verity-avb" + +/* Set via module parameter. */ +static char avb_vbmeta_device[64]; + +static void invalidate_vbmeta_endio(struct bio *bio) +{ + complete(bio->bi_private); +} + +static int invalidate_vbmeta_submit(struct bio *bio, + struct block_device *bdev, + int rw, int access_last_sector, + struct page *page) +{ + DECLARE_COMPLETION_ONSTACK(wait); + + bio->bi_private = &wait; + bio->bi_end_io = invalidate_vbmeta_endio; + bio->bi_bdev = bdev; + + bio->bi_iter.bi_sector = 0; + if (access_last_sector) { + sector_t last_sector = (i_size_read(bdev->bd_inode)>>SECTOR_SHIFT) - 1; + bio->bi_iter.bi_sector = last_sector; + } + bio->bi_vcnt = 1; + bio->bi_iter.bi_idx = 0; + bio->bi_iter.bi_size = 512; + bio->bi_iter.bi_bvec_done = 0; + bio->bi_rw = rw; + bio->bi_io_vec[0].bv_page = page; + bio->bi_io_vec[0].bv_len = 512; + bio->bi_io_vec[0].bv_offset = 0; + + submit_bio(rw, bio); + /* Wait up to 2 seconds for completion or fail. 
*/ + if (!wait_for_completion_timeout(&wait, msecs_to_jiffies(2000))) + return -EIO; + return 0; +} + +static int invalidate_vbmeta(dev_t vbmeta_devt) +{ + int ret = 0; + struct block_device *bdev; + struct bio *bio; + struct page *page; + fmode_t dev_mode; + /* Ensure we do synchronous unblocked I/O. We may also need + * sync_bdev() on completion, but it really shouldn't. + */ + int rw = REQ_SYNC | REQ_SOFTBARRIER | REQ_NOIDLE; + int access_last_sector = 0; + + /* First we open the device for reading. */ + dev_mode = FMODE_READ | FMODE_EXCL; + bdev = blkdev_get_by_dev(vbmeta_devt, dev_mode, + invalidate_vbmeta); + if (IS_ERR(bdev)) { + DMERR("invalidate_kernel: could not open device for reading"); + dev_mode = 0; + ret = -ENOENT; + goto failed_to_read; + } + + bio = bio_alloc(GFP_NOIO, 1); + if (!bio) { + ret = -ENOMEM; + goto failed_bio_alloc; + } + + page = alloc_page(GFP_NOIO); + if (!page) { + ret = -ENOMEM; + goto failed_to_alloc_page; + } + + access_last_sector = 0; + ret = invalidate_vbmeta_submit(bio, bdev, rw, access_last_sector, page); + if (ret) { + DMERR("invalidate_vbmeta: error reading"); + goto failed_to_submit_read; + } + + /* We have a page. Let's make sure it looks right. */ + if (memcmp("AVB0", page_address(page), 4) == 0) { + /* Stamp it. */ + memcpy(page_address(page), "AVE0", 4); + DMINFO("invalidate_vbmeta: found vbmeta partition"); + } else { + /* Could be this is on a AVB footer, check. Also, since the + * AVB footer is in the last 64 bytes, adjust for the fact that + * we're dealing with 512-byte sectors. + */ + size_t offset = (1<bi_remaining. + */ + bio_reset(bio); + + rw |= REQ_WRITE; + ret = invalidate_vbmeta_submit(bio, bdev, rw, access_last_sector, page); + if (ret) { + DMERR("invalidate_vbmeta: error writing"); + goto failed_to_submit_write; + } + + DMERR("invalidate_vbmeta: completed."); + ret = 0; +failed_to_submit_write: +failed_to_write: +invalid_header: + __free_page(page); +failed_to_submit_read: + /* Technically, we'll leak a page with the pending bio, but + * we're about to reboot anyway. + */ +failed_to_alloc_page: + bio_put(bio); +failed_bio_alloc: + if (dev_mode) + blkdev_put(bdev, dev_mode); +failed_to_read: + return ret; +} + +void dm_verity_avb_error_handler(void) +{ + dev_t dev; + + DMINFO("AVB error handler called for %s", avb_vbmeta_device); + + if (avb_vbmeta_device[0] == '\0') { + DMERR("avb_vbmeta_device parameter not set"); + goto fail_no_dev; + } + + dev = name_to_dev_t(avb_vbmeta_device); + if (!dev) { + DMERR("No matching partition for device: %s", + avb_vbmeta_device); + goto fail_no_dev; + } + + invalidate_vbmeta(dev); + +fail_no_dev: + ; +} + +static int __init dm_verity_avb_init(void) +{ + DMINFO("AVB error handler initialized with vbmeta device: %s", + avb_vbmeta_device); + return 0; +} + +static void __exit dm_verity_avb_exit(void) +{ +} + +module_init(dm_verity_avb_init); +module_exit(dm_verity_avb_exit); + +MODULE_AUTHOR("David Zeuthen "); +MODULE_DESCRIPTION("AVB-specific error handler for dm-verity"); +MODULE_LICENSE("GPL"); + +/* Declare parameter with no module prefix */ +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "androidboot.vbmeta." 
+module_param_string(device, avb_vbmeta_device, sizeof(avb_vbmeta_device), 0); diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index c7e97cf6e7fb..e34cf53bd068 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -233,8 +233,12 @@ out: if (v->mode == DM_VERITY_MODE_LOGGING) return 0; - if (v->mode == DM_VERITY_MODE_RESTART) + if (v->mode == DM_VERITY_MODE_RESTART) { +#ifdef CONFIG_DM_VERITY_AVB + dm_verity_avb_error_handler(); +#endif kernel_restart("dm-verity device corrupted"); + } return 1; } diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 75effca400a3..a90d1d416107 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -136,4 +136,5 @@ extern void verity_io_hints(struct dm_target *ti, struct queue_limits *limits); extern void verity_dtr(struct dm_target *ti); extern int verity_ctr(struct dm_target *ti, unsigned argc, char **argv); extern int verity_map(struct dm_target *ti, struct bio *bio); +extern void dm_verity_avb_error_handler(void); #endif /* DM_VERITY_H */ -- cgit v1.2.3 From 0f3b6e26eb2ebb0c66f20ed53134e26496c62dd7 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 20 Apr 2017 18:05:02 -0700 Subject: ANDROID: sdcardfs: Use filesystem specific hash We weren't accounting for FS specific hash functions, causing us to miss negative dentries for any FS that had one. Similar to a patch from esdfs commit 75bd25a9476d ("esdfs: support lower's own hash") Signed-off-by: Daniel Rosenberg Change-Id: I32d1ba304d728e0ca2648cacfb4c2e441ae63608 --- fs/sdcardfs/lookup.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index a0f221501b4c..446ef4027ebc 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -366,8 +366,13 @@ put_name: /* instatiate a new negative dentry */ dname.name = name->name; dname.len = name->len; - dname.hash = full_name_hash(dname.name, dname.len); - lower_dentry = d_lookup(lower_dir_dentry, &dname); + + /* See if the low-level filesystem might want + * to use its own hash + */ + lower_dentry = d_hash_and_lookup(lower_dir_dentry, &dname); + if (IS_ERR(lower_dentry)) + return lower_dentry; if (lower_dentry) goto setup_lower; -- cgit v1.2.3 From b4840d3bba899cd1f2434fb9f1277b36673a4b3e Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 20 Apr 2017 18:21:50 -0700 Subject: Revert "Revert "Android: sdcardfs: Don't do d_add for lower fs"" This reverts commit ffa75fdb9c408f49b9622b6d55752ed99ff61488. Turns out we just needed the right hash. Signed-off-by: Daniel Rosenberg Bug: 37231161 Change-Id: I6a6de7f7df99ad42b20fa062913b219f64020c31 --- fs/sdcardfs/lookup.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index 446ef4027ebc..509d5fbcb472 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -373,17 +373,15 @@ put_name: lower_dentry = d_hash_and_lookup(lower_dir_dentry, &dname); if (IS_ERR(lower_dentry)) return lower_dentry; - if (lower_dentry) - goto setup_lower; - - lower_dentry = d_alloc(lower_dir_dentry, &dname); if (!lower_dentry) { - err = -ENOMEM; + /* We called vfs_path_lookup earlier, and did not get a negative + * dentry then. Don't confuse the lower filesystem by forcing + * one on it now... 
+ */ + err = -ENOENT; goto out; } - d_add(lower_dentry, NULL); /* instantiate and hash */ -setup_lower: lower_path.dentry = lower_dentry; lower_path.mnt = mntget(lower_dir_mnt); sdcardfs_set_lower_path(dentry, &lower_path); -- cgit v1.2.3 From 46d925efcc72a2b52d404ad8a457a5bdc7c18b83 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 24 Apr 2017 16:10:21 -0700 Subject: ANDROID: sdcardfs: Copy meta-data from lower inode From wrapfs commit 3ee9b365e38c ("Wrapfs: properly copy meta-data after AIO operations from lower inode") Signed-off-by: Daniel Rosenberg Bug: 35766959 Change-Id: I9a789222e27a17b8d85ce61c45397d1839f9a675 --- fs/sdcardfs/file.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index 1f6921e2ffbf..6076c342dae6 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -358,9 +358,12 @@ ssize_t sdcardfs_read_iter(struct kiocb *iocb, struct iov_iter *iter) get_file(lower_file); /* prevent lower_file from being released */ iocb->ki_filp = lower_file; err = lower_file->f_op->read_iter(iocb, iter); - /* ? wait IO finish to update atime as ecryptfs ? */ iocb->ki_filp = file; fput(lower_file); + /* update upper inode atime as needed */ + if (err >= 0 || err == -EIOCBQUEUED) + fsstack_copy_attr_atime(file->f_path.dentry->d_inode, + file_inode(lower_file)); out: return err; } @@ -384,6 +387,13 @@ ssize_t sdcardfs_write_iter(struct kiocb *iocb, struct iov_iter *iter) err = lower_file->f_op->write_iter(iocb, iter); iocb->ki_filp = file; fput(lower_file); + /* update upper inode times/sizes as needed */ + if (err >= 0 || err == -EIOCBQUEUED) { + fsstack_copy_inode_size(file->f_path.dentry->d_inode, + file_inode(lower_file)); + fsstack_copy_attr_times(file->f_path.dentry->d_inode, + file_inode(lower_file)); + } out: return err; } -- cgit v1.2.3 From 33fddbee41d511bd1f1946f3dd66351d6895c30f Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 24 Apr 2017 16:11:03 -0700 Subject: ANDROID: sdcardfs: Avoid setting GIDs outside of valid ranges When setting up the ownership of files on the lower filesystem, ensure that these values are in reasonable ranges for apps. 
If they aren't, default to AID_MEDIA_RW Signed-off-by: Daniel Rosenberg Bug: 37516160 Change-Id: I0bec76a61ac72aff0b993ab1ad04be8382178a00 --- fs/sdcardfs/derived_perm.c | 8 ++++---- fs/sdcardfs/multiuser.h | 7 +++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index cddfc79ea365..b4595aab5713 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -215,16 +215,16 @@ void fixup_lower_ownership(struct dentry *dentry, const char *name) gid = AID_MEDIA_OBB; break; case PERM_ANDROID_PACKAGE: - if (info->d_uid != 0) + if (uid_is_app(info->d_uid)) gid = multiuser_get_ext_gid(info->d_uid); else - gid = multiuser_get_uid(info->userid, uid); + gid = multiuser_get_uid(info->userid, AID_MEDIA_RW); break; case PERM_ANDROID_PACKAGE_CACHE: - if (info->d_uid != 0) + if (uid_is_app(info->d_uid)) gid = multiuser_get_ext_cache_gid(info->d_uid); else - gid = multiuser_get_uid(info->userid, uid); + gid = multiuser_get_uid(info->userid, AID_MEDIA_RW); break; case PERM_PRE_ROOT: default: diff --git a/fs/sdcardfs/multiuser.h b/fs/sdcardfs/multiuser.h index d0c925cda299..85341e753f8c 100644 --- a/fs/sdcardfs/multiuser.h +++ b/fs/sdcardfs/multiuser.h @@ -35,6 +35,13 @@ static inline uid_t multiuser_get_uid(userid_t user_id, appid_t app_id) return (user_id * AID_USER_OFFSET) + (app_id % AID_USER_OFFSET); } +static inline bool uid_is_app(uid_t uid) +{ + appid_t appid = uid % AID_USER_OFFSET; + + return appid >= AID_APP_START && appid <= AID_APP_END; +} + static inline gid_t multiuser_get_ext_cache_gid(uid_t uid) { return uid - AID_APP_START + AID_EXT_CACHE_GID_START; -- cgit v1.2.3 From b878b260109386898ece2fc99ff2da255acd9c10 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 24 Apr 2017 19:49:02 -0700 Subject: ANDROID: sdcardfs: Call lower fs's revalidate We should be calling the lower filesystem's revalidate inside of sdcardfs's revalidate, as wrapfs does. Signed-off-by: Daniel Rosenberg Bug: 35766959 Change-Id: I939d1c4192fafc1e21678aeab43fe3d588b8e2f4 --- fs/sdcardfs/dentry.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c index 8e31d1a80f0c..7a19e77fce99 100644 --- a/fs/sdcardfs/dentry.c +++ b/fs/sdcardfs/dentry.c @@ -60,6 +60,14 @@ static int sdcardfs_d_revalidate(struct dentry *dentry, unsigned int flags) lower_dentry = lower_path.dentry; lower_cur_parent_dentry = dget_parent(lower_dentry); + if ((lower_dentry->d_flags & DCACHE_OP_REVALIDATE)) { + err = lower_dentry->d_op->d_revalidate(lower_dentry, flags); + if (err == 0) { + d_drop(dentry); + goto out; + } + } + spin_lock(&lower_dentry->d_lock); if (d_unhashed(lower_dentry)) { spin_unlock(&lower_dentry->d_lock); -- cgit v1.2.3 From b5737b92560efcb956d2def4dcd3f4b6d4118e58 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 18 Apr 2017 15:31:07 +0100 Subject: KEYS: Disallow keyrings beginning with '.' to be joined as session keyrings commit ee8f844e3c5a73b999edf733df1c529d6503ec2f upstream. This fixes CVE-2016-9604. Keyrings whose name begin with a '.' are special internal keyrings and so userspace isn't allowed to create keyrings by this name to prevent shadowing. However, the patch that added the guard didn't fix KEYCTL_JOIN_SESSION_KEYRING. Not only can that create dot-named keyrings, it can also subscribe to them as a session keyring if they grant SEARCH permission to the user. 
This, for example, allows a root process to set .builtin_trusted_keys as its session keyring, at which point it has full access because now the possessor permissions are added. This permits root to add extra public keys, thereby bypassing module verification. This also affects kexec and IMA. This can be tested by (as root): keyctl session .builtin_trusted_keys keyctl add user a a @s keyctl list @s which on my test box gives me: 2 keys in keyring: 180010936: ---lswrv 0 0 asymmetric: Build time autogenerated kernel key: ae3d4a31b82daa8e1a75b49dc2bba949fd992a05 801382539: --alswrv 0 0 user: a Fix this by rejecting names beginning with a '.' in the keyctl. Signed-off-by: David Howells Acked-by: Mimi Zohar cc: linux-ima-devel@lists.sourceforge.net Signed-off-by: Greg Kroah-Hartman --- security/keys/keyctl.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 1c3872aeed14..4ffb51ff0a61 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -271,7 +271,8 @@ error: * Create and join an anonymous session keyring or join a named session * keyring, creating it if necessary. A named session keyring must have Search * permission for it to be joined. Session keyrings without this permit will - * be skipped over. + * be skipped over. It is not permitted for userspace to create or join + * keyrings whose name begin with a dot. * * If successful, the ID of the joined session keyring will be returned. */ @@ -288,12 +289,16 @@ long keyctl_join_session_keyring(const char __user *_name) ret = PTR_ERR(name); goto error; } + + ret = -EPERM; + if (name[0] == '.') + goto error_name; } /* join the session */ ret = join_session_keyring(name); +error_name: kfree(name); - error: return ret; } -- cgit v1.2.3 From eb78d987757967749d0b2e82fce0314697937ee5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 18 Apr 2017 15:31:08 +0100 Subject: KEYS: Change the name of the dead type to ".dead" to prevent user access commit c1644fe041ebaf6519f6809146a77c3ead9193af upstream. This fixes CVE-2017-6951. Userspace should not be able to do things with the "dead" key type as it doesn't have some of the helper functions set upon it that the kernel needs. Attempting to use it may cause the kernel to crash. Fix this by changing the name of the type to ".dead" so that it's rejected up front on userspace syscalls by key_get_type_from_user(). Though this doesn't seem to affect recent kernels, it does affect older ones, certainly those prior to: commit c06cfb08b88dfbe13be44a69ae2fdc3a7c902d81 Author: David Howells Date: Tue Sep 16 17:36:06 2014 +0100 KEYS: Remove key_type::match in favour of overriding default by match_preparse which went in before 3.18-rc1. Signed-off-by: David Howells Signed-off-by: Greg Kroah-Hartman --- security/keys/gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/keys/gc.c b/security/keys/gc.c index addf060399e0..9cb4fe4478a1 100644 --- a/security/keys/gc.c +++ b/security/keys/gc.c @@ -46,7 +46,7 @@ static unsigned long key_gc_flags; * immediately unlinked. */ struct key_type key_type_dead = { - .name = "dead", + .name = ".dead", }; /* -- cgit v1.2.3 From c9460fbceb2f3efa1d20050cdbffa51ec025745a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 18 Apr 2017 15:31:09 +0100 Subject: KEYS: fix keyctl_set_reqkey_keyring() to not leak thread keyrings commit c9f838d104fed6f2f61d68164712e3204bf5271b upstream. This fixes CVE-2017-7472. 
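The call being abused is the KEYCTL_SET_REQKEY_KEYRING operation,
exposed by libkeyutils as keyctl_set_reqkey_keyring(); it selects which
keyring request_key() uses to cache keys for the caller and returns the
previous setting. As a minimal sketch of the query side of that
interface (assuming libkeyutils and its keyutils.h header are
installed; build with -lkeyutils), the current default can be read
without changing anything by passing KEY_REQKEY_DEFL_NO_CHANGE:

    #include <keyutils.h>
    #include <stdio.h>

    int main(void)
    {
        /* KEY_REQKEY_DEFL_NO_CHANGE (-1) only queries the setting */
        long old = keyctl_set_reqkey_keyring(KEY_REQKEY_DEFL_NO_CHANGE);

        if (old < 0) {
            perror("keyctl_set_reqkey_keyring");
            return 1;
        }
        printf("current reqkey default: %ld\n", old);
        return 0;
    }

The reproducer below uses the same header and library.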
Running the following program as an unprivileged user exhausts kernel memory by leaking thread keyrings: #include int main() { for (;;) keyctl_set_reqkey_keyring(KEY_REQKEY_DEFL_THREAD_KEYRING); } Fix it by only creating a new thread keyring if there wasn't one before. To make things more consistent, make install_thread_keyring_to_cred() and install_process_keyring_to_cred() both return 0 if the corresponding keyring is already present. Fixes: d84f4f992cbd ("CRED: Inaugurate COW credentials") Signed-off-by: Eric Biggers Signed-off-by: David Howells Signed-off-by: Greg Kroah-Hartman --- security/keys/keyctl.c | 11 ++++------- security/keys/process_keys.c | 44 +++++++++++++++++++++++++++----------------- 2 files changed, 31 insertions(+), 24 deletions(-) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 4ffb51ff0a61..442e350c209d 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1228,8 +1228,8 @@ error: * Read or set the default keyring in which request_key() will cache keys and * return the old setting. * - * If a process keyring is specified then this will be created if it doesn't - * yet exist. The old setting will be returned if successful. + * If a thread or process keyring is specified then it will be created if it + * doesn't yet exist. The old setting will be returned if successful. */ long keyctl_set_reqkey_keyring(int reqkey_defl) { @@ -1254,11 +1254,8 @@ long keyctl_set_reqkey_keyring(int reqkey_defl) case KEY_REQKEY_DEFL_PROCESS_KEYRING: ret = install_process_keyring_to_cred(new); - if (ret < 0) { - if (ret != -EEXIST) - goto error; - ret = 0; - } + if (ret < 0) + goto error; goto set; case KEY_REQKEY_DEFL_DEFAULT: diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index e6d50172872f..4ed909142956 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -125,13 +125,18 @@ error: } /* - * Install a fresh thread keyring directly to new credentials. This keyring is - * allowed to overrun the quota. + * Install a thread keyring to the given credentials struct if it didn't have + * one already. This is allowed to overrun the quota. + * + * Return: 0 if a thread keyring is now present; -errno on failure. */ int install_thread_keyring_to_cred(struct cred *new) { struct key *keyring; + if (new->thread_keyring) + return 0; + keyring = keyring_alloc("_tid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL); @@ -143,7 +148,9 @@ int install_thread_keyring_to_cred(struct cred *new) } /* - * Install a fresh thread keyring, discarding the old one. + * Install a thread keyring to the current task if it didn't have one already. + * + * Return: 0 if a thread keyring is now present; -errno on failure. */ static int install_thread_keyring(void) { @@ -154,8 +161,6 @@ static int install_thread_keyring(void) if (!new) return -ENOMEM; - BUG_ON(new->thread_keyring); - ret = install_thread_keyring_to_cred(new); if (ret < 0) { abort_creds(new); @@ -166,17 +171,17 @@ static int install_thread_keyring(void) } /* - * Install a process keyring directly to a credentials struct. + * Install a process keyring to the given credentials struct if it didn't have + * one already. This is allowed to overrun the quota. * - * Returns -EEXIST if there was already a process keyring, 0 if one installed, - * and other value on any other error + * Return: 0 if a process keyring is now present; -errno on failure. 
*/ int install_process_keyring_to_cred(struct cred *new) { struct key *keyring; if (new->process_keyring) - return -EEXIST; + return 0; keyring = keyring_alloc("_pid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, @@ -189,11 +194,9 @@ int install_process_keyring_to_cred(struct cred *new) } /* - * Make sure a process keyring is installed for the current process. The - * existing process keyring is not replaced. + * Install a process keyring to the current task if it didn't have one already. * - * Returns 0 if there is a process keyring by the end of this function, some - * error otherwise. + * Return: 0 if a process keyring is now present; -errno on failure. */ static int install_process_keyring(void) { @@ -207,14 +210,18 @@ static int install_process_keyring(void) ret = install_process_keyring_to_cred(new); if (ret < 0) { abort_creds(new); - return ret != -EEXIST ? ret : 0; + return ret; } return commit_creds(new); } /* - * Install a session keyring directly to a credentials struct. + * Install the given keyring as the session keyring of the given credentials + * struct, replacing the existing one if any. If the given keyring is NULL, + * then install a new anonymous session keyring. + * + * Return: 0 on success; -errno on failure. */ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring) { @@ -249,8 +256,11 @@ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring) } /* - * Install a session keyring, discarding the old one. If a keyring is not - * supplied, an empty one is invented. + * Install the given keyring as the session keyring of the current task, + * replacing the existing one if any. If the given keyring is NULL, then + * install a new anonymous session keyring. + * + * Return: 0 on success; -errno on failure. */ static int install_session_keyring(struct key *keyring) { -- cgit v1.2.3 From 1dfb1c7bd63f7ea8975f32ddd04a9c4406f8d64d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 19 Apr 2017 12:07:08 -0400 Subject: tracing: Allocate the snapshot buffer before enabling probe commit df62db5be2e5f070ecd1a5ece5945b590ee112e0 upstream. Currently the snapshot trigger enables the probe and then allocates the snapshot. If the probe triggers before the allocation, it could cause the snapshot to fail and turn tracing off. It's best to allocate the snapshot buffer first, and then enable the trigger. If something goes wrong in the enabling of the trigger, the snapshot buffer is still allocated, but it can also be freed by the user by writting zero into the snapshot buffer file. Also add a check of the return status of alloc_snapshot(). Fixes: 77fd5c15e3 ("tracing: Add snapshot trigger to function probes") Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 059233abcfcf..4c21c0b7dc91 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6060,11 +6060,13 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash, return ret; out_reg: - ret = register_ftrace_function_probe(glob, ops, count); + ret = alloc_snapshot(&global_trace); + if (ret < 0) + goto out; - if (ret >= 0) - alloc_snapshot(&global_trace); + ret = register_ftrace_function_probe(glob, ops, count); + out: return ret < 0 ? 
ret : 0;
}
-- 
cgit v1.2.3 

From a2a67e53f92f9a3b5d94671b17b043eb56ec79ab Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Wed, 19 Apr 2017 14:29:46 -0400
Subject: ring-buffer: Have ring_buffer_iter_empty() return true when empty

commit 78f7a45dac2a2d2002f98a3a95f7979867868d73 upstream.

I noticed that reading the snapshot file when it is empty no longer
gives a status. It is supposed to show the status of the snapshot
buffer as well as how to allocate and use it. For example:

 ># cat snapshot
 # tracer: nop
 #
 #
 # * Snapshot is allocated *
 #
 # Snapshot commands:
 # echo 0 > snapshot : Clears and frees snapshot buffer
 # echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.
 #                      Takes a snapshot of the main buffer.
 # echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)
 #                      (Doesn't have to be '2' works with any number that
 #                       is not a '0' or '1')

But instead it just showed an empty buffer:

 ># cat snapshot
 # tracer: nop
 #
 # entries-in-buffer/entries-written: 0/0   #P:4
 #
 #                              _-----=> irqs-off
 #                             / _----=> need-resched
 #                            | / _---=> hardirq/softirq
 #                            || / _--=> preempt-depth
 #                            ||| /     delay
 #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
 #              | |       |   ||||       |         |

What happened was that it was using the ring_buffer_iter_empty()
function to see if it was empty, and if it was, it showed the status.
But that function was returning false when it was empty. The reason was
that the iter header page was on the reader page, and the reader page
was empty, but so was the buffer itself. The check only tested to see
if the iter was on the commit page, but the commit page was no longer
pointing to the reader page, but as all pages were empty, the buffer
is also.

Fixes: 651e22f2701b ("ring-buffer: Always reset iterator to reader page")
Signed-off-by: Steven Rostedt (VMware)
Signed-off-by: Greg Kroah-Hartman
---
 kernel/trace/ring_buffer.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7d7f99b0db47..1275175b0946 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3440,11 +3440,23 @@ EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
 int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
 {
         struct ring_buffer_per_cpu *cpu_buffer;
+        struct buffer_page *reader;
+        struct buffer_page *head_page;
+        struct buffer_page *commit_page;
+        unsigned commit;
 
         cpu_buffer = iter->cpu_buffer;
 
-        return iter->head_page == cpu_buffer->commit_page &&
-                iter->head == rb_commit_index(cpu_buffer);
+        /* Remember, trace recording is off when iterator is in use */
+        reader = cpu_buffer->reader_page;
+        head_page = cpu_buffer->head_page;
+        commit_page = cpu_buffer->commit_page;
+        commit = rb_page_commit(commit_page);
+
+        return ((iter->head_page == commit_page && iter->head == commit) ||
+                (iter->head_page == reader && commit_page == head_page &&
+                 head_page->read == commit &&
+                 iter->head == rb_page_commit(cpu_buffer->reader_page)));
 }
 EXPORT_SYMBOL_GPL(ring_buffer_iter_empty);
-- 
cgit v1.2.3 

From f8fe51c86583ccb38263277ee471f77053a9482a Mon Sep 17 00:00:00 2001
From: Sachin Prabhu
Date: Sun, 16 Apr 2017 20:37:24 +0100
Subject: cifs: Do not send echoes before Negotiate is complete

commit 62a6cfddcc0a5313e7da3e8311ba16226fe0ac10 upstream.

commit 4fcd1813e640 ("Fix reconnect to not defer smb3 session reconnect
long after socket reconnect") added support for Negotiate requests to
be initiated by echo calls.
To avoid delays in calling echo after a reconnect, I added the patch
introduced by the commit b8c600120fc8 ("Call echo service immediately
after socket reconnect").

This has however caused a regression with cifs shares which do not
have support for echo calls to trigger Negotiate requests. On
connections which need to call Negotiation, the echo calls trigger an
error which triggers a reconnect which in turn triggers another echo
call. This results in a loop which is only broken when an operation is
performed on the cifs share. For an idle share, it can DOS a server.

The patch uses the smb_operation can_echo() for cifs so that it is
called only if the connection has already been set up.

kernel bz: 194531

Signed-off-by: Sachin Prabhu
Tested-by: Jonathan Liu
Acked-by: Pavel Shilovsky
Signed-off-by: Steve French
Signed-off-by: Greg Kroah-Hartman
---
 fs/cifs/smb1ops.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index fc537c29044e..87b87e091e8e 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -1015,6 +1015,15 @@ cifs_dir_needs_close(struct cifsFileInfo *cfile)
         return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle;
 }
 
+static bool
+cifs_can_echo(struct TCP_Server_Info *server)
+{
+        if (server->tcpStatus == CifsGood)
+                return true;
+
+        return false;
+}
+
 struct smb_version_operations smb1_operations = {
         .send_cancel = send_nt_cancel,
         .compare_fids = cifs_compare_fids,
@@ -1049,6 +1058,7 @@ struct smb_version_operations smb1_operations = {
         .get_dfs_refer = CIFSGetDFSRefer,
         .qfs_tcon = cifs_qfs_tcon,
         .is_path_accessible = cifs_is_path_accessible,
+        .can_echo = cifs_can_echo,
         .query_path_info = cifs_query_path_info,
         .query_file_info = cifs_query_file_info,
         .get_srv_inum = cifs_get_srv_inum,
-- 
cgit v1.2.3 

From 859d615b5be1a6123b68f08233e548ee7f4e4298 Mon Sep 17 00:00:00 2001
From: Germano Percossi
Date: Fri, 7 Apr 2017 12:29:37 +0100
Subject: CIFS: remove bad_network_name flag

commit a0918f1ce6a43ac980b42b300ec443c154970979 upstream.

STATUS_BAD_NETWORK_NAME can be received during node failover, causing
the flag to be set and making the reconnect thread always unsuccessful,
thereafter.

Once the only place where it is set is removed, the remaining bits are
rendered moot.

Removing it does not prevent "mount" from failing when a non existent
share is passed.

What happens when the share really ceases to exist while the share is
mounted is undefined now as much as it was before.
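The failure mode being removed here is a sticky error flag: one
transient STATUS_BAD_NETWORK_NAME during failover permanently poisons
the tcon, so every later reconnect attempt fails before reaching the
server. A reduced sketch of the pattern in plain C (hypothetical names,
not the actual cifs structures):

    #include <stdbool.h>
    #include <errno.h>

    /* Sketch only: stand-in for the connection state */
    struct tcon_state {
        bool bad_network_name; /* set once on failover, never cleared */
    };

    static int tree_connect(struct tcon_state *t)
    {
        if (t->bad_network_name)
            return -ENOENT; /* retry fails before touching the wire */

        /* ... issue the real tree connect request here ... */
        return 0;
    }

    int main(void)
    {
        struct tcon_state t = { .bad_network_name = true };

        /* once the flag is set, reconnects can never succeed */
        return tree_connect(&t) == -ENOENT ? 0 : 1;
    }

Dropping the flag lets each reconnect retry the tree connect, so the
mount recovers as soon as the share is reachable again.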
Signed-off-by: Germano Percossi Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifsglob.h | 1 - fs/cifs/smb2pdu.c | 5 ----- 2 files changed, 6 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index b76883606e4b..94906aaa9b7c 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -906,7 +906,6 @@ struct cifs_tcon { bool use_persistent:1; /* use persistent instead of durable handles */ #ifdef CONFIG_CIFS_SMB2 bool print:1; /* set if connection to printer share */ - bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */ __le32 capabilities; __u32 share_flags; __u32 maximal_access; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 6cb5c4b30e78..6cb2603f8a5c 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -932,9 +932,6 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, else return -EIO; - if (tcon && tcon->bad_network_name) - return -ENOENT; - if ((tcon && tcon->seal) && ((ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) == 0)) { cifs_dbg(VFS, "encryption requested but no server support"); @@ -1036,8 +1033,6 @@ tcon_exit: tcon_error_exit: if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) { cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); - if (tcon) - tcon->bad_network_name = true; } goto tcon_exit; } -- cgit v1.2.3 From 702db976b8574745b20492fc2a89abfa4ec2bef1 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Sun, 9 Apr 2017 22:09:38 +0200 Subject: s390/mm: fix CMMA vs KSM vs others commit a8f60d1fadf7b8b54449fcc9d6b15248917478ba upstream. On heavy paging with KSM I see guest data corruption. Turns out that KSM will add pages to its tree, where the mapping return true for pte_unused (or might become as such later). KSM will unmap such pages and reinstantiate with different attributes (e.g. write protected or special, e.g. in replace_page or write_protect_page)). This uncovered a bug in our pagetable handling: We must remove the unused flag as soon as an entry becomes present again. Signed-of-by: Christian Borntraeger Signed-off-by: Martin Schwidefsky Signed-off-by: Greg Kroah-Hartman --- arch/s390/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 024f85f947ae..e2c0e4eab037 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -829,6 +829,8 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, { pgste_t pgste; + if (pte_present(entry)) + pte_val(entry) &= ~_PAGE_UNUSED; if (mm_has_pgste(mm)) { pgste = pgste_get_lock(ptep); pgste_val(pgste) &= ~_PGSTE_GPS_ZERO; -- cgit v1.2.3 From 5ab982a01201749f49b5a6a23b45b20a03490ce5 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 3 Jun 2016 17:09:24 -0700 Subject: Drivers: hv: don't leak memory in vmbus_establish_gpadl() commit 7cc80c98070ccc7940fc28811c92cca0a681015d upstream. In some cases create_gpadl_header() allocates submessages but we never free them. [sumits] Note for stable: Upstream commit 4d63763296ab7865a98bc29cc7d77145815ef89f: (Drivers: hv: get rid of redundant messagecount in create_gpadl_header()) changes the list usage to initialize list header in all cases; that patch isn't added to stable, so the current patch is modified a little bit from the upstream commit to check if the list is valid or not. Signed-off-by: Vitaly Kuznetsov Signed-off-by: K. Y. 
Srinivasan Signed-off-by: Sumit Semwal Signed-off-by: Greg Kroah-Hartman --- drivers/hv/channel.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 1ef37c727572..49d244929c4d 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -375,7 +375,7 @@ int vmbus_establish_gpadl(struct vmbus_channel *channel, void *kbuffer, struct vmbus_channel_gpadl_header *gpadlmsg; struct vmbus_channel_gpadl_body *gpadl_body; struct vmbus_channel_msginfo *msginfo = NULL; - struct vmbus_channel_msginfo *submsginfo; + struct vmbus_channel_msginfo *submsginfo, *tmp; u32 msgcount; struct list_head *curr; u32 next_gpadl_handle; @@ -437,6 +437,13 @@ cleanup: list_del(&msginfo->msglistentry); spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + if (msgcount > 1) { + list_for_each_entry_safe(submsginfo, tmp, &msginfo->submsglist, + msglistentry) { + kfree(submsginfo); + } + } + kfree(msginfo); return ret; } -- cgit v1.2.3 From 567dd48c4e71a8d6d3014adb153993ef8608722c Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 9 Jun 2016 17:08:56 -0700 Subject: Drivers: hv: get rid of timeout in vmbus_open() commit 396e287fa2ff46e83ae016cdcb300c3faa3b02f6 upstream. vmbus_teardown_gpadl() can result in an infinite wait when it is called on the 5 second timeout path in vmbus_open(). The issue is caused by the fact that gpadl teardown operation won't ever succeed for an opened channel and the timeout isn't always enough. As a guest, we can always trust the host to respond to our request (and there is nothing we can do if it doesn't). Signed-off-by: Vitaly Kuznetsov Signed-off-by: K. Y. Srinivasan Signed-off-by: Sumit Semwal Signed-off-by: Greg Kroah-Hartman --- drivers/hv/channel.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 49d244929c4d..d037454fe7b8 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -73,7 +73,6 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size, void *in, *out; unsigned long flags; int ret, err = 0; - unsigned long t; struct page *page; spin_lock_irqsave(&newchannel->lock, flags); @@ -183,11 +182,7 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size, goto error1; } - t = wait_for_completion_timeout(&open_info->waitevent, 5*HZ); - if (t == 0) { - err = -ETIMEDOUT; - goto error1; - } + wait_for_completion(&open_info->waitevent); spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); list_del(&open_info->msglistentry); -- cgit v1.2.3 From f803416632b5c31e647cf18861e4c379173a02e2 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Fri, 1 Jul 2016 16:26:36 -0700 Subject: Drivers: hv: vmbus: Reduce the delay between retries in vmbus_post_msg() commit 8de0d7e951826d7592e0ba1da655b175c4aa0923 upstream. The current delay between retries is unnecessarily high and is negatively affecting the time it takes to boot the system. Signed-off-by: K. Y.
Srinivasan Signed-off-by: Sumit Semwal Signed-off-by: Greg Kroah-Hartman --- drivers/hv/connection.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 4fc2e8836e60..2bbc53025549 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -429,7 +429,7 @@ int vmbus_post_msg(void *buffer, size_t buflen) union hv_connection_id conn_id; int ret = 0; int retries = 0; - u32 msec = 1; + u32 usec = 1; conn_id.asu32 = 0; conn_id.u.id = VMBUS_MESSAGE_CONNECTION_ID; @@ -462,9 +462,9 @@ int vmbus_post_msg(void *buffer, size_t buflen) } retries++; - msleep(msec); - if (msec < 2048) - msec *= 2; + udelay(usec); + if (usec < 2048) + usec *= 2; } return ret; } -- cgit v1.2.3 From 8d5ed79fb2d766e57b5220a3e6054d99f1c985d0 Mon Sep 17 00:00:00 2001 From: Jorgen Hansen Date: Tue, 5 Apr 2016 01:59:32 -0700 Subject: VSOCK: Detach QP check should filter out non matching QPs. commit 8ab18d71de8b07d2c4d6f984b718418c09ea45c5 upstream. The check in vmci_transport_peer_detach_cb should only allow a detach when the qp handle of the transport matches the one in the detach message. Testing: Before this change, a detach from a peer on a different socket would cause an active stream socket to register a detach. Reviewed-by: George Zhang Signed-off-by: Jorgen Hansen Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/vmw_vsock/vmci_transport.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 0a369bb440e7..662bdd20a748 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -842,7 +842,7 @@ static void vmci_transport_peer_detach_cb(u32 sub_id, * qp_handle. */ if (vmci_handle_is_invalid(e_payload->handle) || - vmci_handle_is_equal(trans->qp_handle, e_payload->handle)) + !vmci_handle_is_equal(trans->qp_handle, e_payload->handle)) return; /* We don't ask for delayed CBs when we subscribe to this event (we @@ -2154,7 +2154,7 @@ module_exit(vmci_transport_exit); MODULE_AUTHOR("VMware, Inc."); MODULE_DESCRIPTION("VMCI transport for Virtual Sockets"); -MODULE_VERSION("1.0.2.0-k"); +MODULE_VERSION("1.0.3.0-k"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("vmware_vsock"); MODULE_ALIAS_NETPROTO(PF_VSOCK); -- cgit v1.2.3 From cdede60d6a308a311b0999b826fa4cc5261632c8 Mon Sep 17 00:00:00 2001 From: Thorsten Leemhuis Date: Tue, 18 Apr 2017 11:14:28 -0700 Subject: Input: elantech - add Fujitsu Lifebook E547 to force crc_enabled commit 704de489e0e3640a2ee2d0daf173e9f7375582ba upstream. Temporarily got a Lifebook E547 into my hands and noticed the touchpad only works after running: echo "1" > /sys/devices/platform/i8042/serio2/crc_enabled Add it to the list of machines that need this workaround. Signed-off-by: Thorsten Leemhuis Reviewed-by: Ulrik De Bie Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/elantech.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c index 43482ae1e049..1a2b2620421e 100644 --- a/drivers/input/mouse/elantech.c +++ b/drivers/input/mouse/elantech.c @@ -1122,6 +1122,7 @@ static int elantech_get_resolution_v4(struct psmouse *psmouse, * Asus UX32VD 0x361f02 00, 15, 0e clickpad * Avatar AVIU-145A2 0x361f00 ?
clickpad * Fujitsu LIFEBOOK E544 0x470f00 d0, 12, 09 2 hw buttons + * Fujitsu LIFEBOOK E547 0x470f00 50, 12, 09 2 hw buttons * Fujitsu LIFEBOOK E554 0x570f01 40, 14, 0c 2 hw buttons * Fujitsu T725 0x470f01 05, 12, 09 2 hw buttons * Fujitsu H730 0x570f00 c0, 14, 0c 3 hw buttons (**) @@ -1527,6 +1528,13 @@ static const struct dmi_system_id elantech_dmi_force_crc_enabled[] = { DMI_MATCH(DMI_PRODUCT_NAME, "LIFEBOOK E544"), }, }, + { + /* Fujitsu LIFEBOOK E547 does not work with crc_enabled == 0 */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU"), + DMI_MATCH(DMI_PRODUCT_NAME, "LIFEBOOK E547"), + }, + }, { /* Fujitsu LIFEBOOK E554 does not work with crc_enabled == 0 */ .matches = { -- cgit v1.2.3 From 6986d0d29f3cda9f558461202d86464403454574 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Apr 2017 19:47:04 +0200 Subject: ACPI / power: Avoid maybe-uninitialized warning commit fe8c470ab87d90e4b5115902dd94eced7e3305c3 upstream. gcc -O2 cannot always prove that the loop in acpi_power_get_inferred_state() is entered at least once, so it assumes that cur_state might not get initialized: drivers/acpi/power.c: In function 'acpi_power_get_inferred_state': drivers/acpi/power.c:222:9: error: 'cur_state' may be used uninitialized in this function [-Werror=maybe-uninitialized] This sets the variable to zero at the start of the loop, to ensure that there is well-defined behavior even for an empty list. This gets rid of the warning. The warning first showed up when the -Os flag got removed in a bug fix patch in linux-4.11-rc5. I would suggest merging this addon patch on top of that bug fix to avoid introducing a new warning in the stable kernels. Fixes: 61b79e16c68d (ACPI: Fix incompatibility with mcount-based function graph tracing) Signed-off-by: Arnd Bergmann Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/power.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c index fcd4ce6f78d5..1c2b846c5776 100644 --- a/drivers/acpi/power.c +++ b/drivers/acpi/power.c @@ -200,6 +200,7 @@ static int acpi_power_get_list_state(struct list_head *list, int *state) return -EINVAL; /* The state of the list is 'on' IFF all resources are 'on'. */ + cur_state = 0; list_for_each_entry(entry, list, node) { struct acpi_power_resource *resource = entry->resource; acpi_handle handle = resource->device.handle; -- cgit v1.2.3 From b74ba9dd91e53a3f182e7a2a9cefe709743c7a5f Mon Sep 17 00:00:00 2001 From: Haibo Chen Date: Wed, 19 Apr 2017 10:53:51 +0800 Subject: mmc: sdhci-esdhc-imx: increase the pad I/O drive strength for DDR50 card commit 9f327845358d3dd0d8a5a7a5436b0aa5c432e757 upstream. Currently, a DDR50 card needs tuning by default. We see tuning failures for DDR50 cards and some data CRC errors when a DDR50 SD card is in use. This is because the default pad I/O drive strength can't keep a DDR50 card working stably. So increase the pad I/O drive strength for DDR50 cards, and use pins_100mhz.
This fixes DDR50 card support for IMX since DDR50 tuning was enabled by commit 9faac7b95ea4 ("mmc: sdhci: enable tuning for DDR50"). Tested-and-reported-by: Tim Harvey Signed-off-by: Haibo Chen Acked-by: Dong Aisheng Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/sdhci-esdhc-imx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c index 1f1582f6cccb..8d838779fd1b 100644 --- a/drivers/mmc/host/sdhci-esdhc-imx.c +++ b/drivers/mmc/host/sdhci-esdhc-imx.c @@ -804,6 +804,7 @@ static int esdhc_change_pinstate(struct sdhci_host *host, switch (uhs) { case MMC_TIMING_UHS_SDR50: + case MMC_TIMING_UHS_DDR50: pinctrl = imx_data->pins_100mhz; break; case MMC_TIMING_UHS_SDR104: -- cgit v1.2.3 From b812c69019e421cf9b4fd7e57747e73e2b83c741 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 20 Apr 2017 21:32:16 +0200 Subject: mac80211: reject ToDS broadcast data frames commit 3018e947d7fd536d57e2b550c33e456d921fff8c upstream. AP/AP_VLAN modes don't accept any real 802.11 multicast data frames, but since they do need to accept broadcast management frames, the same is currently permitted for data frames. This opens a security problem because such frames would be decrypted with the GTK, and could even contain unicast L3 frames. Since the spec says that ToDS frames must always have the BSSID as the RA (addr1), reject any other data frames. The problem was originally reported in "Predicting, Decrypting, and Abusing WPA2/802.11 Group Keys" at usenix https://www.usenix.org/conference/usenixsecurity16/technical-sessions/presentation/vanhoef and brought to my attention by Jouni. Reported-by: Jouni Malinen Signed-off-by: Johannes Berg Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman -- --- net/mac80211/rx.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 2b528389409f..9f0915f72702 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3396,6 +3396,27 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) !ether_addr_equal(bssid, hdr->addr1)) return false; } + + /* + * 802.11-2016 Table 9-26 says that for data frames, A1 must be + * the BSSID - we've checked that already but may have accepted + * the wildcard (ff:ff:ff:ff:ff:ff). + * + * It also says: + * The BSSID of the Data frame is determined as follows: + * a) If the STA is contained within an AP or is associated + * with an AP, the BSSID is the address currently in use + * by the STA contained in the AP. + * + * So we should not accept data frames with an address that's + * multicast. + * + * Accepting it also opens a security problem because stations + * could encrypt it with the GTK and inject traffic that way. + */ + if (ieee80211_is_data(hdr->frame_control) && multicast) + return false; + return true; case NL80211_IFTYPE_WDS: if (bssid || !ieee80211_is_data(hdr->frame_control)) -- cgit v1.2.3 From 38be91ce7ea86386242a295230a517772f359df1 Mon Sep 17 00:00:00 2001 From: Sebastian Siewior Date: Wed, 22 Feb 2017 17:15:21 +0100 Subject: ubi/upd: Always flush after prepared for an update commit 9cd9a21ce070be8a918ffd3381468315a7a76ba6 upstream. In commit 6afaf8a484cb ("UBI: flush wl before clearing update marker") I managed to trigger and fix a similar bug.
Now here is another version which I assumed wouldn't matter back then, but it turns out UBI has a check for it and will error out like this: |ubi0 warning: validate_vid_hdr: inconsistent used_ebs |ubi0 error: validate_vid_hdr: inconsistent VID header at PEB 592 All you need to trigger this is "ubiupdatevol /dev/ubi0_0 file" + a powercut in the middle of the operation. ubi_start_update() sets the update-marker and puts all EBs on the erase list. After that userland can proceed to write new data while the old EBs aren't erased completely. A powercut at this point is usually not that much of a tragedy. UBI won't give read access to the static volume because it has the update marker. It will most likely set the corrupted flag because it misses some EBs. So we are all good. Unless the size of the image that has been written differs from the old image in the magnitude of at least one EB. In that case UBI will find two different values for `used_ebs' and refuse to attach the image with the error message mentioned above. So in order not to get into this situation, the patch ensures that we wait until everything is removed before it tries to write any data. The alternative would be to detect such a case and remove all EBs at attach time after we have processed the volume-table and seen the update-marker set. That patch would look bigger and I doubt it is worth it since usually the write() will wait from time to time for a new EB, as there are usually not that many spare EBs that can be used. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- drivers/mtd/ubi/upd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/mtd/ubi/upd.c b/drivers/mtd/ubi/upd.c index 0134ba32a057..39712560b4c1 100644 --- a/drivers/mtd/ubi/upd.c +++ b/drivers/mtd/ubi/upd.c @@ -148,11 +148,11 @@ int ubi_start_update(struct ubi_device *ubi, struct ubi_volume *vol, return err; } - if (bytes == 0) { - err = ubi_wl_flush(ubi, UBI_ALL, UBI_ALL); - if (err) - return err; + err = ubi_wl_flush(ubi, UBI_ALL, UBI_ALL); + if (err) + return err; + if (bytes == 0) { err = clear_update_marker(ubi, vol, 0); if (err) return err; -- cgit v1.2.3 From 6c107bba66dcc696ec56482e7c07fccfb00d75c4 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 11 Apr 2017 10:38:13 +0530 Subject: powerpc/kprobe: Fix oops when kprobed on 'stdu' instruction commit 9e1ba4f27f018742a1aa95d11e35106feba08ec1 upstream. If we set a kprobe on a 'stdu' instruction on powerpc64, we see a kernel OOPS: Bad kernel stack pointer cd93c840 at c000000000009868 Oops: Bad kernel stack pointer, sig: 6 [#1] ... GPR00: c000001fcd93cb30 00000000cd93c840 c0000000015c5e00 00000000cd93c840 ... NIP [c000000000009868] resume_kernel+0x2c/0x58 LR [c000000000006208] program_check_common+0x108/0x180 On a 64-bit system when the user probes on a 'stdu' instruction, the kernel does not emulate the actual store in emulate_step() because it may corrupt the exception frame. So the kernel does the actual store operation in the exception return code, i.e. resume_kernel(). resume_kernel() loads the saved stack pointer from memory using lwz, which only loads the low 32-bits of the address, causing the kernel crash. Fix this by loading the 64-bit value instead. Fixes: be96f63375a1 ("powerpc: Split out instruction analysis part of emulate_step()") Signed-off-by: Ravi Bangoria Reviewed-by: Naveen N.
Rao Reviewed-by: Ananth N Mavinakayanahalli [mpe: Change log massage, add stable tag] Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/entry_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index edba294620db..f6fd0332c3a2 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -716,7 +716,7 @@ resume_kernel: addi r8,r1,INT_FRAME_SIZE /* Get the kprobed function entry */ - lwz r3,GPR1(r1) + ld r3,GPR1(r1) subi r3,r3,INT_FRAME_SIZE /* dst: Allocate a trampoline exception frame */ mr r4,r1 /* src: current exception frame */ mr r1,r3 /* Reroute the trampoline frame to r1 */ @@ -730,8 +730,8 @@ resume_kernel: addi r6,r6,8 bdnz 2b - /* Do real store operation to complete stwu */ - lwz r5,GPR1(r1) + /* Do real store operation to complete stdu */ + ld r5,GPR1(r1) std r8,0(r5) /* Clear _TIF_EMULATE_STACK_STORE flag */ -- cgit v1.2.3 From e2587fba99118f2f4506b37b4766d7a4cca1465e Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 30 Mar 2017 13:17:14 +0200 Subject: x86/mce/AMD: Give a name to MCA bank 3 when accessed with legacy MSRs commit 29f72ce3e4d18066ec75c79c857bee0618a3504b upstream. MCA bank 3 is reserved on systems pre-Fam17h, so it didn't have a name. However, MCA bank 3 is defined on Fam17h systems and can be accessed using legacy MSRs. Without a name we get a stack trace on Fam17h systems when trying to register sysfs files for bank 3 on kernels that don't recognize Scalable MCA. Call MCA bank 3 "decode_unit" since this is what it represents on Fam17h. This will allow kernels without SMCA support to see this bank on Fam17h+ and prevent the stack trace. This will not affect older systems since this bank is reserved on them, i.e. it'll be ignored. Tested on AMD Fam15h and Fam17h systems. WARNING: CPU: 26 PID: 1 at lib/kobject.c:210 kobject_add_internal kobject: (ffff88085bb256c0): attempted to be registered with empty name! ... Call Trace: kobject_add_internal kobject_add kobject_create_and_add threshold_create_device threshold_init_device Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1490102285-3659-1-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index e99b15077e94..62aca448726a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -53,7 +53,7 @@ static const char * const th_names[] = { "load_store", "insn_fetch", "combined_unit", - "", + "decode_unit", "northbridge", "execution_unit", }; -- cgit v1.2.3 From 2a60bb635236ead6437b37a5b4085da1e33962b1 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Mon, 3 Apr 2017 15:12:43 +0100 Subject: kvm: arm/arm64: Fix locking for kvm_free_stage2_pgd commit 8b3405e345b5a098101b0c31b264c812bba045d9 upstream. In kvm_free_stage2_pgd() we don't hold the kvm->mmu_lock while calling unmap_stage2_range() on the entire memory range for the guest. This could cause problems with other callers (e.g., munmap on a memslot) trying to unmap a range. And since we have to unmap the entire Guest memory range holding a spinlock, make sure we yield the lock if necessary after we unmap each PUD range.
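In outline, the locking pattern the fix applies (a sketch of the idea, not the exact arch/arm/kvm/mmu.c code):

/* Walk a potentially huge stage2 range under mmu_lock, yielding the lock
 * between self-contained PUD-sized chunks so that other lock waiters
 * (e.g. an munmap on a memslot) and the lockup detector stay happy. */
spin_lock(&kvm->mmu_lock);
do {
	next = kvm_pgd_addr_end(addr, end);
	if (!pgd_none(*pgd))
		unmap_puds(kvm, pgd, addr, next);
	if (next != end)			/* not done yet... */
		cond_resched_lock(&kvm->mmu_lock); /* ...may drop and retake */
} while (pgd++, addr = next, addr != end);
spin_unlock(&kvm->mmu_lock);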
Fixes: commit d5d8184d35c9 ("KVM: ARM: Memory virtualization setup") Cc: Paolo Bonzini Cc: Marc Zyngier Cc: Christoffer Dall Cc: Mark Rutland Signed-off-by: Suzuki K Poulose [ Avoid vCPU starvation and lockup detector warnings ] Signed-off-by: Marc Zyngier Signed-off-by: Suzuki K Poulose Signed-off-by: Christoffer Dall Signed-off-by: Greg Kroah-Hartman --- arch/arm/kvm/mmu.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index f91ee2f27b41..01cf10556081 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -300,6 +300,14 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, next = kvm_pgd_addr_end(addr, end); if (!pgd_none(*pgd)) unmap_puds(kvm, pgd, addr, next); + /* + * If we are dealing with a large range in + * stage2 table, release the kvm->mmu_lock + * to prevent starvation and lockup detector + * warnings. + */ + if (kvm && (next != end)) + cond_resched_lock(&kvm->mmu_lock); } while (pgd++, addr = next, addr != end); } @@ -738,6 +746,7 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm) */ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) { + assert_spin_locked(&kvm->mmu_lock); unmap_range(kvm, kvm->arch.pgd, start, size); } @@ -824,7 +833,10 @@ void kvm_free_stage2_pgd(struct kvm *kvm) if (kvm->arch.pgd == NULL) return; + spin_lock(&kvm->mmu_lock); unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); + spin_unlock(&kvm->mmu_lock); + kvm_free_hwpgd(kvm_get_hwpgd(kvm)); if (KVM_PREALLOC_LEVEL > 0) kfree(kvm->arch.pgd); -- cgit v1.2.3 From 397488e09bf2670c841b9f9d8652ce5dd1c952f4 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 6 Jul 2016 18:24:10 -0700 Subject: Tools: hv: kvp: ensure kvp device fd is closed on exec commit 26840437cbd6d3625ea6ab34e17cd34bb810c861 upstream. KVP daemon does fork()/exec() (with popen()) so we need to close our fds to avoid sharing them with child processes. The immediate implication of not doing so that I see is SELinux complaining about 'ip' trying to access '/dev/vmbus/hv_kvp'. Signed-off-by: Vitaly Kuznetsov Signed-off-by: K. Y. Srinivasan Signed-off-by: Sumit Semwal Signed-off-by: Greg Kroah-Hartman --- tools/hv/hv_kvp_daemon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index 0d9f48ec42bb..bc7adb84e679 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -1433,7 +1433,7 @@ int main(int argc, char *argv[]) openlog("KVP", 0, LOG_USER); syslog(LOG_INFO, "KVP starting; pid is:%d", getpid()); - kvp_fd = open("/dev/vmbus/hv_kvp", O_RDWR); + kvp_fd = open("/dev/vmbus/hv_kvp", O_RDWR | O_CLOEXEC); if (kvp_fd < 0) { syslog(LOG_ERR, "open /dev/vmbus/hv_kvp failed; error: %d %s", -- cgit v1.2.3 From 8e7a6dbc3b71f37fc0167dde0e7676b4cdde1963 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Aug 2016 16:23:09 -0700 Subject: Drivers: hv: balloon: keep track of where ha_region starts commit 7cf3b79ec85ee1a5bbaaf936bb1d050dc652983b upstream. Windows 2012 (non-R2) does not specify a hot add region in hot add requests and the logic in hot_add_req() is trying to find a 128Mb-aligned region covering the request. It may also happen that the host's requests are not 128Mb aligned and the created ha_region will start before the first specified PFN. We can't online these non-present pages but we don't remember the real start of the region. This is a regression introduced by the commit 5abbbb75d733 ("Drivers: hv: hv_balloon: don't lose memory when onlining order is not natural").
While the idea of keeping the 'moving window' was wrong (as there is no guarantee that hot add requests come ordered) we should still keep track of covered_start_pfn. This is not a revert, the logic is different. Signed-off-by: Vitaly Kuznetsov Signed-off-by: K. Y. Srinivasan Signed-off-by: Sumit Semwal Signed-off-by: Greg Kroah-Hartman --- drivers/hv/hv_balloon.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 43af91362be5..1542d894fd8f 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -430,13 +430,14 @@ struct dm_info_msg { * currently hot added. We hot add in multiples of 128M * chunks; it is possible that we may not be able to bring * online all the pages in the region. The range - * covered_end_pfn defines the pages that can + * covered_start_pfn:covered_end_pfn defines the pages that can * be brough online. */ struct hv_hotadd_state { struct list_head list; unsigned long start_pfn; + unsigned long covered_start_pfn; unsigned long covered_end_pfn; unsigned long ha_end_pfn; unsigned long end_pfn; @@ -682,7 +683,8 @@ static void hv_online_page(struct page *pg) list_for_each(cur, &dm_device.ha_region_list) { has = list_entry(cur, struct hv_hotadd_state, list); - cur_start_pgp = (unsigned long)pfn_to_page(has->start_pfn); + cur_start_pgp = (unsigned long) + pfn_to_page(has->covered_start_pfn); cur_end_pgp = (unsigned long)pfn_to_page(has->covered_end_pfn); if (((unsigned long)pg >= cur_start_pgp) && @@ -854,6 +856,7 @@ static unsigned long process_hot_add(unsigned long pg_start, list_add_tail(&ha_region->list, &dm_device.ha_region_list); ha_region->start_pfn = rg_start; ha_region->ha_end_pfn = rg_start; + ha_region->covered_start_pfn = pg_start; ha_region->covered_end_pfn = pg_start; ha_region->end_pfn = rg_start + rg_size; } -- cgit v1.2.3 From 03e2fb9b5ce80aa9ff6384384f5fbde156550971 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Aug 2016 16:23:10 -0700 Subject: Drivers: hv: balloon: account for gaps in hot add regions commit cb7a5724c7e1bfb5766ad1c3beba14cc715991cf upstream. I'm observing the following hot add requests from the WS2012 host: hot_add_req: start_pfn = 0x108200 count = 330752 hot_add_req: start_pfn = 0x158e00 count = 193536 hot_add_req: start_pfn = 0x188400 count = 239616 As the host doesn't specify hot add regions we're trying to create 128Mb-aligned region covering the first request, we create the 0x108000 - 0x160000 region and we add 0x108000 - 0x158e00 memory. The second request passes the pfn_covered() check, we enlarge the region to 0x108000 - 0x190000 and add 0x158e00 - 0x188200 memory. The problem emerges with the third request as it starts at 0x188400 so there is a 0x200 gap which is not covered. As the end of our region is 0x190000 now it again passes the pfn_covered() check were we just adjust the covered_end_pfn and make it 0x188400 instead of 0x188200 which means that we'll try to online 0x188200-0x188400 pages but these pages were never assigned to us and we crash. We can't react to such requests by creating new hot add regions as it may happen that the whole suggested range falls into the previously identified 128Mb-aligned area so we'll end up adding nothing or create intersecting regions and our current logic doesn't allow that. Instead, create a list of such 'gaps' and check for them in the page online callback. Signed-off-by: Vitaly Kuznetsov Signed-off-by: K. Y. 
Srinivasan Signed-off-by: Sumit Semwal Signed-off-by: Greg Kroah-Hartman --- drivers/hv/hv_balloon.c | 131 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 94 insertions(+), 37 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 1542d894fd8f..354da7f207b7 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -441,6 +441,16 @@ struct hv_hotadd_state { unsigned long covered_end_pfn; unsigned long ha_end_pfn; unsigned long end_pfn; + /* + * A list of gaps. + */ + struct list_head gap_list; +}; + +struct hv_hotadd_gap { + struct list_head list; + unsigned long start_pfn; + unsigned long end_pfn; }; struct balloon_state { @@ -596,18 +606,46 @@ static struct notifier_block hv_memory_nb = { .priority = 0 }; +/* Check if the particular page is backed and can be onlined and online it. */ +static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg) +{ + unsigned long cur_start_pgp; + unsigned long cur_end_pgp; + struct hv_hotadd_gap *gap; + + cur_start_pgp = (unsigned long)pfn_to_page(has->covered_start_pfn); + cur_end_pgp = (unsigned long)pfn_to_page(has->covered_end_pfn); -static void hv_bring_pgs_online(unsigned long start_pfn, unsigned long size) + /* The page is not backed. */ + if (((unsigned long)pg < cur_start_pgp) || + ((unsigned long)pg >= cur_end_pgp)) + return; + + /* Check for gaps. */ + list_for_each_entry(gap, &has->gap_list, list) { + cur_start_pgp = (unsigned long) + pfn_to_page(gap->start_pfn); + cur_end_pgp = (unsigned long) + pfn_to_page(gap->end_pfn); + if (((unsigned long)pg >= cur_start_pgp) && + ((unsigned long)pg < cur_end_pgp)) { + return; + } + } + + /* This frame is currently backed; online the page. */ + __online_page_set_limits(pg); + __online_page_increment_counters(pg); + __online_page_free(pg); +} + +static void hv_bring_pgs_online(struct hv_hotadd_state *has, + unsigned long start_pfn, unsigned long size) { int i; - for (i = 0; i < size; i++) { - struct page *pg; - pg = pfn_to_page(start_pfn + i); - __online_page_set_limits(pg); - __online_page_increment_counters(pg); - __online_page_free(pg); - } + for (i = 0; i < size; i++) + hv_page_online_one(has, pfn_to_page(start_pfn + i)); } static void hv_mem_hot_add(unsigned long start, unsigned long size, @@ -684,26 +722,24 @@ static void hv_online_page(struct page *pg) list_for_each(cur, &dm_device.ha_region_list) { has = list_entry(cur, struct hv_hotadd_state, list); cur_start_pgp = (unsigned long) - pfn_to_page(has->covered_start_pfn); - cur_end_pgp = (unsigned long)pfn_to_page(has->covered_end_pfn); + pfn_to_page(has->start_pfn); + cur_end_pgp = (unsigned long)pfn_to_page(has->end_pfn); - if (((unsigned long)pg >= cur_start_pgp) && - ((unsigned long)pg < cur_end_pgp)) { - /* - * This frame is currently backed; online the - * page. - */ - __online_page_set_limits(pg); - __online_page_increment_counters(pg); - __online_page_free(pg); - } + /* The page belongs to a different HAS. 
*/ + if (((unsigned long)pg < cur_start_pgp) || + ((unsigned long)pg >= cur_end_pgp)) + continue; + + hv_page_online_one(has, pg); + break; } } -static bool pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) +static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) { struct list_head *cur; struct hv_hotadd_state *has; + struct hv_hotadd_gap *gap; unsigned long residual, new_inc; if (list_empty(&dm_device.ha_region_list)) @@ -718,6 +754,24 @@ static bool pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) */ if (start_pfn < has->start_pfn || start_pfn >= has->end_pfn) continue; + + /* + * If the current start pfn is not where the covered_end + * is, create a gap and update covered_end_pfn. + */ + if (has->covered_end_pfn != start_pfn) { + gap = kzalloc(sizeof(struct hv_hotadd_gap), GFP_ATOMIC); + if (!gap) + return -ENOMEM; + + INIT_LIST_HEAD(&gap->list); + gap->start_pfn = has->covered_end_pfn; + gap->end_pfn = start_pfn; + list_add_tail(&gap->list, &has->gap_list); + + has->covered_end_pfn = start_pfn; + } + /* * If the current hot add-request extends beyond * our current limit; extend it. @@ -734,19 +788,10 @@ static bool pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) has->end_pfn += new_inc; } - /* - * If the current start pfn is not where the covered_end - * is, update it. - */ - - if (has->covered_end_pfn != start_pfn) - has->covered_end_pfn = start_pfn; - - return true; - + return 1; } - return false; + return 0; } static unsigned long handle_pg_range(unsigned long pg_start, @@ -785,6 +830,8 @@ static unsigned long handle_pg_range(unsigned long pg_start, if (pgs_ol > pfn_cnt) pgs_ol = pfn_cnt; + has->covered_end_pfn += pgs_ol; + pfn_cnt -= pgs_ol; /* * Check if the corresponding memory block is already * online by checking its last previously backed page. @@ -793,10 +840,8 @@ static unsigned long handle_pg_range(unsigned long pg_start, */ if (start_pfn > has->start_pfn && !PageReserved(pfn_to_page(start_pfn - 1))) - hv_bring_pgs_online(start_pfn, pgs_ol); + hv_bring_pgs_online(has, start_pfn, pgs_ol); - has->covered_end_pfn += pgs_ol; - pfn_cnt -= pgs_ol; } if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) { @@ -834,13 +879,19 @@ static unsigned long process_hot_add(unsigned long pg_start, unsigned long rg_size) { struct hv_hotadd_state *ha_region = NULL; + int covered; if (pfn_cnt == 0) return 0; - if (!dm_device.host_specified_ha_region) - if (pfn_covered(pg_start, pfn_cnt)) + if (!dm_device.host_specified_ha_region) { + covered = pfn_covered(pg_start, pfn_cnt); + if (covered < 0) + return 0; + + if (covered) goto do_pg_range; + } /* * If the host has specified a hot-add range; deal with it first. 
@@ -852,6 +903,7 @@ static unsigned long process_hot_add(unsigned long pg_start, return 0; INIT_LIST_HEAD(&ha_region->list); + INIT_LIST_HEAD(&ha_region->gap_list); list_add_tail(&ha_region->list, &dm_device.ha_region_list); ha_region->start_pfn = rg_start; ha_region->ha_end_pfn = rg_start; ha_region->covered_start_pfn = pg_start; ha_region->covered_end_pfn = pg_start; ha_region->end_pfn = rg_start + rg_size; } @@ -1584,6 +1636,7 @@ static int balloon_remove(struct hv_device *dev) struct hv_dynmem_device *dm = hv_get_drvdata(dev); struct list_head *cur, *tmp; struct hv_hotadd_state *has; + struct hv_hotadd_gap *gap, *tmp_gap; if (dm->num_pages_ballooned != 0) pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned); @@ -1600,6 +1653,10 @@ static int balloon_remove(struct hv_device *dev) #endif list_for_each_safe(cur, tmp, &dm->ha_region_list) { has = list_entry(cur, struct hv_hotadd_state, list); + list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) { + list_del(&gap->list); + kfree(gap); + } list_del(&has->list); kfree(has); } -- cgit v1.2.3 From 5693f3fb5a662ab0ab1f8ad3a0e13c820c4c47dc Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 7 Dec 2016 01:16:27 -0800 Subject: hv: don't reset hv_context.tsc_page on crash commit 56ef6718a1d8d77745033c5291e025ce18504159 upstream. It may happen that secondary CPUs are still alive and resetting hv_context.tsc_page will cause a consequent crash in read_hv_clock_tsc() as we don't check that it is not NULL there. It is safe as we're not freeing this page anyway. Signed-off-by: Vitaly Kuznetsov Signed-off-by: K. Y. Srinivasan Signed-off-by: Sumit Semwal Signed-off-by: Greg Kroah-Hartman --- drivers/hv/hv.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index ddbf7e7e0d98..8ce1f2e22912 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -305,9 +305,10 @@ void hv_cleanup(bool crash) hypercall_msr.as_uint64 = 0; wrmsrl(HV_X64_MSR_REFERENCE_TSC, hypercall_msr.as_uint64); - if (!crash) + if (!crash) { vfree(hv_context.tsc_page); - hv_context.tsc_page = NULL; + hv_context.tsc_page = NULL; + } } #endif } -- cgit v1.2.3 From d1cc3cdd39e90e70ee5fc9a766ad3a6c61426fa8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 6 Apr 2017 09:04:31 -0700 Subject: x86, pmem: fix broken __copy_user_nocache cache-bypass assumptions commit 11e63f6d920d6f2dfd3cd421e939a4aec9a58dcd upstream. Before we rework the "pmem api" to stop abusing __copy_user_nocache() for memcpy_to_pmem() we need to fix cases where we may strand dirty data in the cpu cache. The problem occurs when copy_from_iter_pmem() is used for arbitrary data transfers from userspace. There is no guarantee that these transfers, performed by dax_iomap_actor(), will have aligned destinations or aligned transfer lengths. Backstop the usage of __copy_user_nocache() with explicit cache management in these unaligned cases. Yes, copy_from_iter_pmem() is now too big for an inline, but addressing that is saved for a later patch that moves the entirety of the "pmem api" into the pmem driver directly. Fixes: 5de490daec8b ("pmem: add copy_from_iter_pmem() and clear_pmem()") Cc: Cc: Jan Kara Cc: Jeff Moyer Cc: Ingo Molnar Cc: Christoph Hellwig Cc: "H.
Peter Anvin" Cc: Al Viro Cc: Thomas Gleixner Cc: Matthew Wilcox Reviewed-by: Ross Zwisler Signed-off-by: Toshi Kani Signed-off-by: Dan Williams Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pmem.h | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index d8ce3ec816ab..bd8ce6bcdfc9 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -72,8 +72,8 @@ static inline void arch_wmb_pmem(void) * @size: number of bytes to write back * * Write back a cache range using the CLWB (cache line write back) - * instruction. This function requires explicit ordering with an - * arch_wmb_pmem() call. This API is internal to the x86 PMEM implementation. + * instruction. Note that @size is internally rounded up to be cache + * line size aligned. */ static inline void __arch_wb_cache_pmem(void *vaddr, size_t size) { @@ -87,15 +87,6 @@ static inline void __arch_wb_cache_pmem(void *vaddr, size_t size) clwb(p); } -/* - * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec - * iterators, so for other types (bvec & kvec) we must do a cache write-back. - */ -static inline bool __iter_needs_pmem_wb(struct iov_iter *i) -{ - return iter_is_iovec(i) == false; -} - /** * arch_copy_from_iter_pmem - copy data from an iterator to PMEM * @addr: PMEM destination address @@ -114,8 +105,36 @@ static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes, /* TODO: skip the write-back by always using non-temporal stores */ len = copy_from_iter_nocache(vaddr, bytes, i); - if (__iter_needs_pmem_wb(i)) - __arch_wb_cache_pmem(vaddr, bytes); + /* + * In the iovec case on x86_64 copy_from_iter_nocache() uses + * non-temporal stores for the bulk of the transfer, but we need + * to manually flush if the transfer is unaligned. A cached + * memory copy is used when destination or size is not naturally + * aligned. That is: + * - Require 8-byte alignment when size is 8 bytes or larger. + * - Require 4-byte alignment when size is 4 bytes. + * + * In the non-iovec case the entire destination needs to be + * flushed. + */ + if (iter_is_iovec(i)) { + unsigned long flushed, dest = (unsigned long) addr; + + if (bytes < 8) { + if (!IS_ALIGNED(dest, 4) || (bytes != 4)) + __arch_wb_cache_pmem(addr, 1); + } else { + if (!IS_ALIGNED(dest, 8)) { + dest = ALIGN(dest, boot_cpu_data.x86_clflush_size); + __arch_wb_cache_pmem(addr, 1); + } + + flushed = dest - (unsigned long) addr; + if (bytes > flushed && !IS_ALIGNED(bytes - flushed, 8)) + __arch_wb_cache_pmem(addr + bytes - 1, 1); + } + } else + __arch_wb_cache_pmem(addr, bytes); return len; } -- cgit v1.2.3 From 6ddbac9aa800b99761a05ba520d62a0d8063a5b8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 29 Dec 2015 14:02:29 -0800 Subject: block: fix del_gendisk() vs blkdev_ioctl crash commit ac34f15e0c6d2fd58480052b6985f6991fb53bcc upstream. When tearing down a block device early in its lifetime, userspace may still be performing discovery actions like blkdev_ioctl() to re-read partitions. The nvdimm_revalidate_disk() implementation depends on disk->driverfs_dev to be valid at entry. However, it is set to NULL in del_gendisk() and fatally this is happening *before* the disk device is deleted from userspace view. There's no reason for del_gendisk() to clear ->driverfs_dev. That device is the parent of the disk. It is guaranteed to not be freed until the disk, as a child, drops its ->parent reference. 
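As a toy model of the lifetime argument (illustrative only, not the actual driver core refcounting), a child that pins its parent keeps the parent pointer valid for the child's whole life, so there is no need to NULL it out early:

#include <stdio.h>
#include <stdlib.h>

struct toy_dev {
	int refs;
	struct toy_dev *parent;
};

static void toy_put(struct toy_dev *d)
{
	while (d && --d->refs == 0) {
		struct toy_dev *parent = d->parent;
		printf("releasing %p\n", (void *)d);
		free(d);
		d = parent;	/* release drops the child's parent ref */
	}
}

int main(void)
{
	struct toy_dev *parent = calloc(1, sizeof(*parent));
	struct toy_dev *disk = calloc(1, sizeof(*disk));

	parent->refs = 1;	/* creator's reference */
	disk->refs = 1;
	disk->parent = parent;
	parent->refs++;		/* the child pins its parent */

	toy_put(parent);	/* parent stays allocated (ref 2 -> 1)... */
	/* disk->parent is still a valid pointer here */
	toy_put(disk);		/* ...and is freed only after the child */
	return 0;
}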
We could also fix this issue locally in nvdimm_revalidate_disk() by using disk_to_dev(disk)->parent, but let's fix it globally since ->driverfs_dev follows the lifetime of the parent. Longer term we should probably just add a @parent parameter to add_disk(), and stop carrying this pointer in the gendisk. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] nvdimm_revalidate_disk+0x18/0x90 [libnvdimm] CPU: 2 PID: 538 Comm: systemd-udevd Tainted: G O 4.4.0-rc5 #2257 [..] Call Trace: [] rescan_partitions+0x87/0x2c0 [] ? __lock_is_held+0x49/0x70 [] __blkdev_reread_part+0x72/0xb0 [] blkdev_reread_part+0x25/0x40 [] blkdev_ioctl+0x4fd/0x9c0 [] ? current_kernel_time64+0x69/0xd0 [] block_ioctl+0x3d/0x50 [] do_vfs_ioctl+0x308/0x560 [] ? __audit_syscall_entry+0xb1/0x100 [] ? do_audit_syscall_entry+0x66/0x70 [] SyS_ioctl+0x79/0x90 [] entry_SYSCALL_64_fastpath+0x12/0x76 Cc: Jan Kara Cc: Jens Axboe Reported-by: Robert Hu Signed-off-by: Dan Williams Signed-off-by: Greg Kroah-Hartman --- block/genhd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index a5bed6bc869d..3032453a89e6 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -664,7 +664,6 @@ void del_gendisk(struct gendisk *disk) kobject_put(disk->part0.holder_dir); kobject_put(disk->slave_dir); - disk->driverfs_dev = NULL; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); -- cgit v1.2.3 From 6862fa9077dea0ec2ba5e6ea7c7f90b786288596 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Wed, 24 Feb 2016 11:10:48 -0500 Subject: tipc: fix crash during node removal commit d25a01257e422a4bdeb426f69529d57c73b235fe upstream. When the TIPC module is unloaded, we have identified a race condition that allows a node reference counter to go to zero and the node instance to be freed before the node timer is finished accessing it. This leads to occasional crashes, especially in multi-namespace environments. The scenario goes as follows: CPU0:(node_stop) CPU1:(node_timeout) // ref == 2 1: if(!mod_timer()) 2: if (del_timer()) 3: tipc_node_put() // ref -> 1 4: tipc_node_put() // ref -> 0 5: kfree_rcu(node); 6: tipc_node_get(node) 7: // BOOM! We now clean up this functionality as follows: 1) We remove the node pointer from the node lookup table before we attempt deactivating the timer. This way, we reduce the risk that tipc_node_find() may obtain a valid pointer to an instance marked for deletion; a harmless but undesirable situation. 2) We use del_timer_sync() instead of del_timer() to safely deactivate the node timer without any risk that it might be reactivated by the timeout handler. There is no risk of deadlock here, since the two functions never touch the same spinlocks. 3) We remove a pointless tipc_node_get() + tipc_node_put() from the timeout handler. Reported-by: Zhijiang Hu Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S.
Miller Signed-off-by: Greg Kroah-Hartman --- net/tipc/node.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/net/tipc/node.c b/net/tipc/node.c index 3926b561f873..d468aad6163e 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -102,9 +102,10 @@ static unsigned int tipc_hashfn(u32 addr) static void tipc_node_kref_release(struct kref *kref) { - struct tipc_node *node = container_of(kref, struct tipc_node, kref); + struct tipc_node *n = container_of(kref, struct tipc_node, kref); - tipc_node_delete(node); + kfree(n->bc_entry.link); + kfree_rcu(n, rcu); } void tipc_node_put(struct tipc_node *node) @@ -216,21 +217,20 @@ static void tipc_node_delete(struct tipc_node *node) { list_del_rcu(&node->list); hlist_del_rcu(&node->hash); - kfree(node->bc_entry.link); - kfree_rcu(node, rcu); + tipc_node_put(node); + + del_timer_sync(&node->timer); + tipc_node_put(node); } void tipc_node_stop(struct net *net) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_net *tn = tipc_net(net); struct tipc_node *node, *t_node; spin_lock_bh(&tn->node_list_lock); - list_for_each_entry_safe(node, t_node, &tn->node_list, list) { - if (del_timer(&node->timer)) - tipc_node_put(node); - tipc_node_put(node); - } + list_for_each_entry_safe(node, t_node, &tn->node_list, list) + tipc_node_delete(node); spin_unlock_bh(&tn->node_list_lock); } @@ -313,9 +313,7 @@ static void tipc_node_timeout(unsigned long data) if (rc & TIPC_LINK_DOWN_EVT) tipc_node_link_down(n, bearer_id, false); } - if (!mod_timer(&n->timer, jiffies + n->keepalive_intv)) - tipc_node_get(n); - tipc_node_put(n); + mod_timer(&n->timer, jiffies + n->keepalive_intv); } /** -- cgit v1.2.3 From 12f4e1f54a1334bcd5f9586fc9eb7baefb14b826 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 27 Apr 2017 09:09:53 +0200 Subject: Linux 4.4.64 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ec52973043f6..17708f5dc169 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 63 +SUBLEVEL = 64 EXTRAVERSION = NAME = Blurry Fish Butt -- cgit v1.2.3 From 3f0531e5775303091a1ff975cdd572cc6a935321 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Mon, 24 Apr 2017 18:20:52 -0700 Subject: BACKPORT: f2fs: sanity check log_blocks_per_seg f2fs currently only supports a 4KB block size and a 2MB segment size. Sanity check log_blocks_per_seg == 9, i.e. 2MB/4KB = (1 << 9). (Partially cherry-picked from commit 9a59b62fd88196844cee5fff851bee2cfd7afb6e "f2fs: do more integrity verification for superblock", which does more sanity checks for the superblock during ->mount.)
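The geometry behind the new check, as a runnable sketch (illustration only, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned int segment_bytes = 2u << 20;	/* 2 MiB segment */
	unsigned int block_bytes = 4u << 10;	/* 4 KiB block */

	/* 2 MiB / 4 KiB = 512 = 1 << 9, so log_blocks_per_seg must be 9 */
	printf("blocks per segment = %u\n", segment_bytes / block_bytes);
	return 0;
}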
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Bug: 36817013 Change-Id: I0be52e54fba82083068337ceb9f7ad985a87319f Signed-off-by: Jin Qian --- fs/f2fs/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3a65e0132352..98a77b0a365d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -947,6 +947,14 @@ static int sanity_check_raw_super(struct super_block *sb, return 1; } + /* check log blocks per segment */ + if (le32_to_cpu(raw_super->log_blocks_per_seg) != 9) { + f2fs_msg(sb, KERN_INFO, + "Invalid log blocks per segment (%u)\n", + le32_to_cpu(raw_super->log_blocks_per_seg)); + return 1; + } + /* Currently, support 512/1024/2048/4096 bytes sector size */ if (le32_to_cpu(raw_super->log_sectorsize) > F2FS_MAX_LOG_SECTOR_SIZE || -- cgit v1.2.3 From 90d78776c4a0e13fb7ee5bd0787f04a1730631a6 Mon Sep 17 00:00:00 2001 From: Ganesh Mahendran Date: Tue, 25 Apr 2017 18:07:43 +0800 Subject: ANDROID: uid_sys_stats: fix access of task_uid(task) struct task_struct *task should be protected by tasklist_lock. Change-Id: Iefcd13442a9b9d855a2bbcde9fd838a4132fee58 Signed-off-by: Ganesh Mahendran --- drivers/misc/uid_sys_stats.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index 618617f3e867..ad21276c8d9e 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -95,9 +95,11 @@ static int uid_cputime_show(struct seq_file *m, void *v) { struct uid_entry *uid_entry; struct task_struct *task, *temp; + struct user_namespace *user_ns = current_user_ns(); cputime_t utime; cputime_t stime; unsigned long bkt; + uid_t uid; rt_mutex_lock(&uid_lock); @@ -108,14 +110,13 @@ static int uid_cputime_show(struct seq_file *m, void *v) read_lock(&tasklist_lock); do_each_thread(temp, task) { - uid_entry = find_or_register_uid(from_kuid_munged( - current_user_ns(), task_uid(task))); + uid = from_kuid_munged(user_ns, task_uid(task)); + uid_entry = find_or_register_uid(uid); if (!uid_entry) { read_unlock(&tasklist_lock); rt_mutex_unlock(&uid_lock); pr_err("%s: failed to find the uid_entry for uid %d\n", - __func__, from_kuid_munged(current_user_ns(), - task_uid(task))); + __func__, uid); return -ENOMEM; } task_cputime_adjusted(task, &utime, &stime); -- cgit v1.2.3 From 44b3b7e068874040ca511fcd2a812b5fbcf44616 Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Mon, 14 Mar 2016 09:43:52 +0100 Subject: tipc: make sure IPv6 header fits in skb headroom commit 9bd160bfa27fa41927dbbce7ee0ea779700e09ef upstream. Expand headroom further in order to be able to fit the larger IPv6 header. Prior to this patch this caused an skb_under_panic for certain tipc packets when using IPv6 UDP bearer(s). Signed-off-by: Richard Alpe Acked-by: Jon Maloy Signed-off-by: David S.
Miller Signed-off-by: Jon Maloy Signed-off-by: Greg Kroah-Hartman --- net/tipc/udp_media.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 6af78c6276b4..4056798c54a5 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -52,7 +52,7 @@ /* IANA assigned UDP port */ #define UDP_PORT_DEFAULT 6118 -#define UDP_MIN_HEADROOM 28 +#define UDP_MIN_HEADROOM 48 static const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = { [TIPC_NLA_UDP_UNSPEC] = {.type = NLA_UNSPEC}, -- cgit v1.2.3 From 3f31559043087b9cd45582c2eb12d7900cedc4ed Mon Sep 17 00:00:00 2001 From: Erik Hugne Date: Thu, 7 Apr 2016 10:40:43 -0400 Subject: tipc: make dist queue pernet commit 541726abe7daca64390c2ec34e6a203145f1686d upstream. Nametable updates received from the network that cannot be applied immediately are placed on a defer queue. This queue is global to the TIPC module, which might cause problems when using TIPC in containers. To prevent nametable updates from escaping into the wrong namespace, we make the queue pernet instead. Signed-off-by: Erik Hugne Signed-off-by: Jon Maloy Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tipc/core.c | 1 + net/tipc/core.h | 3 +++ net/tipc/name_distr.c | 16 +++++++--------- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/net/tipc/core.c b/net/tipc/core.c index 03a842870c52..e2bdb07a49a2 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -69,6 +69,7 @@ static int __net_init tipc_init_net(struct net *net) if (err) goto out_nametbl; + INIT_LIST_HEAD(&tn->dist_queue); err = tipc_topsrv_start(net); if (err) goto out_subscr; diff --git a/net/tipc/core.h b/net/tipc/core.h index 18e95a8020cd..fe3b89e9cde4 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -103,6 +103,9 @@ struct tipc_net { spinlock_t nametbl_lock; struct name_table *nametbl; + /* Name dist queue */ + struct list_head dist_queue; + /* Topology subscription server */ struct tipc_server *topsrv; atomic_t subscription_count; diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index f51c8bdbea1c..18f8152888f4 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -40,11 +40,6 @@ int sysctl_tipc_named_timeout __read_mostly = 2000; -/** - * struct tipc_dist_queue - queue holding deferred name table updates - */ -static struct list_head tipc_dist_queue = LIST_HEAD_INIT(tipc_dist_queue); - struct distr_queue_item { struct distr_item i; u32 dtype; @@ -340,9 +335,11 @@ static bool tipc_update_nametbl(struct net *net, struct distr_item *i, * tipc_named_add_backlog - add a failed name table update to the backlog * */ -static void tipc_named_add_backlog(struct distr_item *i, u32 type, u32 node) +static void tipc_named_add_backlog(struct net *net, struct distr_item *i, + u32 type, u32 node) { struct distr_queue_item *e; + struct tipc_net *tn = net_generic(net, tipc_net_id); unsigned long now = get_jiffies_64(); e = kzalloc(sizeof(*e), GFP_ATOMIC); @@ -352,7 +349,7 @@ static void tipc_named_add_backlog(struct distr_item *i, u32 type, u32 node) e->node = node; e->expires = now + msecs_to_jiffies(sysctl_tipc_named_timeout); memcpy(e, i, sizeof(*i)); - list_add_tail(&e->next, &tipc_dist_queue); + list_add_tail(&e->next, &tn->dist_queue); } /** @@ -362,10 +359,11 @@ static void tipc_named_add_backlog(struct distr_item *i, u32 type, u32 node) void tipc_named_process_backlog(struct net *net) { struct distr_queue_item *e, *tmp; + struct tipc_net *tn = net_generic(net, tipc_net_id); char addr[16]; 
unsigned long now = get_jiffies_64(); - list_for_each_entry_safe(e, tmp, &tipc_dist_queue, next) { + list_for_each_entry_safe(e, tmp, &tn->dist_queue, next) { if (time_after(e->expires, now)) { if (!tipc_update_nametbl(net, &e->i, e->node, e->dtype)) continue; @@ -405,7 +403,7 @@ void tipc_named_rcv(struct net *net, struct sk_buff_head *inputq) node = msg_orignode(msg); while (count--) { if (!tipc_update_nametbl(net, item, node, mtype)) - tipc_named_add_backlog(item, mtype, node); + tipc_named_add_backlog(net, item, mtype, node); item++; } kfree_skb(skb); -- cgit v1.2.3 From 76ca3053f32c997472c325176c235a25170fc98b Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Mon, 2 May 2016 11:58:45 -0400 Subject: tipc: re-enable compensation for socket receive buffer double counting commit 7c8bcfb1255fe9d929c227d67bdcd84430fd200b upstream. In the refactoring commit d570d86497ee ("tipc: enqueue arrived buffers in socket in separate function") we did by accident replace the test if (sk->sk_backlog.len == 0) atomic_set(&tsk->dupl_rcvcnt, 0); with if (sk->sk_backlog.len) atomic_set(&tsk->dupl_rcvcnt, 0); This effectively disables the compensation we have for the double receive buffer accounting that occurs temporarily when buffers are moved from the backlog to the socket receive queue. Until now, this has gone unnoticed because of the large receive buffer limits we are applying, but the compensation becomes indispensable when we reduce this buffer limit later in this series. We now fix this by inverting the mentioned condition. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tipc/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index b26b7a127773..d119291db852 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1755,7 +1755,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, /* Try backlog, compensating for double-counted bytes */ dcnt = &tipc_sk(sk)->dupl_rcvcnt; - if (sk->sk_backlog.len) + if (!sk->sk_backlog.len) atomic_set(dcnt, 0); lim = rcvbuf_limit(sk, skb) + atomic_read(dcnt); if (likely(!sk_add_backlog(sk, skb, lim))) -- cgit v1.2.3 From 2847736f563d0ac1f84ddad1e4877c0856bc1adb Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Wed, 8 Jun 2016 12:00:04 -0400 Subject: tipc: correct error in node fsm commit c4282ca76c5b81ed73ef4c5eb5c07ee397e51642 upstream. commit 88e8ac7000dc ("tipc: reduce transmission rate of reset messages when link is down") revealed a flaw in the node FSM, as defined in the log of commit 66996b6c47ed ("tipc: extend node FSM"). We see the following scenario: 1: Node B receives a RESET message from node A before its link endpoint is fully up, i.e., the node FSM is in state SELF_UP_PEER_COMING. This event will not change the node FSM state, but the (distinct) link FSM will move to state RESETTING. 2: As an effect of the previous event, the local endpoint on B will declare node A lost, and post the event SELF_DOWN to its node FSM. This moves the FSM state to SELF_DOWN_PEER_LEAVING, meaning that no messages will be accepted from A until it receives another RESET message that confirms that A's endpoint has been reset. This is wasteful, since we know this as a fact already from the first received RESET, but worse is that the link instance's FSM has not wasted this information, but instead moved on to state ESTABLISHING, meaning that it repeatedly sends out ACTIVATE messages to the reset peer A.
3: Node A will receive one of the ACTIVATE messages, move its link FSM to state ESTABLISHED, and start repeatedly sending out STATE messages to node B. 4: Node B will consistently drop these messages, since it can only accept a RESET according to its node FSM. 5: After four lost STATE messages node A will reset its link and start repeatedly sending out RESET messages to B. 6: Because of the reduced send rate for RESET messages, it is very likely that A will receive an ACTIVATE (which is sent out at a much higher frequency) before it gets the chance to send a RESET, and A may hence quickly move back to state ESTABLISHED and continue sending out STATE messages, which will again be dropped by B. 7: GOTO 5. 8: After having repeated the cycle 5-7 a number of times, node A will by chance get in between with sending a RESET, and the situation is resolved. Unfortunately, we have seen that it may take a substantial amount of time before this vicious loop is broken, sometimes in the order of minutes. We correct this by making a small correction to the node FSM: When a node in state SELF_UP_PEER_COMING receives a SELF_DOWN event, it now moves directly back to state SELF_DOWN_PEER_DOWN, instead of as now SELF_DOWN_PEER_LEAVING. This is logically consistent, since we don't need to wait for RESET confirmation from an endpoint that we already know has been reset. It also means that node B in the scenario above will not be dropping incoming STATE messages, and the link can come up immediately. Finally, a symmetry comparison reveals that the FSM has a similar error when receiving the event PEER_DOWN in state PEER_UP_SELF_COMING. Instead of moving to PEER_DOWN_SELF_LEAVING, it should move directly to SELF_DOWN_PEER_DOWN. Although we have never seen any negative effect of this logical error, we choose to fix this one, too. The node FSM looks as follows after those changes: +----------------------------------------+ | PEER_DOWN_EVT| | | +------------------------+----------------+ | |SELF_DOWN_EVT | | | | | | | | +-----------+ +-----------+ | | |NODE_ | |NODE_ | | | +----------|FAILINGOVER|<---------|SYNCHING |-----------+ | | |SELF_ +-----------+ FAILOVER_+-----------+ PEER_ | | | |DOWN_EVT | A BEGIN_EVT A | DOWN_EVT| | | | | | | | | | | | | | | | | | | | |FAILOVER_ |FAILOVER_ |SYNCH_ |SYNCH_ | | | | |END_EVT |BEGIN_EVT |BEGIN_EVT|END_EVT | | | | | | | | | | | | | | | | | | | | | +--------------+ | | | | | +-------->| SELF_UP_ |<-------+ | | | | +-----------------| PEER_UP |----------------+ | | | | |SELF_DOWN_EVT +--------------+ PEER_DOWN_EVT| | | | | | A A | | | | | | | | | | | | | | PEER_UP_EVT| |SELF_UP_EVT | | | | | | | | | | | V V V | | V V V +------------+ +-----------+ +-----------+ +------------+ |SELF_DOWN_ | |SELF_UP_ | |PEER_UP_ | |PEER_DOWN | |PEER_LEAVING| |PEER_COMING| |SELF_COMING| |SELF_LEAVING| +------------+ +-----------+ +-----------+ +------------+ | | A A | | | | | | | | | SELF_ | |SELF_ |PEER_ |PEER_ | | DOWN_EVT| |UP_EVT |UP_EVT |DOWN_EVT | | | | | | | | | | | | | | | +--------------+ | | |PEER_DOWN_EVT +--->| SELF_DOWN_ |<---+ SELF_DOWN_EVT| +------------------->| PEER_DOWN |<--------------------+ +--------------+ Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S.
From 58f80ccf09c4fb8ae2819cd2c0583b885b6b5454 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 25 Jan 2016 22:54:56 +0100 Subject: tty: nozomi: avoid a harmless gcc warning commit a4f642a8a3c2838ad09fe8313d45db46600e1478 upstream. The nozomi wireless data driver has its own helper function to transfer data from a FIFO, doing an extra byte swap on big-endian architectures, presumably to bring the data back into byte-serial order after readw() or readl() perform their implicit byteswap. This helper function is used in the receive_data() function to first read the length into a 32-bit variable, which causes a compile-time warning:

drivers/tty/nozomi.c: In function 'receive_data':
drivers/tty/nozomi.c:857:9: warning: 'size' may be used uninitialized in this function [-Wmaybe-uninitialized]

The problem is that gcc is unsure whether the data was actually read or not. We know that it is at this point, so we can replace it with a single readl() to shut up that warning. I am leaving the byteswap in there, to preserve the existing behavior, even though this seems fishy: Reading the length of the data into a cpu-endian variable should normally not use a second byteswap on big-endian systems, unless the hardware is aware of the CPU endianness. There appears to be a lot more confusion about endianness in this driver, so it probably has not worked on big-endian systems in a long time, if ever, and I have no way to test it. It's quite possible that this driver has not been used by anyone in a while; the last patch that looks like it was tested on the hardware is from 2008. Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/tty/nozomi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/nozomi.c b/drivers/tty/nozomi.c index 80f9de907563..5cc80b80c82b 100644 --- a/drivers/tty/nozomi.c +++ b/drivers/tty/nozomi.c @@ -823,7 +823,7 @@ static int receive_data(enum port_type index, struct nozomi *dc) struct tty_struct *tty = tty_port_tty_get(&port->port); int i, ret; - read_mem32((u32 *) &size, addr, 4); + size = __le32_to_cpu(readl(addr)); /* DBG1( "%d bytes port: %d", size, index); */ if (tty && test_bit(TTY_THROTTLED, &tty->flags)) { -- cgit v1.2.3
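As background for the byteswap concern raised above, here is a small hedged userspace model; the bswap32() helper and the flag standing in for readl()'s implicit little-endian-to-CPU conversion are assumptions for illustration, not the driver code:

	#include <stdint.h>
	#include <stdio.h>

	/* Userspace stand-in for a 32-bit byte swap. */
	static uint32_t bswap32(uint32_t x)
	{
		return (x >> 24) | ((x >> 8) & 0xff00) |
		       ((x << 8) & 0xff0000) | (x << 24);
	}

	int main(void)
	{
		uint32_t bus_value = 0x12345678; /* little-endian on the bus */
		int big_endian = 1;              /* pretend-CPU endianness */

		/* readl(): bus (LE) -> CPU order; swaps only on big-endian. */
		uint32_t cpu = big_endian ? bswap32(bus_value) : bus_value;

		/* __le32_to_cpu() applied to an already-converted value is a
		 * second swap on big-endian, an identity on little-endian. */
		uint32_t size = big_endian ? bswap32(cpu) : cpu;

		printf("0x%08x\n", size); /* on BE this is bus order again */
		return 0;
	}

This is why the commit calls the retained double swap "fishy": on big-endian machines the two conversions cancel out.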
From 9a35bc2ae545b352966a107bf81d8fdcafe4d7bf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 28 Jan 2016 22:58:28 +0100 Subject: hostap: avoid uninitialized variable use in hfa384x_get_rid commit 48dc5fb3ba53b20418de8514700f63d88c5de3a3 upstream. The driver reads a value from hfa384x_from_bap(), which may fail, and then assigns the value to a local variable. gcc detects that in the failure case, the 'rlen' variable now contains uninitialized data:

In file included from ../drivers/net/wireless/intersil/hostap/hostap_pci.c:220:0:
drivers/net/wireless/intersil/hostap/hostap_hw.c: In function 'hfa384x_get_rid':
drivers/net/wireless/intersil/hostap/hostap_hw.c:842:5: warning: 'rec' may be used uninitialized in this function [-Wmaybe-uninitialized]
if (le16_to_cpu(rec.len) == 0) {

This restructures the function as suggested by Russell King, to make it more readable and get more reliable error handling, by handling each failure mode using a goto. Signed-off-by: Arnd Bergmann Signed-off-by: Kalle Valo Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/hostap/hostap_hw.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/hostap/hostap_hw.c b/drivers/net/wireless/hostap/hostap_hw.c index 6df3ee561d52..515aa3f993f3 100644 --- a/drivers/net/wireless/hostap/hostap_hw.c +++ b/drivers/net/wireless/hostap/hostap_hw.c @@ -836,25 +836,30 @@ static int hfa384x_get_rid(struct net_device *dev, u16 rid, void *buf, int len, spin_lock_bh(&local->baplock); res = hfa384x_setup_bap(dev, BAP0, rid, 0); - if (!res) - res = hfa384x_from_bap(dev, BAP0, &rec, sizeof(rec)); + if (res) + goto unlock; + + res = hfa384x_from_bap(dev, BAP0, &rec, sizeof(rec)); + if (res) + goto unlock; if (le16_to_cpu(rec.len) == 0) { /* RID not available */ res = -ENODATA; + goto unlock; } rlen = (le16_to_cpu(rec.len) - 1) * 2; - if (!res && exact_len && rlen != len) { + if (exact_len && rlen != len) { printk(KERN_DEBUG "%s: hfa384x_get_rid - RID len mismatch: " "rid=0x%04x, len=%d (expected %d)\n", dev->name, rid, rlen, len); res = -ENODATA; } - if (!res) - res = hfa384x_from_bap(dev, BAP0, buf, len); + res = hfa384x_from_bap(dev, BAP0, buf, len); +unlock: spin_unlock_bh(&local->baplock); mutex_unlock(&local->rid_bap_mtx); -- cgit v1.2.3
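The goto-based error handling adopted above can be sketched in isolation; this is a hedged, self-contained illustration with stand-in helpers, not the hostap code:

	#include <errno.h>

	/* Hypothetical stand-ins for hfa384x_setup_bap()/hfa384x_from_bap(). */
	static int setup(void)           { return 0; }
	static int read_header(int *rec) { *rec = 42; return 0; }
	static void lock(void)           { }
	static void unlock(void)         { }

	static int get_record(int *rec)
	{
		int res;

		lock();
		res = setup();
		if (res)
			goto out_unlock;       /* single exit releases the lock */

		res = read_header(rec);
		if (res)
			goto out_unlock;

		if (*rec == 0)
			res = -ENODATA;        /* error, but still one exit path */

	out_unlock:
		unlock();
		return res;
	}

	int main(void)
	{
		int rec;
		return get_record(&rec) ? 1 : 0;
	}

Each step exits as soon as it fails, so later steps never run on unset data, which is precisely what silences the compiler warning.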
From d39cb4a597295c6fd5e01795a134f1e3c0914049 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 26 Jan 2016 13:08:10 -0500 Subject: gfs2: avoid uninitialized variable warning commit 67893f12e5374bbcaaffbc6e570acbc2714ea884 upstream. We get a bogus warning about a potential uninitialized variable use in gfs2, because the compiler does not figure out that we never use the leaf number if get_leaf_nr() returns an error:

fs/gfs2/dir.c: In function 'get_first_leaf':
fs/gfs2/dir.c:802:9: warning: 'leaf_no' may be used uninitialized in this function [-Wmaybe-uninitialized]
fs/gfs2/dir.c: In function 'dir_split_leaf':
fs/gfs2/dir.c:1021:8: warning: 'leaf_no' may be used uninitialized in this function [-Wmaybe-uninitialized]

Changing the 'if (!error)' to 'if (!IS_ERR_VALUE(error))' is sufficient to let gcc understand that this is exactly the same condition as in IS_ERR() so it can optimize the code path enough to understand it. Signed-off-by: Arnd Bergmann Signed-off-by: Bob Peterson Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index ad8a5b757cc7..a443c6e54412 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -760,7 +760,7 @@ static int get_first_leaf(struct gfs2_inode *dip, u32 index, int error; error = get_leaf_nr(dip, index, &leaf_no); - if (!error) + if (!IS_ERR_VALUE(error)) error = get_leaf(dip, leaf_no, bh_out); return error; @@ -976,7 +976,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) index = name->hash >> (32 - dip->i_depth); error = get_leaf_nr(dip, index, &leaf_no); - if (error) + if (IS_ERR_VALUE(error)) return error; /* Get the old leaf block */ -- cgit v1.2.3
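Why the IS_ERR_VALUE() form helps can be seen in a standalone sketch; the lookup_leaf() helper and the userspace IS_ERR_VALUE copy below are assumptions mirroring the kernel's error-range contract, not the gfs2 code:

	#include <errno.h>
	#include <stdio.h>

	/* Userspace copy of the kernel's range check (assumes 4095 errnos). */
	#define MAX_ERRNO 4095
	#define IS_ERR_VALUE(x) ((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

	/* Hypothetical helper mirroring get_leaf_nr()'s contract: returns 0
	 * or a negative errno, and writes *leaf_no only on success. */
	static int lookup_leaf(unsigned int index, unsigned long *leaf_no)
	{
		if (index > 100)
			return -ENOENT;   /* error: *leaf_no is left unwritten */
		*leaf_no = index * 2;
		return 0;
	}

	int main(void)
	{
		unsigned long leaf_no;
		int error = lookup_leaf(7, &leaf_no);

		/* The explicit range check exposes "error is a -errno" to the
		 * compiler in the same shape IS_ERR() uses, so it can prove
		 * leaf_no is only read after it was written. */
		if (IS_ERR_VALUE(error))
			return 1;
		printf("leaf %lu\n", leaf_no);
		return 0;
	}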
From abc025d1e88a47c24a0f4411d851c1e9c3e0e87d Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Thu, 1 Sep 2016 16:22:16 +0200 Subject: tipc: fix random link resets while adding a second bearer commit d2f394dc4816b7bd1b44981d83509f18f19c53f0 upstream. In a dual bearer configuration, if the second tipc link becomes active while the first link still has pending nametable "bulk" updates, it randomly leads to reset of the second link. When a link is established, the function named_distribute() fills the skb based on node mtu (allows room for TUNNEL_PROTOCOL) with a NAME_DISTRIBUTOR message for each PUBLICATION. However, the function named_distribute() allocates the buffer by increasing the node mtu by INT_H_SIZE (to insert NAME_DISTRIBUTOR). This consumes the space allocated for TUNNEL_PROTOCOL. When establishing the second link, the link shall tunnel all the messages in the first link queue including the "bulk" update. As the size of the NAME_DISTRIBUTOR messages, while tunnelling, exceeds the link mtu, the transmission fails (-EMSGSIZE). Thus, the synch point based on the message count of the tunnel packets is never reached, leading to link timeout. In this commit, we adjust the size of name distributor messages so that they can be tunnelled. Reviewed-by: Jon Maloy Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tipc/name_distr.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 18f8152888f4..c4c151bc000c 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -62,6 +62,8 @@ static void publ_to_item(struct distr_item *i, struct publication *p) /** * named_prepare_buf - allocate & initialize a publication message + * + * The buffer returned is of size INT_H_SIZE + payload size */ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size, u32 dest) @@ -166,9 +168,9 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, struct publication *publ; struct sk_buff *skb = NULL; struct distr_item *item = NULL; - uint msg_dsz = (tipc_node_get_mtu(net, dnode, 0) / ITEM_SIZE) * - ITEM_SIZE; - uint msg_rem = msg_dsz; + u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0) - INT_H_SIZE) / + ITEM_SIZE) * ITEM_SIZE; + u32 msg_rem = msg_dsz; list_for_each_entry(publ, pls, local_list) { /* Prepare next buffer: */ -- cgit v1.2.3
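The sizing change above is simple arithmetic; here is a hedged standalone check with made-up constants (the ITEM_SIZE and INT_H_SIZE values are assumptions for illustration, not the TIPC ones):

	#include <stdio.h>

	#define ITEM_SIZE  20   /* assumed size of one distr_item */
	#define INT_H_SIZE 40   /* assumed internal header size */

	int main(void)
	{
		unsigned int mtu = 1500;

		/* Old sizing: fill right up to the MTU, leaving no room for
		 * the INT_H_SIZE header that tunnelling prepends. */
		unsigned int old_dsz = (mtu / ITEM_SIZE) * ITEM_SIZE;

		/* New sizing: reserve the header first, then round down. */
		unsigned int new_dsz = ((mtu - INT_H_SIZE) / ITEM_SIZE) * ITEM_SIZE;

		printf("old %u + hdr %u = %u (exceeds mtu %u)\n",
		       old_dsz, INT_H_SIZE, old_dsz + INT_H_SIZE, mtu);
		printf("new %u + hdr %u = %u (fits)\n",
		       new_dsz, INT_H_SIZE, new_dsz + INT_H_SIZE);
		return 0;
	}

With these sample numbers the old formula yields 1500 payload bytes plus a 40-byte header, which no longer fits a 1500-byte MTU once tunnelled; the new formula rounds down from mtu - INT_H_SIZE, so header plus payload always fits.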
From 59e0cd110fb9fb9aa97bb59c57789adb0e82da8d Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 17 Jun 2016 06:35:57 -0400 Subject: tipc: fix socket timer deadlock commit f1d048f24e66ba85d3dabf3d076cefa5f2b546b0 upstream. We sometimes observe a 'deadly embrace' type deadlock occurring between mutually connected sockets on the same node. This happens when the one-hour peer supervision timers happen to expire simultaneously in both sockets. The scenario is as follows:

CPU 1:                          CPU 2:
--------                        --------
tipc_sk_timeout(sk1)            tipc_sk_timeout(sk2)
lock(sk1.slock)                 lock(sk2.slock)
msg_create(probe)               msg_create(probe)
unlock(sk1.slock)               unlock(sk2.slock)
tipc_node_xmit_skb()            tipc_node_xmit_skb()
tipc_node_xmit()                tipc_node_xmit()
tipc_sk_rcv(sk2)                tipc_sk_rcv(sk1)
lock(sk2.slock)                 lock(sk1.slock)
filter_rcv()                    filter_rcv()
tipc_sk_proto_rcv()             tipc_sk_proto_rcv()
msg_create(probe_rsp)           msg_create(probe_rsp)
tipc_sk_respond()               tipc_sk_respond()
tipc_node_xmit_skb()            tipc_node_xmit_skb()
tipc_node_xmit()                tipc_node_xmit()
tipc_sk_rcv(sk1)                tipc_sk_rcv(sk2)
lock(sk1.slock)                 lock(sk2.slock)
===> DEADLOCK                   ===> DEADLOCK

Further analysis reveals that there are three different locations in the socket code where tipc_sk_respond() is called within the context of the socket lock, with ensuing risk of similar deadlocks. We now solve this by passing a buffer queue along with all upcalls where sk_lock.slock may potentially be held. Response or rejected message buffers are accumulated into this queue instead of being sent out directly, and only sent once we know we are safely outside the slock context. Reported-by: GUNA Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tipc/socket.c | 54 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index d119291db852..65171f8e8c45 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -777,9 +777,11 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, * @tsk: receiving socket * @skb: pointer to message buffer. */ -static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb) +static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, + struct sk_buff_head *xmitq) { struct sock *sk = &tsk->sk; + u32 onode = tsk_own_node(tsk); struct tipc_msg *hdr = buf_msg(skb); int mtyp = msg_type(hdr); int conn_cong; @@ -792,7 +794,8 @@ static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb) if (mtyp == CONN_PROBE) { msg_set_type(hdr, CONN_PROBE_REPLY); - tipc_sk_respond(sk, skb, TIPC_OK); + if (tipc_msg_reverse(onode, &skb, TIPC_OK)) + __skb_queue_tail(xmitq, skb); return; } else if (mtyp == CONN_ACK) { conn_cong = tsk_conn_cong(tsk); @@ -1647,7 +1650,8 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf) * * Returns true if message was added to socket receive queue, otherwise false */ -static bool filter_rcv(struct sock *sk, struct sk_buff *skb) +static bool filter_rcv(struct sock *sk, struct sk_buff *skb, + struct sk_buff_head *xmitq) { struct socket *sock = sk->sk_socket; struct tipc_sock *tsk = tipc_sk(sk); @@ -1657,7 +1661,7 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb) int usr = msg_user(hdr); if (unlikely(msg_user(hdr) == CONN_MANAGER)) { - tipc_sk_proto_rcv(tsk, skb); + tipc_sk_proto_rcv(tsk, skb, xmitq); return false; } @@ -1700,7 +1704,8 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb) return true; reject: - tipc_sk_respond(sk, skb, err); + if (tipc_msg_reverse(tsk_own_node(tsk), &skb, err)) + __skb_queue_tail(xmitq, skb); return false; } @@ -1716,9 +1721,24 @@ reject: static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb) { unsigned int truesize = skb->truesize; + struct sk_buff_head xmitq; + u32 dnode, selector; - if (likely(filter_rcv(sk, skb))) + __skb_queue_head_init(&xmitq); + + if (likely(filter_rcv(sk, skb, &xmitq))) { atomic_add(truesize, &tipc_sk(sk)->dupl_rcvcnt); + return 0; + } + + if (skb_queue_empty(&xmitq)) + return 0; + + /* Send response/rejected message */ + skb = __skb_dequeue(&xmitq); + dnode = msg_destnode(buf_msg(skb)); + selector = msg_origport(buf_msg(skb)); + tipc_node_xmit_skb(sock_net(sk), skb, dnode, selector); return 0; } @@ -1732,12 +1752,13 @@ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb) * Caller must hold socket lock */ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, - u32 dport) + u32 dport, struct sk_buff_head *xmitq) { + unsigned long time_limit = jiffies + 2; + struct sk_buff *skb; unsigned int lim; atomic_t *dcnt; - struct sk_buff *skb; - unsigned long time_limit = jiffies + 2; + u32 onode; while (skb_queue_len(inputq)) { if (unlikely(time_after_eq(jiffies, time_limit))) @@ -1749,7 +1770,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, /* Add message directly to receive queue if possible */ if (!sock_owned_by_user(sk)) { - filter_rcv(sk, skb); + filter_rcv(sk, skb, xmitq); continue; } @@ -1762,7 +1783,9 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, continue; /* Overload => reject message back to sender */ - tipc_sk_respond(sk, skb, TIPC_ERR_OVERLOAD); + onode = tipc_own_addr(sock_net(sk)); + if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD)) + __skb_queue_tail(xmitq, skb); break; } } @@ -1775,12 +1798,14 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, */ void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) { + struct sk_buff_head xmitq; u32 dnode, dport = 0; int err; struct tipc_sock *tsk; struct sock *sk; struct sk_buff *skb; + __skb_queue_head_init(&xmitq); while (skb_queue_len(inputq)) { dport = tipc_skb_peek_port(inputq, dport); tsk = tipc_sk_lookup(net, dport); @@ -1788,9 +1813,14 @@ void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) if (likely(tsk)) { sk = &tsk->sk; if (likely(spin_trylock_bh(&sk->sk_lock.slock))) { - tipc_sk_enqueue(inputq, sk, dport); + tipc_sk_enqueue(inputq, sk, dport, &xmitq); spin_unlock_bh(&sk->sk_lock.slock); } + /* Send pending response/rejected messages, if any */ + while ((skb = __skb_dequeue(&xmitq))) { + dnode = msg_destnode(buf_msg(skb)); + tipc_node_xmit_skb(net, skb, dnode, dport); + } sock_put(sk); continue; } -- cgit v1.2.3
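The shape of the fix, gathering transmissions under the lock and sending them only after the lock is dropped, can be sketched as follows; all names here are illustrative, not the TIPC implementation:

	#include <stdio.h>

	/* Hedged userspace sketch: work done while "locked" only queues
	 * outgoing messages, and the queue is drained after "unlock", so
	 * the transmit path can never try to take a second socket lock
	 * while the first one is still held. */
	enum { QMAX = 8 };
	struct queue { int msg[QMAX]; int len; };

	static void proto_rcv_locked(struct queue *xmitq)
	{
		/* ...would previously have transmitted directly here... */
		if (xmitq->len < QMAX)
			xmitq->msg[xmitq->len++] = 42; /* defer the probe reply */
	}

	int main(void)
	{
		struct queue xmitq = { {0}, 0 };
		int i;

		/* spin_lock_bh(&sk->sk_lock.slock); */
		proto_rcv_locked(&xmitq);
		/* spin_unlock_bh(&sk->sk_lock.slock); */

		for (i = 0; i < xmitq.len; i++) /* safely outside the lock */
			printf("xmit %d\n", xmitq.msg[i]);
		return 0;
	}

Deferring the send breaks the lock cycle in the trace above: neither CPU ever calls into tipc_sk_rcv() for the peer socket while still holding its own slock.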
From c50fd34e10897114a7be2120133bd7e0b4184024 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 28 Sep 2016 00:27:17 -0500 Subject: mnt: Add a per mount namespace limit on the number of mounts commit d29216842a85c7970c536108e093963f02714498 upstream. CAI Qian pointed out that the semantics of shared subtrees make it possible to create an exponentially increasing number of mounts in a mount namespace.

mkdir /tmp/1 /tmp/2
mount --make-rshared /
for i in $(seq 1 20) ; do mount --bind /tmp/1 /tmp/2 ; done

This will create 2^20 or 1048576 mounts, which is a practical problem as some people have managed to hit this by accident. As such CVE-2016-6213 was assigned. Ian Kent described the situation for autofs users as follows:

> The number of mounts for direct mount maps is usually not very large because of
> the way they are implemented, large direct mount maps can have performance
> problems. There can be anywhere from a few (likely case a few hundred) to less
> than 10000, plus mounts that have been triggered and not yet expired.
>
> Indirect mounts have one autofs mount at the root plus the number of mounts that
> have been triggered and not yet expired.
>
> The number of autofs indirect map entries can range from a few to the common
> case of several thousand and in rare cases up to between 30000 and 50000. I've
> not heard of people with maps larger than 50000 entries.
>
> The larger the number of map entries the greater the possibility for a large
> number of active mounts so it's not hard to expect cases of a 1000 or somewhat
> more active mounts.

So I am setting the default number of mounts allowed per mount namespace at 100,000. This is more than enough for any use case I know of, but small enough to quickly stop an exponential increase in mounts. Which should be perfect to catch misconfigurations and malfunctioning programs. For anyone who needs a higher limit this can be changed by writing to the new /proc/sys/fs/mount-max sysctl. Tested-by: CAI Qian Signed-off-by: "Eric W. Biederman" [bwh: Backported to 4.4: adjust context] Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- Documentation/sysctl/fs.txt | 7 +++++++ fs/mount.h | 2 ++ fs/namespace.c | 50 ++++++++++++++++++++++++++++++++++++++++++++- fs/pnode.c | 2 +- fs/pnode.h | 1 + include/linux/mount.h | 2 ++ kernel/sysctl.c | 9 ++++++++ 7 files changed, 71 insertions(+), 2 deletions(-) diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt index 302b5ed616a6..35e17f748ca7 100644 --- a/Documentation/sysctl/fs.txt +++ b/Documentation/sysctl/fs.txt @@ -265,6 +265,13 @@ aio-nr can grow to. ============================================================== +mount-max: + +This denotes the maximum number of mounts that may exist +in a mount namespace. + +============================================================== + 2.
/proc/sys/fs/binfmt_misc ---------------------------------------------------------- diff --git a/fs/mount.h b/fs/mount.h index 3dc7dea5a357..13a4ebbbaa74 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -13,6 +13,8 @@ struct mnt_namespace { u64 seq; /* Sequence number to prevent loops */ wait_queue_head_t poll; u64 event; + unsigned int mounts; /* # of mounts in the namespace */ + unsigned int pending_mounts; }; struct mnt_pcp { diff --git a/fs/namespace.c b/fs/namespace.c index 7df3d406d3e0..f26d18d69712 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -27,6 +27,9 @@ #include "pnode.h" #include "internal.h" +/* Maximum number of mounts in a mount namespace */ +unsigned int sysctl_mount_max __read_mostly = 100000; + static unsigned int m_hash_mask __read_mostly; static unsigned int m_hash_shift __read_mostly; static unsigned int mp_hash_mask __read_mostly; @@ -925,6 +928,9 @@ static void commit_tree(struct mount *mnt) list_splice(&head, n->list.prev); + n->mounts += n->pending_mounts; + n->pending_mounts = 0; + __attach_mnt(mnt, parent); touch_mnt_namespace(n); } @@ -1445,11 +1451,16 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) propagate_umount(&tmp_list); while (!list_empty(&tmp_list)) { + struct mnt_namespace *ns; bool disconnect; p = list_first_entry(&tmp_list, struct mount, mnt_list); list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); - __touch_mnt_namespace(p->mnt_ns); + ns = p->mnt_ns; + if (ns) { + ns->mounts--; + __touch_mnt_namespace(ns); + } p->mnt_ns = NULL; if (how & UMOUNT_SYNC) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; @@ -1850,6 +1861,28 @@ static int invent_group_ids(struct mount *mnt, bool recurse) return 0; } +int count_mounts(struct mnt_namespace *ns, struct mount *mnt) +{ + unsigned int max = READ_ONCE(sysctl_mount_max); + unsigned int mounts = 0, old, pending, sum; + struct mount *p; + + for (p = mnt; p; p = next_mnt(p, mnt)) + mounts++; + + old = ns->mounts; + pending = ns->pending_mounts; + sum = old + pending; + if ((old > sum) || + (pending > sum) || + (max < sum) || + (mounts > (max - sum))) + return -ENOSPC; + + ns->pending_mounts = pending + mounts; + return 0; +} + /* * @source_mnt : mount tree to be attached * @nd : place the mount tree @source_mnt is attached @@ -1919,6 +1952,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, struct path *parent_path) { HLIST_HEAD(tree_list); + struct mnt_namespace *ns = dest_mnt->mnt_ns; struct mountpoint *smp; struct mount *child, *p; struct hlist_node *n; @@ -1931,6 +1965,13 @@ static int attach_recursive_mnt(struct mount *source_mnt, if (IS_ERR(smp)) return PTR_ERR(smp); + /* Is there space to add these mounts to the mount namespace? 
*/ + if (!parent_path) { + err = count_mounts(ns, source_mnt); + if (err) + goto out; + } + if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) @@ -1970,11 +2011,14 @@ static int attach_recursive_mnt(struct mount *source_mnt, out_cleanup_ids: while (!hlist_empty(&tree_list)) { child = hlist_entry(tree_list.first, struct mount, mnt_hash); + child->mnt_parent->mnt_ns->pending_mounts = 0; umount_tree(child, UMOUNT_SYNC); } unlock_mount_hash(); cleanup_group_ids(source_mnt, NULL); out: + ns->pending_mounts = 0; + read_seqlock_excl(&mount_lock); put_mountpoint(smp); read_sequnlock_excl(&mount_lock); @@ -2804,6 +2848,8 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) init_waitqueue_head(&new_ns->poll); new_ns->event = 0; new_ns->user_ns = get_user_ns(user_ns); + new_ns->mounts = 0; + new_ns->pending_mounts = 0; return new_ns; } @@ -2853,6 +2899,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, q = new; while (p) { q->mnt_ns = new_ns; + new_ns->mounts++; if (new_fs) { if (&p->mnt == new_fs->root.mnt) { new_fs->root.mnt = mntget(&q->mnt); @@ -2891,6 +2938,7 @@ static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) struct mount *mnt = real_mount(m); mnt->mnt_ns = new_ns; new_ns->root = mnt; + new_ns->mounts++; list_add(&mnt->mnt_list, &new_ns->list); } else { mntput(m); diff --git a/fs/pnode.c b/fs/pnode.c index b9f2af59b9a6..b394ca5307ec 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -259,7 +259,7 @@ static int propagate_one(struct mount *m) read_sequnlock_excl(&mount_lock); } hlist_add_head(&child->mnt_hash, list); - return 0; + return count_mounts(m->mnt_ns, child); } /* diff --git a/fs/pnode.h b/fs/pnode.h index 623f01772bec..dc87e65becd2 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -54,4 +54,5 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); +int count_mounts(struct mnt_namespace *ns, struct mount *mnt); #endif /* _LINUX_PNODE_H */ diff --git a/include/linux/mount.h b/include/linux/mount.h index f822c3c11377..dc6cd800cd5d 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -95,4 +95,6 @@ extern void mark_mounts_for_expiry(struct list_head *mounts); extern dev_t name_to_dev_t(const char *name); +extern unsigned int sysctl_mount_max; + #endif /* _LINUX_MOUNT_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2f0d157258a2..300d64162aff 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -1749,6 +1750,14 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, + { + .procname = "mount-max", + .data = &sysctl_mount_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, { } }; -- cgit v1.2.3 From 0d9dac5d7cc31df50757f26bcbdfbcf47277a1b2 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 28 Jan 2016 09:22:44 -0200 Subject: xc2028: avoid use after free commit 8dfbcc4351a0b6d2f2d77f367552f48ffefafe18 upstream. 
If struct xc2028_config is passed without a firmware name, the following trouble may happen: [11009.907205] xc2028 5-0061: type set to XCeive xc2028/xc3028 tuner [11009.907491] ================================================================== [11009.907750] BUG: KASAN: use-after-free in strcmp+0x96/0xb0 at addr ffff8803bd78ab40 [11009.907992] Read of size 1 by task modprobe/28992 [11009.907994] ============================================================================= [11009.907997] BUG kmalloc-16 (Tainted: G W ): kasan: bad access detected [11009.907999] ----------------------------------------------------------------------------- [11009.908008] INFO: Allocated in xhci_urb_enqueue+0x214/0x14c0 [xhci_hcd] age=0 cpu=3 pid=28992 [11009.908012] ___slab_alloc+0x581/0x5b0 [11009.908014] __slab_alloc+0x51/0x90 [11009.908017] __kmalloc+0x27b/0x350 [11009.908022] xhci_urb_enqueue+0x214/0x14c0 [xhci_hcd] [11009.908026] usb_hcd_submit_urb+0x1e8/0x1c60 [11009.908029] usb_submit_urb+0xb0e/0x1200 [11009.908032] usb_serial_generic_write_start+0xb6/0x4c0 [11009.908035] usb_serial_generic_write+0x92/0xc0 [11009.908039] usb_console_write+0x38a/0x560 [11009.908045] call_console_drivers.constprop.14+0x1ee/0x2c0 [11009.908051] console_unlock+0x40d/0x900 [11009.908056] vprintk_emit+0x4b4/0x830 [11009.908061] vprintk_default+0x1f/0x30 [11009.908064] printk+0x99/0xb5 [11009.908067] kasan_report_error+0x10a/0x550 [11009.908070] __asan_report_load1_noabort+0x43/0x50 [11009.908074] INFO: Freed in xc2028_set_config+0x90/0x630 [tuner_xc2028] age=1 cpu=3 pid=28992 [11009.908077] __slab_free+0x2ec/0x460 [11009.908080] kfree+0x266/0x280 [11009.908083] xc2028_set_config+0x90/0x630 [tuner_xc2028] [11009.908086] xc2028_attach+0x310/0x8a0 [tuner_xc2028] [11009.908090] em28xx_attach_xc3028.constprop.7+0x1f9/0x30d [em28xx_dvb] [11009.908094] em28xx_dvb_init.part.3+0x8e4/0x5cf4 [em28xx_dvb] [11009.908098] em28xx_dvb_init+0x81/0x8a [em28xx_dvb] [11009.908101] em28xx_register_extension+0xd9/0x190 [em28xx] [11009.908105] em28xx_dvb_register+0x10/0x1000 [em28xx_dvb] [11009.908108] do_one_initcall+0x141/0x300 [11009.908111] do_init_module+0x1d0/0x5ad [11009.908114] load_module+0x6666/0x9ba0 [11009.908117] SyS_finit_module+0x108/0x130 [11009.908120] entry_SYSCALL_64_fastpath+0x16/0x76 [11009.908123] INFO: Slab 0xffffea000ef5e280 objects=25 used=25 fp=0x (null) flags=0x2ffff8000004080 [11009.908126] INFO: Object 0xffff8803bd78ab40 @offset=2880 fp=0x0000000000000001 [11009.908130] Bytes b4 ffff8803bd78ab30: 01 00 00 00 2a 07 00 00 9d 28 00 00 01 00 00 00 ....*....(...... [11009.908133] Object ffff8803bd78ab40: 01 00 00 00 00 00 00 00 b0 1d c3 6a 00 88 ff ff ...........j.... [11009.908137] CPU: 3 PID: 28992 Comm: modprobe Tainted: G B W 4.5.0-rc1+ #43 [11009.908140] Hardware name: /NUC5i7RYB, BIOS RYBDWi35.86A.0350.2015.0812.1722 08/12/2015 [11009.908142] ffff8803bd78a000 ffff8802c273f1b8 ffffffff81932007 ffff8803c6407a80 [11009.908148] ffff8802c273f1e8 ffffffff81556759 ffff8803c6407a80 ffffea000ef5e280 [11009.908153] ffff8803bd78ab40 dffffc0000000000 ffff8802c273f210 ffffffff8155ccb4 [11009.908158] Call Trace: [11009.908162] [] dump_stack+0x4b/0x64 [11009.908165] [] print_trailer+0xf9/0x150 [11009.908168] [] object_err+0x34/0x40 [11009.908171] [] kasan_report_error+0x230/0x550 [11009.908175] [] ? trace_hardirqs_off_caller+0x21/0x290 [11009.908179] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908182] [] __asan_report_load1_noabort+0x43/0x50 [11009.908185] [] ? __asan_register_globals+0x50/0xa0 [11009.908189] [] ? 
strcmp+0x96/0xb0 [11009.908192] [] strcmp+0x96/0xb0 [11009.908196] [] xc2028_set_config+0x15c/0x630 [tuner_xc2028] [11009.908200] [] xc2028_attach+0x310/0x8a0 [tuner_xc2028] [11009.908203] [] ? memset+0x28/0x30 [11009.908206] [] ? xc2028_set_config+0x630/0x630 [tuner_xc2028] [11009.908211] [] em28xx_attach_xc3028.constprop.7+0x1f9/0x30d [em28xx_dvb] [11009.908215] [] ? em28xx_dvb_init.part.3+0x37c/0x5cf4 [em28xx_dvb] [11009.908219] [] ? hauppauge_hvr930c_init+0x487/0x487 [em28xx_dvb] [11009.908222] [] ? lgdt330x_attach+0x1cc/0x370 [lgdt330x] [11009.908226] [] ? i2c_read_demod_bytes.isra.2+0x210/0x210 [lgdt330x] [11009.908230] [] ? ref_module.part.15+0x10/0x10 [11009.908233] [] ? module_assert_mutex_or_preempt+0x80/0x80 [11009.908238] [] em28xx_dvb_init.part.3+0x8e4/0x5cf4 [em28xx_dvb] [11009.908242] [] ? em28xx_attach_xc3028.constprop.7+0x30d/0x30d [em28xx_dvb] [11009.908245] [] ? string+0x14d/0x1f0 [11009.908249] [] ? symbol_string+0xff/0x1a0 [11009.908253] [] ? uuid_string+0x6f0/0x6f0 [11009.908257] [] ? __kernel_text_address+0x7e/0xa0 [11009.908260] [] ? print_context_stack+0x7f/0xf0 [11009.908264] [] ? __module_address+0xb6/0x360 [11009.908268] [] ? is_ftrace_trampoline+0x99/0xe0 [11009.908271] [] ? __kernel_text_address+0x7e/0xa0 [11009.908275] [] ? debug_check_no_locks_freed+0x290/0x290 [11009.908278] [] ? dump_trace+0x11b/0x300 [11009.908282] [] ? em28xx_register_extension+0x23/0x190 [em28xx] [11009.908285] [] ? trace_hardirqs_off_caller+0x21/0x290 [11009.908289] [] ? trace_hardirqs_on_caller+0x16/0x590 [11009.908292] [] ? trace_hardirqs_on+0xd/0x10 [11009.908296] [] ? em28xx_register_extension+0x23/0x190 [em28xx] [11009.908299] [] ? mutex_trylock+0x400/0x400 [11009.908302] [] ? do_one_initcall+0x131/0x300 [11009.908306] [] ? call_rcu_sched+0x17/0x20 [11009.908309] [] ? put_object+0x48/0x70 [11009.908314] [] em28xx_dvb_init+0x81/0x8a [em28xx_dvb] [11009.908317] [] em28xx_register_extension+0xd9/0x190 [em28xx] [11009.908320] [] ? 0xffffffffa0150000 [11009.908324] [] em28xx_dvb_register+0x10/0x1000 [em28xx_dvb] [11009.908327] [] do_one_initcall+0x141/0x300 [11009.908330] [] ? try_to_run_init_process+0x40/0x40 [11009.908333] [] ? trace_hardirqs_on_caller+0x16/0x590 [11009.908337] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908340] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908343] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908346] [] ? __asan_register_globals+0x87/0xa0 [11009.908350] [] do_init_module+0x1d0/0x5ad [11009.908353] [] load_module+0x6666/0x9ba0 [11009.908356] [] ? symbol_put_addr+0x50/0x50 [11009.908361] [] ? em28xx_dvb_init.part.3+0x5989/0x5cf4 [em28xx_dvb] [11009.908366] [] ? module_frob_arch_sections+0x20/0x20 [11009.908369] [] ? open_exec+0x50/0x50 [11009.908374] [] ? ns_capable+0x5b/0xd0 [11009.908377] [] SyS_finit_module+0x108/0x130 [11009.908379] [] ? SyS_init_module+0x1f0/0x1f0 [11009.908383] [] ? 
lockdep_sys_exit_thunk+0x12/0x14 [11009.908394] [] entry_SYSCALL_64_fastpath+0x16/0x76 [11009.908396] Memory state around the buggy address: [11009.908398] ffff8803bd78aa00: 00 00 fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908401] ffff8803bd78aa80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908403] >ffff8803bd78ab00: fc fc fc fc fc fc fc fc 00 00 fc fc fc fc fc fc [11009.908405] ^ [11009.908407] ffff8803bd78ab80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908409] ffff8803bd78ac00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908411] ================================================================== In order to avoid it, let's set the cached value of the firmware name to NULL after freeing it. While here, return an error if the memory allocation fails. Signed-off-by: Mauro Carvalho Chehab Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- drivers/media/tuners/tuner-xc2028.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/tuners/tuner-xc2028.c b/drivers/media/tuners/tuner-xc2028.c index 4e941f00b600..082ff5608455 100644 --- a/drivers/media/tuners/tuner-xc2028.c +++ b/drivers/media/tuners/tuner-xc2028.c @@ -1403,11 +1403,12 @@ static int xc2028_set_config(struct dvb_frontend *fe, void *priv_cfg) * in order to avoid troubles during device release. */ kfree(priv->ctrl.fname); + priv->ctrl.fname = NULL; memcpy(&priv->ctrl, p, sizeof(priv->ctrl)); if (p->fname) { priv->ctrl.fname = kstrdup(p->fname, GFP_KERNEL); if (priv->ctrl.fname == NULL) - rc = -ENOMEM; + return -ENOMEM; } /* -- cgit v1.2.3 From 9540baadb61ba5ed08832bb2a4cbfd876db37ff4 Mon Sep 17 00:00:00 2001 From: Phil Turnbull Date: Tue, 2 Feb 2016 13:36:45 -0500 Subject: netfilter: nfnetlink: correctly validate length of batch messages commit c58d6c93680f28ac58984af61d0a7ebf4319c241 upstream. If nlh->nlmsg_len is zero then an infinite loop is triggered because 'skb_pull(skb, msglen);' pulls zero bytes. The calculation in nlmsg_len() underflows if 'nlh->nlmsg_len < NLMSG_HDRLEN' which bypasses the length validation and will later trigger an out-of-bound read. If the length validation does fail then the malformed batch message is copied back to userspace. However, we cannot do this because the nlh->nlmsg_len can be invalid. This leads to an out-of-bounds read in netlink_ack: [ 41.455421] ================================================================== [ 41.456431] BUG: KASAN: slab-out-of-bounds in memcpy+0x1d/0x40 at addr ffff880119e79340 [ 41.456431] Read of size 4294967280 by task a.out/987 [ 41.456431] ============================================================================= [ 41.456431] BUG kmalloc-512 (Not tainted): kasan: bad access detected [ 41.456431] ----------------------------------------------------------------------------- ... [ 41.456431] Bytes b4 ffff880119e79310: 00 00 00 00 d5 03 00 00 b0 fb fe ff 00 00 00 00 ................ [ 41.456431] Object ffff880119e79320: 20 00 00 00 10 00 05 00 00 00 00 00 00 00 00 00 ............... [ 41.456431] Object ffff880119e79330: 14 00 0a 00 01 03 fc 40 45 56 11 22 33 10 00 05 .......@EV."3... [ 41.456431] Object ffff880119e79340: f0 ff ff ff 88 99 aa bb 00 14 00 0a 00 06 fe fb ................ ^^ start of batch nlmsg with nlmsg_len=4294967280 ... 
[ 41.456431] Memory state around the buggy address: [ 41.456431] ffff880119e79400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 41.456431] ffff880119e79480: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 41.456431] >ffff880119e79500: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc [ 41.456431] ^ [ 41.456431] ffff880119e79580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 41.456431] ffff880119e79600: fc fc fc fc fc fc fc fc fc fc fb fb fb fb fb fb [ 41.456431] ================================================================== Fix this with better validation of nlh->nlmsg_len and by setting NFNL_BATCH_FAILURE if any batch message fails length validation. CAP_NET_ADMIN is required to trigger the bugs. Fixes: 9ea2aa8b7dba ("netfilter: nfnetlink: validate nfnetlink header from batch") Signed-off-by: Phil Turnbull Signed-off-by: Pablo Neira Ayuso Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nfnetlink.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 77afe913d03d..9adedba78eea 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -326,10 +326,12 @@ replay: nlh = nlmsg_hdr(skb); err = 0; - if (nlmsg_len(nlh) < sizeof(struct nfgenmsg) || - skb->len < nlh->nlmsg_len) { - err = -EINVAL; - goto ack; + if (nlh->nlmsg_len < NLMSG_HDRLEN || + skb->len < nlh->nlmsg_len || + nlmsg_len(nlh) < sizeof(struct nfgenmsg)) { + nfnl_err_reset(&err_list); + status |= NFNL_BATCH_FAILURE; + goto done; } /* Only requests are handled by the kernel */ -- cgit v1.2.3 From 65d30f7545ffdddcf10a59f3e54b032c5ade2e9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= Date: Fri, 2 Dec 2016 09:33:41 +0100 Subject: tipc: check minimum bearer MTU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3de81b758853f0b29c61e246679d20b513c4cfec upstream. Qian Zhang (张谦) reported a potential socket buffer overflow in tipc_msg_build() which is also known as CVE-2016-8632: due to insufficient checks, a buffer overflow can occur if MTU is too short for even tipc headers. As anyone can set device MTU in a user/net namespace, this issue can be abused by a regular user. As agreed in the discussion on Ben Hutchings' original patch, we should check the MTU at the moment a bearer is attached rather than for each processed packet. We also need to repeat the check when bearer MTU is adjusted to new device MTU. UDP case also needs a check to avoid overflow when calculating bearer MTU. Fixes: b97bf3fd8f6a ("[TIPC] Initial merge") Signed-off-by: Michal Kubecek Reported-by: Qian Zhang (张谦) Acked-by: Ying Xue Signed-off-by: David S. 
Miller [bwh: Backported to 4.4: - Adjust context - NETDEV_GOING_DOWN and NETDEV_CHANGEMTU cases in net notifier were combined] Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- net/tipc/bearer.c | 13 +++++++++++-- net/tipc/bearer.h | 13 +++++++++++++ net/tipc/udp_media.c | 5 +++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 648f2a67f314..cb1381513c82 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -381,6 +381,10 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, dev = dev_get_by_name(net, driver_name); if (!dev) return -ENODEV; + if (tipc_mtu_bad(dev, 0)) { + dev_put(dev); + return -EINVAL; + } /* Associate TIPC bearer with L2 bearer */ rcu_assign_pointer(b->media_ptr, dev); @@ -570,14 +574,19 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, if (!b_ptr) return NOTIFY_DONE; - b_ptr->mtu = dev->mtu; - switch (evt) { case NETDEV_CHANGE: if (netif_carrier_ok(dev)) break; case NETDEV_GOING_DOWN: + tipc_reset_bearer(net, b_ptr); + break; case NETDEV_CHANGEMTU: + if (tipc_mtu_bad(dev, 0)) { + bearer_disable(net, b_ptr); + break; + } + b_ptr->mtu = dev->mtu; tipc_reset_bearer(net, b_ptr); break; case NETDEV_CHANGEADDR: diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 552185bc4773..5f11e18b1fa1 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -39,6 +39,7 @@ #include "netlink.h" #include "core.h" +#include "msg.h" #include #define MAX_MEDIA 3 @@ -61,6 +62,9 @@ #define TIPC_MEDIA_TYPE_IB 2 #define TIPC_MEDIA_TYPE_UDP 3 +/* minimum bearer MTU */ +#define TIPC_MIN_BEARER_MTU (MAX_H_SIZE + INT_H_SIZE) + /** * struct tipc_node_map - set of node identifiers * @count: # of nodes in set @@ -226,4 +230,13 @@ void tipc_bearer_xmit(struct net *net, u32 bearer_id, void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq); +/* check if device MTU is too low for tipc headers */ +static inline bool tipc_mtu_bad(struct net_device *dev, unsigned int reserve) +{ + if (dev->mtu >= TIPC_MIN_BEARER_MTU + reserve) + return false; + netdev_warn(dev, "MTU too low for tipc bearer\n"); + return true; +} + #endif /* _TIPC_BEARER_H */ diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 4056798c54a5..78d6b78de29d 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -376,6 +376,11 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, udp_conf.local_ip.s_addr = htonl(INADDR_ANY); udp_conf.use_udp_checksums = false; ub->ifindex = dev->ifindex; + if (tipc_mtu_bad(dev, sizeof(struct iphdr) + + sizeof(struct udphdr))) { + err = -EINVAL; + goto err; + } b->mtu = dev->mtu - sizeof(struct iphdr) - sizeof(struct udphdr); #if IS_ENABLED(CONFIG_IPV6) -- cgit v1.2.3 From d23ef85b123d3dbd3ba8a3c5f0ef5e556feb635e Mon Sep 17 00:00:00 2001 From: Vlad Tsyrklevich Date: Wed, 12 Oct 2016 18:51:24 +0200 Subject: vfio/pci: Fix integer overflows, bitmask check commit 05692d7005a364add85c6e25a6c4447ce08f913a upstream. The VFIO_DEVICE_SET_IRQS ioctl did not sufficiently sanitize user-supplied integers, potentially allowing memory corruption. This patch adds appropriate integer overflow checks, checks the range bounds for VFIO_IRQ_SET_DATA_NONE, and also verifies that only single element in the VFIO_IRQ_SET_DATA_TYPE_MASK bitmask is set. VFIO_IRQ_SET_ACTION_TYPE_MASK is already correctly checked later in vfio_pci_set_irqs_ioctl(). 
Furthermore, a kzalloc is changed to a kcalloc because the use of a kzalloc with an integer multiplication allowed an integer overflow condition to be reached without this patch. kcalloc checks for overflow and should prevent a similar occurrence. Signed-off-by: Vlad Tsyrklevich Signed-off-by: Alex Williamson Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- drivers/vfio/pci/vfio_pci.c | 33 +++++++++++++++++++------------ drivers/vfio/pci/vfio_pci_intrs.c | 2 +- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 9982cb176ce8..830e2fd47642 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -562,8 +562,9 @@ static long vfio_pci_ioctl(void *device_data, } else if (cmd == VFIO_DEVICE_SET_IRQS) { struct vfio_irq_set hdr; + size_t size; u8 *data = NULL; - int ret = 0; + int max, ret = 0; minsz = offsetofend(struct vfio_irq_set, count); @@ -571,23 +572,31 @@ static long vfio_pci_ioctl(void *device_data, return -EFAULT; if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS || + hdr.count >= (U32_MAX - hdr.start) || hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | VFIO_IRQ_SET_ACTION_TYPE_MASK)) return -EINVAL; - if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) { - size_t size; - int max = vfio_pci_get_irq_count(vdev, hdr.index); + max = vfio_pci_get_irq_count(vdev, hdr.index); + if (hdr.start >= max || hdr.start + hdr.count > max) + return -EINVAL; - if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL) - size = sizeof(uint8_t); - else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD) - size = sizeof(int32_t); - else - return -EINVAL; + switch (hdr.flags & VFIO_IRQ_SET_DATA_TYPE_MASK) { + case VFIO_IRQ_SET_DATA_NONE: + size = 0; + break; + case VFIO_IRQ_SET_DATA_BOOL: + size = sizeof(uint8_t); + break; + case VFIO_IRQ_SET_DATA_EVENTFD: + size = sizeof(int32_t); + break; + default: + return -EINVAL; + } - if (hdr.argsz - minsz < hdr.count * size || - hdr.start >= max || hdr.start + hdr.count > max) + if (size) { + if (hdr.argsz - minsz < hdr.count * size) return -EINVAL; data = memdup_user((void __user *)(arg + minsz), diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 20e9a86d2dcf..5c8f767b6368 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -255,7 +255,7 @@ static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix) if (!is_irq_none(vdev)) return -EINVAL; - vdev->ctx = kzalloc(nvec * sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL); + vdev->ctx = kcalloc(nvec, sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL); if (!vdev->ctx) return -ENOMEM; -- cgit v1.2.3
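The arithmetic hazard closed by the new 'hdr.count >= (U32_MAX - hdr.start)' check can be demonstrated standalone; the values below are made up for illustration:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t start = 4, count = UINT32_MAX - 2, max = 8;

		/* start + count wraps around to 1, so the naive bound check
		 * accepts an absurdly large count. */
		if (start >= max || start + count > max)
			puts("rejected");
		else
			puts("accepted despite overflow"); /* this branch runs */

		/* The added check rules the wrap out before the range test. */
		if (count >= UINT32_MAX - start)
			puts("rejected by overflow check");
		return 0;
	}

Testing count against UINT32_MAX - start before any addition is the standard way to make the subsequent start + count comparison trustworthy on fixed-width integers.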
From a7544fdd1626b65db635022c9d36007bb32dd6d8 Mon Sep 17 00:00:00 2001 From: EunTaik Lee Date: Wed, 24 Feb 2016 04:38:06 +0000 Subject: staging/android/ion : fix a race condition in the ion driver commit 9590232bb4f4cc824f3425a6e1349afbe6d6d2b7 upstream. There is a use-after-free problem in the ion driver. This is caused by a race condition in the ion_ioctl() function. A handle has a ref count of 1 and two tasks on different cpus call ION_IOC_FREE simultaneously.

cpu 0                                   cpu 1
-------------------------------------------------------
ion_handle_get_by_id()
(ref == 2)
                                        ion_handle_get_by_id()
                                        (ref == 3)
ion_free()
(ref == 2)
ion_handle_put()
(ref == 1)
                                        ion_free()
                                        (ref == 0 so ion_handle_destroy()
                                        is called and the handle is freed.)
                                        ion_handle_put() is called and it
                                        decreases the slub's next free pointer

The problem is detected as an unaligned access in the spin lock functions, since they use a load-exclusive instruction. In some cases it corrupts the slub's free pointer, which causes a mis-aligned access to the next free pointer (kmalloc returns a pointer like ffffc0745b4580aa). And it causes lots of other hard-to-debug problems. This symptom occurs because the first member in the ion_handle structure is the reference count, and the ion driver decrements the reference after it has been freed. To fix this problem the client->lock mutex is extended to protect all the code that uses the handle. Signed-off-by: Eun Taik Lee Reviewed-by: Laura Abbott Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- drivers/staging/android/ion/ion.c | 55 ++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 13 deletions(-) diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index df560216d702..374f840f31a4 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -387,13 +387,22 @@ static void ion_handle_get(struct ion_handle *handle) kref_get(&handle->ref); } -static int ion_handle_put(struct ion_handle *handle) +static int ion_handle_put_nolock(struct ion_handle *handle) +{ + int ret; + + ret = kref_put(&handle->ref, ion_handle_destroy); + + return ret; +} + +int ion_handle_put(struct ion_handle *handle) { struct ion_client *client = handle->client; int ret; mutex_lock(&client->lock); - ret = kref_put(&handle->ref, ion_handle_destroy); + ret = ion_handle_put_nolock(handle); mutex_unlock(&client->lock); return ret; @@ -417,20 +426,30 @@ static struct ion_handle *ion_handle_lookup(struct ion_client *client, return ERR_PTR(-EINVAL); } -static struct ion_handle *ion_handle_get_by_id(struct ion_client *client, +static struct ion_handle *ion_handle_get_by_id_nolock(struct ion_client *client, int id) { struct ion_handle *handle; - mutex_lock(&client->lock); handle = idr_find(&client->idr, id); if (handle) ion_handle_get(handle); - mutex_unlock(&client->lock); return handle ?
handle : ERR_PTR(-EINVAL); } +struct ion_handle *ion_handle_get_by_id(struct ion_client *client, + int id) +{ + struct ion_handle *handle; + + mutex_lock(&client->lock); + handle = ion_handle_get_by_id_nolock(client, id); + mutex_unlock(&client->lock); + + return handle; +} + static bool ion_handle_validate(struct ion_client *client, struct ion_handle *handle) { @@ -532,22 +551,28 @@ struct ion_handle *ion_alloc(struct ion_client *client, size_t len, } EXPORT_SYMBOL(ion_alloc); -void ion_free(struct ion_client *client, struct ion_handle *handle) +static void ion_free_nolock(struct ion_client *client, struct ion_handle *handle) { bool valid_handle; BUG_ON(client != handle->client); - mutex_lock(&client->lock); valid_handle = ion_handle_validate(client, handle); if (!valid_handle) { WARN(1, "%s: invalid handle passed to free.\n", __func__); - mutex_unlock(&client->lock); return; } + ion_handle_put_nolock(handle); +} + +void ion_free(struct ion_client *client, struct ion_handle *handle) +{ + BUG_ON(client != handle->client); + + mutex_lock(&client->lock); + ion_free_nolock(client, handle); mutex_unlock(&client->lock); - ion_handle_put(handle); } EXPORT_SYMBOL(ion_free); @@ -1283,11 +1308,15 @@ static long ion_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct ion_handle *handle; - handle = ion_handle_get_by_id(client, data.handle.handle); - if (IS_ERR(handle)) + mutex_lock(&client->lock); + handle = ion_handle_get_by_id_nolock(client, data.handle.handle); + if (IS_ERR(handle)) { + mutex_unlock(&client->lock); return PTR_ERR(handle); - ion_free(client, handle); - ion_handle_put(handle); + } + ion_free_nolock(client, handle); + ion_handle_put_nolock(handle); + mutex_unlock(&client->lock); break; } case ION_IOC_SHARE: -- cgit v1.2.3 From b7f47c794bc45eae975bf2a52a4463333111bb2a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 24 Mar 2017 19:36:13 -0700 Subject: ping: implement proper locking commit 43a6684519ab0a6c52024b5e25322476cabad893 upstream. We got a report of yet another bug in ping http://www.openwall.com/lists/oss-security/2017/03/24/6 ->disconnect() is not called with socket lock held. Fix this by acquiring ping rwlock earlier. Thanks to Daniel, Alexander and Andrey for letting us know this problem. Fixes: c319b4d76b9e ("net: ipv4: add IPPROTO_ICMP socket kind") Signed-off-by: Eric Dumazet Reported-by: Daniel Jiang Reported-by: Solar Designer Reported-by: Andrey Konovalov Signed-off-by: David S. Miller Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ping.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 3a00512addbc..37a3b05d175c 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -154,17 +154,18 @@ void ping_hash(struct sock *sk) void ping_unhash(struct sock *sk) { struct inet_sock *isk = inet_sk(sk); + pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); + write_lock_bh(&ping_table.lock); if (sk_hashed(sk)) { - write_lock_bh(&ping_table.lock); hlist_nulls_del(&sk->sk_nulls_node); sk_nulls_node_init(&sk->sk_nulls_node); sock_put(sk); isk->inet_num = 0; isk->inet_sport = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); - write_unlock_bh(&ping_table.lock); } + write_unlock_bh(&ping_table.lock); } EXPORT_SYMBOL_GPL(ping_unhash); -- cgit v1.2.3 From 416bd4a366f3b4cd3f6a3246f91bd9f425891547 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Jan 2017 21:09:50 +0100 Subject: perf/core: Fix concurrent sys_perf_event_open() vs. 
'move_group' race commit 321027c1fe77f892f4ea07846aeae08cefbbb290 upstream. Di Shen reported a race between two concurrent sys_perf_event_open() calls where both try and move the same pre-existing software group into a hardware context. The problem is exactly that described in commit: f63a8daa5812 ("perf: Fix event->ctx locking") ... where, while we wait for a ctx->mutex acquisition, the event->ctx relation can have changed under us. That very same commit failed to recognise sys_perf_event_context() as an external access vector to the events and thereby didn't apply the established locking rules correctly. So while one sys_perf_event_open() call is stuck waiting on mutex_lock_double(), the other (which owns said locks) moves the group about. So by the time the former sys_perf_event_open() acquires the locks, the context we've acquired is stale (and possibly dead). Apply the established locking rules as per perf_event_ctx_lock_nested() to the mutex_lock_double() for the 'move_group' case. This obviously means we need to validate state after we acquire the locks. Reported-by: Di Shen (Keen Lab) Tested-by: John Dias Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kees Cook Cc: Linus Torvalds Cc: Min Chong Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: f63a8daa5812 ("perf: Fix event->ctx locking") Link: http://lkml.kernel.org/r/20170106131444.GZ3174@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar [bwh: Backported to 4.4: - Test perf_event::group_flags instead of group_caps - Adjust context] Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- kernel/events/core.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index e4b5494f05f8..784ab8fe8714 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8250,6 +8250,37 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) return 0; } +/* + * Variation on perf_event_ctx_lock_nested(), except we take two context + * mutexes. + */ +static struct perf_event_context * +__perf_event_ctx_lock_double(struct perf_event *group_leader, + struct perf_event_context *ctx) +{ + struct perf_event_context *gctx; + +again: + rcu_read_lock(); + gctx = READ_ONCE(group_leader->ctx); + if (!atomic_inc_not_zero(&gctx->refcount)) { + rcu_read_unlock(); + goto again; + } + rcu_read_unlock(); + + mutex_lock_double(&gctx->mutex, &ctx->mutex); + + if (group_leader->ctx != gctx) { + mutex_unlock(&ctx->mutex); + mutex_unlock(&gctx->mutex); + put_ctx(gctx); + goto again; + } + + return gctx; +} + /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * @@ -8486,8 +8517,26 @@ SYSCALL_DEFINE5(perf_event_open, } if (move_group) { - gctx = group_leader->ctx; - mutex_lock_double(&gctx->mutex, &ctx->mutex); + gctx = __perf_event_ctx_lock_double(group_leader, ctx); + + /* + * Check if we raced against another sys_perf_event_open() call + * moving the software group underneath us. + */ + if (!(group_leader->group_flags & PERF_GROUP_SOFTWARE)) { + /* + * If someone moved the group out from under us, check + * if this new event wound up on the same ctx, if so + * its the regular !move_group case, otherwise fail. 
+ */ + if (gctx != ctx) { + err = -EINVAL; + goto err_locked; + } else { + perf_event_ctx_unlock(group_leader, gctx); + move_group = 0; + } + } } else { mutex_lock(&ctx->mutex); } @@ -8582,7 +8631,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_unpin_context(ctx); if (move_group) - mutex_unlock(&gctx->mutex); + perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); if (task) { @@ -8610,7 +8659,7 @@ SYSCALL_DEFINE5(perf_event_open, err_locked: if (move_group) - mutex_unlock(&gctx->mutex); + perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); /* err_file: */ fput(event_file); -- cgit v1.2.3 From 418b99042b87b2b08a5d4f7f19e775f10211d431 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 30 Apr 2017 05:50:11 +0200 Subject: Linux 4.4.65 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 17708f5dc169..ddaef04f528a 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 64 +SUBLEVEL = 65 EXTRAVERSION = NAME = Blurry Fish Butt -- cgit v1.2.3