From 182d919b84902eece162c63ed3d476c8016b4197 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Feb 2015 23:47:31 +0000 Subject: FS-Cache: Count culled objects and objects rejected due to lack of space Count the number of objects that get culled by the cache backend and the number of objects that the cache backend declines to instantiate due to lack of space in the cache. These numbers are made available through /proc/fs/fscache/stats Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- include/linux/fscache-cache.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 771484993ca7..c9dafdaf3347 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -371,6 +371,7 @@ struct fscache_object { #define FSCACHE_OBJECT_IS_LOOKED_UP 4 /* T if object has been looked up */ #define FSCACHE_OBJECT_IS_AVAILABLE 5 /* T if object has become active */ #define FSCACHE_OBJECT_RETIRED 6 /* T if object was retired on relinquishment */ +#define FSCACHE_OBJECT_KILLED_BY_CACHE 7 /* T if object was killed by the cache */ struct list_head cache_link; /* link in cache->object_list */ struct hlist_node cookie_link; /* link in cookie->backing_objects */ @@ -551,4 +552,15 @@ extern enum fscache_checkaux fscache_check_aux(struct fscache_object *object, const void *data, uint16_t datalen); +extern void fscache_object_retrying_stale(struct fscache_object *object); + +enum fscache_why_object_killed { + FSCACHE_OBJECT_IS_STALE, + FSCACHE_OBJECT_NO_SPACE, + FSCACHE_OBJECT_WAS_RETIRED, + FSCACHE_OBJECT_WAS_CULLED, +}; +extern void fscache_object_mark_killed(struct fscache_object *object, + enum fscache_why_object_killed why); + #endif /* _LINUX_FSCACHE_CACHE_H */ -- cgit v1.2.3 From 30ceec6284129662efc3a1e7675b2bd857a046fe Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Feb 2015 10:05:27 +0000 Subject: FS-Cache: When submitting an op, cancel it if the target object is dying When submitting an operation, prefer to cancel the operation immediately rather than queuing it for later processing if the object is marked as dying (ie. the object state machine has reached the KILL_OBJECT state). Whilst we're at it, change the series of related test_bit() calls into a READ_ONCE() and bitwise-AND operators to reduce the number of load instructions (test_bit() has a volatile address). Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- include/linux/fscache-cache.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index c9dafdaf3347..2e83a141e465 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -411,17 +411,22 @@ static inline bool fscache_object_is_available(struct fscache_object *object) return test_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags); } +static inline bool fscache_cache_is_broken(struct fscache_object *object) +{ + return test_bit(FSCACHE_IOERROR, &object->cache->flags); +} + static inline bool fscache_object_is_active(struct fscache_object *object) { return fscache_object_is_available(object) && fscache_object_is_live(object) && - !test_bit(FSCACHE_IOERROR, &object->cache->flags); + !fscache_cache_is_broken(object); } static inline bool fscache_object_is_dead(struct fscache_object *object) { return fscache_object_is_dying(object) && - test_bit(FSCACHE_IOERROR, &object->cache->flags); + fscache_cache_is_broken(object); } /** -- cgit v1.2.3 From 87021526300f1a292dd966e141e183630ac95317 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Feb 2015 10:52:51 +0000 Subject: FS-Cache: fscache_object_is_dead() has wrong logic, kill it fscache_object_is_dead() returns true only if the object is marked dead and the cache got an I/O error. This should be a logical OR instead. Since two of the callers got split up into handling for separate subcases, expand the other callers and kill the function. This is probably the right thing to do anyway since one of the subcases isn't about the object at all, but rather about the cache. Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- include/linux/fscache-cache.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 2e83a141e465..ca3d550da11e 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -423,12 +423,6 @@ static inline bool fscache_object_is_active(struct fscache_object *object) !fscache_cache_is_broken(object); } -static inline bool fscache_object_is_dead(struct fscache_object *object) -{ - return fscache_object_is_dying(object) && - fscache_cache_is_broken(object); -} - /** * fscache_object_destroyed - Note destruction of an object in a cache * @cache: The cache from which the object came -- cgit v1.2.3 From 1339ec98e32b4bc8efb6fbb71c006a465130aaba Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 25 Feb 2015 13:26:39 +0000 Subject: FS-Cache: Out of line fscache_operation_init() Out of line fscache_operation_init() so that it can access internal FS-Cache features, such as stats, in a later commit. Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- include/linux/fscache-cache.h | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index ca3d550da11e..0e26d49972e3 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -119,27 +119,9 @@ extern void fscache_op_work_func(struct work_struct *work); extern void fscache_enqueue_operation(struct fscache_operation *); extern void fscache_op_complete(struct fscache_operation *, bool); extern void fscache_put_operation(struct fscache_operation *); - -/** - * fscache_operation_init - Do basic initialisation of an operation - * @op: The operation to initialise - * @release: The release function to assign - * - * Do basic initialisation of an operation. The caller must still set flags, - * object and processor if needed. - */ -static inline void fscache_operation_init(struct fscache_operation *op, - fscache_operation_processor_t processor, - fscache_operation_release_t release) -{ - INIT_WORK(&op->work, fscache_op_work_func); - atomic_set(&op->usage, 1); - op->state = FSCACHE_OP_ST_INITIALISED; - op->debug_id = atomic_inc_return(&fscache_op_debug_id); - op->processor = processor; - op->release = release; - INIT_LIST_HEAD(&op->pend_link); -} +extern void fscache_operation_init(struct fscache_operation *, + fscache_operation_processor_t, + fscache_operation_release_t); /* * data read operation -- cgit v1.2.3 From d3b97ca4a99e4e6c78f5a21c968eadf5c8ba9971 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Feb 2015 10:05:29 +0000 Subject: FS-Cache: The operation cancellation method needs calling in more places Any time an incomplete operation is cancelled, the operation cancellation function needs to be called to clean up. This is currently being passed directly to some of the functions that might want to call it, but not all. Instead, pass the cancellation method pointer to the fscache_operation_init() and have that cache it in the operation struct. Further, plug in a dummy cancellation handler if the caller declines to set one as this allows us to call the function unconditionally (the extra overhead isn't worth bothering about as we don't expect to be calling this typically). The cancellation method must thence be called everywhere the CANCELLED state is set. Note that we call it *before* setting the CANCELLED state such that the method can use the old state value to guide its operation. fscache_do_cancel_retrieval() needs moving higher up in the sources so that the init function can use it now. Without this, the following oops may be seen: FS-Cache: Assertion failed FS-Cache: 3 == 0 is false ------------[ cut here ]------------ kernel BUG at ../fs/fscache/page.c:261! ... RIP: 0010:[] fscache_release_retrieval_op+0x77/0x100 [] fscache_put_operation+0x114/0x2da [] __fscache_read_or_alloc_pages+0x358/0x3b3 [] __nfs_readpages_from_fscache+0x59/0xbf [nfs] [] nfs_readpages+0x10c/0x185 [nfs] [] ? alloc_pages_current+0x119/0x13e [] ? __page_cache_alloc+0xfb/0x10a [] __do_page_cache_readahead+0x188/0x22c [] ondemand_readahead+0x29e/0x2af [] page_cache_sync_readahead+0x38/0x3a [] generic_file_read_iter+0x1a2/0x55a [] ? nfs_revalidate_mapping+0xd6/0x288 [nfs] [] nfs_file_read+0x49/0x70 [nfs] [] new_sync_read+0x78/0x9c [] __vfs_read+0x13/0x38 [] vfs_read+0x95/0x121 [] SyS_read+0x4c/0x8a [] system_call_fastpath+0x12/0x17 The assertion is showing that the remaining number of pages (n_pages) is not 0 when the operation is being released. Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- include/linux/fscache-cache.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 0e26d49972e3..69c6eb10d858 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -74,6 +74,7 @@ extern wait_queue_head_t fscache_cache_cleared_wq; */ typedef void (*fscache_operation_release_t)(struct fscache_operation *op); typedef void (*fscache_operation_processor_t)(struct fscache_operation *op); +typedef void (*fscache_operation_cancel_t)(struct fscache_operation *op); enum fscache_operation_state { FSCACHE_OP_ST_BLANK, /* Op is not yet submitted */ @@ -109,6 +110,9 @@ struct fscache_operation { * the op in a non-pool thread */ fscache_operation_processor_t processor; + /* Operation cancellation cleanup (optional) */ + fscache_operation_cancel_t cancel; + /* operation releaser */ fscache_operation_release_t release; }; @@ -121,6 +125,7 @@ extern void fscache_op_complete(struct fscache_operation *, bool); extern void fscache_put_operation(struct fscache_operation *); extern void fscache_operation_init(struct fscache_operation *, fscache_operation_processor_t, + fscache_operation_cancel_t, fscache_operation_release_t); /* -- cgit v1.2.3 From 4a47132ff472a0c2c5441baeb50cf97f2580bc43 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Feb 2015 10:05:29 +0000 Subject: FS-Cache: Retain the netfs context in the retrieval op earlier Now that the retrieval operation may be disposed of by fscache_put_operation() before we actually set the context, the retrieval-specific cleanup operation can produce a NULL-pointer dereference when it tries to unconditionally clean up the netfs context. Given that it is expected that we'll get at least as far as the place where we currently set the context pointer and it is unlikely we'll go through the error handling paths prior to that point, retain the context right from the point that the retrieval op is allocated. Concomitant to this, we need to retain the cookie pointer in the retrieval op also so that we can call the netfs to release its context in the release method. In addition, we might now get into fscache_release_retrieval_op() with the op only initialised. To this end, set the operation to DEAD only after the release method has been called and skip the n_pages test upon cleanup if the op is still in the INITIALISED state. Without these changes, the following oops might be seen: BUG: unable to handle kernel NULL pointer dereference at 00000000000000b8 ... RIP: 0010:[] fscache_release_retrieval_op+0xae/0x100 ... Call Trace: [] fscache_put_operation+0x117/0x2e0 [] __fscache_read_or_alloc_pages+0x351/0x3ac [] __nfs_readpages_from_fscache+0x59/0xbf [nfs] [] nfs_readpages+0x10c/0x185 [nfs] [] ? alloc_pages_current+0x119/0x13e [] ? __page_cache_alloc+0xfb/0x10a [] __do_page_cache_readahead+0x188/0x22c [] ondemand_readahead+0x29e/0x2af [] page_cache_sync_readahead+0x38/0x3a [] generic_file_read_iter+0x1a2/0x55a [] ? nfs_revalidate_mapping+0xd6/0x288 [nfs] [] nfs_file_read+0x49/0x70 [nfs] [] new_sync_read+0x78/0x9c [] __vfs_read+0x13/0x38 [] vfs_read+0x95/0x121 [] SyS_read+0x4c/0x8a [] system_call_fastpath+0x12/0x17 Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- include/linux/fscache-cache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 69c6eb10d858..604e1526cd00 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -133,6 +133,7 @@ extern void fscache_operation_init(struct fscache_operation *, */ struct fscache_retrieval { struct fscache_operation op; + struct fscache_cookie *cookie; /* The netfs cookie */ struct address_space *mapping; /* netfs pages */ fscache_rw_complete_t end_io_func; /* function to call on I/O completion */ void *context; /* netfs read context (pinned) */ -- cgit v1.2.3 From c8a8585431efba0faaf41167f8f7c27c48307ca6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vianney=20le=20Cl=C3=A9ment=20de=20Saint-Marcq?= Date: Mon, 30 Mar 2015 10:34:58 +0200 Subject: iio: core: Introduce IIO_CHAN_INFO_CALIBEMISSIVITY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Contact-less IR temperature sensors measure the temperature of an object by using its thermal radiation. Surfaces with different emissivity ratios emit different amounts of energy at the same temperature. IIO_CHAN_INFO_CALIBEMISSIVITY allows the user to inform the sensor of the emissivity of the object in front of it, in order to effectively measure its temperature. A device providing such setting is Melexis's MLX90614: http://melexis.com/Assets/IR-sensor-thermometer-MLX90614-Datasheet-5152.aspx. Signed-off-by: Vianney le Clément de Saint-Marcq Cc: Arnout Vandecappelle (Essensium/Mind) Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index d86b753e9b30..b1e46ae89aa7 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -43,6 +43,7 @@ enum iio_chan_info_enum { IIO_CHAN_INFO_CALIBWEIGHT, IIO_CHAN_INFO_DEBOUNCE_COUNT, IIO_CHAN_INFO_DEBOUNCE_TIME, + IIO_CHAN_INFO_CALIBEMISSIVITY, }; enum iio_shared_by { -- cgit v1.2.3 From 591fc116f3302da915bb57d4474a61a5e8884cec Mon Sep 17 00:00:00 2001 From: "Ivan T. Ivanov" Date: Thu, 9 Apr 2015 11:34:22 +0300 Subject: usb: phy: msm: Use extcon framework for VBUS and ID detection On recent Qualcomm platforms VBUS and ID lines are not routed to USB PHY LINK controller. Use extcon framework to receive connect and disconnect ID and VBUS notification. Signed-off-by: Ivan T. Ivanov Signed-off-by: Felipe Balbi --- include/linux/usb/msm_hsusb.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/msm_hsusb.h b/include/linux/usb/msm_hsusb.h index 7dbecf9a4656..c4d956e50d09 100644 --- a/include/linux/usb/msm_hsusb.h +++ b/include/linux/usb/msm_hsusb.h @@ -18,6 +18,7 @@ #ifndef __ASM_ARCH_MSM_HSUSB_H #define __ASM_ARCH_MSM_HSUSB_H +#include #include #include #include @@ -119,6 +120,17 @@ struct msm_otg_platform_data { void (*setup_gpio)(enum usb_otg_state state); }; +/** + * struct msm_usb_cable - structure for exteternal connector cable + * state tracking + * @nb: hold event notification callback + * @conn: used for notification registration + */ +struct msm_usb_cable { + struct notifier_block nb; + struct extcon_specific_cable_nb conn; +}; + /** * struct msm_otg: OTG driver data. Shared by HCD and DCD. * @otg: USB OTG Transceiver structure. @@ -138,6 +150,8 @@ struct msm_otg_platform_data { * @chg_type: The type of charger attached. * @dcd_retires: The retry count used to track Data contact * detection process. + * @vbus: VBUS signal state trakining, using extcon framework + * @id: ID signal state trakining, using extcon framework */ struct msm_otg { struct usb_phy phy; @@ -166,6 +180,9 @@ struct msm_otg { struct reset_control *phy_rst; struct reset_control *link_rst; int vdd_levels[3]; + + struct msm_usb_cable vbus; + struct msm_usb_cable id; }; #endif -- cgit v1.2.3 From 44e42ae3a398b559c768b9b3c324d72b0b0b4479 Mon Sep 17 00:00:00 2001 From: "Ivan T. Ivanov" Date: Thu, 9 Apr 2015 11:34:33 +0300 Subject: usb: phy: msm: Manual PHY and LINK controller VBUS change notification VBUS is not routed to USB PHY on recent Qualcomm platforms. USB controller must see VBUS in order to pull-up DP when setting RS bit. Henc configure USB PHY and LINK registers sense VBUS and enable manual pullup on D+ line. Cc: Vamsi Krishna Cc: Mayank Rana Signed-off-by: Ivan T. Ivanov Signed-off-by: Felipe Balbi --- include/linux/usb/msm_hsusb.h | 5 +++++ include/linux/usb/msm_hsusb_hw.h | 9 +++++++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/msm_hsusb.h b/include/linux/usb/msm_hsusb.h index c4d956e50d09..e55a1504266e 100644 --- a/include/linux/usb/msm_hsusb.h +++ b/include/linux/usb/msm_hsusb.h @@ -150,6 +150,9 @@ struct msm_usb_cable { * @chg_type: The type of charger attached. * @dcd_retires: The retry count used to track Data contact * detection process. + * @manual_pullup: true if VBUS is not routed to USB controller/phy + * and controller driver therefore enables pull-up explicitly before + * starting controller using usbcmd run/stop bit. * @vbus: VBUS signal state trakining, using extcon framework * @id: ID signal state trakining, using extcon framework */ @@ -181,6 +184,8 @@ struct msm_otg { struct reset_control *link_rst; int vdd_levels[3]; + bool manual_pullup; + struct msm_usb_cable vbus; struct msm_usb_cable id; }; diff --git a/include/linux/usb/msm_hsusb_hw.h b/include/linux/usb/msm_hsusb_hw.h index a29f6030afb1..e159b39f67a2 100644 --- a/include/linux/usb/msm_hsusb_hw.h +++ b/include/linux/usb/msm_hsusb_hw.h @@ -21,6 +21,8 @@ #define USB_AHBBURST (MSM_USB_BASE + 0x0090) #define USB_AHBMODE (MSM_USB_BASE + 0x0098) +#define USB_GENCONFIG_2 (MSM_USB_BASE + 0x00a0) + #define USB_CAPLENGTH (MSM_USB_BASE + 0x0100) /* 8 bit */ #define USB_USBCMD (MSM_USB_BASE + 0x0140) @@ -30,6 +32,9 @@ #define USB_PHY_CTRL (MSM_USB_BASE + 0x0240) #define USB_PHY_CTRL2 (MSM_USB_BASE + 0x0278) +#define GENCONFIG_2_SESS_VLD_CTRL_EN BIT(7) +#define USBCMD_SESS_VLD_CTRL BIT(25) + #define USBCMD_RESET 2 #define USB_USBINTR (MSM_USB_BASE + 0x0148) @@ -50,6 +55,10 @@ #define ULPI_PWR_CLK_MNG_REG 0x88 #define OTG_COMP_DISABLE BIT(0) +#define ULPI_MISC_A 0x96 +#define ULPI_MISC_A_VBUSVLDEXTSEL BIT(1) +#define ULPI_MISC_A_VBUSVLDEXT BIT(0) + #define ASYNC_INTR_CTRL (1 << 29) /* Enable async interrupt */ #define ULPI_STP_CTRL (1 << 30) /* Block communication with PHY */ #define PHY_RETEN (1 << 1) /* PHY retention enable/disable */ -- cgit v1.2.3 From b189a2117223edbe40e0a187ae5c606cbdd6447c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 28 Apr 2015 14:04:07 +0200 Subject: usb: phy: Remove the phy-rcar-gen2-usb driver The phy-rcar-gen2-usb driver, which supports legacy platform data only, is no longer used since commit a483dcbfa21f919c ("ARM: shmobile: lager: Remove legacy board support"). This driver was superseded by the DT-only phy-rcar-gen2 driver, which was introduced in commit 1233f59f745b237d ("phy: Renesas R-Car Gen2 PHY driver"). Signed-off-by: Geert Uytterhoeven Signed-off-by: Felipe Balbi --- include/linux/platform_data/usb-rcar-gen2-phy.h | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 include/linux/platform_data/usb-rcar-gen2-phy.h (limited to 'include/linux') diff --git a/include/linux/platform_data/usb-rcar-gen2-phy.h b/include/linux/platform_data/usb-rcar-gen2-phy.h deleted file mode 100644 index dd3ba46c0d90..000000000000 --- a/include/linux/platform_data/usb-rcar-gen2-phy.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (C) 2013 Renesas Solutions Corp. - * Copyright (C) 2013 Cogent Embedded, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef __USB_RCAR_GEN2_PHY_H -#define __USB_RCAR_GEN2_PHY_H - -#include - -struct rcar_gen2_phy_platform_data { - /* USB channel 0 configuration */ - bool chan0_pci:1; /* true: PCI USB host 0, false: USBHS */ - /* USB channel 2 configuration */ - bool chan2_pci:1; /* true: PCI USB host 2, false: USBSS */ -}; - -#endif -- cgit v1.2.3 From f1ec7187167ce225d2744b20a90afef5f10fd6cd Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 29 Apr 2015 16:02:30 -0500 Subject: libfdt: add fdt type definitions In preparation for libfdt/dtc update, add the new fdt specific types. Signed-off-by: Rob Herring Cc: Russell King Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: linux-arm-kernel@lists.infradead.org Cc: linuxppc-dev@lists.ozlabs.org --- include/linux/libfdt_env.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libfdt_env.h b/include/linux/libfdt_env.h index 01508c7b8c81..2a663c6bb428 100644 --- a/include/linux/libfdt_env.h +++ b/include/linux/libfdt_env.h @@ -5,6 +5,10 @@ #include +typedef __be16 fdt16_t; +typedef __be32 fdt32_t; +typedef __be64 fdt64_t; + #define fdt32_to_cpu(x) be32_to_cpu(x) #define cpu_to_fdt32(x) cpu_to_be32(x) #define fdt64_to_cpu(x) be64_to_cpu(x) -- cgit v1.2.3 From 042f7df15a4fff8eec42873f755aea848dcdedd1 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 30 Apr 2015 17:16:12 +0800 Subject: workqueue: Allow modifying low level unbound workqueue cpumask Allow to modify the low-level unbound workqueues cpumask through sysfs. This is performed by traversing the entire workqueue list and calling apply_wqattrs_prepare() on the unbound workqueues with the new low level mask. Only after all the preparation are done, we commit them all together. Ordered workqueues are ignored from the low level unbound workqueue cpumask, it will be handled in near future. All the (default & per-node) pwqs are mandatorily controlled by the low level cpumask. If the user configured cpumask doesn't overlap with the low level cpumask, the low level cpumask will be used for the wq instead. The comment of wq_calc_node_cpumask() is updated and explicitly requires that its first argument should be the attrs of the default pwq. The default wq_unbound_cpumask is cpu_possible_mask. The workqueue subsystem doesn't know its best default value, let the system manager or the other subsystem set it when needed. Changed from V8: merge the calculating code for the attrs of the default pwq together. minor change the code&comments for saving the user configured attrs. remove unnecessary list_del(). minor update the comment of wq_calc_node_cpumask(). update the comment of workqueue_set_unbound_cpumask(); Cc: Christoph Lameter Cc: Kevin Hilman Cc: Lai Jiangshan Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Tejun Heo Cc: Viresh Kumar Cc: Frederic Weisbecker Original-patch-by: Frederic Weisbecker Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index deee212af8e0..4618dd672d1b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -424,6 +424,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask); void free_workqueue_attrs(struct workqueue_attrs *attrs); int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs); +int workqueue_set_unbound_cpumask(cpumask_var_t cpumask); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); -- cgit v1.2.3 From fb7737e949e31d8a71acee6bbb670f32dbd2a2c0 Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Wed, 4 Mar 2015 20:01:14 -0600 Subject: hwspinlock/core: add device tree support This patch adds a new OF-friendly API of_hwspin_lock_get_id() for hwspinlock clients to use/request locks from a hwspinlock device instantiated through a device-tree blob. This new API can be used by hwspinlock clients to get the id for a specific lock using the phandle + args specifier, so that it can be requested using the available hwspin_lock_request_specific() API. Signed-off-by: Suman Anna Reviewed-by: Bjorn Andersson [small comment clarification] Signed-off-by: Ohad Ben-Cohen --- include/linux/hwspinlock.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hwspinlock.h b/include/linux/hwspinlock.h index 3343298e40e8..859d673d98c8 100644 --- a/include/linux/hwspinlock.h +++ b/include/linux/hwspinlock.h @@ -26,6 +26,7 @@ #define HWLOCK_IRQ 0x02 /* Disable interrupts, don't save state */ struct device; +struct device_node; struct hwspinlock; struct hwspinlock_device; struct hwspinlock_ops; @@ -66,6 +67,7 @@ int hwspin_lock_unregister(struct hwspinlock_device *bank); struct hwspinlock *hwspin_lock_request(void); struct hwspinlock *hwspin_lock_request_specific(unsigned int id); int hwspin_lock_free(struct hwspinlock *hwlock); +int of_hwspin_lock_get_id(struct device_node *np, int index); int hwspin_lock_get_id(struct hwspinlock *hwlock); int __hwspin_lock_timeout(struct hwspinlock *, unsigned int, int, unsigned long *); @@ -120,6 +122,11 @@ void __hwspin_unlock(struct hwspinlock *hwlock, int mode, unsigned long *flags) { } +static inline int of_hwspin_lock_get_id(struct device_node *np, int index) +{ + return 0; +} + static inline int hwspin_lock_get_id(struct hwspinlock *hwlock) { return 0; -- cgit v1.2.3 From 20f56758b04364c3c139edbffde8cfaf0307edee Mon Sep 17 00:00:00 2001 From: Jacek Anaszewski Date: Tue, 28 Apr 2015 00:18:41 -0700 Subject: leds: unify the location of led-trigger API Part of led-trigger API was in the private drivers/leds/leds.h header. Move it to the include/linux/leds.h header to unify the API location and announce it as public. It has been already exported from led-triggers.c with EXPORT_SYMBOL_GPL macro. The no-op definitions are changed from macros to inline to match the style of the surrounding code. Signed-off-by: Jacek Anaszewski Cc: Richard Purdie Acked-by: Sakari Ailus Signed-off-by: Bryan Wu --- include/linux/leds.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index 9a2b000094cf..b122eeafb5dc 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -12,6 +12,7 @@ #ifndef __LINUX_LEDS_H_INCLUDED #define __LINUX_LEDS_H_INCLUDED +#include #include #include #include @@ -222,6 +223,11 @@ struct led_trigger { struct list_head next_trig; }; +ssize_t led_trigger_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count); +ssize_t led_trigger_show(struct device *dev, struct device_attribute *attr, + char *buf); + /* Registration functions for complex triggers */ extern int led_trigger_register(struct led_trigger *trigger); extern void led_trigger_unregister(struct led_trigger *trigger); @@ -238,6 +244,16 @@ extern void led_trigger_blink_oneshot(struct led_trigger *trigger, unsigned long *delay_on, unsigned long *delay_off, int invert); +extern void led_trigger_set_default(struct led_classdev *led_cdev); +extern void led_trigger_set(struct led_classdev *led_cdev, + struct led_trigger *trigger); +extern void led_trigger_remove(struct led_classdev *led_cdev); + +static inline void *led_get_trigger_data(struct led_classdev *led_cdev) +{ + return led_cdev->trigger_data; +} + /** * led_trigger_rename_static - rename a trigger * @name: the new trigger name @@ -267,6 +283,15 @@ static inline void led_trigger_register_simple(const char *name, static inline void led_trigger_unregister_simple(struct led_trigger *trigger) {} static inline void led_trigger_event(struct led_trigger *trigger, enum led_brightness event) {} +static inline void led_trigger_set_default(struct led_classdev *led_cdev) {} +static inline void led_trigger_set(struct led_classdev *led_cdev, + struct led_trigger *trigger) {} +static inline void led_trigger_remove(struct led_classdev *led_cdev) {} +static inline void *led_get_trigger_data(struct led_classdev *led_cdev) +{ + return NULL; +} + #endif /* CONFIG_LEDS_TRIGGERS */ /* Trigger specific functions */ -- cgit v1.2.3 From 6cd9e9f629f11b9412d4e9aa294c029dbb36b3cf Mon Sep 17 00:00:00 2001 From: Kapileshwar Singh Date: Wed, 18 Feb 2015 16:04:21 +0000 Subject: thermal: of: fix cooling device weights in device tree Currently you can specify the weight of the cooling device in the device tree but that information is not populated to the thermal_bind_params where the fair share governor expects it to be. The of thermal zone device doesn't have a thermal_bind_params structure and arguably it's better to pass the weight inside the thermal_instance as it is specific to the bind of a cooling device to a thermal zone parameter. Core thermal code is fixed to populate the weight in the instance from the thermal_bind_params, so platform code that was passing the weight inside the thermal_bind_params continue to work seamlessly. While we are at it, create a default value for the weight parameter for those thermal zones that currently don't define it and remove the hardcoded default in of-thermal. Cc: Zhang Rui Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Peter Feuerer Cc: Darren Hart Cc: Eduardo Valentin Cc: Kukjin Kim Cc: Durgadoss R Signed-off-by: Kapileshwar Singh Signed-off-by: Eduardo Valentin --- include/linux/thermal.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 5eac316490ea..00dacd4dfdce 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -40,6 +40,9 @@ /* No upper/lower limit requirement */ #define THERMAL_NO_LIMIT ((u32)~0) +/* Default weight of a bound cooling device */ +#define THERMAL_WEIGHT_DEFAULT 0 + /* Unit conversion macros */ #define KELVIN_TO_CELSIUS(t) (long)(((long)t-2732 >= 0) ? \ ((long)t-2732+5)/10 : ((long)t-2732-5)/10) @@ -323,7 +326,8 @@ void thermal_zone_device_unregister(struct thermal_zone_device *); int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int, struct thermal_cooling_device *, - unsigned long, unsigned long); + unsigned long, unsigned long, + unsigned int); int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int, struct thermal_cooling_device *); void thermal_zone_device_update(struct thermal_zone_device *); -- cgit v1.2.3 From bcdcbbc71125c37195f97314f453ca9a3a4eb758 Mon Sep 17 00:00:00 2001 From: Javi Merino Date: Wed, 18 Feb 2015 16:04:25 +0000 Subject: thermal: fair_share: generalize the weight concept The fair share governor has the concept of weights, which is the influence of each cooling device in a thermal zone. The current implementation forces the weights of all cooling devices in a thermal zone to add up to a 100. This complicates setups, as you need to know in advance how many cooling devices you are going to have. If you bind a new cooling device, you have to modify all the other cooling devices weights, which is error prone. Furthermore, you can't specify a "default" weight for platforms since that default value depends on the number of cooling devices in the platform. This patch generalizes the concept of weight by allowing any number to be a "weight". Weights are now relative to each other. Platforms that don't specify weights get the same default value for all their cooling devices, so all their cdevs are considered to be equally influential. It's important to note that previous users of the weights don't need to alter the code: percentages continue to work as they used to. This patch just removes the constraint of all the weights in a thermal zone having to add up to a 100. If they do, you get the same behavior as before. If they don't, fair share now works for that platform. Cc: Zhang Rui Cc: Eduardo Valentin Cc: Durgadoss R Acked-by: Durgadoss R Signed-off-by: Javi Merino Signed-off-by: Eduardo Valentin --- include/linux/thermal.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 00dacd4dfdce..bac0f52c7a1e 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -217,9 +217,12 @@ struct thermal_bind_params { /* * This is a measure of 'how effectively these devices can - * cool 'this' thermal zone. The shall be determined by platform - * characterization. This is on a 'percentage' scale. - * See Documentation/thermal/sysfs-api.txt for more information. + * cool 'this' thermal zone. It shall be determined by + * platform characterization. This value is relative to the + * rest of the weights so a cooling device whose weight is + * double that of another cooling device is twice as + * effective. See Documentation/thermal/sysfs-api.txt for more + * information. */ int weight; -- cgit v1.2.3 From e33df1d2f3a0141cd79e770f31999ba0dd7ebfa8 Mon Sep 17 00:00:00 2001 From: Javi Merino Date: Thu, 26 Feb 2015 19:00:27 +0000 Subject: thermal: let governors have private data for each thermal zone A governor may need to store its current state between calls to throttle(). That state depends on the thermal zone, so store it as private data in struct thermal_zone_device. The governors may have two new ops: bind_to_tz() and unbind_from_tz(). When provided, these functions let governors do some initialization and teardown when they are bound/unbound to a tz and possibly store that information in the governor_data field of the struct thermal_zone_device. Cc: Zhang Rui Cc: Eduardo Valentin Signed-off-by: Javi Merino Signed-off-by: Eduardo Valentin --- include/linux/thermal.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index bac0f52c7a1e..edf9d53c67e6 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -165,6 +165,7 @@ struct thermal_attr { * @ops: operations this &thermal_zone_device supports * @tzp: thermal zone parameters * @governor: pointer to the governor for this thermal zone + * @governor_data: private pointer for governor data * @thermal_instances: list of &struct thermal_instance of this thermal zone * @idr: &struct idr to generate unique id for this zone's cooling * devices @@ -191,6 +192,7 @@ struct thermal_zone_device { struct thermal_zone_device_ops *ops; const struct thermal_zone_params *tzp; struct thermal_governor *governor; + void *governor_data; struct list_head thermal_instances; struct idr idr; struct mutex lock; @@ -201,12 +203,19 @@ struct thermal_zone_device { /** * struct thermal_governor - structure that holds thermal governor information * @name: name of the governor + * @bind_to_tz: callback called when binding to a thermal zone. If it + * returns 0, the governor is bound to the thermal zone, + * otherwise it fails. + * @unbind_from_tz: callback called when a governor is unbound from a + * thermal zone. * @throttle: callback called for every trip point even if temperature is * below the trip point temperature * @governor_list: node in thermal_governor_list (in thermal_core.c) */ struct thermal_governor { char name[THERMAL_NAME_LENGTH]; + int (*bind_to_tz)(struct thermal_zone_device *tz); + void (*unbind_from_tz)(struct thermal_zone_device *tz); int (*throttle)(struct thermal_zone_device *tz, int trip); struct list_head governor_list; }; -- cgit v1.2.3 From 35b11d2e3a66279a477e36cefb2603806295b8ce Mon Sep 17 00:00:00 2001 From: Javi Merino Date: Thu, 26 Feb 2015 19:00:28 +0000 Subject: thermal: extend the cooling device API to include power information Add three optional callbacks to the cooling device interface to allow them to express power. In addition to the callbacks, add helpers to identify cooling devices that implement the power cooling device API. Cc: Zhang Rui Cc: Eduardo Valentin Signed-off-by: Javi Merino Signed-off-by: Eduardo Valentin --- include/linux/thermal.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index edf9d53c67e6..bf3c55f405c2 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -63,6 +63,7 @@ struct thermal_zone_device; struct thermal_cooling_device; +struct thermal_instance; enum thermal_device_mode { THERMAL_DEVICE_DISABLED = 0, @@ -116,6 +117,12 @@ struct thermal_cooling_device_ops { int (*get_max_state) (struct thermal_cooling_device *, unsigned long *); int (*get_cur_state) (struct thermal_cooling_device *, unsigned long *); int (*set_cur_state) (struct thermal_cooling_device *, unsigned long); + int (*get_requested_power)(struct thermal_cooling_device *, + struct thermal_zone_device *, u32 *); + int (*state2power)(struct thermal_cooling_device *, + struct thermal_zone_device *, unsigned long, u32 *); + int (*power2state)(struct thermal_cooling_device *, + struct thermal_zone_device *, u32, unsigned long *); }; struct thermal_cooling_device { @@ -331,6 +338,16 @@ void thermal_zone_of_sensor_unregister(struct device *dev, #endif #if IS_ENABLED(CONFIG_THERMAL) +static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev) +{ + return cdev->ops->get_requested_power && cdev->ops->state2power && + cdev->ops->power2state; +} + +int power_actor_get_max_power(struct thermal_cooling_device *, + struct thermal_zone_device *tz, u32 *max_power); +int power_actor_set_power(struct thermal_cooling_device *, + struct thermal_instance *, u32); struct thermal_zone_device *thermal_zone_device_register(const char *, int, int, void *, struct thermal_zone_device_ops *, const struct thermal_zone_params *, int, int); @@ -359,6 +376,14 @@ struct thermal_instance *get_thermal_instance(struct thermal_zone_device *, void thermal_cdev_update(struct thermal_cooling_device *); void thermal_notify_framework(struct thermal_zone_device *, int); #else +static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev) +{ return false; } +static inline int power_actor_get_max_power(struct thermal_cooling_device *cdev, + struct thermal_zone_device *tz, u32 *max_power) +{ return 0; } +static inline int power_actor_set_power(struct thermal_cooling_device *cdev, + struct thermal_instance *tz, u32 power) +{ return 0; } static inline struct thermal_zone_device *thermal_zone_device_register( const char *type, int trips, int mask, void *devdata, struct thermal_zone_device_ops *ops, -- cgit v1.2.3 From c36cf07176316fbe6a4bdbc23afcb0cbf7822bf2 Mon Sep 17 00:00:00 2001 From: Javi Merino Date: Thu, 26 Feb 2015 19:00:29 +0000 Subject: thermal: cpu_cooling: implement the power cooling device API Add a basic power model to the cpu cooling device to implement the power cooling device API. The power model uses the current frequency, current load and OPPs for the power calculations. The cpus must have registered their OPPs using the OPP library. Cc: Zhang Rui Cc: Eduardo Valentin Signed-off-by: Kapileshwar Singh Signed-off-by: Punit Agrawal Signed-off-by: Javi Merino Signed-off-by: Eduardo Valentin --- include/linux/cpu_cooling.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpu_cooling.h b/include/linux/cpu_cooling.h index bd955270d5aa..c156f5082758 100644 --- a/include/linux/cpu_cooling.h +++ b/include/linux/cpu_cooling.h @@ -28,6 +28,9 @@ #include #include +typedef int (*get_static_t)(cpumask_t *cpumask, int interval, + unsigned long voltage, u32 *power); + #ifdef CONFIG_CPU_THERMAL /** * cpufreq_cooling_register - function to create cpufreq cooling device. @@ -36,6 +39,10 @@ struct thermal_cooling_device * cpufreq_cooling_register(const struct cpumask *clip_cpus); +struct thermal_cooling_device * +cpufreq_power_cooling_register(const struct cpumask *clip_cpus, + u32 capacitance, get_static_t plat_static_func); + /** * of_cpufreq_cooling_register - create cpufreq cooling device based on DT. * @np: a valid struct device_node to the cooling device device tree node. @@ -45,6 +52,12 @@ cpufreq_cooling_register(const struct cpumask *clip_cpus); struct thermal_cooling_device * of_cpufreq_cooling_register(struct device_node *np, const struct cpumask *clip_cpus); + +struct thermal_cooling_device * +of_cpufreq_power_cooling_register(struct device_node *np, + const struct cpumask *clip_cpus, + u32 capacitance, + get_static_t plat_static_func); #else static inline struct thermal_cooling_device * of_cpufreq_cooling_register(struct device_node *np, @@ -52,6 +65,15 @@ of_cpufreq_cooling_register(struct device_node *np, { return ERR_PTR(-ENOSYS); } + +static inline struct thermal_cooling_device * +of_cpufreq_power_cooling_register(struct device_node *np, + const struct cpumask *clip_cpus, + u32 capacitance, + get_static_t plat_static_func) +{ + return NULL; +} #endif /** @@ -67,12 +89,29 @@ cpufreq_cooling_register(const struct cpumask *clip_cpus) { return ERR_PTR(-ENOSYS); } +static inline struct thermal_cooling_device * +cpufreq_power_cooling_register(const struct cpumask *clip_cpus, + u32 capacitance, get_static_t plat_static_func) +{ + return NULL; +} + static inline struct thermal_cooling_device * of_cpufreq_cooling_register(struct device_node *np, const struct cpumask *clip_cpus) { return ERR_PTR(-ENOSYS); } + +static inline struct thermal_cooling_device * +of_cpufreq_power_cooling_register(struct device_node *np, + const struct cpumask *clip_cpus, + u32 capacitance, + get_static_t plat_static_func) +{ + return NULL; +} + static inline void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev) { -- cgit v1.2.3 From 6b775e870c56c59c3e16531ea2307b797395f9f7 Mon Sep 17 00:00:00 2001 From: Javi Merino Date: Mon, 2 Mar 2015 17:17:19 +0000 Subject: thermal: introduce the Power Allocator governor The power allocator governor is a thermal governor that controls system and device power allocation to control temperature. Conceptually, the implementation divides the sustainable power of a thermal zone among all the heat sources in that zone. This governor relies on "power actors", entities that represent heat sources. They can report current and maximum power consumption and can set a given maximum power consumption, usually via a cooling device. The governor uses a Proportional Integral Derivative (PID) controller driven by the temperature of the thermal zone. The output of the controller is a power budget that is then allocated to each power actor that can have bearing on the temperature we are trying to control. It decides how much power to give each cooling device based on the performance they are requesting. The PID controller ensures that the total power budget does not exceed the control temperature. Cc: Zhang Rui Cc: Eduardo Valentin Signed-off-by: Punit Agrawal Signed-off-by: Javi Merino Signed-off-by: Eduardo Valentin --- include/linux/thermal.h | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index bf3c55f405c2..6bbe11c97cea 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -59,6 +59,8 @@ #define DEFAULT_THERMAL_GOVERNOR "fair_share" #elif defined(CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE) #define DEFAULT_THERMAL_GOVERNOR "user_space" +#elif defined(CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR) +#define DEFAULT_THERMAL_GOVERNOR "power_allocator" #endif struct thermal_zone_device; @@ -154,8 +156,7 @@ struct thermal_attr { * @devdata: private pointer for device private data * @trips: number of trip points the thermal zone supports * @passive_delay: number of milliseconds to wait between polls when - * performing passive cooling. Currenty only used by the - * step-wise governor + * performing passive cooling. * @polling_delay: number of milliseconds to wait between polls when * checking whether trip points have been crossed (0 for * interrupt driven systems) @@ -165,7 +166,6 @@ struct thermal_attr { * @last_temperature: previous temperature read * @emul_temperature: emulated temperature when using CONFIG_THERMAL_EMULATION * @passive: 1 if you've crossed a passive trip point, 0 otherwise. - * Currenty only used by the step-wise governor. * @forced_passive: If > 0, temperature at which to switch on all ACPI * processor cooling devices. Currently only used by the * step-wise governor. @@ -197,7 +197,7 @@ struct thermal_zone_device { int passive; unsigned int forced_passive; struct thermal_zone_device_ops *ops; - const struct thermal_zone_params *tzp; + struct thermal_zone_params *tzp; struct thermal_governor *governor; void *governor_data; struct list_head thermal_instances; @@ -275,6 +275,33 @@ struct thermal_zone_params { int num_tbps; /* Number of tbp entries */ struct thermal_bind_params *tbp; + + /* + * Sustainable power (heat) that this thermal zone can dissipate in + * mW + */ + u32 sustainable_power; + + /* + * Proportional parameter of the PID controller when + * overshooting (i.e., when temperature is below the target) + */ + s32 k_po; + + /* + * Proportional parameter of the PID controller when + * undershooting + */ + s32 k_pu; + + /* Integral parameter of the PID controller */ + s32 k_i; + + /* Derivative parameter of the PID controller */ + s32 k_d; + + /* threshold below which the error is no longer accumulated */ + s32 integral_cutoff; }; struct thermal_genl_event { @@ -350,7 +377,7 @@ int power_actor_set_power(struct thermal_cooling_device *, struct thermal_instance *, u32); struct thermal_zone_device *thermal_zone_device_register(const char *, int, int, void *, struct thermal_zone_device_ops *, - const struct thermal_zone_params *, int, int); + struct thermal_zone_params *, int, int); void thermal_zone_device_unregister(struct thermal_zone_device *); int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int, -- cgit v1.2.3 From 406c057c4e00744453d5b0731eb23629ec14dcdf Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 4 May 2015 21:54:20 -0400 Subject: libata: READ LOG DMA EXT support can be in either page 119 or 120 Support for the READ/WRITE LOG DMA EXT commands can be signaled either in page 119 or page 120. We should check both pages. Signed-off-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Tejun Heo --- include/linux/ata.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index b666b773e111..fed36418dd1c 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -704,9 +704,19 @@ static inline bool ata_id_wcache_enabled(const u16 *id) static inline bool ata_id_has_read_log_dma_ext(const u16 *id) { + /* Word 86 must have bit 15 set */ if (!(id[ATA_ID_CFS_ENABLE_2] & (1 << 15))) return false; - return id[ATA_ID_COMMAND_SET_3] & (1 << 3); + + /* READ LOG DMA EXT support can be signaled either from word 119 + * or from word 120. The format is the same for both words: Bit + * 15 must be cleared, bit 14 set and bit 3 set. + */ + if ((id[ATA_ID_COMMAND_SET_3] & 0xC008) == 0x4008 || + (id[ATA_ID_COMMAND_SET_4] & 0xC008) == 0x4008) + return true; + + return false; } static inline bool ata_id_has_sense_reporting(const u16 *id) -- cgit v1.2.3 From 5d3abf8ff67f49271a42c0f7fa4f20f9e046bf0e Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 4 May 2015 21:54:21 -0400 Subject: libata: Fall back to unqueued READ LOG EXT if the DMA variant fails Some devices advertise support for the READ/WRITE LOG DMA EXT commands but fail when we try to issue them. This can lead to queued TRIM being unintentionally disabled since the relevant feature flag is located in a general purpose log page. Fall back to unqueued READ LOG EXT if the DMA variant fails while reading a log page. Signed-off-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Tejun Heo --- include/linux/libata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 8dad4a307bb8..c3ef58014b33 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -424,6 +424,7 @@ enum { ATA_HORKAGE_NOLPM = (1 << 20), /* don't use LPM */ ATA_HORKAGE_WD_BROKEN_LPM = (1 << 21), /* some WDs have broken LPM */ ATA_HORKAGE_ZERO_AFTER_TRIM = (1 << 22),/* guarantees zero after trim */ + ATA_HORKAGE_NO_NCQ_LOG = (1 << 23), /* don't use NCQ for log read */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit v1.2.3 From c4cf5261f8bffd9de132b50660a69148e7575bd6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Apr 2015 16:15:18 -0600 Subject: bio: skip atomic inc/dec of ->bi_remaining for non-chains Struct bio has an atomic ref count for chained bio's, and we use this to know when to end IO on the bio. However, most bio's are not chained, so we don't need to always introduce this atomic operation as part of ending IO. Add a helper to elevate the bi_remaining count, and flag the bio as now actually needing the decrement at end_io time. Rename the field to __bi_remaining to catch any current users of this doing the incrementing manually. For high IOPS workloads, this reduces the overhead of bio_endio() substantially. Tested-by: Robert Elliott Acked-by: Kent Overstreet Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/bio.h | 11 +++++++++++ include/linux/blk_types.h | 3 ++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index da3a127c9958..8bfe9eee6d1a 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -644,6 +644,17 @@ static inline struct bio *bio_list_get(struct bio_list *bl) return bio; } +/* + * Increment chain count for the bio. Make sure the CHAIN flag update + * is visible before the raised count. + */ +static inline void bio_inc_remaining(struct bio *bio) +{ + bio->bi_flags |= (1 << BIO_CHAIN); + smp_mb__before_atomic(); + atomic_inc(&bio->__bi_remaining); +} + /* * bio_set is used to allow other portions of the IO system to * allocate their own private memory pools for bio and iovec structures. diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a1b25e35ea5f..8b07e0603887 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -65,7 +65,7 @@ struct bio { unsigned int bi_seg_front_size; unsigned int bi_seg_back_size; - atomic_t bi_remaining; + atomic_t __bi_remaining; bio_end_io_t *bi_end_io; @@ -122,6 +122,7 @@ struct bio { #define BIO_NULL_MAPPED 8 /* contains invalid user pages */ #define BIO_QUIET 9 /* Make BIO Quiet */ #define BIO_SNAP_STABLE 10 /* bio data must be snapshotted during write */ +#define BIO_CHAIN 11 /* chained bio, ->bi_remaining in effect */ /* * Flags starting here get preserved by bio_reset() - this includes -- cgit v1.2.3 From dac56212e8127dbc0bff7be35c508bc280213309 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Apr 2015 16:23:59 -0600 Subject: bio: skip atomic inc/dec of ->bi_cnt for most use cases Struct bio has a reference count that controls when it can be freed. Most uses cases is allocating the bio, which then returns with a single reference to it, doing IO, and then dropping that single reference. We can remove this atomic_dec_and_test() in the completion path, if nobody else is holding a reference to the bio. If someone does call bio_get() on the bio, then we flag the bio as now having valid count and that we must properly honor the reference count when it's being put. Tested-by: Robert Elliott Signed-off-by: Jens Axboe --- include/linux/bio.h | 16 +++++++++++++++- include/linux/blk_types.h | 3 ++- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 8bfe9eee6d1a..7486ea103f6e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -290,7 +290,21 @@ static inline unsigned bio_segments(struct bio *bio) * returns. and then bio would be freed memory when if (bio->bi_flags ...) * runs */ -#define bio_get(bio) atomic_inc(&(bio)->bi_cnt) +static inline void bio_get(struct bio *bio) +{ + bio->bi_flags |= (1 << BIO_REFFED); + smp_mb__before_atomic(); + atomic_inc(&bio->__bi_cnt); +} + +static inline void bio_cnt_set(struct bio *bio, unsigned int count) +{ + if (count != 1) { + bio->bi_flags |= (1 << BIO_REFFED); + smp_mb__before_atomic(); + } + atomic_set(&bio->__bi_cnt, count); +} enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8b07e0603887..93d2e7153816 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -92,7 +92,7 @@ struct bio { unsigned short bi_max_vecs; /* max bvl_vecs we can hold */ - atomic_t bi_cnt; /* pin count */ + atomic_t __bi_cnt; /* pin count */ struct bio_vec *bi_io_vec; /* the actual vec list */ @@ -123,6 +123,7 @@ struct bio { #define BIO_QUIET 9 /* Make BIO Quiet */ #define BIO_SNAP_STABLE 10 /* bio data must be snapshotted during write */ #define BIO_CHAIN 11 /* chained bio, ->bi_remaining in effect */ +#define BIO_REFFED 12 /* bio has elevated ->bi_cnt */ /* * Flags starting here get preserved by bio_reset() - this includes -- cgit v1.2.3 From 84be456f883c4685680fba8e5154b5f72e92957e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 1 May 2015 12:46:15 +0200 Subject: remove We don't have any arch specific scatterlist now that parisc switched over to the generic one. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +-- include/linux/dmapool.h | 2 +- include/linux/scatterlist.h | 39 ++++++++++++++++++++++++++++++++------- 3 files changed, 34 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7f9a516f24de..504af1e65ce1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -22,8 +22,7 @@ #include #include #include - -#include +#include struct module; struct scsi_ioctl_command; diff --git a/include/linux/dmapool.h b/include/linux/dmapool.h index 52456aa566a0..e1043f79122f 100644 --- a/include/linux/dmapool.h +++ b/include/linux/dmapool.h @@ -11,8 +11,8 @@ #ifndef LINUX_DMAPOOL_H #define LINUX_DMAPOOL_H +#include #include -#include struct device; diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index ed8f9e70df9b..eca1ec93775c 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -2,13 +2,39 @@ #define _LINUX_SCATTERLIST_H #include +#include #include #include - -#include -#include #include +struct scatterlist { +#ifdef CONFIG_DEBUG_SG + unsigned long sg_magic; +#endif + unsigned long page_link; + unsigned int offset; + unsigned int length; + dma_addr_t dma_address; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + unsigned int dma_length; +#endif +}; + +/* + * These macros should be used after a dma_map_sg call has been done + * to get bus addresses of each of the SG entries and their lengths. + * You should only work with the number of sg entries dma_map_sg + * returns, or alternatively stop on the first sg_dma_len(sg) which + * is 0. + */ +#define sg_dma_address(sg) ((sg)->dma_address) + +#ifdef CONFIG_NEED_SG_DMA_LENGTH +#define sg_dma_len(sg) ((sg)->dma_length) +#else +#define sg_dma_len(sg) ((sg)->length) +#endif + struct sg_table { struct scatterlist *sgl; /* the list */ unsigned int nents; /* number of mapped entries */ @@ -18,10 +44,9 @@ struct sg_table { /* * Notes on SG table design. * - * Architectures must provide an unsigned long page_link field in the - * scatterlist struct. We use that to place the page pointer AND encode - * information about the sg table as well. The two lower bits are reserved - * for this information. + * We use the unsigned long page_link field in the scatterlist struct to place + * the page pointer AND encode information about the sg table as well. The two + * lower bits are reserved for this information. * * If bit 0 is set, then the page_link contains a pointer to the next sg * table list. Otherwise the next entry is at sg + 1. -- cgit v1.2.3 From 4f8c9510ba71bb54477841bebb90154ef140860f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:16 +0200 Subject: block: rename REQ_TYPE_SPECIAL to REQ_TYPE_DRV_PRIV Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7f9a516f24de..98c90272443b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -79,10 +79,10 @@ enum rq_cmd_type_bits { REQ_TYPE_PM_SUSPEND, /* suspend request */ REQ_TYPE_PM_RESUME, /* resume request */ REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ - REQ_TYPE_SPECIAL, /* driver defined type */ + REQ_TYPE_DRV_PRIV, /* driver defined type */ /* * for ATA/ATAPI devices. this really doesn't belong here, ide should - * use REQ_TYPE_SPECIAL and use rq->cmd[0] with the range of driver + * use REQ_TYPE_DRV_PRIV and use rq->cmd[0] with the range of driver * private REQ_LB opcodes to differentiate what type of request this is */ REQ_TYPE_ATA_TASKFILE, -- cgit v1.2.3 From b42171ef7d938a66fa52e66a3d911ed63770b5ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:17 +0200 Subject: block: move REQ_TYPE_ATA_TASKFILE and REQ_TYPE_ATA_PC to ide.h These values are only used by the IDE driver, so move them into it by allowing drivers to take cmd_type values after the first private one. Note that we have to turn cmd_type into a plain unsigned integer so that gcc doesn't complain about mismatching enum types. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 11 ++--------- include/linux/ide.h | 7 +++++++ 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 98c90272443b..9cb4d80a4987 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -79,14 +79,7 @@ enum rq_cmd_type_bits { REQ_TYPE_PM_SUSPEND, /* suspend request */ REQ_TYPE_PM_RESUME, /* resume request */ REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ - REQ_TYPE_DRV_PRIV, /* driver defined type */ - /* - * for ATA/ATAPI devices. this really doesn't belong here, ide should - * use REQ_TYPE_DRV_PRIV and use rq->cmd[0] with the range of driver - * private REQ_LB opcodes to differentiate what type of request this is - */ - REQ_TYPE_ATA_TASKFILE, - REQ_TYPE_ATA_PC, + REQ_TYPE_DRV_PRIV, /* driver defined types from here */ }; #define BLK_MAX_CDB 16 @@ -108,7 +101,7 @@ struct request { struct blk_mq_ctx *mq_ctx; u64 cmd_flags; - enum rq_cmd_type_bits cmd_type; + unsigned cmd_type; unsigned long atomic_flags; int cpu; diff --git a/include/linux/ide.h b/include/linux/ide.h index 93b5ca754b5b..62ac399144a6 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -39,6 +39,12 @@ struct device; +/* IDE-specific values for req->cmd_type */ +enum ata_cmd_type_bits { + REQ_TYPE_ATA_TASKFILE = REQ_TYPE_DRV_PRIV + 1, + REQ_TYPE_ATA_PC, +}; + /* Error codes returned in rq->errors to the higher part of the driver. */ enum { IDE_DRV_ERROR_GENERAL = 101, @@ -1551,4 +1557,5 @@ static inline void ide_set_drivedata(ide_drive_t *drive, void *data) #define ide_host_for_each_port(i, port, host) \ for ((i) = 0; ((port) = (host)->ports[i]) || (i) < MAX_HOST_PORTS; (i)++) + #endif /* _IDE_H */ -- cgit v1.2.3 From b0b93b48a30e809240ddd7449a6ad60a5ddf7b4d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:18 +0200 Subject: block: move REQ_TYPE_SENSE to the ide driver Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - include/linux/ide.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9cb4d80a4987..6076b9e18dcb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -75,7 +75,6 @@ struct request_list { enum rq_cmd_type_bits { REQ_TYPE_FS = 1, /* fs request */ REQ_TYPE_BLOCK_PC, /* scsi command */ - REQ_TYPE_SENSE, /* sense request */ REQ_TYPE_PM_SUSPEND, /* suspend request */ REQ_TYPE_PM_RESUME, /* resume request */ REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ diff --git a/include/linux/ide.h b/include/linux/ide.h index 62ac399144a6..9856b7d455d9 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -43,6 +43,7 @@ struct device; enum ata_cmd_type_bits { REQ_TYPE_ATA_TASKFILE = REQ_TYPE_DRV_PRIV + 1, REQ_TYPE_ATA_PC, + REQ_TYPE_ATA_SENSE, /* sense request */ }; /* Error codes returned in rq->errors to the higher part of the driver. */ -- cgit v1.2.3 From ac7cdff00a33d48d27217560fa3b16d802e5f535 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:19 +0200 Subject: block: remove REQ_TYPE_PM_SHUTDOWN Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6076b9e18dcb..c2829ba5e738 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -77,7 +77,6 @@ enum rq_cmd_type_bits { REQ_TYPE_BLOCK_PC, /* scsi command */ REQ_TYPE_PM_SUSPEND, /* suspend request */ REQ_TYPE_PM_RESUME, /* resume request */ - REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ REQ_TYPE_DRV_PRIV, /* driver defined types from here */ }; -- cgit v1.2.3 From a7928c1578c550bd6f4dec62d65132e6db226c57 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:20 +0200 Subject: block: move PM request support to IDE This removes the request types and hacks from the block code and into the old IDE driver. There is a small amunt of code duplication due to this, but it's not too bad. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 21 +-------------------- include/linux/ide.h | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c2829ba5e738..2da818a48097 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -30,7 +30,6 @@ struct scsi_ioctl_command; struct request_queue; struct elevator_queue; -struct request_pm_state; struct blk_trace; struct request; struct sg_io_hdr; @@ -75,8 +74,6 @@ struct request_list { enum rq_cmd_type_bits { REQ_TYPE_FS = 1, /* fs request */ REQ_TYPE_BLOCK_PC, /* scsi command */ - REQ_TYPE_PM_SUSPEND, /* suspend request */ - REQ_TYPE_PM_RESUME, /* resume request */ REQ_TYPE_DRV_PRIV, /* driver defined types from here */ }; @@ -207,19 +204,6 @@ static inline unsigned short req_get_ioprio(struct request *req) return req->ioprio; } -/* - * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME - * requests. Some step values could eventually be made generic. - */ -struct request_pm_state -{ - /* PM state machine step value, currently driver specific */ - int pm_step; - /* requested PM state value (S1, S2, S3, S4, ...) */ - u32 pm_state; - void* data; /* for driver use */ -}; - #include struct blk_queue_ctx; @@ -601,10 +585,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) (((rq)->cmd_flags & REQ_STARTED) && \ ((rq)->cmd_type == REQ_TYPE_FS)) -#define blk_pm_request(rq) \ - ((rq)->cmd_type == REQ_TYPE_PM_SUSPEND || \ - (rq)->cmd_type == REQ_TYPE_PM_RESUME) - #define blk_rq_cpu_valid(rq) ((rq)->cpu != -1) #define blk_bidi_rq(rq) ((rq)->next_rq != NULL) /* rq->queuelist of dequeued request must be list_empty() */ @@ -838,6 +818,7 @@ extern void blk_stop_queue(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); extern void __blk_stop_queue(struct request_queue *q); extern void __blk_run_queue(struct request_queue *q); +extern void __blk_run_queue_uncond(struct request_queue *q); extern void blk_run_queue(struct request_queue *); extern void blk_run_queue_async(struct request_queue *q); extern int blk_rq_map_user(struct request_queue *, struct request *, diff --git a/include/linux/ide.h b/include/linux/ide.h index 9856b7d455d9..a633898f36ac 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -44,8 +44,14 @@ enum ata_cmd_type_bits { REQ_TYPE_ATA_TASKFILE = REQ_TYPE_DRV_PRIV + 1, REQ_TYPE_ATA_PC, REQ_TYPE_ATA_SENSE, /* sense request */ + REQ_TYPE_ATA_PM_SUSPEND,/* suspend request */ + REQ_TYPE_ATA_PM_RESUME, /* resume request */ }; +#define ata_pm_request(rq) \ + ((rq)->cmd_type == REQ_TYPE_ATA_PM_SUSPEND || \ + (rq)->cmd_type == REQ_TYPE_ATA_PM_RESUME) + /* Error codes returned in rq->errors to the higher part of the driver. */ enum { IDE_DRV_ERROR_GENERAL = 101, @@ -1321,6 +1327,19 @@ struct ide_port_info { u8 udma_mask; }; +/* + * State information carried for REQ_TYPE_ATA_PM_SUSPEND and REQ_TYPE_ATA_PM_RESUME + * requests. + */ +struct ide_pm_state { + /* PM state machine step value, currently driver specific */ + int pm_step; + /* requested PM state value (S1, S2, S3, S4, ...) */ + u32 pm_state; + void* data; /* for driver use */ +}; + + int ide_pci_init_one(struct pci_dev *, const struct ide_port_info *, void *); int ide_pci_init_two(struct pci_dev *, struct pci_dev *, const struct ide_port_info *, void *); -- cgit v1.2.3 From 2893c379461a208b3059f55dfe4dafa06b4aa46a Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Tue, 31 Mar 2015 20:16:52 +0200 Subject: clk: make strings in parent name arrays const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The clk functions and structs declare the parent_name arrays as 'const char **parent_names' which means the parent name strings are const, but the array itself is not. Use 'const char * const * parent_names' instead which also makes the array const. This allows us to put the parent_name arrays into the __initconst section. Signed-off-by: Sascha Hauer Reviewed-by: Krzysztof Kozlowski Tested-by: Krzysztof Kozlowski Acked-by: Uwe Kleine-König [sboyd@codeaurora.org: Squelch 80-character checkpatch warnings] Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index df695313f975..5378c2aba4d2 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -209,7 +209,7 @@ struct clk_ops { struct clk_init_data { const char *name; const struct clk_ops *ops; - const char **parent_names; + const char * const *parent_names; u8 num_parents; unsigned long flags; }; @@ -426,12 +426,14 @@ extern const struct clk_ops clk_mux_ops; extern const struct clk_ops clk_mux_ro_ops; struct clk *clk_register_mux(struct device *dev, const char *name, - const char **parent_names, u8 num_parents, unsigned long flags, + const char * const *parent_names, u8 num_parents, + unsigned long flags, void __iomem *reg, u8 shift, u8 width, u8 clk_mux_flags, spinlock_t *lock); struct clk *clk_register_mux_table(struct device *dev, const char *name, - const char **parent_names, u8 num_parents, unsigned long flags, + const char * const *parent_names, u8 num_parents, + unsigned long flags, void __iomem *reg, u8 shift, u32 mask, u8 clk_mux_flags, u32 *table, spinlock_t *lock); @@ -518,7 +520,7 @@ struct clk_composite { }; struct clk *clk_register_composite(struct device *dev, const char *name, - const char **parent_names, int num_parents, + const char * const *parent_names, int num_parents, struct clk_hw *mux_hw, const struct clk_ops *mux_ops, struct clk_hw *rate_hw, const struct clk_ops *rate_ops, struct clk_hw *gate_hw, const struct clk_ops *gate_ops, -- cgit v1.2.3 From fa76a3db7093a527333c380df82a0f158d9b8299 Mon Sep 17 00:00:00 2001 From: Sonic Zhang Date: Thu, 9 Apr 2015 11:13:07 +0800 Subject: pinctrl: allow exlusive GPIO/mux pin allocation Disallow simultaneous use of the the GPIO and peripheral mux functions by setting a flag "strict" in struct pinctrl_desc. The blackfin pinmux and gpio controller doesn't allow user to set up a pin for both GPIO and peripheral function. So, add flag strict in struct pinctrl_desc to check both gpio_owner and mux_owner before approving the pin request. v2-changes: - if strict flag is set, check gpio_owner and mux_onwer in if and else clause v3-changes: - add kerneldoc for this struct - augment Documentation/pinctrl.txt Signed-off-by: Sonic Zhang Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinctrl.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index 66e4697516de..fc6b0348c375 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -114,6 +114,8 @@ struct pinctrl_ops { * of the pins field above * @pctlops: pin control operation vtable, to support global concepts like * grouping of pins, this is optional. + * @strict: check both gpio_owner and mux_owner strictly before approving + the pin request * @pmxops: pinmux operations vtable, if you support pinmuxing in your driver * @confops: pin config operations vtable, if you support pin configuration in * your driver @@ -132,6 +134,7 @@ struct pinctrl_desc { const struct pinctrl_ops *pctlops; const struct pinmux_ops *pmxops; const struct pinconf_ops *confops; + bool strict; struct module *owner; #ifdef CONFIG_GENERIC_PINCONF unsigned int num_custom_params; -- cgit v1.2.3 From 8c4c2016345feefcd289ce2479eb70286d30825a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 6 May 2015 14:19:13 +0200 Subject: pinctrl: move strict option to pinmux_ops While the pinmux_ops are ideally just a vtable for pin mux calls, the "strict" setting belongs so intuitively with the pin multiplexing that we should move it here anyway. Putting it in the top pinctrl_desc makes no sense. Cc: Sonic Zhang Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinctrl.h | 3 --- include/linux/pinctrl/pinmux.h | 4 ++++ 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index fc6b0348c375..66e4697516de 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -114,8 +114,6 @@ struct pinctrl_ops { * of the pins field above * @pctlops: pin control operation vtable, to support global concepts like * grouping of pins, this is optional. - * @strict: check both gpio_owner and mux_owner strictly before approving - the pin request * @pmxops: pinmux operations vtable, if you support pinmuxing in your driver * @confops: pin config operations vtable, if you support pin configuration in * your driver @@ -134,7 +132,6 @@ struct pinctrl_desc { const struct pinctrl_ops *pctlops; const struct pinmux_ops *pmxops; const struct pinconf_ops *confops; - bool strict; struct module *owner; #ifdef CONFIG_GENERIC_PINCONF unsigned int num_custom_params; diff --git a/include/linux/pinctrl/pinmux.h b/include/linux/pinctrl/pinmux.h index 511bda9ed4bf..d3740fa7073f 100644 --- a/include/linux/pinctrl/pinmux.h +++ b/include/linux/pinctrl/pinmux.h @@ -56,6 +56,9 @@ struct pinctrl_dev; * depending on whether the GPIO is configured as input or output, * a direction selector function may be implemented as a backing * to the GPIO controllers that need pin muxing. + * @strict: do not allow simultaneous use of the same pin for GPIO and another + * function. Check both gpio_owner and mux_owner strictly before approving + * the pin request. */ struct pinmux_ops { int (*request) (struct pinctrl_dev *pctldev, unsigned offset); @@ -79,6 +82,7 @@ struct pinmux_ops { struct pinctrl_gpio_range *range, unsigned offset, bool input); + bool strict; }; #endif /* CONFIG_PINMUX */ -- cgit v1.2.3 From 66eb3bd857f5311f72c7c371f78ddc9c472befba Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 27 Apr 2015 18:04:05 +0200 Subject: pinctrl: use ERR_CAST instead of ERR_PTR/PTR_ERR Inspired by scripts/coccinelle/api/err_cast.cocci Signed-off-by: Fabian Frederick Signed-off-by: Linus Walleij --- include/linux/pinctrl/consumer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h index 18eccefea06e..d7e5d608faa7 100644 --- a/include/linux/pinctrl/consumer.h +++ b/include/linux/pinctrl/consumer.h @@ -142,7 +142,7 @@ static inline struct pinctrl * __must_check pinctrl_get_select( s = pinctrl_lookup_state(p, name); if (IS_ERR(s)) { pinctrl_put(p); - return ERR_PTR(PTR_ERR(s)); + return ERR_CAST(s); } ret = pinctrl_select_state(p, s); -- cgit v1.2.3 From 1d6b98774cff82860a3f044610e956bcbff556c1 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 31 Mar 2015 15:55:57 +0200 Subject: tty: constify return type of tty_name All users of tty_name pass the result directly to a printf-like function. This means we can actually let tty_name return the literal "NULL tty" or tty->name directly, avoiding the strcpy and a lot of medium-sized stack buffers. In preparation for that, make the return type const char*. While at it, we can also constify the tty parameter. Signed-off-by: Rasmus Villemoes Reviewed-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index fe5623c9af71..4cbecfc7b3c9 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -421,7 +421,7 @@ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty) extern int tty_paranoia_check(struct tty_struct *tty, struct inode *inode, const char *routine); -extern char *tty_name(struct tty_struct *tty, char *buf); +extern const char *tty_name(const struct tty_struct *tty, char *buf); extern void tty_wait_until_sent(struct tty_struct *tty, long timeout); extern int tty_check_change(struct tty_struct *tty); extern void __stop_tty(struct tty_struct *tty); -- cgit v1.2.3 From 429b474990cb4e5e8cfe2352daf649d0599cccb6 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 31 Mar 2015 15:55:59 +0200 Subject: tty: remove buf parameter from tty_name() tty_name no longer uses the buf parameter, so remove it along with all the 64 byte stack buffers that used to be passed in. Mostly generated by the coccinelle script @depends on patch@ identifier buf; constant C; expression tty; @@ - char buf[C]; <+... - tty_name(tty, buf) + tty_name(tty) ...+> allmodconfig compiles, so I'm fairly confident the stack buffers weren't used for other purposes as well. Signed-off-by: Rasmus Villemoes Reviewed-by: Peter Hurley Acked-by: Jesper Nilsson Acked-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 4cbecfc7b3c9..9a72c9144d8a 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -421,7 +421,7 @@ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty) extern int tty_paranoia_check(struct tty_struct *tty, struct inode *inode, const char *routine); -extern const char *tty_name(const struct tty_struct *tty, char *buf); +extern const char *tty_name(const struct tty_struct *tty); extern void tty_wait_until_sent(struct tty_struct *tty, long timeout); extern int tty_check_change(struct tty_struct *tty); extern void __stop_tty(struct tty_struct *tty); -- cgit v1.2.3 From 6b3cddccf4eec0883feb065aea28dd9770bb17d0 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 11 Apr 2015 11:02:36 -0400 Subject: serial: core: Fix unused variable warnings from uart_console() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_SERIAL_CORE_CONSOLE=n, build warnings are generated by uart_console() macro expansion: drivers/tty/serial/of_serial.c: In function ‘of_serial_suspend_8250’: drivers/tty/serial/of_serial.c:262:20: warning: unused variable ‘port’ [-Wunused-variable] struct uart_port *port = &port8250->port; ^ drivers/tty/serial/of_serial.c: In function ‘of_serial_resume_8250’: drivers/tty/serial/of_serial.c:272:20: warning: unused variable ‘port’ [-Wunused-variable] struct uart_port *port = &port8250->port; Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 025dad9dcde4..297d4fa1cfe5 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -35,7 +35,7 @@ #define uart_console(port) \ ((port)->cons && (port)->cons->index == (port)->line) #else -#define uart_console(port) (0) +#define uart_console(port) ({ (void)port; 0; }) #endif struct uart_port; -- cgit v1.2.3 From 385f83f85cd9428db82cae5e6f6f786be113b24c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 28 Apr 2015 12:21:40 +0200 Subject: dmaengine: Remove Renesas Audio DMAC peri peri platform data Commit 3cd44dcd35a6 ("dmaengine: remove Renesas Audio DMAC peri peri") forgot to remove the header file with the platform data definitions. Signed-off-by: Geert Uytterhoeven Acked-by: Simon Horman Signed-off-by: Vinod Koul --- include/linux/platform_data/dma-rcar-audmapp.h | 34 -------------------------- 1 file changed, 34 deletions(-) delete mode 100644 include/linux/platform_data/dma-rcar-audmapp.h (limited to 'include/linux') diff --git a/include/linux/platform_data/dma-rcar-audmapp.h b/include/linux/platform_data/dma-rcar-audmapp.h deleted file mode 100644 index 471fffebbeb4..000000000000 --- a/include/linux/platform_data/dma-rcar-audmapp.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * This is for Renesas R-Car Audio-DMAC-peri-peri. - * - * Copyright (C) 2014 Renesas Electronics Corporation - * Copyright (C) 2014 Kuninori Morimoto - * - * This file is based on the include/linux/sh_dma.h - * - * Header for the new SH dmaengine driver - * - * Copyright (C) 2010 Guennadi Liakhovetski - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#ifndef SH_AUDMAPP_H -#define SH_AUDMAPP_H - -#include - -struct audmapp_slave_config { - int slave_id; - dma_addr_t src; - dma_addr_t dst; - u32 chcr; -}; - -struct audmapp_pdata { - struct audmapp_slave_config *slave; - int slave_num; -}; - -#endif /* SH_AUDMAPP_H */ -- cgit v1.2.3 From 56f13c0d9524c5816f5dc9c91b9d766d6b1064ca Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 9 Apr 2015 12:35:47 +0300 Subject: dmaengine: of_dma: Support for DMA routers DMA routers are transparent devices used to mux DMA requests from peripherals to DMA controllers. They are used when the SoC integrates more devices with DMA requests then their controller can handle. DRA7x is one example of such SoC, where the sDMA can hanlde 128 DMA request lines, but in SoC level it has 205 DMA requests. The of_dma_router will be registered as of_dma_controller with special xlate function and additional parameters. The driver for the router is responsible to craft the dma_spec (in the of_dma_route_allocate callback) which can be used to requests a DMA channel from the real DMA controller. This way the router can be transparent for the system while remaining generic enough to be used in different environments. Signed-off-by: Peter Ujfalusi Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 17 +++++++++++++++++ include/linux/of_dma.h | 21 +++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index ad419757241f..abf63ceabef9 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -221,6 +221,16 @@ struct dma_chan_percpu { unsigned long bytes_transferred; }; +/** + * struct dma_router - DMA router structure + * @dev: pointer to the DMA router device + * @route_free: function to be called when the route can be disconnected + */ +struct dma_router { + struct device *dev; + void (*route_free)(struct device *dev, void *route_data); +}; + /** * struct dma_chan - devices supply DMA channels, clients use them * @device: ptr to the dma device who supplies this channel, always !%NULL @@ -232,6 +242,8 @@ struct dma_chan_percpu { * @local: per-cpu pointer to a struct dma_chan_percpu * @client_count: how many clients are using this channel * @table_count: number of appearances in the mem-to-mem allocation table + * @router: pointer to the DMA router structure + * @route_data: channel specific data for the router * @private: private data for certain client-channel associations */ struct dma_chan { @@ -247,6 +259,11 @@ struct dma_chan { struct dma_chan_percpu __percpu *local; int client_count; int table_count; + + /* DMA router */ + struct dma_router *router; + void *route_data; + void *private; }; diff --git a/include/linux/of_dma.h b/include/linux/of_dma.h index 56bc026c143f..98ba7525929e 100644 --- a/include/linux/of_dma.h +++ b/include/linux/of_dma.h @@ -23,6 +23,9 @@ struct of_dma { struct device_node *of_node; struct dma_chan *(*of_dma_xlate) (struct of_phandle_args *, struct of_dma *); + void *(*of_dma_route_allocate) + (struct of_phandle_args *, struct of_dma *); + struct dma_router *dma_router; void *of_dma_data; }; @@ -37,12 +40,20 @@ extern int of_dma_controller_register(struct device_node *np, (struct of_phandle_args *, struct of_dma *), void *data); extern void of_dma_controller_free(struct device_node *np); + +extern int of_dma_router_register(struct device_node *np, + void *(*of_dma_route_allocate) + (struct of_phandle_args *, struct of_dma *), + struct dma_router *dma_router); +#define of_dma_router_free of_dma_controller_free + extern struct dma_chan *of_dma_request_slave_channel(struct device_node *np, const char *name); extern struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_spec, struct of_dma *ofdma); extern struct dma_chan *of_dma_xlate_by_chan_id(struct of_phandle_args *dma_spec, struct of_dma *ofdma); + #else static inline int of_dma_controller_register(struct device_node *np, struct dma_chan *(*of_dma_xlate) @@ -56,6 +67,16 @@ static inline void of_dma_controller_free(struct device_node *np) { } +static inline int of_dma_router_register(struct device_node *np, + void *(*of_dma_route_allocate) + (struct of_phandle_args *, struct of_dma *), + struct dma_router *dma_router) +{ + return -ENODEV; +} + +#define of_dma_router_free of_dma_controller_free + static inline struct dma_chan *of_dma_request_slave_channel(struct device_node *np, const char *name) { -- cgit v1.2.3 From 34ef33f7da6b00900d3a896d33522a035a930245 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 28 Apr 2015 14:04:07 +0200 Subject: usb: phy: Remove the phy-rcar-gen2-usb driver The phy-rcar-gen2-usb driver, which supports legacy platform data only, is no longer used since commit a483dcbfa21f919c ("ARM: shmobile: lager: Remove legacy board support"). This driver was superseded by the DT-only phy-rcar-gen2 driver, which was introduced in commit 1233f59f745b237d ("phy: Renesas R-Car Gen2 PHY driver"). Signed-off-by: Geert Uytterhoeven Signed-off-by: Greg Kroah-Hartman --- include/linux/platform_data/usb-rcar-gen2-phy.h | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 include/linux/platform_data/usb-rcar-gen2-phy.h (limited to 'include/linux') diff --git a/include/linux/platform_data/usb-rcar-gen2-phy.h b/include/linux/platform_data/usb-rcar-gen2-phy.h deleted file mode 100644 index dd3ba46c0d90..000000000000 --- a/include/linux/platform_data/usb-rcar-gen2-phy.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (C) 2013 Renesas Solutions Corp. - * Copyright (C) 2013 Cogent Embedded, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef __USB_RCAR_GEN2_PHY_H -#define __USB_RCAR_GEN2_PHY_H - -#include - -struct rcar_gen2_phy_platform_data { - /* USB channel 0 configuration */ - bool chan0_pci:1; /* true: PCI USB host 0, false: USBHS */ - /* USB channel 2 configuration */ - bool chan2_pci:1; /* true: PCI USB host 2, false: USBSS */ -}; - -#endif -- cgit v1.2.3 From 1c5841e832e2d7563c31de4946118e78baf573a3 Mon Sep 17 00:00:00 2001 From: Eddie Huang Date: Tue, 28 Apr 2015 21:40:32 +0800 Subject: tty: serial: 8250: export early_serial8250_setup function 8250-like uart driver may call early_serial8250_setup to reuse 8250_early.c character output function. Signed-off-by: Eddie Huang Tested-by: Sascha Hauer Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_8250.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 78097e7a330a..f0c68d88b6f4 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -137,6 +137,8 @@ extern int early_serial_setup(struct uart_port *port); extern unsigned int serial8250_early_in(struct uart_port *port, int offset); extern void serial8250_early_out(struct uart_port *port, int offset, int value); +extern int early_serial8250_setup(struct earlycon_device *device, + const char *options); extern void serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, struct ktermios *old); extern int serial8250_do_startup(struct uart_port *port); -- cgit v1.2.3 From c27ffc1080179c3f3b85e1e194fa61f1c9923b62 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 30 Apr 2015 18:21:25 +0200 Subject: serial: sh-sci: Move private definitions to private header file Move private register definitions and enums from the public header file to the driver private "sh-sci.h" header file. The common Serial Control Register definitions are left in the public header file, as they're needed to fill in plat_sci_port.scscr on legacy systems not using DT. Signed-off-by: Geert Uytterhoeven Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_sci.h | 67 +--------------------------------------------- 1 file changed, 1 insertion(+), 66 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h index 6c5e3bb282b0..395fceb8c060 100644 --- a/include/linux/serial_sci.h +++ b/include/linux/serial_sci.h @@ -10,13 +10,6 @@ #define SCIx_NOT_SUPPORTED (-1) -/* SCSMR (Serial Mode Register) */ -#define SCSMR_CHR (1 << 6) /* 7-bit Character Length */ -#define SCSMR_PE (1 << 5) /* Parity Enable */ -#define SCSMR_ODD (1 << 4) /* Odd Parity */ -#define SCSMR_STOP (1 << 3) /* Stop Bit Length */ -#define SCSMR_CKS 0x0003 /* Clock Select */ - /* Serial Control Register (@ = not supported by all parts) */ #define SCSCR_TIE (1 << 7) /* Transmit Interrupt Enable */ #define SCSCR_RIE (1 << 6) /* Receive Interrupt Enable */ @@ -26,43 +19,7 @@ #define SCSCR_TOIE (1 << 2) /* Timeout Interrupt Enable @ */ #define SCSCR_CKE1 (1 << 1) /* Clock Enable 1 */ #define SCSCR_CKE0 (1 << 0) /* Clock Enable 0 */ -/* SCIFA/SCIFB only */ -#define SCSCR_TDRQE (1 << 15) /* Tx Data Transfer Request Enable */ -#define SCSCR_RDRQE (1 << 14) /* Rx Data Transfer Request Enable */ - -/* SCxSR (Serial Status Register) on SCI */ -#define SCI_TDRE 0x80 /* Transmit Data Register Empty */ -#define SCI_RDRF 0x40 /* Receive Data Register Full */ -#define SCI_ORER 0x20 /* Overrun Error */ -#define SCI_FER 0x10 /* Framing Error */ -#define SCI_PER 0x08 /* Parity Error */ -#define SCI_TEND 0x04 /* Transmit End */ - -#define SCI_DEFAULT_ERROR_MASK (SCI_PER | SCI_FER) - -/* SCxSR (Serial Status Register) on SCIF, HSCIF */ -#define SCIF_ER 0x0080 /* Receive Error */ -#define SCIF_TEND 0x0040 /* Transmission End */ -#define SCIF_TDFE 0x0020 /* Transmit FIFO Data Empty */ -#define SCIF_BRK 0x0010 /* Break Detect */ -#define SCIF_FER 0x0008 /* Framing Error */ -#define SCIF_PER 0x0004 /* Parity Error */ -#define SCIF_RDF 0x0002 /* Receive FIFO Data Full */ -#define SCIF_DR 0x0001 /* Receive Data Ready */ - -#define SCIF_DEFAULT_ERROR_MASK (SCIF_PER | SCIF_FER | SCIF_ER | SCIF_BRK) - -/* SCFCR (FIFO Control Register) */ -#define SCFCR_LOOP (1 << 0) /* Loopback Test */ - -/* SCSPTR (Serial Port Register), optional */ -#define SCSPTR_RTSIO (1 << 7) /* Serial Port RTS Pin Input/Output */ -#define SCSPTR_CTSIO (1 << 5) /* Serial Port CTS Pin Input/Output */ -#define SCSPTR_SPB2IO (1 << 1) /* Serial Port Break Input/Output */ -#define SCSPTR_SPB2DT (1 << 0) /* Serial Port Break Data */ - -/* HSSRR HSCIF */ -#define HSCIF_SRE 0x8000 /* Sampling Rate Register Enable */ + enum { SCIx_PROBE_REGTYPE, @@ -82,28 +39,6 @@ enum { SCIx_NR_REGTYPES, }; -/* - * SCI register subset common for all port types. - * Not all registers will exist on all parts. - */ -enum { - SCSMR, /* Serial Mode Register */ - SCBRR, /* Bit Rate Register */ - SCSCR, /* Serial Control Register */ - SCxSR, /* Serial Status Register */ - SCFCR, /* FIFO Control Register */ - SCFDR, /* FIFO Data Count Register */ - SCxTDR, /* Transmit (FIFO) Data Register */ - SCxRDR, /* Receive (FIFO) Data Register */ - SCLSR, /* Line Status Register */ - SCTFDR, /* Transmit FIFO Data Count Register */ - SCRFDR, /* Receive FIFO Data Count Register */ - SCSPTR, /* Serial Port Register */ - HSSRR, /* Sampling Rate Register */ - - SCIx_NR_REGS, -}; - struct device; struct plat_sci_port_ops { -- cgit v1.2.3 From d94a0a3857987c76c37a8095977fe554799ab69d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 30 Apr 2015 18:21:29 +0200 Subject: serial: sh-sci: Standardize on using the BIT() macro to define register bits Signed-off-by: Geert Uytterhoeven Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_sci.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h index 395fceb8c060..7c536ac5be05 100644 --- a/include/linux/serial_sci.h +++ b/include/linux/serial_sci.h @@ -1,6 +1,7 @@ #ifndef __LINUX_SERIAL_SCI_H #define __LINUX_SERIAL_SCI_H +#include #include #include @@ -11,14 +12,14 @@ #define SCIx_NOT_SUPPORTED (-1) /* Serial Control Register (@ = not supported by all parts) */ -#define SCSCR_TIE (1 << 7) /* Transmit Interrupt Enable */ -#define SCSCR_RIE (1 << 6) /* Receive Interrupt Enable */ -#define SCSCR_TE (1 << 5) /* Transmit Enable */ -#define SCSCR_RE (1 << 4) /* Receive Enable */ -#define SCSCR_REIE (1 << 3) /* Receive Error Interrupt Enable @ */ -#define SCSCR_TOIE (1 << 2) /* Timeout Interrupt Enable @ */ -#define SCSCR_CKE1 (1 << 1) /* Clock Enable 1 */ -#define SCSCR_CKE0 (1 << 0) /* Clock Enable 0 */ +#define SCSCR_TIE BIT(7) /* Transmit Interrupt Enable */ +#define SCSCR_RIE BIT(6) /* Receive Interrupt Enable */ +#define SCSCR_TE BIT(5) /* Transmit Enable */ +#define SCSCR_RE BIT(4) /* Receive Enable */ +#define SCSCR_REIE BIT(3) /* Receive Error Interrupt Enable @ */ +#define SCSCR_TOIE BIT(2) /* Timeout Interrupt Enable @ */ +#define SCSCR_CKE1 BIT(1) /* Clock Enable 1 */ +#define SCSCR_CKE0 BIT(0) /* Clock Enable 0 */ enum { @@ -48,7 +49,7 @@ struct plat_sci_port_ops { /* * Port-specific capabilities */ -#define SCIx_HAVE_RTSCTS (1 << 0) +#define SCIx_HAVE_RTSCTS BIT(0) /* * Platform device specific platform_data struct -- cgit v1.2.3 From bd63364caa8df38bad2b25b11b2a1b849475cce5 Mon Sep 17 00:00:00 2001 From: Scot Doyle Date: Thu, 26 Mar 2015 13:54:39 +0000 Subject: vt: add cursor blink interval escape sequence Add an escape sequence to specify the current console's cursor blink interval. The interval is specified as a number of milliseconds until the next cursor display state toggle, from 50 to 65535. /proc/loadavg did not show a difference with a one msec interval, but the lower bound is set to 50 msecs since slower hardware wasn't tested. Store the interval in the vc_data structure for later access by fbcon, initializing the value to fbcon's current hardcoded value of 200 msecs. Signed-off-by: Scot Doyle Acked-by: Pavel Machek Signed-off-by: Greg Kroah-Hartman --- include/linux/console_struct.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index e859c98d1767..e329ee2667e1 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -104,6 +104,7 @@ struct vc_data { unsigned int vc_resize_user; /* resize request from user */ unsigned int vc_bell_pitch; /* Console bell pitch */ unsigned int vc_bell_duration; /* Console bell duration */ + unsigned short vc_cur_blink_ms; /* Cursor blink duration */ struct vc_data **vc_display_fg; /* [!] Ptr to var holding fg console for this display */ struct uni_pagedir *vc_uni_pagedir; struct uni_pagedir **vc_uni_pagedir_loc; /* [!] Location of uni_pagedir variable for this console */ -- cgit v1.2.3 From faaa44955dedc661f083636d816af90975a359ee Mon Sep 17 00:00:00 2001 From: Irina Tirdea Date: Wed, 29 Apr 2015 21:16:39 +0300 Subject: iio: core: Introduce IIO_CHAN_INFO_OVERSAMPLING_RATIO Some magnetometers can perform a number of repetitions in HW for each measurement to increase accuracy. One example is Bosch BMC150: http://ae-bst.resource.bosch.com/media/products/dokumente/bmc150/BST-BMC150-DS000-04.pdf. Introduce an interface to set the oversampling ratio for these devices. Signed-off-by: Irina Tirdea Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index b1e46ae89aa7..058441da4984 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -44,6 +44,7 @@ enum iio_chan_info_enum { IIO_CHAN_INFO_DEBOUNCE_COUNT, IIO_CHAN_INFO_DEBOUNCE_TIME, IIO_CHAN_INFO_CALIBEMISSIVITY, + IIO_CHAN_INFO_OVERSAMPLING_RATIO, }; enum iio_shared_by { -- cgit v1.2.3 From 6be109b31ccdb9c98e7be12687171f6602527a5d Mon Sep 17 00:00:00 2001 From: Arun Ramamurthy Date: Wed, 22 Apr 2015 16:04:11 -0700 Subject: phy: core: Add devm_of_phy_get_by_index to phy-core Some generic drivers, such as ehci, may use multiple phys and for such drivers referencing phy(s) by name(s) does not make sense. Instead of inventing new naming schemes and using custom code to iterate through them, such drivers are better of using nameless phy bindings and using this newly introduced API to iterate through them. Signed-off-by: Arun Ramamurthy Reviewed-by: Ray Jui Reviewed-by: Scott Branden [kishon@ti.com: fix compilation errors] Signed-off-by: Kishon Vijay Abraham I --- include/linux/phy/phy.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index a0197fa1b116..8cf05e341cff 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -133,6 +133,8 @@ struct phy *devm_phy_get(struct device *dev, const char *string); struct phy *devm_phy_optional_get(struct device *dev, const char *string); struct phy *devm_of_phy_get(struct device *dev, struct device_node *np, const char *con_id); +struct phy *devm_of_phy_get_by_index(struct device *dev, struct device_node *np, + int index); void phy_put(struct phy *phy); void devm_phy_put(struct device *dev, struct phy *phy); struct phy *of_phy_get(struct device_node *np, const char *con_id); @@ -261,6 +263,13 @@ static inline struct phy *devm_of_phy_get(struct device *dev, return ERR_PTR(-ENOSYS); } +static inline struct phy *devm_of_phy_get_by_index(struct device *dev, + struct device_node *np, + int index) +{ + return ERR_PTR(-ENOSYS); +} + static inline void phy_put(struct phy *phy) { } -- cgit v1.2.3 From 9d0be7f4810257a9b0fc78fff641f14409f14ab3 Mon Sep 17 00:00:00 2001 From: Eduardo Valentin Date: Mon, 11 May 2015 19:34:23 -0700 Subject: thermal: support slope and offset coefficients It is common to have a linear extrapolation from the current sensor readings and the actual temperature value. This is specially the case when the sensor is in use to extrapolate hotspots. This patch adds slope and offset constants for single sensor linear extrapolation equation. Because the same sensor can be use in different locations, from board to board, these constants are added as part of thermal_zone_params. The constants are available through sysfs. It is up to the device driver to determine the usage of these values. Signed-off-by: Eduardo Valentin --- include/linux/thermal.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 6bbe11c97cea..037e9df2f610 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -302,6 +302,17 @@ struct thermal_zone_params { /* threshold below which the error is no longer accumulated */ s32 integral_cutoff; + + /* + * @slope: slope of a linear temperature adjustment curve. + * Used by thermal zone drivers. + */ + int slope; + /* + * @offset: offset of a linear temperature adjustment curve. + * Used by thermal zone drivers (default 0). + */ + int offset; }; struct thermal_genl_event { -- cgit v1.2.3 From 05ae797566a66d159cf1e2ee11bf3f6fae40c8eb Mon Sep 17 00:00:00 2001 From: Andrew Bresticker Date: Mon, 4 May 2015 10:36:35 -0700 Subject: mailbox: Make mbox_chan_ops const The mailbox controller's channel ops ought to be read-only. Update all the mailbox drivers to make their mbox_chan_ops const as well. Signed-off-by: Andrew Bresticker Cc: Ashwin Chaugule Cc: Ley Foon Tan Acked-by: Suman Anna Signed-off-by: Jassi Brar --- include/linux/mailbox_controller.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h index d4cf96f07cfc..68c42454439b 100644 --- a/include/linux/mailbox_controller.h +++ b/include/linux/mailbox_controller.h @@ -72,7 +72,7 @@ struct mbox_chan_ops { */ struct mbox_controller { struct device *dev; - struct mbox_chan_ops *ops; + const struct mbox_chan_ops *ops; struct mbox_chan *chans; int num_chans; bool txdone_irq; -- cgit v1.2.3 From 3c4ed7bdf5997d8020cbb8d4abbef2fcfb9f1284 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 2 May 2015 15:10:46 -0700 Subject: LSM: Split security.h The security.h header file serves two purposes, interfaces for users of the security modules and interfaces for security modules. Users of the security modules don't need to know about what's in the security_operations structure, so pull it out into it's own header, lsm_hooks.h Signed-off-by: Casey Schaufler Acked-by: John Johansen Acked-by: Kees Cook Acked-by: Paul Moore Acked-by: Stephen Smalley Acked-by: Tetsuo Handa Signed-off-by: James Morris --- include/linux/lsm_hooks.h | 358 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/security.h | 305 --------------------------------------- 2 files changed, 358 insertions(+), 305 deletions(-) create mode 100644 include/linux/lsm_hooks.h (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h new file mode 100644 index 000000000000..c60f81b2d18c --- /dev/null +++ b/include/linux/lsm_hooks.h @@ -0,0 +1,358 @@ +/* + * Linux Security Module interfaces + * + * Copyright (C) 2001 WireX Communications, Inc + * Copyright (C) 2001 Greg Kroah-Hartman + * Copyright (C) 2001 Networks Associates Technology, Inc + * Copyright (C) 2001 James Morris + * Copyright (C) 2001 Silicon Graphics, Inc. (Trust Technology Group) + * Copyright (C) 2015 Intel Corporation. + * Copyright (C) 2015 Casey Schaufler + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Due to this file being licensed under the GPL there is controversy over + * whether this permits you to write a module that #includes this file + * without placing your module under the GPL. Please consult a lawyer for + * advice before doing this. + * + */ + +#ifndef __LINUX_LSM_HOOKS_H +#define __LINUX_LSM_HOOKS_H + +#include + +/* Maximum number of letters for an LSM name string */ +#define SECURITY_NAME_MAX 10 + +#ifdef CONFIG_SECURITY + +struct security_operations { + char name[SECURITY_NAME_MAX + 1]; + + int (*binder_set_context_mgr)(struct task_struct *mgr); + int (*binder_transaction)(struct task_struct *from, + struct task_struct *to); + int (*binder_transfer_binder)(struct task_struct *from, + struct task_struct *to); + int (*binder_transfer_file)(struct task_struct *from, + struct task_struct *to, + struct file *file); + + int (*ptrace_access_check)(struct task_struct *child, + unsigned int mode); + int (*ptrace_traceme)(struct task_struct *parent); + int (*capget)(struct task_struct *target, kernel_cap_t *effective, + kernel_cap_t *inheritable, kernel_cap_t *permitted); + int (*capset)(struct cred *new, const struct cred *old, + const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted); + int (*capable)(const struct cred *cred, struct user_namespace *ns, + int cap, int audit); + int (*quotactl)(int cmds, int type, int id, struct super_block *sb); + int (*quota_on)(struct dentry *dentry); + int (*syslog)(int type); + int (*settime)(const struct timespec *ts, const struct timezone *tz); + int (*vm_enough_memory)(struct mm_struct *mm, long pages); + + int (*bprm_set_creds)(struct linux_binprm *bprm); + int (*bprm_check_security)(struct linux_binprm *bprm); + int (*bprm_secureexec)(struct linux_binprm *bprm); + void (*bprm_committing_creds)(struct linux_binprm *bprm); + void (*bprm_committed_creds)(struct linux_binprm *bprm); + + int (*sb_alloc_security)(struct super_block *sb); + void (*sb_free_security)(struct super_block *sb); + int (*sb_copy_data)(char *orig, char *copy); + int (*sb_remount)(struct super_block *sb, void *data); + int (*sb_kern_mount)(struct super_block *sb, int flags, void *data); + int (*sb_show_options)(struct seq_file *m, struct super_block *sb); + int (*sb_statfs)(struct dentry *dentry); + int (*sb_mount)(const char *dev_name, struct path *path, + const char *type, unsigned long flags, void *data); + int (*sb_umount)(struct vfsmount *mnt, int flags); + int (*sb_pivotroot)(struct path *old_path, struct path *new_path); + int (*sb_set_mnt_opts)(struct super_block *sb, + struct security_mnt_opts *opts, + unsigned long kern_flags, + unsigned long *set_kern_flags); + int (*sb_clone_mnt_opts)(const struct super_block *oldsb, + struct super_block *newsb); + int (*sb_parse_opts_str)(char *options, struct security_mnt_opts *opts); + int (*dentry_init_security)(struct dentry *dentry, int mode, + struct qstr *name, void **ctx, + u32 *ctxlen); + + +#ifdef CONFIG_SECURITY_PATH + int (*path_unlink)(struct path *dir, struct dentry *dentry); + int (*path_mkdir)(struct path *dir, struct dentry *dentry, + umode_t mode); + int (*path_rmdir)(struct path *dir, struct dentry *dentry); + int (*path_mknod)(struct path *dir, struct dentry *dentry, + umode_t mode, unsigned int dev); + int (*path_truncate)(struct path *path); + int (*path_symlink)(struct path *dir, struct dentry *dentry, + const char *old_name); + int (*path_link)(struct dentry *old_dentry, struct path *new_dir, + struct dentry *new_dentry); + int (*path_rename)(struct path *old_dir, struct dentry *old_dentry, + struct path *new_dir, + struct dentry *new_dentry); + int (*path_chmod)(struct path *path, umode_t mode); + int (*path_chown)(struct path *path, kuid_t uid, kgid_t gid); + int (*path_chroot)(struct path *path); +#endif + + int (*inode_alloc_security)(struct inode *inode); + void (*inode_free_security)(struct inode *inode); + int (*inode_init_security)(struct inode *inode, struct inode *dir, + const struct qstr *qstr, + const char **name, void **value, + size_t *len); + int (*inode_create)(struct inode *dir, struct dentry *dentry, + umode_t mode); + int (*inode_link)(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry); + int (*inode_unlink)(struct inode *dir, struct dentry *dentry); + int (*inode_symlink)(struct inode *dir, struct dentry *dentry, + const char *old_name); + int (*inode_mkdir)(struct inode *dir, struct dentry *dentry, + umode_t mode); + int (*inode_rmdir)(struct inode *dir, struct dentry *dentry); + int (*inode_mknod)(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t dev); + int (*inode_rename)(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry); + int (*inode_readlink)(struct dentry *dentry); + int (*inode_follow_link)(struct dentry *dentry, struct nameidata *nd); + int (*inode_permission)(struct inode *inode, int mask); + int (*inode_setattr)(struct dentry *dentry, struct iattr *attr); + int (*inode_getattr)(const struct path *path); + int (*inode_setxattr)(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); + void (*inode_post_setxattr)(struct dentry *dentry, const char *name, + const void *value, size_t size, + int flags); + int (*inode_getxattr)(struct dentry *dentry, const char *name); + int (*inode_listxattr)(struct dentry *dentry); + int (*inode_removexattr)(struct dentry *dentry, const char *name); + int (*inode_need_killpriv)(struct dentry *dentry); + int (*inode_killpriv)(struct dentry *dentry); + int (*inode_getsecurity)(const struct inode *inode, const char *name, + void **buffer, bool alloc); + int (*inode_setsecurity)(struct inode *inode, const char *name, + const void *value, size_t size, + int flags); + int (*inode_listsecurity)(struct inode *inode, char *buffer, + size_t buffer_size); + void (*inode_getsecid)(const struct inode *inode, u32 *secid); + + int (*file_permission)(struct file *file, int mask); + int (*file_alloc_security)(struct file *file); + void (*file_free_security)(struct file *file); + int (*file_ioctl)(struct file *file, unsigned int cmd, + unsigned long arg); + int (*mmap_addr)(unsigned long addr); + int (*mmap_file)(struct file *file, unsigned long reqprot, + unsigned long prot, unsigned long flags); + int (*file_mprotect)(struct vm_area_struct *vma, unsigned long reqprot, + unsigned long prot); + int (*file_lock)(struct file *file, unsigned int cmd); + int (*file_fcntl)(struct file *file, unsigned int cmd, + unsigned long arg); + void (*file_set_fowner)(struct file *file); + int (*file_send_sigiotask)(struct task_struct *tsk, + struct fown_struct *fown, int sig); + int (*file_receive)(struct file *file); + int (*file_open)(struct file *file, const struct cred *cred); + + int (*task_create)(unsigned long clone_flags); + void (*task_free)(struct task_struct *task); + int (*cred_alloc_blank)(struct cred *cred, gfp_t gfp); + void (*cred_free)(struct cred *cred); + int (*cred_prepare)(struct cred *new, const struct cred *old, + gfp_t gfp); + void (*cred_transfer)(struct cred *new, const struct cred *old); + int (*kernel_act_as)(struct cred *new, u32 secid); + int (*kernel_create_files_as)(struct cred *new, struct inode *inode); + int (*kernel_fw_from_file)(struct file *file, char *buf, size_t size); + int (*kernel_module_request)(char *kmod_name); + int (*kernel_module_from_file)(struct file *file); + int (*task_fix_setuid)(struct cred *new, const struct cred *old, + int flags); + int (*task_setpgid)(struct task_struct *p, pid_t pgid); + int (*task_getpgid)(struct task_struct *p); + int (*task_getsid)(struct task_struct *p); + void (*task_getsecid)(struct task_struct *p, u32 *secid); + int (*task_setnice)(struct task_struct *p, int nice); + int (*task_setioprio)(struct task_struct *p, int ioprio); + int (*task_getioprio)(struct task_struct *p); + int (*task_setrlimit)(struct task_struct *p, unsigned int resource, + struct rlimit *new_rlim); + int (*task_setscheduler)(struct task_struct *p); + int (*task_getscheduler)(struct task_struct *p); + int (*task_movememory)(struct task_struct *p); + int (*task_kill)(struct task_struct *p, struct siginfo *info, + int sig, u32 secid); + int (*task_wait)(struct task_struct *p); + int (*task_prctl)(int option, unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5); + void (*task_to_inode)(struct task_struct *p, struct inode *inode); + + int (*ipc_permission)(struct kern_ipc_perm *ipcp, short flag); + void (*ipc_getsecid)(struct kern_ipc_perm *ipcp, u32 *secid); + + int (*msg_msg_alloc_security)(struct msg_msg *msg); + void (*msg_msg_free_security)(struct msg_msg *msg); + + int (*msg_queue_alloc_security)(struct msg_queue *msq); + void (*msg_queue_free_security)(struct msg_queue *msq); + int (*msg_queue_associate)(struct msg_queue *msq, int msqflg); + int (*msg_queue_msgctl)(struct msg_queue *msq, int cmd); + int (*msg_queue_msgsnd)(struct msg_queue *msq, struct msg_msg *msg, + int msqflg); + int (*msg_queue_msgrcv)(struct msg_queue *msq, struct msg_msg *msg, + struct task_struct *target, long type, + int mode); + + int (*shm_alloc_security)(struct shmid_kernel *shp); + void (*shm_free_security)(struct shmid_kernel *shp); + int (*shm_associate)(struct shmid_kernel *shp, int shmflg); + int (*shm_shmctl)(struct shmid_kernel *shp, int cmd); + int (*shm_shmat)(struct shmid_kernel *shp, char __user *shmaddr, + int shmflg); + + int (*sem_alloc_security)(struct sem_array *sma); + void (*sem_free_security)(struct sem_array *sma); + int (*sem_associate)(struct sem_array *sma, int semflg); + int (*sem_semctl)(struct sem_array *sma, int cmd); + int (*sem_semop)(struct sem_array *sma, struct sembuf *sops, + unsigned nsops, int alter); + + int (*netlink_send)(struct sock *sk, struct sk_buff *skb); + + void (*d_instantiate)(struct dentry *dentry, struct inode *inode); + + int (*getprocattr)(struct task_struct *p, char *name, char **value); + int (*setprocattr)(struct task_struct *p, char *name, void *value, + size_t size); + int (*ismaclabel)(const char *name); + int (*secid_to_secctx)(u32 secid, char **secdata, u32 *seclen); + int (*secctx_to_secid)(const char *secdata, u32 seclen, u32 *secid); + void (*release_secctx)(char *secdata, u32 seclen); + + int (*inode_notifysecctx)(struct inode *inode, void *ctx, u32 ctxlen); + int (*inode_setsecctx)(struct dentry *dentry, void *ctx, u32 ctxlen); + int (*inode_getsecctx)(struct inode *inode, void **ctx, u32 *ctxlen); + +#ifdef CONFIG_SECURITY_NETWORK + int (*unix_stream_connect)(struct sock *sock, struct sock *other, + struct sock *newsk); + int (*unix_may_send)(struct socket *sock, struct socket *other); + + int (*socket_create)(int family, int type, int protocol, int kern); + int (*socket_post_create)(struct socket *sock, int family, int type, + int protocol, int kern); + int (*socket_bind)(struct socket *sock, struct sockaddr *address, + int addrlen); + int (*socket_connect)(struct socket *sock, struct sockaddr *address, + int addrlen); + int (*socket_listen)(struct socket *sock, int backlog); + int (*socket_accept)(struct socket *sock, struct socket *newsock); + int (*socket_sendmsg)(struct socket *sock, struct msghdr *msg, + int size); + int (*socket_recvmsg)(struct socket *sock, struct msghdr *msg, + int size, int flags); + int (*socket_getsockname)(struct socket *sock); + int (*socket_getpeername)(struct socket *sock); + int (*socket_getsockopt)(struct socket *sock, int level, int optname); + int (*socket_setsockopt)(struct socket *sock, int level, int optname); + int (*socket_shutdown)(struct socket *sock, int how); + int (*socket_sock_rcv_skb)(struct sock *sk, struct sk_buff *skb); + int (*socket_getpeersec_stream)(struct socket *sock, + char __user *optval, + int __user *optlen, unsigned len); + int (*socket_getpeersec_dgram)(struct socket *sock, + struct sk_buff *skb, u32 *secid); + int (*sk_alloc_security)(struct sock *sk, int family, gfp_t priority); + void (*sk_free_security)(struct sock *sk); + void (*sk_clone_security)(const struct sock *sk, struct sock *newsk); + void (*sk_getsecid)(struct sock *sk, u32 *secid); + void (*sock_graft)(struct sock *sk, struct socket *parent); + int (*inet_conn_request)(struct sock *sk, struct sk_buff *skb, + struct request_sock *req); + void (*inet_csk_clone)(struct sock *newsk, + const struct request_sock *req); + void (*inet_conn_established)(struct sock *sk, struct sk_buff *skb); + int (*secmark_relabel_packet)(u32 secid); + void (*secmark_refcount_inc)(void); + void (*secmark_refcount_dec)(void); + void (*req_classify_flow)(const struct request_sock *req, + struct flowi *fl); + int (*tun_dev_alloc_security)(void **security); + void (*tun_dev_free_security)(void *security); + int (*tun_dev_create)(void); + int (*tun_dev_attach_queue)(void *security); + int (*tun_dev_attach)(struct sock *sk, void *security); + int (*tun_dev_open)(void *security); +#endif /* CONFIG_SECURITY_NETWORK */ + +#ifdef CONFIG_SECURITY_NETWORK_XFRM + int (*xfrm_policy_alloc_security)(struct xfrm_sec_ctx **ctxp, + struct xfrm_user_sec_ctx *sec_ctx, + gfp_t gfp); + int (*xfrm_policy_clone_security)(struct xfrm_sec_ctx *old_ctx, + struct xfrm_sec_ctx **new_ctx); + void (*xfrm_policy_free_security)(struct xfrm_sec_ctx *ctx); + int (*xfrm_policy_delete_security)(struct xfrm_sec_ctx *ctx); + int (*xfrm_state_alloc)(struct xfrm_state *x, + struct xfrm_user_sec_ctx *sec_ctx); + int (*xfrm_state_alloc_acquire)(struct xfrm_state *x, + struct xfrm_sec_ctx *polsec, + u32 secid); + void (*xfrm_state_free_security)(struct xfrm_state *x); + int (*xfrm_state_delete_security)(struct xfrm_state *x); + int (*xfrm_policy_lookup)(struct xfrm_sec_ctx *ctx, u32 fl_secid, + u8 dir); + int (*xfrm_state_pol_flow_match)(struct xfrm_state *x, + struct xfrm_policy *xp, + const struct flowi *fl); + int (*xfrm_decode_session)(struct sk_buff *skb, u32 *secid, int ckall); +#endif /* CONFIG_SECURITY_NETWORK_XFRM */ + + /* key management security hooks */ +#ifdef CONFIG_KEYS + int (*key_alloc)(struct key *key, const struct cred *cred, + unsigned long flags); + void (*key_free)(struct key *key); + int (*key_permission)(key_ref_t key_ref, const struct cred *cred, + unsigned perm); + int (*key_getsecurity)(struct key *key, char **_buffer); +#endif /* CONFIG_KEYS */ + +#ifdef CONFIG_AUDIT + int (*audit_rule_init)(u32 field, u32 op, char *rulestr, + void **lsmrule); + int (*audit_rule_known)(struct audit_krule *krule); + int (*audit_rule_match)(u32 secid, u32 field, u32 op, void *lsmrule, + struct audit_context *actx); + void (*audit_rule_free)(void *lsmrule); +#endif /* CONFIG_AUDIT */ +}; + +/* prototypes */ +extern int security_module_enable(struct security_operations *ops); +extern int register_security(struct security_operations *ops); +extern void __init security_fixup_ops(struct security_operations *ops); +extern void reset_security_ops(void); + +#endif /* CONFIG_SECURITY */ + +#endif /* ! __LINUX_LSM_HOOKS_H */ diff --git a/include/linux/security.h b/include/linux/security.h index 18264ea9e314..f3d42c636f27 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -116,8 +116,6 @@ struct seq_file; extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb); -void reset_security_ops(void); - #ifdef CONFIG_MMU extern unsigned long mmap_min_addr; extern unsigned long dac_mmap_min_addr; @@ -1457,312 +1455,9 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @ctxlen points to the place to put the length of @ctx. * This is the main security structure. */ -struct security_operations { - char name[SECURITY_NAME_MAX + 1]; - - int (*binder_set_context_mgr) (struct task_struct *mgr); - int (*binder_transaction) (struct task_struct *from, - struct task_struct *to); - int (*binder_transfer_binder) (struct task_struct *from, - struct task_struct *to); - int (*binder_transfer_file) (struct task_struct *from, - struct task_struct *to, struct file *file); - - int (*ptrace_access_check) (struct task_struct *child, unsigned int mode); - int (*ptrace_traceme) (struct task_struct *parent); - int (*capget) (struct task_struct *target, - kernel_cap_t *effective, - kernel_cap_t *inheritable, kernel_cap_t *permitted); - int (*capset) (struct cred *new, - const struct cred *old, - const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted); - int (*capable) (const struct cred *cred, struct user_namespace *ns, - int cap, int audit); - int (*quotactl) (int cmds, int type, int id, struct super_block *sb); - int (*quota_on) (struct dentry *dentry); - int (*syslog) (int type); - int (*settime) (const struct timespec *ts, const struct timezone *tz); - int (*vm_enough_memory) (struct mm_struct *mm, long pages); - - int (*bprm_set_creds) (struct linux_binprm *bprm); - int (*bprm_check_security) (struct linux_binprm *bprm); - int (*bprm_secureexec) (struct linux_binprm *bprm); - void (*bprm_committing_creds) (struct linux_binprm *bprm); - void (*bprm_committed_creds) (struct linux_binprm *bprm); - - int (*sb_alloc_security) (struct super_block *sb); - void (*sb_free_security) (struct super_block *sb); - int (*sb_copy_data) (char *orig, char *copy); - int (*sb_remount) (struct super_block *sb, void *data); - int (*sb_kern_mount) (struct super_block *sb, int flags, void *data); - int (*sb_show_options) (struct seq_file *m, struct super_block *sb); - int (*sb_statfs) (struct dentry *dentry); - int (*sb_mount) (const char *dev_name, struct path *path, - const char *type, unsigned long flags, void *data); - int (*sb_umount) (struct vfsmount *mnt, int flags); - int (*sb_pivotroot) (struct path *old_path, - struct path *new_path); - int (*sb_set_mnt_opts) (struct super_block *sb, - struct security_mnt_opts *opts, - unsigned long kern_flags, - unsigned long *set_kern_flags); - int (*sb_clone_mnt_opts) (const struct super_block *oldsb, - struct super_block *newsb); - int (*sb_parse_opts_str) (char *options, struct security_mnt_opts *opts); - int (*dentry_init_security) (struct dentry *dentry, int mode, - struct qstr *name, void **ctx, - u32 *ctxlen); - - -#ifdef CONFIG_SECURITY_PATH - int (*path_unlink) (struct path *dir, struct dentry *dentry); - int (*path_mkdir) (struct path *dir, struct dentry *dentry, umode_t mode); - int (*path_rmdir) (struct path *dir, struct dentry *dentry); - int (*path_mknod) (struct path *dir, struct dentry *dentry, umode_t mode, - unsigned int dev); - int (*path_truncate) (struct path *path); - int (*path_symlink) (struct path *dir, struct dentry *dentry, - const char *old_name); - int (*path_link) (struct dentry *old_dentry, struct path *new_dir, - struct dentry *new_dentry); - int (*path_rename) (struct path *old_dir, struct dentry *old_dentry, - struct path *new_dir, struct dentry *new_dentry); - int (*path_chmod) (struct path *path, umode_t mode); - int (*path_chown) (struct path *path, kuid_t uid, kgid_t gid); - int (*path_chroot) (struct path *path); -#endif - - int (*inode_alloc_security) (struct inode *inode); - void (*inode_free_security) (struct inode *inode); - int (*inode_init_security) (struct inode *inode, struct inode *dir, - const struct qstr *qstr, const char **name, - void **value, size_t *len); - int (*inode_create) (struct inode *dir, - struct dentry *dentry, umode_t mode); - int (*inode_link) (struct dentry *old_dentry, - struct inode *dir, struct dentry *new_dentry); - int (*inode_unlink) (struct inode *dir, struct dentry *dentry); - int (*inode_symlink) (struct inode *dir, - struct dentry *dentry, const char *old_name); - int (*inode_mkdir) (struct inode *dir, struct dentry *dentry, umode_t mode); - int (*inode_rmdir) (struct inode *dir, struct dentry *dentry); - int (*inode_mknod) (struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t dev); - int (*inode_rename) (struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry); - int (*inode_readlink) (struct dentry *dentry); - int (*inode_follow_link) (struct dentry *dentry, struct nameidata *nd); - int (*inode_permission) (struct inode *inode, int mask); - int (*inode_setattr) (struct dentry *dentry, struct iattr *attr); - int (*inode_getattr) (const struct path *path); - int (*inode_setxattr) (struct dentry *dentry, const char *name, - const void *value, size_t size, int flags); - void (*inode_post_setxattr) (struct dentry *dentry, const char *name, - const void *value, size_t size, int flags); - int (*inode_getxattr) (struct dentry *dentry, const char *name); - int (*inode_listxattr) (struct dentry *dentry); - int (*inode_removexattr) (struct dentry *dentry, const char *name); - int (*inode_need_killpriv) (struct dentry *dentry); - int (*inode_killpriv) (struct dentry *dentry); - int (*inode_getsecurity) (const struct inode *inode, const char *name, void **buffer, bool alloc); - int (*inode_setsecurity) (struct inode *inode, const char *name, const void *value, size_t size, int flags); - int (*inode_listsecurity) (struct inode *inode, char *buffer, size_t buffer_size); - void (*inode_getsecid) (const struct inode *inode, u32 *secid); - - int (*file_permission) (struct file *file, int mask); - int (*file_alloc_security) (struct file *file); - void (*file_free_security) (struct file *file); - int (*file_ioctl) (struct file *file, unsigned int cmd, - unsigned long arg); - int (*mmap_addr) (unsigned long addr); - int (*mmap_file) (struct file *file, - unsigned long reqprot, unsigned long prot, - unsigned long flags); - int (*file_mprotect) (struct vm_area_struct *vma, - unsigned long reqprot, - unsigned long prot); - int (*file_lock) (struct file *file, unsigned int cmd); - int (*file_fcntl) (struct file *file, unsigned int cmd, - unsigned long arg); - void (*file_set_fowner) (struct file *file); - int (*file_send_sigiotask) (struct task_struct *tsk, - struct fown_struct *fown, int sig); - int (*file_receive) (struct file *file); - int (*file_open) (struct file *file, const struct cred *cred); - - int (*task_create) (unsigned long clone_flags); - void (*task_free) (struct task_struct *task); - int (*cred_alloc_blank) (struct cred *cred, gfp_t gfp); - void (*cred_free) (struct cred *cred); - int (*cred_prepare)(struct cred *new, const struct cred *old, - gfp_t gfp); - void (*cred_transfer)(struct cred *new, const struct cred *old); - int (*kernel_act_as)(struct cred *new, u32 secid); - int (*kernel_create_files_as)(struct cred *new, struct inode *inode); - int (*kernel_fw_from_file)(struct file *file, char *buf, size_t size); - int (*kernel_module_request)(char *kmod_name); - int (*kernel_module_from_file)(struct file *file); - int (*task_fix_setuid) (struct cred *new, const struct cred *old, - int flags); - int (*task_setpgid) (struct task_struct *p, pid_t pgid); - int (*task_getpgid) (struct task_struct *p); - int (*task_getsid) (struct task_struct *p); - void (*task_getsecid) (struct task_struct *p, u32 *secid); - int (*task_setnice) (struct task_struct *p, int nice); - int (*task_setioprio) (struct task_struct *p, int ioprio); - int (*task_getioprio) (struct task_struct *p); - int (*task_setrlimit) (struct task_struct *p, unsigned int resource, - struct rlimit *new_rlim); - int (*task_setscheduler) (struct task_struct *p); - int (*task_getscheduler) (struct task_struct *p); - int (*task_movememory) (struct task_struct *p); - int (*task_kill) (struct task_struct *p, - struct siginfo *info, int sig, u32 secid); - int (*task_wait) (struct task_struct *p); - int (*task_prctl) (int option, unsigned long arg2, - unsigned long arg3, unsigned long arg4, - unsigned long arg5); - void (*task_to_inode) (struct task_struct *p, struct inode *inode); - - int (*ipc_permission) (struct kern_ipc_perm *ipcp, short flag); - void (*ipc_getsecid) (struct kern_ipc_perm *ipcp, u32 *secid); - - int (*msg_msg_alloc_security) (struct msg_msg *msg); - void (*msg_msg_free_security) (struct msg_msg *msg); - - int (*msg_queue_alloc_security) (struct msg_queue *msq); - void (*msg_queue_free_security) (struct msg_queue *msq); - int (*msg_queue_associate) (struct msg_queue *msq, int msqflg); - int (*msg_queue_msgctl) (struct msg_queue *msq, int cmd); - int (*msg_queue_msgsnd) (struct msg_queue *msq, - struct msg_msg *msg, int msqflg); - int (*msg_queue_msgrcv) (struct msg_queue *msq, - struct msg_msg *msg, - struct task_struct *target, - long type, int mode); - - int (*shm_alloc_security) (struct shmid_kernel *shp); - void (*shm_free_security) (struct shmid_kernel *shp); - int (*shm_associate) (struct shmid_kernel *shp, int shmflg); - int (*shm_shmctl) (struct shmid_kernel *shp, int cmd); - int (*shm_shmat) (struct shmid_kernel *shp, - char __user *shmaddr, int shmflg); - - int (*sem_alloc_security) (struct sem_array *sma); - void (*sem_free_security) (struct sem_array *sma); - int (*sem_associate) (struct sem_array *sma, int semflg); - int (*sem_semctl) (struct sem_array *sma, int cmd); - int (*sem_semop) (struct sem_array *sma, - struct sembuf *sops, unsigned nsops, int alter); - - int (*netlink_send) (struct sock *sk, struct sk_buff *skb); - - void (*d_instantiate) (struct dentry *dentry, struct inode *inode); - - int (*getprocattr) (struct task_struct *p, char *name, char **value); - int (*setprocattr) (struct task_struct *p, char *name, void *value, size_t size); - int (*ismaclabel) (const char *name); - int (*secid_to_secctx) (u32 secid, char **secdata, u32 *seclen); - int (*secctx_to_secid) (const char *secdata, u32 seclen, u32 *secid); - void (*release_secctx) (char *secdata, u32 seclen); - - int (*inode_notifysecctx)(struct inode *inode, void *ctx, u32 ctxlen); - int (*inode_setsecctx)(struct dentry *dentry, void *ctx, u32 ctxlen); - int (*inode_getsecctx)(struct inode *inode, void **ctx, u32 *ctxlen); - -#ifdef CONFIG_SECURITY_NETWORK - int (*unix_stream_connect) (struct sock *sock, struct sock *other, struct sock *newsk); - int (*unix_may_send) (struct socket *sock, struct socket *other); - - int (*socket_create) (int family, int type, int protocol, int kern); - int (*socket_post_create) (struct socket *sock, int family, - int type, int protocol, int kern); - int (*socket_bind) (struct socket *sock, - struct sockaddr *address, int addrlen); - int (*socket_connect) (struct socket *sock, - struct sockaddr *address, int addrlen); - int (*socket_listen) (struct socket *sock, int backlog); - int (*socket_accept) (struct socket *sock, struct socket *newsock); - int (*socket_sendmsg) (struct socket *sock, - struct msghdr *msg, int size); - int (*socket_recvmsg) (struct socket *sock, - struct msghdr *msg, int size, int flags); - int (*socket_getsockname) (struct socket *sock); - int (*socket_getpeername) (struct socket *sock); - int (*socket_getsockopt) (struct socket *sock, int level, int optname); - int (*socket_setsockopt) (struct socket *sock, int level, int optname); - int (*socket_shutdown) (struct socket *sock, int how); - int (*socket_sock_rcv_skb) (struct sock *sk, struct sk_buff *skb); - int (*socket_getpeersec_stream) (struct socket *sock, char __user *optval, int __user *optlen, unsigned len); - int (*socket_getpeersec_dgram) (struct socket *sock, struct sk_buff *skb, u32 *secid); - int (*sk_alloc_security) (struct sock *sk, int family, gfp_t priority); - void (*sk_free_security) (struct sock *sk); - void (*sk_clone_security) (const struct sock *sk, struct sock *newsk); - void (*sk_getsecid) (struct sock *sk, u32 *secid); - void (*sock_graft) (struct sock *sk, struct socket *parent); - int (*inet_conn_request) (struct sock *sk, struct sk_buff *skb, - struct request_sock *req); - void (*inet_csk_clone) (struct sock *newsk, const struct request_sock *req); - void (*inet_conn_established) (struct sock *sk, struct sk_buff *skb); - int (*secmark_relabel_packet) (u32 secid); - void (*secmark_refcount_inc) (void); - void (*secmark_refcount_dec) (void); - void (*req_classify_flow) (const struct request_sock *req, struct flowi *fl); - int (*tun_dev_alloc_security) (void **security); - void (*tun_dev_free_security) (void *security); - int (*tun_dev_create) (void); - int (*tun_dev_attach_queue) (void *security); - int (*tun_dev_attach) (struct sock *sk, void *security); - int (*tun_dev_open) (void *security); -#endif /* CONFIG_SECURITY_NETWORK */ - -#ifdef CONFIG_SECURITY_NETWORK_XFRM - int (*xfrm_policy_alloc_security) (struct xfrm_sec_ctx **ctxp, - struct xfrm_user_sec_ctx *sec_ctx, gfp_t gfp); - int (*xfrm_policy_clone_security) (struct xfrm_sec_ctx *old_ctx, struct xfrm_sec_ctx **new_ctx); - void (*xfrm_policy_free_security) (struct xfrm_sec_ctx *ctx); - int (*xfrm_policy_delete_security) (struct xfrm_sec_ctx *ctx); - int (*xfrm_state_alloc) (struct xfrm_state *x, - struct xfrm_user_sec_ctx *sec_ctx); - int (*xfrm_state_alloc_acquire) (struct xfrm_state *x, - struct xfrm_sec_ctx *polsec, - u32 secid); - void (*xfrm_state_free_security) (struct xfrm_state *x); - int (*xfrm_state_delete_security) (struct xfrm_state *x); - int (*xfrm_policy_lookup) (struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir); - int (*xfrm_state_pol_flow_match) (struct xfrm_state *x, - struct xfrm_policy *xp, - const struct flowi *fl); - int (*xfrm_decode_session) (struct sk_buff *skb, u32 *secid, int ckall); -#endif /* CONFIG_SECURITY_NETWORK_XFRM */ - - /* key management security hooks */ -#ifdef CONFIG_KEYS - int (*key_alloc) (struct key *key, const struct cred *cred, unsigned long flags); - void (*key_free) (struct key *key); - int (*key_permission) (key_ref_t key_ref, - const struct cred *cred, - unsigned perm); - int (*key_getsecurity)(struct key *key, char **_buffer); -#endif /* CONFIG_KEYS */ - -#ifdef CONFIG_AUDIT - int (*audit_rule_init) (u32 field, u32 op, char *rulestr, void **lsmrule); - int (*audit_rule_known) (struct audit_krule *krule); - int (*audit_rule_match) (u32 secid, u32 field, u32 op, void *lsmrule, - struct audit_context *actx); - void (*audit_rule_free) (void *lsmrule); -#endif /* CONFIG_AUDIT */ -}; /* prototypes */ extern int security_init(void); -extern int security_module_enable(struct security_operations *ops); -extern int register_security(struct security_operations *ops); -extern void __init security_fixup_ops(struct security_operations *ops); - /* Security operations */ int security_binder_set_context_mgr(struct task_struct *mgr); -- cgit v1.2.3 From fe7bb272ee72b5cc377e02b556d0d718d12bbede Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 2 May 2015 15:10:53 -0700 Subject: LSM: Add the comment to lsm_hooks.h Add the large comment describing the content of the security_operations structure to lsm_hooks.h. This wasn't done in the previous (1/7) patch because it would have exceeded the mail list size limits. Signed-off-by: Casey Schaufler Acked-by: John Johansen Acked-by: Kees Cook Acked-by: Paul Moore Acked-by: Stephen Smalley Acked-by: Tetsuo Handa Signed-off-by: James Morris --- include/linux/lsm_hooks.h | 1279 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1279 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index c60f81b2d18c..b4c91de510c2 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -31,6 +31,1285 @@ #ifdef CONFIG_SECURITY +/** + * struct security_operations - main security structure + * + * Security module identifier. + * + * @name: + * A string that acts as a unique identifier for the LSM with max number + * of characters = SECURITY_NAME_MAX. + * + * Security hooks for program execution operations. + * + * @bprm_set_creds: + * Save security information in the bprm->security field, typically based + * on information about the bprm->file, for later use by the apply_creds + * hook. This hook may also optionally check permissions (e.g. for + * transitions between security domains). + * This hook may be called multiple times during a single execve, e.g. for + * interpreters. The hook can tell whether it has already been called by + * checking to see if @bprm->security is non-NULL. If so, then the hook + * may decide either to retain the security information saved earlier or + * to replace it. + * @bprm contains the linux_binprm structure. + * Return 0 if the hook is successful and permission is granted. + * @bprm_check_security: + * This hook mediates the point when a search for a binary handler will + * begin. It allows a check the @bprm->security value which is set in the + * preceding set_creds call. The primary difference from set_creds is + * that the argv list and envp list are reliably available in @bprm. This + * hook may be called multiple times during a single execve; and in each + * pass set_creds is called first. + * @bprm contains the linux_binprm structure. + * Return 0 if the hook is successful and permission is granted. + * @bprm_committing_creds: + * Prepare to install the new security attributes of a process being + * transformed by an execve operation, based on the old credentials + * pointed to by @current->cred and the information set in @bprm->cred by + * the bprm_set_creds hook. @bprm points to the linux_binprm structure. + * This hook is a good place to perform state changes on the process such + * as closing open file descriptors to which access will no longer be + * granted when the attributes are changed. This is called immediately + * before commit_creds(). + * @bprm_committed_creds: + * Tidy up after the installation of the new security attributes of a + * process being transformed by an execve operation. The new credentials + * have, by this point, been set to @current->cred. @bprm points to the + * linux_binprm structure. This hook is a good place to perform state + * changes on the process such as clearing out non-inheritable signal + * state. This is called immediately after commit_creds(). + * @bprm_secureexec: + * Return a boolean value (0 or 1) indicating whether a "secure exec" + * is required. The flag is passed in the auxiliary table + * on the initial stack to the ELF interpreter to indicate whether libc + * should enable secure mode. + * @bprm contains the linux_binprm structure. + * + * Security hooks for filesystem operations. + * + * @sb_alloc_security: + * Allocate and attach a security structure to the sb->s_security field. + * The s_security field is initialized to NULL when the structure is + * allocated. + * @sb contains the super_block structure to be modified. + * Return 0 if operation was successful. + * @sb_free_security: + * Deallocate and clear the sb->s_security field. + * @sb contains the super_block structure to be modified. + * @sb_statfs: + * Check permission before obtaining filesystem statistics for the @mnt + * mountpoint. + * @dentry is a handle on the superblock for the filesystem. + * Return 0 if permission is granted. + * @sb_mount: + * Check permission before an object specified by @dev_name is mounted on + * the mount point named by @nd. For an ordinary mount, @dev_name + * identifies a device if the file system type requires a device. For a + * remount (@flags & MS_REMOUNT), @dev_name is irrelevant. For a + * loopback/bind mount (@flags & MS_BIND), @dev_name identifies the + * pathname of the object being mounted. + * @dev_name contains the name for object being mounted. + * @path contains the path for mount point object. + * @type contains the filesystem type. + * @flags contains the mount flags. + * @data contains the filesystem-specific data. + * Return 0 if permission is granted. + * @sb_copy_data: + * Allow mount option data to be copied prior to parsing by the filesystem, + * so that the security module can extract security-specific mount + * options cleanly (a filesystem may modify the data e.g. with strsep()). + * This also allows the original mount data to be stripped of security- + * specific options to avoid having to make filesystems aware of them. + * @type the type of filesystem being mounted. + * @orig the original mount data copied from userspace. + * @copy copied data which will be passed to the security module. + * Returns 0 if the copy was successful. + * @sb_remount: + * Extracts security system specific mount options and verifies no changes + * are being made to those options. + * @sb superblock being remounted + * @data contains the filesystem-specific data. + * Return 0 if permission is granted. + * @sb_umount: + * Check permission before the @mnt file system is unmounted. + * @mnt contains the mounted file system. + * @flags contains the unmount flags, e.g. MNT_FORCE. + * Return 0 if permission is granted. + * @sb_pivotroot: + * Check permission before pivoting the root filesystem. + * @old_path contains the path for the new location of the + * current root (put_old). + * @new_path contains the path for the new root (new_root). + * Return 0 if permission is granted. + * @sb_set_mnt_opts: + * Set the security relevant mount options used for a superblock + * @sb the superblock to set security mount options for + * @opts binary data structure containing all lsm mount data + * @sb_clone_mnt_opts: + * Copy all security options from a given superblock to another + * @oldsb old superblock which contain information to clone + * @newsb new superblock which needs filled in + * @sb_parse_opts_str: + * Parse a string of security data filling in the opts structure + * @options string containing all mount options known by the LSM + * @opts binary data structure usable by the LSM + * @dentry_init_security: + * Compute a context for a dentry as the inode is not yet available + * since NFSv4 has no label backed by an EA anyway. + * @dentry dentry to use in calculating the context. + * @mode mode used to determine resource type. + * @name name of the last path component used to create file + * @ctx pointer to place the pointer to the resulting context in. + * @ctxlen point to place the length of the resulting context. + * + * + * Security hooks for inode operations. + * + * @inode_alloc_security: + * Allocate and attach a security structure to @inode->i_security. The + * i_security field is initialized to NULL when the inode structure is + * allocated. + * @inode contains the inode structure. + * Return 0 if operation was successful. + * @inode_free_security: + * @inode contains the inode structure. + * Deallocate the inode security structure and set @inode->i_security to + * NULL. + * @inode_init_security: + * Obtain the security attribute name suffix and value to set on a newly + * created inode and set up the incore security field for the new inode. + * This hook is called by the fs code as part of the inode creation + * transaction and provides for atomic labeling of the inode, unlike + * the post_create/mkdir/... hooks called by the VFS. The hook function + * is expected to allocate the name and value via kmalloc, with the caller + * being responsible for calling kfree after using them. + * If the security module does not use security attributes or does + * not wish to put a security attribute on this particular inode, + * then it should return -EOPNOTSUPP to skip this processing. + * @inode contains the inode structure of the newly created inode. + * @dir contains the inode structure of the parent directory. + * @qstr contains the last path component of the new object + * @name will be set to the allocated name suffix (e.g. selinux). + * @value will be set to the allocated attribute value. + * @len will be set to the length of the value. + * Returns 0 if @name and @value have been successfully set, + * -EOPNOTSUPP if no security attribute is needed, or + * -ENOMEM on memory allocation failure. + * @inode_create: + * Check permission to create a regular file. + * @dir contains inode structure of the parent of the new file. + * @dentry contains the dentry structure for the file to be created. + * @mode contains the file mode of the file to be created. + * Return 0 if permission is granted. + * @inode_link: + * Check permission before creating a new hard link to a file. + * @old_dentry contains the dentry structure for an existing + * link to the file. + * @dir contains the inode structure of the parent directory + * of the new link. + * @new_dentry contains the dentry structure for the new link. + * Return 0 if permission is granted. + * @path_link: + * Check permission before creating a new hard link to a file. + * @old_dentry contains the dentry structure for an existing link + * to the file. + * @new_dir contains the path structure of the parent directory of + * the new link. + * @new_dentry contains the dentry structure for the new link. + * Return 0 if permission is granted. + * @inode_unlink: + * Check the permission to remove a hard link to a file. + * @dir contains the inode structure of parent directory of the file. + * @dentry contains the dentry structure for file to be unlinked. + * Return 0 if permission is granted. + * @path_unlink: + * Check the permission to remove a hard link to a file. + * @dir contains the path structure of parent directory of the file. + * @dentry contains the dentry structure for file to be unlinked. + * Return 0 if permission is granted. + * @inode_symlink: + * Check the permission to create a symbolic link to a file. + * @dir contains the inode structure of parent directory of + * the symbolic link. + * @dentry contains the dentry structure of the symbolic link. + * @old_name contains the pathname of file. + * Return 0 if permission is granted. + * @path_symlink: + * Check the permission to create a symbolic link to a file. + * @dir contains the path structure of parent directory of + * the symbolic link. + * @dentry contains the dentry structure of the symbolic link. + * @old_name contains the pathname of file. + * Return 0 if permission is granted. + * @inode_mkdir: + * Check permissions to create a new directory in the existing directory + * associated with inode structure @dir. + * @dir contains the inode structure of parent of the directory + * to be created. + * @dentry contains the dentry structure of new directory. + * @mode contains the mode of new directory. + * Return 0 if permission is granted. + * @path_mkdir: + * Check permissions to create a new directory in the existing directory + * associated with path structure @path. + * @dir contains the path structure of parent of the directory + * to be created. + * @dentry contains the dentry structure of new directory. + * @mode contains the mode of new directory. + * Return 0 if permission is granted. + * @inode_rmdir: + * Check the permission to remove a directory. + * @dir contains the inode structure of parent of the directory + * to be removed. + * @dentry contains the dentry structure of directory to be removed. + * Return 0 if permission is granted. + * @path_rmdir: + * Check the permission to remove a directory. + * @dir contains the path structure of parent of the directory to be + * removed. + * @dentry contains the dentry structure of directory to be removed. + * Return 0 if permission is granted. + * @inode_mknod: + * Check permissions when creating a special file (or a socket or a fifo + * file created via the mknod system call). Note that if mknod operation + * is being done for a regular file, then the create hook will be called + * and not this hook. + * @dir contains the inode structure of parent of the new file. + * @dentry contains the dentry structure of the new file. + * @mode contains the mode of the new file. + * @dev contains the device number. + * Return 0 if permission is granted. + * @path_mknod: + * Check permissions when creating a file. Note that this hook is called + * even if mknod operation is being done for a regular file. + * @dir contains the path structure of parent of the new file. + * @dentry contains the dentry structure of the new file. + * @mode contains the mode of the new file. + * @dev contains the undecoded device number. Use new_decode_dev() to get + * the decoded device number. + * Return 0 if permission is granted. + * @inode_rename: + * Check for permission to rename a file or directory. + * @old_dir contains the inode structure for parent of the old link. + * @old_dentry contains the dentry structure of the old link. + * @new_dir contains the inode structure for parent of the new link. + * @new_dentry contains the dentry structure of the new link. + * Return 0 if permission is granted. + * @path_rename: + * Check for permission to rename a file or directory. + * @old_dir contains the path structure for parent of the old link. + * @old_dentry contains the dentry structure of the old link. + * @new_dir contains the path structure for parent of the new link. + * @new_dentry contains the dentry structure of the new link. + * Return 0 if permission is granted. + * @path_chmod: + * Check for permission to change DAC's permission of a file or directory. + * @dentry contains the dentry structure. + * @mnt contains the vfsmnt structure. + * @mode contains DAC's mode. + * Return 0 if permission is granted. + * @path_chown: + * Check for permission to change owner/group of a file or directory. + * @path contains the path structure. + * @uid contains new owner's ID. + * @gid contains new group's ID. + * Return 0 if permission is granted. + * @path_chroot: + * Check for permission to change root directory. + * @path contains the path structure. + * Return 0 if permission is granted. + * @inode_readlink: + * Check the permission to read the symbolic link. + * @dentry contains the dentry structure for the file link. + * Return 0 if permission is granted. + * @inode_follow_link: + * Check permission to follow a symbolic link when looking up a pathname. + * @dentry contains the dentry structure for the link. + * @nd contains the nameidata structure for the parent directory. + * Return 0 if permission is granted. + * @inode_permission: + * Check permission before accessing an inode. This hook is called by the + * existing Linux permission function, so a security module can use it to + * provide additional checking for existing Linux permission checks. + * Notice that this hook is called when a file is opened (as well as many + * other operations), whereas the file_security_ops permission hook is + * called when the actual read/write operations are performed. + * @inode contains the inode structure to check. + * @mask contains the permission mask. + * Return 0 if permission is granted. + * @inode_setattr: + * Check permission before setting file attributes. Note that the kernel + * call to notify_change is performed from several locations, whenever + * file attributes change (such as when a file is truncated, chown/chmod + * operations, transferring disk quotas, etc). + * @dentry contains the dentry structure for the file. + * @attr is the iattr structure containing the new file attributes. + * Return 0 if permission is granted. + * @path_truncate: + * Check permission before truncating a file. + * @path contains the path structure for the file. + * Return 0 if permission is granted. + * @inode_getattr: + * Check permission before obtaining file attributes. + * @mnt is the vfsmount where the dentry was looked up + * @dentry contains the dentry structure for the file. + * Return 0 if permission is granted. + * @inode_setxattr: + * Check permission before setting the extended attributes + * @value identified by @name for @dentry. + * Return 0 if permission is granted. + * @inode_post_setxattr: + * Update inode security field after successful setxattr operation. + * @value identified by @name for @dentry. + * @inode_getxattr: + * Check permission before obtaining the extended attributes + * identified by @name for @dentry. + * Return 0 if permission is granted. + * @inode_listxattr: + * Check permission before obtaining the list of extended attribute + * names for @dentry. + * Return 0 if permission is granted. + * @inode_removexattr: + * Check permission before removing the extended attribute + * identified by @name for @dentry. + * Return 0 if permission is granted. + * @inode_getsecurity: + * Retrieve a copy of the extended attribute representation of the + * security label associated with @name for @inode via @buffer. Note that + * @name is the remainder of the attribute name after the security prefix + * has been removed. @alloc is used to specify of the call should return a + * value via the buffer or just the value length Return size of buffer on + * success. + * @inode_setsecurity: + * Set the security label associated with @name for @inode from the + * extended attribute value @value. @size indicates the size of the + * @value in bytes. @flags may be XATTR_CREATE, XATTR_REPLACE, or 0. + * Note that @name is the remainder of the attribute name after the + * security. prefix has been removed. + * Return 0 on success. + * @inode_listsecurity: + * Copy the extended attribute names for the security labels + * associated with @inode into @buffer. The maximum size of @buffer + * is specified by @buffer_size. @buffer may be NULL to request + * the size of the buffer required. + * Returns number of bytes used/required on success. + * @inode_need_killpriv: + * Called when an inode has been changed. + * @dentry is the dentry being changed. + * Return <0 on error to abort the inode change operation. + * Return 0 if inode_killpriv does not need to be called. + * Return >0 if inode_killpriv does need to be called. + * @inode_killpriv: + * The setuid bit is being removed. Remove similar security labels. + * Called with the dentry->d_inode->i_mutex held. + * @dentry is the dentry being changed. + * Return 0 on success. If error is returned, then the operation + * causing setuid bit removal is failed. + * @inode_getsecid: + * Get the secid associated with the node. + * @inode contains a pointer to the inode. + * @secid contains a pointer to the location where result will be saved. + * In case of failure, @secid will be set to zero. + * + * Security hooks for file operations + * + * @file_permission: + * Check file permissions before accessing an open file. This hook is + * called by various operations that read or write files. A security + * module can use this hook to perform additional checking on these + * operations, e.g. to revalidate permissions on use to support privilege + * bracketing or policy changes. Notice that this hook is used when the + * actual read/write operations are performed, whereas the + * inode_security_ops hook is called when a file is opened (as well as + * many other operations). + * Caveat: Although this hook can be used to revalidate permissions for + * various system call operations that read or write files, it does not + * address the revalidation of permissions for memory-mapped files. + * Security modules must handle this separately if they need such + * revalidation. + * @file contains the file structure being accessed. + * @mask contains the requested permissions. + * Return 0 if permission is granted. + * @file_alloc_security: + * Allocate and attach a security structure to the file->f_security field. + * The security field is initialized to NULL when the structure is first + * created. + * @file contains the file structure to secure. + * Return 0 if the hook is successful and permission is granted. + * @file_free_security: + * Deallocate and free any security structures stored in file->f_security. + * @file contains the file structure being modified. + * @file_ioctl: + * @file contains the file structure. + * @cmd contains the operation to perform. + * @arg contains the operational arguments. + * Check permission for an ioctl operation on @file. Note that @arg + * sometimes represents a user space pointer; in other cases, it may be a + * simple integer value. When @arg represents a user space pointer, it + * should never be used by the security module. + * Return 0 if permission is granted. + * @mmap_addr : + * Check permissions for a mmap operation at @addr. + * @addr contains virtual address that will be used for the operation. + * Return 0 if permission is granted. + * @mmap_file : + * Check permissions for a mmap operation. The @file may be NULL, e.g. + * if mapping anonymous memory. + * @file contains the file structure for file to map (may be NULL). + * @reqprot contains the protection requested by the application. + * @prot contains the protection that will be applied by the kernel. + * @flags contains the operational flags. + * Return 0 if permission is granted. + * @file_mprotect: + * Check permissions before changing memory access permissions. + * @vma contains the memory region to modify. + * @reqprot contains the protection requested by the application. + * @prot contains the protection that will be applied by the kernel. + * Return 0 if permission is granted. + * @file_lock: + * Check permission before performing file locking operations. + * Note: this hook mediates both flock and fcntl style locks. + * @file contains the file structure. + * @cmd contains the posix-translated lock operation to perform + * (e.g. F_RDLCK, F_WRLCK). + * Return 0 if permission is granted. + * @file_fcntl: + * Check permission before allowing the file operation specified by @cmd + * from being performed on the file @file. Note that @arg sometimes + * represents a user space pointer; in other cases, it may be a simple + * integer value. When @arg represents a user space pointer, it should + * never be used by the security module. + * @file contains the file structure. + * @cmd contains the operation to be performed. + * @arg contains the operational arguments. + * Return 0 if permission is granted. + * @file_set_fowner: + * Save owner security information (typically from current->security) in + * file->f_security for later use by the send_sigiotask hook. + * @file contains the file structure to update. + * Return 0 on success. + * @file_send_sigiotask: + * Check permission for the file owner @fown to send SIGIO or SIGURG to the + * process @tsk. Note that this hook is sometimes called from interrupt. + * Note that the fown_struct, @fown, is never outside the context of a + * struct file, so the file structure (and associated security information) + * can always be obtained: + * container_of(fown, struct file, f_owner) + * @tsk contains the structure of task receiving signal. + * @fown contains the file owner information. + * @sig is the signal that will be sent. When 0, kernel sends SIGIO. + * Return 0 if permission is granted. + * @file_receive: + * This hook allows security modules to control the ability of a process + * to receive an open file descriptor via socket IPC. + * @file contains the file structure being received. + * Return 0 if permission is granted. + * @file_open + * Save open-time permission checking state for later use upon + * file_permission, and recheck access if anything has changed + * since inode_permission. + * + * Security hooks for task operations. + * + * @task_create: + * Check permission before creating a child process. See the clone(2) + * manual page for definitions of the @clone_flags. + * @clone_flags contains the flags indicating what should be shared. + * Return 0 if permission is granted. + * @task_free: + * @task task being freed + * Handle release of task-related resources. (Note that this can be called + * from interrupt context.) + * @cred_alloc_blank: + * @cred points to the credentials. + * @gfp indicates the atomicity of any memory allocations. + * Only allocate sufficient memory and attach to @cred such that + * cred_transfer() will not get ENOMEM. + * @cred_free: + * @cred points to the credentials. + * Deallocate and clear the cred->security field in a set of credentials. + * @cred_prepare: + * @new points to the new credentials. + * @old points to the original credentials. + * @gfp indicates the atomicity of any memory allocations. + * Prepare a new set of credentials by copying the data from the old set. + * @cred_transfer: + * @new points to the new credentials. + * @old points to the original credentials. + * Transfer data from original creds to new creds + * @kernel_act_as: + * Set the credentials for a kernel service to act as (subjective context). + * @new points to the credentials to be modified. + * @secid specifies the security ID to be set + * The current task must be the one that nominated @secid. + * Return 0 if successful. + * @kernel_create_files_as: + * Set the file creation context in a set of credentials to be the same as + * the objective context of the specified inode. + * @new points to the credentials to be modified. + * @inode points to the inode to use as a reference. + * The current task must be the one that nominated @inode. + * Return 0 if successful. + * @kernel_fw_from_file: + * Load firmware from userspace (not called for built-in firmware). + * @file contains the file structure pointing to the file containing + * the firmware to load. This argument will be NULL if the firmware + * was loaded via the uevent-triggered blob-based interface exposed + * by CONFIG_FW_LOADER_USER_HELPER. + * @buf pointer to buffer containing firmware contents. + * @size length of the firmware contents. + * Return 0 if permission is granted. + * @kernel_module_request: + * Ability to trigger the kernel to automatically upcall to userspace for + * userspace to load a kernel module with the given name. + * @kmod_name name of the module requested by the kernel + * Return 0 if successful. + * @kernel_module_from_file: + * Load a kernel module from userspace. + * @file contains the file structure pointing to the file containing + * the kernel module to load. If the module is being loaded from a blob, + * this argument will be NULL. + * Return 0 if permission is granted. + * @task_fix_setuid: + * Update the module's state after setting one or more of the user + * identity attributes of the current process. The @flags parameter + * indicates which of the set*uid system calls invoked this hook. If + * @new is the set of credentials that will be installed. Modifications + * should be made to this rather than to @current->cred. + * @old is the set of credentials that are being replaces + * @flags contains one of the LSM_SETID_* values. + * Return 0 on success. + * @task_setpgid: + * Check permission before setting the process group identifier of the + * process @p to @pgid. + * @p contains the task_struct for process being modified. + * @pgid contains the new pgid. + * Return 0 if permission is granted. + * @task_getpgid: + * Check permission before getting the process group identifier of the + * process @p. + * @p contains the task_struct for the process. + * Return 0 if permission is granted. + * @task_getsid: + * Check permission before getting the session identifier of the process + * @p. + * @p contains the task_struct for the process. + * Return 0 if permission is granted. + * @task_getsecid: + * Retrieve the security identifier of the process @p. + * @p contains the task_struct for the process and place is into @secid. + * In case of failure, @secid will be set to zero. + * + * @task_setnice: + * Check permission before setting the nice value of @p to @nice. + * @p contains the task_struct of process. + * @nice contains the new nice value. + * Return 0 if permission is granted. + * @task_setioprio + * Check permission before setting the ioprio value of @p to @ioprio. + * @p contains the task_struct of process. + * @ioprio contains the new ioprio value + * Return 0 if permission is granted. + * @task_getioprio + * Check permission before getting the ioprio value of @p. + * @p contains the task_struct of process. + * Return 0 if permission is granted. + * @task_setrlimit: + * Check permission before setting the resource limits of the current + * process for @resource to @new_rlim. The old resource limit values can + * be examined by dereferencing (current->signal->rlim + resource). + * @resource contains the resource whose limit is being set. + * @new_rlim contains the new limits for @resource. + * Return 0 if permission is granted. + * @task_setscheduler: + * Check permission before setting scheduling policy and/or parameters of + * process @p based on @policy and @lp. + * @p contains the task_struct for process. + * @policy contains the scheduling policy. + * @lp contains the scheduling parameters. + * Return 0 if permission is granted. + * @task_getscheduler: + * Check permission before obtaining scheduling information for process + * @p. + * @p contains the task_struct for process. + * Return 0 if permission is granted. + * @task_movememory + * Check permission before moving memory owned by process @p. + * @p contains the task_struct for process. + * Return 0 if permission is granted. + * @task_kill: + * Check permission before sending signal @sig to @p. @info can be NULL, + * the constant 1, or a pointer to a siginfo structure. If @info is 1 or + * SI_FROMKERNEL(info) is true, then the signal should be viewed as coming + * from the kernel and should typically be permitted. + * SIGIO signals are handled separately by the send_sigiotask hook in + * file_security_ops. + * @p contains the task_struct for process. + * @info contains the signal information. + * @sig contains the signal value. + * @secid contains the sid of the process where the signal originated + * Return 0 if permission is granted. + * @task_wait: + * Check permission before allowing a process to reap a child process @p + * and collect its status information. + * @p contains the task_struct for process. + * Return 0 if permission is granted. + * @task_prctl: + * Check permission before performing a process control operation on the + * current process. + * @option contains the operation. + * @arg2 contains a argument. + * @arg3 contains a argument. + * @arg4 contains a argument. + * @arg5 contains a argument. + * Return -ENOSYS if no-one wanted to handle this op, any other value to + * cause prctl() to return immediately with that value. + * @task_to_inode: + * Set the security attributes for an inode based on an associated task's + * security attributes, e.g. for /proc/pid inodes. + * @p contains the task_struct for the task. + * @inode contains the inode structure for the inode. + * + * Security hooks for Netlink messaging. + * + * @netlink_send: + * Save security information for a netlink message so that permission + * checking can be performed when the message is processed. The security + * information can be saved using the eff_cap field of the + * netlink_skb_parms structure. Also may be used to provide fine + * grained control over message transmission. + * @sk associated sock of task sending the message. + * @skb contains the sk_buff structure for the netlink message. + * Return 0 if the information was successfully saved and message + * is allowed to be transmitted. + * + * Security hooks for Unix domain networking. + * + * @unix_stream_connect: + * Check permissions before establishing a Unix domain stream connection + * between @sock and @other. + * @sock contains the sock structure. + * @other contains the peer sock structure. + * @newsk contains the new sock structure. + * Return 0 if permission is granted. + * @unix_may_send: + * Check permissions before connecting or sending datagrams from @sock to + * @other. + * @sock contains the socket structure. + * @other contains the peer socket structure. + * Return 0 if permission is granted. + * + * The @unix_stream_connect and @unix_may_send hooks were necessary because + * Linux provides an alternative to the conventional file name space for Unix + * domain sockets. Whereas binding and connecting to sockets in the file name + * space is mediated by the typical file permissions (and caught by the mknod + * and permission hooks in inode_security_ops), binding and connecting to + * sockets in the abstract name space is completely unmediated. Sufficient + * control of Unix domain sockets in the abstract name space isn't possible + * using only the socket layer hooks, since we need to know the actual target + * socket, which is not looked up until we are inside the af_unix code. + * + * Security hooks for socket operations. + * + * @socket_create: + * Check permissions prior to creating a new socket. + * @family contains the requested protocol family. + * @type contains the requested communications type. + * @protocol contains the requested protocol. + * @kern set to 1 if a kernel socket. + * Return 0 if permission is granted. + * @socket_post_create: + * This hook allows a module to update or allocate a per-socket security + * structure. Note that the security field was not added directly to the + * socket structure, but rather, the socket security information is stored + * in the associated inode. Typically, the inode alloc_security hook will + * allocate and and attach security information to + * sock->inode->i_security. This hook may be used to update the + * sock->inode->i_security field with additional information that wasn't + * available when the inode was allocated. + * @sock contains the newly created socket structure. + * @family contains the requested protocol family. + * @type contains the requested communications type. + * @protocol contains the requested protocol. + * @kern set to 1 if a kernel socket. + * @socket_bind: + * Check permission before socket protocol layer bind operation is + * performed and the socket @sock is bound to the address specified in the + * @address parameter. + * @sock contains the socket structure. + * @address contains the address to bind to. + * @addrlen contains the length of address. + * Return 0 if permission is granted. + * @socket_connect: + * Check permission before socket protocol layer connect operation + * attempts to connect socket @sock to a remote address, @address. + * @sock contains the socket structure. + * @address contains the address of remote endpoint. + * @addrlen contains the length of address. + * Return 0 if permission is granted. + * @socket_listen: + * Check permission before socket protocol layer listen operation. + * @sock contains the socket structure. + * @backlog contains the maximum length for the pending connection queue. + * Return 0 if permission is granted. + * @socket_accept: + * Check permission before accepting a new connection. Note that the new + * socket, @newsock, has been created and some information copied to it, + * but the accept operation has not actually been performed. + * @sock contains the listening socket structure. + * @newsock contains the newly created server socket for connection. + * Return 0 if permission is granted. + * @socket_sendmsg: + * Check permission before transmitting a message to another socket. + * @sock contains the socket structure. + * @msg contains the message to be transmitted. + * @size contains the size of message. + * Return 0 if permission is granted. + * @socket_recvmsg: + * Check permission before receiving a message from a socket. + * @sock contains the socket structure. + * @msg contains the message structure. + * @size contains the size of message structure. + * @flags contains the operational flags. + * Return 0 if permission is granted. + * @socket_getsockname: + * Check permission before the local address (name) of the socket object + * @sock is retrieved. + * @sock contains the socket structure. + * Return 0 if permission is granted. + * @socket_getpeername: + * Check permission before the remote address (name) of a socket object + * @sock is retrieved. + * @sock contains the socket structure. + * Return 0 if permission is granted. + * @socket_getsockopt: + * Check permissions before retrieving the options associated with socket + * @sock. + * @sock contains the socket structure. + * @level contains the protocol level to retrieve option from. + * @optname contains the name of option to retrieve. + * Return 0 if permission is granted. + * @socket_setsockopt: + * Check permissions before setting the options associated with socket + * @sock. + * @sock contains the socket structure. + * @level contains the protocol level to set options for. + * @optname contains the name of the option to set. + * Return 0 if permission is granted. + * @socket_shutdown: + * Checks permission before all or part of a connection on the socket + * @sock is shut down. + * @sock contains the socket structure. + * @how contains the flag indicating how future sends and receives + * are handled. + * Return 0 if permission is granted. + * @socket_sock_rcv_skb: + * Check permissions on incoming network packets. This hook is distinct + * from Netfilter's IP input hooks since it is the first time that the + * incoming sk_buff @skb has been associated with a particular socket, @sk. + * Must not sleep inside this hook because some callers hold spinlocks. + * @sk contains the sock (not socket) associated with the incoming sk_buff. + * @skb contains the incoming network data. + * @socket_getpeersec_stream: + * This hook allows the security module to provide peer socket security + * state for unix or connected tcp sockets to userspace via getsockopt + * SO_GETPEERSEC. For tcp sockets this can be meaningful if the + * socket is associated with an ipsec SA. + * @sock is the local socket. + * @optval userspace memory where the security state is to be copied. + * @optlen userspace int where the module should copy the actual length + * of the security state. + * @len as input is the maximum length to copy to userspace provided + * by the caller. + * Return 0 if all is well, otherwise, typical getsockopt return + * values. + * @socket_getpeersec_dgram: + * This hook allows the security module to provide peer socket security + * state for udp sockets on a per-packet basis to userspace via + * getsockopt SO_GETPEERSEC. The application must first have indicated + * the IP_PASSSEC option via getsockopt. It can then retrieve the + * security state returned by this hook for a packet via the SCM_SECURITY + * ancillary message type. + * @skb is the skbuff for the packet being queried + * @secdata is a pointer to a buffer in which to copy the security data + * @seclen is the maximum length for @secdata + * Return 0 on success, error on failure. + * @sk_alloc_security: + * Allocate and attach a security structure to the sk->sk_security field, + * which is used to copy security attributes between local stream sockets. + * @sk_free_security: + * Deallocate security structure. + * @sk_clone_security: + * Clone/copy security structure. + * @sk_getsecid: + * Retrieve the LSM-specific secid for the sock to enable caching + * of network authorizations. + * @sock_graft: + * Sets the socket's isec sid to the sock's sid. + * @inet_conn_request: + * Sets the openreq's sid to socket's sid with MLS portion taken + * from peer sid. + * @inet_csk_clone: + * Sets the new child socket's sid to the openreq sid. + * @inet_conn_established: + * Sets the connection's peersid to the secmark on skb. + * @secmark_relabel_packet: + * check if the process should be allowed to relabel packets to + * the given secid + * @security_secmark_refcount_inc + * tells the LSM to increment the number of secmark labeling rules loaded + * @security_secmark_refcount_dec + * tells the LSM to decrement the number of secmark labeling rules loaded + * @req_classify_flow: + * Sets the flow's sid to the openreq sid. + * @tun_dev_alloc_security: + * This hook allows a module to allocate a security structure for a TUN + * device. + * @security pointer to a security structure pointer. + * Returns a zero on success, negative values on failure. + * @tun_dev_free_security: + * This hook allows a module to free the security structure for a TUN + * device. + * @security pointer to the TUN device's security structure + * @tun_dev_create: + * Check permissions prior to creating a new TUN device. + * @tun_dev_attach_queue: + * Check permissions prior to attaching to a TUN device queue. + * @security pointer to the TUN device's security structure. + * @tun_dev_attach: + * This hook can be used by the module to update any security state + * associated with the TUN device's sock structure. + * @sk contains the existing sock structure. + * @security pointer to the TUN device's security structure. + * @tun_dev_open: + * This hook can be used by the module to update any security state + * associated with the TUN device's security structure. + * @security pointer to the TUN devices's security structure. + * + * Security hooks for XFRM operations. + * + * @xfrm_policy_alloc_security: + * @ctxp is a pointer to the xfrm_sec_ctx being added to Security Policy + * Database used by the XFRM system. + * @sec_ctx contains the security context information being provided by + * the user-level policy update program (e.g., setkey). + * Allocate a security structure to the xp->security field; the security + * field is initialized to NULL when the xfrm_policy is allocated. + * Return 0 if operation was successful (memory to allocate, legal context) + * @gfp is to specify the context for the allocation + * @xfrm_policy_clone_security: + * @old_ctx contains an existing xfrm_sec_ctx. + * @new_ctxp contains a new xfrm_sec_ctx being cloned from old. + * Allocate a security structure in new_ctxp that contains the + * information from the old_ctx structure. + * Return 0 if operation was successful (memory to allocate). + * @xfrm_policy_free_security: + * @ctx contains the xfrm_sec_ctx + * Deallocate xp->security. + * @xfrm_policy_delete_security: + * @ctx contains the xfrm_sec_ctx. + * Authorize deletion of xp->security. + * @xfrm_state_alloc: + * @x contains the xfrm_state being added to the Security Association + * Database by the XFRM system. + * @sec_ctx contains the security context information being provided by + * the user-level SA generation program (e.g., setkey or racoon). + * Allocate a security structure to the x->security field; the security + * field is initialized to NULL when the xfrm_state is allocated. Set the + * context to correspond to sec_ctx. Return 0 if operation was successful + * (memory to allocate, legal context). + * @xfrm_state_alloc_acquire: + * @x contains the xfrm_state being added to the Security Association + * Database by the XFRM system. + * @polsec contains the policy's security context. + * @secid contains the secid from which to take the mls portion of the + * context. + * Allocate a security structure to the x->security field; the security + * field is initialized to NULL when the xfrm_state is allocated. Set the + * context to correspond to secid. Return 0 if operation was successful + * (memory to allocate, legal context). + * @xfrm_state_free_security: + * @x contains the xfrm_state. + * Deallocate x->security. + * @xfrm_state_delete_security: + * @x contains the xfrm_state. + * Authorize deletion of x->security. + * @xfrm_policy_lookup: + * @ctx contains the xfrm_sec_ctx for which the access control is being + * checked. + * @fl_secid contains the flow security label that is used to authorize + * access to the policy xp. + * @dir contains the direction of the flow (input or output). + * Check permission when a flow selects a xfrm_policy for processing + * XFRMs on a packet. The hook is called when selecting either a + * per-socket policy or a generic xfrm policy. + * Return 0 if permission is granted, -ESRCH otherwise, or -errno + * on other errors. + * @xfrm_state_pol_flow_match: + * @x contains the state to match. + * @xp contains the policy to check for a match. + * @fl contains the flow to check for a match. + * Return 1 if there is a match. + * @xfrm_decode_session: + * @skb points to skb to decode. + * @secid points to the flow key secid to set. + * @ckall says if all xfrms used should be checked for same secid. + * Return 0 if ckall is zero or all xfrms used have the same secid. + * + * Security hooks affecting all Key Management operations + * + * @key_alloc: + * Permit allocation of a key and assign security data. Note that key does + * not have a serial number assigned at this point. + * @key points to the key. + * @flags is the allocation flags + * Return 0 if permission is granted, -ve error otherwise. + * @key_free: + * Notification of destruction; free security data. + * @key points to the key. + * No return value. + * @key_permission: + * See whether a specific operational right is granted to a process on a + * key. + * @key_ref refers to the key (key pointer + possession attribute bit). + * @cred points to the credentials to provide the context against which to + * evaluate the security data on the key. + * @perm describes the combination of permissions required of this key. + * Return 0 if permission is granted, -ve error otherwise. + * @key_getsecurity: + * Get a textual representation of the security context attached to a key + * for the purposes of honouring KEYCTL_GETSECURITY. This function + * allocates the storage for the NUL-terminated string and the caller + * should free it. + * @key points to the key to be queried. + * @_buffer points to a pointer that should be set to point to the + * resulting string (if no label or an error occurs). + * Return the length of the string (including terminating NUL) or -ve if + * an error. + * May also return 0 (and a NULL buffer pointer) if there is no label. + * + * Security hooks affecting all System V IPC operations. + * + * @ipc_permission: + * Check permissions for access to IPC + * @ipcp contains the kernel IPC permission structure + * @flag contains the desired (requested) permission set + * Return 0 if permission is granted. + * @ipc_getsecid: + * Get the secid associated with the ipc object. + * @ipcp contains the kernel IPC permission structure. + * @secid contains a pointer to the location where result will be saved. + * In case of failure, @secid will be set to zero. + * + * Security hooks for individual messages held in System V IPC message queues + * @msg_msg_alloc_security: + * Allocate and attach a security structure to the msg->security field. + * The security field is initialized to NULL when the structure is first + * created. + * @msg contains the message structure to be modified. + * Return 0 if operation was successful and permission is granted. + * @msg_msg_free_security: + * Deallocate the security structure for this message. + * @msg contains the message structure to be modified. + * + * Security hooks for System V IPC Message Queues + * + * @msg_queue_alloc_security: + * Allocate and attach a security structure to the + * msq->q_perm.security field. The security field is initialized to + * NULL when the structure is first created. + * @msq contains the message queue structure to be modified. + * Return 0 if operation was successful and permission is granted. + * @msg_queue_free_security: + * Deallocate security structure for this message queue. + * @msq contains the message queue structure to be modified. + * @msg_queue_associate: + * Check permission when a message queue is requested through the + * msgget system call. This hook is only called when returning the + * message queue identifier for an existing message queue, not when a + * new message queue is created. + * @msq contains the message queue to act upon. + * @msqflg contains the operation control flags. + * Return 0 if permission is granted. + * @msg_queue_msgctl: + * Check permission when a message control operation specified by @cmd + * is to be performed on the message queue @msq. + * The @msq may be NULL, e.g. for IPC_INFO or MSG_INFO. + * @msq contains the message queue to act upon. May be NULL. + * @cmd contains the operation to be performed. + * Return 0 if permission is granted. + * @msg_queue_msgsnd: + * Check permission before a message, @msg, is enqueued on the message + * queue, @msq. + * @msq contains the message queue to send message to. + * @msg contains the message to be enqueued. + * @msqflg contains operational flags. + * Return 0 if permission is granted. + * @msg_queue_msgrcv: + * Check permission before a message, @msg, is removed from the message + * queue, @msq. The @target task structure contains a pointer to the + * process that will be receiving the message (not equal to the current + * process when inline receives are being performed). + * @msq contains the message queue to retrieve message from. + * @msg contains the message destination. + * @target contains the task structure for recipient process. + * @type contains the type of message requested. + * @mode contains the operational flags. + * Return 0 if permission is granted. + * + * Security hooks for System V Shared Memory Segments + * + * @shm_alloc_security: + * Allocate and attach a security structure to the shp->shm_perm.security + * field. The security field is initialized to NULL when the structure is + * first created. + * @shp contains the shared memory structure to be modified. + * Return 0 if operation was successful and permission is granted. + * @shm_free_security: + * Deallocate the security struct for this memory segment. + * @shp contains the shared memory structure to be modified. + * @shm_associate: + * Check permission when a shared memory region is requested through the + * shmget system call. This hook is only called when returning the shared + * memory region identifier for an existing region, not when a new shared + * memory region is created. + * @shp contains the shared memory structure to be modified. + * @shmflg contains the operation control flags. + * Return 0 if permission is granted. + * @shm_shmctl: + * Check permission when a shared memory control operation specified by + * @cmd is to be performed on the shared memory region @shp. + * The @shp may be NULL, e.g. for IPC_INFO or SHM_INFO. + * @shp contains shared memory structure to be modified. + * @cmd contains the operation to be performed. + * Return 0 if permission is granted. + * @shm_shmat: + * Check permissions prior to allowing the shmat system call to attach the + * shared memory segment @shp to the data segment of the calling process. + * The attaching address is specified by @shmaddr. + * @shp contains the shared memory structure to be modified. + * @shmaddr contains the address to attach memory region to. + * @shmflg contains the operational flags. + * Return 0 if permission is granted. + * + * Security hooks for System V Semaphores + * + * @sem_alloc_security: + * Allocate and attach a security structure to the sma->sem_perm.security + * field. The security field is initialized to NULL when the structure is + * first created. + * @sma contains the semaphore structure + * Return 0 if operation was successful and permission is granted. + * @sem_free_security: + * deallocate security struct for this semaphore + * @sma contains the semaphore structure. + * @sem_associate: + * Check permission when a semaphore is requested through the semget + * system call. This hook is only called when returning the semaphore + * identifier for an existing semaphore, not when a new one must be + * created. + * @sma contains the semaphore structure. + * @semflg contains the operation control flags. + * Return 0 if permission is granted. + * @sem_semctl: + * Check permission when a semaphore operation specified by @cmd is to be + * performed on the semaphore @sma. The @sma may be NULL, e.g. for + * IPC_INFO or SEM_INFO. + * @sma contains the semaphore structure. May be NULL. + * @cmd contains the operation to be performed. + * Return 0 if permission is granted. + * @sem_semop + * Check permissions before performing operations on members of the + * semaphore set @sma. If the @alter flag is nonzero, the semaphore set + * may be modified. + * @sma contains the semaphore structure. + * @sops contains the operations to perform. + * @nsops contains the number of operations to perform. + * @alter contains the flag indicating whether changes are to be made. + * Return 0 if permission is granted. + * + * @binder_set_context_mgr + * Check whether @mgr is allowed to be the binder context manager. + * @mgr contains the task_struct for the task being registered. + * Return 0 if permission is granted. + * @binder_transaction + * Check whether @from is allowed to invoke a binder transaction call + * to @to. + * @from contains the task_struct for the sending task. + * @to contains the task_struct for the receiving task. + * @binder_transfer_binder + * Check whether @from is allowed to transfer a binder reference to @to. + * @from contains the task_struct for the sending task. + * @to contains the task_struct for the receiving task. + * @binder_transfer_file + * Check whether @from is allowed to transfer @file to @to. + * @from contains the task_struct for the sending task. + * @file contains the struct file being transferred. + * @to contains the task_struct for the receiving task. + * + * @ptrace_access_check: + * Check permission before allowing the current process to trace the + * @child process. + * Security modules may also want to perform a process tracing check + * during an execve in the set_security or apply_creds hooks of + * tracing check during an execve in the bprm_set_creds hook of + * binprm_security_ops if the process is being traced and its security + * attributes would be changed by the execve. + * @child contains the task_struct structure for the target process. + * @mode contains the PTRACE_MODE flags indicating the form of access. + * Return 0 if permission is granted. + * @ptrace_traceme: + * Check that the @parent process has sufficient permission to trace the + * current process before allowing the current process to present itself + * to the @parent process for tracing. + * @parent contains the task_struct structure for debugger process. + * Return 0 if permission is granted. + * @capget: + * Get the @effective, @inheritable, and @permitted capability sets for + * the @target process. The hook may also perform permission checking to + * determine if the current process is allowed to see the capability sets + * of the @target process. + * @target contains the task_struct structure for target process. + * @effective contains the effective capability set. + * @inheritable contains the inheritable capability set. + * @permitted contains the permitted capability set. + * Return 0 if the capability sets were successfully obtained. + * @capset: + * Set the @effective, @inheritable, and @permitted capability sets for + * the current process. + * @new contains the new credentials structure for target process. + * @old contains the current credentials structure for target process. + * @effective contains the effective capability set. + * @inheritable contains the inheritable capability set. + * @permitted contains the permitted capability set. + * Return 0 and update @new if permission is granted. + * @capable: + * Check whether the @tsk process has the @cap capability in the indicated + * credentials. + * @cred contains the credentials to use. + * @ns contains the user namespace we want the capability in + * @cap contains the capability . + * @audit: Whether to write an audit message or not + * Return 0 if the capability is granted for @tsk. + * @syslog: + * Check permission before accessing the kernel message ring or changing + * logging to the console. + * See the syslog(2) manual page for an explanation of the @type values. + * @type contains the type of action. + * @from_file indicates the context of action (if it came from /proc). + * Return 0 if permission is granted. + * @settime: + * Check permission to change the system time. + * struct timespec and timezone are defined in include/linux/time.h + * @ts contains new time + * @tz contains new timezone + * Return 0 if permission is granted. + * @vm_enough_memory: + * Check permissions for allocating a new virtual mapping. + * @mm contains the mm struct it is being added to. + * @pages contains the number of pages. + * Return 0 if permission is granted. + * + * @ismaclabel: + * Check if the extended attribute specified by @name + * represents a MAC label. Returns 1 if name is a MAC + * attribute otherwise returns 0. + * @name full extended attribute name to check against + * LSM as a MAC label. + * + * @secid_to_secctx: + * Convert secid to security context. If secdata is NULL the length of + * the result will be returned in seclen, but no secdata will be returned. + * This does mean that the length could change between calls to check the + * length and the next call which actually allocates and returns the + * secdata. + * @secid contains the security ID. + * @secdata contains the pointer that stores the converted security + * context. + * @seclen pointer which contains the length of the data + * @secctx_to_secid: + * Convert security context to secid. + * @secid contains the pointer to the generated security ID. + * @secdata contains the security context. + * + * @release_secctx: + * Release the security context. + * @secdata contains the security context. + * @seclen contains the length of the security context. + * + * Security hooks for Audit + * + * @audit_rule_init: + * Allocate and initialize an LSM audit rule structure. + * @field contains the required Audit action. + * Fields flags are defined in include/linux/audit.h + * @op contains the operator the rule uses. + * @rulestr contains the context where the rule will be applied to. + * @lsmrule contains a pointer to receive the result. + * Return 0 if @lsmrule has been successfully set, + * -EINVAL in case of an invalid rule. + * + * @audit_rule_known: + * Specifies whether given @rule contains any fields related to + * current LSM. + * @rule contains the audit rule of interest. + * Return 1 in case of relation found, 0 otherwise. + * + * @audit_rule_match: + * Determine if given @secid matches a rule previously approved + * by @audit_rule_known. + * @secid contains the security id in question. + * @field contains the field which relates to current LSM. + * @op contains the operator that will be used for matching. + * @rule points to the audit rule that will be checked against. + * @actx points to the audit context associated with the check. + * Return 1 if secid matches the rule, 0 if it does not, -ERRNO on failure. + * + * @audit_rule_free: + * Deallocate the LSM audit rule structure previously allocated by + * audit_rule_init. + * @rule contains the allocated rule + * + * @inode_notifysecctx: + * Notify the security module of what the security context of an inode + * should be. Initializes the incore security context managed by the + * security module for this inode. Example usage: NFS client invokes + * this hook to initialize the security context in its incore inode to the + * value provided by the server for the file when the server returned the + * file's attributes to the client. + * + * Must be called with inode->i_mutex locked. + * + * @inode we wish to set the security context of. + * @ctx contains the string which we wish to set in the inode. + * @ctxlen contains the length of @ctx. + * + * @inode_setsecctx: + * Change the security context of an inode. Updates the + * incore security context managed by the security module and invokes the + * fs code as needed (via __vfs_setxattr_noperm) to update any backing + * xattrs that represent the context. Example usage: NFS server invokes + * this hook to change the security context in its incore inode and on the + * backing filesystem to a value provided by the client on a SETATTR + * operation. + * + * Must be called with inode->i_mutex locked. + * + * @dentry contains the inode we wish to set the security context of. + * @ctx contains the string which we wish to set in the inode. + * @ctxlen contains the length of @ctx. + * + * @inode_getsecctx: + * On success, returns 0 and fills out @ctx and @ctxlen with the security + * context for the given @inode. + * + * @inode we wish to get the security context of. + * @ctx is a pointer in which to place the allocated security context. + * @ctxlen points to the place to put the length of @ctx. + * This is the main security structure. + */ + struct security_operations { char name[SECURITY_NAME_MAX + 1]; -- cgit v1.2.3 From 346033a28fb16b83dac2a74d8025ff8ee64a2c9b Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 2 May 2015 15:11:14 -0700 Subject: LSM: Remove a comment from security.h Remove the large comment describing the content of the security_operations structure from security.h. This wasn't done in the previous (2/7) patch because it would have exceeded the mail list size limits. Signed-off-by: Casey Schaufler Acked-by: John Johansen Acked-by: Kees Cook Acked-by: Paul Moore Acked-by: Stephen Smalley Acked-by: Tetsuo Handa Signed-off-by: James Morris --- include/linux/security.h | 1270 ---------------------------------------------- 1 file changed, 1270 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index f3d42c636f27..a2a100e7ac6e 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -186,1276 +186,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) opts->num_mnt_opts = 0; } -/** - * struct security_operations - main security structure - * - * Security module identifier. - * - * @name: - * A string that acts as a unique identifier for the LSM with max number - * of characters = SECURITY_NAME_MAX. - * - * Security hooks for program execution operations. - * - * @bprm_set_creds: - * Save security information in the bprm->security field, typically based - * on information about the bprm->file, for later use by the apply_creds - * hook. This hook may also optionally check permissions (e.g. for - * transitions between security domains). - * This hook may be called multiple times during a single execve, e.g. for - * interpreters. The hook can tell whether it has already been called by - * checking to see if @bprm->security is non-NULL. If so, then the hook - * may decide either to retain the security information saved earlier or - * to replace it. - * @bprm contains the linux_binprm structure. - * Return 0 if the hook is successful and permission is granted. - * @bprm_check_security: - * This hook mediates the point when a search for a binary handler will - * begin. It allows a check the @bprm->security value which is set in the - * preceding set_creds call. The primary difference from set_creds is - * that the argv list and envp list are reliably available in @bprm. This - * hook may be called multiple times during a single execve; and in each - * pass set_creds is called first. - * @bprm contains the linux_binprm structure. - * Return 0 if the hook is successful and permission is granted. - * @bprm_committing_creds: - * Prepare to install the new security attributes of a process being - * transformed by an execve operation, based on the old credentials - * pointed to by @current->cred and the information set in @bprm->cred by - * the bprm_set_creds hook. @bprm points to the linux_binprm structure. - * This hook is a good place to perform state changes on the process such - * as closing open file descriptors to which access will no longer be - * granted when the attributes are changed. This is called immediately - * before commit_creds(). - * @bprm_committed_creds: - * Tidy up after the installation of the new security attributes of a - * process being transformed by an execve operation. The new credentials - * have, by this point, been set to @current->cred. @bprm points to the - * linux_binprm structure. This hook is a good place to perform state - * changes on the process such as clearing out non-inheritable signal - * state. This is called immediately after commit_creds(). - * @bprm_secureexec: - * Return a boolean value (0 or 1) indicating whether a "secure exec" - * is required. The flag is passed in the auxiliary table - * on the initial stack to the ELF interpreter to indicate whether libc - * should enable secure mode. - * @bprm contains the linux_binprm structure. - * - * Security hooks for filesystem operations. - * - * @sb_alloc_security: - * Allocate and attach a security structure to the sb->s_security field. - * The s_security field is initialized to NULL when the structure is - * allocated. - * @sb contains the super_block structure to be modified. - * Return 0 if operation was successful. - * @sb_free_security: - * Deallocate and clear the sb->s_security field. - * @sb contains the super_block structure to be modified. - * @sb_statfs: - * Check permission before obtaining filesystem statistics for the @mnt - * mountpoint. - * @dentry is a handle on the superblock for the filesystem. - * Return 0 if permission is granted. - * @sb_mount: - * Check permission before an object specified by @dev_name is mounted on - * the mount point named by @nd. For an ordinary mount, @dev_name - * identifies a device if the file system type requires a device. For a - * remount (@flags & MS_REMOUNT), @dev_name is irrelevant. For a - * loopback/bind mount (@flags & MS_BIND), @dev_name identifies the - * pathname of the object being mounted. - * @dev_name contains the name for object being mounted. - * @path contains the path for mount point object. - * @type contains the filesystem type. - * @flags contains the mount flags. - * @data contains the filesystem-specific data. - * Return 0 if permission is granted. - * @sb_copy_data: - * Allow mount option data to be copied prior to parsing by the filesystem, - * so that the security module can extract security-specific mount - * options cleanly (a filesystem may modify the data e.g. with strsep()). - * This also allows the original mount data to be stripped of security- - * specific options to avoid having to make filesystems aware of them. - * @type the type of filesystem being mounted. - * @orig the original mount data copied from userspace. - * @copy copied data which will be passed to the security module. - * Returns 0 if the copy was successful. - * @sb_remount: - * Extracts security system specific mount options and verifies no changes - * are being made to those options. - * @sb superblock being remounted - * @data contains the filesystem-specific data. - * Return 0 if permission is granted. - * @sb_umount: - * Check permission before the @mnt file system is unmounted. - * @mnt contains the mounted file system. - * @flags contains the unmount flags, e.g. MNT_FORCE. - * Return 0 if permission is granted. - * @sb_pivotroot: - * Check permission before pivoting the root filesystem. - * @old_path contains the path for the new location of the current root (put_old). - * @new_path contains the path for the new root (new_root). - * Return 0 if permission is granted. - * @sb_set_mnt_opts: - * Set the security relevant mount options used for a superblock - * @sb the superblock to set security mount options for - * @opts binary data structure containing all lsm mount data - * @sb_clone_mnt_opts: - * Copy all security options from a given superblock to another - * @oldsb old superblock which contain information to clone - * @newsb new superblock which needs filled in - * @sb_parse_opts_str: - * Parse a string of security data filling in the opts structure - * @options string containing all mount options known by the LSM - * @opts binary data structure usable by the LSM - * @dentry_init_security: - * Compute a context for a dentry as the inode is not yet available - * since NFSv4 has no label backed by an EA anyway. - * @dentry dentry to use in calculating the context. - * @mode mode used to determine resource type. - * @name name of the last path component used to create file - * @ctx pointer to place the pointer to the resulting context in. - * @ctxlen point to place the length of the resulting context. - * - * - * Security hooks for inode operations. - * - * @inode_alloc_security: - * Allocate and attach a security structure to @inode->i_security. The - * i_security field is initialized to NULL when the inode structure is - * allocated. - * @inode contains the inode structure. - * Return 0 if operation was successful. - * @inode_free_security: - * @inode contains the inode structure. - * Deallocate the inode security structure and set @inode->i_security to - * NULL. - * @inode_init_security: - * Obtain the security attribute name suffix and value to set on a newly - * created inode and set up the incore security field for the new inode. - * This hook is called by the fs code as part of the inode creation - * transaction and provides for atomic labeling of the inode, unlike - * the post_create/mkdir/... hooks called by the VFS. The hook function - * is expected to allocate the name and value via kmalloc, with the caller - * being responsible for calling kfree after using them. - * If the security module does not use security attributes or does - * not wish to put a security attribute on this particular inode, - * then it should return -EOPNOTSUPP to skip this processing. - * @inode contains the inode structure of the newly created inode. - * @dir contains the inode structure of the parent directory. - * @qstr contains the last path component of the new object - * @name will be set to the allocated name suffix (e.g. selinux). - * @value will be set to the allocated attribute value. - * @len will be set to the length of the value. - * Returns 0 if @name and @value have been successfully set, - * -EOPNOTSUPP if no security attribute is needed, or - * -ENOMEM on memory allocation failure. - * @inode_create: - * Check permission to create a regular file. - * @dir contains inode structure of the parent of the new file. - * @dentry contains the dentry structure for the file to be created. - * @mode contains the file mode of the file to be created. - * Return 0 if permission is granted. - * @inode_link: - * Check permission before creating a new hard link to a file. - * @old_dentry contains the dentry structure for an existing link to the file. - * @dir contains the inode structure of the parent directory of the new link. - * @new_dentry contains the dentry structure for the new link. - * Return 0 if permission is granted. - * @path_link: - * Check permission before creating a new hard link to a file. - * @old_dentry contains the dentry structure for an existing link - * to the file. - * @new_dir contains the path structure of the parent directory of - * the new link. - * @new_dentry contains the dentry structure for the new link. - * Return 0 if permission is granted. - * @inode_unlink: - * Check the permission to remove a hard link to a file. - * @dir contains the inode structure of parent directory of the file. - * @dentry contains the dentry structure for file to be unlinked. - * Return 0 if permission is granted. - * @path_unlink: - * Check the permission to remove a hard link to a file. - * @dir contains the path structure of parent directory of the file. - * @dentry contains the dentry structure for file to be unlinked. - * Return 0 if permission is granted. - * @inode_symlink: - * Check the permission to create a symbolic link to a file. - * @dir contains the inode structure of parent directory of the symbolic link. - * @dentry contains the dentry structure of the symbolic link. - * @old_name contains the pathname of file. - * Return 0 if permission is granted. - * @path_symlink: - * Check the permission to create a symbolic link to a file. - * @dir contains the path structure of parent directory of - * the symbolic link. - * @dentry contains the dentry structure of the symbolic link. - * @old_name contains the pathname of file. - * Return 0 if permission is granted. - * @inode_mkdir: - * Check permissions to create a new directory in the existing directory - * associated with inode structure @dir. - * @dir contains the inode structure of parent of the directory to be created. - * @dentry contains the dentry structure of new directory. - * @mode contains the mode of new directory. - * Return 0 if permission is granted. - * @path_mkdir: - * Check permissions to create a new directory in the existing directory - * associated with path structure @path. - * @dir contains the path structure of parent of the directory - * to be created. - * @dentry contains the dentry structure of new directory. - * @mode contains the mode of new directory. - * Return 0 if permission is granted. - * @inode_rmdir: - * Check the permission to remove a directory. - * @dir contains the inode structure of parent of the directory to be removed. - * @dentry contains the dentry structure of directory to be removed. - * Return 0 if permission is granted. - * @path_rmdir: - * Check the permission to remove a directory. - * @dir contains the path structure of parent of the directory to be - * removed. - * @dentry contains the dentry structure of directory to be removed. - * Return 0 if permission is granted. - * @inode_mknod: - * Check permissions when creating a special file (or a socket or a fifo - * file created via the mknod system call). Note that if mknod operation - * is being done for a regular file, then the create hook will be called - * and not this hook. - * @dir contains the inode structure of parent of the new file. - * @dentry contains the dentry structure of the new file. - * @mode contains the mode of the new file. - * @dev contains the device number. - * Return 0 if permission is granted. - * @path_mknod: - * Check permissions when creating a file. Note that this hook is called - * even if mknod operation is being done for a regular file. - * @dir contains the path structure of parent of the new file. - * @dentry contains the dentry structure of the new file. - * @mode contains the mode of the new file. - * @dev contains the undecoded device number. Use new_decode_dev() to get - * the decoded device number. - * Return 0 if permission is granted. - * @inode_rename: - * Check for permission to rename a file or directory. - * @old_dir contains the inode structure for parent of the old link. - * @old_dentry contains the dentry structure of the old link. - * @new_dir contains the inode structure for parent of the new link. - * @new_dentry contains the dentry structure of the new link. - * Return 0 if permission is granted. - * @path_rename: - * Check for permission to rename a file or directory. - * @old_dir contains the path structure for parent of the old link. - * @old_dentry contains the dentry structure of the old link. - * @new_dir contains the path structure for parent of the new link. - * @new_dentry contains the dentry structure of the new link. - * Return 0 if permission is granted. - * @path_chmod: - * Check for permission to change DAC's permission of a file or directory. - * @dentry contains the dentry structure. - * @mnt contains the vfsmnt structure. - * @mode contains DAC's mode. - * Return 0 if permission is granted. - * @path_chown: - * Check for permission to change owner/group of a file or directory. - * @path contains the path structure. - * @uid contains new owner's ID. - * @gid contains new group's ID. - * Return 0 if permission is granted. - * @path_chroot: - * Check for permission to change root directory. - * @path contains the path structure. - * Return 0 if permission is granted. - * @inode_readlink: - * Check the permission to read the symbolic link. - * @dentry contains the dentry structure for the file link. - * Return 0 if permission is granted. - * @inode_follow_link: - * Check permission to follow a symbolic link when looking up a pathname. - * @dentry contains the dentry structure for the link. - * @nd contains the nameidata structure for the parent directory. - * Return 0 if permission is granted. - * @inode_permission: - * Check permission before accessing an inode. This hook is called by the - * existing Linux permission function, so a security module can use it to - * provide additional checking for existing Linux permission checks. - * Notice that this hook is called when a file is opened (as well as many - * other operations), whereas the file_security_ops permission hook is - * called when the actual read/write operations are performed. - * @inode contains the inode structure to check. - * @mask contains the permission mask. - * Return 0 if permission is granted. - * @inode_setattr: - * Check permission before setting file attributes. Note that the kernel - * call to notify_change is performed from several locations, whenever - * file attributes change (such as when a file is truncated, chown/chmod - * operations, transferring disk quotas, etc). - * @dentry contains the dentry structure for the file. - * @attr is the iattr structure containing the new file attributes. - * Return 0 if permission is granted. - * @path_truncate: - * Check permission before truncating a file. - * @path contains the path structure for the file. - * Return 0 if permission is granted. - * @inode_getattr: - * Check permission before obtaining file attributes. - * @mnt is the vfsmount where the dentry was looked up - * @dentry contains the dentry structure for the file. - * Return 0 if permission is granted. - * @inode_setxattr: - * Check permission before setting the extended attributes - * @value identified by @name for @dentry. - * Return 0 if permission is granted. - * @inode_post_setxattr: - * Update inode security field after successful setxattr operation. - * @value identified by @name for @dentry. - * @inode_getxattr: - * Check permission before obtaining the extended attributes - * identified by @name for @dentry. - * Return 0 if permission is granted. - * @inode_listxattr: - * Check permission before obtaining the list of extended attribute - * names for @dentry. - * Return 0 if permission is granted. - * @inode_removexattr: - * Check permission before removing the extended attribute - * identified by @name for @dentry. - * Return 0 if permission is granted. - * @inode_getsecurity: - * Retrieve a copy of the extended attribute representation of the - * security label associated with @name for @inode via @buffer. Note that - * @name is the remainder of the attribute name after the security prefix - * has been removed. @alloc is used to specify of the call should return a - * value via the buffer or just the value length Return size of buffer on - * success. - * @inode_setsecurity: - * Set the security label associated with @name for @inode from the - * extended attribute value @value. @size indicates the size of the - * @value in bytes. @flags may be XATTR_CREATE, XATTR_REPLACE, or 0. - * Note that @name is the remainder of the attribute name after the - * security. prefix has been removed. - * Return 0 on success. - * @inode_listsecurity: - * Copy the extended attribute names for the security labels - * associated with @inode into @buffer. The maximum size of @buffer - * is specified by @buffer_size. @buffer may be NULL to request - * the size of the buffer required. - * Returns number of bytes used/required on success. - * @inode_need_killpriv: - * Called when an inode has been changed. - * @dentry is the dentry being changed. - * Return <0 on error to abort the inode change operation. - * Return 0 if inode_killpriv does not need to be called. - * Return >0 if inode_killpriv does need to be called. - * @inode_killpriv: - * The setuid bit is being removed. Remove similar security labels. - * Called with the dentry->d_inode->i_mutex held. - * @dentry is the dentry being changed. - * Return 0 on success. If error is returned, then the operation - * causing setuid bit removal is failed. - * @inode_getsecid: - * Get the secid associated with the node. - * @inode contains a pointer to the inode. - * @secid contains a pointer to the location where result will be saved. - * In case of failure, @secid will be set to zero. - * - * Security hooks for file operations - * - * @file_permission: - * Check file permissions before accessing an open file. This hook is - * called by various operations that read or write files. A security - * module can use this hook to perform additional checking on these - * operations, e.g. to revalidate permissions on use to support privilege - * bracketing or policy changes. Notice that this hook is used when the - * actual read/write operations are performed, whereas the - * inode_security_ops hook is called when a file is opened (as well as - * many other operations). - * Caveat: Although this hook can be used to revalidate permissions for - * various system call operations that read or write files, it does not - * address the revalidation of permissions for memory-mapped files. - * Security modules must handle this separately if they need such - * revalidation. - * @file contains the file structure being accessed. - * @mask contains the requested permissions. - * Return 0 if permission is granted. - * @file_alloc_security: - * Allocate and attach a security structure to the file->f_security field. - * The security field is initialized to NULL when the structure is first - * created. - * @file contains the file structure to secure. - * Return 0 if the hook is successful and permission is granted. - * @file_free_security: - * Deallocate and free any security structures stored in file->f_security. - * @file contains the file structure being modified. - * @file_ioctl: - * @file contains the file structure. - * @cmd contains the operation to perform. - * @arg contains the operational arguments. - * Check permission for an ioctl operation on @file. Note that @arg - * sometimes represents a user space pointer; in other cases, it may be a - * simple integer value. When @arg represents a user space pointer, it - * should never be used by the security module. - * Return 0 if permission is granted. - * @mmap_addr : - * Check permissions for a mmap operation at @addr. - * @addr contains virtual address that will be used for the operation. - * Return 0 if permission is granted. - * @mmap_file : - * Check permissions for a mmap operation. The @file may be NULL, e.g. - * if mapping anonymous memory. - * @file contains the file structure for file to map (may be NULL). - * @reqprot contains the protection requested by the application. - * @prot contains the protection that will be applied by the kernel. - * @flags contains the operational flags. - * Return 0 if permission is granted. - * @file_mprotect: - * Check permissions before changing memory access permissions. - * @vma contains the memory region to modify. - * @reqprot contains the protection requested by the application. - * @prot contains the protection that will be applied by the kernel. - * Return 0 if permission is granted. - * @file_lock: - * Check permission before performing file locking operations. - * Note: this hook mediates both flock and fcntl style locks. - * @file contains the file structure. - * @cmd contains the posix-translated lock operation to perform - * (e.g. F_RDLCK, F_WRLCK). - * Return 0 if permission is granted. - * @file_fcntl: - * Check permission before allowing the file operation specified by @cmd - * from being performed on the file @file. Note that @arg sometimes - * represents a user space pointer; in other cases, it may be a simple - * integer value. When @arg represents a user space pointer, it should - * never be used by the security module. - * @file contains the file structure. - * @cmd contains the operation to be performed. - * @arg contains the operational arguments. - * Return 0 if permission is granted. - * @file_set_fowner: - * Save owner security information (typically from current->security) in - * file->f_security for later use by the send_sigiotask hook. - * @file contains the file structure to update. - * Return 0 on success. - * @file_send_sigiotask: - * Check permission for the file owner @fown to send SIGIO or SIGURG to the - * process @tsk. Note that this hook is sometimes called from interrupt. - * Note that the fown_struct, @fown, is never outside the context of a - * struct file, so the file structure (and associated security information) - * can always be obtained: - * container_of(fown, struct file, f_owner) - * @tsk contains the structure of task receiving signal. - * @fown contains the file owner information. - * @sig is the signal that will be sent. When 0, kernel sends SIGIO. - * Return 0 if permission is granted. - * @file_receive: - * This hook allows security modules to control the ability of a process - * to receive an open file descriptor via socket IPC. - * @file contains the file structure being received. - * Return 0 if permission is granted. - * @file_open - * Save open-time permission checking state for later use upon - * file_permission, and recheck access if anything has changed - * since inode_permission. - * - * Security hooks for task operations. - * - * @task_create: - * Check permission before creating a child process. See the clone(2) - * manual page for definitions of the @clone_flags. - * @clone_flags contains the flags indicating what should be shared. - * Return 0 if permission is granted. - * @task_free: - * @task task being freed - * Handle release of task-related resources. (Note that this can be called - * from interrupt context.) - * @cred_alloc_blank: - * @cred points to the credentials. - * @gfp indicates the atomicity of any memory allocations. - * Only allocate sufficient memory and attach to @cred such that - * cred_transfer() will not get ENOMEM. - * @cred_free: - * @cred points to the credentials. - * Deallocate and clear the cred->security field in a set of credentials. - * @cred_prepare: - * @new points to the new credentials. - * @old points to the original credentials. - * @gfp indicates the atomicity of any memory allocations. - * Prepare a new set of credentials by copying the data from the old set. - * @cred_transfer: - * @new points to the new credentials. - * @old points to the original credentials. - * Transfer data from original creds to new creds - * @kernel_act_as: - * Set the credentials for a kernel service to act as (subjective context). - * @new points to the credentials to be modified. - * @secid specifies the security ID to be set - * The current task must be the one that nominated @secid. - * Return 0 if successful. - * @kernel_create_files_as: - * Set the file creation context in a set of credentials to be the same as - * the objective context of the specified inode. - * @new points to the credentials to be modified. - * @inode points to the inode to use as a reference. - * The current task must be the one that nominated @inode. - * Return 0 if successful. - * @kernel_fw_from_file: - * Load firmware from userspace (not called for built-in firmware). - * @file contains the file structure pointing to the file containing - * the firmware to load. This argument will be NULL if the firmware - * was loaded via the uevent-triggered blob-based interface exposed - * by CONFIG_FW_LOADER_USER_HELPER. - * @buf pointer to buffer containing firmware contents. - * @size length of the firmware contents. - * Return 0 if permission is granted. - * @kernel_module_request: - * Ability to trigger the kernel to automatically upcall to userspace for - * userspace to load a kernel module with the given name. - * @kmod_name name of the module requested by the kernel - * Return 0 if successful. - * @kernel_module_from_file: - * Load a kernel module from userspace. - * @file contains the file structure pointing to the file containing - * the kernel module to load. If the module is being loaded from a blob, - * this argument will be NULL. - * Return 0 if permission is granted. - * @task_fix_setuid: - * Update the module's state after setting one or more of the user - * identity attributes of the current process. The @flags parameter - * indicates which of the set*uid system calls invoked this hook. If - * @new is the set of credentials that will be installed. Modifications - * should be made to this rather than to @current->cred. - * @old is the set of credentials that are being replaces - * @flags contains one of the LSM_SETID_* values. - * Return 0 on success. - * @task_setpgid: - * Check permission before setting the process group identifier of the - * process @p to @pgid. - * @p contains the task_struct for process being modified. - * @pgid contains the new pgid. - * Return 0 if permission is granted. - * @task_getpgid: - * Check permission before getting the process group identifier of the - * process @p. - * @p contains the task_struct for the process. - * Return 0 if permission is granted. - * @task_getsid: - * Check permission before getting the session identifier of the process - * @p. - * @p contains the task_struct for the process. - * Return 0 if permission is granted. - * @task_getsecid: - * Retrieve the security identifier of the process @p. - * @p contains the task_struct for the process and place is into @secid. - * In case of failure, @secid will be set to zero. - * - * @task_setnice: - * Check permission before setting the nice value of @p to @nice. - * @p contains the task_struct of process. - * @nice contains the new nice value. - * Return 0 if permission is granted. - * @task_setioprio - * Check permission before setting the ioprio value of @p to @ioprio. - * @p contains the task_struct of process. - * @ioprio contains the new ioprio value - * Return 0 if permission is granted. - * @task_getioprio - * Check permission before getting the ioprio value of @p. - * @p contains the task_struct of process. - * Return 0 if permission is granted. - * @task_setrlimit: - * Check permission before setting the resource limits of the current - * process for @resource to @new_rlim. The old resource limit values can - * be examined by dereferencing (current->signal->rlim + resource). - * @resource contains the resource whose limit is being set. - * @new_rlim contains the new limits for @resource. - * Return 0 if permission is granted. - * @task_setscheduler: - * Check permission before setting scheduling policy and/or parameters of - * process @p based on @policy and @lp. - * @p contains the task_struct for process. - * @policy contains the scheduling policy. - * @lp contains the scheduling parameters. - * Return 0 if permission is granted. - * @task_getscheduler: - * Check permission before obtaining scheduling information for process - * @p. - * @p contains the task_struct for process. - * Return 0 if permission is granted. - * @task_movememory - * Check permission before moving memory owned by process @p. - * @p contains the task_struct for process. - * Return 0 if permission is granted. - * @task_kill: - * Check permission before sending signal @sig to @p. @info can be NULL, - * the constant 1, or a pointer to a siginfo structure. If @info is 1 or - * SI_FROMKERNEL(info) is true, then the signal should be viewed as coming - * from the kernel and should typically be permitted. - * SIGIO signals are handled separately by the send_sigiotask hook in - * file_security_ops. - * @p contains the task_struct for process. - * @info contains the signal information. - * @sig contains the signal value. - * @secid contains the sid of the process where the signal originated - * Return 0 if permission is granted. - * @task_wait: - * Check permission before allowing a process to reap a child process @p - * and collect its status information. - * @p contains the task_struct for process. - * Return 0 if permission is granted. - * @task_prctl: - * Check permission before performing a process control operation on the - * current process. - * @option contains the operation. - * @arg2 contains a argument. - * @arg3 contains a argument. - * @arg4 contains a argument. - * @arg5 contains a argument. - * Return -ENOSYS if no-one wanted to handle this op, any other value to - * cause prctl() to return immediately with that value. - * @task_to_inode: - * Set the security attributes for an inode based on an associated task's - * security attributes, e.g. for /proc/pid inodes. - * @p contains the task_struct for the task. - * @inode contains the inode structure for the inode. - * - * Security hooks for Netlink messaging. - * - * @netlink_send: - * Save security information for a netlink message so that permission - * checking can be performed when the message is processed. The security - * information can be saved using the eff_cap field of the - * netlink_skb_parms structure. Also may be used to provide fine - * grained control over message transmission. - * @sk associated sock of task sending the message. - * @skb contains the sk_buff structure for the netlink message. - * Return 0 if the information was successfully saved and message - * is allowed to be transmitted. - * - * Security hooks for Unix domain networking. - * - * @unix_stream_connect: - * Check permissions before establishing a Unix domain stream connection - * between @sock and @other. - * @sock contains the sock structure. - * @other contains the peer sock structure. - * @newsk contains the new sock structure. - * Return 0 if permission is granted. - * @unix_may_send: - * Check permissions before connecting or sending datagrams from @sock to - * @other. - * @sock contains the socket structure. - * @other contains the peer socket structure. - * Return 0 if permission is granted. - * - * The @unix_stream_connect and @unix_may_send hooks were necessary because - * Linux provides an alternative to the conventional file name space for Unix - * domain sockets. Whereas binding and connecting to sockets in the file name - * space is mediated by the typical file permissions (and caught by the mknod - * and permission hooks in inode_security_ops), binding and connecting to - * sockets in the abstract name space is completely unmediated. Sufficient - * control of Unix domain sockets in the abstract name space isn't possible - * using only the socket layer hooks, since we need to know the actual target - * socket, which is not looked up until we are inside the af_unix code. - * - * Security hooks for socket operations. - * - * @socket_create: - * Check permissions prior to creating a new socket. - * @family contains the requested protocol family. - * @type contains the requested communications type. - * @protocol contains the requested protocol. - * @kern set to 1 if a kernel socket. - * Return 0 if permission is granted. - * @socket_post_create: - * This hook allows a module to update or allocate a per-socket security - * structure. Note that the security field was not added directly to the - * socket structure, but rather, the socket security information is stored - * in the associated inode. Typically, the inode alloc_security hook will - * allocate and and attach security information to - * sock->inode->i_security. This hook may be used to update the - * sock->inode->i_security field with additional information that wasn't - * available when the inode was allocated. - * @sock contains the newly created socket structure. - * @family contains the requested protocol family. - * @type contains the requested communications type. - * @protocol contains the requested protocol. - * @kern set to 1 if a kernel socket. - * @socket_bind: - * Check permission before socket protocol layer bind operation is - * performed and the socket @sock is bound to the address specified in the - * @address parameter. - * @sock contains the socket structure. - * @address contains the address to bind to. - * @addrlen contains the length of address. - * Return 0 if permission is granted. - * @socket_connect: - * Check permission before socket protocol layer connect operation - * attempts to connect socket @sock to a remote address, @address. - * @sock contains the socket structure. - * @address contains the address of remote endpoint. - * @addrlen contains the length of address. - * Return 0 if permission is granted. - * @socket_listen: - * Check permission before socket protocol layer listen operation. - * @sock contains the socket structure. - * @backlog contains the maximum length for the pending connection queue. - * Return 0 if permission is granted. - * @socket_accept: - * Check permission before accepting a new connection. Note that the new - * socket, @newsock, has been created and some information copied to it, - * but the accept operation has not actually been performed. - * @sock contains the listening socket structure. - * @newsock contains the newly created server socket for connection. - * Return 0 if permission is granted. - * @socket_sendmsg: - * Check permission before transmitting a message to another socket. - * @sock contains the socket structure. - * @msg contains the message to be transmitted. - * @size contains the size of message. - * Return 0 if permission is granted. - * @socket_recvmsg: - * Check permission before receiving a message from a socket. - * @sock contains the socket structure. - * @msg contains the message structure. - * @size contains the size of message structure. - * @flags contains the operational flags. - * Return 0 if permission is granted. - * @socket_getsockname: - * Check permission before the local address (name) of the socket object - * @sock is retrieved. - * @sock contains the socket structure. - * Return 0 if permission is granted. - * @socket_getpeername: - * Check permission before the remote address (name) of a socket object - * @sock is retrieved. - * @sock contains the socket structure. - * Return 0 if permission is granted. - * @socket_getsockopt: - * Check permissions before retrieving the options associated with socket - * @sock. - * @sock contains the socket structure. - * @level contains the protocol level to retrieve option from. - * @optname contains the name of option to retrieve. - * Return 0 if permission is granted. - * @socket_setsockopt: - * Check permissions before setting the options associated with socket - * @sock. - * @sock contains the socket structure. - * @level contains the protocol level to set options for. - * @optname contains the name of the option to set. - * Return 0 if permission is granted. - * @socket_shutdown: - * Checks permission before all or part of a connection on the socket - * @sock is shut down. - * @sock contains the socket structure. - * @how contains the flag indicating how future sends and receives are handled. - * Return 0 if permission is granted. - * @socket_sock_rcv_skb: - * Check permissions on incoming network packets. This hook is distinct - * from Netfilter's IP input hooks since it is the first time that the - * incoming sk_buff @skb has been associated with a particular socket, @sk. - * Must not sleep inside this hook because some callers hold spinlocks. - * @sk contains the sock (not socket) associated with the incoming sk_buff. - * @skb contains the incoming network data. - * @socket_getpeersec_stream: - * This hook allows the security module to provide peer socket security - * state for unix or connected tcp sockets to userspace via getsockopt - * SO_GETPEERSEC. For tcp sockets this can be meaningful if the - * socket is associated with an ipsec SA. - * @sock is the local socket. - * @optval userspace memory where the security state is to be copied. - * @optlen userspace int where the module should copy the actual length - * of the security state. - * @len as input is the maximum length to copy to userspace provided - * by the caller. - * Return 0 if all is well, otherwise, typical getsockopt return - * values. - * @socket_getpeersec_dgram: - * This hook allows the security module to provide peer socket security - * state for udp sockets on a per-packet basis to userspace via - * getsockopt SO_GETPEERSEC. The application must first have indicated - * the IP_PASSSEC option via getsockopt. It can then retrieve the - * security state returned by this hook for a packet via the SCM_SECURITY - * ancillary message type. - * @skb is the skbuff for the packet being queried - * @secdata is a pointer to a buffer in which to copy the security data - * @seclen is the maximum length for @secdata - * Return 0 on success, error on failure. - * @sk_alloc_security: - * Allocate and attach a security structure to the sk->sk_security field, - * which is used to copy security attributes between local stream sockets. - * @sk_free_security: - * Deallocate security structure. - * @sk_clone_security: - * Clone/copy security structure. - * @sk_getsecid: - * Retrieve the LSM-specific secid for the sock to enable caching of network - * authorizations. - * @sock_graft: - * Sets the socket's isec sid to the sock's sid. - * @inet_conn_request: - * Sets the openreq's sid to socket's sid with MLS portion taken from peer sid. - * @inet_csk_clone: - * Sets the new child socket's sid to the openreq sid. - * @inet_conn_established: - * Sets the connection's peersid to the secmark on skb. - * @secmark_relabel_packet: - * check if the process should be allowed to relabel packets to the given secid - * @security_secmark_refcount_inc - * tells the LSM to increment the number of secmark labeling rules loaded - * @security_secmark_refcount_dec - * tells the LSM to decrement the number of secmark labeling rules loaded - * @req_classify_flow: - * Sets the flow's sid to the openreq sid. - * @tun_dev_alloc_security: - * This hook allows a module to allocate a security structure for a TUN - * device. - * @security pointer to a security structure pointer. - * Returns a zero on success, negative values on failure. - * @tun_dev_free_security: - * This hook allows a module to free the security structure for a TUN - * device. - * @security pointer to the TUN device's security structure - * @tun_dev_create: - * Check permissions prior to creating a new TUN device. - * @tun_dev_attach_queue: - * Check permissions prior to attaching to a TUN device queue. - * @security pointer to the TUN device's security structure. - * @tun_dev_attach: - * This hook can be used by the module to update any security state - * associated with the TUN device's sock structure. - * @sk contains the existing sock structure. - * @security pointer to the TUN device's security structure. - * @tun_dev_open: - * This hook can be used by the module to update any security state - * associated with the TUN device's security structure. - * @security pointer to the TUN devices's security structure. - * @skb_owned_by: - * This hook sets the packet's owning sock. - * @skb is the packet. - * @sk the sock which owns the packet. - * - * Security hooks for XFRM operations. - * - * @xfrm_policy_alloc_security: - * @ctxp is a pointer to the xfrm_sec_ctx being added to Security Policy - * Database used by the XFRM system. - * @sec_ctx contains the security context information being provided by - * the user-level policy update program (e.g., setkey). - * Allocate a security structure to the xp->security field; the security - * field is initialized to NULL when the xfrm_policy is allocated. - * Return 0 if operation was successful (memory to allocate, legal context) - * @gfp is to specify the context for the allocation - * @xfrm_policy_clone_security: - * @old_ctx contains an existing xfrm_sec_ctx. - * @new_ctxp contains a new xfrm_sec_ctx being cloned from old. - * Allocate a security structure in new_ctxp that contains the - * information from the old_ctx structure. - * Return 0 if operation was successful (memory to allocate). - * @xfrm_policy_free_security: - * @ctx contains the xfrm_sec_ctx - * Deallocate xp->security. - * @xfrm_policy_delete_security: - * @ctx contains the xfrm_sec_ctx. - * Authorize deletion of xp->security. - * @xfrm_state_alloc: - * @x contains the xfrm_state being added to the Security Association - * Database by the XFRM system. - * @sec_ctx contains the security context information being provided by - * the user-level SA generation program (e.g., setkey or racoon). - * Allocate a security structure to the x->security field; the security - * field is initialized to NULL when the xfrm_state is allocated. Set the - * context to correspond to sec_ctx. Return 0 if operation was successful - * (memory to allocate, legal context). - * @xfrm_state_alloc_acquire: - * @x contains the xfrm_state being added to the Security Association - * Database by the XFRM system. - * @polsec contains the policy's security context. - * @secid contains the secid from which to take the mls portion of the - * context. - * Allocate a security structure to the x->security field; the security - * field is initialized to NULL when the xfrm_state is allocated. Set the - * context to correspond to secid. Return 0 if operation was successful - * (memory to allocate, legal context). - * @xfrm_state_free_security: - * @x contains the xfrm_state. - * Deallocate x->security. - * @xfrm_state_delete_security: - * @x contains the xfrm_state. - * Authorize deletion of x->security. - * @xfrm_policy_lookup: - * @ctx contains the xfrm_sec_ctx for which the access control is being - * checked. - * @fl_secid contains the flow security label that is used to authorize - * access to the policy xp. - * @dir contains the direction of the flow (input or output). - * Check permission when a flow selects a xfrm_policy for processing - * XFRMs on a packet. The hook is called when selecting either a - * per-socket policy or a generic xfrm policy. - * Return 0 if permission is granted, -ESRCH otherwise, or -errno - * on other errors. - * @xfrm_state_pol_flow_match: - * @x contains the state to match. - * @xp contains the policy to check for a match. - * @fl contains the flow to check for a match. - * Return 1 if there is a match. - * @xfrm_decode_session: - * @skb points to skb to decode. - * @secid points to the flow key secid to set. - * @ckall says if all xfrms used should be checked for same secid. - * Return 0 if ckall is zero or all xfrms used have the same secid. - * - * Security hooks affecting all Key Management operations - * - * @key_alloc: - * Permit allocation of a key and assign security data. Note that key does - * not have a serial number assigned at this point. - * @key points to the key. - * @flags is the allocation flags - * Return 0 if permission is granted, -ve error otherwise. - * @key_free: - * Notification of destruction; free security data. - * @key points to the key. - * No return value. - * @key_permission: - * See whether a specific operational right is granted to a process on a - * key. - * @key_ref refers to the key (key pointer + possession attribute bit). - * @cred points to the credentials to provide the context against which to - * evaluate the security data on the key. - * @perm describes the combination of permissions required of this key. - * Return 0 if permission is granted, -ve error otherwise. - * @key_getsecurity: - * Get a textual representation of the security context attached to a key - * for the purposes of honouring KEYCTL_GETSECURITY. This function - * allocates the storage for the NUL-terminated string and the caller - * should free it. - * @key points to the key to be queried. - * @_buffer points to a pointer that should be set to point to the - * resulting string (if no label or an error occurs). - * Return the length of the string (including terminating NUL) or -ve if - * an error. - * May also return 0 (and a NULL buffer pointer) if there is no label. - * - * Security hooks affecting all System V IPC operations. - * - * @ipc_permission: - * Check permissions for access to IPC - * @ipcp contains the kernel IPC permission structure - * @flag contains the desired (requested) permission set - * Return 0 if permission is granted. - * @ipc_getsecid: - * Get the secid associated with the ipc object. - * @ipcp contains the kernel IPC permission structure. - * @secid contains a pointer to the location where result will be saved. - * In case of failure, @secid will be set to zero. - * - * Security hooks for individual messages held in System V IPC message queues - * @msg_msg_alloc_security: - * Allocate and attach a security structure to the msg->security field. - * The security field is initialized to NULL when the structure is first - * created. - * @msg contains the message structure to be modified. - * Return 0 if operation was successful and permission is granted. - * @msg_msg_free_security: - * Deallocate the security structure for this message. - * @msg contains the message structure to be modified. - * - * Security hooks for System V IPC Message Queues - * - * @msg_queue_alloc_security: - * Allocate and attach a security structure to the - * msq->q_perm.security field. The security field is initialized to - * NULL when the structure is first created. - * @msq contains the message queue structure to be modified. - * Return 0 if operation was successful and permission is granted. - * @msg_queue_free_security: - * Deallocate security structure for this message queue. - * @msq contains the message queue structure to be modified. - * @msg_queue_associate: - * Check permission when a message queue is requested through the - * msgget system call. This hook is only called when returning the - * message queue identifier for an existing message queue, not when a - * new message queue is created. - * @msq contains the message queue to act upon. - * @msqflg contains the operation control flags. - * Return 0 if permission is granted. - * @msg_queue_msgctl: - * Check permission when a message control operation specified by @cmd - * is to be performed on the message queue @msq. - * The @msq may be NULL, e.g. for IPC_INFO or MSG_INFO. - * @msq contains the message queue to act upon. May be NULL. - * @cmd contains the operation to be performed. - * Return 0 if permission is granted. - * @msg_queue_msgsnd: - * Check permission before a message, @msg, is enqueued on the message - * queue, @msq. - * @msq contains the message queue to send message to. - * @msg contains the message to be enqueued. - * @msqflg contains operational flags. - * Return 0 if permission is granted. - * @msg_queue_msgrcv: - * Check permission before a message, @msg, is removed from the message - * queue, @msq. The @target task structure contains a pointer to the - * process that will be receiving the message (not equal to the current - * process when inline receives are being performed). - * @msq contains the message queue to retrieve message from. - * @msg contains the message destination. - * @target contains the task structure for recipient process. - * @type contains the type of message requested. - * @mode contains the operational flags. - * Return 0 if permission is granted. - * - * Security hooks for System V Shared Memory Segments - * - * @shm_alloc_security: - * Allocate and attach a security structure to the shp->shm_perm.security - * field. The security field is initialized to NULL when the structure is - * first created. - * @shp contains the shared memory structure to be modified. - * Return 0 if operation was successful and permission is granted. - * @shm_free_security: - * Deallocate the security struct for this memory segment. - * @shp contains the shared memory structure to be modified. - * @shm_associate: - * Check permission when a shared memory region is requested through the - * shmget system call. This hook is only called when returning the shared - * memory region identifier for an existing region, not when a new shared - * memory region is created. - * @shp contains the shared memory structure to be modified. - * @shmflg contains the operation control flags. - * Return 0 if permission is granted. - * @shm_shmctl: - * Check permission when a shared memory control operation specified by - * @cmd is to be performed on the shared memory region @shp. - * The @shp may be NULL, e.g. for IPC_INFO or SHM_INFO. - * @shp contains shared memory structure to be modified. - * @cmd contains the operation to be performed. - * Return 0 if permission is granted. - * @shm_shmat: - * Check permissions prior to allowing the shmat system call to attach the - * shared memory segment @shp to the data segment of the calling process. - * The attaching address is specified by @shmaddr. - * @shp contains the shared memory structure to be modified. - * @shmaddr contains the address to attach memory region to. - * @shmflg contains the operational flags. - * Return 0 if permission is granted. - * - * Security hooks for System V Semaphores - * - * @sem_alloc_security: - * Allocate and attach a security structure to the sma->sem_perm.security - * field. The security field is initialized to NULL when the structure is - * first created. - * @sma contains the semaphore structure - * Return 0 if operation was successful and permission is granted. - * @sem_free_security: - * deallocate security struct for this semaphore - * @sma contains the semaphore structure. - * @sem_associate: - * Check permission when a semaphore is requested through the semget - * system call. This hook is only called when returning the semaphore - * identifier for an existing semaphore, not when a new one must be - * created. - * @sma contains the semaphore structure. - * @semflg contains the operation control flags. - * Return 0 if permission is granted. - * @sem_semctl: - * Check permission when a semaphore operation specified by @cmd is to be - * performed on the semaphore @sma. The @sma may be NULL, e.g. for - * IPC_INFO or SEM_INFO. - * @sma contains the semaphore structure. May be NULL. - * @cmd contains the operation to be performed. - * Return 0 if permission is granted. - * @sem_semop - * Check permissions before performing operations on members of the - * semaphore set @sma. If the @alter flag is nonzero, the semaphore set - * may be modified. - * @sma contains the semaphore structure. - * @sops contains the operations to perform. - * @nsops contains the number of operations to perform. - * @alter contains the flag indicating whether changes are to be made. - * Return 0 if permission is granted. - * - * @binder_set_context_mgr - * Check whether @mgr is allowed to be the binder context manager. - * @mgr contains the task_struct for the task being registered. - * Return 0 if permission is granted. - * @binder_transaction - * Check whether @from is allowed to invoke a binder transaction call - * to @to. - * @from contains the task_struct for the sending task. - * @to contains the task_struct for the receiving task. - * @binder_transfer_binder - * Check whether @from is allowed to transfer a binder reference to @to. - * @from contains the task_struct for the sending task. - * @to contains the task_struct for the receiving task. - * @binder_transfer_file - * Check whether @from is allowed to transfer @file to @to. - * @from contains the task_struct for the sending task. - * @file contains the struct file being transferred. - * @to contains the task_struct for the receiving task. - * - * @ptrace_access_check: - * Check permission before allowing the current process to trace the - * @child process. - * Security modules may also want to perform a process tracing check - * during an execve in the set_security or apply_creds hooks of - * tracing check during an execve in the bprm_set_creds hook of - * binprm_security_ops if the process is being traced and its security - * attributes would be changed by the execve. - * @child contains the task_struct structure for the target process. - * @mode contains the PTRACE_MODE flags indicating the form of access. - * Return 0 if permission is granted. - * @ptrace_traceme: - * Check that the @parent process has sufficient permission to trace the - * current process before allowing the current process to present itself - * to the @parent process for tracing. - * @parent contains the task_struct structure for debugger process. - * Return 0 if permission is granted. - * @capget: - * Get the @effective, @inheritable, and @permitted capability sets for - * the @target process. The hook may also perform permission checking to - * determine if the current process is allowed to see the capability sets - * of the @target process. - * @target contains the task_struct structure for target process. - * @effective contains the effective capability set. - * @inheritable contains the inheritable capability set. - * @permitted contains the permitted capability set. - * Return 0 if the capability sets were successfully obtained. - * @capset: - * Set the @effective, @inheritable, and @permitted capability sets for - * the current process. - * @new contains the new credentials structure for target process. - * @old contains the current credentials structure for target process. - * @effective contains the effective capability set. - * @inheritable contains the inheritable capability set. - * @permitted contains the permitted capability set. - * Return 0 and update @new if permission is granted. - * @capable: - * Check whether the @tsk process has the @cap capability in the indicated - * credentials. - * @cred contains the credentials to use. - * @ns contains the user namespace we want the capability in - * @cap contains the capability . - * @audit: Whether to write an audit message or not - * Return 0 if the capability is granted for @tsk. - * @syslog: - * Check permission before accessing the kernel message ring or changing - * logging to the console. - * See the syslog(2) manual page for an explanation of the @type values. - * @type contains the type of action. - * @from_file indicates the context of action (if it came from /proc). - * Return 0 if permission is granted. - * @settime: - * Check permission to change the system time. - * struct timespec and timezone are defined in include/linux/time.h - * @ts contains new time - * @tz contains new timezone - * Return 0 if permission is granted. - * @vm_enough_memory: - * Check permissions for allocating a new virtual mapping. - * @mm contains the mm struct it is being added to. - * @pages contains the number of pages. - * Return 0 if permission is granted. - * - * @ismaclabel: - * Check if the extended attribute specified by @name - * represents a MAC label. Returns 1 if name is a MAC - * attribute otherwise returns 0. - * @name full extended attribute name to check against - * LSM as a MAC label. - * - * @secid_to_secctx: - * Convert secid to security context. If secdata is NULL the length of - * the result will be returned in seclen, but no secdata will be returned. - * This does mean that the length could change between calls to check the - * length and the next call which actually allocates and returns the secdata. - * @secid contains the security ID. - * @secdata contains the pointer that stores the converted security context. - * @seclen pointer which contains the length of the data - * @secctx_to_secid: - * Convert security context to secid. - * @secid contains the pointer to the generated security ID. - * @secdata contains the security context. - * - * @release_secctx: - * Release the security context. - * @secdata contains the security context. - * @seclen contains the length of the security context. - * - * Security hooks for Audit - * - * @audit_rule_init: - * Allocate and initialize an LSM audit rule structure. - * @field contains the required Audit action. Fields flags are defined in include/linux/audit.h - * @op contains the operator the rule uses. - * @rulestr contains the context where the rule will be applied to. - * @lsmrule contains a pointer to receive the result. - * Return 0 if @lsmrule has been successfully set, - * -EINVAL in case of an invalid rule. - * - * @audit_rule_known: - * Specifies whether given @rule contains any fields related to current LSM. - * @rule contains the audit rule of interest. - * Return 1 in case of relation found, 0 otherwise. - * - * @audit_rule_match: - * Determine if given @secid matches a rule previously approved - * by @audit_rule_known. - * @secid contains the security id in question. - * @field contains the field which relates to current LSM. - * @op contains the operator that will be used for matching. - * @rule points to the audit rule that will be checked against. - * @actx points to the audit context associated with the check. - * Return 1 if secid matches the rule, 0 if it does not, -ERRNO on failure. - * - * @audit_rule_free: - * Deallocate the LSM audit rule structure previously allocated by - * audit_rule_init. - * @rule contains the allocated rule - * - * @inode_notifysecctx: - * Notify the security module of what the security context of an inode - * should be. Initializes the incore security context managed by the - * security module for this inode. Example usage: NFS client invokes - * this hook to initialize the security context in its incore inode to the - * value provided by the server for the file when the server returned the - * file's attributes to the client. - * - * Must be called with inode->i_mutex locked. - * - * @inode we wish to set the security context of. - * @ctx contains the string which we wish to set in the inode. - * @ctxlen contains the length of @ctx. - * - * @inode_setsecctx: - * Change the security context of an inode. Updates the - * incore security context managed by the security module and invokes the - * fs code as needed (via __vfs_setxattr_noperm) to update any backing - * xattrs that represent the context. Example usage: NFS server invokes - * this hook to change the security context in its incore inode and on the - * backing filesystem to a value provided by the client on a SETATTR - * operation. - * - * Must be called with inode->i_mutex locked. - * - * @dentry contains the inode we wish to set the security context of. - * @ctx contains the string which we wish to set in the inode. - * @ctxlen contains the length of @ctx. - * - * @inode_getsecctx: - * On success, returns 0 and fills out @ctx and @ctxlen with the security - * context for the given @inode. - * - * @inode we wish to get the security context of. - * @ctx is a pointer in which to place the allocated security context. - * @ctxlen points to the place to put the length of @ctx. - * This is the main security structure. - */ - /* prototypes */ extern int security_init(void); -- cgit v1.2.3 From e20b043a6902ecb61c2c84355c3bae5149f391db Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 2 May 2015 15:11:36 -0700 Subject: LSM: Add security module hook list heads Add a list header for each security hook. They aren't used until later in the patch series. They are grouped together in a structure so that there doesn't need to be an external address for each. Macro-ize the initialization of the security_operations for each security module in anticipation of changing out the security_operations structure. Signed-off-by: Casey Schaufler Acked-by: John Johansen Acked-by: Kees Cook Acked-by: Paul Moore Acked-by: Stephen Smalley Acked-by: Tetsuo Handa Signed-off-by: James Morris --- include/linux/lsm_hooks.h | 220 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 220 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index b4c91de510c2..27dd6fcacccc 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1626,6 +1626,226 @@ struct security_operations { #endif /* CONFIG_AUDIT */ }; +struct security_hook_heads { + struct list_head binder_set_context_mgr; + struct list_head binder_transaction; + struct list_head binder_transfer_binder; + struct list_head binder_transfer_file; + struct list_head ptrace_access_check; + struct list_head ptrace_traceme; + struct list_head capget; + struct list_head capset; + struct list_head capable; + struct list_head quotactl; + struct list_head quota_on; + struct list_head syslog; + struct list_head settime; + struct list_head vm_enough_memory; + struct list_head bprm_set_creds; + struct list_head bprm_check_security; + struct list_head bprm_secureexec; + struct list_head bprm_committing_creds; + struct list_head bprm_committed_creds; + struct list_head sb_alloc_security; + struct list_head sb_free_security; + struct list_head sb_copy_data; + struct list_head sb_remount; + struct list_head sb_kern_mount; + struct list_head sb_show_options; + struct list_head sb_statfs; + struct list_head sb_mount; + struct list_head sb_umount; + struct list_head sb_pivotroot; + struct list_head sb_set_mnt_opts; + struct list_head sb_clone_mnt_opts; + struct list_head sb_parse_opts_str; + struct list_head dentry_init_security; +#ifdef CONFIG_SECURITY_PATH + struct list_head path_unlink; + struct list_head path_mkdir; + struct list_head path_rmdir; + struct list_head path_mknod; + struct list_head path_truncate; + struct list_head path_symlink; + struct list_head path_link; + struct list_head path_rename; + struct list_head path_chmod; + struct list_head path_chown; + struct list_head path_chroot; +#endif + struct list_head inode_alloc_security; + struct list_head inode_free_security; + struct list_head inode_init_security; + struct list_head inode_create; + struct list_head inode_link; + struct list_head inode_unlink; + struct list_head inode_symlink; + struct list_head inode_mkdir; + struct list_head inode_rmdir; + struct list_head inode_mknod; + struct list_head inode_rename; + struct list_head inode_readlink; + struct list_head inode_follow_link; + struct list_head inode_permission; + struct list_head inode_setattr; + struct list_head inode_getattr; + struct list_head inode_setxattr; + struct list_head inode_post_setxattr; + struct list_head inode_getxattr; + struct list_head inode_listxattr; + struct list_head inode_removexattr; + struct list_head inode_need_killpriv; + struct list_head inode_killpriv; + struct list_head inode_getsecurity; + struct list_head inode_setsecurity; + struct list_head inode_listsecurity; + struct list_head inode_getsecid; + struct list_head file_permission; + struct list_head file_alloc_security; + struct list_head file_free_security; + struct list_head file_ioctl; + struct list_head mmap_addr; + struct list_head mmap_file; + struct list_head file_mprotect; + struct list_head file_lock; + struct list_head file_fcntl; + struct list_head file_set_fowner; + struct list_head file_send_sigiotask; + struct list_head file_receive; + struct list_head file_open; + struct list_head task_create; + struct list_head task_free; + struct list_head cred_alloc_blank; + struct list_head cred_free; + struct list_head cred_prepare; + struct list_head cred_transfer; + struct list_head kernel_act_as; + struct list_head kernel_create_files_as; + struct list_head kernel_fw_from_file; + struct list_head kernel_module_request; + struct list_head kernel_module_from_file; + struct list_head task_fix_setuid; + struct list_head task_setpgid; + struct list_head task_getpgid; + struct list_head task_getsid; + struct list_head task_getsecid; + struct list_head task_setnice; + struct list_head task_setioprio; + struct list_head task_getioprio; + struct list_head task_setrlimit; + struct list_head task_setscheduler; + struct list_head task_getscheduler; + struct list_head task_movememory; + struct list_head task_kill; + struct list_head task_wait; + struct list_head task_prctl; + struct list_head task_to_inode; + struct list_head ipc_permission; + struct list_head ipc_getsecid; + struct list_head msg_msg_alloc_security; + struct list_head msg_msg_free_security; + struct list_head msg_queue_alloc_security; + struct list_head msg_queue_free_security; + struct list_head msg_queue_associate; + struct list_head msg_queue_msgctl; + struct list_head msg_queue_msgsnd; + struct list_head msg_queue_msgrcv; + struct list_head shm_alloc_security; + struct list_head shm_free_security; + struct list_head shm_associate; + struct list_head shm_shmctl; + struct list_head shm_shmat; + struct list_head sem_alloc_security; + struct list_head sem_free_security; + struct list_head sem_associate; + struct list_head sem_semctl; + struct list_head sem_semop; + struct list_head netlink_send; + struct list_head d_instantiate; + struct list_head getprocattr; + struct list_head setprocattr; + struct list_head ismaclabel; + struct list_head secid_to_secctx; + struct list_head secctx_to_secid; + struct list_head release_secctx; + struct list_head inode_notifysecctx; + struct list_head inode_setsecctx; + struct list_head inode_getsecctx; +#ifdef CONFIG_SECURITY_NETWORK + struct list_head unix_stream_connect; + struct list_head unix_may_send; + struct list_head socket_create; + struct list_head socket_post_create; + struct list_head socket_bind; + struct list_head socket_connect; + struct list_head socket_listen; + struct list_head socket_accept; + struct list_head socket_sendmsg; + struct list_head socket_recvmsg; + struct list_head socket_getsockname; + struct list_head socket_getpeername; + struct list_head socket_getsockopt; + struct list_head socket_setsockopt; + struct list_head socket_shutdown; + struct list_head socket_sock_rcv_skb; + struct list_head socket_getpeersec_stream; + struct list_head socket_getpeersec_dgram; + struct list_head sk_alloc_security; + struct list_head sk_free_security; + struct list_head sk_clone_security; + struct list_head sk_getsecid; + struct list_head sock_graft; + struct list_head inet_conn_request; + struct list_head inet_csk_clone; + struct list_head inet_conn_established; + struct list_head secmark_relabel_packet; + struct list_head secmark_refcount_inc; + struct list_head secmark_refcount_dec; + struct list_head req_classify_flow; + struct list_head tun_dev_alloc_security; + struct list_head tun_dev_free_security; + struct list_head tun_dev_create; + struct list_head tun_dev_attach_queue; + struct list_head tun_dev_attach; + struct list_head tun_dev_open; + struct list_head skb_owned_by; +#endif /* CONFIG_SECURITY_NETWORK */ +#ifdef CONFIG_SECURITY_NETWORK_XFRM + struct list_head xfrm_policy_alloc_security; + struct list_head xfrm_policy_clone_security; + struct list_head xfrm_policy_free_security; + struct list_head xfrm_policy_delete_security; + struct list_head xfrm_state_alloc; + struct list_head xfrm_state_alloc_acquire; + struct list_head xfrm_state_free_security; + struct list_head xfrm_state_delete_security; + struct list_head xfrm_policy_lookup; + struct list_head xfrm_state_pol_flow_match; + struct list_head xfrm_decode_session; +#endif /* CONFIG_SECURITY_NETWORK_XFRM */ +#ifdef CONFIG_KEYS + struct list_head key_alloc; + struct list_head key_free; + struct list_head key_permission; + struct list_head key_getsecurity; +#endif /* CONFIG_KEYS */ +#ifdef CONFIG_AUDIT + struct list_head audit_rule_init; + struct list_head audit_rule_known; + struct list_head audit_rule_match; + struct list_head audit_rule_free; +#endif /* CONFIG_AUDIT */ +}; + +/* + * Initializing a security_hook_list structure takes + * up a lot of space in a source file. This macro takes + * care of the common case and reduces the amount of + * text involved. + * Casey says: Comment is true in the next patch. + */ +#define LSM_HOOK_INIT(HEAD, HOOK) .HEAD = HOOK + /* prototypes */ extern int security_module_enable(struct security_operations *ops); extern int register_security(struct security_operations *ops); -- cgit v1.2.3 From b1d9e6b0646d0e5ee5d9050bd236b6c65d66faef Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 2 May 2015 15:11:42 -0700 Subject: LSM: Switch to lists of hooks Instead of using a vector of security operations with explicit, special case stacking of the capability and yama hooks use lists of hooks with capability and yama hooks included as appropriate. The security_operations structure is no longer required. Instead, there is a union of the function pointers that allows all the hooks lists to use a common mechanism for list management while retaining typing. Each module supplies an array describing the hooks it provides instead of a sparsely populated security_operations structure. The description includes the element that gets put on the hook list, avoiding the issues surrounding individual element allocation. The method for registering security modules is changed to reflect the information available. The method for removing a module, currently only used by SELinux, has also changed. It should be generic now, however if there are potential race conditions based on ordering of hook removal that needs to be addressed by the calling module. The security hooks are called from the lists and the first failure is returned. Signed-off-by: Casey Schaufler Acked-by: John Johansen Acked-by: Kees Cook Acked-by: Paul Moore Acked-by: Stephen Smalley Acked-by: Tetsuo Handa Signed-off-by: James Morris --- include/linux/lsm_hooks.h | 77 ++++++++++++++++++++++++++++++++--------------- include/linux/security.h | 46 +++------------------------- 2 files changed, 57 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 27dd6fcacccc..f014f2596e22 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -25,21 +25,10 @@ #define __LINUX_LSM_HOOKS_H #include - -/* Maximum number of letters for an LSM name string */ -#define SECURITY_NAME_MAX 10 - -#ifdef CONFIG_SECURITY +#include +#include /** - * struct security_operations - main security structure - * - * Security module identifier. - * - * @name: - * A string that acts as a unique identifier for the LSM with max number - * of characters = SECURITY_NAME_MAX. - * * Security hooks for program execution operations. * * @bprm_set_creds: @@ -1310,9 +1299,7 @@ * This is the main security structure. */ -struct security_operations { - char name[SECURITY_NAME_MAX + 1]; - +union security_list_options { int (*binder_set_context_mgr)(struct task_struct *mgr); int (*binder_transaction)(struct task_struct *from, struct task_struct *to); @@ -1837,21 +1824,63 @@ struct security_hook_heads { #endif /* CONFIG_AUDIT */ }; +/* + * Security module hook list structure. + * For use with generic list macros for common operations. + */ +struct security_hook_list { + struct list_head list; + struct list_head *head; + union security_list_options hook; +}; + /* * Initializing a security_hook_list structure takes * up a lot of space in a source file. This macro takes * care of the common case and reduces the amount of * text involved. - * Casey says: Comment is true in the next patch. */ -#define LSM_HOOK_INIT(HEAD, HOOK) .HEAD = HOOK +#define LSM_HOOK_INIT(HEAD, HOOK) \ + { .head = &security_hook_heads.HEAD, .hook = { .HEAD = HOOK } } + +extern struct security_hook_heads security_hook_heads; + +static inline void security_add_hooks(struct security_hook_list *hooks, + int count) +{ + int i; -/* prototypes */ -extern int security_module_enable(struct security_operations *ops); -extern int register_security(struct security_operations *ops); -extern void __init security_fixup_ops(struct security_operations *ops); -extern void reset_security_ops(void); + for (i = 0; i < count; i++) + list_add_tail_rcu(&hooks[i].list, hooks[i].head); +} -#endif /* CONFIG_SECURITY */ +#ifdef CONFIG_SECURITY_SELINUX_DISABLE +/* + * Assuring the safety of deleting a security module is up to + * the security module involved. This may entail ordering the + * module's hook list in a particular way, refusing to disable + * the module once a policy is loaded or any number of other + * actions better imagined than described. + * + * The name of the configuration option reflects the only module + * that currently uses the mechanism. Any developer who thinks + * disabling their module is a good idea needs to be at least as + * careful as the SELinux team. + */ +static inline void security_delete_hooks(struct security_hook_list *hooks, + int count) +{ + int i; + + for (i = 0; i < count; i++) + list_del_rcu(&hooks[i].list); +} +#endif /* CONFIG_SECURITY_SELINUX_DISABLE */ + +extern int __init security_module_enable(const char *module); +extern void __init capability_add_hooks(void); +#ifdef CONFIG_SECURITY_YAMA_STACKED +void __init yama_add_hooks(void); +#endif #endif /* ! __LINUX_LSM_HOOKS_H */ diff --git a/include/linux/security.h b/include/linux/security.h index a2a100e7ac6e..8c8175d41b4c 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -27,6 +27,7 @@ #include #include #include +#include struct linux_binprm; struct cred; @@ -54,9 +55,6 @@ struct xattr; struct xfrm_sec_ctx; struct mm_struct; -/* Maximum number of letters for an LSM name string */ -#define SECURITY_NAME_MAX 10 - /* If capable should audit the security request */ #define SECURITY_CAP_NOAUDIT 0 #define SECURITY_CAP_AUDIT 1 @@ -69,10 +67,7 @@ struct audit_krule; struct user_namespace; struct timezone; -/* - * These functions are in security/capability.c and are used - * as the default capabilities functions - */ +/* These functions are in security/commoncap.c */ extern int cap_capable(const struct cred *cred, struct user_namespace *ns, int cap, int audit); extern int cap_settime(const struct timespec *ts, const struct timezone *tz); @@ -114,8 +109,6 @@ struct xfrm_state; struct xfrm_user_sec_ctx; struct seq_file; -extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb); - #ifdef CONFIG_MMU extern unsigned long mmap_min_addr; extern unsigned long dac_mmap_min_addr; @@ -472,7 +465,7 @@ static inline int security_settime(const struct timespec *ts, static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) { - return cap_vm_enough_memory(mm, pages); + return __vm_enough_memory(mm, pages, cap_vm_enough_memory(mm, pages)); } static inline int security_bprm_set_creds(struct linux_binprm *bprm) @@ -1075,7 +1068,7 @@ static inline int security_setprocattr(struct task_struct *p, char *name, void * static inline int security_netlink_send(struct sock *sk, struct sk_buff *skb) { - return cap_netlink_send(sk, skb); + return 0; } static inline int security_ismaclabel(const char *name) @@ -1643,36 +1636,5 @@ static inline void free_secdata(void *secdata) { } #endif /* CONFIG_SECURITY */ -#ifdef CONFIG_SECURITY_YAMA -extern int yama_ptrace_access_check(struct task_struct *child, - unsigned int mode); -extern int yama_ptrace_traceme(struct task_struct *parent); -extern void yama_task_free(struct task_struct *task); -extern int yama_task_prctl(int option, unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5); -#else -static inline int yama_ptrace_access_check(struct task_struct *child, - unsigned int mode) -{ - return 0; -} - -static inline int yama_ptrace_traceme(struct task_struct *parent) -{ - return 0; -} - -static inline void yama_task_free(struct task_struct *task) -{ -} - -static inline int yama_task_prctl(int option, unsigned long arg2, - unsigned long arg3, unsigned long arg4, - unsigned long arg5) -{ - return -ENOSYS; -} -#endif /* CONFIG_SECURITY_YAMA */ - #endif /* ! __LINUX_SECURITY_H */ -- cgit v1.2.3 From 755a27e7e4c817dd51ade41668b380f26026899c Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Sun, 3 May 2015 18:18:02 +0800 Subject: tracing: remove unused ftrace_output_event() prototype The prototype of ftrace_output_event was added by commit 1d6bae966e90 ("tracing: Move raw output code from macro to standalone function") but this function was not defined anywhere, and is still nowhere to be found. Link: http://lkml.kernel.org/r/1430648282-25792-1-git-send-email-nicolas.iooss_linux@m4x.org Signed-off-by: Nicolas Iooss Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index f9ecf63d47f1..65ce6de91307 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -219,9 +219,6 @@ struct ftrace_event_class { extern int ftrace_event_reg(struct ftrace_event_call *event, enum trace_reg type, void *data); -int ftrace_output_event(struct trace_iterator *iter, struct ftrace_event_call *event, - char *fmt, ...); - int ftrace_event_define_field(struct ftrace_event_call *call, char *type, int len, char *item, int offset, int field_size, int sign, int filter); -- cgit v1.2.3 From 9abdffe286c1532a54d5aee31571d3029be4026c Mon Sep 17 00:00:00 2001 From: Sumit Semwal Date: Tue, 5 May 2015 14:56:15 +0530 Subject: dma-buf: add ref counting for module as exporter Add reference counting on a kernel module that exports dma-buf and implements its operations. This prevents the module from being unloaded while DMABUF file is in use. The original patch [1] was submitted by Tomasz Stanislawski, but this is a simpler way to do it. v3: call module_put() as late as possible, per gregkh's comment. v2: move owner to struct dma_buf, and use DEFINE_DMA_BUF_EXPORT_INFO macro to simplify the change. Acked-by: Greg Kroah-Hartman Signed-off-by: Sumit Semwal [1]: https://lkml.org/lkml/2012/8/8/163 --- include/linux/dma-buf.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 2f0b431b73e0..f98bd7068d55 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -115,6 +115,8 @@ struct dma_buf_ops { * @attachments: list of dma_buf_attachment that denotes all devices attached. * @ops: dma_buf_ops associated with this buffer object. * @exp_name: name of the exporter; useful for debugging. + * @owner: pointer to exporter module; used for refcounting when exporter is a + * kernel module. * @list_node: node for dma_buf accounting and debugging. * @priv: exporter specific private data for this buffer object. * @resv: reservation object linked to this dma-buf @@ -129,6 +131,7 @@ struct dma_buf { unsigned vmapping_counter; void *vmap_ptr; const char *exp_name; + struct module *owner; struct list_head list_node; void *priv; struct reservation_object *resv; @@ -164,7 +167,8 @@ struct dma_buf_attachment { /** * struct dma_buf_export_info - holds information needed to export a dma_buf - * @exp_name: name of the exporting module - useful for debugging. + * @exp_name: name of the exporter - useful for debugging. + * @owner: pointer to exporter module - used for refcounting kernel module * @ops: Attach allocator-defined dma buf ops to the new buffer * @size: Size of the buffer * @flags: mode flags for the file @@ -176,6 +180,7 @@ struct dma_buf_attachment { */ struct dma_buf_export_info { const char *exp_name; + struct module *owner; const struct dma_buf_ops *ops; size_t size; int flags; @@ -187,7 +192,8 @@ struct dma_buf_export_info { * helper macro for exporters; zeros and fills in most common values */ #define DEFINE_DMA_BUF_EXPORT_INFO(a) \ - struct dma_buf_export_info a = { .exp_name = KBUILD_MODNAME } + struct dma_buf_export_info a = { .exp_name = KBUILD_MODNAME, \ + .owner = THIS_MODULE } /** * get_dma_buf - convenience wrapper for get_file. -- cgit v1.2.3 From 42c86547f4e5c2e81616c76ce9a2badce515c41f Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Wed, 11 Mar 2015 11:34:25 +0100 Subject: clk: Expose clk_hw_reparent() to providers To be used by clock implementations for switching to a new parent during rate change. Signed-off-by: Tomeu Vizoso Signed-off-by: Thierry Reding --- include/linux/clk-provider.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index df695313f975..51efb9ec3e37 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -589,6 +589,7 @@ long __clk_mux_determine_rate_closest(struct clk_hw *hw, unsigned long rate, unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_p); +void clk_hw_reparent(struct clk_hw *hw, struct clk_hw *new_parent); static inline void __clk_hw_set_clk(struct clk_hw *dst, struct clk_hw *src) { -- cgit v1.2.3 From 289fcff4bcdb1dcc0ce8788b7ea0f58a9e4a495f Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Wed, 13 May 2015 15:26:42 +0300 Subject: usb: add bus type for USB ULPI UTMI+ Low Pin Interface (ULPI) is a commonly used PHY interface for USB 2.0. The ULPI specification describes a standard set of registers which the vendors can extend for their specific needs. ULPI PHYs provide often functions such as charger detection and ADP sensing and probing. There are two major issues that the bus type is meant to tackle: Firstly, ULPI registers are accessed from the controller. The bus provides convenient method for the controller drivers to share that access with the actual PHY drivers. Secondly, there are already platforms that assume ULPI PHYs are runtime detected, such as many Intel Baytrail based platforms. They do not provide any kind of hardware description for the ULPI PHYs like separate ACPI device object that could be used to enumerate a device from. Signed-off-by: Heikki Krogerus Acked-by: David Cohen Signed-off-by: Felipe Balbi --- include/linux/mod_devicetable.h | 6 ++ include/linux/ulpi/driver.h | 60 ++++++++++++++++++ include/linux/ulpi/interface.h | 23 +++++++ include/linux/ulpi/regs.h | 130 ++++++++++++++++++++++++++++++++++++++ include/linux/usb/ulpi.h | 134 +--------------------------------------- 5 files changed, 221 insertions(+), 132 deletions(-) create mode 100644 include/linux/ulpi/driver.h create mode 100644 include/linux/ulpi/interface.h create mode 100644 include/linux/ulpi/regs.h (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 3bfd56778c29..7ab00d61d30a 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -629,4 +629,10 @@ struct mcb_device_id { kernel_ulong_t driver_data; }; +struct ulpi_device_id { + __u16 vendor; + __u16 product; + kernel_ulong_t driver_data; +}; + #endif /* LINUX_MOD_DEVICETABLE_H */ diff --git a/include/linux/ulpi/driver.h b/include/linux/ulpi/driver.h new file mode 100644 index 000000000000..388f6e08b9d4 --- /dev/null +++ b/include/linux/ulpi/driver.h @@ -0,0 +1,60 @@ +#ifndef __LINUX_ULPI_DRIVER_H +#define __LINUX_ULPI_DRIVER_H + +#include + +#include + +struct ulpi_ops; + +/** + * struct ulpi - describes ULPI PHY device + * @id: vendor and product ids for ULPI device + * @ops: I/O access + * @dev: device interface + */ +struct ulpi { + struct ulpi_device_id id; + struct ulpi_ops *ops; + struct device dev; +}; + +#define to_ulpi_dev(d) container_of(d, struct ulpi, dev) + +static inline void ulpi_set_drvdata(struct ulpi *ulpi, void *data) +{ + dev_set_drvdata(&ulpi->dev, data); +} + +static inline void *ulpi_get_drvdata(struct ulpi *ulpi) +{ + return dev_get_drvdata(&ulpi->dev); +} + +/** + * struct ulpi_driver - describes a ULPI PHY driver + * @id_table: array of device identifiers supported by this driver + * @probe: binds this driver to ULPI device + * @remove: unbinds this driver from ULPI device + * @driver: the name and owner members must be initialized by the drivers + */ +struct ulpi_driver { + const struct ulpi_device_id *id_table; + int (*probe)(struct ulpi *ulpi); + void (*remove)(struct ulpi *ulpi); + struct device_driver driver; +}; + +#define to_ulpi_driver(d) container_of(d, struct ulpi_driver, driver) + +int ulpi_register_driver(struct ulpi_driver *drv); +void ulpi_unregister_driver(struct ulpi_driver *drv); + +#define module_ulpi_driver(__ulpi_driver) \ + module_driver(__ulpi_driver, ulpi_register_driver, \ + ulpi_unregister_driver) + +int ulpi_read(struct ulpi *ulpi, u8 addr); +int ulpi_write(struct ulpi *ulpi, u8 addr, u8 val); + +#endif /* __LINUX_ULPI_DRIVER_H */ diff --git a/include/linux/ulpi/interface.h b/include/linux/ulpi/interface.h new file mode 100644 index 000000000000..4de8ab491038 --- /dev/null +++ b/include/linux/ulpi/interface.h @@ -0,0 +1,23 @@ +#ifndef __LINUX_ULPI_INTERFACE_H +#define __LINUX_ULPI_INTERFACE_H + +#include + +struct ulpi; + +/** + * struct ulpi_ops - ULPI register access + * @dev: the interface provider + * @read: read operation for ULPI register access + * @write: write operation for ULPI register access + */ +struct ulpi_ops { + struct device *dev; + int (*read)(struct ulpi_ops *ops, u8 addr); + int (*write)(struct ulpi_ops *ops, u8 addr, u8 val); +}; + +struct ulpi *ulpi_register_interface(struct device *, struct ulpi_ops *); +void ulpi_unregister_interface(struct ulpi *); + +#endif /* __LINUX_ULPI_INTERFACE_H */ diff --git a/include/linux/ulpi/regs.h b/include/linux/ulpi/regs.h new file mode 100644 index 000000000000..b5b8b8804560 --- /dev/null +++ b/include/linux/ulpi/regs.h @@ -0,0 +1,130 @@ +#ifndef __LINUX_ULPI_REGS_H +#define __LINUX_ULPI_REGS_H + +/* + * Macros for Set and Clear + * See ULPI 1.1 specification to find the registers with Set and Clear offsets + */ +#define ULPI_SET(a) (a + 1) +#define ULPI_CLR(a) (a + 2) + +/* + * Register Map + */ +#define ULPI_VENDOR_ID_LOW 0x00 +#define ULPI_VENDOR_ID_HIGH 0x01 +#define ULPI_PRODUCT_ID_LOW 0x02 +#define ULPI_PRODUCT_ID_HIGH 0x03 +#define ULPI_FUNC_CTRL 0x04 +#define ULPI_IFC_CTRL 0x07 +#define ULPI_OTG_CTRL 0x0a +#define ULPI_USB_INT_EN_RISE 0x0d +#define ULPI_USB_INT_EN_FALL 0x10 +#define ULPI_USB_INT_STS 0x13 +#define ULPI_USB_INT_LATCH 0x14 +#define ULPI_DEBUG 0x15 +#define ULPI_SCRATCH 0x16 +/* Optional Carkit Registers */ +#define ULPI_CARKIT_CTRL 0x19 +#define ULPI_CARKIT_INT_DELAY 0x1c +#define ULPI_CARKIT_INT_EN 0x1d +#define ULPI_CARKIT_INT_STS 0x20 +#define ULPI_CARKIT_INT_LATCH 0x21 +#define ULPI_CARKIT_PLS_CTRL 0x22 +/* Other Optional Registers */ +#define ULPI_TX_POS_WIDTH 0x25 +#define ULPI_TX_NEG_WIDTH 0x26 +#define ULPI_POLARITY_RECOVERY 0x27 +/* Access Extended Register Set */ +#define ULPI_ACCESS_EXTENDED 0x2f +/* Vendor Specific */ +#define ULPI_VENDOR_SPECIFIC 0x30 +/* Extended Registers */ +#define ULPI_EXT_VENDOR_SPECIFIC 0x80 + +/* + * Register Bits + */ + +/* Function Control */ +#define ULPI_FUNC_CTRL_XCVRSEL BIT(0) +#define ULPI_FUNC_CTRL_XCVRSEL_MASK 0x3 +#define ULPI_FUNC_CTRL_HIGH_SPEED 0x0 +#define ULPI_FUNC_CTRL_FULL_SPEED 0x1 +#define ULPI_FUNC_CTRL_LOW_SPEED 0x2 +#define ULPI_FUNC_CTRL_FS4LS 0x3 +#define ULPI_FUNC_CTRL_TERMSELECT BIT(2) +#define ULPI_FUNC_CTRL_OPMODE BIT(3) +#define ULPI_FUNC_CTRL_OPMODE_MASK (0x3 << 3) +#define ULPI_FUNC_CTRL_OPMODE_NORMAL (0x0 << 3) +#define ULPI_FUNC_CTRL_OPMODE_NONDRIVING (0x1 << 3) +#define ULPI_FUNC_CTRL_OPMODE_DISABLE_NRZI (0x2 << 3) +#define ULPI_FUNC_CTRL_OPMODE_NOSYNC_NOEOP (0x3 << 3) +#define ULPI_FUNC_CTRL_RESET BIT(5) +#define ULPI_FUNC_CTRL_SUSPENDM BIT(6) + +/* Interface Control */ +#define ULPI_IFC_CTRL_6_PIN_SERIAL_MODE BIT(0) +#define ULPI_IFC_CTRL_3_PIN_SERIAL_MODE BIT(1) +#define ULPI_IFC_CTRL_CARKITMODE BIT(2) +#define ULPI_IFC_CTRL_CLOCKSUSPENDM BIT(3) +#define ULPI_IFC_CTRL_AUTORESUME BIT(4) +#define ULPI_IFC_CTRL_EXTERNAL_VBUS BIT(5) +#define ULPI_IFC_CTRL_PASSTHRU BIT(6) +#define ULPI_IFC_CTRL_PROTECT_IFC_DISABLE BIT(7) + +/* OTG Control */ +#define ULPI_OTG_CTRL_ID_PULLUP BIT(0) +#define ULPI_OTG_CTRL_DP_PULLDOWN BIT(1) +#define ULPI_OTG_CTRL_DM_PULLDOWN BIT(2) +#define ULPI_OTG_CTRL_DISCHRGVBUS BIT(3) +#define ULPI_OTG_CTRL_CHRGVBUS BIT(4) +#define ULPI_OTG_CTRL_DRVVBUS BIT(5) +#define ULPI_OTG_CTRL_DRVVBUS_EXT BIT(6) +#define ULPI_OTG_CTRL_EXTVBUSIND BIT(7) + +/* USB Interrupt Enable Rising, + * USB Interrupt Enable Falling, + * USB Interrupt Status and + * USB Interrupt Latch + */ +#define ULPI_INT_HOST_DISCONNECT BIT(0) +#define ULPI_INT_VBUS_VALID BIT(1) +#define ULPI_INT_SESS_VALID BIT(2) +#define ULPI_INT_SESS_END BIT(3) +#define ULPI_INT_IDGRD BIT(4) + +/* Debug */ +#define ULPI_DEBUG_LINESTATE0 BIT(0) +#define ULPI_DEBUG_LINESTATE1 BIT(1) + +/* Carkit Control */ +#define ULPI_CARKIT_CTRL_CARKITPWR BIT(0) +#define ULPI_CARKIT_CTRL_IDGNDDRV BIT(1) +#define ULPI_CARKIT_CTRL_TXDEN BIT(2) +#define ULPI_CARKIT_CTRL_RXDEN BIT(3) +#define ULPI_CARKIT_CTRL_SPKLEFTEN BIT(4) +#define ULPI_CARKIT_CTRL_SPKRIGHTEN BIT(5) +#define ULPI_CARKIT_CTRL_MICEN BIT(6) + +/* Carkit Interrupt Enable */ +#define ULPI_CARKIT_INT_EN_IDFLOAT_RISE BIT(0) +#define ULPI_CARKIT_INT_EN_IDFLOAT_FALL BIT(1) +#define ULPI_CARKIT_INT_EN_CARINTDET BIT(2) +#define ULPI_CARKIT_INT_EN_DP_RISE BIT(3) +#define ULPI_CARKIT_INT_EN_DP_FALL BIT(4) + +/* Carkit Interrupt Status and + * Carkit Interrupt Latch + */ +#define ULPI_CARKIT_INT_IDFLOAT BIT(0) +#define ULPI_CARKIT_INT_CARINTDET BIT(1) +#define ULPI_CARKIT_INT_DP BIT(2) + +/* Carkit Pulse Control*/ +#define ULPI_CARKIT_PLS_CTRL_TXPLSEN BIT(0) +#define ULPI_CARKIT_PLS_CTRL_RXPLSEN BIT(1) +#define ULPI_CARKIT_PLS_CTRL_SPKRLEFT_BIASEN BIT(2) +#define ULPI_CARKIT_PLS_CTRL_SPKRRIGHT_BIASEN BIT(3) + +#endif /* __LINUX_ULPI_REGS_H */ diff --git a/include/linux/usb/ulpi.h b/include/linux/usb/ulpi.h index 5c295c26ad37..5f07407a367a 100644 --- a/include/linux/usb/ulpi.h +++ b/include/linux/usb/ulpi.h @@ -12,6 +12,8 @@ #define __LINUX_USB_ULPI_H #include +#include + /*-------------------------------------------------------------------------*/ /* @@ -49,138 +51,6 @@ /*-------------------------------------------------------------------------*/ -/* - * Macros for Set and Clear - * See ULPI 1.1 specification to find the registers with Set and Clear offsets - */ -#define ULPI_SET(a) (a + 1) -#define ULPI_CLR(a) (a + 2) - -/*-------------------------------------------------------------------------*/ - -/* - * Register Map - */ -#define ULPI_VENDOR_ID_LOW 0x00 -#define ULPI_VENDOR_ID_HIGH 0x01 -#define ULPI_PRODUCT_ID_LOW 0x02 -#define ULPI_PRODUCT_ID_HIGH 0x03 -#define ULPI_FUNC_CTRL 0x04 -#define ULPI_IFC_CTRL 0x07 -#define ULPI_OTG_CTRL 0x0a -#define ULPI_USB_INT_EN_RISE 0x0d -#define ULPI_USB_INT_EN_FALL 0x10 -#define ULPI_USB_INT_STS 0x13 -#define ULPI_USB_INT_LATCH 0x14 -#define ULPI_DEBUG 0x15 -#define ULPI_SCRATCH 0x16 -/* Optional Carkit Registers */ -#define ULPI_CARCIT_CTRL 0x19 -#define ULPI_CARCIT_INT_DELAY 0x1c -#define ULPI_CARCIT_INT_EN 0x1d -#define ULPI_CARCIT_INT_STS 0x20 -#define ULPI_CARCIT_INT_LATCH 0x21 -#define ULPI_CARCIT_PLS_CTRL 0x22 -/* Other Optional Registers */ -#define ULPI_TX_POS_WIDTH 0x25 -#define ULPI_TX_NEG_WIDTH 0x26 -#define ULPI_POLARITY_RECOVERY 0x27 -/* Access Extended Register Set */ -#define ULPI_ACCESS_EXTENDED 0x2f -/* Vendor Specific */ -#define ULPI_VENDOR_SPECIFIC 0x30 -/* Extended Registers */ -#define ULPI_EXT_VENDOR_SPECIFIC 0x80 - -/*-------------------------------------------------------------------------*/ - -/* - * Register Bits - */ - -/* Function Control */ -#define ULPI_FUNC_CTRL_XCVRSEL (1 << 0) -#define ULPI_FUNC_CTRL_XCVRSEL_MASK (3 << 0) -#define ULPI_FUNC_CTRL_HIGH_SPEED (0 << 0) -#define ULPI_FUNC_CTRL_FULL_SPEED (1 << 0) -#define ULPI_FUNC_CTRL_LOW_SPEED (2 << 0) -#define ULPI_FUNC_CTRL_FS4LS (3 << 0) -#define ULPI_FUNC_CTRL_TERMSELECT (1 << 2) -#define ULPI_FUNC_CTRL_OPMODE (1 << 3) -#define ULPI_FUNC_CTRL_OPMODE_MASK (3 << 3) -#define ULPI_FUNC_CTRL_OPMODE_NORMAL (0 << 3) -#define ULPI_FUNC_CTRL_OPMODE_NONDRIVING (1 << 3) -#define ULPI_FUNC_CTRL_OPMODE_DISABLE_NRZI (2 << 3) -#define ULPI_FUNC_CTRL_OPMODE_NOSYNC_NOEOP (3 << 3) -#define ULPI_FUNC_CTRL_RESET (1 << 5) -#define ULPI_FUNC_CTRL_SUSPENDM (1 << 6) - -/* Interface Control */ -#define ULPI_IFC_CTRL_6_PIN_SERIAL_MODE (1 << 0) -#define ULPI_IFC_CTRL_3_PIN_SERIAL_MODE (1 << 1) -#define ULPI_IFC_CTRL_CARKITMODE (1 << 2) -#define ULPI_IFC_CTRL_CLOCKSUSPENDM (1 << 3) -#define ULPI_IFC_CTRL_AUTORESUME (1 << 4) -#define ULPI_IFC_CTRL_EXTERNAL_VBUS (1 << 5) -#define ULPI_IFC_CTRL_PASSTHRU (1 << 6) -#define ULPI_IFC_CTRL_PROTECT_IFC_DISABLE (1 << 7) - -/* OTG Control */ -#define ULPI_OTG_CTRL_ID_PULLUP (1 << 0) -#define ULPI_OTG_CTRL_DP_PULLDOWN (1 << 1) -#define ULPI_OTG_CTRL_DM_PULLDOWN (1 << 2) -#define ULPI_OTG_CTRL_DISCHRGVBUS (1 << 3) -#define ULPI_OTG_CTRL_CHRGVBUS (1 << 4) -#define ULPI_OTG_CTRL_DRVVBUS (1 << 5) -#define ULPI_OTG_CTRL_DRVVBUS_EXT (1 << 6) -#define ULPI_OTG_CTRL_EXTVBUSIND (1 << 7) - -/* USB Interrupt Enable Rising, - * USB Interrupt Enable Falling, - * USB Interrupt Status and - * USB Interrupt Latch - */ -#define ULPI_INT_HOST_DISCONNECT (1 << 0) -#define ULPI_INT_VBUS_VALID (1 << 1) -#define ULPI_INT_SESS_VALID (1 << 2) -#define ULPI_INT_SESS_END (1 << 3) -#define ULPI_INT_IDGRD (1 << 4) - -/* Debug */ -#define ULPI_DEBUG_LINESTATE0 (1 << 0) -#define ULPI_DEBUG_LINESTATE1 (1 << 1) - -/* Carkit Control */ -#define ULPI_CARKIT_CTRL_CARKITPWR (1 << 0) -#define ULPI_CARKIT_CTRL_IDGNDDRV (1 << 1) -#define ULPI_CARKIT_CTRL_TXDEN (1 << 2) -#define ULPI_CARKIT_CTRL_RXDEN (1 << 3) -#define ULPI_CARKIT_CTRL_SPKLEFTEN (1 << 4) -#define ULPI_CARKIT_CTRL_SPKRIGHTEN (1 << 5) -#define ULPI_CARKIT_CTRL_MICEN (1 << 6) - -/* Carkit Interrupt Enable */ -#define ULPI_CARKIT_INT_EN_IDFLOAT_RISE (1 << 0) -#define ULPI_CARKIT_INT_EN_IDFLOAT_FALL (1 << 1) -#define ULPI_CARKIT_INT_EN_CARINTDET (1 << 2) -#define ULPI_CARKIT_INT_EN_DP_RISE (1 << 3) -#define ULPI_CARKIT_INT_EN_DP_FALL (1 << 4) - -/* Carkit Interrupt Status and - * Carkit Interrupt Latch - */ -#define ULPI_CARKIT_INT_IDFLOAT (1 << 0) -#define ULPI_CARKIT_INT_CARINTDET (1 << 1) -#define ULPI_CARKIT_INT_DP (1 << 2) - -/* Carkit Pulse Control*/ -#define ULPI_CARKIT_PLS_CTRL_TXPLSEN (1 << 0) -#define ULPI_CARKIT_PLS_CTRL_RXPLSEN (1 << 1) -#define ULPI_CARKIT_PLS_CTRL_SPKRLEFT_BIASEN (1 << 2) -#define ULPI_CARKIT_PLS_CTRL_SPKRRIGHT_BIASEN (1 << 3) - -/*-------------------------------------------------------------------------*/ - #if IS_ENABLED(CONFIG_USB_ULPI) struct usb_phy *otg_ulpi_create(struct usb_phy_io_ops *ops, unsigned int flags); -- cgit v1.2.3 From f267caab44451baa70d25fa3191a68bb79ad1b08 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 13 May 2015 14:03:51 -0400 Subject: tracing: Remove unused prototype ftrace_event_define_field() ftrace_event_define_field() has a prototype defined but never used. Remove it. Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 65ce6de91307..f8465d65f3c7 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -219,10 +219,6 @@ struct ftrace_event_class { extern int ftrace_event_reg(struct ftrace_event_call *event, enum trace_reg type, void *data); -int ftrace_event_define_field(struct ftrace_event_call *call, - char *type, int len, char *item, int offset, - int field_size, int sign, int filter); - struct ftrace_event_buffer { struct ring_buffer *buffer; struct ring_buffer_event *event; @@ -238,10 +234,6 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer); -int ftrace_event_define_field(struct ftrace_event_call *call, - char *type, int len, char *item, int offset, - int field_size, int sign, int filter); - enum { TRACE_EVENT_FL_FILTERED_BIT, TRACE_EVENT_FL_CAP_ANY_BIT, -- cgit v1.2.3 From af658dca221207174fc0a7bcdcd4cff7c589fdd8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 29 Apr 2015 14:36:05 -0400 Subject: tracing: Rename ftrace_event.h to trace_events.h The term "ftrace" is really the infrastructure of the function hooks, and not the trace events. Rename ftrace_event.h to trace_events.h to represent the trace_event infrastructure and decouple the term ftrace from it. Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 616 ------------------------------------------- include/linux/trace_events.h | 616 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 616 insertions(+), 616 deletions(-) delete mode 100644 include/linux/ftrace_event.h create mode 100644 include/linux/trace_events.h (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h deleted file mode 100644 index f8465d65f3c7..000000000000 --- a/include/linux/ftrace_event.h +++ /dev/null @@ -1,616 +0,0 @@ - -#ifndef _LINUX_FTRACE_EVENT_H -#define _LINUX_FTRACE_EVENT_H - -#include -#include -#include -#include -#include -#include - -struct trace_array; -struct trace_buffer; -struct tracer; -struct dentry; -struct bpf_prog; - -struct trace_print_flags { - unsigned long mask; - const char *name; -}; - -struct trace_print_flags_u64 { - unsigned long long mask; - const char *name; -}; - -const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim, - unsigned long flags, - const struct trace_print_flags *flag_array); - -const char *ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, - const struct trace_print_flags *symbol_array); - -#if BITS_PER_LONG == 32 -const char *ftrace_print_symbols_seq_u64(struct trace_seq *p, - unsigned long long val, - const struct trace_print_flags_u64 - *symbol_array); -#endif - -const char *ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, - unsigned int bitmask_size); - -const char *ftrace_print_hex_seq(struct trace_seq *p, - const unsigned char *buf, int len); - -const char *ftrace_print_array_seq(struct trace_seq *p, - const void *buf, int count, - size_t el_size); - -struct trace_iterator; -struct trace_event; - -int ftrace_raw_output_prep(struct trace_iterator *iter, - struct trace_event *event); - -/* - * The trace entry - the most basic unit of tracing. This is what - * is printed in the end as a single line in the trace output, such as: - * - * bash-15816 [01] 235.197585: idle_cpu <- irq_enter - */ -struct trace_entry { - unsigned short type; - unsigned char flags; - unsigned char preempt_count; - int pid; -}; - -#define FTRACE_MAX_EVENT \ - ((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1) - -/* - * Trace iterator - used by printout routines who present trace - * results to users and which routines might sleep, etc: - */ -struct trace_iterator { - struct trace_array *tr; - struct tracer *trace; - struct trace_buffer *trace_buffer; - void *private; - int cpu_file; - struct mutex mutex; - struct ring_buffer_iter **buffer_iter; - unsigned long iter_flags; - - /* trace_seq for __print_flags() and __print_symbolic() etc. */ - struct trace_seq tmp_seq; - - cpumask_var_t started; - - /* it's true when current open file is snapshot */ - bool snapshot; - - /* The below is zeroed out in pipe_read */ - struct trace_seq seq; - struct trace_entry *ent; - unsigned long lost_events; - int leftover; - int ent_size; - int cpu; - u64 ts; - - loff_t pos; - long idx; - - /* All new field here will be zeroed out in pipe_read */ -}; - -enum trace_iter_flags { - TRACE_FILE_LAT_FMT = 1, - TRACE_FILE_ANNOTATE = 2, - TRACE_FILE_TIME_IN_NS = 4, -}; - - -typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, - int flags, struct trace_event *event); - -struct trace_event_functions { - trace_print_func trace; - trace_print_func raw; - trace_print_func hex; - trace_print_func binary; -}; - -struct trace_event { - struct hlist_node node; - struct list_head list; - int type; - struct trace_event_functions *funcs; -}; - -extern int register_ftrace_event(struct trace_event *event); -extern int unregister_ftrace_event(struct trace_event *event); - -/* Return values for print_line callback */ -enum print_line_t { - TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ - TRACE_TYPE_HANDLED = 1, - TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */ - TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ -}; - -/* - * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq - * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function - * simplifies those functions and keeps them in sync. - */ -static inline enum print_line_t trace_handle_return(struct trace_seq *s) -{ - return trace_seq_has_overflowed(s) ? - TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; -} - -void tracing_generic_entry_update(struct trace_entry *entry, - unsigned long flags, - int pc); -struct ftrace_event_file; - -struct ring_buffer_event * -trace_event_buffer_lock_reserve(struct ring_buffer **current_buffer, - struct ftrace_event_file *ftrace_file, - int type, unsigned long len, - unsigned long flags, int pc); -struct ring_buffer_event * -trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer, - int type, unsigned long len, - unsigned long flags, int pc); -void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc); -void trace_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc); -void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc, - struct pt_regs *regs); -void trace_current_buffer_discard_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event); - -void tracing_record_cmdline(struct task_struct *tsk); - -int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...); - -struct event_filter; - -enum trace_reg { - TRACE_REG_REGISTER, - TRACE_REG_UNREGISTER, -#ifdef CONFIG_PERF_EVENTS - TRACE_REG_PERF_REGISTER, - TRACE_REG_PERF_UNREGISTER, - TRACE_REG_PERF_OPEN, - TRACE_REG_PERF_CLOSE, - TRACE_REG_PERF_ADD, - TRACE_REG_PERF_DEL, -#endif -}; - -struct ftrace_event_call; - -struct ftrace_event_class { - const char *system; - void *probe; -#ifdef CONFIG_PERF_EVENTS - void *perf_probe; -#endif - int (*reg)(struct ftrace_event_call *event, - enum trace_reg type, void *data); - int (*define_fields)(struct ftrace_event_call *); - struct list_head *(*get_fields)(struct ftrace_event_call *); - struct list_head fields; - int (*raw_init)(struct ftrace_event_call *); -}; - -extern int ftrace_event_reg(struct ftrace_event_call *event, - enum trace_reg type, void *data); - -struct ftrace_event_buffer { - struct ring_buffer *buffer; - struct ring_buffer_event *event; - struct ftrace_event_file *ftrace_file; - void *entry; - unsigned long flags; - int pc; -}; - -void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, - struct ftrace_event_file *ftrace_file, - unsigned long len); - -void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer); - -enum { - TRACE_EVENT_FL_FILTERED_BIT, - TRACE_EVENT_FL_CAP_ANY_BIT, - TRACE_EVENT_FL_NO_SET_FILTER_BIT, - TRACE_EVENT_FL_IGNORE_ENABLE_BIT, - TRACE_EVENT_FL_WAS_ENABLED_BIT, - TRACE_EVENT_FL_USE_CALL_FILTER_BIT, - TRACE_EVENT_FL_TRACEPOINT_BIT, - TRACE_EVENT_FL_KPROBE_BIT, -}; - -/* - * Event flags: - * FILTERED - The event has a filter attached - * CAP_ANY - Any user can enable for perf - * NO_SET_FILTER - Set when filter has error and is to be ignored - * IGNORE_ENABLE - For ftrace internal events, do not enable with debugfs file - * WAS_ENABLED - Set and stays set when an event was ever enabled - * (used for module unloading, if a module event is enabled, - * it is best to clear the buffers that used it). - * USE_CALL_FILTER - For ftrace internal events, don't use file filter - * TRACEPOINT - Event is a tracepoint - * KPROBE - Event is a kprobe - */ -enum { - TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), - TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), - TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), - TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), - TRACE_EVENT_FL_WAS_ENABLED = (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT), - TRACE_EVENT_FL_USE_CALL_FILTER = (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT), - TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT), - TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), -}; - -struct ftrace_event_call { - struct list_head list; - struct ftrace_event_class *class; - union { - char *name; - /* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */ - struct tracepoint *tp; - }; - struct trace_event event; - char *print_fmt; - struct event_filter *filter; - void *mod; - void *data; - /* - * bit 0: filter_active - * bit 1: allow trace by non root (cap any) - * bit 2: failed to apply filter - * bit 3: ftrace internal event (do not enable) - * bit 4: Event was enabled by module - * bit 5: use call filter rather than file filter - * bit 6: Event is a tracepoint - */ - int flags; /* static flags of different events */ - -#ifdef CONFIG_PERF_EVENTS - int perf_refcount; - struct hlist_head __percpu *perf_events; - struct bpf_prog *prog; - - int (*perf_perm)(struct ftrace_event_call *, - struct perf_event *); -#endif -}; - -static inline const char * -ftrace_event_name(struct ftrace_event_call *call) -{ - if (call->flags & TRACE_EVENT_FL_TRACEPOINT) - return call->tp ? call->tp->name : NULL; - else - return call->name; -} - -struct trace_array; -struct ftrace_subsystem_dir; - -enum { - FTRACE_EVENT_FL_ENABLED_BIT, - FTRACE_EVENT_FL_RECORDED_CMD_BIT, - FTRACE_EVENT_FL_FILTERED_BIT, - FTRACE_EVENT_FL_NO_SET_FILTER_BIT, - FTRACE_EVENT_FL_SOFT_MODE_BIT, - FTRACE_EVENT_FL_SOFT_DISABLED_BIT, - FTRACE_EVENT_FL_TRIGGER_MODE_BIT, - FTRACE_EVENT_FL_TRIGGER_COND_BIT, -}; - -/* - * Ftrace event file flags: - * ENABLED - The event is enabled - * RECORDED_CMD - The comms should be recorded at sched_switch - * FILTERED - The event has a filter attached - * NO_SET_FILTER - Set when filter has error and is to be ignored - * SOFT_MODE - The event is enabled/disabled by SOFT_DISABLED - * SOFT_DISABLED - When set, do not trace the event (even though its - * tracepoint may be enabled) - * TRIGGER_MODE - When set, invoke the triggers associated with the event - * TRIGGER_COND - When set, one or more triggers has an associated filter - */ -enum { - FTRACE_EVENT_FL_ENABLED = (1 << FTRACE_EVENT_FL_ENABLED_BIT), - FTRACE_EVENT_FL_RECORDED_CMD = (1 << FTRACE_EVENT_FL_RECORDED_CMD_BIT), - FTRACE_EVENT_FL_FILTERED = (1 << FTRACE_EVENT_FL_FILTERED_BIT), - FTRACE_EVENT_FL_NO_SET_FILTER = (1 << FTRACE_EVENT_FL_NO_SET_FILTER_BIT), - FTRACE_EVENT_FL_SOFT_MODE = (1 << FTRACE_EVENT_FL_SOFT_MODE_BIT), - FTRACE_EVENT_FL_SOFT_DISABLED = (1 << FTRACE_EVENT_FL_SOFT_DISABLED_BIT), - FTRACE_EVENT_FL_TRIGGER_MODE = (1 << FTRACE_EVENT_FL_TRIGGER_MODE_BIT), - FTRACE_EVENT_FL_TRIGGER_COND = (1 << FTRACE_EVENT_FL_TRIGGER_COND_BIT), -}; - -struct ftrace_event_file { - struct list_head list; - struct ftrace_event_call *event_call; - struct event_filter *filter; - struct dentry *dir; - struct trace_array *tr; - struct ftrace_subsystem_dir *system; - struct list_head triggers; - - /* - * 32 bit flags: - * bit 0: enabled - * bit 1: enabled cmd record - * bit 2: enable/disable with the soft disable bit - * bit 3: soft disabled - * bit 4: trigger enabled - * - * Note: The bits must be set atomically to prevent races - * from other writers. Reads of flags do not need to be in - * sync as they occur in critical sections. But the way flags - * is currently used, these changes do not affect the code - * except that when a change is made, it may have a slight - * delay in propagating the changes to other CPUs due to - * caching and such. Which is mostly OK ;-) - */ - unsigned long flags; - atomic_t sm_ref; /* soft-mode reference counter */ - atomic_t tm_ref; /* trigger-mode reference counter */ -}; - -#define __TRACE_EVENT_FLAGS(name, value) \ - static int __init trace_init_flags_##name(void) \ - { \ - event_##name.flags |= value; \ - return 0; \ - } \ - early_initcall(trace_init_flags_##name); - -#define __TRACE_EVENT_PERF_PERM(name, expr...) \ - static int perf_perm_##name(struct ftrace_event_call *tp_event, \ - struct perf_event *p_event) \ - { \ - return ({ expr; }); \ - } \ - static int __init trace_init_perf_perm_##name(void) \ - { \ - event_##name.perf_perm = &perf_perm_##name; \ - return 0; \ - } \ - early_initcall(trace_init_perf_perm_##name); - -#define PERF_MAX_TRACE_SIZE 2048 - -#define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ - -enum event_trigger_type { - ETT_NONE = (0), - ETT_TRACE_ONOFF = (1 << 0), - ETT_SNAPSHOT = (1 << 1), - ETT_STACKTRACE = (1 << 2), - ETT_EVENT_ENABLE = (1 << 3), -}; - -extern int filter_match_preds(struct event_filter *filter, void *rec); - -extern int filter_check_discard(struct ftrace_event_file *file, void *rec, - struct ring_buffer *buffer, - struct ring_buffer_event *event); -extern int call_filter_check_discard(struct ftrace_event_call *call, void *rec, - struct ring_buffer *buffer, - struct ring_buffer_event *event); -extern enum event_trigger_type event_triggers_call(struct ftrace_event_file *file, - void *rec); -extern void event_triggers_post_call(struct ftrace_event_file *file, - enum event_trigger_type tt); - -/** - * ftrace_trigger_soft_disabled - do triggers and test if soft disabled - * @file: The file pointer of the event to test - * - * If any triggers without filters are attached to this event, they - * will be called here. If the event is soft disabled and has no - * triggers that require testing the fields, it will return true, - * otherwise false. - */ -static inline bool -ftrace_trigger_soft_disabled(struct ftrace_event_file *file) -{ - unsigned long eflags = file->flags; - - if (!(eflags & FTRACE_EVENT_FL_TRIGGER_COND)) { - if (eflags & FTRACE_EVENT_FL_TRIGGER_MODE) - event_triggers_call(file, NULL); - if (eflags & FTRACE_EVENT_FL_SOFT_DISABLED) - return true; - } - return false; -} - -/* - * Helper function for event_trigger_unlock_commit{_regs}(). - * If there are event triggers attached to this event that requires - * filtering against its fields, then they wil be called as the - * entry already holds the field information of the current event. - * - * It also checks if the event should be discarded or not. - * It is to be discarded if the event is soft disabled and the - * event was only recorded to process triggers, or if the event - * filter is active and this event did not match the filters. - * - * Returns true if the event is discarded, false otherwise. - */ -static inline bool -__event_trigger_test_discard(struct ftrace_event_file *file, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - void *entry, - enum event_trigger_type *tt) -{ - unsigned long eflags = file->flags; - - if (eflags & FTRACE_EVENT_FL_TRIGGER_COND) - *tt = event_triggers_call(file, entry); - - if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags)) - ring_buffer_discard_commit(buffer, event); - else if (!filter_check_discard(file, entry, buffer, event)) - return false; - - return true; -} - -/** - * event_trigger_unlock_commit - handle triggers and finish event commit - * @file: The file pointer assoctiated to the event - * @buffer: The ring buffer that the event is being written to - * @event: The event meta data in the ring buffer - * @entry: The event itself - * @irq_flags: The state of the interrupts at the start of the event - * @pc: The state of the preempt count at the start of the event. - * - * This is a helper function to handle triggers that require data - * from the event itself. It also tests the event against filters and - * if the event is soft disabled and should be discarded. - */ -static inline void -event_trigger_unlock_commit(struct ftrace_event_file *file, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - void *entry, unsigned long irq_flags, int pc) -{ - enum event_trigger_type tt = ETT_NONE; - - if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) - trace_buffer_unlock_commit(buffer, event, irq_flags, pc); - - if (tt) - event_triggers_post_call(file, tt); -} - -/** - * event_trigger_unlock_commit_regs - handle triggers and finish event commit - * @file: The file pointer assoctiated to the event - * @buffer: The ring buffer that the event is being written to - * @event: The event meta data in the ring buffer - * @entry: The event itself - * @irq_flags: The state of the interrupts at the start of the event - * @pc: The state of the preempt count at the start of the event. - * - * This is a helper function to handle triggers that require data - * from the event itself. It also tests the event against filters and - * if the event is soft disabled and should be discarded. - * - * Same as event_trigger_unlock_commit() but calls - * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit(). - */ -static inline void -event_trigger_unlock_commit_regs(struct ftrace_event_file *file, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - void *entry, unsigned long irq_flags, int pc, - struct pt_regs *regs) -{ - enum event_trigger_type tt = ETT_NONE; - - if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) - trace_buffer_unlock_commit_regs(buffer, event, - irq_flags, pc, regs); - - if (tt) - event_triggers_post_call(file, tt); -} - -#ifdef CONFIG_BPF_SYSCALL -unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx); -#else -static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) -{ - return 1; -} -#endif - -enum { - FILTER_OTHER = 0, - FILTER_STATIC_STRING, - FILTER_DYN_STRING, - FILTER_PTR_STRING, - FILTER_TRACE_FN, -}; - -extern int trace_event_raw_init(struct ftrace_event_call *call); -extern int trace_define_field(struct ftrace_event_call *call, const char *type, - const char *name, int offset, int size, - int is_signed, int filter_type); -extern int trace_add_event_call(struct ftrace_event_call *call); -extern int trace_remove_event_call(struct ftrace_event_call *call); - -#define is_signed_type(type) (((type)(-1)) < (type)1) - -int trace_set_clr_event(const char *system, const char *event, int set); - -/* - * The double __builtin_constant_p is because gcc will give us an error - * if we try to allocate the static variable to fmt if it is not a - * constant. Even with the outer if statement optimizing out. - */ -#define event_trace_printk(ip, fmt, args...) \ -do { \ - __trace_printk_check_format(fmt, ##args); \ - tracing_record_cmdline(current); \ - if (__builtin_constant_p(fmt)) { \ - static const char *trace_printk_fmt \ - __attribute__((section("__trace_printk_fmt"))) = \ - __builtin_constant_p(fmt) ? fmt : NULL; \ - \ - __trace_bprintk(ip, trace_printk_fmt, ##args); \ - } else \ - __trace_printk(ip, fmt, ##args); \ -} while (0) - -#ifdef CONFIG_PERF_EVENTS -struct perf_event; - -DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); - -extern int perf_trace_init(struct perf_event *event); -extern void perf_trace_destroy(struct perf_event *event); -extern int perf_trace_add(struct perf_event *event, int flags); -extern void perf_trace_del(struct perf_event *event, int flags); -extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, - char *filter_str); -extern void ftrace_profile_free_filter(struct perf_event *event); -extern void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs **regs, int *rctxp); - -static inline void -perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr, - u64 count, struct pt_regs *regs, void *head, - struct task_struct *task) -{ - perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task); -} -#endif - -#endif /* _LINUX_FTRACE_EVENT_H */ diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h new file mode 100644 index 000000000000..f8465d65f3c7 --- /dev/null +++ b/include/linux/trace_events.h @@ -0,0 +1,616 @@ + +#ifndef _LINUX_FTRACE_EVENT_H +#define _LINUX_FTRACE_EVENT_H + +#include +#include +#include +#include +#include +#include + +struct trace_array; +struct trace_buffer; +struct tracer; +struct dentry; +struct bpf_prog; + +struct trace_print_flags { + unsigned long mask; + const char *name; +}; + +struct trace_print_flags_u64 { + unsigned long long mask; + const char *name; +}; + +const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim, + unsigned long flags, + const struct trace_print_flags *flag_array); + +const char *ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, + const struct trace_print_flags *symbol_array); + +#if BITS_PER_LONG == 32 +const char *ftrace_print_symbols_seq_u64(struct trace_seq *p, + unsigned long long val, + const struct trace_print_flags_u64 + *symbol_array); +#endif + +const char *ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, + unsigned int bitmask_size); + +const char *ftrace_print_hex_seq(struct trace_seq *p, + const unsigned char *buf, int len); + +const char *ftrace_print_array_seq(struct trace_seq *p, + const void *buf, int count, + size_t el_size); + +struct trace_iterator; +struct trace_event; + +int ftrace_raw_output_prep(struct trace_iterator *iter, + struct trace_event *event); + +/* + * The trace entry - the most basic unit of tracing. This is what + * is printed in the end as a single line in the trace output, such as: + * + * bash-15816 [01] 235.197585: idle_cpu <- irq_enter + */ +struct trace_entry { + unsigned short type; + unsigned char flags; + unsigned char preempt_count; + int pid; +}; + +#define FTRACE_MAX_EVENT \ + ((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1) + +/* + * Trace iterator - used by printout routines who present trace + * results to users and which routines might sleep, etc: + */ +struct trace_iterator { + struct trace_array *tr; + struct tracer *trace; + struct trace_buffer *trace_buffer; + void *private; + int cpu_file; + struct mutex mutex; + struct ring_buffer_iter **buffer_iter; + unsigned long iter_flags; + + /* trace_seq for __print_flags() and __print_symbolic() etc. */ + struct trace_seq tmp_seq; + + cpumask_var_t started; + + /* it's true when current open file is snapshot */ + bool snapshot; + + /* The below is zeroed out in pipe_read */ + struct trace_seq seq; + struct trace_entry *ent; + unsigned long lost_events; + int leftover; + int ent_size; + int cpu; + u64 ts; + + loff_t pos; + long idx; + + /* All new field here will be zeroed out in pipe_read */ +}; + +enum trace_iter_flags { + TRACE_FILE_LAT_FMT = 1, + TRACE_FILE_ANNOTATE = 2, + TRACE_FILE_TIME_IN_NS = 4, +}; + + +typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, + int flags, struct trace_event *event); + +struct trace_event_functions { + trace_print_func trace; + trace_print_func raw; + trace_print_func hex; + trace_print_func binary; +}; + +struct trace_event { + struct hlist_node node; + struct list_head list; + int type; + struct trace_event_functions *funcs; +}; + +extern int register_ftrace_event(struct trace_event *event); +extern int unregister_ftrace_event(struct trace_event *event); + +/* Return values for print_line callback */ +enum print_line_t { + TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ + TRACE_TYPE_HANDLED = 1, + TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */ + TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ +}; + +/* + * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq + * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function + * simplifies those functions and keeps them in sync. + */ +static inline enum print_line_t trace_handle_return(struct trace_seq *s) +{ + return trace_seq_has_overflowed(s) ? + TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; +} + +void tracing_generic_entry_update(struct trace_entry *entry, + unsigned long flags, + int pc); +struct ftrace_event_file; + +struct ring_buffer_event * +trace_event_buffer_lock_reserve(struct ring_buffer **current_buffer, + struct ftrace_event_file *ftrace_file, + int type, unsigned long len, + unsigned long flags, int pc); +struct ring_buffer_event * +trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer, + int type, unsigned long len, + unsigned long flags, int pc); +void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc); +void trace_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc); +void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc, + struct pt_regs *regs); +void trace_current_buffer_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event); + +void tracing_record_cmdline(struct task_struct *tsk); + +int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...); + +struct event_filter; + +enum trace_reg { + TRACE_REG_REGISTER, + TRACE_REG_UNREGISTER, +#ifdef CONFIG_PERF_EVENTS + TRACE_REG_PERF_REGISTER, + TRACE_REG_PERF_UNREGISTER, + TRACE_REG_PERF_OPEN, + TRACE_REG_PERF_CLOSE, + TRACE_REG_PERF_ADD, + TRACE_REG_PERF_DEL, +#endif +}; + +struct ftrace_event_call; + +struct ftrace_event_class { + const char *system; + void *probe; +#ifdef CONFIG_PERF_EVENTS + void *perf_probe; +#endif + int (*reg)(struct ftrace_event_call *event, + enum trace_reg type, void *data); + int (*define_fields)(struct ftrace_event_call *); + struct list_head *(*get_fields)(struct ftrace_event_call *); + struct list_head fields; + int (*raw_init)(struct ftrace_event_call *); +}; + +extern int ftrace_event_reg(struct ftrace_event_call *event, + enum trace_reg type, void *data); + +struct ftrace_event_buffer { + struct ring_buffer *buffer; + struct ring_buffer_event *event; + struct ftrace_event_file *ftrace_file; + void *entry; + unsigned long flags; + int pc; +}; + +void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, + struct ftrace_event_file *ftrace_file, + unsigned long len); + +void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer); + +enum { + TRACE_EVENT_FL_FILTERED_BIT, + TRACE_EVENT_FL_CAP_ANY_BIT, + TRACE_EVENT_FL_NO_SET_FILTER_BIT, + TRACE_EVENT_FL_IGNORE_ENABLE_BIT, + TRACE_EVENT_FL_WAS_ENABLED_BIT, + TRACE_EVENT_FL_USE_CALL_FILTER_BIT, + TRACE_EVENT_FL_TRACEPOINT_BIT, + TRACE_EVENT_FL_KPROBE_BIT, +}; + +/* + * Event flags: + * FILTERED - The event has a filter attached + * CAP_ANY - Any user can enable for perf + * NO_SET_FILTER - Set when filter has error and is to be ignored + * IGNORE_ENABLE - For ftrace internal events, do not enable with debugfs file + * WAS_ENABLED - Set and stays set when an event was ever enabled + * (used for module unloading, if a module event is enabled, + * it is best to clear the buffers that used it). + * USE_CALL_FILTER - For ftrace internal events, don't use file filter + * TRACEPOINT - Event is a tracepoint + * KPROBE - Event is a kprobe + */ +enum { + TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), + TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), + TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), + TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), + TRACE_EVENT_FL_WAS_ENABLED = (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT), + TRACE_EVENT_FL_USE_CALL_FILTER = (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT), + TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT), + TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), +}; + +struct ftrace_event_call { + struct list_head list; + struct ftrace_event_class *class; + union { + char *name; + /* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */ + struct tracepoint *tp; + }; + struct trace_event event; + char *print_fmt; + struct event_filter *filter; + void *mod; + void *data; + /* + * bit 0: filter_active + * bit 1: allow trace by non root (cap any) + * bit 2: failed to apply filter + * bit 3: ftrace internal event (do not enable) + * bit 4: Event was enabled by module + * bit 5: use call filter rather than file filter + * bit 6: Event is a tracepoint + */ + int flags; /* static flags of different events */ + +#ifdef CONFIG_PERF_EVENTS + int perf_refcount; + struct hlist_head __percpu *perf_events; + struct bpf_prog *prog; + + int (*perf_perm)(struct ftrace_event_call *, + struct perf_event *); +#endif +}; + +static inline const char * +ftrace_event_name(struct ftrace_event_call *call) +{ + if (call->flags & TRACE_EVENT_FL_TRACEPOINT) + return call->tp ? call->tp->name : NULL; + else + return call->name; +} + +struct trace_array; +struct ftrace_subsystem_dir; + +enum { + FTRACE_EVENT_FL_ENABLED_BIT, + FTRACE_EVENT_FL_RECORDED_CMD_BIT, + FTRACE_EVENT_FL_FILTERED_BIT, + FTRACE_EVENT_FL_NO_SET_FILTER_BIT, + FTRACE_EVENT_FL_SOFT_MODE_BIT, + FTRACE_EVENT_FL_SOFT_DISABLED_BIT, + FTRACE_EVENT_FL_TRIGGER_MODE_BIT, + FTRACE_EVENT_FL_TRIGGER_COND_BIT, +}; + +/* + * Ftrace event file flags: + * ENABLED - The event is enabled + * RECORDED_CMD - The comms should be recorded at sched_switch + * FILTERED - The event has a filter attached + * NO_SET_FILTER - Set when filter has error and is to be ignored + * SOFT_MODE - The event is enabled/disabled by SOFT_DISABLED + * SOFT_DISABLED - When set, do not trace the event (even though its + * tracepoint may be enabled) + * TRIGGER_MODE - When set, invoke the triggers associated with the event + * TRIGGER_COND - When set, one or more triggers has an associated filter + */ +enum { + FTRACE_EVENT_FL_ENABLED = (1 << FTRACE_EVENT_FL_ENABLED_BIT), + FTRACE_EVENT_FL_RECORDED_CMD = (1 << FTRACE_EVENT_FL_RECORDED_CMD_BIT), + FTRACE_EVENT_FL_FILTERED = (1 << FTRACE_EVENT_FL_FILTERED_BIT), + FTRACE_EVENT_FL_NO_SET_FILTER = (1 << FTRACE_EVENT_FL_NO_SET_FILTER_BIT), + FTRACE_EVENT_FL_SOFT_MODE = (1 << FTRACE_EVENT_FL_SOFT_MODE_BIT), + FTRACE_EVENT_FL_SOFT_DISABLED = (1 << FTRACE_EVENT_FL_SOFT_DISABLED_BIT), + FTRACE_EVENT_FL_TRIGGER_MODE = (1 << FTRACE_EVENT_FL_TRIGGER_MODE_BIT), + FTRACE_EVENT_FL_TRIGGER_COND = (1 << FTRACE_EVENT_FL_TRIGGER_COND_BIT), +}; + +struct ftrace_event_file { + struct list_head list; + struct ftrace_event_call *event_call; + struct event_filter *filter; + struct dentry *dir; + struct trace_array *tr; + struct ftrace_subsystem_dir *system; + struct list_head triggers; + + /* + * 32 bit flags: + * bit 0: enabled + * bit 1: enabled cmd record + * bit 2: enable/disable with the soft disable bit + * bit 3: soft disabled + * bit 4: trigger enabled + * + * Note: The bits must be set atomically to prevent races + * from other writers. Reads of flags do not need to be in + * sync as they occur in critical sections. But the way flags + * is currently used, these changes do not affect the code + * except that when a change is made, it may have a slight + * delay in propagating the changes to other CPUs due to + * caching and such. Which is mostly OK ;-) + */ + unsigned long flags; + atomic_t sm_ref; /* soft-mode reference counter */ + atomic_t tm_ref; /* trigger-mode reference counter */ +}; + +#define __TRACE_EVENT_FLAGS(name, value) \ + static int __init trace_init_flags_##name(void) \ + { \ + event_##name.flags |= value; \ + return 0; \ + } \ + early_initcall(trace_init_flags_##name); + +#define __TRACE_EVENT_PERF_PERM(name, expr...) \ + static int perf_perm_##name(struct ftrace_event_call *tp_event, \ + struct perf_event *p_event) \ + { \ + return ({ expr; }); \ + } \ + static int __init trace_init_perf_perm_##name(void) \ + { \ + event_##name.perf_perm = &perf_perm_##name; \ + return 0; \ + } \ + early_initcall(trace_init_perf_perm_##name); + +#define PERF_MAX_TRACE_SIZE 2048 + +#define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ + +enum event_trigger_type { + ETT_NONE = (0), + ETT_TRACE_ONOFF = (1 << 0), + ETT_SNAPSHOT = (1 << 1), + ETT_STACKTRACE = (1 << 2), + ETT_EVENT_ENABLE = (1 << 3), +}; + +extern int filter_match_preds(struct event_filter *filter, void *rec); + +extern int filter_check_discard(struct ftrace_event_file *file, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event); +extern int call_filter_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event); +extern enum event_trigger_type event_triggers_call(struct ftrace_event_file *file, + void *rec); +extern void event_triggers_post_call(struct ftrace_event_file *file, + enum event_trigger_type tt); + +/** + * ftrace_trigger_soft_disabled - do triggers and test if soft disabled + * @file: The file pointer of the event to test + * + * If any triggers without filters are attached to this event, they + * will be called here. If the event is soft disabled and has no + * triggers that require testing the fields, it will return true, + * otherwise false. + */ +static inline bool +ftrace_trigger_soft_disabled(struct ftrace_event_file *file) +{ + unsigned long eflags = file->flags; + + if (!(eflags & FTRACE_EVENT_FL_TRIGGER_COND)) { + if (eflags & FTRACE_EVENT_FL_TRIGGER_MODE) + event_triggers_call(file, NULL); + if (eflags & FTRACE_EVENT_FL_SOFT_DISABLED) + return true; + } + return false; +} + +/* + * Helper function for event_trigger_unlock_commit{_regs}(). + * If there are event triggers attached to this event that requires + * filtering against its fields, then they wil be called as the + * entry already holds the field information of the current event. + * + * It also checks if the event should be discarded or not. + * It is to be discarded if the event is soft disabled and the + * event was only recorded to process triggers, or if the event + * filter is active and this event did not match the filters. + * + * Returns true if the event is discarded, false otherwise. + */ +static inline bool +__event_trigger_test_discard(struct ftrace_event_file *file, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + void *entry, + enum event_trigger_type *tt) +{ + unsigned long eflags = file->flags; + + if (eflags & FTRACE_EVENT_FL_TRIGGER_COND) + *tt = event_triggers_call(file, entry); + + if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags)) + ring_buffer_discard_commit(buffer, event); + else if (!filter_check_discard(file, entry, buffer, event)) + return false; + + return true; +} + +/** + * event_trigger_unlock_commit - handle triggers and finish event commit + * @file: The file pointer assoctiated to the event + * @buffer: The ring buffer that the event is being written to + * @event: The event meta data in the ring buffer + * @entry: The event itself + * @irq_flags: The state of the interrupts at the start of the event + * @pc: The state of the preempt count at the start of the event. + * + * This is a helper function to handle triggers that require data + * from the event itself. It also tests the event against filters and + * if the event is soft disabled and should be discarded. + */ +static inline void +event_trigger_unlock_commit(struct ftrace_event_file *file, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + void *entry, unsigned long irq_flags, int pc) +{ + enum event_trigger_type tt = ETT_NONE; + + if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) + trace_buffer_unlock_commit(buffer, event, irq_flags, pc); + + if (tt) + event_triggers_post_call(file, tt); +} + +/** + * event_trigger_unlock_commit_regs - handle triggers and finish event commit + * @file: The file pointer assoctiated to the event + * @buffer: The ring buffer that the event is being written to + * @event: The event meta data in the ring buffer + * @entry: The event itself + * @irq_flags: The state of the interrupts at the start of the event + * @pc: The state of the preempt count at the start of the event. + * + * This is a helper function to handle triggers that require data + * from the event itself. It also tests the event against filters and + * if the event is soft disabled and should be discarded. + * + * Same as event_trigger_unlock_commit() but calls + * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit(). + */ +static inline void +event_trigger_unlock_commit_regs(struct ftrace_event_file *file, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + void *entry, unsigned long irq_flags, int pc, + struct pt_regs *regs) +{ + enum event_trigger_type tt = ETT_NONE; + + if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) + trace_buffer_unlock_commit_regs(buffer, event, + irq_flags, pc, regs); + + if (tt) + event_triggers_post_call(file, tt); +} + +#ifdef CONFIG_BPF_SYSCALL +unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx); +#else +static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +{ + return 1; +} +#endif + +enum { + FILTER_OTHER = 0, + FILTER_STATIC_STRING, + FILTER_DYN_STRING, + FILTER_PTR_STRING, + FILTER_TRACE_FN, +}; + +extern int trace_event_raw_init(struct ftrace_event_call *call); +extern int trace_define_field(struct ftrace_event_call *call, const char *type, + const char *name, int offset, int size, + int is_signed, int filter_type); +extern int trace_add_event_call(struct ftrace_event_call *call); +extern int trace_remove_event_call(struct ftrace_event_call *call); + +#define is_signed_type(type) (((type)(-1)) < (type)1) + +int trace_set_clr_event(const char *system, const char *event, int set); + +/* + * The double __builtin_constant_p is because gcc will give us an error + * if we try to allocate the static variable to fmt if it is not a + * constant. Even with the outer if statement optimizing out. + */ +#define event_trace_printk(ip, fmt, args...) \ +do { \ + __trace_printk_check_format(fmt, ##args); \ + tracing_record_cmdline(current); \ + if (__builtin_constant_p(fmt)) { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))) = \ + __builtin_constant_p(fmt) ? fmt : NULL; \ + \ + __trace_bprintk(ip, trace_printk_fmt, ##args); \ + } else \ + __trace_printk(ip, fmt, ##args); \ +} while (0) + +#ifdef CONFIG_PERF_EVENTS +struct perf_event; + +DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); + +extern int perf_trace_init(struct perf_event *event); +extern void perf_trace_destroy(struct perf_event *event); +extern int perf_trace_add(struct perf_event *event, int flags); +extern void perf_trace_del(struct perf_event *event, int flags); +extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, + char *filter_str); +extern void ftrace_profile_free_filter(struct perf_event *event); +extern void *perf_trace_buf_prepare(int size, unsigned short type, + struct pt_regs **regs, int *rctxp); + +static inline void +perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr, + u64 count, struct pt_regs *regs, void *head, + struct task_struct *task) +{ + perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task); +} +#endif + +#endif /* _LINUX_FTRACE_EVENT_H */ -- cgit v1.2.3 From 645df987f7c1740bb1ba783ab907001720a20cf7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 4 May 2015 18:12:44 -0400 Subject: tracing: Rename ftrace_print_*() functions ta trace_print_*() The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. The functions ftrace_print_*() are not part of the function infrastructure, and the names can be confusing. Rename them to be trace_print_*(). Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index f8465d65f3c7..29627cbafdea 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -1,6 +1,6 @@ -#ifndef _LINUX_FTRACE_EVENT_H -#define _LINUX_FTRACE_EVENT_H +#ifndef _LINUX_TRACE_EVENT_H +#define _LINUX_TRACE_EVENT_H #include #include @@ -25,27 +25,27 @@ struct trace_print_flags_u64 { const char *name; }; -const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim, - unsigned long flags, - const struct trace_print_flags *flag_array); +const char *trace_print_flags_seq(struct trace_seq *p, const char *delim, + unsigned long flags, + const struct trace_print_flags *flag_array); -const char *ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, - const struct trace_print_flags *symbol_array); +const char *trace_print_symbols_seq(struct trace_seq *p, unsigned long val, + const struct trace_print_flags *symbol_array); #if BITS_PER_LONG == 32 -const char *ftrace_print_symbols_seq_u64(struct trace_seq *p, - unsigned long long val, - const struct trace_print_flags_u64 +const char *trace_print_symbols_seq_u64(struct trace_seq *p, + unsigned long long val, + const struct trace_print_flags_u64 *symbol_array); #endif -const char *ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, - unsigned int bitmask_size); +const char *trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, + unsigned int bitmask_size); -const char *ftrace_print_hex_seq(struct trace_seq *p, - const unsigned char *buf, int len); +const char *trace_print_hex_seq(struct trace_seq *p, + const unsigned char *buf, int len); -const char *ftrace_print_array_seq(struct trace_seq *p, +const char *trace_print_array_seq(struct trace_seq *p, const void *buf, int count, size_t el_size); -- cgit v1.2.3 From 9023c930902fbbcf0cebf6110828700f792989a4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 5 May 2015 09:39:12 -0400 Subject: tracing: Rename (un)register_ftrace_event() to (un)register_trace_event() The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. The functions (un)register_ftrace_event() is really about trace_events, and the name should be register_trace_event() instead. Also renamed ftrace_event_reg() to trace_event_reg() for the same reason. Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 29627cbafdea..99924c07a042 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -132,8 +132,8 @@ struct trace_event { struct trace_event_functions *funcs; }; -extern int register_ftrace_event(struct trace_event *event); -extern int unregister_ftrace_event(struct trace_event *event); +extern int register_trace_event(struct trace_event *event); +extern int unregister_trace_event(struct trace_event *event); /* Return values for print_line callback */ enum print_line_t { @@ -216,7 +216,7 @@ struct ftrace_event_class { int (*raw_init)(struct ftrace_event_call *); }; -extern int ftrace_event_reg(struct ftrace_event_call *event, +extern int trace_event_reg(struct ftrace_event_call *event, enum trace_reg type, void *data); struct ftrace_event_buffer { -- cgit v1.2.3 From 7f1d2f8210195c8c309d424a77dbf06a6d2186f4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 5 May 2015 10:09:53 -0400 Subject: tracing: Rename ftrace_event_file to trace_event_file The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. The structure ftrace_event_file is really about trace events and not "ftrace". Rename it to trace_event_file. Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 99924c07a042..ae19233c7dd8 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -157,11 +157,11 @@ static inline enum print_line_t trace_handle_return(struct trace_seq *s) void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, int pc); -struct ftrace_event_file; +struct trace_event_file; struct ring_buffer_event * trace_event_buffer_lock_reserve(struct ring_buffer **current_buffer, - struct ftrace_event_file *ftrace_file, + struct trace_event_file *trace_file, int type, unsigned long len, unsigned long flags, int pc); struct ring_buffer_event * @@ -222,14 +222,14 @@ extern int trace_event_reg(struct ftrace_event_call *event, struct ftrace_event_buffer { struct ring_buffer *buffer; struct ring_buffer_event *event; - struct ftrace_event_file *ftrace_file; + struct trace_event_file *trace_file; void *entry; unsigned long flags; int pc; }; void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, - struct ftrace_event_file *ftrace_file, + struct trace_event_file *trace_file, unsigned long len); void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer); @@ -349,7 +349,7 @@ enum { FTRACE_EVENT_FL_TRIGGER_COND = (1 << FTRACE_EVENT_FL_TRIGGER_COND_BIT), }; -struct ftrace_event_file { +struct trace_event_file { struct list_head list; struct ftrace_event_call *event_call; struct event_filter *filter; @@ -414,15 +414,15 @@ enum event_trigger_type { extern int filter_match_preds(struct event_filter *filter, void *rec); -extern int filter_check_discard(struct ftrace_event_file *file, void *rec, +extern int filter_check_discard(struct trace_event_file *file, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event); extern int call_filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event); -extern enum event_trigger_type event_triggers_call(struct ftrace_event_file *file, +extern enum event_trigger_type event_triggers_call(struct trace_event_file *file, void *rec); -extern void event_triggers_post_call(struct ftrace_event_file *file, +extern void event_triggers_post_call(struct trace_event_file *file, enum event_trigger_type tt); /** @@ -435,7 +435,7 @@ extern void event_triggers_post_call(struct ftrace_event_file *file, * otherwise false. */ static inline bool -ftrace_trigger_soft_disabled(struct ftrace_event_file *file) +ftrace_trigger_soft_disabled(struct trace_event_file *file) { unsigned long eflags = file->flags; @@ -462,7 +462,7 @@ ftrace_trigger_soft_disabled(struct ftrace_event_file *file) * Returns true if the event is discarded, false otherwise. */ static inline bool -__event_trigger_test_discard(struct ftrace_event_file *file, +__event_trigger_test_discard(struct trace_event_file *file, struct ring_buffer *buffer, struct ring_buffer_event *event, void *entry, @@ -495,7 +495,7 @@ __event_trigger_test_discard(struct ftrace_event_file *file, * if the event is soft disabled and should be discarded. */ static inline void -event_trigger_unlock_commit(struct ftrace_event_file *file, +event_trigger_unlock_commit(struct trace_event_file *file, struct ring_buffer *buffer, struct ring_buffer_event *event, void *entry, unsigned long irq_flags, int pc) @@ -526,7 +526,7 @@ event_trigger_unlock_commit(struct ftrace_event_file *file, * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit(). */ static inline void -event_trigger_unlock_commit_regs(struct ftrace_event_file *file, +event_trigger_unlock_commit_regs(struct trace_event_file *file, struct ring_buffer *buffer, struct ring_buffer_event *event, void *entry, unsigned long irq_flags, int pc, -- cgit v1.2.3 From 2425bcb9240f8c97d793cb31c8e8d8d0a843fa29 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 5 May 2015 11:45:27 -0400 Subject: tracing: Rename ftrace_event_{call,class} to trace_event_{call,class} The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. The structures ftrace_event_call and ftrace_event_class have nothing to do with the function hooks, and are really trace_event structures. Rename ftrace_event_* to trace_event_*. Signed-off-by: Steven Rostedt --- include/linux/module.h | 2 +- include/linux/perf_event.h | 2 +- include/linux/syscalls.h | 12 ++++++------ include/linux/trace_events.h | 38 +++++++++++++++++++------------------- 4 files changed, 27 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index c883b86ea964..3e0d492682bb 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -336,7 +336,7 @@ struct module { const char **trace_bprintk_fmt_start; #endif #ifdef CONFIG_EVENT_TRACING - struct ftrace_event_call **trace_events; + struct trace_event_call **trace_events; unsigned int num_trace_events; struct trace_enum_map **trace_enums; unsigned int num_trace_enums; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 61992cf2e977..d089d6d58ae0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -483,7 +483,7 @@ struct perf_event { void *overflow_handler_context; #ifdef CONFIG_EVENT_TRACING - struct ftrace_event_call *tp_event; + struct trace_event_call *tp_event; struct event_filter *filter; #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops ftrace_ops; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 76d1e38aabe1..d8b06abb264f 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -111,14 +111,14 @@ union bpf_attr; #define __SC_STR_ADECL(t, a) #a #define __SC_STR_TDECL(t, a) #t -extern struct ftrace_event_class event_class_syscall_enter; -extern struct ftrace_event_class event_class_syscall_exit; +extern struct trace_event_class event_class_syscall_enter; +extern struct trace_event_class event_class_syscall_exit; extern struct trace_event_functions enter_syscall_print_funcs; extern struct trace_event_functions exit_syscall_print_funcs; #define SYSCALL_TRACE_ENTER_EVENT(sname) \ static struct syscall_metadata __syscall_meta_##sname; \ - static struct ftrace_event_call __used \ + static struct trace_event_call __used \ event_enter_##sname = { \ .class = &event_class_syscall_enter, \ { \ @@ -128,13 +128,13 @@ extern struct trace_event_functions exit_syscall_print_funcs; .data = (void *)&__syscall_meta_##sname,\ .flags = TRACE_EVENT_FL_CAP_ANY, \ }; \ - static struct ftrace_event_call __used \ + static struct trace_event_call __used \ __attribute__((section("_ftrace_events"))) \ *__event_enter_##sname = &event_enter_##sname; #define SYSCALL_TRACE_EXIT_EVENT(sname) \ static struct syscall_metadata __syscall_meta_##sname; \ - static struct ftrace_event_call __used \ + static struct trace_event_call __used \ event_exit_##sname = { \ .class = &event_class_syscall_exit, \ { \ @@ -144,7 +144,7 @@ extern struct trace_event_functions exit_syscall_print_funcs; .data = (void *)&__syscall_meta_##sname,\ .flags = TRACE_EVENT_FL_CAP_ANY, \ }; \ - static struct ftrace_event_call __used \ + static struct trace_event_call __used \ __attribute__((section("_ftrace_events"))) \ *__event_exit_##sname = &event_exit_##sname; diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index ae19233c7dd8..d10ab04a17b2 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -200,23 +200,23 @@ enum trace_reg { #endif }; -struct ftrace_event_call; +struct trace_event_call; -struct ftrace_event_class { +struct trace_event_class { const char *system; void *probe; #ifdef CONFIG_PERF_EVENTS void *perf_probe; #endif - int (*reg)(struct ftrace_event_call *event, + int (*reg)(struct trace_event_call *event, enum trace_reg type, void *data); - int (*define_fields)(struct ftrace_event_call *); - struct list_head *(*get_fields)(struct ftrace_event_call *); + int (*define_fields)(struct trace_event_call *); + struct list_head *(*get_fields)(struct trace_event_call *); struct list_head fields; - int (*raw_init)(struct ftrace_event_call *); + int (*raw_init)(struct trace_event_call *); }; -extern int trace_event_reg(struct ftrace_event_call *event, +extern int trace_event_reg(struct trace_event_call *event, enum trace_reg type, void *data); struct ftrace_event_buffer { @@ -269,9 +269,9 @@ enum { TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), }; -struct ftrace_event_call { +struct trace_event_call { struct list_head list; - struct ftrace_event_class *class; + struct trace_event_class *class; union { char *name; /* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */ @@ -298,13 +298,13 @@ struct ftrace_event_call { struct hlist_head __percpu *perf_events; struct bpf_prog *prog; - int (*perf_perm)(struct ftrace_event_call *, + int (*perf_perm)(struct trace_event_call *, struct perf_event *); #endif }; static inline const char * -ftrace_event_name(struct ftrace_event_call *call) +ftrace_event_name(struct trace_event_call *call) { if (call->flags & TRACE_EVENT_FL_TRACEPOINT) return call->tp ? call->tp->name : NULL; @@ -351,7 +351,7 @@ enum { struct trace_event_file { struct list_head list; - struct ftrace_event_call *event_call; + struct trace_event_call *event_call; struct event_filter *filter; struct dentry *dir; struct trace_array *tr; @@ -388,7 +388,7 @@ struct trace_event_file { early_initcall(trace_init_flags_##name); #define __TRACE_EVENT_PERF_PERM(name, expr...) \ - static int perf_perm_##name(struct ftrace_event_call *tp_event, \ + static int perf_perm_##name(struct trace_event_call *tp_event, \ struct perf_event *p_event) \ { \ return ({ expr; }); \ @@ -417,7 +417,7 @@ extern int filter_match_preds(struct event_filter *filter, void *rec); extern int filter_check_discard(struct trace_event_file *file, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event); -extern int call_filter_check_discard(struct ftrace_event_call *call, void *rec, +extern int call_filter_check_discard(struct trace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event); extern enum event_trigger_type event_triggers_call(struct trace_event_file *file, @@ -559,12 +559,12 @@ enum { FILTER_TRACE_FN, }; -extern int trace_event_raw_init(struct ftrace_event_call *call); -extern int trace_define_field(struct ftrace_event_call *call, const char *type, +extern int trace_event_raw_init(struct trace_event_call *call); +extern int trace_define_field(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, int filter_type); -extern int trace_add_event_call(struct ftrace_event_call *call); -extern int trace_remove_event_call(struct ftrace_event_call *call); +extern int trace_add_event_call(struct trace_event_call *call); +extern int trace_remove_event_call(struct trace_event_call *call); #define is_signed_type(type) (((type)(-1)) < (type)1) @@ -613,4 +613,4 @@ perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr, } #endif -#endif /* _LINUX_FTRACE_EVENT_H */ +#endif /* _LINUX_TRACE_EVENT_H */ -- cgit v1.2.3 From 3f795dcfc7364cd811c3f6f03d115fcefbbdc1ca Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 5 May 2015 13:18:46 -0400 Subject: tracing: Rename ftrace_event_buffer to trace_event_buffer. The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. The ftrace_event_buffer functions and data structures are for trace_events and not for function hooks. Rename them to trace_event_buffer*. Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index d10ab04a17b2..a1fa8ebaf684 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -219,7 +219,7 @@ struct trace_event_class { extern int trace_event_reg(struct trace_event_call *event, enum trace_reg type, void *data); -struct ftrace_event_buffer { +struct trace_event_buffer { struct ring_buffer *buffer; struct ring_buffer_event *event; struct trace_event_file *trace_file; @@ -228,11 +228,11 @@ struct ftrace_event_buffer { int pc; }; -void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, +void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, struct trace_event_file *trace_file, unsigned long len); -void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer); +void trace_event_buffer_commit(struct trace_event_buffer *fbuffer); enum { TRACE_EVENT_FL_FILTERED_BIT, -- cgit v1.2.3 From 892c505aac2bdded3c8ec2ec27abc6d74fd210f5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 5 May 2015 14:18:11 -0400 Subject: tracing: Rename ftrace_output functions to trace_output The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. The ftrace_output_*() and ftrace_raw_output_*() functions represent the trace_event code. Rename them to just trace_output or trace_raw_output. Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index a1fa8ebaf684..12ca46322a94 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -52,8 +52,8 @@ const char *trace_print_array_seq(struct trace_seq *p, struct trace_iterator; struct trace_event; -int ftrace_raw_output_prep(struct trace_iterator *iter, - struct trace_event *event); +int trace_raw_output_prep(struct trace_iterator *iter, + struct trace_event *event); /* * The trace entry - the most basic unit of tracing. This is what @@ -183,7 +183,7 @@ void trace_current_buffer_discard_commit(struct ring_buffer *buffer, void tracing_record_cmdline(struct task_struct *tsk); -int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...); +int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...); struct event_filter; -- cgit v1.2.3 From 609a74045238c303bbe9396775eacf5bac1f51cc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 13 May 2015 13:44:36 -0400 Subject: tracing: Rename FTRACE_MAX_EVENT to TRACE_EVENT_TYPE_MAX The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. Rename the max trace_event type size to something more descriptive and appropriate. Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 12ca46322a94..6f28464be418 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -68,7 +68,7 @@ struct trace_entry { int pid; }; -#define FTRACE_MAX_EVENT \ +#define TRACE_EVENT_TYPE_MAX \ ((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1) /* -- cgit v1.2.3 From 687fcc4aee4567df14e31e82d6993418b826f408 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 13 May 2015 14:20:14 -0400 Subject: tracing: Rename ftrace_event_name() to trace_event_name() The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. ftrace_event_name() returns the name of an event tracepoint, has nothing to do with function tracing. Rename it to trace_event_name(). Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 6f28464be418..15617798849c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -304,7 +304,7 @@ struct trace_event_call { }; static inline const char * -ftrace_event_name(struct trace_event_call *call) +trace_event_name(struct trace_event_call *call) { if (call->flags & TRACE_EVENT_FL_TRACEPOINT) return call->tp ? call->tp->name : NULL; -- cgit v1.2.3 From 7967b3e0c40ff72fb2cf44d3b50e2cb388ef6c67 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 13 May 2015 14:59:40 -0400 Subject: tracing: Rename struct ftrace_subsystem_dir to trace_subsystem_dir The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. The structure ftrace_subsystem_dir holds the information about trace event subsystems. It should not be named ftrace, rename it to trace_subsystem_dir. Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 15617798849c..d4ad58ec684a 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -313,7 +313,7 @@ trace_event_name(struct trace_event_call *call) } struct trace_array; -struct ftrace_subsystem_dir; +struct trace_subsystem_dir; enum { FTRACE_EVENT_FL_ENABLED_BIT, @@ -355,7 +355,7 @@ struct trace_event_file { struct event_filter *filter; struct dentry *dir; struct trace_array *tr; - struct ftrace_subsystem_dir *system; + struct trace_subsystem_dir *system; struct list_head triggers; /* -- cgit v1.2.3 From 5d6ad960a71f0b36d95d74ef93285733b9f62f59 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 13 May 2015 15:12:33 -0400 Subject: tracing: Rename FTRACE_EVENT_FL_* flags to EVENT_FILE_FL_* The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. The FTRACE_EVENT_FL_* flags are flags to do with the trace_event files in the tracefs directory. They are not related to function tracing. Rename them to a more descriptive name. Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 50 ++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index d4ad58ec684a..a46c138b2eea 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -250,11 +250,11 @@ enum { * FILTERED - The event has a filter attached * CAP_ANY - Any user can enable for perf * NO_SET_FILTER - Set when filter has error and is to be ignored - * IGNORE_ENABLE - For ftrace internal events, do not enable with debugfs file + * IGNORE_ENABLE - For trace internal events, do not enable with debugfs file * WAS_ENABLED - Set and stays set when an event was ever enabled * (used for module unloading, if a module event is enabled, * it is best to clear the buffers that used it). - * USE_CALL_FILTER - For ftrace internal events, don't use file filter + * USE_CALL_FILTER - For trace internal events, don't use file filter * TRACEPOINT - Event is a tracepoint * KPROBE - Event is a kprobe */ @@ -286,7 +286,7 @@ struct trace_event_call { * bit 0: filter_active * bit 1: allow trace by non root (cap any) * bit 2: failed to apply filter - * bit 3: ftrace internal event (do not enable) + * bit 3: trace internal event (do not enable) * bit 4: Event was enabled by module * bit 5: use call filter rather than file filter * bit 6: Event is a tracepoint @@ -316,18 +316,18 @@ struct trace_array; struct trace_subsystem_dir; enum { - FTRACE_EVENT_FL_ENABLED_BIT, - FTRACE_EVENT_FL_RECORDED_CMD_BIT, - FTRACE_EVENT_FL_FILTERED_BIT, - FTRACE_EVENT_FL_NO_SET_FILTER_BIT, - FTRACE_EVENT_FL_SOFT_MODE_BIT, - FTRACE_EVENT_FL_SOFT_DISABLED_BIT, - FTRACE_EVENT_FL_TRIGGER_MODE_BIT, - FTRACE_EVENT_FL_TRIGGER_COND_BIT, + EVENT_FILE_FL_ENABLED_BIT, + EVENT_FILE_FL_RECORDED_CMD_BIT, + EVENT_FILE_FL_FILTERED_BIT, + EVENT_FILE_FL_NO_SET_FILTER_BIT, + EVENT_FILE_FL_SOFT_MODE_BIT, + EVENT_FILE_FL_SOFT_DISABLED_BIT, + EVENT_FILE_FL_TRIGGER_MODE_BIT, + EVENT_FILE_FL_TRIGGER_COND_BIT, }; /* - * Ftrace event file flags: + * Event file flags: * ENABLED - The event is enabled * RECORDED_CMD - The comms should be recorded at sched_switch * FILTERED - The event has a filter attached @@ -339,14 +339,14 @@ enum { * TRIGGER_COND - When set, one or more triggers has an associated filter */ enum { - FTRACE_EVENT_FL_ENABLED = (1 << FTRACE_EVENT_FL_ENABLED_BIT), - FTRACE_EVENT_FL_RECORDED_CMD = (1 << FTRACE_EVENT_FL_RECORDED_CMD_BIT), - FTRACE_EVENT_FL_FILTERED = (1 << FTRACE_EVENT_FL_FILTERED_BIT), - FTRACE_EVENT_FL_NO_SET_FILTER = (1 << FTRACE_EVENT_FL_NO_SET_FILTER_BIT), - FTRACE_EVENT_FL_SOFT_MODE = (1 << FTRACE_EVENT_FL_SOFT_MODE_BIT), - FTRACE_EVENT_FL_SOFT_DISABLED = (1 << FTRACE_EVENT_FL_SOFT_DISABLED_BIT), - FTRACE_EVENT_FL_TRIGGER_MODE = (1 << FTRACE_EVENT_FL_TRIGGER_MODE_BIT), - FTRACE_EVENT_FL_TRIGGER_COND = (1 << FTRACE_EVENT_FL_TRIGGER_COND_BIT), + EVENT_FILE_FL_ENABLED = (1 << EVENT_FILE_FL_ENABLED_BIT), + EVENT_FILE_FL_RECORDED_CMD = (1 << EVENT_FILE_FL_RECORDED_CMD_BIT), + EVENT_FILE_FL_FILTERED = (1 << EVENT_FILE_FL_FILTERED_BIT), + EVENT_FILE_FL_NO_SET_FILTER = (1 << EVENT_FILE_FL_NO_SET_FILTER_BIT), + EVENT_FILE_FL_SOFT_MODE = (1 << EVENT_FILE_FL_SOFT_MODE_BIT), + EVENT_FILE_FL_SOFT_DISABLED = (1 << EVENT_FILE_FL_SOFT_DISABLED_BIT), + EVENT_FILE_FL_TRIGGER_MODE = (1 << EVENT_FILE_FL_TRIGGER_MODE_BIT), + EVENT_FILE_FL_TRIGGER_COND = (1 << EVENT_FILE_FL_TRIGGER_COND_BIT), }; struct trace_event_file { @@ -439,10 +439,10 @@ ftrace_trigger_soft_disabled(struct trace_event_file *file) { unsigned long eflags = file->flags; - if (!(eflags & FTRACE_EVENT_FL_TRIGGER_COND)) { - if (eflags & FTRACE_EVENT_FL_TRIGGER_MODE) + if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) { + if (eflags & EVENT_FILE_FL_TRIGGER_MODE) event_triggers_call(file, NULL); - if (eflags & FTRACE_EVENT_FL_SOFT_DISABLED) + if (eflags & EVENT_FILE_FL_SOFT_DISABLED) return true; } return false; @@ -470,10 +470,10 @@ __event_trigger_test_discard(struct trace_event_file *file, { unsigned long eflags = file->flags; - if (eflags & FTRACE_EVENT_FL_TRIGGER_COND) + if (eflags & EVENT_FILE_FL_TRIGGER_COND) *tt = event_triggers_call(file, entry); - if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags)) + if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags)) ring_buffer_discard_commit(buffer, event); else if (!filter_check_discard(file, entry, buffer, event)) return false; -- cgit v1.2.3 From 09a5059aa1a2cbf8c8993e61b013cc83a0dd5833 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 13 May 2015 15:21:25 -0400 Subject: tracing: Rename ftrace_trigger_soft_disabled() to trace_trigger_soft_disabled() The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. The ftrace_trigger_soft_disabled() tests if a trace_event is soft disabled (called but not traced), and returns true if it is. It has nothing to do with function tracing and should be renamed. Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index a46c138b2eea..1063c850dbab 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -426,7 +426,7 @@ extern void event_triggers_post_call(struct trace_event_file *file, enum event_trigger_type tt); /** - * ftrace_trigger_soft_disabled - do triggers and test if soft disabled + * trace_trigger_soft_disabled - do triggers and test if soft disabled * @file: The file pointer of the event to test * * If any triggers without filters are attached to this event, they @@ -435,7 +435,7 @@ extern void event_triggers_post_call(struct trace_event_file *file, * otherwise false. */ static inline bool -ftrace_trigger_soft_disabled(struct trace_event_file *file) +trace_trigger_soft_disabled(struct trace_event_file *file) { unsigned long eflags = file->flags; -- cgit v1.2.3 From 1b852bceb0d111e510d1a15826ecc4a19358d512 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 May 2015 23:22:29 -0500 Subject: mnt: Refactor the logic for mounting sysfs and proc in a user namespace Fresh mounts of proc and sysfs are a very special case that works very much like a bind mount. Unfortunately the current structure can not preserve the MNT_LOCK... mount flags. Therefore refactor the logic into a form that can be modified to preserve those lock bits. Add a new filesystem flag FS_USERNS_VISIBLE that requires some mount of the filesystem be fully visible in the current mount namespace, before the filesystem may be mounted. Move the logic for calling fs_fully_visible from proc and sysfs into fs/namespace.c where it has greater access to mount namespace state. Cc: stable@vger.kernel.org Signed-off-by: "Eric W. Biederman" --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 35ec87e490b1..2d24eeb8e59c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1897,6 +1897,7 @@ struct file_system_type { #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */ +#define FS_USERNS_VISIBLE 32 /* FS must already be visible */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); @@ -1984,7 +1985,6 @@ extern int vfs_ustat(dev_t, struct kstatfs *); extern int freeze_super(struct super_block *super); extern int thaw_super(struct super_block *super); extern bool our_mnt(struct vfsmount *mnt); -extern bool fs_fully_visible(struct file_system_type *); extern int current_umask(void); -- cgit v1.2.3 From 3f7f642b9bc46453e1435e8b67f1c4f7949be7ff Mon Sep 17 00:00:00 2001 From: Martin Fuzzey Date: Wed, 13 May 2015 12:26:42 +0200 Subject: iio: core: add high pass filter attributes Add a high pass filter attribute for measurements (like the existing low pass) Also add both high and low pass attributes for events. Signed-off-by: Martin Fuzzey Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 1 + include/linux/iio/types.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 058441da4984..f79148261d16 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -32,6 +32,7 @@ enum iio_chan_info_enum { IIO_CHAN_INFO_QUADRATURE_CORRECTION_RAW, IIO_CHAN_INFO_AVERAGE_RAW, IIO_CHAN_INFO_LOW_PASS_FILTER_3DB_FREQUENCY, + IIO_CHAN_INFO_HIGH_PASS_FILTER_3DB_FREQUENCY, IIO_CHAN_INFO_SAMP_FREQ, IIO_CHAN_INFO_FREQUENCY, IIO_CHAN_INFO_PHASE, diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index 942b6de68e2f..32b579525004 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -17,6 +17,8 @@ enum iio_event_info { IIO_EV_INFO_VALUE, IIO_EV_INFO_HYSTERESIS, IIO_EV_INFO_PERIOD, + IIO_EV_INFO_HIGH_PASS_FILTER_3DB, + IIO_EV_INFO_LOW_PASS_FILTER_3DB, }; #define IIO_VAL_INT 1 -- cgit v1.2.3 From e1031dc1f7ba5c8724ba211062134076df292791 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Thu, 7 May 2015 17:38:07 +0200 Subject: dmaengine: Support different source and destination stride In interleaved mode, we can expect to have different source and destination strides. Add support for such case to dmaengine. Signed-off-by: Maxime Ripard Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index ad419757241f..5d63acb09813 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -122,10 +122,18 @@ enum dma_transfer_direction { * chunk and before first src/dst address for next chunk. * Ignored for dst(assumed 0), if dst_inc is true and dst_sgl is false. * Ignored for src(assumed 0), if src_inc is true and src_sgl is false. + * @dst_icg: Number of bytes to jump after last dst address of this + * chunk and before the first dst address for next chunk. + * Ignored if dst_inc is true and dst_sgl is false. + * @src_icg: Number of bytes to jump after last src address of this + * chunk and before the first src address for next chunk. + * Ignored if src_inc is true and src_sgl is false. */ struct data_chunk { size_t size; size_t icg; + size_t dst_icg; + size_t src_icg; }; /** -- cgit v1.2.3 From b4a04ab7a37b490cad48e69abfe14288cacb669c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 May 2015 15:38:40 -0400 Subject: cgroup: separate out include/linux/cgroup-defs.h From 2d728f74bfc071df06773e2fd7577dd5dab6425d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 May 2015 15:37:01 -0400 This patch separates out cgroup-defs.h from cgroup.h which has grown a lot of dependencies. cgroup-defs.h currently only contains constant and type definitions and can be used to break circular include dependency. While moving, definitions are reordered so that cgroup-defs.h has consistent logical structure. This patch is pure reorganization. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 464 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/cgroup.h | 455 +------------------------------------------ 2 files changed, 466 insertions(+), 453 deletions(-) create mode 100644 include/linux/cgroup-defs.h (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h new file mode 100644 index 000000000000..55f3120fb952 --- /dev/null +++ b/include/linux/cgroup-defs.h @@ -0,0 +1,464 @@ +/* + * linux/cgroup-defs.h - basic definitions for cgroup + * + * This file provides basic type and interface. Include this file directly + * only if necessary to avoid cyclic dependencies. + */ +#ifndef _LINUX_CGROUP_DEFS_H +#define _LINUX_CGROUP_DEFS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_CGROUPS + +struct cgroup; +struct cgroup_root; +struct cgroup_subsys; +struct cgroup_taskset; +struct kernfs_node; +struct kernfs_ops; +struct kernfs_open_file; + +#define MAX_CGROUP_TYPE_NAMELEN 32 +#define MAX_CGROUP_ROOT_NAMELEN 64 +#define MAX_CFTYPE_NAME 64 + +/* define the enumeration of all cgroup subsystems */ +#define SUBSYS(_x) _x ## _cgrp_id, +enum cgroup_subsys_id { +#include + CGROUP_SUBSYS_COUNT, +}; +#undef SUBSYS + +/* bits in struct cgroup_subsys_state flags field */ +enum { + CSS_NO_REF = (1 << 0), /* no reference counting for this css */ + CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ + CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ +}; + +/* bits in struct cgroup flags field */ +enum { + /* Control Group requires release notifications to userspace */ + CGRP_NOTIFY_ON_RELEASE, + /* + * Clone the parent's configuration when creating a new child + * cpuset cgroup. For historical reasons, this option can be + * specified at mount time and thus is implemented here. + */ + CGRP_CPUSET_CLONE_CHILDREN, +}; + +/* cgroup_root->flags */ +enum { + CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */ + CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ + CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ +}; + +/* cftype->flags */ +enum { + CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ + CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ + CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ + + /* internal flags, do not use outside cgroup core proper */ + __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ + __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ +}; + +/* + * Per-subsystem/per-cgroup state maintained by the system. This is the + * fundamental structural building block that controllers deal with. + * + * Fields marked with "PI:" are public and immutable and may be accessed + * directly without synchronization. + */ +struct cgroup_subsys_state { + /* PI: the cgroup that this css is attached to */ + struct cgroup *cgroup; + + /* PI: the cgroup subsystem that this css is attached to */ + struct cgroup_subsys *ss; + + /* reference count - access via css_[try]get() and css_put() */ + struct percpu_ref refcnt; + + /* PI: the parent css */ + struct cgroup_subsys_state *parent; + + /* siblings list anchored at the parent's ->children */ + struct list_head sibling; + struct list_head children; + + /* + * PI: Subsys-unique ID. 0 is unused and root is always 1. The + * matching css can be looked up using css_from_id(). + */ + int id; + + unsigned int flags; + + /* + * Monotonically increasing unique serial number which defines a + * uniform order among all csses. It's guaranteed that all + * ->children lists are in the ascending order of ->serial_nr and + * used to allow interrupting and resuming iterations. + */ + u64 serial_nr; + + /* percpu_ref killing and RCU release */ + struct rcu_head rcu_head; + struct work_struct destroy_work; +}; + +/* + * A css_set is a structure holding pointers to a set of + * cgroup_subsys_state objects. This saves space in the task struct + * object and speeds up fork()/exit(), since a single inc/dec and a + * list_add()/del() can bump the reference count on the entire cgroup + * set for a task. + */ +struct css_set { + /* Reference count */ + atomic_t refcount; + + /* + * List running through all cgroup groups in the same hash + * slot. Protected by css_set_lock + */ + struct hlist_node hlist; + + /* + * Lists running through all tasks using this cgroup group. + * mg_tasks lists tasks which belong to this cset but are in the + * process of being migrated out or in. Protected by + * css_set_rwsem, but, during migration, once tasks are moved to + * mg_tasks, it can be read safely while holding cgroup_mutex. + */ + struct list_head tasks; + struct list_head mg_tasks; + + /* + * List of cgrp_cset_links pointing at cgroups referenced from this + * css_set. Protected by css_set_lock. + */ + struct list_head cgrp_links; + + /* the default cgroup associated with this css_set */ + struct cgroup *dfl_cgrp; + + /* + * Set of subsystem states, one for each subsystem. This array is + * immutable after creation apart from the init_css_set during + * subsystem registration (at boot time). + */ + struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; + + /* + * List of csets participating in the on-going migration either as + * source or destination. Protected by cgroup_mutex. + */ + struct list_head mg_preload_node; + struct list_head mg_node; + + /* + * If this cset is acting as the source of migration the following + * two fields are set. mg_src_cgrp is the source cgroup of the + * on-going migration and mg_dst_cset is the destination cset the + * target tasks on this cset should be migrated to. Protected by + * cgroup_mutex. + */ + struct cgroup *mg_src_cgrp; + struct css_set *mg_dst_cset; + + /* + * On the default hierarhcy, ->subsys[ssid] may point to a css + * attached to an ancestor instead of the cgroup this css_set is + * associated with. The following node is anchored at + * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to + * iterate through all css's attached to a given cgroup. + */ + struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; + + /* For RCU-protected deletion */ + struct rcu_head rcu_head; +}; + +struct cgroup { + /* self css with NULL ->ss, points back to this cgroup */ + struct cgroup_subsys_state self; + + unsigned long flags; /* "unsigned long" so bitops work */ + + /* + * idr allocated in-hierarchy ID. + * + * ID 0 is not used, the ID of the root cgroup is always 1, and a + * new cgroup will be assigned with a smallest available ID. + * + * Allocating/Removing ID must be protected by cgroup_mutex. + */ + int id; + + /* + * If this cgroup contains any tasks, it contributes one to + * populated_cnt. All children with non-zero popuplated_cnt of + * their own contribute one. The count is zero iff there's no task + * in this cgroup or its subtree. + */ + int populated_cnt; + + struct kernfs_node *kn; /* cgroup kernfs entry */ + struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ + + /* + * The bitmask of subsystems enabled on the child cgroups. + * ->subtree_control is the one configured through + * "cgroup.subtree_control" while ->child_subsys_mask is the + * effective one which may have more subsystems enabled. + * Controller knobs are made available iff it's enabled in + * ->subtree_control. + */ + unsigned int subtree_control; + unsigned int child_subsys_mask; + + /* Private pointers for each registered subsystem */ + struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; + + struct cgroup_root *root; + + /* + * List of cgrp_cset_links pointing at css_sets with tasks in this + * cgroup. Protected by css_set_lock. + */ + struct list_head cset_links; + + /* + * On the default hierarchy, a css_set for a cgroup with some + * susbsys disabled will point to css's which are associated with + * the closest ancestor which has the subsys enabled. The + * following lists all css_sets which point to this cgroup's css + * for the given subsystem. + */ + struct list_head e_csets[CGROUP_SUBSYS_COUNT]; + + /* + * list of pidlists, up to two for each namespace (one for procs, one + * for tasks); created on demand. + */ + struct list_head pidlists; + struct mutex pidlist_mutex; + + /* used to wait for offlining of csses */ + wait_queue_head_t offline_waitq; + + /* used to schedule release agent */ + struct work_struct release_agent_work; +}; + +/* + * A cgroup_root represents the root of a cgroup hierarchy, and may be + * associated with a kernfs_root to form an active hierarchy. This is + * internal to cgroup core. Don't access directly from controllers. + */ +struct cgroup_root { + struct kernfs_root *kf_root; + + /* The bitmask of subsystems attached to this hierarchy */ + unsigned int subsys_mask; + + /* Unique id for this hierarchy. */ + int hierarchy_id; + + /* The root cgroup. Root is destroyed on its release. */ + struct cgroup cgrp; + + /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ + atomic_t nr_cgrps; + + /* A list running through the active hierarchies */ + struct list_head root_list; + + /* Hierarchy-specific flags */ + unsigned int flags; + + /* IDs for cgroups in this hierarchy */ + struct idr cgroup_idr; + + /* The path to use for release notifications. */ + char release_agent_path[PATH_MAX]; + + /* The name for this hierarchy - may be empty */ + char name[MAX_CGROUP_ROOT_NAMELEN]; +}; + +/* + * struct cftype: handler definitions for cgroup control files + * + * When reading/writing to a file: + * - the cgroup to use is file->f_path.dentry->d_parent->d_fsdata + * - the 'cftype' of the file is file->f_path.dentry->d_fsdata + */ +struct cftype { + /* + * By convention, the name should begin with the name of the + * subsystem, followed by a period. Zero length string indicates + * end of cftype array. + */ + char name[MAX_CFTYPE_NAME]; + int private; + /* + * If not 0, file mode is set to this value, otherwise it will + * be figured out automatically + */ + umode_t mode; + + /* + * The maximum length of string, excluding trailing nul, that can + * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed. + */ + size_t max_write_len; + + /* CFTYPE_* flags */ + unsigned int flags; + + /* + * Fields used for internal bookkeeping. Initialized automatically + * during registration. + */ + struct cgroup_subsys *ss; /* NULL for cgroup core files */ + struct list_head node; /* anchored at ss->cfts */ + struct kernfs_ops *kf_ops; + + /* + * read_u64() is a shortcut for the common case of returning a + * single integer. Use it in place of read() + */ + u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); + /* + * read_s64() is a signed version of read_u64() + */ + s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); + + /* generic seq_file read interface */ + int (*seq_show)(struct seq_file *sf, void *v); + + /* optional ops, implement all or none */ + void *(*seq_start)(struct seq_file *sf, loff_t *ppos); + void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); + void (*seq_stop)(struct seq_file *sf, void *v); + + /* + * write_u64() is a shortcut for the common case of accepting + * a single integer (as parsed by simple_strtoull) from + * userspace. Use in place of write(); return 0 or error. + */ + int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val); + /* + * write_s64() is a signed version of write_u64() + */ + int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, + s64 val); + + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. + * Maximum write size is determined by ->max_write_len. Use + * of_css/cft() to access the associated css and cft. + */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lock_class_key lockdep_key; +#endif +}; + +/* + * Control Group subsystem type. + * See Documentation/cgroups/cgroups.txt for details + */ +struct cgroup_subsys { + struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); + int (*css_online)(struct cgroup_subsys_state *css); + void (*css_offline)(struct cgroup_subsys_state *css); + void (*css_released)(struct cgroup_subsys_state *css); + void (*css_free)(struct cgroup_subsys_state *css); + void (*css_reset)(struct cgroup_subsys_state *css); + void (*css_e_css_changed)(struct cgroup_subsys_state *css); + + int (*can_attach)(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset); + void (*cancel_attach)(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset); + void (*attach)(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset); + void (*fork)(struct task_struct *task); + void (*exit)(struct cgroup_subsys_state *css, + struct cgroup_subsys_state *old_css, + struct task_struct *task); + void (*bind)(struct cgroup_subsys_state *root_css); + + int disabled; + int early_init; + + /* + * If %false, this subsystem is properly hierarchical - + * configuration, resource accounting and restriction on a parent + * cgroup cover those of its children. If %true, hierarchy support + * is broken in some ways - some subsystems ignore hierarchy + * completely while others are only implemented half-way. + * + * It's now disallowed to create nested cgroups if the subsystem is + * broken and cgroup core will emit a warning message on such + * cases. Eventually, all subsystems will be made properly + * hierarchical and this will go away. + */ + bool broken_hierarchy; + bool warned_broken_hierarchy; + + /* the following two fields are initialized automtically during boot */ + int id; + const char *name; + + /* link to parent, protected by cgroup_lock() */ + struct cgroup_root *root; + + /* idr for css->id */ + struct idr css_idr; + + /* + * List of cftypes. Each entry is the first entry of an array + * terminated by zero length name. + */ + struct list_head cfts; + + /* + * Base cftypes which are automatically registered. The two can + * point to the same array. + */ + struct cftype *dfl_cftypes; /* for the default hierarchy */ + struct cftype *legacy_cftypes; /* for the legacy hierarchies */ + + /* + * A subsystem may depend on other subsystems. When such subsystem + * is enabled on a cgroup, the depended-upon subsystems are enabled + * together if available. Subsystems enabled due to dependency are + * not visible to userland until explicitly enabled. The following + * specifies the mask of subsystems that this one depends on. + */ + unsigned int depends_on; +}; + +#endif /* CONFIG_CGROUPS */ +#endif /* _LINUX_CGROUP_DEFS_H */ diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b9cb94c3102a..96a2ecd5aa69 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -11,23 +11,16 @@ #include #include #include -#include #include #include #include -#include -#include #include -#include #include #include -#include -#ifdef CONFIG_CGROUPS +#include -struct cgroup_root; -struct cgroup_subsys; -struct cgroup; +#ifdef CONFIG_CGROUPS extern int cgroup_init_early(void); extern int cgroup_init(void); @@ -40,66 +33,6 @@ extern int cgroupstats_build(struct cgroupstats *stats, extern int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); -/* define the enumeration of all cgroup subsystems */ -#define SUBSYS(_x) _x ## _cgrp_id, -enum cgroup_subsys_id { -#include - CGROUP_SUBSYS_COUNT, -}; -#undef SUBSYS - -/* - * Per-subsystem/per-cgroup state maintained by the system. This is the - * fundamental structural building block that controllers deal with. - * - * Fields marked with "PI:" are public and immutable and may be accessed - * directly without synchronization. - */ -struct cgroup_subsys_state { - /* PI: the cgroup that this css is attached to */ - struct cgroup *cgroup; - - /* PI: the cgroup subsystem that this css is attached to */ - struct cgroup_subsys *ss; - - /* reference count - access via css_[try]get() and css_put() */ - struct percpu_ref refcnt; - - /* PI: the parent css */ - struct cgroup_subsys_state *parent; - - /* siblings list anchored at the parent's ->children */ - struct list_head sibling; - struct list_head children; - - /* - * PI: Subsys-unique ID. 0 is unused and root is always 1. The - * matching css can be looked up using css_from_id(). - */ - int id; - - unsigned int flags; - - /* - * Monotonically increasing unique serial number which defines a - * uniform order among all csses. It's guaranteed that all - * ->children lists are in the ascending order of ->serial_nr and - * used to allow interrupting and resuming iterations. - */ - u64 serial_nr; - - /* percpu_ref killing and RCU release */ - struct rcu_head rcu_head; - struct work_struct destroy_work; -}; - -/* bits in struct cgroup_subsys_state flags field */ -enum { - CSS_NO_REF = (1 << 0), /* no reference counting for this css */ - CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ - CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ -}; - /** * css_get - obtain a reference on the specified css * @css: target css @@ -185,307 +118,6 @@ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n) percpu_ref_put_many(&css->refcnt, n); } -/* bits in struct cgroup flags field */ -enum { - /* Control Group requires release notifications to userspace */ - CGRP_NOTIFY_ON_RELEASE, - /* - * Clone the parent's configuration when creating a new child - * cpuset cgroup. For historical reasons, this option can be - * specified at mount time and thus is implemented here. - */ - CGRP_CPUSET_CLONE_CHILDREN, -}; - -struct cgroup { - /* self css with NULL ->ss, points back to this cgroup */ - struct cgroup_subsys_state self; - - unsigned long flags; /* "unsigned long" so bitops work */ - - /* - * idr allocated in-hierarchy ID. - * - * ID 0 is not used, the ID of the root cgroup is always 1, and a - * new cgroup will be assigned with a smallest available ID. - * - * Allocating/Removing ID must be protected by cgroup_mutex. - */ - int id; - - /* - * If this cgroup contains any tasks, it contributes one to - * populated_cnt. All children with non-zero popuplated_cnt of - * their own contribute one. The count is zero iff there's no task - * in this cgroup or its subtree. - */ - int populated_cnt; - - struct kernfs_node *kn; /* cgroup kernfs entry */ - struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ - - /* - * The bitmask of subsystems enabled on the child cgroups. - * ->subtree_control is the one configured through - * "cgroup.subtree_control" while ->child_subsys_mask is the - * effective one which may have more subsystems enabled. - * Controller knobs are made available iff it's enabled in - * ->subtree_control. - */ - unsigned int subtree_control; - unsigned int child_subsys_mask; - - /* Private pointers for each registered subsystem */ - struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; - - struct cgroup_root *root; - - /* - * List of cgrp_cset_links pointing at css_sets with tasks in this - * cgroup. Protected by css_set_lock. - */ - struct list_head cset_links; - - /* - * On the default hierarchy, a css_set for a cgroup with some - * susbsys disabled will point to css's which are associated with - * the closest ancestor which has the subsys enabled. The - * following lists all css_sets which point to this cgroup's css - * for the given subsystem. - */ - struct list_head e_csets[CGROUP_SUBSYS_COUNT]; - - /* - * list of pidlists, up to two for each namespace (one for procs, one - * for tasks); created on demand. - */ - struct list_head pidlists; - struct mutex pidlist_mutex; - - /* used to wait for offlining of csses */ - wait_queue_head_t offline_waitq; - - /* used to schedule release agent */ - struct work_struct release_agent_work; -}; - -#define MAX_CGROUP_ROOT_NAMELEN 64 - -/* cgroup_root->flags */ -enum { - CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */ - CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ - CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ -}; - -/* - * A cgroup_root represents the root of a cgroup hierarchy, and may be - * associated with a kernfs_root to form an active hierarchy. This is - * internal to cgroup core. Don't access directly from controllers. - */ -struct cgroup_root { - struct kernfs_root *kf_root; - - /* The bitmask of subsystems attached to this hierarchy */ - unsigned int subsys_mask; - - /* Unique id for this hierarchy. */ - int hierarchy_id; - - /* The root cgroup. Root is destroyed on its release. */ - struct cgroup cgrp; - - /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ - atomic_t nr_cgrps; - - /* A list running through the active hierarchies */ - struct list_head root_list; - - /* Hierarchy-specific flags */ - unsigned int flags; - - /* IDs for cgroups in this hierarchy */ - struct idr cgroup_idr; - - /* The path to use for release notifications. */ - char release_agent_path[PATH_MAX]; - - /* The name for this hierarchy - may be empty */ - char name[MAX_CGROUP_ROOT_NAMELEN]; -}; - -/* - * A css_set is a structure holding pointers to a set of - * cgroup_subsys_state objects. This saves space in the task struct - * object and speeds up fork()/exit(), since a single inc/dec and a - * list_add()/del() can bump the reference count on the entire cgroup - * set for a task. - */ - -struct css_set { - - /* Reference count */ - atomic_t refcount; - - /* - * List running through all cgroup groups in the same hash - * slot. Protected by css_set_lock - */ - struct hlist_node hlist; - - /* - * Lists running through all tasks using this cgroup group. - * mg_tasks lists tasks which belong to this cset but are in the - * process of being migrated out or in. Protected by - * css_set_rwsem, but, during migration, once tasks are moved to - * mg_tasks, it can be read safely while holding cgroup_mutex. - */ - struct list_head tasks; - struct list_head mg_tasks; - - /* - * List of cgrp_cset_links pointing at cgroups referenced from this - * css_set. Protected by css_set_lock. - */ - struct list_head cgrp_links; - - /* the default cgroup associated with this css_set */ - struct cgroup *dfl_cgrp; - - /* - * Set of subsystem states, one for each subsystem. This array is - * immutable after creation apart from the init_css_set during - * subsystem registration (at boot time). - */ - struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; - - /* - * List of csets participating in the on-going migration either as - * source or destination. Protected by cgroup_mutex. - */ - struct list_head mg_preload_node; - struct list_head mg_node; - - /* - * If this cset is acting as the source of migration the following - * two fields are set. mg_src_cgrp is the source cgroup of the - * on-going migration and mg_dst_cset is the destination cset the - * target tasks on this cset should be migrated to. Protected by - * cgroup_mutex. - */ - struct cgroup *mg_src_cgrp; - struct css_set *mg_dst_cset; - - /* - * On the default hierarhcy, ->subsys[ssid] may point to a css - * attached to an ancestor instead of the cgroup this css_set is - * associated with. The following node is anchored at - * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to - * iterate through all css's attached to a given cgroup. - */ - struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; - - /* For RCU-protected deletion */ - struct rcu_head rcu_head; -}; - -/* - * struct cftype: handler definitions for cgroup control files - * - * When reading/writing to a file: - * - the cgroup to use is file->f_path.dentry->d_parent->d_fsdata - * - the 'cftype' of the file is file->f_path.dentry->d_fsdata - */ - -/* cftype->flags */ -enum { - CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ - CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ - CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ - - /* internal flags, do not use outside cgroup core proper */ - __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ - __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ -}; - -#define MAX_CFTYPE_NAME 64 - -struct cftype { - /* - * By convention, the name should begin with the name of the - * subsystem, followed by a period. Zero length string indicates - * end of cftype array. - */ - char name[MAX_CFTYPE_NAME]; - int private; - /* - * If not 0, file mode is set to this value, otherwise it will - * be figured out automatically - */ - umode_t mode; - - /* - * The maximum length of string, excluding trailing nul, that can - * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed. - */ - size_t max_write_len; - - /* CFTYPE_* flags */ - unsigned int flags; - - /* - * Fields used for internal bookkeeping. Initialized automatically - * during registration. - */ - struct cgroup_subsys *ss; /* NULL for cgroup core files */ - struct list_head node; /* anchored at ss->cfts */ - struct kernfs_ops *kf_ops; - - /* - * read_u64() is a shortcut for the common case of returning a - * single integer. Use it in place of read() - */ - u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); - /* - * read_s64() is a signed version of read_u64() - */ - s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); - - /* generic seq_file read interface */ - int (*seq_show)(struct seq_file *sf, void *v); - - /* optional ops, implement all or none */ - void *(*seq_start)(struct seq_file *sf, loff_t *ppos); - void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); - void (*seq_stop)(struct seq_file *sf, void *v); - - /* - * write_u64() is a shortcut for the common case of accepting - * a single integer (as parsed by simple_strtoull) from - * userspace. Use in place of write(); return 0 or error. - */ - int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val); - /* - * write_s64() is a signed version of write_u64() - */ - int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, - s64 val); - - /* - * write() is the generic write callback which maps directly to - * kernfs write operation and overrides all other operations. - * Maximum write size is determined by ->max_write_len. Use - * of_css/cft() to access the associated css and cft. - */ - ssize_t (*write)(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lock_class_key lockdep_key; -#endif -}; - extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; @@ -612,11 +244,6 @@ int cgroup_rm_cftypes(struct cftype *cfts); bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); -/* - * Control Group taskset, used to pass around set of tasks to cgroup_subsys - * methods. - */ -struct cgroup_taskset; struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); @@ -629,84 +256,6 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); for ((task) = cgroup_taskset_first((tset)); (task); \ (task) = cgroup_taskset_next((tset))) -/* - * Control Group subsystem type. - * See Documentation/cgroups/cgroups.txt for details - */ - -struct cgroup_subsys { - struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); - int (*css_online)(struct cgroup_subsys_state *css); - void (*css_offline)(struct cgroup_subsys_state *css); - void (*css_released)(struct cgroup_subsys_state *css); - void (*css_free)(struct cgroup_subsys_state *css); - void (*css_reset)(struct cgroup_subsys_state *css); - void (*css_e_css_changed)(struct cgroup_subsys_state *css); - - int (*can_attach)(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset); - void (*cancel_attach)(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset); - void (*attach)(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset); - void (*fork)(struct task_struct *task); - void (*exit)(struct cgroup_subsys_state *css, - struct cgroup_subsys_state *old_css, - struct task_struct *task); - void (*bind)(struct cgroup_subsys_state *root_css); - - int disabled; - int early_init; - - /* - * If %false, this subsystem is properly hierarchical - - * configuration, resource accounting and restriction on a parent - * cgroup cover those of its children. If %true, hierarchy support - * is broken in some ways - some subsystems ignore hierarchy - * completely while others are only implemented half-way. - * - * It's now disallowed to create nested cgroups if the subsystem is - * broken and cgroup core will emit a warning message on such - * cases. Eventually, all subsystems will be made properly - * hierarchical and this will go away. - */ - bool broken_hierarchy; - bool warned_broken_hierarchy; - - /* the following two fields are initialized automtically during boot */ - int id; -#define MAX_CGROUP_TYPE_NAMELEN 32 - const char *name; - - /* link to parent, protected by cgroup_lock() */ - struct cgroup_root *root; - - /* idr for css->id */ - struct idr css_idr; - - /* - * List of cftypes. Each entry is the first entry of an array - * terminated by zero length name. - */ - struct list_head cfts; - - /* - * Base cftypes which are automatically registered. The two can - * point to the same array. - */ - struct cftype *dfl_cftypes; /* for the default hierarchy */ - struct cftype *legacy_cftypes; /* for the legacy hierarchies */ - - /* - * A subsystem may depend on other subsystems. When such subsystem - * is enabled on a cgroup, the depended-upon subsystems are enabled - * together if available. Subsystems enabled due to dependency are - * not visible to userland until explicitly enabled. The following - * specifies the mask of subsystems that this one depends on. - */ - unsigned int depends_on; -}; - #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; #include #undef SUBSYS -- cgit v1.2.3 From c326aa2bb2209e10df4a381801bb34ca0f923038 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 May 2015 16:24:16 -0400 Subject: cgroup: reorganize include/linux/cgroup.h From c4d440938b5e2015c70594fe6666a099c844f929 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 May 2015 16:21:40 -0400 Over time, cgroup.h grew organically and doesn't have much logical structure at this point. Separation of cgroup-defs.h in the previous patch gives us a good chance for reorganizing cgroup.h as changes to the header are likely to cause conflicts anyway. This patch reorganizes cgroup.h so that it has consistent logical grouping. This is pure reorganization. v2: Relocating #ifdef CONFIG_CGROUPS caused build failure when cgroup is disabled. Dropped. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 552 ++++++++++++++++++++++++------------------------- 1 file changed, 271 insertions(+), 281 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 96a2ecd5aa69..82319fb31cfe 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -22,16 +22,189 @@ #ifdef CONFIG_CGROUPS -extern int cgroup_init_early(void); -extern int cgroup_init(void); -extern void cgroup_fork(struct task_struct *p); -extern void cgroup_post_fork(struct task_struct *p); -extern void cgroup_exit(struct task_struct *p); -extern int cgroupstats_build(struct cgroupstats *stats, - struct dentry *dentry); +/* a css_task_iter should be treated as an opaque object */ +struct css_task_iter { + struct cgroup_subsys *ss; + + struct list_head *cset_pos; + struct list_head *cset_head; + + struct list_head *task_pos; + struct list_head *tasks_head; + struct list_head *mg_tasks_head; +}; + +extern struct cgroup_root cgrp_dfl_root; +extern struct css_set init_css_set; + +#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; +#include +#undef SUBSYS + +bool css_has_online_children(struct cgroup_subsys_state *css); +struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); +struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, + struct cgroup_subsys *ss); +struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss); + +bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); +int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); + +int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); +int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); +int cgroup_rm_cftypes(struct cftype *cfts); + +char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); +int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); +int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk); + +void cgroup_fork(struct task_struct *p); +void cgroup_post_fork(struct task_struct *p); +void cgroup_exit(struct task_struct *p); + +int cgroup_init_early(void); +int cgroup_init(void); + +/* + * Iteration helpers and macros. + */ + +struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *parent); +struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *css); +struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos); +struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *css); + +struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); +struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); + +void css_task_iter_start(struct cgroup_subsys_state *css, + struct css_task_iter *it); +struct task_struct *css_task_iter_next(struct css_task_iter *it); +void css_task_iter_end(struct css_task_iter *it); + +/** + * css_for_each_child - iterate through children of a css + * @pos: the css * to use as the loop cursor + * @parent: css whose children to walk + * + * Walk @parent's children. Must be called under rcu_read_lock(). + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. + * + * It is allowed to temporarily drop RCU read lock during iteration. The + * caller is responsible for ensuring that @pos remains accessible until + * the start of the next iteration by, for example, bumping the css refcnt. + */ +#define css_for_each_child(pos, parent) \ + for ((pos) = css_next_child(NULL, (parent)); (pos); \ + (pos) = css_next_child((pos), (parent))) + +/** + * css_for_each_descendant_pre - pre-order walk of a css's descendants + * @pos: the css * to use as the loop cursor + * @root: css whose descendants to walk + * + * Walk @root's descendants. @root is included in the iteration and the + * first node to be visited. Must be called under rcu_read_lock(). + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. + * + * For example, the following guarantees that a descendant can't escape + * state updates of its ancestors. + * + * my_online(@css) + * { + * Lock @css's parent and @css; + * Inherit state from the parent; + * Unlock both. + * } + * + * my_update_state(@css) + * { + * css_for_each_descendant_pre(@pos, @css) { + * Lock @pos; + * if (@pos == @css) + * Update @css's state; + * else + * Verify @pos is alive and inherit state from its parent; + * Unlock @pos; + * } + * } + * + * As long as the inheriting step, including checking the parent state, is + * enclosed inside @pos locking, double-locking the parent isn't necessary + * while inheriting. The state update to the parent is guaranteed to be + * visible by walking order and, as long as inheriting operations to the + * same @pos are atomic to each other, multiple updates racing each other + * still result in the correct state. It's guaranateed that at least one + * inheritance happens for any css after the latest update to its parent. + * + * If checking parent's state requires locking the parent, each inheriting + * iteration should lock and unlock both @pos->parent and @pos. + * + * Alternatively, a subsystem may choose to use a single global lock to + * synchronize ->css_online() and ->css_offline() against tree-walking + * operations. + * + * It is allowed to temporarily drop RCU read lock during iteration. The + * caller is responsible for ensuring that @pos remains accessible until + * the start of the next iteration by, for example, bumping the css refcnt. + */ +#define css_for_each_descendant_pre(pos, css) \ + for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ + (pos) = css_next_descendant_pre((pos), (css))) + +/** + * css_for_each_descendant_post - post-order walk of a css's descendants + * @pos: the css * to use as the loop cursor + * @css: css whose descendants to walk + * + * Similar to css_for_each_descendant_pre() but performs post-order + * traversal instead. @root is included in the iteration and the last + * node to be visited. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. + * + * Note that the walk visibility guarantee example described in pre-order + * walk doesn't apply the same to post-order walks. + */ +#define css_for_each_descendant_post(pos, css) \ + for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ + (pos) = css_next_descendant_post((pos), (css))) + +/** + * cgroup_taskset_for_each - iterate cgroup_taskset + * @task: the loop cursor + * @tset: taskset to iterate + */ +#define cgroup_taskset_for_each(task, tset) \ + for ((task) = cgroup_taskset_first((tset)); (task); \ + (task) = cgroup_taskset_next((tset))) -extern int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *tsk); +/* + * Inline functions. + */ /** * css_get - obtain a reference on the specified css @@ -118,8 +291,87 @@ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n) percpu_ref_put_many(&css->refcnt, n); } -extern struct cgroup_root cgrp_dfl_root; -extern struct css_set init_css_set; +/** + * task_css_set_check - obtain a task's css_set with extra access conditions + * @task: the task to obtain css_set for + * @__c: extra condition expression to be passed to rcu_dereference_check() + * + * A task's css_set is RCU protected, initialized and exited while holding + * task_lock(), and can only be modified while holding both cgroup_mutex + * and task_lock() while the task is alive. This macro verifies that the + * caller is inside proper critical section and returns @task's css_set. + * + * The caller can also specify additional allowed conditions via @__c, such + * as locks used during the cgroup_subsys::attach() methods. + */ +#ifdef CONFIG_PROVE_RCU +extern struct mutex cgroup_mutex; +extern struct rw_semaphore css_set_rwsem; +#define task_css_set_check(task, __c) \ + rcu_dereference_check((task)->cgroups, \ + lockdep_is_held(&cgroup_mutex) || \ + lockdep_is_held(&css_set_rwsem) || \ + ((task)->flags & PF_EXITING) || (__c)) +#else +#define task_css_set_check(task, __c) \ + rcu_dereference((task)->cgroups) +#endif + +/** + * task_css_check - obtain css for (task, subsys) w/ extra access conds + * @task: the target task + * @subsys_id: the target subsystem ID + * @__c: extra condition expression to be passed to rcu_dereference_check() + * + * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The + * synchronization rules are the same as task_css_set_check(). + */ +#define task_css_check(task, subsys_id, __c) \ + task_css_set_check((task), (__c))->subsys[(subsys_id)] + +/** + * task_css_set - obtain a task's css_set + * @task: the task to obtain css_set for + * + * See task_css_set_check(). + */ +static inline struct css_set *task_css_set(struct task_struct *task) +{ + return task_css_set_check(task, false); +} + +/** + * task_css - obtain css for (task, subsys) + * @task: the target task + * @subsys_id: the target subsystem ID + * + * See task_css_check(). + */ +static inline struct cgroup_subsys_state *task_css(struct task_struct *task, + int subsys_id) +{ + return task_css_check(task, subsys_id, false); +} + +/** + * task_css_is_root - test whether a task belongs to the root css + * @task: the target task + * @subsys_id: the target subsystem ID + * + * Test whether @task belongs to the root css on the specified subsystem. + * May be invoked in any context. + */ +static inline bool task_css_is_root(struct task_struct *task, int subsys_id) +{ + return task_css_check(task, subsys_id, true) == + init_css_set.subsys[subsys_id]; +} + +static inline struct cgroup *task_cgroup(struct task_struct *task, + int subsys_id) +{ + return task_css(task, subsys_id)->cgroup; +} /** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy @@ -236,284 +488,22 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); } -char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); - -int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); -int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); -int cgroup_rm_cftypes(struct cftype *cfts); - -bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); - -struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); -struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); - -/** - * cgroup_taskset_for_each - iterate cgroup_taskset - * @task: the loop cursor - * @tset: taskset to iterate - */ -#define cgroup_taskset_for_each(task, tset) \ - for ((task) = cgroup_taskset_first((tset)); (task); \ - (task) = cgroup_taskset_next((tset))) - -#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; -#include -#undef SUBSYS - -/** - * task_css_set_check - obtain a task's css_set with extra access conditions - * @task: the task to obtain css_set for - * @__c: extra condition expression to be passed to rcu_dereference_check() - * - * A task's css_set is RCU protected, initialized and exited while holding - * task_lock(), and can only be modified while holding both cgroup_mutex - * and task_lock() while the task is alive. This macro verifies that the - * caller is inside proper critical section and returns @task's css_set. - * - * The caller can also specify additional allowed conditions via @__c, such - * as locks used during the cgroup_subsys::attach() methods. - */ -#ifdef CONFIG_PROVE_RCU -extern struct mutex cgroup_mutex; -extern struct rw_semaphore css_set_rwsem; -#define task_css_set_check(task, __c) \ - rcu_dereference_check((task)->cgroups, \ - lockdep_is_held(&cgroup_mutex) || \ - lockdep_is_held(&css_set_rwsem) || \ - ((task)->flags & PF_EXITING) || (__c)) -#else -#define task_css_set_check(task, __c) \ - rcu_dereference((task)->cgroups) -#endif - -/** - * task_css_check - obtain css for (task, subsys) w/ extra access conds - * @task: the target task - * @subsys_id: the target subsystem ID - * @__c: extra condition expression to be passed to rcu_dereference_check() - * - * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The - * synchronization rules are the same as task_css_set_check(). - */ -#define task_css_check(task, subsys_id, __c) \ - task_css_set_check((task), (__c))->subsys[(subsys_id)] - -/** - * task_css_set - obtain a task's css_set - * @task: the task to obtain css_set for - * - * See task_css_set_check(). - */ -static inline struct css_set *task_css_set(struct task_struct *task) -{ - return task_css_set_check(task, false); -} - -/** - * task_css - obtain css for (task, subsys) - * @task: the target task - * @subsys_id: the target subsystem ID - * - * See task_css_check(). - */ -static inline struct cgroup_subsys_state *task_css(struct task_struct *task, - int subsys_id) -{ - return task_css_check(task, subsys_id, false); -} - -/** - * task_css_is_root - test whether a task belongs to the root css - * @task: the target task - * @subsys_id: the target subsystem ID - * - * Test whether @task belongs to the root css on the specified subsystem. - * May be invoked in any context. - */ -static inline bool task_css_is_root(struct task_struct *task, int subsys_id) -{ - return task_css_check(task, subsys_id, true) == - init_css_set.subsys[subsys_id]; -} - -static inline struct cgroup *task_cgroup(struct task_struct *task, - int subsys_id) -{ - return task_css(task, subsys_id)->cgroup; -} - -struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, - struct cgroup_subsys_state *parent); - -struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); - -/** - * css_for_each_child - iterate through children of a css - * @pos: the css * to use as the loop cursor - * @parent: css whose children to walk - * - * Walk @parent's children. Must be called under rcu_read_lock(). - * - * If a subsystem synchronizes ->css_online() and the start of iteration, a - * css which finished ->css_online() is guaranteed to be visible in the - * future iterations and will stay visible until the last reference is put. - * A css which hasn't finished ->css_online() or already finished - * ->css_offline() may show up during traversal. It's each subsystem's - * responsibility to synchronize against on/offlining. - * - * It is allowed to temporarily drop RCU read lock during iteration. The - * caller is responsible for ensuring that @pos remains accessible until - * the start of the next iteration by, for example, bumping the css refcnt. - */ -#define css_for_each_child(pos, parent) \ - for ((pos) = css_next_child(NULL, (parent)); (pos); \ - (pos) = css_next_child((pos), (parent))) - -struct cgroup_subsys_state * -css_next_descendant_pre(struct cgroup_subsys_state *pos, - struct cgroup_subsys_state *css); - -struct cgroup_subsys_state * -css_rightmost_descendant(struct cgroup_subsys_state *pos); - -/** - * css_for_each_descendant_pre - pre-order walk of a css's descendants - * @pos: the css * to use as the loop cursor - * @root: css whose descendants to walk - * - * Walk @root's descendants. @root is included in the iteration and the - * first node to be visited. Must be called under rcu_read_lock(). - * - * If a subsystem synchronizes ->css_online() and the start of iteration, a - * css which finished ->css_online() is guaranteed to be visible in the - * future iterations and will stay visible until the last reference is put. - * A css which hasn't finished ->css_online() or already finished - * ->css_offline() may show up during traversal. It's each subsystem's - * responsibility to synchronize against on/offlining. - * - * For example, the following guarantees that a descendant can't escape - * state updates of its ancestors. - * - * my_online(@css) - * { - * Lock @css's parent and @css; - * Inherit state from the parent; - * Unlock both. - * } - * - * my_update_state(@css) - * { - * css_for_each_descendant_pre(@pos, @css) { - * Lock @pos; - * if (@pos == @css) - * Update @css's state; - * else - * Verify @pos is alive and inherit state from its parent; - * Unlock @pos; - * } - * } - * - * As long as the inheriting step, including checking the parent state, is - * enclosed inside @pos locking, double-locking the parent isn't necessary - * while inheriting. The state update to the parent is guaranteed to be - * visible by walking order and, as long as inheriting operations to the - * same @pos are atomic to each other, multiple updates racing each other - * still result in the correct state. It's guaranateed that at least one - * inheritance happens for any css after the latest update to its parent. - * - * If checking parent's state requires locking the parent, each inheriting - * iteration should lock and unlock both @pos->parent and @pos. - * - * Alternatively, a subsystem may choose to use a single global lock to - * synchronize ->css_online() and ->css_offline() against tree-walking - * operations. - * - * It is allowed to temporarily drop RCU read lock during iteration. The - * caller is responsible for ensuring that @pos remains accessible until - * the start of the next iteration by, for example, bumping the css refcnt. - */ -#define css_for_each_descendant_pre(pos, css) \ - for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ - (pos) = css_next_descendant_pre((pos), (css))) - -struct cgroup_subsys_state * -css_next_descendant_post(struct cgroup_subsys_state *pos, - struct cgroup_subsys_state *css); - -/** - * css_for_each_descendant_post - post-order walk of a css's descendants - * @pos: the css * to use as the loop cursor - * @css: css whose descendants to walk - * - * Similar to css_for_each_descendant_pre() but performs post-order - * traversal instead. @root is included in the iteration and the last - * node to be visited. - * - * If a subsystem synchronizes ->css_online() and the start of iteration, a - * css which finished ->css_online() is guaranteed to be visible in the - * future iterations and will stay visible until the last reference is put. - * A css which hasn't finished ->css_online() or already finished - * ->css_offline() may show up during traversal. It's each subsystem's - * responsibility to synchronize against on/offlining. - * - * Note that the walk visibility guarantee example described in pre-order - * walk doesn't apply the same to post-order walks. - */ -#define css_for_each_descendant_post(pos, css) \ - for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ - (pos) = css_next_descendant_post((pos), (css))) - -bool css_has_online_children(struct cgroup_subsys_state *css); - -/* A css_task_iter should be treated as an opaque object */ -struct css_task_iter { - struct cgroup_subsys *ss; - - struct list_head *cset_pos; - struct list_head *cset_head; - - struct list_head *task_pos; - struct list_head *tasks_head; - struct list_head *mg_tasks_head; -}; - -void css_task_iter_start(struct cgroup_subsys_state *css, - struct css_task_iter *it); -struct task_struct *css_task_iter_next(struct css_task_iter *it); -void css_task_iter_end(struct css_task_iter *it); - -int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); -int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); - -struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, - struct cgroup_subsys *ss); -struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss); - #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; -static inline int cgroup_init_early(void) { return 0; } -static inline int cgroup_init(void) { return 0; } +static inline void css_put(struct cgroup_subsys_state *css) {} +static inline int cgroup_attach_task_all(struct task_struct *from, + struct task_struct *t) { return 0; } +static inline int cgroupstats_build(struct cgroupstats *stats, + struct dentry *dentry) { return -EINVAL; } + static inline void cgroup_fork(struct task_struct *p) {} static inline void cgroup_post_fork(struct task_struct *p) {} static inline void cgroup_exit(struct task_struct *p) {} -static inline int cgroupstats_build(struct cgroupstats *stats, - struct dentry *dentry) -{ - return -EINVAL; -} - -static inline void css_put(struct cgroup_subsys_state *css) {} - -/* No cgroups - nothing to do */ -static inline int cgroup_attach_task_all(struct task_struct *from, - struct task_struct *t) -{ - return 0; -} +static inline int cgroup_init_early(void) { return 0; } +static inline int cgroup_init(void) { return 0; } #endif /* !CONFIG_CGROUPS */ -- cgit v1.2.3 From f03123783d4e43cd59df58e23e963136e04f8280 Mon Sep 17 00:00:00 2001 From: Ramakrishna Pallala Date: Thu, 30 Apr 2015 20:44:45 +0530 Subject: extcon: axp288: Add axp288 extcon driver support This patch adds the extcon support for AXP288 PMIC which has the BC1.2 charger detection capability. Additionally it also adds the USB mux switching support b/w SOC and PMIC based on GPIO control. Signed-off-by: Ramakrishna Pallala Acked-by: Lee Jones [cw00.choi: Modify the log message to keep the consistent log message pattern] Signed-off-by: Chanwoo Choi --- include/linux/mfd/axp20x.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h index dfabd6db7ddf..4ed8071d062e 100644 --- a/include/linux/mfd/axp20x.h +++ b/include/linux/mfd/axp20x.h @@ -275,4 +275,9 @@ struct axp20x_fg_pdata { int thermistor_curve[MAX_THERM_CURVE_SIZE][2]; }; +struct axp288_extcon_pdata { + /* GPIO pin control to switch D+/D- lines b/w PMIC and SOC */ + struct gpio_desc *gpio_mux_cntl; +}; + #endif /* __LINUX_MFD_AXP20X_H */ -- cgit v1.2.3 From 707d7550875a9bd245ce5f1077d2d2cb44eab218 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Wed, 15 Apr 2015 13:57:51 +0900 Subject: extcon: Add extcon_get_edev_name() API to get the extcon device name This patch adds the extcon_get_edev_name() API to get the name of extcon device because all information inclued in the structure extcon_dev should be accessed by extcon core API instead of directly accessing the data. Signed-off-by: Chanwoo Choi --- include/linux/extcon.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/extcon.h b/include/linux/extcon.h index 36f49c405dfb..e2cf6254c86a 100644 --- a/include/linux/extcon.h +++ b/include/linux/extcon.h @@ -259,6 +259,10 @@ extern int extcon_unregister_notifier(struct extcon_dev *edev, * This function use phandle of devicetree to get extcon device directly. */ extern struct extcon_dev *extcon_get_edev_by_phandle(struct device *dev, int index); + +/* Following API to get information of extcon device */ +extern const char *extcon_get_edev_name(struct extcon_dev *edev); + #else /* CONFIG_EXTCON */ static inline int extcon_dev_register(struct extcon_dev *edev) { -- cgit v1.2.3 From b9ec23c08a0274d31ee626f14b359563ea0cae46 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Fri, 24 Apr 2015 14:48:52 +0900 Subject: extcon: Fix the checkpatch warning and minor coding style issue This patch clean up the extcon core driver by fixing the checkpatch warning and minor coding style issue. Signed-off-by: Chanwoo Choi --- include/linux/extcon.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/extcon.h b/include/linux/extcon.h index e2cf6254c86a..799474d9dc48 100644 --- a/include/linux/extcon.h +++ b/include/linux/extcon.h @@ -97,8 +97,8 @@ struct extcon_cable; * @state: Attach/detach state of this extcon. Do not provide at * register-time. * @nh: Notifier for the state change events from this extcon - * @entry: To support list of extcon devices so that users can search - * for extcon devices based on the extcon name. + * @entry: To support list of extcon devices so that users can + * search for extcon devices based on the extcon name. * @lock: * @max_supported: Internal value to store the number of cables. * @extcon_dev_type: Device_type struct to provide attribute_groups @@ -258,7 +258,8 @@ extern int extcon_unregister_notifier(struct extcon_dev *edev, * Following API get the extcon device from devicetree. * This function use phandle of devicetree to get extcon device directly. */ -extern struct extcon_dev *extcon_get_edev_by_phandle(struct device *dev, int index); +extern struct extcon_dev *extcon_get_edev_by_phandle(struct device *dev, + int index); /* Following API to get information of extcon device */ extern const char *extcon_get_edev_name(struct extcon_dev *edev); -- cgit v1.2.3 From 9e86b2ad4c11fd52ee8133abce7a29e0b32d29a7 Mon Sep 17 00:00:00 2001 From: Inha Song Date: Mon, 4 May 2015 13:42:13 +0900 Subject: extcon: arizona: Add support for select accessory detect mode when headphone detection This patch add support for select accessory detect mode to HPDETL or HPDETR. Arizona provides a headphone detection circuit on the HPDETL and HPDETR pins to measure the impedance of an external load connected to the headphone. Depending on board design, headphone detect pins can change to HPDETR or HPDETL. Signed-off-by: Inha Song Acked-by: Lee Jones Acked-by: Charles Keepax Signed-off-by: Chanwoo Choi --- include/linux/mfd/arizona/pdata.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h index 1789cb0f4f17..aa5c48b06755 100644 --- a/include/linux/mfd/arizona/pdata.h +++ b/include/linux/mfd/arizona/pdata.h @@ -121,6 +121,9 @@ struct arizona_pdata { /** GPIO used for mic isolation with HPDET */ int hpdet_id_gpio; + /** Channel to use for headphone detection */ + unsigned int hpdet_channel; + /** Extra debounce timeout used during initial mic detection (ms) */ int micd_detect_debounce; -- cgit v1.2.3 From 4ecd4fef3a074c8bb43c391a57742c422469ebbd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 May 2015 09:38:13 +0200 Subject: block: use an atomic_t for mq_freeze_depth lockdep gets unhappy about the not disabling irqs when using the queue_lock around it. Instead of trying to fix that up just switch to an atomic_t and get rid of the lock. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2da818a48097..bc917956a6d0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -444,7 +444,7 @@ struct request_queue { struct mutex sysfs_lock; int bypass_depth; - int mq_freeze_depth; + atomic_t mq_freeze_depth; #if defined(CONFIG_BLK_DEV_BSG) bsg_job_fn *bsg_job_fn; -- cgit v1.2.3 From b25de9d6da49b1a8760a89672283128aa8c78345 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2015 21:41:01 +0200 Subject: block: remove BIO_EOPNOTSUPP Since the big barrier rewrite/removal in 2007 we never fail FLUSH or FUA requests, which means we can remove the magic BIO_EOPNOTSUPP flag to help propagating those to the buffer_head layer. Signed-off-by: Christoph Hellwig Reviewed-by: Jeff Moyer Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 93d2e7153816..daf95915d104 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -118,7 +118,6 @@ struct bio { #define BIO_CLONED 4 /* doesn't own data */ #define BIO_BOUNCED 5 /* bio is a bounce bio */ #define BIO_USER_MAPPED 6 /* contains user pages */ -#define BIO_EOPNOTSUPP 7 /* not supported */ #define BIO_NULL_MAPPED 8 /* contains invalid user pages */ #define BIO_QUIET 9 /* Make BIO Quiet */ #define BIO_SNAP_STABLE 10 /* bio data must be snapshotted during write */ -- cgit v1.2.3 From 97ca223c3b37ed12a5b67a5dc6247e5a4799d337 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2015 21:41:02 +0200 Subject: block: remove unused BIO_RW_BLOCK and BIO_EOF flags Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index daf95915d104..09c7a2cd48ef 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -112,8 +112,6 @@ struct bio { * bio flags */ #define BIO_UPTODATE 0 /* ok after I/O completion */ -#define BIO_RW_BLOCK 1 /* RW_AHEAD set, and read/write would block */ -#define BIO_EOF 2 /* out-out-bounds error */ #define BIO_SEG_VALID 3 /* bi_phys_segments valid */ #define BIO_CLONED 4 /* doesn't own data */ #define BIO_BOUNCED 5 /* bio is a bounce bio */ -- cgit v1.2.3 From b2dbe0a60f1bcf4db5c701f1577b3135c3159eb5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 May 2015 09:18:28 -0600 Subject: block: collapse bio bit space Various previous patches removed bits and left holes, collapse them all. Leave the reset start bit where it is, we don't need to change that. Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 09c7a2cd48ef..3f4ded0b1a34 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -112,15 +112,15 @@ struct bio { * bio flags */ #define BIO_UPTODATE 0 /* ok after I/O completion */ -#define BIO_SEG_VALID 3 /* bi_phys_segments valid */ -#define BIO_CLONED 4 /* doesn't own data */ -#define BIO_BOUNCED 5 /* bio is a bounce bio */ -#define BIO_USER_MAPPED 6 /* contains user pages */ -#define BIO_NULL_MAPPED 8 /* contains invalid user pages */ -#define BIO_QUIET 9 /* Make BIO Quiet */ -#define BIO_SNAP_STABLE 10 /* bio data must be snapshotted during write */ -#define BIO_CHAIN 11 /* chained bio, ->bi_remaining in effect */ -#define BIO_REFFED 12 /* bio has elevated ->bi_cnt */ +#define BIO_SEG_VALID 1 /* bi_phys_segments valid */ +#define BIO_CLONED 2 /* doesn't own data */ +#define BIO_BOUNCED 3 /* bio is a bounce bio */ +#define BIO_USER_MAPPED 4 /* contains user pages */ +#define BIO_NULL_MAPPED 5 /* contains invalid user pages */ +#define BIO_QUIET 6 /* Make BIO Quiet */ +#define BIO_SNAP_STABLE 7 /* bio data must be snapshotted during write */ +#define BIO_CHAIN 8 /* chained bio, ->bi_remaining in effect */ +#define BIO_REFFED 9 /* bio has elevated ->bi_cnt */ /* * Flags starting here get preserved by bio_reset() - this includes -- cgit v1.2.3 From 343df3c79c62b644ce6ff5dff96c9e0be1ecb242 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2015 09:23:23 +0200 Subject: suspend: simplify block I/O handling Stop abusing struct page functionality and the swap end_io handler, and instead add a modified version of the blk-lib.c bio_batch helpers. Also move the block I/O code into swap.c as they are directly tied into each other. Signed-off-by: Christoph Hellwig Tested-by: Pavel Machek Tested-by: Ming Lin Acked-by: Pavel Machek Acked-by: Rafael J. Wysocki Signed-off-by: Jens Axboe --- include/linux/swap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index cee108cbe2d5..38874729dc5f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -377,7 +377,6 @@ extern void end_swap_bio_write(struct bio *bio, int err); extern int __swap_writepage(struct page *page, struct writeback_control *wbc, void (*end_write_func)(struct bio *, int)); extern int swap_set_page_dirty(struct page *page); -extern void end_swap_bio_read(struct bio *bio, int err); int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block); -- cgit v1.2.3 From ecc8617053e0a97272ef2eee138809f30080e84b Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Mon, 30 Mar 2015 16:20:03 -0700 Subject: module: add extra argument for parse_params() callback This adds an extra argument onto parse_params() to be used as a way to make the unused callback a bit more useful and generic by allowing the caller to pass on a data structure of its choice. An example use case is to allow us to easily make module parameters for every module which we will do next. @ parse @ identifier name, args, params, num, level_min, level_max; identifier unknown, param, val, doing; type s16; @@ extern char *parse_args(const char *name, char *args, const struct kernel_param *params, unsigned num, s16 level_min, s16 level_max, + void *arg, int (*unknown)(char *param, char *val, const char *doing + , void *arg )); @ parse_mod @ identifier name, args, params, num, level_min, level_max; identifier unknown, param, val, doing; type s16; @@ char *parse_args(const char *name, char *args, const struct kernel_param *params, unsigned num, s16 level_min, s16 level_max, + void *arg, int (*unknown)(char *param, char *val, const char *doing + , void *arg )) { ... } @ parse_args_found @ expression R, E1, E2, E3, E4, E5, E6; identifier func; @@ ( R = parse_args(E1, E2, E3, E4, E5, E6, + NULL, func); | R = parse_args(E1, E2, E3, E4, E5, E6, + NULL, &func); | R = parse_args(E1, E2, E3, E4, E5, E6, + NULL, NULL); | parse_args(E1, E2, E3, E4, E5, E6, + NULL, func); | parse_args(E1, E2, E3, E4, E5, E6, + NULL, &func); | parse_args(E1, E2, E3, E4, E5, E6, + NULL, NULL); ) @ parse_args_unused depends on parse_args_found @ identifier parse_args_found.func; @@ int func(char *param, char *val, const char *unused + , void *arg ) { ... } @ mod_unused depends on parse_args_found @ identifier parse_args_found.func; expression A1, A2, A3; @@ - func(A1, A2, A3); + func(A1, A2, A3, NULL); Generated-by: Coccinelle SmPL Cc: cocci@systeme.lip6.fr Cc: Tejun Heo Cc: Arjan van de Ven Cc: Greg Kroah-Hartman Cc: Rusty Russell Cc: Christoph Hellwig Cc: Felipe Contreras Cc: Ewan Milne Cc: Jean Delvare Cc: Hannes Reinecke Cc: Jani Nikula Cc: linux-kernel@vger.kernel.org Reviewed-by: Tejun Heo Acked-by: Rusty Russell Signed-off-by: Luis R. Rodriguez Signed-off-by: Greg Kroah-Hartman --- include/linux/moduleparam.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 1c9effa25e26..13923709d30d 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -357,8 +357,9 @@ extern char *parse_args(const char *name, unsigned num, s16 level_min, s16 level_max, + void *arg, int (*unknown)(char *param, char *val, - const char *doing)); + const char *doing, void *arg)); /* Called by module remove. */ #ifdef CONFIG_SYSFS -- cgit v1.2.3 From 765230b5f084863183aa8adb3405ab3f32c0b16e Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 30 Mar 2015 16:20:04 -0700 Subject: driver-core: add asynchronous probing support for drivers Some devices take a long time when initializing, and not all drivers are suited to initialize their devices when they are open. For example, input drivers need to interrogate their devices in order to publish device's capabilities before userspace will open them. When such drivers are compiled into kernel they may stall entire kernel initialization. This change allows drivers request for their probe functions to be called asynchronously during driver and device registration (manual binding is still synchronous). Because async_schedule is used to perform asynchronous calls module loading will still wait for the probing to complete. Note that the end goal is to make the probing asynchronous by default, so annotating drivers with PROBE_PREFER_ASYNCHRONOUS is a temporary measure that allows us to speed up boot process while we validating and fixing the rest of the drivers and preparing userspace. This change is based on earlier patch by "Luis R. Rodriguez" Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 6558af90c8fe..7857b46c548b 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -195,6 +195,31 @@ extern int bus_unregister_notifier(struct bus_type *bus, extern struct kset *bus_get_kset(struct bus_type *bus); extern struct klist *bus_get_device_klist(struct bus_type *bus); +/** + * enum probe_type - device driver probe type to try + * Device drivers may opt in for special handling of their + * respective probe routines. This tells the core what to + * expect and prefer. + * + * @PROBE_SYNCHRONOUS: Default. Drivers expect their probe routines + * to run synchronously with driver and device registration + * (with the exception of -EPROBE_DEFER handling - re-probing + * always ends up being done asynchronously). + * @PROBE_PREFER_ASYNCHRONOUS: Drivers for "slow" devices which + * probing order is not essential for booting the system may + * opt into executing their probes asynchronously. + * + * Note that the end goal is to switch the kernel to use asynchronous + * probing by default, so annotating drivers with + * %PROBE_PREFER_ASYNCHRONOUS is a temporary measure that allows us + * to speed up boot process while we are validating the rest of the + * drivers. + */ +enum probe_type { + PROBE_SYNCHRONOUS, + PROBE_PREFER_ASYNCHRONOUS, +}; + /** * struct device_driver - The basic device driver structure * @name: Name of the device driver. @@ -202,6 +227,7 @@ extern struct klist *bus_get_device_klist(struct bus_type *bus); * @owner: The module owner. * @mod_name: Used for built-in modules. * @suppress_bind_attrs: Disables bind/unbind via sysfs. + * @probe_type: Type of the probe (synchronous or asynchronous) to use. * @of_match_table: The open firmware table. * @acpi_match_table: The ACPI match table. * @probe: Called to query the existence of a specific device, @@ -235,6 +261,7 @@ struct device_driver { const char *mod_name; /* used for built-in modules */ bool suppress_bind_attrs; /* disables bind/unbind via sysfs */ + enum probe_type probe_type; const struct of_device_id *of_match_table; const struct acpi_device_id *acpi_match_table; @@ -975,6 +1002,7 @@ extern int __must_check device_bind_driver(struct device *dev); extern void device_release_driver(struct device *dev); extern int __must_check device_attach(struct device *dev); extern int __must_check driver_attach(struct device_driver *drv); +extern void device_initial_probe(struct device *dev); extern int __must_check device_reprobe(struct device *dev); /* -- cgit v1.2.3 From f2411da746985e60d4d087f3a43e271c61785927 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Mon, 30 Mar 2015 16:20:05 -0700 Subject: driver-core: add driver module asynchronous probe support Some init systems may wish to express the desire to have device drivers run their probe() code asynchronously. This implements support for this and allows userspace to request async probe as a preference through a generic shared device driver module parameter, async_probe. Implementation for async probe is supported through a module parameter given that since synchronous probe has been prevalent for years some userspace might exist which relies on the fact that the device driver will probe synchronously and the assumption that devices it provides will be immediately available after this. Signed-off-by: Luis R. Rodriguez Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 8 +++++--- include/linux/module.h | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 7857b46c548b..77b7cd9e5467 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -201,10 +201,12 @@ extern struct klist *bus_get_device_klist(struct bus_type *bus); * respective probe routines. This tells the core what to * expect and prefer. * - * @PROBE_SYNCHRONOUS: Default. Drivers expect their probe routines + * @PROBE_DEFAULT_STRATEGY: Drivers expect their probe routines * to run synchronously with driver and device registration * (with the exception of -EPROBE_DEFER handling - re-probing - * always ends up being done asynchronously). + * always ends up being done asynchronously) unless user + * explicitly requested asynchronous probing via module + * parameter. * @PROBE_PREFER_ASYNCHRONOUS: Drivers for "slow" devices which * probing order is not essential for booting the system may * opt into executing their probes asynchronously. @@ -216,7 +218,7 @@ extern struct klist *bus_get_device_klist(struct bus_type *bus); * drivers. */ enum probe_type { - PROBE_SYNCHRONOUS, + PROBE_DEFAULT_STRATEGY, PROBE_PREFER_ASYNCHRONOUS, }; diff --git a/include/linux/module.h b/include/linux/module.h index c883b86ea964..f46a47d3c0dc 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -257,6 +257,8 @@ struct module { bool sig_ok; #endif + bool async_probe_requested; + /* symbols that will be GPL-only in the near future. */ const struct kernel_symbol *gpl_future_syms; const unsigned long *gpl_future_crcs; -- cgit v1.2.3 From d173a137c5bd95ee29d02705e5fa8890ef149718 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Mon, 30 Mar 2015 16:20:06 -0700 Subject: driver-core: enable drivers to opt-out of async probe There are drivers that can not be probed asynchronously. One such group is platform drivers registered with platform_driver_probe(), which expects driver's probe routine be discarded after the driver has been registered and initial binding attempt executed. Also platform_driver_probe() an error when no devices were bound to the driver, allowing failing to load such driver module altogether. Other drivers do not work well with asynchronous probing because of driver bug or not optimal driver organization. To allow using such drivers even when user requests asynchronous probing as default boot strategy, let's allow them to opt out. Signed-off-by: Luis R. Rodriguez Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 77b7cd9e5467..00ac57c26615 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -201,15 +201,15 @@ extern struct klist *bus_get_device_klist(struct bus_type *bus); * respective probe routines. This tells the core what to * expect and prefer. * - * @PROBE_DEFAULT_STRATEGY: Drivers expect their probe routines - * to run synchronously with driver and device registration - * (with the exception of -EPROBE_DEFER handling - re-probing - * always ends up being done asynchronously) unless user - * explicitly requested asynchronous probing via module - * parameter. + * @PROBE_DEFAULT_STRATEGY: Used by drivers that work equally well + * whether probed synchronously or asynchronously. * @PROBE_PREFER_ASYNCHRONOUS: Drivers for "slow" devices which * probing order is not essential for booting the system may * opt into executing their probes asynchronously. + * @PROBE_FORCE_SYNCHRONOUS: Use this to annotate drivers that need + * their probe routines to run synchronously with driver and + * device registration (with the exception of -EPROBE_DEFER + * handling - re-probing always ends up being done asynchronously). * * Note that the end goal is to switch the kernel to use asynchronous * probing by default, so annotating drivers with @@ -220,6 +220,7 @@ extern struct klist *bus_get_device_klist(struct bus_type *bus); enum probe_type { PROBE_DEFAULT_STRATEGY, PROBE_PREFER_ASYNCHRONOUS, + PROBE_FORCE_SYNCHRONOUS, }; /** -- cgit v1.2.3 From ec0ccc16a09fc32f7142ef3ddf1c2276fbbb35d0 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 30 Mar 2015 16:20:09 -0700 Subject: module: add core_param_unsafe Similarly to module_param_unsafe(), add the helper to be used by core code wishing to expose unsafe debugging or testing parameters that taint the kernel when set. Acked-by: Rusty Russell Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- include/linux/moduleparam.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 13923709d30d..6480dcaca275 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -310,6 +310,15 @@ static inline void __kernel_param_unlock(void) #define core_param(name, var, type, perm) \ param_check_##type(name, &(var)); \ __module_param_call("", name, ¶m_ops_##type, &var, perm, -1, 0) + +/** + * core_param_unsafe - same as core_param but taints kernel + */ +#define core_param_unsafe(name, var, type, perm) \ + param_check_##type(name, &(var)); \ + __module_param_call("", name, ¶m_ops_##type, &var, perm, \ + -1, KERNEL_PARAM_FL_UNSAFE) + #endif /* !MODULE */ /** -- cgit v1.2.3 From 985eba3aba77a997f65109e1418837b1d8b8512a Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 3 Dec 2014 16:46:14 +0100 Subject: mfd: syscon: Add Atmel MC (Memory Controller) registers definition The at91rm9200 SoC embeds a Memory Controller block which is used to configure several aspects of the platform: - AHB/APB Bus behavior - SDRAM Controller - EBI (External Bus Interface) and SMC (Static Memory Controller) config Those registers might be accessed by different drivers, hence we need to define it as a syscon device. Signed-off-by: Boris Brezillon Signed-off-by: Alexandre Belloni Acked-by: Lee Jones Acked-by: Nicolas Ferre --- include/linux/mfd/syscon/atmel-mc.h | 144 ++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 include/linux/mfd/syscon/atmel-mc.h (limited to 'include/linux') diff --git a/include/linux/mfd/syscon/atmel-mc.h b/include/linux/mfd/syscon/atmel-mc.h new file mode 100644 index 000000000000..afd9b8f1e363 --- /dev/null +++ b/include/linux/mfd/syscon/atmel-mc.h @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2005 Ivan Kokshaysky + * Copyright (C) SAN People + * + * Memory Controllers (MC, EBI, SMC, SDRAMC, BFC) - System peripherals + * registers. + * Based on AT91RM9200 datasheet revision E. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef _LINUX_MFD_SYSCON_ATMEL_MC_H_ +#define _LINUX_MFD_SYSCON_ATMEL_MC_H_ + +/* Memory Controller */ +#define AT91_MC_RCR 0x00 +#define AT91_MC_RCB BIT(0) + +#define AT91_MC_ASR 0x04 +#define AT91_MC_UNADD BIT(0) +#define AT91_MC_MISADD BIT(1) +#define AT91_MC_ABTSZ GENMASK(9, 8) +#define AT91_MC_ABTSZ_BYTE (0 << 8) +#define AT91_MC_ABTSZ_HALFWORD (1 << 8) +#define AT91_MC_ABTSZ_WORD (2 << 8) +#define AT91_MC_ABTTYP GENMASK(11, 10) +#define AT91_MC_ABTTYP_DATAREAD (0 << 10) +#define AT91_MC_ABTTYP_DATAWRITE (1 << 10) +#define AT91_MC_ABTTYP_FETCH (2 << 10) +#define AT91_MC_MST(n) BIT(16 + (n)) +#define AT91_MC_SVMST(n) BIT(24 + (n)) + +#define AT91_MC_AASR 0x08 + +#define AT91_MC_MPR 0x0c +#define AT91_MPR_MSTP(n) GENMASK(2 + ((x) * 4), ((x) * 4)) + +/* External Bus Interface (EBI) registers */ +#define AT91_MC_EBI_CSA 0x60 +#define AT91_MC_EBI_CS(n) BIT(x) +#define AT91_MC_EBI_NUM_CS 8 + +#define AT91_MC_EBI_CFGR 0x64 +#define AT91_MC_EBI_DBPUC BIT(0) + +/* Static Memory Controller (SMC) registers */ +#define AT91_MC_SMC_CSR(n) (0x70 + ((n) * 4)) +#define AT91_MC_SMC_NWS GENMASK(6, 0) +#define AT91_MC_SMC_NWS_(x) ((x) << 0) +#define AT91_MC_SMC_WSEN BIT(7) +#define AT91_MC_SMC_TDF GENMASK(11, 8) +#define AT91_MC_SMC_TDF_(x) ((x) << 8) +#define AT91_MC_SMC_TDF_MAX 0xf +#define AT91_MC_SMC_BAT BIT(12) +#define AT91_MC_SMC_DBW GENMASK(14, 13) +#define AT91_MC_SMC_DBW_16 (1 << 13) +#define AT91_MC_SMC_DBW_8 (2 << 13) +#define AT91_MC_SMC_DPR BIT(15) +#define AT91_MC_SMC_ACSS GENMASK(17, 16) +#define AT91_MC_SMC_ACSS_(x) ((x) << 16) +#define AT91_MC_SMC_ACSS_MAX 3 +#define AT91_MC_SMC_RWSETUP GENMASK(26, 24) +#define AT91_MC_SMC_RWSETUP_(x) ((x) << 24) +#define AT91_MC_SMC_RWHOLD GENMASK(30, 28) +#define AT91_MC_SMC_RWHOLD_(x) ((x) << 28) +#define AT91_MC_SMC_RWHOLDSETUP_MAX 7 + +/* SDRAM Controller registers */ +#define AT91_MC_SDRAMC_MR 0x90 +#define AT91_MC_SDRAMC_MODE GENMASK(3, 0) +#define AT91_MC_SDRAMC_MODE_NORMAL (0 << 0) +#define AT91_MC_SDRAMC_MODE_NOP (1 << 0) +#define AT91_MC_SDRAMC_MODE_PRECHARGE (2 << 0) +#define AT91_MC_SDRAMC_MODE_LMR (3 << 0) +#define AT91_MC_SDRAMC_MODE_REFRESH (4 << 0) +#define AT91_MC_SDRAMC_DBW_16 BIT(4) + +#define AT91_MC_SDRAMC_TR 0x94 +#define AT91_MC_SDRAMC_COUNT GENMASK(11, 0) + +#define AT91_MC_SDRAMC_CR 0x98 +#define AT91_MC_SDRAMC_NC GENMASK(1, 0) +#define AT91_MC_SDRAMC_NC_8 (0 << 0) +#define AT91_MC_SDRAMC_NC_9 (1 << 0) +#define AT91_MC_SDRAMC_NC_10 (2 << 0) +#define AT91_MC_SDRAMC_NC_11 (3 << 0) +#define AT91_MC_SDRAMC_NR GENMASK(3, 2) +#define AT91_MC_SDRAMC_NR_11 (0 << 2) +#define AT91_MC_SDRAMC_NR_12 (1 << 2) +#define AT91_MC_SDRAMC_NR_13 (2 << 2) +#define AT91_MC_SDRAMC_NB BIT(4) +#define AT91_MC_SDRAMC_NB_2 (0 << 4) +#define AT91_MC_SDRAMC_NB_4 (1 << 4) +#define AT91_MC_SDRAMC_CAS GENMASK(6, 5) +#define AT91_MC_SDRAMC_CAS_2 (2 << 5) +#define AT91_MC_SDRAMC_TWR GENMASK(10, 7) +#define AT91_MC_SDRAMC_TRC GENMASK(14, 11) +#define AT91_MC_SDRAMC_TRP GENMASK(18, 15) +#define AT91_MC_SDRAMC_TRCD GENMASK(22, 19) +#define AT91_MC_SDRAMC_TRAS GENMASK(26, 23) +#define AT91_MC_SDRAMC_TXSR GENMASK(30, 27) + +#define AT91_MC_SDRAMC_SRR 0x9c +#define AT91_MC_SDRAMC_SRCB BIT(0) + +#define AT91_MC_SDRAMC_LPR 0xa0 +#define AT91_MC_SDRAMC_LPCB BIT(0) + +#define AT91_MC_SDRAMC_IER 0xa4 +#define AT91_MC_SDRAMC_IDR 0xa8 +#define AT91_MC_SDRAMC_IMR 0xac +#define AT91_MC_SDRAMC_ISR 0xb0 +#define AT91_MC_SDRAMC_RES BIT(0) + +/* Burst Flash Controller register */ +#define AT91_MC_BFC_MR 0xc0 +#define AT91_MC_BFC_BFCOM GENMASK(1, 0) +#define AT91_MC_BFC_BFCOM_DISABLED (0 << 0) +#define AT91_MC_BFC_BFCOM_ASYNC (1 << 0) +#define AT91_MC_BFC_BFCOM_BURST (2 << 0) +#define AT91_MC_BFC_BFCC GENMASK(3, 2) +#define AT91_MC_BFC_BFCC_MCK (1 << 2) +#define AT91_MC_BFC_BFCC_DIV2 (2 << 2) +#define AT91_MC_BFC_BFCC_DIV4 (3 << 2) +#define AT91_MC_BFC_AVL GENMASK(7, 4) +#define AT91_MC_BFC_PAGES GENMASK(10, 8) +#define AT91_MC_BFC_PAGES_NO_PAGE (0 << 8) +#define AT91_MC_BFC_PAGES_16 (1 << 8) +#define AT91_MC_BFC_PAGES_32 (2 << 8) +#define AT91_MC_BFC_PAGES_64 (3 << 8) +#define AT91_MC_BFC_PAGES_128 (4 << 8) +#define AT91_MC_BFC_PAGES_256 (5 << 8) +#define AT91_MC_BFC_PAGES_512 (6 << 8) +#define AT91_MC_BFC_PAGES_1024 (7 << 8) +#define AT91_MC_BFC_OEL GENMASK(13, 12) +#define AT91_MC_BFC_BAAEN BIT(16) +#define AT91_MC_BFC_BFOEH BIT(17) +#define AT91_MC_BFC_MUXEN BIT(18) +#define AT91_MC_BFC_RDYEN BIT(19) + +#endif /* _LINUX_MFD_SYSCON_ATMEL_MC_H_ */ -- cgit v1.2.3 From be32417796c2b8a83fe4cbece83bea96ab9e378f Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Wed, 6 May 2015 12:26:22 +0800 Subject: block: export blkdev_reread_part() and __blkdev_reread_part() This patch exports blkdev_reread_part() for block drivers, also introduce __blkdev_reread_part(). For some drivers, such as loop, reread of partitions can be run from the release path, and bd_mutex may already be held prior to calling ioctl_by_bdev(bdev, BLKRRPART, 0), so introduce __blkdev_reread_part for use in such cases. CC: Christoph Hellwig CC: Jens Axboe CC: Tejun Heo CC: Alexander Viro CC: Markus Pargmann CC: Stefan Weinhuber CC: Stefan Haberland CC: Sebastian Ott CC: Fabian Frederick CC: Ming Lei CC: David Herrmann CC: Andrew Morton CC: Peter Zijlstra CC: nbd-general@lists.sourceforge.net CC: linux-s390@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Jarod Wilson Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/fs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 35ec87e490b1..1ef63900243c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2279,6 +2279,9 @@ extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder); extern void blkdev_put(struct block_device *bdev, fmode_t mode); +extern int __blkdev_reread_part(struct block_device *bdev); +extern int blkdev_reread_part(struct block_device *bdev); + #ifdef CONFIG_SYSFS extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); extern void bd_unlink_disk_holder(struct block_device *bdev, -- cgit v1.2.3 From 6e844b035360294636271f4f0fd4a5a685e03c06 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 17 Apr 2015 10:45:55 -0700 Subject: ARM: BCM63xx: Add Broadcom BCM63xx PMB controller helpers This patch adds both common register definitions and helper functions used to issue read/write commands to the Broadcom BCM63xx PMB controller which is used to power on and release from reset internal on-chip peripherals such as the integrated Ethernet switch, AHCI, USB, as well as the secondary CPU core. This is going to be utilized by the BCM63138 SMP code, as well as by the BCM63138 reset controller later. Signed-off-by: Florian Fainelli --- include/linux/reset/bcm63xx_pmb.h | 88 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 include/linux/reset/bcm63xx_pmb.h (limited to 'include/linux') diff --git a/include/linux/reset/bcm63xx_pmb.h b/include/linux/reset/bcm63xx_pmb.h new file mode 100644 index 000000000000..bb4af7b5eb36 --- /dev/null +++ b/include/linux/reset/bcm63xx_pmb.h @@ -0,0 +1,88 @@ +/* + * Broadcom BCM63xx Processor Monitor Bus shared routines (SMP and reset) + * + * Copyright (C) 2015, Broadcom Corporation + * Author: Florian Fainelli + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation version 2. + * + * This program is distributed "as is" WITHOUT ANY WARRANTY of any + * kind, whether express or implied; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#ifndef __BCM63XX_PMB_H +#define __BCM63XX_PMB_H + +#include +#include +#include +#include + +/* PMB Master controller register */ +#define PMB_CTRL 0x00 +#define PMC_PMBM_START (1 << 31) +#define PMC_PMBM_TIMEOUT (1 << 30) +#define PMC_PMBM_SLAVE_ERR (1 << 29) +#define PMC_PMBM_BUSY (1 << 28) +#define PMC_PMBM_READ (0 << 20) +#define PMC_PMBM_WRITE (1 << 20) +#define PMB_WR_DATA 0x04 +#define PMB_TIMEOUT 0x08 +#define PMB_RD_DATA 0x0C + +#define PMB_BUS_ID_SHIFT 8 + +/* Perform the low-level PMB master operation, shared between reads and + * writes. + */ +static inline int __bpcm_do_op(void __iomem *master, unsigned int addr, + u32 off, u32 op) +{ + unsigned int timeout = 1000; + u32 cmd; + + cmd = (PMC_PMBM_START | op | (addr & 0xff) << 12 | off); + writel(cmd, master + PMB_CTRL); + do { + cmd = readl(master + PMB_CTRL); + if (!(cmd & PMC_PMBM_START)) + return 0; + + if (cmd & PMC_PMBM_SLAVE_ERR) + return -EIO; + + if (cmd & PMC_PMBM_TIMEOUT) + return -ETIMEDOUT; + + udelay(1); + } while (timeout-- > 0); + + return -ETIMEDOUT; +} + +static inline int bpcm_rd(void __iomem *master, unsigned int addr, + u32 off, u32 *val) +{ + int ret = 0; + + ret = __bpcm_do_op(master, addr, off >> 2, PMC_PMBM_READ); + *val = readl(master + PMB_RD_DATA); + + return ret; +} + +static inline int bpcm_wr(void __iomem *master, unsigned int addr, + u32 off, u32 val) +{ + int ret = 0; + + writel(val, master + PMB_WR_DATA); + ret = __bpcm_do_op(master, addr, off >> 2, PMC_PMBM_WRITE); + + return ret; +} + +#endif /* __BCM63XX_PMB_H */ -- cgit v1.2.3 From 37b1ef31a568fc02e53587620226e5f3c66454c8 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 20 May 2015 14:41:19 +0800 Subject: workqueue: move flush_scheduled_work() to workqueue.h flush_scheduled_work() is just a simple call to flush_work(). Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 4618dd672d1b..738b30b39b68 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -435,7 +435,6 @@ extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); -extern void flush_scheduled_work(void); extern int schedule_on_each_cpu(work_func_t func); @@ -531,6 +530,35 @@ static inline bool schedule_work(struct work_struct *work) return queue_work(system_wq, work); } +/** + * flush_scheduled_work - ensure that any scheduled work has run to completion. + * + * Forces execution of the kernel-global workqueue and blocks until its + * completion. + * + * Think twice before calling this function! It's very easy to get into + * trouble if you don't take great care. Either of the following situations + * will lead to deadlock: + * + * One of the work items currently on the workqueue needs to acquire + * a lock held by your code or its caller. + * + * Your code is running in the context of a work routine. + * + * They will be detected by lockdep when they occur, but the first might not + * occur very often. It depends on what work items are on the workqueue and + * what locks they need, which you have no control over. + * + * In most situations flushing the entire workqueue is overkill; you merely + * need to know that a particular work item isn't queued and isn't running. + * In such cases you should use cancel_delayed_work_sync() or + * cancel_work_sync() instead. + */ +static inline void flush_scheduled_work(void) +{ + flush_workqueue(system_wq); +} + /** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use -- cgit v1.2.3 From 2a9de9c0f08d61fbe3764a21d22d0b72df97d6ae Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Fri, 24 Apr 2015 19:16:05 +0900 Subject: extcon: Use the unique id for external connector instead of string This patch uses the unique id to identify the type of external connector instead of string name. The string name have the many potential issues. So, this patch defines the 'extcon' enumeration which includes all supported external connector on EXTCON subsystem. If new external connector is necessary, the unique id of new connector have to be added in 'extcon' enumeration. There are current supported external connector in 'enum extcon' as following: enum extcon { EXTCON_NONE = 0x0, /* USB external connector */ EXTCON_USB = 0x1, EXTCON_USB_HOST = 0x2, /* Charger external connector */ EXTCON_TA = 0x10, EXTCON_FAST_CHARGER = 0x11, EXTCON_SLOW_CHARGER = 0x12, EXTCON_CHARGE_DOWNSTREAM = 0x13, /* Audio and video external connector */ EXTCON_LINE_IN = 0x20, EXTCON_LINE_OUT = 0x21, EXTCON_MICROPHONE = 0x22, EXTCON_HEADPHONE = 0x23, EXTCON_HDMI = 0x30, EXTCON_MHL = 0x31, EXTCON_DVI = 0x32, EXTCON_VGA = 0x33, EXTCON_SPDIF_IN = 0x34, EXTCON_SPDIF_OUT = 0x35, EXTCON_VIDEO_IN = 0x36, EXTCON_VIDEO_OUT = 0x37, /* Miscellaneous external connector */ EXTCON_DOCK = 0x50, EXTCON_JIG = 0x51, EXTCON_MECHANICAL = 0x52, EXTCON_END, }; For example in extcon-arizona.c: To use unique id removes the potential issue about handling the inconsistent name of external connector with string. - Previously, use the string to register the type of arizona jack connector static const char *arizona_cable[] = { "Mechanical", "Microphone", "Headphone", "Line-out", }; - Newly, use the unique id to register the type of arizona jack connector static const enum extcon arizona_cable[] = { EXTCON_MECHANICAL, EXTCON_MICROPHONE, EXTCON_HEADPHONE, EXTCON_LINE_OUT, EXTCON_NONE, }; And this patch modify the prototype of extcon_{get|set}_cable_state_() which uses the 'enum extcon id' instead of 'cable_index'. Because although one more extcon drivers support USB cable, each extcon driver might has the differnt 'cable_index' for USB cable. All extcon drivers can use the unique id number for same external connector with modified extcon_{get|set}_cable_state_(). - Previously, use 'cable_index' on these functions: extcon_get_cable_state_(struct extcon_dev*, int cable_index) extcon_set_cable_state_(struct extcon_dev*, int cable_index, bool state) -Newly, use 'enum extcon id' on these functions: extcon_get_cable_state_(struct extcon_dev*, enum extcon id) extcon_set_cable_state_(struct extcon_dev*, enum extcon id, bool state) Cc: Arnd Bergmann Cc: Felipe Balbi Signed-off-by: Chanwoo Choi Acked-by: Roger Quadros Acked-by: Charles Keepax Acked-by: Ramakrishna Pallala Reviewed-by: Krzysztof Kozlowski [arnd: Report the build break about drivers/usb/phy/phy-tahvo.c after using the unique id for external connector insteadf of string] Reported-by: Arnd Bergmann [dan.carpenter: Report the build warning of extcon_{set|get}_cable_state_()] Reported-by: Dan Carpenter --- include/linux/extcon.h | 111 ++++++++++++++------------------- include/linux/extcon/extcon-adc-jack.h | 5 +- 2 files changed, 50 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/include/linux/extcon.h b/include/linux/extcon.h index 799474d9dc48..85c882f0029e 100644 --- a/include/linux/extcon.h +++ b/include/linux/extcon.h @@ -1,6 +1,9 @@ /* * External connector (extcon) class driver * + * Copyright (C) 2015 Samsung Electronics + * Author: Chanwoo Choi + * * Copyright (C) 2012 Samsung Electronics * Author: Donggeun Kim * Author: MyungJoo Ham @@ -27,50 +30,41 @@ #include #include -#define SUPPORTED_CABLE_MAX 32 -#define CABLE_NAME_MAX 30 - -/* - * The standard cable name is to help support general notifier - * and notifiee device drivers to share the common names. - * Please use standard cable names unless your notifier device has - * a very unique and abnormal cable or - * the cable type is supposed to be used with only one unique - * pair of notifier/notifiee devices. - * - * Please add any other "standard" cables used with extcon dev. - * - * You may add a dot and number to specify version or specification - * of the specific cable if it is required. (e.g., "Fast-charger.18" - * and "Fast-charger.10" for 1.8A and 1.0A chargers) - * However, the notifiee and notifier should be able to handle such - * string and if the notifiee can negotiate the protocol or identify, - * you don't need such convention. This convention is helpful when - * notifier can distinguish but notifiee cannot. - */ -enum extcon_cable_name { - EXTCON_USB = 0, - EXTCON_USB_HOST, - EXTCON_TA, /* Travel Adaptor */ - EXTCON_FAST_CHARGER, - EXTCON_SLOW_CHARGER, - EXTCON_CHARGE_DOWNSTREAM, /* Charging an external device */ - EXTCON_HDMI, - EXTCON_MHL, - EXTCON_DVI, - EXTCON_VGA, - EXTCON_DOCK, - EXTCON_LINE_IN, - EXTCON_LINE_OUT, - EXTCON_MIC_IN, - EXTCON_HEADPHONE_OUT, - EXTCON_SPDIF_IN, - EXTCON_SPDIF_OUT, - EXTCON_VIDEO_IN, - EXTCON_VIDEO_OUT, - EXTCON_MECHANICAL, +enum extcon { + EXTCON_NONE = 0x0, + + /* USB external connector */ + EXTCON_USB = 0x1, + EXTCON_USB_HOST = 0x2, + + /* Charger external connector */ + EXTCON_TA = 0x10, + EXTCON_FAST_CHARGER = 0x11, + EXTCON_SLOW_CHARGER = 0x12, + EXTCON_CHARGE_DOWNSTREAM = 0x13, + + /* Audio/Video external connector */ + EXTCON_LINE_IN = 0x20, + EXTCON_LINE_OUT = 0x21, + EXTCON_MICROPHONE = 0x22, + EXTCON_HEADPHONE = 0x23, + + EXTCON_HDMI = 0x30, + EXTCON_MHL = 0x31, + EXTCON_DVI = 0x32, + EXTCON_VGA = 0x33, + EXTCON_SPDIF_IN = 0x34, + EXTCON_SPDIF_OUT = 0x35, + EXTCON_VIDEO_IN = 0x36, + EXTCON_VIDEO_OUT = 0x37, + + /* Etc external connector */ + EXTCON_DOCK = 0x50, + EXTCON_JIG = 0x51, + EXTCON_MECHANICAL = 0x52, + + EXTCON_END, }; -extern const char extcon_cable_name[][CABLE_NAME_MAX + 1]; struct extcon_cable; @@ -78,7 +72,7 @@ struct extcon_cable; * struct extcon_dev - An extcon device represents one external connector. * @name: The name of this extcon device. Parent device name is * used if NULL. - * @supported_cable: Array of supported cable names ending with NULL. + * @supported_cable: Array of supported cable names ending with EXTCON_NONE. * If supported_cable is NULL, cable name related APIs * are disabled. * @mutually_exclusive: Array of mutually exclusive set of cables that cannot @@ -113,7 +107,7 @@ struct extcon_cable; struct extcon_dev { /* Optional user initializing data */ const char *name; - const char **supported_cable; + const enum extcon *supported_cable; const u32 *mutually_exclusive; /* Optional callbacks to override class functions */ @@ -194,10 +188,10 @@ extern struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name); /* * Following APIs control the memory of extcon device. */ -extern struct extcon_dev *extcon_dev_allocate(const char **cables); +extern struct extcon_dev *extcon_dev_allocate(const enum extcon *cable); extern void extcon_dev_free(struct extcon_dev *edev); extern struct extcon_dev *devm_extcon_dev_allocate(struct device *dev, - const char **cables); + const enum extcon *cable); extern void devm_extcon_dev_free(struct device *dev, struct extcon_dev *edev); /* @@ -216,13 +210,10 @@ extern int extcon_update_state(struct extcon_dev *edev, u32 mask, u32 state); /* * get/set_cable_state access each bit of the 32b encoded state value. - * They are used to access the status of each cable based on the cable_name - * or cable_index, which is retrieved by extcon_find_cable_index + * They are used to access the status of each cable based on the cable_name. */ -extern int extcon_find_cable_index(struct extcon_dev *sdev, - const char *cable_name); -extern int extcon_get_cable_state_(struct extcon_dev *edev, int cable_index); -extern int extcon_set_cable_state_(struct extcon_dev *edev, int cable_index, +extern int extcon_get_cable_state_(struct extcon_dev *edev, enum extcon id); +extern int extcon_set_cable_state_(struct extcon_dev *edev, enum extcon id, bool cable_state); extern int extcon_get_cable_state(struct extcon_dev *edev, @@ -281,7 +272,7 @@ static inline int devm_extcon_dev_register(struct device *dev, static inline void devm_extcon_dev_unregister(struct device *dev, struct extcon_dev *edev) { } -static inline struct extcon_dev *extcon_dev_allocate(const char **cables) +static inline struct extcon_dev *extcon_dev_allocate(const enum extcon *cable) { return ERR_PTR(-ENOSYS); } @@ -289,7 +280,7 @@ static inline struct extcon_dev *extcon_dev_allocate(const char **cables) static inline void extcon_dev_free(struct extcon_dev *edev) { } static inline struct extcon_dev *devm_extcon_dev_allocate(struct device *dev, - const char **cables) + const enum extcon *cable) { return ERR_PTR(-ENOSYS); } @@ -312,20 +303,14 @@ static inline int extcon_update_state(struct extcon_dev *edev, u32 mask, return 0; } -static inline int extcon_find_cable_index(struct extcon_dev *edev, - const char *cable_name) -{ - return 0; -} - static inline int extcon_get_cable_state_(struct extcon_dev *edev, - int cable_index) + enum extcon id) { return 0; } static inline int extcon_set_cable_state_(struct extcon_dev *edev, - int cable_index, bool cable_state) + enum extcon id, bool cable_state) { return 0; } diff --git a/include/linux/extcon/extcon-adc-jack.h b/include/linux/extcon/extcon-adc-jack.h index 9ca958c4e94c..53c60806bcfb 100644 --- a/include/linux/extcon/extcon-adc-jack.h +++ b/include/linux/extcon/extcon-adc-jack.h @@ -44,7 +44,7 @@ struct adc_jack_cond { * @consumer_channel: Unique name to identify the channel on the consumer * side. This typically describes the channels used within * the consumer. E.g. 'battery_voltage' - * @cable_names: array of cable names ending with null. + * @cable_names: array of extcon id for supported cables. * @adc_contitions: array of struct adc_jack_cond conditions ending * with .state = 0 entry. This describes how to decode * adc values into extcon state. @@ -58,8 +58,7 @@ struct adc_jack_pdata { const char *name; const char *consumer_channel; - /* The last entry should be NULL */ - const char **cable_names; + const enum extcon *cable_names; /* The last entry's state should be 0 */ struct adc_jack_cond *adc_conditions; -- cgit v1.2.3 From 046050f6e623e442e9c71c525462ebd395dae526 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Tue, 19 May 2015 20:01:12 +0900 Subject: extcon: Update the prototype of extcon_register_notifier() with enum extcon Previously, extcon consumer driver used the extcon_register_interest() to register the notifier chain and then to receive the notifier event when external connector's state is changed. When registering the notifier chain for specific external connector with extcon_register_interest(), it used the the string name of external connector directly. There are potential problem because of unclear, non-standard and inconsequent cable name. Namely, it is not appropriate method to identify each external connector. So, this patch modify the prototype of extcon_register_notifier() by using the 'enum extcon' which are the unique id for each external connector instead of unclear string method. - Previously, the extcon consumer driver used the extcon_register_interest() with 'cable_name' to point out the specific external connector. Also. it used the un-needed structure (struct extcon_specific_cable_nb). : int extcon_register_interest(struct extcon_specific_cable_nb *obj, const char *extcon_name, const char *cable_name, struct notifier_block *nb) - Newly, the updated extcon_register_notifier() would definitely support the same feature to detech the changed state of external connector without any specific structure (struct extcon_specific_cable_nb). : int extcon_register_notifier(struct extcon_dev *edev, enum extcon id, struct notifier_block *nb) This patch support the both extcon_register_interest() and new extcon_register_ notifier(). But the extcon_{register|unregister}_interest() will be deprecated because extcon core would support the notifier event for extcon consumer driver with only updated extcon_register_notifier() and 'extcon_specific_cable_nb' will be removed if there are no extcon consumer driver with legacy extcon_{register|unregister}_interest(). Signed-off-by: Chanwoo Choi Reviewed-by: Krzysztof Kozlowski --- include/linux/extcon.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/extcon.h b/include/linux/extcon.h index 85c882f0029e..be9652b3a154 100644 --- a/include/linux/extcon.h +++ b/include/linux/extcon.h @@ -116,7 +116,7 @@ struct extcon_dev { /* Internal data. Please do not set. */ struct device dev; - struct raw_notifier_head nh; + struct raw_notifier_head *nh; struct list_head entry; int max_supported; spinlock_t lock; /* could be called by irq handler */ @@ -155,8 +155,6 @@ struct extcon_cable { /** * struct extcon_specific_cable_nb - An internal data for * extcon_register_interest(). - * @internal_nb: A notifier block bridging extcon notifier - * and cable notifier. * @user_nb: user provided notifier block for events from * a specific cable. * @cable_index: the target cable. @@ -164,7 +162,6 @@ struct extcon_cable { * @previous_value: the saved previous event value. */ struct extcon_specific_cable_nb { - struct notifier_block internal_nb; struct notifier_block *user_nb; int cable_index; struct extcon_dev *edev; @@ -240,10 +237,10 @@ extern int extcon_unregister_interest(struct extcon_specific_cable_nb *nb); * we do not recommend to use this for normal 'notifiee' device drivers who * want to be notified by a specific external port of the notifier. */ -extern int extcon_register_notifier(struct extcon_dev *edev, +extern int extcon_register_notifier(struct extcon_dev *edev, enum extcon id, + struct notifier_block *nb); +extern int extcon_unregister_notifier(struct extcon_dev *edev, enum extcon id, struct notifier_block *nb); -extern int extcon_unregister_notifier(struct extcon_dev *edev, - struct notifier_block *nb); /* * Following API get the extcon device from devicetree. @@ -333,13 +330,15 @@ static inline struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name) } static inline int extcon_register_notifier(struct extcon_dev *edev, - struct notifier_block *nb) + enum extcon id, + struct notifier_block *nb) { return 0; } static inline int extcon_unregister_notifier(struct extcon_dev *edev, - struct notifier_block *nb) + enum extcon id, + struct notifier_block *nb) { return 0; } -- cgit v1.2.3 From f705f837c58ebe1ea69dfffff4dcc234e2fbc8dd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 22 May 2015 11:12:38 +0200 Subject: nvme: consolidate synchronous command submission helpers Note that we keep the unused timeout argument, but allow callers to pass 0 instead of a timeout if they want the default. This will allow adding a timeout to the pass through path later on. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 8dbd05e70f09..61488b2ae291 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -158,11 +158,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, unsigned long addr, unsigned length); void nvme_unmap_user_pages(struct nvme_dev *dev, int write, struct nvme_iod *iod); -int nvme_submit_io_cmd(struct nvme_dev *, struct nvme_ns *, - struct nvme_command *, u32 *); -int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns); -int nvme_submit_admin_cmd(struct nvme_dev *, struct nvme_command *, - u32 *result); +int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd); int nvme_identify(struct nvme_dev *, unsigned nsid, unsigned cns, dma_addr_t dma_addr); int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, -- cgit v1.2.3 From e75ec752d725b7b612c0b2db1bca50a9e53c0879 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 22 May 2015 11:12:39 +0200 Subject: nvme: store a struct device pointer in struct nvme_dev Most users want the generic device, so store that in struct nvme_dev instead of the pci_dev. This also happens to be a nice step towards making some code reusable for non-PCI transports. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 61488b2ae291..de0e49a716b8 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -74,7 +74,7 @@ struct nvme_dev { struct blk_mq_tag_set tagset; struct blk_mq_tag_set admin_tagset; u32 __iomem *dbs; - struct pci_dev *pci_dev; + struct device *dev; struct dma_pool *prp_page_pool; struct dma_pool *prp_small_pool; int instance; -- cgit v1.2.3 From d29ec8241c10eacf59c23b3828a88dbae06e7e3f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 22 May 2015 11:12:46 +0200 Subject: nvme: submit internal commands through the block layer Use block layer queues with an internal cmd_type to submit internally generated NVMe commands. This both simplifies the code a lot and allow for a better structure. For example now the LighNVM code can construct commands without knowing the details of the underlying I/O descriptors. Or a future NVMe over network target could inject commands, as well as could the SCSI translation and ioctl code be reused for such a beast. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index de0e49a716b8..986bf8ad8e93 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -146,21 +146,15 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) return (sector >> (ns->lba_shift - 9)); } -/** - * nvme_free_iod - frees an nvme_iod - * @dev: The device that the I/O was submitted to - * @iod: The memory to free - */ -void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod); - -int nvme_setup_prps(struct nvme_dev *, struct nvme_iod *, int, gfp_t); -struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, - unsigned long addr, unsigned length); -void nvme_unmap_user_pages(struct nvme_dev *dev, int write, - struct nvme_iod *iod); -int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd); -int nvme_identify(struct nvme_dev *, unsigned nsid, unsigned cns, - dma_addr_t dma_addr); +int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buf, unsigned bufflen); +int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buffer, void __user *ubuffer, unsigned bufflen, + u32 *result, unsigned timeout); +int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id); +int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid, + struct nvme_id_ns **id); +int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log); int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, dma_addr_t dma_addr, u32 *result); int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, -- cgit v1.2.3 From 326e1dbb57368087a36607aaebe9795b8d5453e5 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 22 May 2015 09:14:03 -0400 Subject: block: remove management of bi_remaining when restoring original bi_end_io Commit c4cf5261 ("bio: skip atomic inc/dec of ->bi_remaining for non-chains") regressed all existing callers that followed this pattern: 1) saving a bio's original bi_end_io 2) wiring up an intermediate bi_end_io 3) restoring the original bi_end_io from intermediate bi_end_io 4) calling bio_endio() to execute the restored original bi_end_io The regression was due to BIO_CHAIN only ever getting set if bio_inc_remaining() is called. For the above pattern it isn't set until step 3 above (step 2 would've needed to establish BIO_CHAIN). As such the first bio_endio(), in step 2 above, never decremented __bi_remaining before calling the intermediate bi_end_io -- leaving __bi_remaining with the value 1 instead of 0. When bio_inc_remaining() occurred during step 3 it brought it to a value of 2. When the second bio_endio() was called, in step 4 above, it should've called the original bi_end_io but it didn't because there was an extra reference that wasn't dropped (due to atomic operations being optimized away since BIO_CHAIN wasn't set upfront). Fix this issue by removing the __bi_remaining management complexity for all callers that use the above pattern -- bio_chain() is the only interface that _needs_ to be concerned with __bi_remaining. For the above pattern callers just expect the bi_end_io they set to get called! Remove bio_endio_nodec() and also remove all bio_inc_remaining() calls that aren't associated with the bio_chain() interface. Also, the bio_inc_remaining() interface has been moved local to bio.c. Fixes: c4cf5261 ("bio: skip atomic inc/dec of ->bi_remaining for non-chains") Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- include/linux/bio.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 7486ea103f6e..f0291cf64cc5 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -427,7 +427,6 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask) } extern void bio_endio(struct bio *, int); -extern void bio_endio_nodec(struct bio *, int); struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); @@ -658,17 +657,6 @@ static inline struct bio *bio_list_get(struct bio_list *bl) return bio; } -/* - * Increment chain count for the bio. Make sure the CHAIN flag update - * is visible before the raised count. - */ -static inline void bio_inc_remaining(struct bio *bio) -{ - bio->bi_flags |= (1 << BIO_CHAIN); - smp_mb__before_atomic(); - atomic_inc(&bio->__bi_remaining); -} - /* * bio_set is used to allow other portions of the IO system to * allocate their own private memory pools for bio and iovec structures. -- cgit v1.2.3 From 5f1b670d0bef508a5554d92525f5f6d00d640b38 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 22 May 2015 09:14:04 -0400 Subject: block, dm: don't copy bios for request clones Currently dm-multipath has to clone the bios for every request sent to the lower devices, which wastes cpu cycles and ties down memory. This patch instead adds a new REQ_CLONE flag that instructs req_bio_endio to not complete bios attached to a request, which we set on clone requests similar to bios in a flush sequence. With this change I/O errors on a path failure only get propagated to dm-multipath, which can then either resubmit the I/O or complete the bios on the original request. I've done some basic testing of this on a Linux target with ALUA support, and it survives path failures during I/O nicely. Signed-off-by: Christoph Hellwig Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 ++ include/linux/blkdev.h | 6 +----- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3f4ded0b1a34..45a6be89957c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -192,6 +192,7 @@ enum rq_flag_bits { __REQ_HASHED, /* on IO scheduler merge hash */ __REQ_MQ_INFLIGHT, /* track inflight for MQ */ __REQ_NO_TIMEOUT, /* requests may never expire */ + __REQ_CLONE, /* cloned bios */ __REQ_NR_BITS, /* stops here */ }; @@ -246,5 +247,6 @@ enum rq_flag_bits { #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) #define REQ_NO_TIMEOUT (1ULL << __REQ_NO_TIMEOUT) +#define REQ_CLONE (1ULL << __REQ_CLONE) #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bc917956a6d0..9ded80da2c16 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -775,11 +775,7 @@ extern void blk_add_request_payload(struct request *rq, struct page *page, unsigned int len); extern int blk_rq_check_limits(struct request_queue *q, struct request *rq); extern int blk_lld_busy(struct request_queue *q); -extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, - struct bio_set *bs, gfp_t gfp_mask, - int (*bio_ctr)(struct bio *, struct bio *, void *), - void *data); -extern void blk_rq_unprep_clone(struct request *rq); +extern void blk_rq_prep_clone(struct request *rq, struct request *rq_src); extern int blk_insert_cloned_request(struct request_queue *q, struct request *rq); extern void blk_delay_queue(struct request_queue *, unsigned long); -- cgit v1.2.3 From 25d238b2260973bfca0b82181824340c7aeae45a Mon Sep 17 00:00:00 2001 From: Rajeev Kumar Date: Fri, 22 May 2015 09:58:30 -0700 Subject: Input: update email-id of Rajeev Kumar rajeev-dlh.kumar@st.com email-id doesn't exist anymore as I have left the company. Signed-off-by: Rajeev Kumar Signed-off-by: Dmitry Torokhov --- include/linux/platform_data/keyboard-spear.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/keyboard-spear.h b/include/linux/platform_data/keyboard-spear.h index 9248e3a7e333..5e3ff653900c 100644 --- a/include/linux/platform_data/keyboard-spear.h +++ b/include/linux/platform_data/keyboard-spear.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2010 ST Microelectronics - * Rajeev Kumar + * Rajeev Kumar * * This file is licensed under the terms of the GNU General Public * License version 2. This program is licensed "as is" without any -- cgit v1.2.3 From c93b76b34b4d8dbe8e3443eb27e49ac60034342b Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Thu, 7 May 2015 15:54:02 +0300 Subject: mei: bus: report also uuid in module alias In order to automate modules matching add device uuid which is reported in client enumeration, keep also the name that is needed in for nfc distinguishing radio vendor Report mei:name:uuid Cc: linux-api@vger.kernel.org Cc: Samuel Ortiz Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- include/linux/mod_devicetable.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 3bfd56778c29..2d2b2b571d61 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -599,9 +599,22 @@ struct ipack_device_id { #define MEI_CL_MODULE_PREFIX "mei:" #define MEI_CL_NAME_SIZE 32 +#define MEI_CL_UUID_FMT "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x" +#define MEI_CL_UUID_ARGS(_u) \ + _u[0], _u[1], _u[2], _u[3], _u[4], _u[5], _u[6], _u[7], \ + _u[8], _u[9], _u[10], _u[11], _u[12], _u[13], _u[14], _u[15] +/** + * struct mei_cl_device_id - MEI client device identifier + * @name: helper name + * @uuid: client uuid + * @driver_info: information used by the driver. + * + * identifies mei client device by uuid and name + */ struct mei_cl_device_id { char name[MEI_CL_NAME_SIZE]; + __u8 uuid[16]; kernel_ulong_t driver_info; }; -- cgit v1.2.3 From dbac993f6a6df24d5edc362667e524ba43543472 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Thu, 7 May 2015 15:54:07 +0300 Subject: mei: export mei client device struct to external use Cc: Samuel Ortiz Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- include/linux/mei_cl_bus.h | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h index 0819d36a3a74..a16b1f9c1aca 100644 --- a/include/linux/mei_cl_bus.h +++ b/include/linux/mei_cl_bus.h @@ -7,6 +7,42 @@ struct mei_cl_device; +typedef void (*mei_cl_event_cb_t)(struct mei_cl_device *device, + u32 events, void *context); + +/** + * struct mei_cl_device - MEI device handle + * An mei_cl_device pointer is returned from mei_add_device() + * and links MEI bus clients to their actual ME host client pointer. + * Drivers for MEI devices will get an mei_cl_device pointer + * when being probed and shall use it for doing ME bus I/O. + * + * @dev: linux driver model device pointer + * @me_cl: me client + * @cl: mei client + * @name: device name + * @event_work: async work to execute event callback + * @event_cb: Drivers register this callback to get asynchronous ME + * events (e.g. Rx buffer pending) notifications. + * @event_context: event callback run context + * @events: Events bitmask sent to the driver. + * @priv_data: client private data + */ +struct mei_cl_device { + struct device dev; + + struct mei_me_client *me_cl; + struct mei_cl *cl; + char name[MEI_CL_NAME_SIZE]; + + struct work_struct event_work; + mei_cl_event_cb_t event_cb; + void *event_context; + unsigned long events; + + void *priv_data; +}; + struct mei_cl_driver { struct device_driver driver; const char *name; @@ -28,8 +64,6 @@ void mei_cl_driver_unregister(struct mei_cl_driver *driver); ssize_t mei_cl_send(struct mei_cl_device *device, u8 *buf, size_t length); ssize_t mei_cl_recv(struct mei_cl_device *device, u8 *buf, size_t length); -typedef void (*mei_cl_event_cb_t)(struct mei_cl_device *device, - u32 events, void *context); int mei_cl_register_event_cb(struct mei_cl_device *device, mei_cl_event_cb_t read_cb, void *context); -- cgit v1.2.3 From 7df20f2d893db42eaa1ea1e30a2573c971ec9238 Mon Sep 17 00:00:00 2001 From: Sudeep Dutt Date: Wed, 29 Apr 2015 05:32:28 -0700 Subject: misc: mic: SCIF header file and IOCTL interface This patch introduces the SCIF documentation in the header file and describes the IOCTL interface for user mode. mic_overview.txt is updated with documentation on SCIF and a new document describing SCIF in more details is available in scif_overview.txt. Reviewed-by: Nikhil Rao Reviewed-by: Ashutosh Dixit Signed-off-by: Sudeep Dutt Signed-off-by: Greg Kroah-Hartman --- include/linux/scif.h | 993 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 993 insertions(+) create mode 100644 include/linux/scif.h (limited to 'include/linux') diff --git a/include/linux/scif.h b/include/linux/scif.h new file mode 100644 index 000000000000..44f4f3898bbe --- /dev/null +++ b/include/linux/scif.h @@ -0,0 +1,993 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Intel SCIF driver. + * + */ +#ifndef __SCIF_H__ +#define __SCIF_H__ + +#include +#include +#include + +#define SCIF_ACCEPT_SYNC 1 +#define SCIF_SEND_BLOCK 1 +#define SCIF_RECV_BLOCK 1 + +enum { + SCIF_PROT_READ = (1 << 0), + SCIF_PROT_WRITE = (1 << 1) +}; + +enum { + SCIF_MAP_FIXED = 0x10, + SCIF_MAP_KERNEL = 0x20, +}; + +enum { + SCIF_FENCE_INIT_SELF = (1 << 0), + SCIF_FENCE_INIT_PEER = (1 << 1), + SCIF_SIGNAL_LOCAL = (1 << 4), + SCIF_SIGNAL_REMOTE = (1 << 5) +}; + +enum { + SCIF_RMA_USECPU = (1 << 0), + SCIF_RMA_USECACHE = (1 << 1), + SCIF_RMA_SYNC = (1 << 2), + SCIF_RMA_ORDERED = (1 << 3) +}; + +/* End of SCIF Admin Reserved Ports */ +#define SCIF_ADMIN_PORT_END 1024 + +/* End of SCIF Reserved Ports */ +#define SCIF_PORT_RSVD 1088 + +typedef struct scif_endpt *scif_epd_t; + +#define SCIF_OPEN_FAILED ((scif_epd_t)-1) +#define SCIF_REGISTER_FAILED ((off_t)-1) +#define SCIF_MMAP_FAILED ((void *)-1) + +/** + * scif_open() - Create an endpoint + * + * Return: + * Upon successful completion, scif_open() returns an endpoint descriptor to + * be used in subsequent SCIF functions calls to refer to that endpoint; + * otherwise in user mode SCIF_OPEN_FAILED (that is ((scif_epd_t)-1)) is + * returned and errno is set to indicate the error; in kernel mode a NULL + * scif_epd_t is returned. + * + * Errors: + * ENOMEM - Insufficient kernel memory was available + */ +scif_epd_t scif_open(void); + +/** + * scif_bind() - Bind an endpoint to a port + * @epd: endpoint descriptor + * @pn: port number + * + * scif_bind() binds endpoint epd to port pn, where pn is a port number on the + * local node. If pn is zero, a port number greater than or equal to + * SCIF_PORT_RSVD is assigned and returned. Each endpoint may be bound to + * exactly one local port. Ports less than 1024 when requested can only be bound + * by system (or root) processes or by processes executed by privileged users. + * + * Return: + * Upon successful completion, scif_bind() returns the port number to which epd + * is bound; otherwise in user mode -1 is returned and errno is set to + * indicate the error; in kernel mode the negative of one of the following + * errors is returned. + * + * Errors: + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * EINVAL - the endpoint or the port is already bound + * EISCONN - The endpoint is already connected + * ENOSPC - No port number available for assignment + * EACCES - The port requested is protected and the user is not the superuser + */ +int scif_bind(scif_epd_t epd, u16 pn); + +/** + * scif_listen() - Listen for connections on an endpoint + * @epd: endpoint descriptor + * @backlog: maximum pending connection requests + * + * scif_listen() marks the endpoint epd as a listening endpoint - that is, as + * an endpoint that will be used to accept incoming connection requests. Once + * so marked, the endpoint is said to be in the listening state and may not be + * used as the endpoint of a connection. + * + * The endpoint, epd, must have been bound to a port. + * + * The backlog argument defines the maximum length to which the queue of + * pending connections for epd may grow. If a connection request arrives when + * the queue is full, the client may receive an error with an indication that + * the connection was refused. + * + * Return: + * Upon successful completion, scif_listen() returns 0; otherwise in user mode + * -1 is returned and errno is set to indicate the error; in kernel mode the + * negative of one of the following errors is returned. + * + * Errors: + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * EINVAL - the endpoint is not bound to a port + * EISCONN - The endpoint is already connected or listening + */ +int scif_listen(scif_epd_t epd, int backlog); + +/** + * scif_connect() - Initiate a connection on a port + * @epd: endpoint descriptor + * @dst: global id of port to which to connect + * + * The scif_connect() function requests the connection of endpoint epd to remote + * port dst. If the connection is successful, a peer endpoint, bound to dst, is + * created on node dst.node. On successful return, the connection is complete. + * + * If the endpoint epd has not already been bound to a port, scif_connect() + * will bind it to an unused local port. + * + * A connection is terminated when an endpoint of the connection is closed, + * either explicitly by scif_close(), or when a process that owns one of the + * endpoints of the connection is terminated. + * + * In user space, scif_connect() supports an asynchronous connection mode + * if the application has set the O_NONBLOCK flag on the endpoint via the + * fcntl() system call. Setting this flag will result in the calling process + * not to wait during scif_connect(). + * + * Return: + * Upon successful completion, scif_connect() returns the port ID to which the + * endpoint, epd, is bound; otherwise in user mode -1 is returned and errno is + * set to indicate the error; in kernel mode the negative of one of the + * following errors is returned. + * + * Errors: + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNREFUSED - The destination was not listening for connections or refused + * the connection request + * EINVAL - dst.port is not a valid port ID + * EISCONN - The endpoint is already connected + * ENOMEM - No buffer space is available + * ENODEV - The destination node does not exist, or the node is lost or existed, + * but is not currently in the network since it may have crashed + * ENOSPC - No port number available for assignment + * EOPNOTSUPP - The endpoint is listening and cannot be connected + */ +int scif_connect(scif_epd_t epd, struct scif_port_id *dst); + +/** + * scif_accept() - Accept a connection on an endpoint + * @epd: endpoint descriptor + * @peer: global id of port to which connected + * @newepd: new connected endpoint descriptor + * @flags: flags + * + * The scif_accept() call extracts the first connection request from the queue + * of pending connections for the port on which epd is listening. scif_accept() + * creates a new endpoint, bound to the same port as epd, and allocates a new + * SCIF endpoint descriptor, returned in newepd, for the endpoint. The new + * endpoint is connected to the endpoint through which the connection was + * requested. epd is unaffected by this call, and remains in the listening + * state. + * + * On successful return, peer holds the global port identifier (node id and + * local port number) of the port which requested the connection. + * + * A connection is terminated when an endpoint of the connection is closed, + * either explicitly by scif_close(), or when a process that owns one of the + * endpoints of the connection is terminated. + * + * The number of connections that can (subsequently) be accepted on epd is only + * limited by system resources (memory). + * + * The flags argument is formed by OR'ing together zero or more of the + * following values. + * SCIF_ACCEPT_SYNC - block until a connection request is presented. If + * SCIF_ACCEPT_SYNC is not in flags, and no pending + * connections are present on the queue, scif_accept() + * fails with an EAGAIN error + * + * In user mode, the select() and poll() functions can be used to determine + * when there is a connection request. In kernel mode, the scif_poll() + * function may be used for this purpose. A readable event will be delivered + * when a connection is requested. + * + * Return: + * Upon successful completion, scif_accept() returns 0; otherwise in user mode + * -1 is returned and errno is set to indicate the error; in kernel mode the + * negative of one of the following errors is returned. + * + * Errors: + * EAGAIN - SCIF_ACCEPT_SYNC is not set and no connections are present to be + * accepted or SCIF_ACCEPT_SYNC is not set and remote node failed to complete + * its connection request + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * EINTR - Interrupted function + * EINVAL - epd is not a listening endpoint, or flags is invalid, or peer is + * NULL, or newepd is NULL + * ENODEV - The requesting node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOMEM - Not enough space + * ENOENT - Secondary part of epd registration failed + */ +int scif_accept(scif_epd_t epd, struct scif_port_id *peer, scif_epd_t + *newepd, int flags); + +/** + * scif_close() - Close an endpoint + * @epd: endpoint descriptor + * + * scif_close() closes an endpoint and performs necessary teardown of + * facilities associated with that endpoint. + * + * If epd is a listening endpoint then it will no longer accept connection + * requests on the port to which it is bound. Any pending connection requests + * are rejected. + * + * If epd is a connected endpoint, then its peer endpoint is also closed. RMAs + * which are in-process through epd or its peer endpoint will complete before + * scif_close() returns. Registered windows of the local and peer endpoints are + * released as if scif_unregister() was called against each window. + * + * Closing a SCIF endpoint does not affect local registered memory mapped by + * a SCIF endpoint on a remote node. The local memory remains mapped by the peer + * SCIF endpoint explicitly removed by calling munmap(..) by the peer. + * + * If the peer endpoint's receive queue is not empty at the time that epd is + * closed, then the peer endpoint can be passed as the endpoint parameter to + * scif_recv() until the receive queue is empty. + * + * epd is freed and may no longer be accessed. + * + * Return: + * Upon successful completion, scif_close() returns 0; otherwise in user mode + * -1 is returned and errno is set to indicate the error; in kernel mode the + * negative of one of the following errors is returned. + * + * Errors: + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + */ +int scif_close(scif_epd_t epd); + +/** + * scif_send() - Send a message + * @epd: endpoint descriptor + * @msg: message buffer address + * @len: message length + * @flags: blocking mode flags + * + * scif_send() sends data to the peer of endpoint epd. Up to len bytes of data + * are copied from memory starting at address msg. On successful execution the + * return value of scif_send() is the number of bytes that were sent, and is + * zero if no bytes were sent because len was zero. scif_send() may be called + * only when the endpoint is in a connected state. + * + * If a scif_send() call is non-blocking, then it sends only those bytes which + * can be sent without waiting, up to a maximum of len bytes. + * + * If a scif_send() call is blocking, then it normally returns after sending + * all len bytes. If a blocking call is interrupted or the connection is + * reset, the call is considered successful if some bytes were sent or len is + * zero, otherwise the call is considered unsuccessful. + * + * In user mode, the select() and poll() functions can be used to determine + * when the send queue is not full. In kernel mode, the scif_poll() function + * may be used for this purpose. + * + * It is recommended that scif_send()/scif_recv() only be used for short + * control-type message communication between SCIF endpoints. The SCIF RMA + * APIs are expected to provide better performance for transfer sizes of + * 1024 bytes or longer for the current MIC hardware and software + * implementation. + * + * scif_send() will block until the entire message is sent if SCIF_SEND_BLOCK + * is passed as the flags argument. + * + * Return: + * Upon successful completion, scif_send() returns the number of bytes sent; + * otherwise in user mode -1 is returned and errno is set to indicate the + * error; in kernel mode the negative of one of the following errors is + * returned. + * + * Errors: + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * EFAULT - An invalid address was specified for a parameter + * EINVAL - flags is invalid, or len is negative + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOMEM - Not enough space + * ENOTCONN - The endpoint is not connected + */ +int scif_send(scif_epd_t epd, void *msg, int len, int flags); + +/** + * scif_recv() - Receive a message + * @epd: endpoint descriptor + * @msg: message buffer address + * @len: message buffer length + * @flags: blocking mode flags + * + * scif_recv() receives data from the peer of endpoint epd. Up to len bytes of + * data are copied to memory starting at address msg. On successful execution + * the return value of scif_recv() is the number of bytes that were received, + * and is zero if no bytes were received because len was zero. scif_recv() may + * be called only when the endpoint is in a connected state. + * + * If a scif_recv() call is non-blocking, then it receives only those bytes + * which can be received without waiting, up to a maximum of len bytes. + * + * If a scif_recv() call is blocking, then it normally returns after receiving + * all len bytes. If the blocking call was interrupted due to a disconnection, + * subsequent calls to scif_recv() will copy all bytes received upto the point + * of disconnection. + * + * In user mode, the select() and poll() functions can be used to determine + * when data is available to be received. In kernel mode, the scif_poll() + * function may be used for this purpose. + * + * It is recommended that scif_send()/scif_recv() only be used for short + * control-type message communication between SCIF endpoints. The SCIF RMA + * APIs are expected to provide better performance for transfer sizes of + * 1024 bytes or longer for the current MIC hardware and software + * implementation. + * + * scif_recv() will block until the entire message is received if + * SCIF_RECV_BLOCK is passed as the flags argument. + * + * Return: + * Upon successful completion, scif_recv() returns the number of bytes + * received; otherwise in user mode -1 is returned and errno is set to + * indicate the error; in kernel mode the negative of one of the following + * errors is returned. + * + * Errors: + * EAGAIN - The destination node is returning from a low power state + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * EFAULT - An invalid address was specified for a parameter + * EINVAL - flags is invalid, or len is negative + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOMEM - Not enough space + * ENOTCONN - The endpoint is not connected + */ +int scif_recv(scif_epd_t epd, void *msg, int len, int flags); + +/** + * scif_register() - Mark a memory region for remote access. + * @epd: endpoint descriptor + * @addr: starting virtual address + * @len: length of range + * @offset: offset of window + * @prot_flags: read/write protection flags + * @map_flags: mapping flags + * + * The scif_register() function opens a window, a range of whole pages of the + * registered address space of the endpoint epd, starting at offset po and + * continuing for len bytes. The value of po, further described below, is a + * function of the parameters offset and len, and the value of map_flags. Each + * page of the window represents the physical memory page which backs the + * corresponding page of the range of virtual address pages starting at addr + * and continuing for len bytes. addr and len are constrained to be multiples + * of the page size. A successful scif_register() call returns po. + * + * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset + * exactly, and offset is constrained to be a multiple of the page size. The + * mapping established by scif_register() will not replace any existing + * registration; an error is returned if any page within the range [offset, + * offset + len - 1] intersects an existing window. + * + * When SCIF_MAP_FIXED is not set, the implementation uses offset in an + * implementation-defined manner to arrive at po. The po value so chosen will + * be an area of the registered address space that the implementation deems + * suitable for a mapping of len bytes. An offset value of 0 is interpreted as + * granting the implementation complete freedom in selecting po, subject to + * constraints described below. A non-zero value of offset is taken to be a + * suggestion of an offset near which the mapping should be placed. When the + * implementation selects a value for po, it does not replace any extant + * window. In all cases, po will be a multiple of the page size. + * + * The physical pages which are so represented by a window are available for + * access in calls to mmap(), scif_readfrom(), scif_writeto(), + * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the + * physical pages represented by the window will not be reused by the memory + * subsystem for any other purpose. Note that the same physical page may be + * represented by multiple windows. + * + * Subsequent operations which change the memory pages to which virtual + * addresses are mapped (such as mmap(), munmap()) have no effect on + * existing window. + * + * If the process will fork(), it is recommended that the registered + * virtual address range be marked with MADV_DONTFORK. Doing so will prevent + * problems due to copy-on-write semantics. + * + * The prot_flags argument is formed by OR'ing together one or more of the + * following values. + * SCIF_PROT_READ - allow read operations from the window + * SCIF_PROT_WRITE - allow write operations to the window + * + * The map_flags argument can be set to SCIF_MAP_FIXED which interprets a + * fixed offset. + * + * Return: + * Upon successful completion, scif_register() returns the offset at which the + * mapping was placed (po); otherwise in user mode SCIF_REGISTER_FAILED (that + * is (off_t *)-1) is returned and errno is set to indicate the error; in + * kernel mode the negative of one of the following errors is returned. + * + * Errors: + * EADDRINUSE - SCIF_MAP_FIXED is set in map_flags, and pages in the range + * [offset, offset + len -1] are already registered + * EAGAIN - The mapping could not be performed due to lack of resources + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * EFAULT - Addresses in the range [addr, addr + len - 1] are invalid + * EINVAL - map_flags is invalid, or prot_flags is invalid, or SCIF_MAP_FIXED is + * set in flags, and offset is not a multiple of the page size, or addr is not a + * multiple of the page size, or len is not a multiple of the page size, or is + * 0, or offset is negative + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOMEM - Not enough space + * ENOTCONN -The endpoint is not connected + */ +off_t scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset, + int prot_flags, int map_flags); + +/** + * scif_unregister() - Mark a memory region for remote access. + * @epd: endpoint descriptor + * @offset: start of range to unregister + * @len: length of range to unregister + * + * The scif_unregister() function closes those previously registered windows + * which are entirely within the range [offset, offset + len - 1]. It is an + * error to specify a range which intersects only a subrange of a window. + * + * On a successful return, pages within the window may no longer be specified + * in calls to mmap(), scif_readfrom(), scif_writeto(), scif_vreadfrom(), + * scif_vwriteto(), scif_get_pages, and scif_fence_signal(). The window, + * however, continues to exist until all previous references against it are + * removed. A window is referenced if there is a mapping to it created by + * mmap(), or if scif_get_pages() was called against the window + * (and the pages have not been returned via scif_put_pages()). A window is + * also referenced while an RMA, in which some range of the window is a source + * or destination, is in progress. Finally a window is referenced while some + * offset in that window was specified to scif_fence_signal(), and the RMAs + * marked by that call to scif_fence_signal() have not completed. While a + * window is in this state, its registered address space pages are not + * available for use in a new registered window. + * + * When all such references to the window have been removed, its references to + * all the physical pages which it represents are removed. Similarly, the + * registered address space pages of the window become available for + * registration in a new window. + * + * Return: + * Upon successful completion, scif_unregister() returns 0; otherwise in user + * mode -1 is returned and errno is set to indicate the error; in kernel mode + * the negative of one of the following errors is returned. In the event of an + * error, no windows are unregistered. + * + * Errors: + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * EINVAL - the range [offset, offset + len - 1] intersects a subrange of a + * window, or offset is negative + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOTCONN - The endpoint is not connected + * ENXIO - Offsets in the range [offset, offset + len - 1] are invalid for the + * registered address space of epd + */ +int scif_unregister(scif_epd_t epd, off_t offset, size_t len); + +/** + * scif_readfrom() - Copy from a remote address space + * @epd: endpoint descriptor + * @loffset: offset in local registered address space to + * which to copy + * @len: length of range to copy + * @roffset: offset in remote registered address space + * from which to copy + * @rma_flags: transfer mode flags + * + * scif_readfrom() copies len bytes from the remote registered address space of + * the peer of endpoint epd, starting at the offset roffset to the local + * registered address space of epd, starting at the offset loffset. + * + * Each of the specified ranges [loffset, loffset + len - 1] and [roffset, + * roffset + len - 1] must be within some registered window or windows of the + * local and remote nodes. A range may intersect multiple registered windows, + * but only if those windows are contiguous in the registered address space. + * + * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using + * programmed read/writes. Otherwise the data is copied using DMA. If rma_- + * flags includes SCIF_RMA_SYNC, then scif_readfrom() will return after the + * transfer is complete. Otherwise, the transfer may be performed asynchron- + * ously. The order in which any two asynchronous RMA operations complete + * is non-deterministic. The synchronization functions, scif_fence_mark()/ + * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to + * the completion of asynchronous RMA operations on the same endpoint. + * + * The DMA transfer of individual bytes is not guaranteed to complete in + * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last + * cacheline or partial cacheline of the source range will become visible on + * the destination node after all other transferred data in the source + * range has become visible on the destination node. + * + * The optimal DMA performance will likely be realized if both + * loffset and roffset are cacheline aligned (are a multiple of 64). Lower + * performance will likely be realized if loffset and roffset are not + * cacheline aligned but are separated by some multiple of 64. The lowest level + * of performance is likely if loffset and roffset are not separated by a + * multiple of 64. + * + * The rma_flags argument is formed by ORing together zero or more of the + * following values. + * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA + * engine. + * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the + * transfer has completed. Passing this flag results in the + * current implementation busy waiting and consuming CPU cycles + * while the DMA transfer is in progress for best performance by + * avoiding the interrupt latency. + * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of + * the source range becomes visible on the destination node + * after all other transferred data in the source range has + * become visible on the destination + * + * Return: + * Upon successful completion, scif_readfrom() returns 0; otherwise in user + * mode -1 is returned and errno is set to indicate the error; in kernel mode + * the negative of one of the following errors is returned. + * + * Errors: + * EACCESS - Attempt to write to a read-only range + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * EINVAL - rma_flags is invalid + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOTCONN - The endpoint is not connected + * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered + * address space of epd, or, The range [roffset, roffset + len - 1] is invalid + * for the registered address space of the peer of epd, or loffset or roffset + * is negative + */ +int scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, off_t + roffset, int rma_flags); + +/** + * scif_writeto() - Copy to a remote address space + * @epd: endpoint descriptor + * @loffset: offset in local registered address space + * from which to copy + * @len: length of range to copy + * @roffset: offset in remote registered address space to + * which to copy + * @rma_flags: transfer mode flags + * + * scif_writeto() copies len bytes from the local registered address space of + * epd, starting at the offset loffset to the remote registered address space + * of the peer of endpoint epd, starting at the offset roffset. + * + * Each of the specified ranges [loffset, loffset + len - 1] and [roffset, + * roffset + len - 1] must be within some registered window or windows of the + * local and remote nodes. A range may intersect multiple registered windows, + * but only if those windows are contiguous in the registered address space. + * + * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using + * programmed read/writes. Otherwise the data is copied using DMA. If rma_- + * flags includes SCIF_RMA_SYNC, then scif_writeto() will return after the + * transfer is complete. Otherwise, the transfer may be performed asynchron- + * ously. The order in which any two asynchronous RMA operations complete + * is non-deterministic. The synchronization functions, scif_fence_mark()/ + * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to + * the completion of asynchronous RMA operations on the same endpoint. + * + * The DMA transfer of individual bytes is not guaranteed to complete in + * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last + * cacheline or partial cacheline of the source range will become visible on + * the destination node after all other transferred data in the source + * range has become visible on the destination node. + * + * The optimal DMA performance will likely be realized if both + * loffset and roffset are cacheline aligned (are a multiple of 64). Lower + * performance will likely be realized if loffset and roffset are not cacheline + * aligned but are separated by some multiple of 64. The lowest level of + * performance is likely if loffset and roffset are not separated by a multiple + * of 64. + * + * The rma_flags argument is formed by ORing together zero or more of the + * following values. + * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA + * engine. + * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the + * transfer has completed. Passing this flag results in the + * current implementation busy waiting and consuming CPU cycles + * while the DMA transfer is in progress for best performance by + * avoiding the interrupt latency. + * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of + * the source range becomes visible on the destination node + * after all other transferred data in the source range has + * become visible on the destination + * + * Return: + * Upon successful completion, scif_readfrom() returns 0; otherwise in user + * mode -1 is returned and errno is set to indicate the error; in kernel mode + * the negative of one of the following errors is returned. + * + * Errors: + * EACCESS - Attempt to write to a read-only range + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * EINVAL - rma_flags is invalid + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOTCONN - The endpoint is not connected + * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered + * address space of epd, or, The range [roffset , roffset + len -1] is invalid + * for the registered address space of the peer of epd, or loffset or roffset + * is negative + */ +int scif_writeto(scif_epd_t epd, off_t loffset, size_t len, off_t + roffset, int rma_flags); + +/** + * scif_vreadfrom() - Copy from a remote address space + * @epd: endpoint descriptor + * @addr: address to which to copy + * @len: length of range to copy + * @roffset: offset in remote registered address space + * from which to copy + * @rma_flags: transfer mode flags + * + * scif_vreadfrom() copies len bytes from the remote registered address + * space of the peer of endpoint epd, starting at the offset roffset, to local + * memory, starting at addr. + * + * The specified range [roffset, roffset + len - 1] must be within some + * registered window or windows of the remote nodes. The range may + * intersect multiple registered windows, but only if those windows are + * contiguous in the registered address space. + * + * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using + * programmed read/writes. Otherwise the data is copied using DMA. If rma_- + * flags includes SCIF_RMA_SYNC, then scif_vreadfrom() will return after the + * transfer is complete. Otherwise, the transfer may be performed asynchron- + * ously. The order in which any two asynchronous RMA operations complete + * is non-deterministic. The synchronization functions, scif_fence_mark()/ + * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to + * the completion of asynchronous RMA operations on the same endpoint. + * + * The DMA transfer of individual bytes is not guaranteed to complete in + * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last + * cacheline or partial cacheline of the source range will become visible on + * the destination node after all other transferred data in the source + * range has become visible on the destination node. + * + * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back + * the specified local memory range may be remain in a pinned state even after + * the specified transfer completes. This may reduce overhead if some or all of + * the same virtual address range is referenced in a subsequent call of + * scif_vreadfrom() or scif_vwriteto(). + * + * The optimal DMA performance will likely be realized if both + * addr and roffset are cacheline aligned (are a multiple of 64). Lower + * performance will likely be realized if addr and roffset are not + * cacheline aligned but are separated by some multiple of 64. The lowest level + * of performance is likely if addr and roffset are not separated by a + * multiple of 64. + * + * The rma_flags argument is formed by ORing together zero or more of the + * following values. + * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA + * engine. + * SCIF_RMA_USECACHE - enable registration caching + * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the + * transfer has completed. Passing this flag results in the + * current implementation busy waiting and consuming CPU cycles + * while the DMA transfer is in progress for best performance by + * avoiding the interrupt latency. + * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of + * the source range becomes visible on the destination node + * after all other transferred data in the source range has + * become visible on the destination + * + * Return: + * Upon successful completion, scif_vreadfrom() returns 0; otherwise in user + * mode -1 is returned and errno is set to indicate the error; in kernel mode + * the negative of one of the following errors is returned. + * + * Errors: + * EACCESS - Attempt to write to a read-only range + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * EFAULT - Addresses in the range [addr, addr + len - 1] are invalid + * EINVAL - rma_flags is invalid + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOTCONN - The endpoint is not connected + * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the + * registered address space of epd + */ +int scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, off_t roffset, + int rma_flags); + +/** + * scif_vwriteto() - Copy to a remote address space + * @epd: endpoint descriptor + * @addr: address from which to copy + * @len: length of range to copy + * @roffset: offset in remote registered address space to + * which to copy + * @rma_flags: transfer mode flags + * + * scif_vwriteto() copies len bytes from the local memory, starting at addr, to + * the remote registered address space of the peer of endpoint epd, starting at + * the offset roffset. + * + * The specified range [roffset, roffset + len - 1] must be within some + * registered window or windows of the remote nodes. The range may intersect + * multiple registered windows, but only if those windows are contiguous in the + * registered address space. + * + * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using + * programmed read/writes. Otherwise the data is copied using DMA. If rma_- + * flags includes SCIF_RMA_SYNC, then scif_vwriteto() will return after the + * transfer is complete. Otherwise, the transfer may be performed asynchron- + * ously. The order in which any two asynchronous RMA operations complete + * is non-deterministic. The synchronization functions, scif_fence_mark()/ + * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to + * the completion of asynchronous RMA operations on the same endpoint. + * + * The DMA transfer of individual bytes is not guaranteed to complete in + * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last + * cacheline or partial cacheline of the source range will become visible on + * the destination node after all other transferred data in the source + * range has become visible on the destination node. + * + * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back + * the specified local memory range may be remain in a pinned state even after + * the specified transfer completes. This may reduce overhead if some or all of + * the same virtual address range is referenced in a subsequent call of + * scif_vreadfrom() or scif_vwriteto(). + * + * The optimal DMA performance will likely be realized if both + * addr and offset are cacheline aligned (are a multiple of 64). Lower + * performance will likely be realized if addr and offset are not cacheline + * aligned but are separated by some multiple of 64. The lowest level of + * performance is likely if addr and offset are not separated by a multiple of + * 64. + * + * The rma_flags argument is formed by ORing together zero or more of the + * following values. + * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA + * engine. + * SCIF_RMA_USECACHE - allow registration caching + * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the + * transfer has completed. Passing this flag results in the + * current implementation busy waiting and consuming CPU cycles + * while the DMA transfer is in progress for best performance by + * avoiding the interrupt latency. + * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of + * the source range becomes visible on the destination node + * after all other transferred data in the source range has + * become visible on the destination + * + * Return: + * Upon successful completion, scif_vwriteto() returns 0; otherwise in user + * mode -1 is returned and errno is set to indicate the error; in kernel mode + * the negative of one of the following errors is returned. + * + * Errors: + * EACCESS - Attempt to write to a read-only range + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * EFAULT - Addresses in the range [addr, addr + len - 1] are invalid + * EINVAL - rma_flags is invalid + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOTCONN - The endpoint is not connected + * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the + * registered address space of epd + */ +int scif_vwriteto(scif_epd_t epd, void *addr, size_t len, off_t roffset, + int rma_flags); + +/** + * scif_fence_mark() - Mark previously issued RMAs + * @epd: endpoint descriptor + * @flags: control flags + * @mark: marked value returned as output. + * + * scif_fence_mark() returns after marking the current set of all uncompleted + * RMAs initiated through the endpoint epd or the current set of all + * uncompleted RMAs initiated through the peer of endpoint epd. The RMAs are + * marked with a value returned at mark. The application may subsequently call + * scif_fence_wait(), passing the value returned at mark, to await completion + * of all RMAs so marked. + * + * The flags argument has exactly one of the following values. + * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint + * epd are marked + * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer + * of endpoint epd are marked + * + * Return: + * Upon successful completion, scif_fence_mark() returns 0; otherwise in user + * mode -1 is returned and errno is set to indicate the error; in kernel mode + * the negative of one of the following errors is returned. + * + * Errors: + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * EINVAL - flags is invalid + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOTCONN - The endpoint is not connected + * ENOMEM - Insufficient kernel memory was available + */ +int scif_fence_mark(scif_epd_t epd, int flags, int *mark); + +/** + * scif_fence_wait() - Wait for completion of marked RMAs + * @epd: endpoint descriptor + * @mark: mark request + * + * scif_fence_wait() returns after all RMAs marked with mark have completed. + * The value passed in mark must have been obtained in a previous call to + * scif_fence_mark(). + * + * Return: + * Upon successful completion, scif_fence_wait() returns 0; otherwise in user + * mode -1 is returned and errno is set to indicate the error; in kernel mode + * the negative of one of the following errors is returned. + * + * Errors: + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOTCONN - The endpoint is not connected + * ENOMEM - Insufficient kernel memory was available + */ +int scif_fence_wait(scif_epd_t epd, int mark); + +/** + * scif_fence_signal() - Request a memory update on completion of RMAs + * @epd: endpoint descriptor + * @loff: local offset + * @lval: local value to write to loffset + * @roff: remote offset + * @rval: remote value to write to roffset + * @flags: flags + * + * scif_fence_signal() returns after marking the current set of all uncompleted + * RMAs initiated through the endpoint epd or marking the current set of all + * uncompleted RMAs initiated through the peer of endpoint epd. + * + * If flags includes SCIF_SIGNAL_LOCAL, then on completion of the RMAs in the + * marked set, lval is written to memory at the address corresponding to offset + * loff in the local registered address space of epd. loff must be within a + * registered window. If flags includes SCIF_SIGNAL_REMOTE, then on completion + * of the RMAs in the marked set, rval is written to memory at the address + * corresponding to offset roff in the remote registered address space of epd. + * roff must be within a remote registered window of the peer of epd. Note + * that any specified offset must be DWORD (4 byte / 32 bit) aligned. + * + * The flags argument is formed by OR'ing together the following. + * Exactly one of the following values. + * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint + * epd are marked + * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer + * of endpoint epd are marked + * One or more of the following values. + * SCIF_SIGNAL_LOCAL - On completion of the marked set of RMAs, write lval to + * memory at the address corresponding to offset loff in the local + * registered address space of epd. + * SCIF_SIGNAL_REMOTE - On completion of the marked set of RMAs, write rval to + * memory at the address corresponding to offset roff in the remote + * registered address space of epd. + * + * Return: + * Upon successful completion, scif_fence_signal() returns 0; otherwise in + * user mode -1 is returned and errno is set to indicate the error; in kernel + * mode the negative of one of the following errors is returned. + * + * Errors: + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * EINVAL - flags is invalid, or loff or roff are not DWORD aligned + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOTCONN - The endpoint is not connected + * ENXIO - loff is invalid for the registered address of epd, or roff is invalid + * for the registered address space, of the peer of epd + */ +int scif_fence_signal(scif_epd_t epd, off_t loff, u64 lval, off_t roff, + u64 rval, int flags); + +/** + * scif_get_node_ids() - Return information about online nodes + * @nodes: array in which to return online node IDs + * @len: number of entries in the nodes array + * @self: address to place the node ID of the local node + * + * scif_get_node_ids() fills in the nodes array with up to len node IDs of the + * nodes in the SCIF network. If there is not enough space in nodes, as + * indicated by the len parameter, only len node IDs are returned in nodes. The + * return value of scif_get_node_ids() is the total number of nodes currently in + * the SCIF network. By checking the return value against the len parameter, + * the user may determine if enough space for nodes was allocated. + * + * The node ID of the local node is returned at self. + * + * Return: + * Upon successful completion, scif_get_node_ids() returns the actual number of + * online nodes in the SCIF network including 'self'; otherwise in user mode + * -1 is returned and errno is set to indicate the error; in kernel mode no + * errors are returned. + * + * Errors: + * EFAULT - Bad address + */ +int scif_get_node_ids(u16 *nodes, int len, u16 *self); + +#endif /* __SCIF_H__ */ -- cgit v1.2.3 From 3647a83d9dcf00b8e17777ec8aa1e48f1ed4fe06 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Sat, 11 Apr 2015 18:07:39 -0700 Subject: Drivers: hv: util: move kvp/vss function declarations to hyperv_vmbus.h These declarations are internal to hv_util module and hv_fcopy_* declarations already reside there. Signed-off-by: Vitaly Kuznetsov Tested-by: Alex Ng Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- include/linux/hyperv.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 902c37aef67e..1744148a39f9 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1236,13 +1236,6 @@ extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *, struct icmsg_negotiate *, u8 *, int, int); -int hv_kvp_init(struct hv_util_service *); -void hv_kvp_deinit(void); -void hv_kvp_onchannelcallback(void *); - -int hv_vss_init(struct hv_util_service *); -void hv_vss_deinit(void); -void hv_vss_onchannelcallback(void *); void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid); extern struct resource hyperv_mmio; -- cgit v1.2.3 From db9ba2088f6507fee370904f02db1eb9b49bd088 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Wed, 22 Apr 2015 21:31:31 -0700 Subject: drivers: hv: vmbus: Get rid of some unused definitions Get rid of some unused definitions. Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- include/linux/hyperv.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 1744148a39f9..e29ccddc6300 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -389,10 +389,6 @@ enum vmbus_channel_message_type { CHANNELMSG_INITIATE_CONTACT = 14, CHANNELMSG_VERSION_RESPONSE = 15, CHANNELMSG_UNLOAD = 16, -#ifdef VMBUS_FEATURE_PARENT_OR_PEER_MEMORY_MAPPED_INTO_A_CHILD - CHANNELMSG_VIEWRANGE_ADD = 17, - CHANNELMSG_VIEWRANGE_REMOVE = 18, -#endif CHANNELMSG_COUNT }; @@ -549,21 +545,6 @@ struct vmbus_channel_gpadl_torndown { u32 gpadl; } __packed; -#ifdef VMBUS_FEATURE_PARENT_OR_PEER_MEMORY_MAPPED_INTO_A_CHILD -struct vmbus_channel_view_range_add { - struct vmbus_channel_message_header header; - PHYSICAL_ADDRESS viewrange_base; - u64 viewrange_length; - u32 child_relid; -} __packed; - -struct vmbus_channel_view_range_remove { - struct vmbus_channel_message_header header; - PHYSICAL_ADDRESS viewrange_base; - u32 child_relid; -} __packed; -#endif - struct vmbus_channel_relid_released { struct vmbus_channel_message_header header; u32 child_relid; -- cgit v1.2.3 From 2db84eff127e3f4b3635edc589cd6a56db8755a3 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Wed, 22 Apr 2015 21:31:32 -0700 Subject: Drivers: hv: vmbus: Implement the protocol for tearing down vmbus state Implement the protocol for tearing down the monitor state established with the host. Signed-off-by: K. Y. Srinivasan Tested-by: Vitaly Kuznetsov Signed-off-by: Greg Kroah-Hartman --- include/linux/hyperv.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index e29ccddc6300..ea934864293d 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -389,6 +389,7 @@ enum vmbus_channel_message_type { CHANNELMSG_INITIATE_CONTACT = 14, CHANNELMSG_VERSION_RESPONSE = 15, CHANNELMSG_UNLOAD = 16, + CHANNELMSG_UNLOAD_RESPONSE = 17, CHANNELMSG_COUNT }; -- cgit v1.2.3 From fea844a2b0edd6540d5cde2cd54a8a3c86e9c53f Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov <[mailto:vkuznets@redhat.com]> Date: Wed, 6 May 2015 17:47:43 -0700 Subject: Drivers: hv: vmbus: briefly comment num_sc and next_oc next_oc and num_sc fields of struct vmbus_channel deserve a description. Move them closer to sc_list as these fields are related to it. Signed-off-by: Vitaly Kuznetsov Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- include/linux/hyperv.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index ea934864293d..3932a993ff5a 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -726,6 +726,15 @@ struct vmbus_channel { * All Sub-channels of a primary channel are linked here. */ struct list_head sc_list; + /* + * Current number of sub-channels. + */ + int num_sc; + /* + * Number of a sub-channel (position within sc_list) which is supposed + * to be used as the next outgoing channel. + */ + int next_oc; /* * The primary channel this sub-channel belongs to. * This will be NULL for the primary channel. @@ -740,9 +749,6 @@ struct vmbus_channel { * link up channels based on their CPU affinity. */ struct list_head percpu_list; - - int num_sc; - int next_oc; }; static inline void set_channel_read_state(struct vmbus_channel *c, bool state) -- cgit v1.2.3 From 80c6e1465948c2e91214f01764f427d31ebedb26 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 21 May 2015 15:49:37 -0700 Subject: driver-core: fix build for !CONFIG_MODULES Commit f2411da74698 ("driver-core: add driver module asynchronous probe support") broke build in case modules are disabled, because in this case "struct module" is not defined and we can't dereference it. Let's define module_requested_async_probing() helper and stub it out if modules are disabled. Reported-by: kbuild test robot Reported-by: Stephen Rothwell Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- include/linux/module.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index f46a47d3c0dc..57f5c0a804c0 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -510,6 +510,11 @@ int unregister_module_notifier(struct notifier_block *nb); extern void print_modules(void); +static inline bool module_requested_async_probing(struct module *module) +{ + return module && module->async_probe_requested; +} + #else /* !CONFIG_MODULES... */ /* Given an address, look for it in the exception tables. */ @@ -620,6 +625,12 @@ static inline int unregister_module_notifier(struct notifier_block *nb) static inline void print_modules(void) { } + +static inline bool module_requested_async_probing(struct module *module) +{ + return false; +} + #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS -- cgit v1.2.3 From 2539b258ec028351af954c169ea1b0ff72023a9f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 8 May 2015 14:45:34 +0100 Subject: drivers/base: cacheinfo: fix annoying typo when DT nodes are absent s/hierarcy/hierarchy/ Maybe the typo will annoy people enough so that they add the missing nodes to their device-tree files, but I still think this is better off fixed. Signed-off-by: Will Deacon Acked-by: Sudeep Holla Signed-off-by: Greg Kroah-Hartman --- include/linux/cacheinfo.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 3daf5ed392c9..2189935075b4 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -19,7 +19,7 @@ enum cache_type { /** * struct cacheinfo - represent a cache leaf node * @type: type of the cache - data, inst or unified - * @level: represents the hierarcy in the multi-level cache + * @level: represents the hierarchy in the multi-level cache * @coherency_line_size: size of each cache line usually representing * the minimum amount of data that gets transferred from memory * @number_of_sets: total number of sets, a set is a collection of cache -- cgit v1.2.3 From f4445f8b204de44a8baa4326b0e56537be867427 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 14 May 2015 15:28:24 +0100 Subject: drivers: of/base: move of_init to driver_init Commit 5590f3196b29 ("drivers/core/of: Add symlink to device-tree from devices with an OF node") adds the symlink `of_node` for each device pointing to it's device tree node while creating/initialising it. However the devicetree sysfs is created and setup in of_init which is executed at core_initcall level. For all the devices created before of_init, the following error is thrown: "Error -2(-ENOENT) creating of_node link" Like many other components in driver model, initialize the sysfs support for OF/devicetree from driver_init so that it's ready before any devices are created. Fixes: 5590f3196b29 ("drivers/core/of: Add symlink to device-tree from devices with an OF node") Suggested-by: Rob Herring Cc: Grant Likely Cc: Pawel Moll Cc: Benjamin Herrenschmidt Signed-off-by: Sudeep Holla Tested-by: Robert Schwebel Acked-by: Rob Herring Signed-off-by: Greg Kroah-Hartman --- include/linux/of.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index ddeaae6d2083..b871ff9d81d7 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -121,6 +121,8 @@ extern struct device_node *of_stdout; extern raw_spinlock_t devtree_lock; #ifdef CONFIG_OF +void of_core_init(void); + static inline bool is_of_node(struct fwnode_handle *fwnode) { return fwnode && fwnode->type == FWNODE_OF; @@ -376,6 +378,10 @@ bool of_console_check(struct device_node *dn, char *name, int index); #else /* CONFIG_OF */ +static inline void of_core_init(void) +{ +} + static inline bool is_of_node(struct fwnode_handle *fwnode) { return false; -- cgit v1.2.3 From 496e7ce2a46562938edcb74f65b26068ee8895f6 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 7 May 2015 01:08:08 -0700 Subject: gpiolib: Add missing dummies for the unified device properties interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If GPIOLIB=n: drivers/leds/leds-gpio.c: In function ‘gpio_leds_create’: drivers/leds/leds-gpio.c:187: error: implicit declaration of function ‘devm_get_gpiod_from_child’ drivers/leds/leds-gpio.c:187: warning: assignment makes pointer from integer without a cast Add dummies for fwnode_get_named_gpiod() and devm_get_gpiod_from_child() for the !GPIOLIB case to fix this. Signed-off-by: Geert Uytterhoeven Fixes: 40b7318319281b1b ("gpio: Support for unified device properties interface") Acked-by: Alexandre Courbot Acked-by: Linus Walleij Signed-off-by: Bryan Wu --- include/linux/gpio/consumer.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 3a7c9ffd5ab9..da042657dc31 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -406,6 +406,21 @@ static inline int desc_to_gpio(const struct gpio_desc *desc) return -EINVAL; } +/* Child properties interface */ +struct fwnode_handle; + +static inline struct gpio_desc *fwnode_get_named_gpiod( + struct fwnode_handle *fwnode, const char *propname) +{ + return ERR_PTR(-ENOSYS); +} + +static inline struct gpio_desc *devm_get_gpiod_from_child( + struct device *dev, const char *con_id, struct fwnode_handle *child) +{ + return ERR_PTR(-ENOSYS); +} + #endif /* CONFIG_GPIOLIB */ /* -- cgit v1.2.3 From a57e16cf03339c20b09642f46f60190069ff70c7 Mon Sep 17 00:00:00 2001 From: Robert Jarzmik Date: Mon, 25 May 2015 23:29:20 +0200 Subject: dmaengine: pxa: add pxa dmaengine driver This is a new driver for pxa SoCs, which is also compatible with the former mmp_pdma. The rationale behind a new driver (as opposed to incremental patching) was : - the new driver relies on virt-dma, which obsoletes all the internal structures of mmp_pdma (sw_desc, hw_desc, ...), and by consequence all the functions - mmp_pdma allocates dma coherent descriptors containing not only hardware descriptors but linked list information The new driver only puts the dma hardware descriptors (ie. 4 u32) into the dma pool allocated memory. This changes completely the way descriptors are handled - the architecture behind the interrupt/tasklet management was rewritten to be more conforming to virt-dma - the buffers alignment is handled differently The former driver assumed that the DMA channel stopped between each descriptor. The new one chains descriptors to let the channel running. This is a necessary guarantee for real-time high bandwidth usecases such as video capture on "old" architectures such as pxa. - hot chaining / cold chaining / no chaining Whenever possible, submitting a descriptor "hot chains" it to a running channel. There is still no guarantee that the descriptor will be issued, as the channel might be stopped just before the descriptor is submitted. Yet this allows to submit several video buffers, and resubmit a buffer while another is under handling. As before, dma_async_issue_pending() is the only guarantee to have all the buffers issued. When an alignment issue is detected (ie. one address in a descriptor is not a multiple of 8), if the already running channel is in "aligned mode", the channel will stop, and restarted in "misaligned mode" to finished the issued list. - descriptors reusing A submitted, issued and completed descriptor can be reused, ie resubmitted if it was prepared with the proper flag (DMA_PREP_ACK). Only a channel resources release will in this case release that buffer. This allows a rolling ring of buffers to be reused, where there are several thousands of hardware descriptors used (video buffer for example). Additionally, a set of more casual features is introduced : - debugging traces - lockless way to know if a descriptor is terminated or not The driver was tested on zylonite board (pxa3xx) and mioa701 (pxa27x), with dmatest, pxa_camera and pxamci. Signed-off-by: Robert Jarzmik Signed-off-by: Vinod Koul --- include/linux/dma/pxa-dma.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 include/linux/dma/pxa-dma.h (limited to 'include/linux') diff --git a/include/linux/dma/pxa-dma.h b/include/linux/dma/pxa-dma.h new file mode 100644 index 000000000000..3edc99294bf6 --- /dev/null +++ b/include/linux/dma/pxa-dma.h @@ -0,0 +1,27 @@ +#ifndef _PXA_DMA_H_ +#define _PXA_DMA_H_ + +enum pxad_chan_prio { + PXAD_PRIO_HIGHEST = 0, + PXAD_PRIO_NORMAL, + PXAD_PRIO_LOW, + PXAD_PRIO_LOWEST, +}; + +struct pxad_param { + unsigned int drcmr; + enum pxad_chan_prio prio; +}; + +struct dma_chan; + +#ifdef CONFIG_PXA_DMA +bool pxad_filter_fn(struct dma_chan *chan, void *param); +#else +static inline bool pxad_filter_fn(struct dma_chan *chan, void *param) +{ + return false; +} +#endif + +#endif /* _PXA_DMA_H_ */ -- cgit v1.2.3 From e0213bc5467ca5fe44ab04527f0e47998f30c046 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Mon, 18 May 2015 20:04:14 +0900 Subject: usb: renesas_usbhs: Change USBHS_TYPE_R8A779x to USBHS_TYPE_RCAR_GEN2 Since the HSUSB controllers of R-Car Gen2 are the same specification (they have 16 pipes and usb-dmac), this patch changes USBHS_TYPE_R8A7790 and USBHS_TYPE_R8A7791 to USBHS_TYPE_RCAR_GEN2. Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- include/linux/usb/renesas_usbhs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/renesas_usbhs.h b/include/linux/usb/renesas_usbhs.h index f06529c14141..3dd5a781da99 100644 --- a/include/linux/usb/renesas_usbhs.h +++ b/include/linux/usb/renesas_usbhs.h @@ -169,8 +169,7 @@ struct renesas_usbhs_driver_param { #define USBHS_USB_DMAC_XFER_SIZE 32 /* hardcode the xfer size */ }; -#define USBHS_TYPE_R8A7790 1 -#define USBHS_TYPE_R8A7791 2 +#define USBHS_TYPE_RCAR_GEN2 1 /* * option: -- cgit v1.2.3 From a09e23f53e2c14a65a3b14a00060fea163081e1f Mon Sep 17 00:00:00 2001 From: Mian Yousaf Kaukab Date: Sat, 16 May 2015 22:33:35 +0200 Subject: usb: gadget: net2280: check interrupts for all endpoints USB3380 in enhanced mode has 4 IN and 4 OUT endpoints. Check interrupts for all of them. Tested-by: Ricardo Ribalda Delgado Signed-off-by: Mian Yousaf Kaukab Signed-off-by: Felipe Balbi --- include/linux/usb/net2280.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/net2280.h b/include/linux/usb/net2280.h index 148b8fa5b1a2..725120224472 100644 --- a/include/linux/usb/net2280.h +++ b/include/linux/usb/net2280.h @@ -168,6 +168,9 @@ struct net2280_regs { #define ENDPOINT_B_INTERRUPT 2 #define ENDPOINT_A_INTERRUPT 1 #define ENDPOINT_0_INTERRUPT 0 +#define USB3380_IRQSTAT0_EP_INTR_MASK_IN (0xF << 17) +#define USB3380_IRQSTAT0_EP_INTR_MASK_OUT (0xF << 1) + u32 irqstat1; #define POWER_STATE_CHANGE_INTERRUPT 27 #define PCI_ARBITER_TIMEOUT_INTERRUPT 26 -- cgit v1.2.3 From c65c4f052bc3b67989bf54914798513685c54988 Mon Sep 17 00:00:00 2001 From: Mian Yousaf Kaukab Date: Sat, 16 May 2015 22:33:36 +0200 Subject: usb: gadget: net2280: fix use of GPEP in both directions USB3380 enhanced mode allows GPEP to be used in both IN and OUT directions. However, IN and OUT endpoints must use same USB endpoint address (bEndpointAddress). Fix this by setting the ep_cfg.ep_number during initialization and keep it in net2280_enable() Tested-by: Ricardo Ribalda Delgado Signed-off-by: Mian Yousaf Kaukab Signed-off-by: Felipe Balbi --- include/linux/usb/usb338x.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/usb338x.h b/include/linux/usb/usb338x.h index f92eb635b9d3..11525d8d89a7 100644 --- a/include/linux/usb/usb338x.h +++ b/include/linux/usb/usb338x.h @@ -43,6 +43,10 @@ #define IN_ENDPOINT_TYPE 12 #define OUT_ENDPOINT_ENABLE 10 #define OUT_ENDPOINT_TYPE 8 +#define USB3380_EP_CFG_MASK_IN ((0x3 << IN_ENDPOINT_TYPE) | \ + BIT(IN_ENDPOINT_ENABLE)) +#define USB3380_EP_CFG_MASK_OUT ((0x3 << OUT_ENDPOINT_TYPE) | \ + BIT(OUT_ENDPOINT_ENABLE)) struct usb338x_usb_ext_regs { u32 usbclass; -- cgit v1.2.3 From e842b84c8e7221c45c8dbd7de09185c6149e1cf9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 23 Mar 2015 09:52:48 +1100 Subject: usb: phy: Add interface to get phy give of device_node. Split the "get phy from device_node" functionality out of "get phy by phandle" so it can be used directly. This is useful when a battery-charger is intimately associated with a particular phy but handled by a separate driver. The charger can find the device_node based on sibling relationships without the need for a redundant declaration in the devicetree description. As a peripheral that gets a phy will often want to register a notifier block, and de-register it later, that functionality is included so the de-registration is automatic. Acked-by: Pavel Machek Signed-off-by: NeilBrown Signed-off-by: Felipe Balbi --- include/linux/usb/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h index bc91b5d380fd..8ed1e29ef329 100644 --- a/include/linux/usb/phy.h +++ b/include/linux/usb/phy.h @@ -205,6 +205,8 @@ extern struct usb_phy *usb_get_phy_dev(struct device *dev, u8 index); extern struct usb_phy *devm_usb_get_phy_dev(struct device *dev, u8 index); extern struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, const char *phandle, u8 index); +extern struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, + struct device_node *node, struct notifier_block *nb); extern void usb_put_phy(struct usb_phy *); extern void devm_usb_put_phy(struct device *dev, struct usb_phy *x); extern int usb_bind_phy(const char *dev_name, u8 index, -- cgit v1.2.3 From e4b88e19897f1039fd83f1630517becafc0dd163 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 22 May 2015 13:44:33 -0700 Subject: Input: stmpe-ts - enforce device tree only mode The STMPE MFD is only used with device tree configured systems (and STMPE MFD core depends on OF), so force the configuration to come from device tree only. Tested-by: Heiner Kallweit Reviewed-by: Marek Vasut Acked-by: Lee Jones Signed-off-by: Dmitry Torokhov --- include/linux/mfd/stmpe.h | 44 -------------------------------------------- 1 file changed, 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/stmpe.h b/include/linux/mfd/stmpe.h index c9d869027300..cb83883918a7 100644 --- a/include/linux/mfd/stmpe.h +++ b/include/linux/mfd/stmpe.h @@ -117,47 +117,6 @@ extern int stmpe_disable(struct stmpe *stmpe, unsigned int blocks); #define STMPE_GPIO_NOREQ_811_TOUCH (0xf0) -/** - * struct stmpe_ts_platform_data - stmpe811 touch screen controller platform - * data - * @sample_time: ADC converstion time in number of clock. - * (0 -> 36 clocks, 1 -> 44 clocks, 2 -> 56 clocks, 3 -> 64 clocks, - * 4 -> 80 clocks, 5 -> 96 clocks, 6 -> 144 clocks), - * recommended is 4. - * @mod_12b: ADC Bit mode (0 -> 10bit ADC, 1 -> 12bit ADC) - * @ref_sel: ADC reference source - * (0 -> internal reference, 1 -> external reference) - * @adc_freq: ADC Clock speed - * (0 -> 1.625 MHz, 1 -> 3.25 MHz, 2 || 3 -> 6.5 MHz) - * @ave_ctrl: Sample average control - * (0 -> 1 sample, 1 -> 2 samples, 2 -> 4 samples, 3 -> 8 samples) - * @touch_det_delay: Touch detect interrupt delay - * (0 -> 10 us, 1 -> 50 us, 2 -> 100 us, 3 -> 500 us, - * 4-> 1 ms, 5 -> 5 ms, 6 -> 10 ms, 7 -> 50 ms) - * recommended is 3 - * @settling: Panel driver settling time - * (0 -> 10 us, 1 -> 100 us, 2 -> 500 us, 3 -> 1 ms, - * 4 -> 5 ms, 5 -> 10 ms, 6 for 50 ms, 7 -> 100 ms) - * recommended is 2 - * @fraction_z: Length of the fractional part in z - * (fraction_z ([0..7]) = Count of the fractional part) - * recommended is 7 - * @i_drive: current limit value of the touchscreen drivers - * (0 -> 20 mA typical 35 mA max, 1 -> 50 mA typical 80 mA max) - * - * */ -struct stmpe_ts_platform_data { - u8 sample_time; - u8 mod_12b; - u8 ref_sel; - u8 adc_freq; - u8 ave_ctrl; - u8 touch_det_delay; - u8 settling; - u8 fraction_z; - u8 i_drive; -}; - /** * struct stmpe_platform_data - STMPE platform data * @id: device id to distinguish between multiple STMPEs on the same board @@ -168,7 +127,6 @@ struct stmpe_ts_platform_data { * @irq_over_gpio: true if gpio is used to get irq * @irq_gpio: gpio number over which irq will be requested (significant only if * irq_over_gpio is true) - * @ts: touchscreen-specific platform data */ struct stmpe_platform_data { int id; @@ -178,8 +136,6 @@ struct stmpe_platform_data { bool irq_over_gpio; int irq_gpio; int autosleep_timeout; - - struct stmpe_ts_platform_data *ts; }; #endif -- cgit v1.2.3 From 7d7efec368d537226142cbe559f45797f18672f9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 May 2015 16:35:16 -0400 Subject: sched, cgroup: reorganize threadgroup locking threadgroup_change_begin/end() are used to mark the beginning and end of threadgroup modifying operations to allow code paths which require a threadgroup to stay stable across blocking operations to synchronize against those sections using threadgroup_lock/unlock(). It's currently implemented as a general mechanism in sched.h using per-signal_struct rwsem; however, this never grew non-cgroup use cases and becomes noop if !CONFIG_CGROUPS. It turns out that cgroups is gonna be better served with a different sycnrhonization scheme and is a bit silly to keep cgroups specific details as a general mechanism. What's general here is identifying the places where threadgroups are modified. This patch restructures threadgroup locking so that threadgroup_change_begin/end() become a place where subsystems which need to sycnhronize against threadgroup changes can hook into. cgroup_threadgroup_change_begin/end() which operate on the per-signal_struct rwsem are created and threadgroup_lock/unlock() are moved to cgroup.c and made static. This is pure reorganization which doesn't cause any functional changes. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Peter Zijlstra --- include/linux/cgroup-defs.h | 10 +++++++++ include/linux/sched.h | 53 +++++++++++++++------------------------------ 2 files changed, 27 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 55f3120fb952..1b8c93806dbd 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #ifdef CONFIG_CGROUPS @@ -460,5 +461,14 @@ struct cgroup_subsys { unsigned int depends_on; }; +void cgroup_threadgroup_change_begin(struct task_struct *tsk); +void cgroup_threadgroup_change_end(struct task_struct *tsk); + +#else /* CONFIG_CGROUPS */ + +static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {} +static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} + #endif /* CONFIG_CGROUPS */ + #endif /* _LINUX_CGROUP_DEFS_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 8222ae40ecb0..5ee290003470 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -58,6 +58,7 @@ struct sched_param { #include #include #include +#include #include @@ -2648,53 +2649,33 @@ static inline void unlock_task_sighand(struct task_struct *tsk, spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); } -#ifdef CONFIG_CGROUPS -static inline void threadgroup_change_begin(struct task_struct *tsk) -{ - down_read(&tsk->signal->group_rwsem); -} -static inline void threadgroup_change_end(struct task_struct *tsk) -{ - up_read(&tsk->signal->group_rwsem); -} - /** - * threadgroup_lock - lock threadgroup - * @tsk: member task of the threadgroup to lock - * - * Lock the threadgroup @tsk belongs to. No new task is allowed to enter - * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or - * change ->group_leader/pid. This is useful for cases where the threadgroup - * needs to stay stable across blockable operations. + * threadgroup_change_begin - mark the beginning of changes to a threadgroup + * @tsk: task causing the changes * - * fork and exit paths explicitly call threadgroup_change_{begin|end}() for - * synchronization. While held, no new task will be added to threadgroup - * and no existing live task will have its PF_EXITING set. - * - * de_thread() does threadgroup_change_{begin|end}() when a non-leader - * sub-thread becomes a new leader. + * All operations which modify a threadgroup - a new thread joining the + * group, death of a member thread (the assertion of PF_EXITING) and + * exec(2) dethreading the process and replacing the leader - are wrapped + * by threadgroup_change_{begin|end}(). This is to provide a place which + * subsystems needing threadgroup stability can hook into for + * synchronization. */ -static inline void threadgroup_lock(struct task_struct *tsk) +static inline void threadgroup_change_begin(struct task_struct *tsk) { - down_write(&tsk->signal->group_rwsem); + might_sleep(); + cgroup_threadgroup_change_begin(tsk); } /** - * threadgroup_unlock - unlock threadgroup - * @tsk: member task of the threadgroup to unlock + * threadgroup_change_end - mark the end of changes to a threadgroup + * @tsk: task causing the changes * - * Reverse threadgroup_lock(). + * See threadgroup_change_begin(). */ -static inline void threadgroup_unlock(struct task_struct *tsk) +static inline void threadgroup_change_end(struct task_struct *tsk) { - up_write(&tsk->signal->group_rwsem); + cgroup_threadgroup_change_end(tsk); } -#else -static inline void threadgroup_change_begin(struct task_struct *tsk) {} -static inline void threadgroup_change_end(struct task_struct *tsk) {} -static inline void threadgroup_lock(struct task_struct *tsk) {} -static inline void threadgroup_unlock(struct task_struct *tsk) {} -#endif #ifndef __HAVE_THREAD_FUNCTIONS -- cgit v1.2.3 From d59cfc09c32a2ae31f1c3bc2983a0cd79afb3f14 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 May 2015 16:35:17 -0400 Subject: sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem The cgroup side of threadgroup locking uses signal_struct->group_rwsem to synchronize against threadgroup changes. This per-process rwsem adds small overhead to thread creation, exit and exec paths, forces cgroup code paths to do lock-verify-unlock-retry dance in a couple places and makes it impossible to atomically perform operations across multiple processes. This patch replaces signal_struct->group_rwsem with a global percpu_rwsem cgroup_threadgroup_rwsem which is cheaper on the reader side and contained in cgroups proper. This patch converts one-to-one. This does make writer side heavier and lower the granularity; however, cgroup process migration is a fairly cold path, we do want to optimize thread operations over it and cgroup migration operations don't take enough time for the lower granularity to matter. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Peter Zijlstra --- include/linux/cgroup-defs.h | 27 +++++++++++++++++++++++++-- include/linux/init_task.h | 8 -------- include/linux/sched.h | 12 ------------ 3 files changed, 25 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1b8c93806dbd..7d83d7f73420 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -461,8 +461,31 @@ struct cgroup_subsys { unsigned int depends_on; }; -void cgroup_threadgroup_change_begin(struct task_struct *tsk); -void cgroup_threadgroup_change_end(struct task_struct *tsk); +extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; + +/** + * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups + * @tsk: target task + * + * Called from threadgroup_change_begin() and allows cgroup operations to + * synchronize against threadgroup changes using a percpu_rw_semaphore. + */ +static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) +{ + percpu_down_read(&cgroup_threadgroup_rwsem); +} + +/** + * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups + * @tsk: target task + * + * Called from threadgroup_change_end(). Counterpart of + * cgroup_threadcgroup_change_begin(). + */ +static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) +{ + percpu_up_read(&cgroup_threadgroup_rwsem); +} #else /* CONFIG_CGROUPS */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 696d22312b31..0cc0bbf20022 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -25,13 +25,6 @@ extern struct files_struct init_files; extern struct fs_struct init_fs; -#ifdef CONFIG_CGROUPS -#define INIT_GROUP_RWSEM(sig) \ - .group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem), -#else -#define INIT_GROUP_RWSEM(sig) -#endif - #ifdef CONFIG_CPUSETS #define INIT_CPUSET_SEQ(tsk) \ .mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq), @@ -56,7 +49,6 @@ extern struct fs_struct init_fs; }, \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ - INIT_GROUP_RWSEM(sig) \ } extern struct nsproxy init_nsproxy; diff --git a/include/linux/sched.h b/include/linux/sched.h index 5ee290003470..add524a910bd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -743,18 +743,6 @@ struct signal_struct { unsigned audit_tty_log_passwd; struct tty_audit_buf *tty_audit_buf; #endif -#ifdef CONFIG_CGROUPS - /* - * group_rwsem prevents new tasks from entering the threadgroup and - * member tasks from exiting,a more specifically, setting of - * PF_EXITING. fork and exit paths are protected with this rwsem - * using threadgroup_change_begin/end(). Users which require - * threadgroup to remain stable should use threadgroup_[un]lock() - * which also takes care of exec path. Currently, cgroup is the - * only user. - */ - struct rw_semaphore group_rwsem; -#endif oom_flags_t oom_flags; short oom_score_adj; /* OOM kill score adjustment */ -- cgit v1.2.3 From 66eb579e66ecfea55e2007be0594869ea9e453d4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 13 May 2015 17:12:23 +0100 Subject: perf: allow for PMU-specific event filtering In certain circumstances it may not be possible to schedule particular events due to constraints other than a lack of hardware counters (e.g. on big.LITTLE systems where CPUs support different events). The core perf event code does not distinguish these cases and pessimistically assumes that any failure to schedule an event means that it is not worth attempting to schedule later events, even if some hardware counters are still unused. When an event a pmu cannot schedule exists in a flexible group list it can unnecessarily prevent event groups following it in the list from being scheduled (until it is rotated to the end of the list). This means some events are scheduled for only a portion of the time they could be, and for short running programs no events may be scheduled if the list is initially sorted in an unfortunate order. This patch adds a new (optional) filter_match function pointer to struct pmu which a pmu driver can use to tell perf core when an event matches pmu-specific scheduling requirements. This plugs into the existing event_filter_match logic, and makes it possible to avoid the scheduling problem described above. When no filter is provided by the PMU, the existing behaviour is retained. Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Acked-by: Will Deacon Acked-by: Peter Zijlstra Signed-off-by: Mark Rutland Signed-off-by: Will Deacon --- include/linux/perf_event.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 61992cf2e977..67c719cc91aa 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -304,6 +304,11 @@ struct pmu { * Free pmu-private AUX data structures */ void (*free_aux) (void *aux); /* optional */ + + /* + * Filter events for PMU-specific reasons. + */ + int (*filter_match) (struct perf_event *event); /* optional */ }; /** -- cgit v1.2.3 From 24fe86a617c550fb9bdc6c8bd7cf647d3955f8ba Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 29 Mar 2015 12:50:46 +0200 Subject: phy: sun4i-usb: Add a sunxi specific function for setting squelch-detect The sunxi otg phy has a bug where it wrongly detects a high speed squelch when reset on the root port gets de-asserted with a lo-speed device. The workaround for this is to disable squelch detect before de-asserting reset, and re-enabling it after the reset de-assert is done. Add a sunxi specific phy function to allow the sunxi-musb glue to do this. Acked-by: Kishon Vijay Abraham I Signed-off-by: Hans de Goede Signed-off-by: Felipe Balbi --- include/linux/phy/phy-sun4i-usb.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 include/linux/phy/phy-sun4i-usb.h (limited to 'include/linux') diff --git a/include/linux/phy/phy-sun4i-usb.h b/include/linux/phy/phy-sun4i-usb.h new file mode 100644 index 000000000000..50aed92ea89c --- /dev/null +++ b/include/linux/phy/phy-sun4i-usb.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2015 Hans de Goede + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef PHY_SUN4I_USB_H_ +#define PHY_SUN4I_USB_H_ + +#include "phy.h" + +/** + * sun4i_usb_phy_set_squelch_detect() - Enable/disable squelch detect + * @phy: reference to a sun4i usb phy + * @enabled: wether to enable or disable squelch detect + */ +void sun4i_usb_phy_set_squelch_detect(struct phy *phy, bool enabled); + +#endif -- cgit v1.2.3 From ad5fb870c486d932a1749d7853dd70f436a7e03f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 3 Apr 2015 12:05:28 -0400 Subject: e820, efi: add ACPI 6.0 persistent memory types ACPI 6.0 formalizes e820-type-7 and efi-type-14 as persistent memory. Mark it "reserved" and allow it to be claimed by a persistent memory device driver. This definition is in addition to the Linux kernel's existing type-12 definition that was recently added in support of shipping platforms with NVDIMM support that predate ACPI 6.0 (which now classifies type-12 as OEM reserved). Note, /proc/iomem can be consulted for differentiating legacy "Persistent Memory (legacy)" E820_PRAM vs standard "Persistent Memory" E820_PMEM. Cc: Boaz Harrosh Cc: Ingo Molnar Cc: Christoph Hellwig Cc: Andrew Morton Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jens Axboe Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Thomas Gleixner Acked-by: Jeff Moyer Acked-by: Andy Lutomirski Reviewed-by: Ross Zwisler Acked-by: Christoph Hellwig Tested-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/efi.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index af5be0368dec..825b6e3d69cb 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -85,7 +85,8 @@ typedef struct { #define EFI_MEMORY_MAPPED_IO 11 #define EFI_MEMORY_MAPPED_IO_PORT_SPACE 12 #define EFI_PAL_CODE 13 -#define EFI_MAX_MEMORY_TYPE 14 +#define EFI_PERSISTENT_MEMORY 14 +#define EFI_MAX_MEMORY_TYPE 15 /* Attribute values: */ #define EFI_MEMORY_UC ((u64)0x0000000000000001ULL) /* uncached */ -- cgit v1.2.3 From 0be964be0d45084245673c971d72a4b51690231d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 May 2015 11:09:35 +0930 Subject: module: Sanitize RCU usage and locking Currently the RCU usage in module is an inconsistent mess of RCU and RCU-sched, this is broken for CONFIG_PREEMPT where synchronize_rcu() does not imply synchronize_sched(). Most usage sites use preempt_{dis,en}able() which is RCU-sched, but (most of) the modification sites use synchronize_rcu(). With the exception of the module bug list, which actually uses RCU. Convert everything over to RCU-sched. Furthermore add lockdep asserts to all sites, because it's not at all clear to me the required locking is observed, esp. on exported functions. Cc: Rusty Russell Acked-by: "Paul E. McKenney" Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Rusty Russell --- include/linux/module.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index c883b86ea964..fb56dd85a862 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -421,14 +421,22 @@ struct symsearch { bool unused; }; -/* Search for an exported symbol by name. */ +/* + * Search for an exported symbol by name. + * + * Must be called with module_mutex held or preemption disabled. + */ const struct kernel_symbol *find_symbol(const char *name, struct module **owner, const unsigned long **crc, bool gplok, bool warn); -/* Walk the exported symbol table */ +/* + * Walk the exported symbol table + * + * Must be called with module_mutex held or preemption disabled. + */ bool each_symbol_section(bool (*fn)(const struct symsearch *arr, struct module *owner, void *data), void *data); -- cgit v1.2.3 From d72da4a4d973d8a0a0d3c97e7cdebf287fbe3a99 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 May 2015 11:09:36 +0930 Subject: rbtree: Make lockless searches non-fatal Change the insert and erase code such that lockless searches are non-fatal. In and of itself an rbtree cannot be correctly searched while in-modification, we can however provide weaker guarantees that will allow the rbtree to be used in conjunction with other techniques, such as latches; see 9b0fd802e8c0 ("seqcount: Add raw_write_seqcount_latch()"). For this to work we need the following guarantees from the rbtree code: 1) a lockless reader must not see partial stores, this would allow it to observe nodes that are invalid memory. 2) there must not be (temporary) loops in the tree structure in the modifier's program order, this would cause a lookup which interrupts the modifier to get stuck indefinitely. For 1) we must use WRITE_ONCE() for all updates to the tree structure; in particular this patch only does rb_{left,right} as those are the only element required for simple searches. It generates slightly worse code, probably because volatile. But in pointer chasing heavy code a few instructions more should not matter. For 2) I have carefully audited the code and drawn every intermediate link state and not found a loop. Cc: Mathieu Desnoyers Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Rik van Riel Reviewed-by: Michel Lespinasse Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Rusty Russell --- include/linux/rbtree.h | 16 +++++++++++++--- include/linux/rbtree_augmented.h | 21 ++++++++++++++------- 2 files changed, 27 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index fb31765e935a..830c4992088d 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -31,6 +31,7 @@ #include #include +#include struct rb_node { unsigned long __rb_parent_color; @@ -73,11 +74,11 @@ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ -extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, +extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); -static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, - struct rb_node ** rb_link) +static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, + struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; @@ -85,6 +86,15 @@ static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, *rb_link = node; } +static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent, + struct rb_node **rb_link) +{ + node->__rb_parent_color = (unsigned long)parent; + node->rb_left = node->rb_right = NULL; + + rcu_assign_pointer(*rb_link, node); +} + #define rb_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? rb_entry(____ptr, type, member) : NULL; \ diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index 378c5ee75f78..14d7b831b63a 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -123,11 +123,11 @@ __rb_change_child(struct rb_node *old, struct rb_node *new, { if (parent) { if (parent->rb_left == old) - parent->rb_left = new; + WRITE_ONCE(parent->rb_left, new); else - parent->rb_right = new; + WRITE_ONCE(parent->rb_right, new); } else - root->rb_node = new; + WRITE_ONCE(root->rb_node, new); } extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, @@ -137,7 +137,8 @@ static __always_inline struct rb_node * __rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { - struct rb_node *child = node->rb_right, *tmp = node->rb_left; + struct rb_node *child = node->rb_right; + struct rb_node *tmp = node->rb_left; struct rb_node *parent, *rebalance; unsigned long pc; @@ -167,6 +168,7 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root, tmp = parent; } else { struct rb_node *successor = child, *child2; + tmp = child->rb_left; if (!tmp) { /* @@ -180,6 +182,7 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root, */ parent = successor; child2 = successor->rb_right; + augment->copy(node, successor); } else { /* @@ -201,19 +204,23 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root, successor = tmp; tmp = tmp->rb_left; } while (tmp); - parent->rb_left = child2 = successor->rb_right; - successor->rb_right = child; + child2 = successor->rb_right; + WRITE_ONCE(parent->rb_left, child2); + WRITE_ONCE(successor->rb_right, child); rb_set_parent(child, successor); + augment->copy(node, successor); augment->propagate(parent, successor); } - successor->rb_left = tmp = node->rb_left; + tmp = node->rb_left; + WRITE_ONCE(successor->rb_left, tmp); rb_set_parent(tmp, successor); pc = node->__rb_parent_color; tmp = __rb_parent(pc); __rb_change_child(node, successor, tmp, root); + if (child2) { successor->__rb_parent_color = pc; rb_set_parent_color(child2, parent, RB_BLACK); -- cgit v1.2.3 From 6695b92a60bc7160c92d6dc5b17cc79673017c2f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 May 2015 11:09:36 +0930 Subject: seqlock: Better document raw_write_seqcount_latch() Improve the documentation of the latch technique as used in the current timekeeping code, such that it can be readily employed elsewhere. Borrow from the comments in timekeeping and replace those with a reference to this more generic comment. Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Rik van Riel Cc: "Paul E. McKenney" Cc: Oleg Nesterov Reviewed-by: Mathieu Desnoyers Acked-by: Michel Lespinasse Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Rusty Russell --- include/linux/seqlock.h | 76 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5f68d0a391ce..1c0cf3102fdc 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -233,9 +233,83 @@ static inline void raw_write_seqcount_end(seqcount_t *s) s->sequence++; } -/* +/** * raw_write_seqcount_latch - redirect readers to even/odd copy * @s: pointer to seqcount_t + * + * The latch technique is a multiversion concurrency control method that allows + * queries during non-atomic modifications. If you can guarantee queries never + * interrupt the modification -- e.g. the concurrency is strictly between CPUs + * -- you most likely do not need this. + * + * Where the traditional RCU/lockless data structures rely on atomic + * modifications to ensure queries observe either the old or the new state the + * latch allows the same for non-atomic updates. The trade-off is doubling the + * cost of storage; we have to maintain two copies of the entire data + * structure. + * + * Very simply put: we first modify one copy and then the other. This ensures + * there is always one copy in a stable state, ready to give us an answer. + * + * The basic form is a data structure like: + * + * struct latch_struct { + * seqcount_t seq; + * struct data_struct data[2]; + * }; + * + * Where a modification, which is assumed to be externally serialized, does the + * following: + * + * void latch_modify(struct latch_struct *latch, ...) + * { + * smp_wmb(); <- Ensure that the last data[1] update is visible + * latch->seq++; + * smp_wmb(); <- Ensure that the seqcount update is visible + * + * modify(latch->data[0], ...); + * + * smp_wmb(); <- Ensure that the data[0] update is visible + * latch->seq++; + * smp_wmb(); <- Ensure that the seqcount update is visible + * + * modify(latch->data[1], ...); + * } + * + * The query will have a form like: + * + * struct entry *latch_query(struct latch_struct *latch, ...) + * { + * struct entry *entry; + * unsigned seq, idx; + * + * do { + * seq = latch->seq; + * smp_rmb(); + * + * idx = seq & 0x01; + * entry = data_query(latch->data[idx], ...); + * + * smp_rmb(); + * } while (seq != latch->seq); + * + * return entry; + * } + * + * So during the modification, queries are first redirected to data[1]. Then we + * modify data[0]. When that is complete, we redirect queries back to data[0] + * and we can modify data[1]. + * + * NOTE: The non-requirement for atomic modifications does _NOT_ include + * the publishing of new entries in the case where data is a dynamic + * data structure. + * + * An iteration might start in data[0] and get suspended long enough + * to miss an entire modification sequence, once it resumes it might + * observe the new entry. + * + * NOTE: When data is a dynamic data structure; one should use regular RCU + * patterns to manage the lifetimes of the objects within. */ static inline void raw_write_seqcount_latch(seqcount_t *s) { -- cgit v1.2.3 From 0a04b0166929405cd833c1cc40f99e862b965ddc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 May 2015 11:09:36 +0930 Subject: rcu: Move lockless_dereference() out of rcupdate.h I want to use lockless_dereference() from seqlock.h, which would mean including rcupdate.h from it, however rcupdate.h already includes seqlock.h. Avoid this by moving lockless_dereference() into compiler.h. This is somewhat tricky since it uses smp_read_barrier_depends() which isn't available there, but its a CPP macro so we can get away with it. The alternative would be moving it into asm/barrier.h, but that would be updating each arch (I can do if people feel that is more appropriate). Cc: Paul McKenney Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Rusty Russell --- include/linux/compiler.h | 15 +++++++++++++++ include/linux/rcupdate.h | 15 --------------- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 867722591be2..eae42c21d5fd 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -457,6 +457,21 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s (volatile typeof(x) *)&(x); }) #define ACCESS_ONCE(x) (*__ACCESS_ONCE(x)) +/** + * lockless_dereference() - safely load a pointer for later dereference + * @p: The pointer to load + * + * Similar to rcu_dereference(), but for situations where the pointed-to + * object's lifetime is managed by something other than RCU. That + * "something other" might be reference counting or simple immortality. + */ +#define lockless_dereference(p) \ +({ \ + typeof(p) _________p1 = ACCESS_ONCE(p); \ + smp_read_barrier_depends(); /* Dependency order vs. p above. */ \ + (_________p1); \ +}) + /* Ignore/forbid kprobes attach on very low level functions marked by this attribute: */ #ifdef CONFIG_KPROBES # define __kprobes __attribute__((__section__(".kprobes.text"))) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 573a5afd5ed8..0356ad954ea5 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -649,21 +649,6 @@ static inline void rcu_preempt_sleep_check(void) */ #define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v) -/** - * lockless_dereference() - safely load a pointer for later dereference - * @p: The pointer to load - * - * Similar to rcu_dereference(), but for situations where the pointed-to - * object's lifetime is managed by something other than RCU. That - * "something other" might be reference counting or simple immortality. - */ -#define lockless_dereference(p) \ -({ \ - typeof(p) _________p1 = ACCESS_ONCE(p); \ - smp_read_barrier_depends(); /* Dependency order vs. p above. */ \ - (_________p1); \ -}) - /** * rcu_assign_pointer() - assign to RCU-protected pointer * @p: pointer to assign to -- cgit v1.2.3 From 7fc26327b75685f37f58d64bdb061460f834f80d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 May 2015 11:09:36 +0930 Subject: seqlock: Introduce raw_read_seqcount_latch() Because with latches there is a strict data dependency on the seq load we can avoid the rmb in favour of a read_barrier_depends. Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Rusty Russell --- include/linux/seqlock.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 1c0cf3102fdc..890c7ef709d5 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -35,6 +35,7 @@ #include #include #include +#include #include /* @@ -233,6 +234,11 @@ static inline void raw_write_seqcount_end(seqcount_t *s) s->sequence++; } +static inline int raw_read_seqcount_latch(seqcount_t *s) +{ + return lockless_dereference(s->sequence); +} + /** * raw_write_seqcount_latch - redirect readers to even/odd copy * @s: pointer to seqcount_t @@ -284,8 +290,7 @@ static inline void raw_write_seqcount_end(seqcount_t *s) * unsigned seq, idx; * * do { - * seq = latch->seq; - * smp_rmb(); + * seq = lockless_dereference(latch->seq); * * idx = seq & 0x01; * entry = data_query(latch->data[idx], ...); -- cgit v1.2.3 From ade3f510f93a5613b672febe88eff8ea7f1c63b7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 May 2015 11:09:37 +0930 Subject: rbtree: Implement generic latch_tree Implement a latched RB-tree in order to get unconditional RCU/lockless lookups. Cc: Oleg Nesterov Cc: Michel Lespinasse Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Rik van Riel Cc: Mathieu Desnoyers Cc: "Paul E. McKenney" Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Rusty Russell --- include/linux/rbtree_latch.h | 212 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 include/linux/rbtree_latch.h (limited to 'include/linux') diff --git a/include/linux/rbtree_latch.h b/include/linux/rbtree_latch.h new file mode 100644 index 000000000000..4f3432c61d12 --- /dev/null +++ b/include/linux/rbtree_latch.h @@ -0,0 +1,212 @@ +/* + * Latched RB-trees + * + * Copyright (C) 2015 Intel Corp., Peter Zijlstra + * + * Since RB-trees have non-atomic modifications they're not immediately suited + * for RCU/lockless queries. Even though we made RB-tree lookups non-fatal for + * lockless lookups; we cannot guarantee they return a correct result. + * + * The simplest solution is a seqlock + RB-tree, this will allow lockless + * lookups; but has the constraint (inherent to the seqlock) that read sides + * cannot nest in write sides. + * + * If we need to allow unconditional lookups (say as required for NMI context + * usage) we need a more complex setup; this data structure provides this by + * employing the latch technique -- see @raw_write_seqcount_latch -- to + * implement a latched RB-tree which does allow for unconditional lookups by + * virtue of always having (at least) one stable copy of the tree. + * + * However, while we have the guarantee that there is at all times one stable + * copy, this does not guarantee an iteration will not observe modifications. + * What might have been a stable copy at the start of the iteration, need not + * remain so for the duration of the iteration. + * + * Therefore, this does require a lockless RB-tree iteration to be non-fatal; + * see the comment in lib/rbtree.c. Note however that we only require the first + * condition -- not seeing partial stores -- because the latch thing isolates + * us from loops. If we were to interrupt a modification the lookup would be + * pointed at the stable tree and complete while the modification was halted. + */ + +#ifndef RB_TREE_LATCH_H +#define RB_TREE_LATCH_H + +#include +#include + +struct latch_tree_node { + struct rb_node node[2]; +}; + +struct latch_tree_root { + seqcount_t seq; + struct rb_root tree[2]; +}; + +/** + * latch_tree_ops - operators to define the tree order + * @less: used for insertion; provides the (partial) order between two elements. + * @comp: used for lookups; provides the order between the search key and an element. + * + * The operators are related like: + * + * comp(a->key,b) < 0 := less(a,b) + * comp(a->key,b) > 0 := less(b,a) + * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) + * + * If these operators define a partial order on the elements we make no + * guarantee on which of the elements matching the key is found. See + * latch_tree_find(). + */ +struct latch_tree_ops { + bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b); + int (*comp)(void *key, struct latch_tree_node *b); +}; + +static __always_inline struct latch_tree_node * +__lt_from_rb(struct rb_node *node, int idx) +{ + return container_of(node, struct latch_tree_node, node[idx]); +} + +static __always_inline void +__lt_insert(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx, + bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b)) +{ + struct rb_root *root = <r->tree[idx]; + struct rb_node **link = &root->rb_node; + struct rb_node *node = <n->node[idx]; + struct rb_node *parent = NULL; + struct latch_tree_node *ltp; + + while (*link) { + parent = *link; + ltp = __lt_from_rb(parent, idx); + + if (less(ltn, ltp)) + link = &parent->rb_left; + else + link = &parent->rb_right; + } + + rb_link_node_rcu(node, parent, link); + rb_insert_color(node, root); +} + +static __always_inline void +__lt_erase(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx) +{ + rb_erase(<n->node[idx], <r->tree[idx]); +} + +static __always_inline struct latch_tree_node * +__lt_find(void *key, struct latch_tree_root *ltr, int idx, + int (*comp)(void *key, struct latch_tree_node *node)) +{ + struct rb_node *node = rcu_dereference_raw(ltr->tree[idx].rb_node); + struct latch_tree_node *ltn; + int c; + + while (node) { + ltn = __lt_from_rb(node, idx); + c = comp(key, ltn); + + if (c < 0) + node = rcu_dereference_raw(node->rb_left); + else if (c > 0) + node = rcu_dereference_raw(node->rb_right); + else + return ltn; + } + + return NULL; +} + +/** + * latch_tree_insert() - insert @node into the trees @root + * @node: nodes to insert + * @root: trees to insert @node into + * @ops: operators defining the node order + * + * It inserts @node into @root in an ordered fashion such that we can always + * observe one complete tree. See the comment for raw_write_seqcount_latch(). + * + * The inserts use rcu_assign_pointer() to publish the element such that the + * tree structure is stored before we can observe the new @node. + * + * All modifications (latch_tree_insert, latch_tree_remove) are assumed to be + * serialized. + */ +static __always_inline void +latch_tree_insert(struct latch_tree_node *node, + struct latch_tree_root *root, + const struct latch_tree_ops *ops) +{ + raw_write_seqcount_latch(&root->seq); + __lt_insert(node, root, 0, ops->less); + raw_write_seqcount_latch(&root->seq); + __lt_insert(node, root, 1, ops->less); +} + +/** + * latch_tree_erase() - removes @node from the trees @root + * @node: nodes to remote + * @root: trees to remove @node from + * @ops: operators defining the node order + * + * Removes @node from the trees @root in an ordered fashion such that we can + * always observe one complete tree. See the comment for + * raw_write_seqcount_latch(). + * + * It is assumed that @node will observe one RCU quiescent state before being + * reused of freed. + * + * All modifications (latch_tree_insert, latch_tree_remove) are assumed to be + * serialized. + */ +static __always_inline void +latch_tree_erase(struct latch_tree_node *node, + struct latch_tree_root *root, + const struct latch_tree_ops *ops) +{ + raw_write_seqcount_latch(&root->seq); + __lt_erase(node, root, 0); + raw_write_seqcount_latch(&root->seq); + __lt_erase(node, root, 1); +} + +/** + * latch_tree_find() - find the node matching @key in the trees @root + * @key: search key + * @root: trees to search for @key + * @ops: operators defining the node order + * + * Does a lockless lookup in the trees @root for the node matching @key. + * + * It is assumed that this is called while holding the appropriate RCU read + * side lock. + * + * If the operators define a partial order on the elements (there are multiple + * elements which have the same key value) it is undefined which of these + * elements will be found. Nor is it possible to iterate the tree to find + * further elements with the same key value. + * + * Returns: a pointer to the node matching @key or NULL. + */ +static __always_inline struct latch_tree_node * +latch_tree_find(void *key, struct latch_tree_root *root, + const struct latch_tree_ops *ops) +{ + struct latch_tree_node *node; + unsigned int seq; + + do { + seq = raw_read_seqcount_latch(&root->seq); + node = __lt_find(key, root, seq & 1, ops->comp); + } while (read_seqcount_retry(&root->seq, seq)); + + return node; +} + +#endif /* RB_TREE_LATCH_H */ -- cgit v1.2.3 From 93c2e105f6bcee231c951ba0e56e84505c4b0483 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 May 2015 11:09:37 +0930 Subject: module: Optimize __module_address() using a latched RB-tree Currently __module_address() is using a linear search through all modules in order to find the module corresponding to the provided address. With a lot of modules this can take a lot of time. One of the users of this is kernel_text_address() which is employed in many stack unwinders; which in turn are used by perf-callchain and ftrace (possibly from NMI context). So by optimizing __module_address() we optimize many stack unwinders which are used by both perf and tracing in performance sensitive code. Cc: Rusty Russell Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Oleg Nesterov Cc: "Paul E. McKenney" Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Rusty Russell --- include/linux/module.h | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index fb56dd85a862..ddf35a3368fb 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -210,6 +211,13 @@ enum module_state { MODULE_STATE_UNFORMED, /* Still setting it up. */ }; +struct module; + +struct mod_tree_node { + struct module *mod; + struct latch_tree_node node; +}; + struct module { enum module_state state; @@ -269,8 +277,15 @@ struct module { /* Startup function. */ int (*init)(void); - /* If this is non-NULL, vfree after init() returns */ - void *module_init; + /* + * If this is non-NULL, vfree() after init() returns. + * + * Cacheline align here, such that: + * module_init, module_core, init_size, core_size, + * init_text_size, core_text_size and ltn_core.node[0] + * are on the same cacheline. + */ + void *module_init ____cacheline_aligned; /* Here is the actual code + data, vfree'd on unload. */ void *module_core; @@ -281,6 +296,14 @@ struct module { /* The size of the executable code in each section. */ unsigned int init_text_size, core_text_size; + /* + * We want mtn_core::{mod,node[0]} to be in the same cacheline as the + * above entries such that a regular lookup will only touch one + * cacheline. + */ + struct mod_tree_node mtn_core; + struct mod_tree_node mtn_init; + /* Size of RO sections of the module (text+rodata) */ unsigned int init_ro_size, core_ro_size; @@ -367,7 +390,7 @@ struct module { ctor_fn_t *ctors; unsigned int num_ctors; #endif -}; +} ____cacheline_aligned; #ifndef MODULE_ARCH_INIT #define MODULE_ARCH_INIT {} #endif -- cgit v1.2.3 From 6c9692e2d6a2206d8fd75ea247daa47fb75e4a02 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 May 2015 11:09:37 +0930 Subject: module: Make the mod_tree stuff conditional on PERF_EVENTS || TRACING Andrew worried about the overhead on small systems; only use the fancy code when either perf or tracing is enabled. Cc: Rusty Russell Cc: Steven Rostedt Requested-by: Andrew Morton Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Rusty Russell --- include/linux/module.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index ddf35a3368fb..4c1b02e1361d 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -282,7 +282,7 @@ struct module { * * Cacheline align here, such that: * module_init, module_core, init_size, core_size, - * init_text_size, core_text_size and ltn_core.node[0] + * init_text_size, core_text_size and mtn_core::{mod,node[0]} * are on the same cacheline. */ void *module_init ____cacheline_aligned; @@ -296,6 +296,7 @@ struct module { /* The size of the executable code in each section. */ unsigned int init_text_size, core_text_size; +#ifdef CONFIG_MODULES_TREE_LOOKUP /* * We want mtn_core::{mod,node[0]} to be in the same cacheline as the * above entries such that a regular lookup will only touch one @@ -303,6 +304,7 @@ struct module { */ struct mod_tree_node mtn_core; struct mod_tree_node mtn_init; +#endif /* Size of RO sections of the module (text+rodata) */ unsigned int init_ro_size, core_ro_size; -- cgit v1.2.3 From 28b8d0c8f560300836dff352348e513cdf328e50 Mon Sep 17 00:00:00 2001 From: Gobinda Charan Maji Date: Wed, 27 May 2015 11:09:38 +0930 Subject: sysfs: tightened sysfs permission checks There were some inconsistency in restriction to VERIFY_OCTAL_PERMISSIONS(). Previously the test was "User perms >= group perms >= other perms". The permission field of User, Group or Other consists of three bits. LSB is EXECUTE permission, MSB is READ permission and the middle bit is WRITE permission. But logically WRITE is "more privileged" than READ. Say for example, permission value is "0430". Here User has only READ permission whereas Group has both WRITE and EXECUTE permission. So, the checks could be tightened and the tests are separated to USER_READABLE >= GROUP_READABLE >= OTHER_READABLE, USER_WRITABLE >= GROUP_WRITABLE and OTHER_WRITABLE is not permitted. Signed-off-by: Gobinda Charan Maji Signed-off-by: Rusty Russell --- include/linux/kernel.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3a5b48e52a9e..cd54b357c934 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -818,13 +818,15 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } #endif /* Permissions on a sysfs file: you didn't miss the 0 prefix did you? */ -#define VERIFY_OCTAL_PERMISSIONS(perms) \ - (BUILD_BUG_ON_ZERO((perms) < 0) + \ - BUILD_BUG_ON_ZERO((perms) > 0777) + \ - /* User perms >= group perms >= other perms */ \ - BUILD_BUG_ON_ZERO(((perms) >> 6) < (((perms) >> 3) & 7)) + \ - BUILD_BUG_ON_ZERO((((perms) >> 3) & 7) < ((perms) & 7)) + \ - /* Other writable? Generally considered a bad idea. */ \ - BUILD_BUG_ON_ZERO((perms) & 2) + \ +#define VERIFY_OCTAL_PERMISSIONS(perms) \ + (BUILD_BUG_ON_ZERO((perms) < 0) + \ + BUILD_BUG_ON_ZERO((perms) > 0777) + \ + /* USER_READABLE >= GROUP_READABLE >= OTHER_READABLE */ \ + BUILD_BUG_ON_ZERO((((perms) >> 6) & 4) < (((perms) >> 3) & 4)) + \ + BUILD_BUG_ON_ZERO((((perms) >> 3) & 4) < ((perms) & 4)) + \ + /* USER_WRITABLE >= GROUP_WRITABLE */ \ + BUILD_BUG_ON_ZERO((((perms) >> 6) & 2) < (((perms) >> 3) & 2)) + \ + /* OTHER_WRITABLE? Generally considered a bad idea. */ \ + BUILD_BUG_ON_ZERO((perms) & 2) + \ (perms)) #endif -- cgit v1.2.3 From 9c27847dda9cfae7c273cde62becf364f9fa9ea3 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 27 May 2015 11:09:38 +0930 Subject: kernel/params: constify struct kernel_param_ops uses Most code already uses consts for the struct kernel_param_ops, sweep the kernel for the last offending stragglers. Other than include/linux/moduleparam.h and kernel/params.c all other changes were generated with the following Coccinelle SmPL patch. Merge conflicts between trees can be handled with Coccinelle. In the future git could get Coccinelle merge support to deal with patch --> fail --> grammar --> Coccinelle --> new patch conflicts automatically for us on patches where the grammar is available and the patch is of high confidence. Consider this a feature request. Test compiled on x86_64 against: * allnoconfig * allmodconfig * allyesconfig @ const_found @ identifier ops; @@ const struct kernel_param_ops ops = { }; @ const_not_found depends on !const_found @ identifier ops; @@ -struct kernel_param_ops ops = { +const struct kernel_param_ops ops = { }; Generated-by: Coccinelle SmPL Cc: Rusty Russell Cc: Junio C Hamano Cc: Andrew Morton Cc: Kees Cook Cc: Tejun Heo Cc: Ingo Molnar Cc: cocci@systeme.lip6.fr Cc: linux-kernel@vger.kernel.org Signed-off-by: Luis R. Rodriguez Signed-off-by: Rusty Russell --- include/linux/moduleparam.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 1c9effa25e26..5d0f4d97997f 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -225,7 +225,7 @@ struct kparam_array /* Obsolete - use module_param_cb() */ #define module_param_call(name, set, get, arg, perm) \ - static struct kernel_param_ops __param_ops_##name = \ + static const struct kernel_param_ops __param_ops_##name = \ { .flags = 0, (void *)set, (void *)get }; \ __module_param_call(MODULE_PARAM_PREFIX, \ name, &__param_ops_##name, arg, \ @@ -376,64 +376,64 @@ static inline void destroy_params(const struct kernel_param *params, #define __param_check(name, p, type) \ static inline type __always_unused *__check_##name(void) { return(p); } -extern struct kernel_param_ops param_ops_byte; +extern const struct kernel_param_ops param_ops_byte; extern int param_set_byte(const char *val, const struct kernel_param *kp); extern int param_get_byte(char *buffer, const struct kernel_param *kp); #define param_check_byte(name, p) __param_check(name, p, unsigned char) -extern struct kernel_param_ops param_ops_short; +extern const struct kernel_param_ops param_ops_short; extern int param_set_short(const char *val, const struct kernel_param *kp); extern int param_get_short(char *buffer, const struct kernel_param *kp); #define param_check_short(name, p) __param_check(name, p, short) -extern struct kernel_param_ops param_ops_ushort; +extern const struct kernel_param_ops param_ops_ushort; extern int param_set_ushort(const char *val, const struct kernel_param *kp); extern int param_get_ushort(char *buffer, const struct kernel_param *kp); #define param_check_ushort(name, p) __param_check(name, p, unsigned short) -extern struct kernel_param_ops param_ops_int; +extern const struct kernel_param_ops param_ops_int; extern int param_set_int(const char *val, const struct kernel_param *kp); extern int param_get_int(char *buffer, const struct kernel_param *kp); #define param_check_int(name, p) __param_check(name, p, int) -extern struct kernel_param_ops param_ops_uint; +extern const struct kernel_param_ops param_ops_uint; extern int param_set_uint(const char *val, const struct kernel_param *kp); extern int param_get_uint(char *buffer, const struct kernel_param *kp); #define param_check_uint(name, p) __param_check(name, p, unsigned int) -extern struct kernel_param_ops param_ops_long; +extern const struct kernel_param_ops param_ops_long; extern int param_set_long(const char *val, const struct kernel_param *kp); extern int param_get_long(char *buffer, const struct kernel_param *kp); #define param_check_long(name, p) __param_check(name, p, long) -extern struct kernel_param_ops param_ops_ulong; +extern const struct kernel_param_ops param_ops_ulong; extern int param_set_ulong(const char *val, const struct kernel_param *kp); extern int param_get_ulong(char *buffer, const struct kernel_param *kp); #define param_check_ulong(name, p) __param_check(name, p, unsigned long) -extern struct kernel_param_ops param_ops_ullong; +extern const struct kernel_param_ops param_ops_ullong; extern int param_set_ullong(const char *val, const struct kernel_param *kp); extern int param_get_ullong(char *buffer, const struct kernel_param *kp); #define param_check_ullong(name, p) __param_check(name, p, unsigned long long) -extern struct kernel_param_ops param_ops_charp; +extern const struct kernel_param_ops param_ops_charp; extern int param_set_charp(const char *val, const struct kernel_param *kp); extern int param_get_charp(char *buffer, const struct kernel_param *kp); #define param_check_charp(name, p) __param_check(name, p, char *) /* We used to allow int as well as bool. We're taking that away! */ -extern struct kernel_param_ops param_ops_bool; +extern const struct kernel_param_ops param_ops_bool; extern int param_set_bool(const char *val, const struct kernel_param *kp); extern int param_get_bool(char *buffer, const struct kernel_param *kp); #define param_check_bool(name, p) __param_check(name, p, bool) -extern struct kernel_param_ops param_ops_invbool; +extern const struct kernel_param_ops param_ops_invbool; extern int param_set_invbool(const char *val, const struct kernel_param *kp); extern int param_get_invbool(char *buffer, const struct kernel_param *kp); #define param_check_invbool(name, p) __param_check(name, p, bool) /* An int, which can only be set like a bool (though it shows as an int). */ -extern struct kernel_param_ops param_ops_bint; +extern const struct kernel_param_ops param_ops_bint; extern int param_set_bint(const char *val, const struct kernel_param *kp); #define param_get_bint param_get_int #define param_check_bint param_check_int @@ -477,9 +477,9 @@ extern int param_set_bint(const char *val, const struct kernel_param *kp); perm, -1, 0); \ __MODULE_PARM_TYPE(name, "array of " #type) -extern struct kernel_param_ops param_array_ops; +extern const struct kernel_param_ops param_array_ops; -extern struct kernel_param_ops param_ops_string; +extern const struct kernel_param_ops param_ops_string; extern int param_set_copystring(const char *val, const struct kernel_param *); extern int param_get_string(char *buffer, const struct kernel_param *kp); -- cgit v1.2.3 From d19f05d8a8fa221e5d5f4eaca0f3ca5874c990d3 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 27 May 2015 11:09:38 +0930 Subject: kernel/params.c: generalize bool_enable_only This takes out the bool_enable_only implementation from the module loading code and generalizes it so that others can make use of it. Cc: Rusty Russell Cc: Jani Nikula Cc: Andrew Morton Cc: Kees Cook Cc: Tejun Heo Cc: Ingo Molnar Cc: linux-kernel@vger.kernel.org Cc: cocci@systeme.lip6.fr Signed-off-by: Luis R. Rodriguez Signed-off-by: Rusty Russell --- include/linux/moduleparam.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 5d0f4d97997f..7e0079936396 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -427,6 +427,12 @@ extern int param_set_bool(const char *val, const struct kernel_param *kp); extern int param_get_bool(char *buffer, const struct kernel_param *kp); #define param_check_bool(name, p) __param_check(name, p, bool) +extern const struct kernel_param_ops param_ops_bool_enable_only; +extern int param_set_bool_enable_only(const char *val, + const struct kernel_param *kp); +/* getter is the same as for the regular bool */ +#define param_check_bool_enable_only param_check_bool + extern const struct kernel_param_ops param_ops_invbool; extern int param_set_invbool(const char *val, const struct kernel_param *kp); extern int param_get_invbool(char *buffer, const struct kernel_param *kp); -- cgit v1.2.3 From 85e6f09785bd06f0510bdb00c40772c7f8da3c43 Mon Sep 17 00:00:00 2001 From: Dmitry Eremin-Solenikov Date: Tue, 19 May 2015 16:16:14 +0100 Subject: ARM: 8367/1: sa1100: prepare for moving irq driver to drivers/irqchip Prepare for moving sa1100 irq driver to irqchip infrastructure - split sa1100_init_irq into helper code and irq parts. Signed-off-by: Dmitry Eremin-Solenikov Acked-by: Thomas Gleixner Signed-off-by: Russell King --- include/linux/irqchip/irq-sa11x0.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 include/linux/irqchip/irq-sa11x0.h (limited to 'include/linux') diff --git a/include/linux/irqchip/irq-sa11x0.h b/include/linux/irqchip/irq-sa11x0.h new file mode 100644 index 000000000000..15db6829c1e4 --- /dev/null +++ b/include/linux/irqchip/irq-sa11x0.h @@ -0,0 +1,17 @@ +/* + * Generic IRQ handling for the SA11x0. + * + * Copyright (C) 2015 Dmitry Eremin-Solenikov + * Copyright (C) 1999-2001 Nicolas Pitre + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __INCLUDE_LINUX_IRQCHIP_IRQ_SA11x0_H +#define __INCLUDE_LINUX_IRQCHIP_IRQ_SA11x0_H + +void __init sa11x0_init_irq_nodt(int irq_start, resource_size_t io_start); + +#endif -- cgit v1.2.3 From 391b459d7623b26153c7d8c200e8d213440a7d48 Mon Sep 17 00:00:00 2001 From: Pantelis Antoniou Date: Fri, 24 Apr 2015 12:41:56 +0300 Subject: of: Move OF flags to be visible even when !CONFIG_OF We need those to be visible even when compiling with CONFIG_OF disabled, since even the empty of_node_*_flag() method use the flag. Signed-off-by: Pantelis Antoniou Acked-by: Wolfram Sang Signed-off-by: Rob Herring --- include/linux/of.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index ddeaae6d2083..9b32cbdf0230 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -120,6 +120,12 @@ extern struct device_node *of_aliases; extern struct device_node *of_stdout; extern raw_spinlock_t devtree_lock; +/* flag descriptions (need to be visible even when !CONFIG_OF) */ +#define OF_DYNAMIC 1 /* node and properties were allocated via kmalloc */ +#define OF_DETACHED 2 /* node has been detached from the device tree */ +#define OF_POPULATED 3 /* device already created for the node */ +#define OF_POPULATED_BUS 4 /* of_platform_populate recursed to children of this node */ + #ifdef CONFIG_OF static inline bool is_of_node(struct fwnode_handle *fwnode) { @@ -217,12 +223,6 @@ static inline unsigned long of_read_ulong(const __be32 *cell, int size) #define of_node_cmp(s1, s2) strcasecmp((s1), (s2)) #endif -/* flag descriptions */ -#define OF_DYNAMIC 1 /* node and properties were allocated via kmalloc */ -#define OF_DETACHED 2 /* node has been detached from the device tree */ -#define OF_POPULATED 3 /* device already created for the node */ -#define OF_POPULATED_BUS 4 /* of_platform_populate recursed to children of this node */ - #define OF_IS_DYNAMIC(x) test_bit(OF_DYNAMIC, &x->_flags) #define OF_MARK_DYNAMIC(x) set_bit(OF_DYNAMIC, &x->_flags) -- cgit v1.2.3 From 31712c98f8180c7bd3f33eefa461f0e1142e18f5 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 4 May 2015 19:42:19 +0200 Subject: of: Grammar s/property exist/property exists/ Signed-off-by: Geert Uytterhoeven Signed-off-by: Rob Herring --- include/linux/of.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 9b32cbdf0230..a3eb9d1e5800 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -815,7 +815,7 @@ static inline int of_property_read_string_index(struct device_node *np, * @propname: name of the property to be searched. * * Search for a property in a device node. - * Returns true if the property exist false otherwise. + * Returns true if the property exists false otherwise. */ static inline bool of_property_read_bool(const struct device_node *np, const char *propname) -- cgit v1.2.3 From 307c858bc245da5229b30186546590e1d472fc8f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 28 May 2015 16:08:02 +0200 Subject: usb: phy: add static inline wrapper for devm_usb_get_phy_by_node The newly introduced devm_usb_get_phy_by_node function only has an extern declaration, but no alternative for the case that CONFIG_USB_PHY is disabled, which leads to a build error when it is used anyway: drivers/power/twl4030_charger.c: In function 'twl4030_bci_probe': drivers/power/twl4030_charger.c:648:23: error: implicit declaration of function 'devm_usb_get_phy_by_node' [-Werror=implicit-function-declaration] bci->transceiver = devm_usb_get_phy_by_node( This adds the wrapper in the same way that we have one for all other usb-phy API functions. Signed-off-by: Arnd Bergmann Fixes: e842b84c8e7 ("usb: phy: Add interface to get phy give of device_node.") Signed-off-by: Felipe Balbi --- include/linux/usb/phy.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h index 8ed1e29ef329..e39f251cf861 100644 --- a/include/linux/usb/phy.h +++ b/include/linux/usb/phy.h @@ -240,6 +240,12 @@ static inline struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, return ERR_PTR(-ENXIO); } +static inline struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, + struct device_node *node, struct notifier_block *nb) +{ + return ERR_PTR(-ENXIO); +} + static inline void usb_put_phy(struct usb_phy *x) { } -- cgit v1.2.3 From 9626b6993b2e6faf047d2d96958e8474edc9c7a5 Mon Sep 17 00:00:00 2001 From: jilai wang Date: Fri, 10 Apr 2015 16:15:59 -0400 Subject: firmware: qcom: scm: Add HDCP Support HDCP driver needs to check if secure environment supports HDCP. If it's supported, then it requires to program some registers through SCM. Add qcom_scm_hdcp_available and qcom_scm_hdcp_req to support these requirements. Signed-off-by: Jilai Wang Signed-off-by: Kumar Gala --- include/linux/qcom_scm.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/qcom_scm.h b/include/linux/qcom_scm.h index d7a974d5f57c..6e7d5ec65838 100644 --- a/include/linux/qcom_scm.h +++ b/include/linux/qcom_scm.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2010-2014, The Linux Foundation. All rights reserved. +/* Copyright (c) 2010-2015, The Linux Foundation. All rights reserved. * Copyright (C) 2015 Linaro Ltd. * * This program is free software; you can redistribute it and/or modify @@ -16,6 +16,17 @@ extern int qcom_scm_set_cold_boot_addr(void *entry, const cpumask_t *cpus); extern int qcom_scm_set_warm_boot_addr(void *entry, const cpumask_t *cpus); +#define QCOM_SCM_HDCP_MAX_REQ_CNT 5 + +struct qcom_scm_hdcp_req { + u32 addr; + u32 val; +}; + +extern bool qcom_scm_hdcp_available(void); +extern int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt, + u32 *resp); + #define QCOM_SCM_CPU_PWR_DOWN_L2_ON 0x0 #define QCOM_SCM_CPU_PWR_DOWN_L2_OFF 0x1 -- cgit v1.2.3 From 3386e0fa905c31bf1dafa2cbe2ecceb7e5ce2e66 Mon Sep 17 00:00:00 2001 From: Joachim Eastwood Date: Wed, 6 May 2015 20:09:09 +0200 Subject: of: add helper function to retrive match data It's a common operation for device drivers to retrive the data member from of_device_id struct in their probe function. Most driver end up doing: const struct of_device_id *match; match = of_match_device(driver_of_match, &pdev->dev); driver->data = match->data; With the of_device_get_match_data helper function all this can done in one go. Signed-off-by: Joachim Eastwood [robh: add missing inline to dummmy declaration] Signed-off-by: Rob Herring --- include/linux/of_device.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_device.h b/include/linux/of_device.h index 22801b10cef5..4c508549833a 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -33,6 +33,8 @@ extern int of_device_add(struct platform_device *pdev); extern int of_device_register(struct platform_device *ofdev); extern void of_device_unregister(struct platform_device *ofdev); +extern const void *of_device_get_match_data(const struct device *dev); + extern ssize_t of_device_get_modalias(struct device *dev, char *str, ssize_t len); @@ -65,6 +67,11 @@ static inline int of_driver_match_device(struct device *dev, static inline void of_device_uevent(struct device *dev, struct kobj_uevent_env *env) { } +static inline const void *of_device_get_match_data(const struct device *dev) +{ + return NULL; +} + static inline int of_device_get_modalias(struct device *dev, char *str, ssize_t len) { -- cgit v1.2.3 From 3b1a2c97210b1edd1a999ff8c1f72ab28f7609f7 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 13 May 2015 16:33:56 +0200 Subject: of/fdt: Make fdt blob input parameters of unflatten functions const Operations to unflatten fdt blobs never modify the input blobs, hence make them const. Now we no longer need to cast arbitrary const data to "void *" when calling of_fdt_unflatten_tree(). Signed-off-by: Geert Uytterhoeven Signed-off-by: Rob Herring --- include/linux/of_fdt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 587ee507965d..0cf217d404ce 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -37,7 +37,7 @@ extern bool of_fdt_is_big_endian(const void *blob, unsigned long node); extern int of_fdt_match(const void *blob, unsigned long node, const char *const *compat); -extern void of_fdt_unflatten_tree(unsigned long *blob, +extern void of_fdt_unflatten_tree(const unsigned long *blob, struct device_node **mynodes); /* TBD: Temporary export of fdt globals - remove when code fully merged */ -- cgit v1.2.3 From 3c6296f716ebef704b76070d90567ab4faa8462c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 28 May 2015 13:21:34 -0400 Subject: ring-buffer: Remove useless unused tracing_off_permanent() The tracing_off_permanent() call is a way to disable all ring_buffers. Nothing uses it and nothing should use it, as tracing_off() and friends are better, as they disable the ring buffers related to tracing. The tracing_off_permanent() even disabled non tracing ring buffers. This is a bit drastic, and was added to handle NMIs doing outputs that could corrupt the ring buffer when only tracing used them. It is now obsolete and adds a little overhead, it should be removed. Signed-off-by: Steven Rostedt --- include/linux/kernel.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3a5b48e52a9e..d948718a83d7 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -532,12 +532,6 @@ bool mac_pton(const char *s, u8 *mac); * * Most likely, you want to use tracing_on/tracing_off. */ -#ifdef CONFIG_RING_BUFFER -/* trace_off_permanent stops recording with no way to bring it back */ -void tracing_off_permanent(void); -#else -static inline void tracing_off_permanent(void) { } -#endif enum ftrace_dump_mode { DUMP_NONE, -- cgit v1.2.3 From 0040b933187b11f78e83dd162a31d64a46be4e37 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 20 Apr 2015 11:52:23 -0700 Subject: f2fs: add missing version info in superblock The mkfs.f2fs remains kernel version in superblock, but f2fs module has not added that so far. Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 591f8c3ef410..8d345c24bcf7 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -50,6 +50,8 @@ #define MAX_ACTIVE_NODE_LOGS 8 #define MAX_ACTIVE_DATA_LOGS 8 +#define VERSION_LEN 256 + /* * For superblock */ @@ -86,6 +88,9 @@ struct f2fs_super_block { __le32 extension_count; /* # of extensions below */ __u8 extension_list[F2FS_MAX_EXTENSION][8]; /* extension array */ __le32 cp_payload; + __u8 version[VERSION_LEN]; /* the kernel version */ + __u8 init_version[VERSION_LEN]; /* the initial kernel version */ + __u8 reserved[892]; /* valid reserved region */ } __packed; /* -- cgit v1.2.3 From 76f105a2dbcd47509bac6ba8d94cb3759a3e6e9d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 13 Apr 2015 15:10:36 -0700 Subject: f2fs: add feature facility in superblock This patch introduces a feature in superblock, which will indicate any new features for f2fs. Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 8d345c24bcf7..d44e97f2b98e 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -90,7 +90,8 @@ struct f2fs_super_block { __le32 cp_payload; __u8 version[VERSION_LEN]; /* the kernel version */ __u8 init_version[VERSION_LEN]; /* the initial kernel version */ - __u8 reserved[892]; /* valid reserved region */ + __le32 feature; /* defined features */ + __u8 reserved[888]; /* valid reserved region */ } __packed; /* -- cgit v1.2.3 From cde4de1205770514005663d70a9a7d81cb555085 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 20 Apr 2015 13:57:51 -0700 Subject: f2fs crypto: declare some definitions for f2fs encryption feature This definitions will be used by inode and superblock for encyption. Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index d44e97f2b98e..920408a21ffd 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -91,7 +91,9 @@ struct f2fs_super_block { __u8 version[VERSION_LEN]; /* the kernel version */ __u8 init_version[VERSION_LEN]; /* the initial kernel version */ __le32 feature; /* defined features */ - __u8 reserved[888]; /* valid reserved region */ + __u8 encryption_level; /* versioning level for encryption */ + __u8 encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ + __u8 reserved[871]; /* valid reserved region */ } __packed; /* -- cgit v1.2.3 From f8df88081183bd4d3c461c617c3519445eb85642 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Wed, 27 May 2015 23:06:30 +0900 Subject: extcon: Remove optional print_name() function pointer of extcon_dev This patch removes the optional print_name() function pointer included in 'struct extcon_dev' because the extcon must maintain the consistent name of extcon device on sysfs instead of inconsistent name. After merged patch[1], extcon can maintain the consistent name of extcon device without any hard-coded device name. [1] https://lkml.org/lkml/2015/4/27/258 Signed-off-by: Chanwoo Choi --- include/linux/extcon.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/extcon.h b/include/linux/extcon.h index be9652b3a154..a7b224b20ecc 100644 --- a/include/linux/extcon.h +++ b/include/linux/extcon.h @@ -83,8 +83,6 @@ struct extcon_cable; * be attached simulataneously. {0x7, 0} is equivalent to * {0x3, 0x6, 0x5, 0}. If it is {0xFFFFFFFF, 0}, there * can be no simultaneous connections. - * @print_name: An optional callback to override the method to print the - * name of the extcon device. * @print_state: An optional callback to override the method to print the * status of the extcon device. * @dev: Device of this extcon. @@ -111,7 +109,6 @@ struct extcon_dev { const u32 *mutually_exclusive; /* Optional callbacks to override class functions */ - ssize_t (*print_name)(struct extcon_dev *edev, char *buf); ssize_t (*print_state)(struct extcon_dev *edev, char *buf); /* Internal data. Please do not set. */ -- cgit v1.2.3 From c80ef9e0c021ff86771fdd72583c75d8f7b6a720 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 29 May 2015 10:52:59 +0200 Subject: cgroup: add seq_file forward declaration for struct cftype Recent header file changes for cgroup caused lots of warnings about a missing struct seq_file form declaration for every inclusion of include/linux/cgroup-defs.h. As some files are built with -Werror, this leads to build failure like: from /git/arm-soc/drivers/gpu/drm/tilcdc/tilcdc_crtc.c:18: /git/arm-soc/include/linux/cgroup-defs.h:354:25: error: 'struct seq_file' declared inside parameter list [-Werror] cc1: all warnings being treated as errors make[6]: *** [drivers/gpu/drm/tilcdc/tilcdc_crtc.o] Error 1 This patch adds the declaration, which resolves both the warnings and the drm failure. tj: Moved it where other type declarations are. Signed-off-by: Arnd Bergmann Fixes: b4a04ab7a37b ("cgroup: separate out include/linux/cgroup-defs.h") Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 7d83d7f73420..26d1cea7929f 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -26,6 +26,7 @@ struct cgroup_taskset; struct kernfs_node; struct kernfs_ops; struct kernfs_open_file; +struct seq_file; #define MAX_CGROUP_TYPE_NAMELEN 32 #define MAX_CGROUP_ROOT_NAMELEN 64 -- cgit v1.2.3 From e548ca4ee4595f65b262661d166310ad8a149bec Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 May 2015 13:11:32 -0600 Subject: block: don't honor chunk sizes for data-less IO We don't need to honor chunk sizes for IO that doesn't carry any data. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9ded80da2c16..ccaa9aecd593 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -903,7 +903,7 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq) if (unlikely(rq->cmd_type == REQ_TYPE_BLOCK_PC)) return q->limits.max_hw_sectors; - if (!q->limits.chunk_sectors) + if (!q->limits.chunk_sectors || (rq->cmd_flags & REQ_DISCARD)) return blk_queue_get_max_sectors(q, rq->cmd_flags); return min(blk_max_size_offset(q, blk_rq_pos(rq)), -- cgit v1.2.3 From 07d783fd830a49008f3b2764ae7b6033ee1bf329 Mon Sep 17 00:00:00 2001 From: Peter Senna Tschudin Date: Tue, 19 May 2015 11:44:46 +0200 Subject: staging: goldfish: Fix pointer cast for 32 bits As the first argument of gf_write64() was of type unsigned long, and as some calls to gf_write64() were casting the first argument from void * to u64 the compiler and/or sparse were printing warnings for casts of wrong sizes when compiling for i386. This patch changes the type of the first argument of gf_write64() to const void *, and update calls to the function. This change fixed the warnings and allowed to remove casts from 3 calls to gf_write64(). In addition gf_write64() was renamed to gf_write_ptr() as the name was misleading because it only writes 32 bits on 32 bit systems. gf_write_dma_addr() was added to handle dma_addr_t values which is used at drivers/staging/goldfish/goldfish_audio.c. Signed-off-by: Dan Carpenter Signed-off-by: Peter Senna Tschudin Signed-off-by: Greg Kroah-Hartman --- include/linux/goldfish.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/goldfish.h b/include/linux/goldfish.h index 569236e6b2bc..93e080b39cf6 100644 --- a/include/linux/goldfish.h +++ b/include/linux/goldfish.h @@ -3,13 +3,24 @@ /* Helpers for Goldfish virtual platform */ -static inline void gf_write64(unsigned long data, - void __iomem *portl, void __iomem *porth) +static inline void gf_write_ptr(const void *ptr, void __iomem *portl, + void __iomem *porth) { - writel((u32)data, portl); + writel((u32)(unsigned long)ptr, portl); #ifdef CONFIG_64BIT - writel(data>>32, porth); + writel((unsigned long)ptr >> 32, porth); #endif } +static inline void gf_write_dma_addr(const dma_addr_t addr, + void __iomem *portl, + void __iomem *porth) +{ + writel((u32)addr, portl); +#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT + writel(addr >> 32, porth); +#endif +} + + #endif /* __LINUX_GOLDFISH_H */ -- cgit v1.2.3 From b144ce2d37619e05afdb0a15676500d76a64b1be Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 27 May 2015 17:17:27 -0700 Subject: mei: fix up uuid matching A previous commit, c93b76b34b4d ("mei: bus: report also uuid in module alias") caused a build error as I missed applying a needed patch to add some macros to uapi/linux/uuid.h. Instead of those additional macros, change the mei code to use the existing uuid structure directly. Fixes: c93b76b34b4d Cc: Tomas Winkler Cc: Samuel Ortiz Reported-by: Stephen Rothwell Signed-off-by: Greg Kroah-Hartman --- include/linux/mod_devicetable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 2d2b2b571d61..048c270822f9 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -614,7 +614,7 @@ struct ipack_device_id { */ struct mei_cl_device_id { char name[MEI_CL_NAME_SIZE]; - __u8 uuid[16]; + uuid_le uuid; kernel_ulong_t driver_info; }; -- cgit v1.2.3 From 10081fb532a2a2216b7d8e4ad585c985075b6f60 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 1 May 2015 15:23:50 +0900 Subject: lib: introduce crc_t10dif_update() This introduces crc_t10dif_update() which enables to calculate CRC for a block which straddles multiple SG elements by calling multiple times. This also converts crc_t10dif() to use crc_t10dif_update() as they are almost same. (remove extra function call in crc_t10dif() and crc_t10dif_update - Tim + Herbert) Signed-off-by: Akinobu Mita Acked-by: Martin K. Petersen Cc: Tim Chen Cc: Herbert Xu Cc: "David S. Miller" Cc: linux-crypto@vger.kernel.org Cc: Nicholas Bellinger Cc: Sagi Grimberg Cc: "Martin K. Petersen" Cc: Christoph Hellwig Cc: "James E.J. Bottomley" Cc: target-devel@vger.kernel.org Signed-off-by: Nicholas Bellinger --- include/linux/crc-t10dif.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/crc-t10dif.h b/include/linux/crc-t10dif.h index cf53d0773ce3..d81961e9e37d 100644 --- a/include/linux/crc-t10dif.h +++ b/include/linux/crc-t10dif.h @@ -9,5 +9,6 @@ extern __u16 crc_t10dif_generic(__u16 crc, const unsigned char *buffer, size_t len); extern __u16 crc_t10dif(unsigned char const *, size_t); +extern __u16 crc_t10dif_update(__u16 crc, unsigned char const *, size_t); #endif -- cgit v1.2.3 From f0e6a326deec8b51ee12f82a34057efd3d0979b8 Mon Sep 17 00:00:00 2001 From: Abhishek Bist Date: Wed, 27 May 2015 23:54:19 +0530 Subject: USB: hcd.h : Removed an unnecessary function prototype usb_find_interface_driver() This function is used to call in early version of linux kernel in order to find out the interface used by a usb device. But now it's use is completely abolished. So,it would be relevant to remove this obselete function from kernel mainline. Signed-off-by: Abhishek Bist Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/hcd.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 68b1e836dff1..c9aa7792de10 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -622,8 +622,6 @@ extern struct list_head usb_bus_list; extern struct mutex usb_bus_list_lock; extern wait_queue_head_t usb_kill_urb_queue; -extern int usb_find_interface_driver(struct usb_device *dev, - struct usb_interface *interface); #define usb_endpoint_out(ep_dir) (!((ep_dir) & USB_DIR_IN)) -- cgit v1.2.3 From 138c3f03b017e261316a4f1ec793e1ff74516def Mon Sep 17 00:00:00 2001 From: Nikhil Badola Date: Tue, 26 May 2015 17:15:29 +0530 Subject: drivers:usb:fsl: Add support for USB controller version-2.5 Add support for USB controller version-2.5 used in T4240 rev2.0, T1024, T1040, T2080, LS1021A Signed-off-by: Nikhil Badola Signed-off-by: Greg Kroah-Hartman --- include/linux/fsl_devices.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h index a82296af413f..2a2f56b292c1 100644 --- a/include/linux/fsl_devices.h +++ b/include/linux/fsl_devices.h @@ -24,6 +24,7 @@ #define FSL_USB_VER_1_6 1 #define FSL_USB_VER_2_2 2 #define FSL_USB_VER_2_4 3 +#define FSL_USB_VER_2_5 4 #include -- cgit v1.2.3 From 34d2e4584ae594eff29d1595d47d7d044e57f834 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 26 May 2015 13:28:48 +0900 Subject: serial: 8250: include from serial_8250.h The header file, include/linux/serial_8250.h, contains references to UART_LSR_BRK_ERROR_BITS and UART_MSR_ANY_DELTA that are defined in . Signed-off-by: Masahiro Yamada Reviewed-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_8250.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index f0c68d88b6f4..ba82c07feb95 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -12,6 +12,7 @@ #define _LINUX_SERIAL_8250_H #include +#include #include /* -- cgit v1.2.3 From 6c4e5f9c9ff41ea997fd0f345b3b2b88c113eb68 Mon Sep 17 00:00:00 2001 From: Keith Mange Date: Tue, 26 May 2015 14:23:01 -0700 Subject: Drivers: hv: vmbus:Update preferred vmbus protocol version to windows 10. Add support for Windows 10. Signed-off-by: Keith Mange Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- include/linux/hyperv.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 3932a993ff5a..4317cd1b69ed 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -160,16 +160,18 @@ hv_get_ringbuffer_availbytes(struct hv_ring_buffer_info *rbi, * 1 . 1 (Windows 7) * 2 . 4 (Windows 8) * 3 . 0 (Windows 8 R2) + * 4 . 0 (Windows 10) */ #define VERSION_WS2008 ((0 << 16) | (13)) #define VERSION_WIN7 ((1 << 16) | (1)) #define VERSION_WIN8 ((2 << 16) | (4)) #define VERSION_WIN8_1 ((3 << 16) | (0)) +#define VERSION_WIN10 ((4 << 16) | (0)) #define VERSION_INVAL -1 -#define VERSION_CURRENT VERSION_WIN8_1 +#define VERSION_CURRENT VERSION_WIN10 /* Make maximum size of pipe payload of 16K */ #define MAX_PIPE_DATA_PAYLOAD (sizeof(u8) * 16384) -- cgit v1.2.3 From 6fa45a22689722dac9f0e90c0931d4b34b334ede Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Wed, 20 May 2015 20:56:57 +0530 Subject: parport: add device-model to parport subsystem parport subsystem starts using the device-model. Drivers using the device-model has to define devmodel as true and should register the device with parport using parport_register_dev_model(). Tested-by: Jean Delvare Tested-by: Alan Cox Signed-off-by: Sudip Mukherjee Signed-off-by: Greg Kroah-Hartman --- include/linux/parport.h | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/parport.h b/include/linux/parport.h index c22f12547324..58e3c64c6b49 100644 --- a/include/linux/parport.h +++ b/include/linux/parport.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -145,6 +146,8 @@ struct pardevice { unsigned int flags; struct pardevice *next; struct pardevice *prev; + struct device dev; + bool devmodel; struct parport_state *state; /* saved status over preemption */ wait_queue_head_t wait_q; unsigned long int time; @@ -156,6 +159,8 @@ struct pardevice { void * sysctl_table; }; +#define to_pardevice(n) container_of(n, struct pardevice, dev) + /* IEEE1284 information */ /* IEEE1284 phases. These are exposed to userland through ppdev IOCTL @@ -195,7 +200,7 @@ struct parport { * This may unfortulately be null if the * port has a legacy driver. */ - + struct device bus_dev; /* to link with the bus */ struct parport *physport; /* If this is a non-default mux parport, i.e. we're a clone of a real @@ -245,15 +250,26 @@ struct parport { struct parport *slaves[3]; }; +#define to_parport_dev(n) container_of(n, struct parport, bus_dev) + #define DEFAULT_SPIN_TIME 500 /* us */ struct parport_driver { const char *name; void (*attach) (struct parport *); void (*detach) (struct parport *); + void (*match_port)(struct parport *); + int (*probe)(struct pardevice *); + struct device_driver driver; + bool devmodel; struct list_head list; }; +#define to_parport_driver(n) container_of(n, struct parport_driver, driver) + +int parport_bus_init(void); +void parport_bus_exit(void); + /* parport_register_port registers a new parallel port at the given address (if one does not already exist) and returns a pointer to it. This entails claiming the I/O region, IRQ and DMA. NULL is returned @@ -272,10 +288,20 @@ void parport_announce_port (struct parport *port); extern void parport_remove_port(struct parport *port); /* Register a new high-level driver. */ -extern int parport_register_driver (struct parport_driver *); + +int __must_check __parport_register_driver(struct parport_driver *, + struct module *, + const char *mod_name); +/* + * parport_register_driver must be a macro so that KBUILD_MODNAME can + * be expanded + */ +#define parport_register_driver(driver) \ + __parport_register_driver(driver, THIS_MODULE, KBUILD_MODNAME) /* Unregister a high-level driver. */ extern void parport_unregister_driver (struct parport_driver *); +void parport_unregister_driver(struct parport_driver *); /* If parport_register_driver doesn't fit your needs, perhaps * parport_find_xxx does. */ @@ -288,6 +314,15 @@ extern irqreturn_t parport_irq_handler(int irq, void *dev_id); /* Reference counting for ports. */ extern struct parport *parport_get_port (struct parport *); extern void parport_put_port (struct parport *); +void parport_del_port(struct parport *); + +struct pardev_cb { + int (*preempt)(void *); + void (*wakeup)(void *); + void *private; + void (*irq_func)(void *); + unsigned int flags; +}; /* parport_register_device declares that a device is connected to a port, and tells the kernel all it needs to know. @@ -301,6 +336,10 @@ struct pardevice *parport_register_device(struct parport *port, void (*irq_func)(void *), int flags, void *handle); +struct pardevice * +parport_register_dev_model(struct parport *port, const char *name, + const struct pardev_cb *par_dev_cb, int cnt); + /* parport_unregister unlinks a device from the chain. */ extern void parport_unregister_device(struct pardevice *dev); -- cgit v1.2.3 From 1c4b1d73bacc546ba4e42f7eb4cb88c54139820b Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 27 May 2015 13:57:46 +0200 Subject: tty: move linux/gsmmux.h to uapi linux/gsmmux.h defines a user interface and therefore should be installed with other headers. Make the file include: * linux/if.h for IFNAMSIZ * linux/ioctl.h for _IO* macros Signed-off-by: Jiri Slaby Cc: Alan Cox Signed-off-by: Greg Kroah-Hartman --- include/linux/gsmmux.h | 36 ------------------------------------ 1 file changed, 36 deletions(-) delete mode 100644 include/linux/gsmmux.h (limited to 'include/linux') diff --git a/include/linux/gsmmux.h b/include/linux/gsmmux.h deleted file mode 100644 index c25e9477f7c3..000000000000 --- a/include/linux/gsmmux.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef _LINUX_GSMMUX_H -#define _LINUX_GSMMUX_H - -struct gsm_config -{ - unsigned int adaption; - unsigned int encapsulation; - unsigned int initiator; - unsigned int t1; - unsigned int t2; - unsigned int t3; - unsigned int n2; - unsigned int mru; - unsigned int mtu; - unsigned int k; - unsigned int i; - unsigned int unused[8]; /* Padding for expansion without - breaking stuff */ -}; - -#define GSMIOC_GETCONF _IOR('G', 0, struct gsm_config) -#define GSMIOC_SETCONF _IOW('G', 1, struct gsm_config) - -struct gsm_netconfig { - unsigned int adaption; /* Adaption to use in network mode */ - unsigned short protocol;/* Protocol to use - only ETH_P_IP supported */ - unsigned short unused2; - char if_name[IFNAMSIZ]; /* interface name format string */ - __u8 unused[28]; /* For future use */ -}; - -#define GSMIOC_ENABLE_NET _IOW('G', 2, struct gsm_netconfig) -#define GSMIOC_DISABLE_NET _IO('G', 3) - - -#endif -- cgit v1.2.3 From 1f656ff3fdddc2f59649cc84b633b799908f1f7b Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Sat, 30 May 2015 23:37:48 -0700 Subject: Drivers: hv: vmbus: Implement NUMA aware CPU affinity for channels Channels/sub-channels can be affinitized to VCPUs in the guest. Implement this affinity in a way that is NUMA aware. The current protocol distributed the primary channels uniformly across all available CPUs. The new protocol is NUMA aware: primary channels are distributed across the available NUMA nodes while the sub-channels within a primary channel are distributed amongst CPUs within the NUMA node assigned to the primary channel. Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- include/linux/hyperv.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 4317cd1b69ed..30d3a1f79450 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -696,6 +696,11 @@ struct vmbus_channel { u32 target_vp; /* The corresponding CPUID in the guest */ u32 target_cpu; + /* + * State to manage the CPU affiliation of channels. + */ + struct cpumask alloced_cpus_in_node; + int numa_node; /* * Support for sub-channels. For high performance devices, * it will be useful to have multiple sub-channels to support -- cgit v1.2.3 From 225d59adf1c899176cce0fc80e42b1d1c12f109f Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Fri, 29 May 2015 18:14:21 +0200 Subject: iio: Specify supported modes for buffers For each buffer type specify the supported device modes for this buffer. This allows us for devices which support multiple different operating modes to pick the correct operating mode based on the modes supported by the attached buffers. It also prevents that buffers with conflicting modes are attached to a device at the same time or that a buffer with a non-supported mode is attached to a device (e.g. in-kernel callback buffer to a device only supporting hardware mode). Signed-off-by: Lars-Peter Clausen Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h index eb8622b78ec9..1600c55828e0 100644 --- a/include/linux/iio/buffer.h +++ b/include/linux/iio/buffer.h @@ -29,6 +29,7 @@ struct iio_buffer; * @set_length: set number of datums in buffer * @release: called when the last reference to the buffer is dropped, * should free all resources allocated by the buffer. + * @modes: Supported operating modes by this buffer type * * The purpose of this structure is to make the buffer element * modular as event for a given driver, different usecases may require @@ -51,6 +52,8 @@ struct iio_buffer_access_funcs { int (*set_length)(struct iio_buffer *buffer, int length); void (*release)(struct iio_buffer *buffer); + + unsigned int modes; }; /** -- cgit v1.2.3 From 5fbf65d5c9c0fd2e5c6c48d69ce34b1c5415f2fd Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 22 May 2015 15:19:37 +0900 Subject: pinctrl: remove useless const qualifier This "const" claims the get_function_groups callback never changes the given num_groups pointer. It is always true in C language, so not worth mentioning. Signed-off-by: Masahiro Yamada Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinmux.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinmux.h b/include/linux/pinctrl/pinmux.h index d3740fa7073f..ace60d775b20 100644 --- a/include/linux/pinctrl/pinmux.h +++ b/include/linux/pinctrl/pinmux.h @@ -69,7 +69,7 @@ struct pinmux_ops { int (*get_function_groups) (struct pinctrl_dev *pctldev, unsigned selector, const char * const **groups, - unsigned * const num_groups); + unsigned *num_groups); int (*set_mux) (struct pinctrl_dev *pctldev, unsigned func_selector, unsigned group_selector); int (*gpio_request_enable) (struct pinctrl_dev *pctldev, -- cgit v1.2.3 From b3da97ee581387cd42dafd76eb2ac23f2335cd92 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 22 May 2015 15:25:50 +0900 Subject: pinctrl: use "const struct ..." rather than "struct ... const" Only this member, pins, is defined as "struct ... const *", but the others in this struct, pinlops, pmxops, confops, etc. are defined as "const struct ... *". Swap the "struct pinctrl_pin_desc" and "const" for consistency. Signed-off-by: Masahiro Yamada Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinctrl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index 66e4697516de..9ba59fcba549 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -127,7 +127,7 @@ struct pinctrl_ops { */ struct pinctrl_desc { const char *name; - struct pinctrl_pin_desc const *pins; + const struct pinctrl_pin_desc *pins; unsigned int npins; const struct pinctrl_ops *pctlops; const struct pinmux_ops *pmxops; -- cgit v1.2.3 From cf561f0d2eb74574ad9985a2feab134267a9d298 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 24 Apr 2015 14:24:27 +0200 Subject: virtio: introduce virtio_is_little_endian() helper Signed-off-by: Greg Kurz Signed-off-by: Michael S. Tsirkin Acked-by: Cornelia Huck Reviewed-by: David Gibson --- include/linux/virtio_config.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 1e306f727edc..170947eb11b5 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -205,35 +205,40 @@ int virtqueue_set_affinity(struct virtqueue *vq, int cpu) return 0; } +static inline bool virtio_is_little_endian(struct virtio_device *vdev) +{ + return virtio_has_feature(vdev, VIRTIO_F_VERSION_1); +} + /* Memory accessors */ static inline u16 virtio16_to_cpu(struct virtio_device *vdev, __virtio16 val) { - return __virtio16_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val); + return __virtio16_to_cpu(virtio_is_little_endian(vdev), val); } static inline __virtio16 cpu_to_virtio16(struct virtio_device *vdev, u16 val) { - return __cpu_to_virtio16(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val); + return __cpu_to_virtio16(virtio_is_little_endian(vdev), val); } static inline u32 virtio32_to_cpu(struct virtio_device *vdev, __virtio32 val) { - return __virtio32_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val); + return __virtio32_to_cpu(virtio_is_little_endian(vdev), val); } static inline __virtio32 cpu_to_virtio32(struct virtio_device *vdev, u32 val) { - return __cpu_to_virtio32(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val); + return __cpu_to_virtio32(virtio_is_little_endian(vdev), val); } static inline u64 virtio64_to_cpu(struct virtio_device *vdev, __virtio64 val) { - return __virtio64_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val); + return __virtio64_to_cpu(virtio_is_little_endian(vdev), val); } static inline __virtio64 cpu_to_virtio64(struct virtio_device *vdev, u64 val) { - return __cpu_to_virtio64(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val); + return __cpu_to_virtio64(virtio_is_little_endian(vdev), val); } /* Config space accessors. */ -- cgit v1.2.3 From 5da7b160b36a42f161980c5e20d3960d6d076fb8 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 24 Apr 2015 14:24:58 +0200 Subject: vringh: introduce vringh_is_little_endian() helper Signed-off-by: Greg Kurz Signed-off-by: Michael S. Tsirkin Acked-by: Cornelia Huck Reviewed-by: David Gibson --- include/linux/vringh.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vringh.h b/include/linux/vringh.h index a3fa537e717a..3ed62efc2bc5 100644 --- a/include/linux/vringh.h +++ b/include/linux/vringh.h @@ -226,33 +226,38 @@ static inline void vringh_notify(struct vringh *vrh) vrh->notify(vrh); } +static inline bool vringh_is_little_endian(const struct vringh *vrh) +{ + return vrh->little_endian; +} + static inline u16 vringh16_to_cpu(const struct vringh *vrh, __virtio16 val) { - return __virtio16_to_cpu(vrh->little_endian, val); + return __virtio16_to_cpu(vringh_is_little_endian(vrh), val); } static inline __virtio16 cpu_to_vringh16(const struct vringh *vrh, u16 val) { - return __cpu_to_virtio16(vrh->little_endian, val); + return __cpu_to_virtio16(vringh_is_little_endian(vrh), val); } static inline u32 vringh32_to_cpu(const struct vringh *vrh, __virtio32 val) { - return __virtio32_to_cpu(vrh->little_endian, val); + return __virtio32_to_cpu(vringh_is_little_endian(vrh), val); } static inline __virtio32 cpu_to_vringh32(const struct vringh *vrh, u32 val) { - return __cpu_to_virtio32(vrh->little_endian, val); + return __cpu_to_virtio32(vringh_is_little_endian(vrh), val); } static inline u64 vringh64_to_cpu(const struct vringh *vrh, __virtio64 val) { - return __virtio64_to_cpu(vrh->little_endian, val); + return __virtio64_to_cpu(vringh_is_little_endian(vrh), val); } static inline __virtio64 cpu_to_vringh64(const struct vringh *vrh, u64 val) { - return __cpu_to_virtio64(vrh->little_endian, val); + return __cpu_to_virtio64(vringh_is_little_endian(vrh), val); } #endif /* _LINUX_VRINGH_H */ -- cgit v1.2.3 From 7d82410950aa74adccf035c332e409af2bb93e92 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 24 Apr 2015 14:26:24 +0200 Subject: virtio: add explicit big-endian support to memory accessors The current memory accessors logic is: - little endian if little_endian - native endian (i.e. no byteswap) if !little_endian If we want to fully support cross-endian vhost, we also need to be able to convert to big endian. Instead of changing the little_endian argument to some 3-value enum, this patch changes the logic to: - little endian if little_endian - big endian if !little_endian The native endian case is handled by all users with a trivial helper. This patch doesn't change any functionality, nor it does add overhead. Signed-off-by: Greg Kurz Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck Reviewed-by: David Gibson --- include/linux/virtio_byteorder.h | 24 ++++++++++++++---------- include/linux/virtio_config.h | 3 ++- include/linux/vringh.h | 3 ++- 3 files changed, 18 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_byteorder.h b/include/linux/virtio_byteorder.h index 51865d05b267..ce63a2c3a612 100644 --- a/include/linux/virtio_byteorder.h +++ b/include/linux/virtio_byteorder.h @@ -3,17 +3,21 @@ #include #include -/* - * Low-level memory accessors for handling virtio in modern little endian and in - * compatibility native endian format. - */ +static inline bool virtio_legacy_is_little_endian(void) +{ +#ifdef __LITTLE_ENDIAN + return true; +#else + return false; +#endif +} static inline u16 __virtio16_to_cpu(bool little_endian, __virtio16 val) { if (little_endian) return le16_to_cpu((__force __le16)val); else - return (__force u16)val; + return be16_to_cpu((__force __be16)val); } static inline __virtio16 __cpu_to_virtio16(bool little_endian, u16 val) @@ -21,7 +25,7 @@ static inline __virtio16 __cpu_to_virtio16(bool little_endian, u16 val) if (little_endian) return (__force __virtio16)cpu_to_le16(val); else - return (__force __virtio16)val; + return (__force __virtio16)cpu_to_be16(val); } static inline u32 __virtio32_to_cpu(bool little_endian, __virtio32 val) @@ -29,7 +33,7 @@ static inline u32 __virtio32_to_cpu(bool little_endian, __virtio32 val) if (little_endian) return le32_to_cpu((__force __le32)val); else - return (__force u32)val; + return be32_to_cpu((__force __be32)val); } static inline __virtio32 __cpu_to_virtio32(bool little_endian, u32 val) @@ -37,7 +41,7 @@ static inline __virtio32 __cpu_to_virtio32(bool little_endian, u32 val) if (little_endian) return (__force __virtio32)cpu_to_le32(val); else - return (__force __virtio32)val; + return (__force __virtio32)cpu_to_be32(val); } static inline u64 __virtio64_to_cpu(bool little_endian, __virtio64 val) @@ -45,7 +49,7 @@ static inline u64 __virtio64_to_cpu(bool little_endian, __virtio64 val) if (little_endian) return le64_to_cpu((__force __le64)val); else - return (__force u64)val; + return be64_to_cpu((__force __be64)val); } static inline __virtio64 __cpu_to_virtio64(bool little_endian, u64 val) @@ -53,7 +57,7 @@ static inline __virtio64 __cpu_to_virtio64(bool little_endian, u64 val) if (little_endian) return (__force __virtio64)cpu_to_le64(val); else - return (__force __virtio64)val; + return (__force __virtio64)cpu_to_be64(val); } #endif /* _LINUX_VIRTIO_BYTEORDER */ diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 170947eb11b5..e5ce8ab0b8b0 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -207,7 +207,8 @@ int virtqueue_set_affinity(struct virtqueue *vq, int cpu) static inline bool virtio_is_little_endian(struct virtio_device *vdev) { - return virtio_has_feature(vdev, VIRTIO_F_VERSION_1); + return virtio_has_feature(vdev, VIRTIO_F_VERSION_1) || + virtio_legacy_is_little_endian(); } /* Memory accessors */ diff --git a/include/linux/vringh.h b/include/linux/vringh.h index 3ed62efc2bc5..bc6c28d04263 100644 --- a/include/linux/vringh.h +++ b/include/linux/vringh.h @@ -228,7 +228,8 @@ static inline void vringh_notify(struct vringh *vrh) static inline bool vringh_is_little_endian(const struct vringh *vrh) { - return vrh->little_endian; + return vrh->little_endian || + virtio_legacy_is_little_endian(); } static inline u16 vringh16_to_cpu(const struct vringh *vrh, __virtio16 val) -- cgit v1.2.3 From 4af34b572a85c44c55491a10693535a79627c478 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 1 Jun 2015 11:04:26 +0200 Subject: drivers: soc: sunxi: Introduce SoC driver to map SRAMs The Allwinner SoCs have a handful of SRAM that can be either mapped to be accessible by devices or the CPU. That mapping is controlled by an SRAM controller, and that mapping might not be set by the bootloader, for example if the device wasn't used at all, or if we're using solutions like the U-Boot's Falcon Boot. We could also imagine changing this at runtime for example to change the mapping of these SRAMs to use them for suspend/resume or runtime memory rate change, if that ever happens. These use cases require some API in the kernel to control that mapping, exported through a drivers/soc driver. This driver also implement a debugfs file that shows the SRAM found in the system, the current mapping and the SRAM that have been claimed by some drivers in the kernel. Signed-off-by: Maxime Ripard Acked-by: Arnd Bergmann Acked-by: Hans de Goede Tested-by: Hans de Goede Signed-off-by: Arnd Bergmann --- include/linux/soc/sunxi/sunxi_sram.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 include/linux/soc/sunxi/sunxi_sram.h (limited to 'include/linux') diff --git a/include/linux/soc/sunxi/sunxi_sram.h b/include/linux/soc/sunxi/sunxi_sram.h new file mode 100644 index 000000000000..c5f663bba9c2 --- /dev/null +++ b/include/linux/soc/sunxi/sunxi_sram.h @@ -0,0 +1,19 @@ +/* + * Allwinner SoCs SRAM Controller Driver + * + * Copyright (C) 2015 Maxime Ripard + * + * Author: Maxime Ripard + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#ifndef _SUNXI_SRAM_H_ +#define _SUNXI_SRAM_H_ + +int sunxi_sram_claim(struct device *dev); +int sunxi_sram_release(struct device *dev); + +#endif /* _SUNXI_SRAM_H_ */ -- cgit v1.2.3 From f26cdc8536ad50fb802a0445f836b4f94ca09ae7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 1 Jun 2015 09:29:53 -0600 Subject: blk-mq: Shared tag enhancements Storage controllers may expose multiple block devices that share hardware resources managed by blk-mq. This patch enhances the shared tags so a low-level driver can access the shared resources not tied to the unshared h/w contexts. This way the LLD can dynamically add and delete disks and request queues without having to track all the request_queue hctx's to iterate outstanding tags. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2056a99b92f8..37d1602c4f7a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -96,6 +96,7 @@ typedef void (exit_request_fn)(void *, struct request *, unsigned int, typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, bool); +typedef void (busy_tag_iter_fn)(struct request *, void *, bool); struct blk_mq_ops { /* @@ -182,6 +183,7 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *); struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, bool reserved); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); +struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags); enum { BLK_MQ_UNIQUE_TAG_BITS = 16, @@ -224,6 +226,8 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, void *priv); +void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, + void *priv); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); void blk_mq_freeze_queue_start(struct request_queue *q); -- cgit v1.2.3 From 11f81becca04bb7d2826a9b65bb8d27b0a1bb543 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:15 -0400 Subject: page_writeback: revive cancel_dirty_page() in a restricted form cancel_dirty_page() had some issues and b9ea25152e56 ("page_writeback: clean up mess around cancel_dirty_page()") replaced it with account_page_cleaned() which makes the caller responsible for clearing the dirty bit; unfortunately, the planned changes for cgroup writeback support requires synchronization between dirty bit manipulation and stat updates. While we can open-code such synchronization in each account_page_cleaned() callsite, that's gonna be unnecessarily awkward and verbose. This patch revives cancel_dirty_page() but in a more restricted form. All it does is TestClearPageDirty() followed by account_page_cleaned() invocation if the page was dirty. This helper covers all account_page_cleaned() usages except for __delete_from_page_cache() which is a special case anyway and left alone. As this leaves no module user for account_page_cleaned(), EXPORT_SYMBOL() is dropped from it. This patch just revives cancel_dirty_page() as a trivial wrapper to replace equivalent usages and doesn't introduce any functional changes. Signed-off-by: Tejun Heo Cc: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0755b9fd03a7..a83cf3a6f78e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1215,6 +1215,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping); void account_page_cleaned(struct page *page, struct address_space *mapping); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); +void cancel_dirty_page(struct page *page); int clear_page_dirty_for_io(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); -- cgit v1.2.3 From c4843a7593a9df3ff5b1806084cefdfa81dd7c79 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 22 May 2015 17:13:16 -0400 Subject: memcg: add per cgroup dirty page accounting When modifying PG_Dirty on cached file pages, update the new MEM_CGROUP_STAT_DIRTY counter. This is done in the same places where global NR_FILE_DIRTY is managed. The new memcg stat is visible in the per memcg memory.stat cgroupfs file. The most recent past attempt at this was http://thread.gmane.org/gmane.linux.kernel.cgroups/8632 The new accounting supports future efforts to add per cgroup dirty page throttling and writeback. It also helps an administrator break down a container's memory usage and provides evidence to understand memcg oom kills (the new dirty count is included in memcg oom kill messages). The ability to move page accounting between memcg (memory.move_charge_at_immigrate) makes this accounting more complicated than the global counter. The existing mem_cgroup_{begin,end}_page_stat() lock is used to serialize move accounting with stat updates. Typical update operation: memcg = mem_cgroup_begin_page_stat(page) if (TestSetPageDirty()) { [...] mem_cgroup_update_page_stat(memcg) } mem_cgroup_end_page_stat(memcg) Summary of mem_cgroup_end_page_stat() overhead: - Without CONFIG_MEMCG it's a no-op - With CONFIG_MEMCG and no inter memcg task movement, it's just rcu_read_lock() - With CONFIG_MEMCG and inter memcg task movement, it's rcu_read_lock() + spin_lock_irqsave() A memcg parameter is added to several routines because their callers now grab mem_cgroup_begin_page_stat() which returns the memcg later needed by for mem_cgroup_update_page_stat(). Because mem_cgroup_begin_page_stat() may disable interrupts, some adjustments are needed: - move __mark_inode_dirty() from __set_page_dirty() to its caller. __mark_inode_dirty() locking does not want interrupts disabled. - use spin_lock_irqsave(tree_lock) rather than spin_lock_irq() in __delete_from_page_cache(), replace_page_cache_page(), invalidate_complete_page2(), and __remove_mapping(). text data bss dec hex filename 8925147 1774832 1785856 12485835 be84cb vmlinux-!CONFIG_MEMCG-before 8925339 1774832 1785856 12486027 be858b vmlinux-!CONFIG_MEMCG-after +192 text bytes 8965977 1784992 1785856 12536825 bf4bf9 vmlinux-CONFIG_MEMCG-before 8966750 1784992 1785856 12537598 bf4efe vmlinux-CONFIG_MEMCG-after +773 text bytes Performance tests run on v4.0-rc1-36-g4f671fe2f952. Lower is better for all metrics, they're all wall clock or cycle counts. The read and write fault benchmarks just measure fault time, they do not include I/O time. * CONFIG_MEMCG not set: baseline patched kbuild 1m25.030000(+-0.088% 3 samples) 1m25.426667(+-0.120% 3 samples) dd write 100 MiB 0.859211561 +-15.10% 0.874162885 +-15.03% dd write 200 MiB 1.670653105 +-17.87% 1.669384764 +-11.99% dd write 1000 MiB 8.434691190 +-14.15% 8.474733215 +-14.77% read fault cycles 254.0(+-0.000% 10 samples) 253.0(+-0.000% 10 samples) write fault cycles 2021.2(+-3.070% 10 samples) 1984.5(+-1.036% 10 samples) * CONFIG_MEMCG=y root_memcg: baseline patched kbuild 1m25.716667(+-0.105% 3 samples) 1m25.686667(+-0.153% 3 samples) dd write 100 MiB 0.855650830 +-14.90% 0.887557919 +-14.90% dd write 200 MiB 1.688322953 +-12.72% 1.667682724 +-13.33% dd write 1000 MiB 8.418601605 +-14.30% 8.673532299 +-15.00% read fault cycles 266.0(+-0.000% 10 samples) 266.0(+-0.000% 10 samples) write fault cycles 2051.7(+-1.349% 10 samples) 2049.6(+-1.686% 10 samples) * CONFIG_MEMCG=y non-root_memcg: baseline patched kbuild 1m26.120000(+-0.273% 3 samples) 1m25.763333(+-0.127% 3 samples) dd write 100 MiB 0.861723964 +-15.25% 0.818129350 +-14.82% dd write 200 MiB 1.669887569 +-13.30% 1.698645885 +-13.27% dd write 1000 MiB 8.383191730 +-14.65% 8.351742280 +-14.52% read fault cycles 265.7(+-0.172% 10 samples) 267.0(+-0.000% 10 samples) write fault cycles 2070.6(+-1.512% 10 samples) 2084.4(+-2.148% 10 samples) As expected anon page faults are not affected by this patch. tj: Updated to apply on top of the recent cancel_dirty_page() changes. Signed-off-by: Sha Zhengju Signed-off-by: Greg Thelen Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/memcontrol.h | 1 + include/linux/mm.h | 6 ++++-- include/linux/pagemap.h | 3 ++- 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 72dff5fb0d0c..5fe6411b5e54 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -41,6 +41,7 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ + MEM_CGROUP_STAT_DIRTY, /* # of dirty pages in page cache */ MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, diff --git a/include/linux/mm.h b/include/linux/mm.h index a83cf3a6f78e..f48d979ced4b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1211,8 +1211,10 @@ int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); -void account_page_dirtied(struct page *page, struct address_space *mapping); -void account_page_cleaned(struct page *page, struct address_space *mapping); +void account_page_dirtied(struct page *page, struct address_space *mapping, + struct mem_cgroup *memcg); +void account_page_cleaned(struct page *page, struct address_space *mapping, + struct mem_cgroup *memcg); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); void cancel_dirty_page(struct page *page); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4b3736f7065c..fb0814ca65c7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -651,7 +651,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void delete_from_page_cache(struct page *page); -extern void __delete_from_page_cache(struct page *page, void *shadow); +extern void __delete_from_page_cache(struct page *page, void *shadow, + struct mem_cgroup *memcg); int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); /* -- cgit v1.2.3 From eea8f41cc58849e354ecf8b95bd7f806e1d1f703 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:17 -0400 Subject: blkcg: move block/blk-cgroup.h to include/linux/blk-cgroup.h cgroup aware writeback support will require exposing some of blkcg details. In preprataion, move block/blk-cgroup.h to include/linux/blk-cgroup.h. This patch is pure file move. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 603 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 603 insertions(+) create mode 100644 include/linux/blk-cgroup.h (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h new file mode 100644 index 000000000000..c567865b5f1d --- /dev/null +++ b/include/linux/blk-cgroup.h @@ -0,0 +1,603 @@ +#ifndef _BLK_CGROUP_H +#define _BLK_CGROUP_H +/* + * Common Block IO controller cgroup interface + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ + +#include +#include +#include +#include +#include +#include + +/* Max limits for throttle policy */ +#define THROTL_IOPS_MAX UINT_MAX + +/* CFQ specific, out here for blkcg->cfq_weight */ +#define CFQ_WEIGHT_MIN 10 +#define CFQ_WEIGHT_MAX 1000 +#define CFQ_WEIGHT_DEFAULT 500 + +#ifdef CONFIG_BLK_CGROUP + +enum blkg_rwstat_type { + BLKG_RWSTAT_READ, + BLKG_RWSTAT_WRITE, + BLKG_RWSTAT_SYNC, + BLKG_RWSTAT_ASYNC, + + BLKG_RWSTAT_NR, + BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, +}; + +struct blkcg_gq; + +struct blkcg { + struct cgroup_subsys_state css; + spinlock_t lock; + + struct radix_tree_root blkg_tree; + struct blkcg_gq *blkg_hint; + struct hlist_head blkg_list; + + /* TODO: per-policy storage in blkcg */ + unsigned int cfq_weight; /* belongs to cfq */ + unsigned int cfq_leaf_weight; +}; + +struct blkg_stat { + struct u64_stats_sync syncp; + uint64_t cnt; +}; + +struct blkg_rwstat { + struct u64_stats_sync syncp; + uint64_t cnt[BLKG_RWSTAT_NR]; +}; + +/* + * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a + * request_queue (q). This is used by blkcg policies which need to track + * information per blkcg - q pair. + * + * There can be multiple active blkcg policies and each has its private + * data on each blkg, the size of which is determined by + * blkcg_policy->pd_size. blkcg core allocates and frees such areas + * together with blkg and invokes pd_init/exit_fn() methods. + * + * Such private data must embed struct blkg_policy_data (pd) at the + * beginning and pd_size can't be smaller than pd. + */ +struct blkg_policy_data { + /* the blkg and policy id this per-policy data belongs to */ + struct blkcg_gq *blkg; + int plid; + + /* used during policy activation */ + struct list_head alloc_node; +}; + +/* association between a blk cgroup and a request queue */ +struct blkcg_gq { + /* Pointer to the associated request_queue */ + struct request_queue *q; + struct list_head q_node; + struct hlist_node blkcg_node; + struct blkcg *blkcg; + + /* all non-root blkcg_gq's are guaranteed to have access to parent */ + struct blkcg_gq *parent; + + /* request allocation list for this blkcg-q pair */ + struct request_list rl; + + /* reference count */ + atomic_t refcnt; + + /* is this blkg online? protected by both blkcg and q locks */ + bool online; + + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; + + struct rcu_head rcu_head; +}; + +typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); + +struct blkcg_policy { + int plid; + /* policy specific private data size */ + size_t pd_size; + /* cgroup files for the policy */ + struct cftype *cftypes; + + /* operations */ + blkcg_pol_init_pd_fn *pd_init_fn; + blkcg_pol_online_pd_fn *pd_online_fn; + blkcg_pol_offline_pd_fn *pd_offline_fn; + blkcg_pol_exit_pd_fn *pd_exit_fn; + blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; +}; + +extern struct blkcg blkcg_root; + +struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); +struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q); +int blkcg_init_queue(struct request_queue *q); +void blkcg_drain_queue(struct request_queue *q); +void blkcg_exit_queue(struct request_queue *q); + +/* Blkio controller policy registration */ +int blkcg_policy_register(struct blkcg_policy *pol); +void blkcg_policy_unregister(struct blkcg_policy *pol); +int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol); +void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol); + +void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, + u64 (*prfill)(struct seq_file *, + struct blkg_policy_data *, int), + const struct blkcg_policy *pol, int data, + bool show_total); +u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); +u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + const struct blkg_rwstat *rwstat); +u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); +u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + int off); + +u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, + int off); + +struct blkg_conf_ctx { + struct gendisk *disk; + struct blkcg_gq *blkg; + u64 v; +}; + +int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + const char *input, struct blkg_conf_ctx *ctx); +void blkg_conf_finish(struct blkg_conf_ctx *ctx); + + +static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct blkcg, css) : NULL; +} + +static inline struct blkcg *task_blkcg(struct task_struct *tsk) +{ + return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); +} + +static inline struct blkcg *bio_blkcg(struct bio *bio) +{ + if (bio && bio->bi_css) + return css_to_blkcg(bio->bi_css); + return task_blkcg(current); +} + +/** + * blkcg_parent - get the parent of a blkcg + * @blkcg: blkcg of interest + * + * Return the parent blkcg of @blkcg. Can be called anytime. + */ +static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) +{ + return css_to_blkcg(blkcg->css.parent); +} + +/** + * blkg_to_pdata - get policy private data + * @blkg: blkg of interest + * @pol: policy of interest + * + * Return pointer to private data associated with the @blkg-@pol pair. + */ +static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) +{ + return blkg ? blkg->pd[pol->plid] : NULL; +} + +/** + * pdata_to_blkg - get blkg associated with policy private data + * @pd: policy private data of interest + * + * @pd is policy private data. Determine the blkg it's associated with. + */ +static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) +{ + return pd ? pd->blkg : NULL; +} + +/** + * blkg_path - format cgroup path of blkg + * @blkg: blkg of interest + * @buf: target buffer + * @buflen: target buffer length + * + * Format the path of the cgroup of @blkg into @buf. + */ +static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) +{ + char *p; + + p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); + if (!p) { + strncpy(buf, "", buflen); + return -ENAMETOOLONG; + } + + memmove(buf, p, buf + buflen - p); + return 0; +} + +/** + * blkg_get - get a blkg reference + * @blkg: blkg to get + * + * The caller should be holding an existing reference. + */ +static inline void blkg_get(struct blkcg_gq *blkg) +{ + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); + atomic_inc(&blkg->refcnt); +} + +void __blkg_release_rcu(struct rcu_head *rcu); + +/** + * blkg_put - put a blkg reference + * @blkg: blkg to put + */ +static inline void blkg_put(struct blkcg_gq *blkg) +{ + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); + if (atomic_dec_and_test(&blkg->refcnt)) + call_rcu(&blkg->rcu_head, __blkg_release_rcu); +} + +struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, + bool update_hint); + +/** + * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU + * read locked. If called under either blkcg or queue lock, the iteration + * is guaranteed to include all and only online blkgs. The caller may + * update @pos_css by calling css_rightmost_descendant() to skip subtree. + * @p_blkg is included in the iteration and the first node to be visited. + */ +#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q, false))) + +/** + * blkg_for_each_descendant_post - post-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Similar to blkg_for_each_descendant_pre() but performs post-order + * traversal instead. Synchronization rules are the same. @p_blkg is + * included in the iteration and the last node to be visited. + */ +#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q, false))) + +/** + * blk_get_rl - get request_list to use + * @q: request_queue of interest + * @bio: bio which will be attached to the allocated request (may be %NULL) + * + * The caller wants to allocate a request from @q to use for @bio. Find + * the request_list to use and obtain a reference on it. Should be called + * under queue_lock. This function is guaranteed to return non-%NULL + * request_list. + */ +static inline struct request_list *blk_get_rl(struct request_queue *q, + struct bio *bio) +{ + struct blkcg *blkcg; + struct blkcg_gq *blkg; + + rcu_read_lock(); + + blkcg = bio_blkcg(bio); + + /* bypass blkg lookup and use @q->root_rl directly for root */ + if (blkcg == &blkcg_root) + goto root_rl; + + /* + * Try to use blkg->rl. blkg lookup may fail under memory pressure + * or if either the blkcg or queue is going away. Fall back to + * root_rl in such cases. + */ + blkg = blkg_lookup_create(blkcg, q); + if (unlikely(IS_ERR(blkg))) + goto root_rl; + + blkg_get(blkg); + rcu_read_unlock(); + return &blkg->rl; +root_rl: + rcu_read_unlock(); + return &q->root_rl; +} + +/** + * blk_put_rl - put request_list + * @rl: request_list to put + * + * Put the reference acquired by blk_get_rl(). Should be called under + * queue_lock. + */ +static inline void blk_put_rl(struct request_list *rl) +{ + /* root_rl may not have blkg set */ + if (rl->blkg && rl->blkg->blkcg != &blkcg_root) + blkg_put(rl->blkg); +} + +/** + * blk_rq_set_rl - associate a request with a request_list + * @rq: request of interest + * @rl: target request_list + * + * Associate @rq with @rl so that accounting and freeing can know the + * request_list @rq came from. + */ +static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) +{ + rq->rl = rl; +} + +/** + * blk_rq_rl - return the request_list a request came from + * @rq: request of interest + * + * Return the request_list @rq is allocated from. + */ +static inline struct request_list *blk_rq_rl(struct request *rq) +{ + return rq->rl; +} + +struct request_list *__blk_queue_next_rl(struct request_list *rl, + struct request_queue *q); +/** + * blk_queue_for_each_rl - iterate through all request_lists of a request_queue + * + * Should be used under queue_lock. + */ +#define blk_queue_for_each_rl(rl, q) \ + for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) + +static inline void blkg_stat_init(struct blkg_stat *stat) +{ + u64_stats_init(&stat->syncp); +} + +/** + * blkg_stat_add - add a value to a blkg_stat + * @stat: target blkg_stat + * @val: value to add + * + * Add @val to @stat. The caller is responsible for synchronizing calls to + * this function. + */ +static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) +{ + u64_stats_update_begin(&stat->syncp); + stat->cnt += val; + u64_stats_update_end(&stat->syncp); +} + +/** + * blkg_stat_read - read the current value of a blkg_stat + * @stat: blkg_stat to read + * + * Read the current value of @stat. This function can be called without + * synchroniztion and takes care of u64 atomicity. + */ +static inline uint64_t blkg_stat_read(struct blkg_stat *stat) +{ + unsigned int start; + uint64_t v; + + do { + start = u64_stats_fetch_begin_irq(&stat->syncp); + v = stat->cnt; + } while (u64_stats_fetch_retry_irq(&stat->syncp, start)); + + return v; +} + +/** + * blkg_stat_reset - reset a blkg_stat + * @stat: blkg_stat to reset + */ +static inline void blkg_stat_reset(struct blkg_stat *stat) +{ + stat->cnt = 0; +} + +/** + * blkg_stat_merge - merge a blkg_stat into another + * @to: the destination blkg_stat + * @from: the source + * + * Add @from's count to @to. + */ +static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) +{ + blkg_stat_add(to, blkg_stat_read(from)); +} + +static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) +{ + u64_stats_init(&rwstat->syncp); +} + +/** + * blkg_rwstat_add - add a value to a blkg_rwstat + * @rwstat: target blkg_rwstat + * @rw: mask of REQ_{WRITE|SYNC} + * @val: value to add + * + * Add @val to @rwstat. The counters are chosen according to @rw. The + * caller is responsible for synchronizing calls to this function. + */ +static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, + int rw, uint64_t val) +{ + u64_stats_update_begin(&rwstat->syncp); + + if (rw & REQ_WRITE) + rwstat->cnt[BLKG_RWSTAT_WRITE] += val; + else + rwstat->cnt[BLKG_RWSTAT_READ] += val; + if (rw & REQ_SYNC) + rwstat->cnt[BLKG_RWSTAT_SYNC] += val; + else + rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; + + u64_stats_update_end(&rwstat->syncp); +} + +/** + * blkg_rwstat_read - read the current values of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Read the current snapshot of @rwstat and return it as the return value. + * This function can be called without synchronization and takes care of + * u64 atomicity. + */ +static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) +{ + unsigned int start; + struct blkg_rwstat tmp; + + do { + start = u64_stats_fetch_begin_irq(&rwstat->syncp); + tmp = *rwstat; + } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start)); + + return tmp; +} + +/** + * blkg_rwstat_total - read the total count of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Return the total count of @rwstat regardless of the IO direction. This + * function can be called without synchronization and takes care of u64 + * atomicity. + */ +static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) +{ + struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); + + return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; +} + +/** + * blkg_rwstat_reset - reset a blkg_rwstat + * @rwstat: blkg_rwstat to reset + */ +static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) +{ + memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); +} + +/** + * blkg_rwstat_merge - merge a blkg_rwstat into another + * @to: the destination blkg_rwstat + * @from: the source + * + * Add @from's counts to @to. + */ +static inline void blkg_rwstat_merge(struct blkg_rwstat *to, + struct blkg_rwstat *from) +{ + struct blkg_rwstat v = blkg_rwstat_read(from); + int i; + + u64_stats_update_begin(&to->syncp); + for (i = 0; i < BLKG_RWSTAT_NR; i++) + to->cnt[i] += v.cnt[i]; + u64_stats_update_end(&to->syncp); +} + +#else /* CONFIG_BLK_CGROUP */ + +struct cgroup; +struct blkcg; + +struct blkg_policy_data { +}; + +struct blkcg_gq { +}; + +struct blkcg_policy { +}; + +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } +static inline int blkcg_init_queue(struct request_queue *q) { return 0; } +static inline void blkcg_drain_queue(struct request_queue *q) { } +static inline void blkcg_exit_queue(struct request_queue *q) { } +static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } +static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } +static inline int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { return 0; } +static inline void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { } + +static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } + +static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) { return NULL; } +static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } +static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } +static inline void blkg_get(struct blkcg_gq *blkg) { } +static inline void blkg_put(struct blkcg_gq *blkg) { } + +static inline struct request_list *blk_get_rl(struct request_queue *q, + struct bio *bio) { return &q->root_rl; } +static inline void blk_put_rl(struct request_list *rl) { } +static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } +static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } + +#define blk_queue_for_each_rl(rl, q) \ + for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) + +#endif /* CONFIG_BLK_CGROUP */ +#endif /* _BLK_CGROUP_H */ -- cgit v1.2.3 From efa7d1c733d1d2c1a468b85126d70bad9fdf6ba8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:18 -0400 Subject: update !CONFIG_BLK_CGROUP dummies in include/linux/blk-cgroup.h The header file will be used more widely with the pending cgroup writeback support and the current set of dummy declarations aren't enough to handle different config combinations. Update as follows. * Drop the struct cgroup declaration. None of the dummy defs need it. * Define blkcg as an empty struct instead of just declaring it. * Wrap dummy function defs in CONFIG_BLOCK. Some functions use block data types and none of them are to be used w/o block enabled. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index c567865b5f1d..51f95b34d3f0 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -558,8 +558,8 @@ static inline void blkg_rwstat_merge(struct blkg_rwstat *to, #else /* CONFIG_BLK_CGROUP */ -struct cgroup; -struct blkcg; +struct blkcg { +}; struct blkg_policy_data { }; @@ -570,6 +570,8 @@ struct blkcg_gq { struct blkcg_policy { }; +#ifdef CONFIG_BLOCK + static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } static inline int blkcg_init_queue(struct request_queue *q) { return 0; } static inline void blkcg_drain_queue(struct request_queue *q) { } @@ -599,5 +601,6 @@ static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) +#endif /* CONFIG_BLOCK */ #endif /* CONFIG_BLK_CGROUP */ #endif /* _BLK_CGROUP_H */ -- cgit v1.2.3 From 56161634e4824380a67243a4cf3fa52eb1e5d836 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:20 -0400 Subject: memcg: add mem_cgroup_root_css Add global mem_cgroup_root_css which points to the root memcg css. This will be used by cgroup writeback support. If memcg is disabled, it's defined as ERR_PTR(-EINVAL). Signed-off-by: Tejun Heo Cc: Johannes Weiner aCc: Michal Hocko Signed-off-by: Jens Axboe --- include/linux/memcontrol.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5fe6411b5e54..294498f4f6fc 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -68,6 +68,8 @@ enum mem_cgroup_events_index { }; #ifdef CONFIG_MEMCG +extern struct cgroup_subsys_state *mem_cgroup_root_css; + void mem_cgroup_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx, unsigned int nr); @@ -196,6 +198,8 @@ void mem_cgroup_split_huge_fixup(struct page *head); #else /* CONFIG_MEMCG */ struct mem_cgroup; +#define mem_cgroup_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) + static inline void mem_cgroup_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx, unsigned int nr) -- cgit v1.2.3 From 496d5e7560dbb84399dbd92316fc33857aa83900 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:21 -0400 Subject: blkcg: add blkcg_root_css Add global constant blkcg_root_css which points to &blkcg_root.css. This will be used by cgroup writeback support. If blkcg is disabled, it's defined as ERR_PTR(-EINVAL). v2: The declarations moved to include/linux/blk-cgroup.h as suggested by Vivek. Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Jens Axboe Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 51f95b34d3f0..65f0c178fd04 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -134,6 +134,7 @@ struct blkcg_policy { }; extern struct blkcg blkcg_root; +extern struct cgroup_subsys_state * const blkcg_root_css; struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, @@ -570,6 +571,8 @@ struct blkcg_gq { struct blkcg_policy { }; +#define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) + #ifdef CONFIG_BLOCK static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -- cgit v1.2.3 From ec438699a9ae0856c2ce20a50dd39cdc7e92a732 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:22 -0400 Subject: cgroup, block: implement task_get_css() and use it in bio_associate_current() bio_associate_current() currently open codes task_css() and css_tryget_online() to find and pin $current's blkcg css. Abstract it into task_get_css() which is implemented from cgroup side. As a task is always associated with an online css for every subsystem except while the css_set update is propagating, task_get_css() retries till css_tryget_online() succeeds. This is a cleanup and shouldn't lead to noticeable behavior changes. Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Jens Axboe Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/cgroup.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b9cb94c3102a..e7da0aa65b2d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -773,6 +773,31 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task, return task_css_check(task, subsys_id, false); } +/** + * task_get_css - find and get the css for (task, subsys) + * @task: the target task + * @subsys_id: the target subsystem ID + * + * Find the css for the (@task, @subsys_id) combination, increment a + * reference on and return it. This function is guaranteed to return a + * valid css. + */ +static inline struct cgroup_subsys_state * +task_get_css(struct task_struct *task, int subsys_id) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + while (true) { + css = task_css(task, subsys_id); + if (likely(css_tryget_online(css))) + break; + cpu_relax(); + } + rcu_read_unlock(); + return css; +} + /** * task_css_is_root - test whether a task belongs to the root css * @task: the target task -- cgit v1.2.3 From fd383c2d3cae146337cea809de0d622b8b887e6c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:23 -0400 Subject: blkcg: implement task_get_blkcg_css() Implement a wrapper around task_get_css() to acquire the blkcg css for a given task. The wrapper is necessary for cgroup writeback support as there will be places outside blkcg proper trying to acquire blkcg_css and blkio_cgrp_id will be undefined when !CONFIG_BLK_CGROUP. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 65f0c178fd04..4dc643f2046e 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -195,6 +195,12 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) return task_blkcg(current); } +static inline struct cgroup_subsys_state * +task_get_blkcg_css(struct task_struct *task) +{ + return task_get_css(task, blkio_cgrp_id); +} + /** * blkcg_parent - get the parent of a blkcg * @blkcg: blkcg of interest @@ -573,6 +579,12 @@ struct blkcg_policy { #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) +static inline struct cgroup_subsys_state * +task_get_blkcg_css(struct task_struct *task) +{ + return NULL; +} + #ifdef CONFIG_BLOCK static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -- cgit v1.2.3 From 1d933cf096e3aea15f1aec8297657b7a846fab63 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:24 -0400 Subject: blkcg: implement bio_associate_blkcg() Currently, a bio can only be associated with the io_context and blkcg of %current using bio_associate_current(). This is too restrictive for cgroup writeback support. Implement bio_associate_blkcg() which associates a bio with the specified blkcg. bio_associate_blkcg() leaves the io_context unassociated. bio_associate_current() is updated so that it considers a bio as already associated if it has a blkcg_css, instead of an io_context, associated with it. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/bio.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index f0291cf64cc5..5e963a6d7c14 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -482,9 +482,12 @@ extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); #ifdef CONFIG_BLK_CGROUP +int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); int bio_associate_current(struct bio *bio); void bio_disassociate_task(struct bio *bio); #else /* CONFIG_BLK_CGROUP */ +static inline int bio_associate_blkcg(struct bio *bio, + struct cgroup_subsys_state *blkcg_css) { return 0; } static inline int bio_associate_current(struct bio *bio) { return -ENOENT; } static inline void bio_disassociate_task(struct bio *bio) { } #endif /* CONFIG_BLK_CGROUP */ -- cgit v1.2.3 From ad7fa852d3d2816d68a138ebc5bc8967aeb7fd86 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 27 May 2015 20:00:02 -0400 Subject: memcg: implement mem_cgroup_css_from_page() Implement mem_cgroup_css_from_page() which returns the cgroup_subsys_state of the memcg associated with a given page on the default hierarchy. This will be used by cgroup writeback support. This function assumes that page->mem_cgroup association doesn't change until the page is released, which is true on the default hierarchy as long as replace_page_cache_page() is not used. As the only user of replace_page_cache_page() is FUSE which won't support cgroup writeback for the time being, this works for now, and replace_page_cache_page() will soon be updated so that the invariant actually holds. Note that the RCU protected page->mem_cgroup access is consistent with other usages across memcg but ultimately incorrect. These unlocked accesses are missing required barriers. page->mem_cgroup should be made an RCU pointer and updated and accessed using RCU operations. v4: Instead of triggering WARN, return the root css on the traditional hierarchies. This makes the function a lot easier to deal with especially as there's no light way to synchronize against hierarchy rebinding. v3: s/mem_cgroup_migrate()/mem_cgroup_css_from_page()/ v2: Trigger WARN if the function is used on the traditional hierarchies and add comment about the assumed invariant. Signed-off-by: Tejun Heo Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Jens Axboe --- include/linux/memcontrol.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 294498f4f6fc..637ef626008e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -115,6 +115,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, } extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg); +extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page); struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, struct mem_cgroup *, -- cgit v1.2.3 From 4452226ea276e74fc3e252c88d9bb7e8f8e44bf0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:26 -0400 Subject: writeback: move backing_dev_info->state into bdi_writeback Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback) and the role of the separation is unclear. For cgroup support for writeback IOs, a bdi will be updated to host multiple wb's where each wb serves writeback IOs of a different cgroup on the bdi. To achieve that, a wb should carry all states necessary for servicing writeback IOs for a cgroup independently. This patch moves bdi->state into wb. * enum bdi_state is renamed to wb_state and the prefix of all enums is changed from BDI_ to WB_. * Explicit zeroing of bdi->state is removed without adding zeoring of wb->state as the whole data structure is zeroed on init anyway. * As there's still only one bdi_writeback per backing_dev_info, all uses of bdi->state are mechanically replaced with bdi->wb.state introducing no behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Wu Fengguang Cc: drbd-dev@lists.linbit.com Cc: Neil Brown Cc: Alasdair Kergon Cc: Mike Snitzer Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index aff923ae8c4b..eb14f988a63e 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -25,13 +25,13 @@ struct device; struct dentry; /* - * Bits in backing_dev_info.state + * Bits in bdi_writeback.state */ -enum bdi_state { - BDI_async_congested, /* The async (write) queue is getting full */ - BDI_sync_congested, /* The sync queue is getting full */ - BDI_registered, /* bdi_register() was done */ - BDI_writeback_running, /* Writeback is in progress */ +enum wb_state { + WB_async_congested, /* The async (write) queue is getting full */ + WB_sync_congested, /* The sync queue is getting full */ + WB_registered, /* bdi_register() was done */ + WB_writeback_running, /* Writeback is in progress */ }; typedef int (congested_fn)(void *, int); @@ -49,6 +49,7 @@ enum bdi_stat_item { struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ + unsigned long state; /* Always use atomic bitops on this */ unsigned long last_old_flush; /* last old data flush */ struct delayed_work dwork; /* work item used for writeback */ @@ -62,7 +63,6 @@ struct bdi_writeback { struct backing_dev_info { struct list_head bdi_list; unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ - unsigned long state; /* Always use atomic bitops on this */ unsigned int capabilities; /* Device capabilities */ congested_fn *congested_fn; /* Function pointer if device is md/dm */ void *congested_data; /* Pointer to aux data for congested func */ @@ -250,23 +250,23 @@ static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) { if (bdi->congested_fn) return bdi->congested_fn(bdi->congested_data, bdi_bits); - return (bdi->state & bdi_bits); + return (bdi->wb.state & bdi_bits); } static inline int bdi_read_congested(struct backing_dev_info *bdi) { - return bdi_congested(bdi, 1 << BDI_sync_congested); + return bdi_congested(bdi, 1 << WB_sync_congested); } static inline int bdi_write_congested(struct backing_dev_info *bdi) { - return bdi_congested(bdi, 1 << BDI_async_congested); + return bdi_congested(bdi, 1 << WB_async_congested); } static inline int bdi_rw_congested(struct backing_dev_info *bdi) { - return bdi_congested(bdi, (1 << BDI_sync_congested) | - (1 << BDI_async_congested)); + return bdi_congested(bdi, (1 << WB_sync_congested) | + (1 << WB_async_congested)); } enum { -- cgit v1.2.3 From 93f78d882865cb90020d0f80a9523c99cf46924c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:27 -0400 Subject: writeback: move backing_dev_info->bdi_stat[] into bdi_writeback Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback) and the role of the separation is unclear. For cgroup support for writeback IOs, a bdi will be updated to host multiple wb's where each wb serves writeback IOs of a different cgroup on the bdi. To achieve that, a wb should carry all states necessary for servicing writeback IOs for a cgroup independently. This patch moves bdi->bdi_stat[] into wb. * enum bdi_stat_item is renamed to wb_stat_item and the prefix of all enums is changed from BDI_ to WB_. * BDI_STAT_BATCH() -> WB_STAT_BATCH() * [__]{add|inc|dec|sum}_wb_stat(bdi, ...) -> [__]{add|inc}_wb_stat(wb, ...) * bdi_stat[_error]() -> wb_stat[_error]() * bdi_writeout_inc() -> wb_writeout_inc() * stat init is moved to bdi_wb_init() and bdi_wb_exit() is added and frees stat. * As there's still only one bdi_writeback per backing_dev_info, all uses of bdi->stat[] are mechanically replaced with bdi->wb.stat[] introducing no behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Wu Fengguang Cc: Miklos Szeredi Cc: Trond Myklebust Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 68 +++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index eb14f988a63e..fe7a907a4e16 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -36,15 +36,15 @@ enum wb_state { typedef int (congested_fn)(void *, int); -enum bdi_stat_item { - BDI_RECLAIMABLE, - BDI_WRITEBACK, - BDI_DIRTIED, - BDI_WRITTEN, - NR_BDI_STAT_ITEMS +enum wb_stat_item { + WB_RECLAIMABLE, + WB_WRITEBACK, + WB_DIRTIED, + WB_WRITTEN, + NR_WB_STAT_ITEMS }; -#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) +#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ @@ -58,6 +58,8 @@ struct bdi_writeback { struct list_head b_more_io; /* parked for more writeback */ struct list_head b_dirty_time; /* time stamps are dirty */ spinlock_t list_lock; /* protects the b_* lists */ + + struct percpu_counter stat[NR_WB_STAT_ITEMS]; }; struct backing_dev_info { @@ -69,8 +71,6 @@ struct backing_dev_info { char *name; - struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; - unsigned long bw_time_stamp; /* last time write bw is updated */ unsigned long dirtied_stamp; unsigned long written_stamp; /* pages written at bw_time_stamp */ @@ -137,78 +137,74 @@ static inline int wb_has_dirty_io(struct bdi_writeback *wb) !list_empty(&wb->b_more_io); } -static inline void __add_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item, s64 amount) +static inline void __add_wb_stat(struct bdi_writeback *wb, + enum wb_stat_item item, s64 amount) { - __percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH); + __percpu_counter_add(&wb->stat[item], amount, WB_STAT_BATCH); } -static inline void __inc_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline void __inc_wb_stat(struct bdi_writeback *wb, + enum wb_stat_item item) { - __add_bdi_stat(bdi, item, 1); + __add_wb_stat(wb, item, 1); } -static inline void inc_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { unsigned long flags; local_irq_save(flags); - __inc_bdi_stat(bdi, item); + __inc_wb_stat(wb, item); local_irq_restore(flags); } -static inline void __dec_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline void __dec_wb_stat(struct bdi_writeback *wb, + enum wb_stat_item item) { - __add_bdi_stat(bdi, item, -1); + __add_wb_stat(wb, item, -1); } -static inline void dec_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { unsigned long flags; local_irq_save(flags); - __dec_bdi_stat(bdi, item); + __dec_wb_stat(wb, item); local_irq_restore(flags); } -static inline s64 bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { - return percpu_counter_read_positive(&bdi->bdi_stat[item]); + return percpu_counter_read_positive(&wb->stat[item]); } -static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline s64 __wb_stat_sum(struct bdi_writeback *wb, + enum wb_stat_item item) { - return percpu_counter_sum_positive(&bdi->bdi_stat[item]); + return percpu_counter_sum_positive(&wb->stat[item]); } -static inline s64 bdi_stat_sum(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item) { s64 sum; unsigned long flags; local_irq_save(flags); - sum = __bdi_stat_sum(bdi, item); + sum = __wb_stat_sum(wb, item); local_irq_restore(flags); return sum; } -extern void bdi_writeout_inc(struct backing_dev_info *bdi); +extern void wb_writeout_inc(struct bdi_writeback *wb); /* * maximal error of a stat counter. */ -static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi) +static inline unsigned long wb_stat_error(struct bdi_writeback *wb) { #ifdef CONFIG_SMP - return nr_cpu_ids * BDI_STAT_BATCH; + return nr_cpu_ids * WB_STAT_BATCH; #else return 1; #endif -- cgit v1.2.3 From a88a341a73be4ef035ca26170c849f002797da27 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:28 -0400 Subject: writeback: move bandwidth related fields from backing_dev_info into bdi_writeback Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback) and the role of the separation is unclear. For cgroup support for writeback IOs, a bdi will be updated to host multiple wb's where each wb serves writeback IOs of a different cgroup on the bdi. To achieve that, a wb should carry all states necessary for servicing writeback IOs for a cgroup independently. This patch moves bandwidth related fields from backing_dev_info into bdi_writeback. * The moved fields are: bw_time_stamp, dirtied_stamp, written_stamp, write_bandwidth, avg_write_bandwidth, dirty_ratelimit, balanced_dirty_ratelimit, completions and dirty_exceeded. * writeback_chunk_size() and over_bground_thresh() now take @wb instead of @bdi. * bdi_writeout_fraction(bdi, ...) -> wb_writeout_fraction(wb, ...) bdi_dirty_limit(bdi, ...) -> wb_dirty_limit(wb, ...) bdi_position_ration(bdi, ...) -> wb_position_ratio(wb, ...) bdi_update_writebandwidth(bdi, ...) -> wb_update_write_bandwidth(wb, ...) [__]bdi_update_bandwidth(bdi, ...) -> [__]wb_update_bandwidth(wb, ...) bdi_{max|min}_pause(bdi, ...) -> wb_{max|min}_pause(wb, ...) bdi_dirty_limits(bdi, ...) -> wb_dirty_limits(wb, ...) * Init/exits of the relocated fields are moved to bdi_wb_init/exit() respectively. Note that explicit zeroing is dropped in the process as wb's are cleared in entirety anyway. * As there's still only one bdi_writeback per backing_dev_info, all uses of bdi->stat[] are mechanically replaced with bdi->wb.stat[] introducing no behavior changes. v2: Typo in description fixed as suggested by Jan. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Wu Fengguang Cc: Jaegeuk Kim Cc: Steven Whitehouse Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 20 ++++++++++---------- include/linux/writeback.h | 19 +++++++++---------- 2 files changed, 19 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index fe7a907a4e16..2ab06049d812 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -60,16 +60,6 @@ struct bdi_writeback { spinlock_t list_lock; /* protects the b_* lists */ struct percpu_counter stat[NR_WB_STAT_ITEMS]; -}; - -struct backing_dev_info { - struct list_head bdi_list; - unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ - unsigned int capabilities; /* Device capabilities */ - congested_fn *congested_fn; /* Function pointer if device is md/dm */ - void *congested_data; /* Pointer to aux data for congested func */ - - char *name; unsigned long bw_time_stamp; /* last time write bw is updated */ unsigned long dirtied_stamp; @@ -88,6 +78,16 @@ struct backing_dev_info { struct fprop_local_percpu completions; int dirty_exceeded; +}; + +struct backing_dev_info { + struct list_head bdi_list; + unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ + unsigned int capabilities; /* Device capabilities */ + congested_fn *congested_fn; /* Function pointer if device is md/dm */ + void *congested_data; /* Pointer to aux data for congested func */ + + char *name; unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index b2dd371ec0ca..a6b9db7fcee8 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -155,16 +155,15 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); -unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, - unsigned long dirty); - -void __bdi_update_bandwidth(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long start_time); +unsigned long wb_dirty_limit(struct bdi_writeback *wb, unsigned long dirty); + +void __wb_update_bandwidth(struct bdi_writeback *wb, + unsigned long thresh, + unsigned long bg_thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, + unsigned long start_time); void page_writeback_init(void); void balance_dirty_pages_ratelimited(struct address_space *mapping); -- cgit v1.2.3 From f0054bb1e1f3be03cc33369df640db97f10f6172 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:30 -0400 Subject: writeback: move backing_dev_info->wb_lock and ->worklist into bdi_writeback Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback) and the role of the separation is unclear. For cgroup support for writeback IOs, a bdi will be updated to host multiple wb's where each wb serves writeback IOs of a different cgroup on the bdi. To achieve that, a wb should carry all states necessary for servicing writeback IOs for a cgroup independently. This patch moves bdi->wb_lock and ->worklist into wb. * The lock protects bdi->worklist and bdi->wb.dwork scheduling. While moving, rename it to wb->work_lock as wb->wb_lock is confusing. Also, move wb->dwork downwards so that it's colocated with the new ->work_lock and ->work_list fields. * bdi_writeback_workfn() -> wb_workfn() bdi_wakeup_thread_delayed(bdi) -> wb_wakeup_delayed(wb) bdi_wakeup_thread(bdi) -> wb_wakeup(wb) bdi_queue_work(bdi, ...) -> wb_queue_work(wb, ...) __bdi_start_writeback(bdi, ...) -> __wb_start_writeback(wb, ...) get_next_work_item(bdi) -> get_next_work_item(wb) * bdi_wb_shutdown() is renamed to wb_shutdown() and now takes @wb. The function contained parts which belong to the containing bdi rather than the wb itself - testing cap_writeback_dirty and bdi_remove_from_list() invocation. Those are moved to bdi_unregister(). * bdi_wb_{init|exit}() are renamed to wb_{init|exit}(). Initializations of the moved bdi->wb_lock and ->work_list are relocated from bdi_init() to wb_init(). * As there's still only one bdi_writeback per backing_dev_info, all uses of bdi->state are mechanically replaced with bdi->wb.state introducing no behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Wu Fengguang Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 2ab06049d812..d796f49ce87a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -52,7 +52,6 @@ struct bdi_writeback { unsigned long state; /* Always use atomic bitops on this */ unsigned long last_old_flush; /* last old data flush */ - struct delayed_work dwork; /* work item used for writeback */ struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ @@ -78,6 +77,10 @@ struct bdi_writeback { struct fprop_local_percpu completions; int dirty_exceeded; + + spinlock_t work_lock; /* protects work_list & dwork scheduling */ + struct list_head work_list; + struct delayed_work dwork; /* work item used for writeback */ }; struct backing_dev_info { @@ -93,9 +96,6 @@ struct backing_dev_info { unsigned int max_ratio, max_prop_frac; struct bdi_writeback wb; /* default writeback info for this bdi */ - spinlock_t wb_lock; /* protects work_list & wb.dwork scheduling */ - - struct list_head work_list; struct device *dev; @@ -121,9 +121,9 @@ int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); -void bdi_writeback_workfn(struct work_struct *work); +void wb_workfn(struct work_struct *work); int bdi_has_dirty_io(struct backing_dev_info *bdi); -void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); +void wb_wakeup_delayed(struct bdi_writeback *wb); extern spinlock_t bdi_lock; extern struct list_head bdi_list; -- cgit v1.2.3 From 66114cad64bf76a155fec1f0fff0de771cf909d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:32 -0400 Subject: writeback: separate out include/linux/backing-dev-defs.h With the planned cgroup writeback support, backing-dev related declarations will be more widely used across block and cgroup; unfortunately, including backing-dev.h from include/linux/blkdev.h makes cyclic include dependency quite likely. This patch separates out backing-dev-defs.h which only has the essential definitions and updates blkdev.h to include it. c files which need access to more backing-dev details now include backing-dev.h directly. This takes backing-dev.h off the common include dependency chain making it a lot easier to use it across block and cgroup. v2: fs/fat build failure fixed. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 106 +++++++++++++++++++++++++++++++++++++++ include/linux/backing-dev.h | 102 +------------------------------------ include/linux/blkdev.h | 2 +- 3 files changed, 108 insertions(+), 102 deletions(-) create mode 100644 include/linux/backing-dev-defs.h (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h new file mode 100644 index 000000000000..aa18c4bd43c1 --- /dev/null +++ b/include/linux/backing-dev-defs.h @@ -0,0 +1,106 @@ +#ifndef __LINUX_BACKING_DEV_DEFS_H +#define __LINUX_BACKING_DEV_DEFS_H + +#include +#include +#include +#include +#include +#include + +struct page; +struct device; +struct dentry; + +/* + * Bits in bdi_writeback.state + */ +enum wb_state { + WB_async_congested, /* The async (write) queue is getting full */ + WB_sync_congested, /* The sync queue is getting full */ + WB_registered, /* bdi_register() was done */ + WB_writeback_running, /* Writeback is in progress */ +}; + +typedef int (congested_fn)(void *, int); + +enum wb_stat_item { + WB_RECLAIMABLE, + WB_WRITEBACK, + WB_DIRTIED, + WB_WRITTEN, + NR_WB_STAT_ITEMS +}; + +#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) + +struct bdi_writeback { + struct backing_dev_info *bdi; /* our parent bdi */ + + unsigned long state; /* Always use atomic bitops on this */ + unsigned long last_old_flush; /* last old data flush */ + + struct list_head b_dirty; /* dirty inodes */ + struct list_head b_io; /* parked for writeback */ + struct list_head b_more_io; /* parked for more writeback */ + struct list_head b_dirty_time; /* time stamps are dirty */ + spinlock_t list_lock; /* protects the b_* lists */ + + struct percpu_counter stat[NR_WB_STAT_ITEMS]; + + unsigned long bw_time_stamp; /* last time write bw is updated */ + unsigned long dirtied_stamp; + unsigned long written_stamp; /* pages written at bw_time_stamp */ + unsigned long write_bandwidth; /* the estimated write bandwidth */ + unsigned long avg_write_bandwidth; /* further smoothed write bw */ + + /* + * The base dirty throttle rate, re-calculated on every 200ms. + * All the bdi tasks' dirty rate will be curbed under it. + * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit + * in small steps and is much more smooth/stable than the latter. + */ + unsigned long dirty_ratelimit; + unsigned long balanced_dirty_ratelimit; + + struct fprop_local_percpu completions; + int dirty_exceeded; + + spinlock_t work_lock; /* protects work_list & dwork scheduling */ + struct list_head work_list; + struct delayed_work dwork; /* work item used for writeback */ +}; + +struct backing_dev_info { + struct list_head bdi_list; + unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ + unsigned int capabilities; /* Device capabilities */ + congested_fn *congested_fn; /* Function pointer if device is md/dm */ + void *congested_data; /* Pointer to aux data for congested func */ + + char *name; + + unsigned int min_ratio; + unsigned int max_ratio, max_prop_frac; + + struct bdi_writeback wb; /* default writeback info for this bdi */ + + struct device *dev; + + struct timer_list laptop_mode_wb_timer; + +#ifdef CONFIG_DEBUG_FS + struct dentry *debug_dir; + struct dentry *debug_stats; +#endif +}; + +enum { + BLK_RW_ASYNC = 0, + BLK_RW_SYNC = 1, +}; + +void clear_bdi_congested(struct backing_dev_info *bdi, int sync); +void set_bdi_congested(struct backing_dev_info *bdi, int sync); + +#endif /* __LINUX_BACKING_DEV_DEFS_H */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index d796f49ce87a..5e39f7a8efed 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -8,104 +8,11 @@ #ifndef _LINUX_BACKING_DEV_H #define _LINUX_BACKING_DEV_H -#include -#include -#include #include #include #include -#include #include -#include -#include -#include - -struct page; -struct device; -struct dentry; - -/* - * Bits in bdi_writeback.state - */ -enum wb_state { - WB_async_congested, /* The async (write) queue is getting full */ - WB_sync_congested, /* The sync queue is getting full */ - WB_registered, /* bdi_register() was done */ - WB_writeback_running, /* Writeback is in progress */ -}; - -typedef int (congested_fn)(void *, int); - -enum wb_stat_item { - WB_RECLAIMABLE, - WB_WRITEBACK, - WB_DIRTIED, - WB_WRITTEN, - NR_WB_STAT_ITEMS -}; - -#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) - -struct bdi_writeback { - struct backing_dev_info *bdi; /* our parent bdi */ - - unsigned long state; /* Always use atomic bitops on this */ - unsigned long last_old_flush; /* last old data flush */ - - struct list_head b_dirty; /* dirty inodes */ - struct list_head b_io; /* parked for writeback */ - struct list_head b_more_io; /* parked for more writeback */ - struct list_head b_dirty_time; /* time stamps are dirty */ - spinlock_t list_lock; /* protects the b_* lists */ - - struct percpu_counter stat[NR_WB_STAT_ITEMS]; - - unsigned long bw_time_stamp; /* last time write bw is updated */ - unsigned long dirtied_stamp; - unsigned long written_stamp; /* pages written at bw_time_stamp */ - unsigned long write_bandwidth; /* the estimated write bandwidth */ - unsigned long avg_write_bandwidth; /* further smoothed write bw */ - - /* - * The base dirty throttle rate, re-calculated on every 200ms. - * All the bdi tasks' dirty rate will be curbed under it. - * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit - * in small steps and is much more smooth/stable than the latter. - */ - unsigned long dirty_ratelimit; - unsigned long balanced_dirty_ratelimit; - - struct fprop_local_percpu completions; - int dirty_exceeded; - - spinlock_t work_lock; /* protects work_list & dwork scheduling */ - struct list_head work_list; - struct delayed_work dwork; /* work item used for writeback */ -}; - -struct backing_dev_info { - struct list_head bdi_list; - unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ - unsigned int capabilities; /* Device capabilities */ - congested_fn *congested_fn; /* Function pointer if device is md/dm */ - void *congested_data; /* Pointer to aux data for congested func */ - - char *name; - - unsigned int min_ratio; - unsigned int max_ratio, max_prop_frac; - - struct bdi_writeback wb; /* default writeback info for this bdi */ - - struct device *dev; - - struct timer_list laptop_mode_wb_timer; - -#ifdef CONFIG_DEBUG_FS - struct dentry *debug_dir; - struct dentry *debug_stats; -#endif -}; +#include struct backing_dev_info *inode_to_bdi(struct inode *inode); @@ -265,13 +172,6 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi) (1 << WB_async_congested)); } -enum { - BLK_RW_ASYNC = 0, - BLK_RW_SYNC = 1, -}; - -void clear_bdi_congested(struct backing_dev_info *bdi, int sync); -void set_bdi_congested(struct backing_dev_info *bdi, int sync); long congestion_wait(int sync, long timeout); long wait_iff_congested(struct zone *zone, int sync, long timeout); int pdflush_proc_obsolete(struct ctl_table *table, int write, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ccaa9aecd593..60d2726a6b62 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From a212b105b07d75b48b1a166378282e8a77fbf53d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:33 -0400 Subject: bdi: make inode_to_bdi() inline Now that bdi definitions are moved to backing-dev-defs.h, backing-dev.h can include blkdev.h and inline inode_to_bdi() without worrying about introducing circular include dependency. The function gets called from hot paths and fairly trivial. This patch makes inode_to_bdi() and sb_is_blkdev_sb() that the function calls inline. blockdev_superblock and noop_backing_dev_info are EXPORT_GPL'd to allow the inline functions to be used from modules. While at it, make sb_is_blkdev_sb() return bool instead of int. v2: Fixed typo in description as suggested by Jan. Signed-off-by: Tejun Heo Reviewed-by: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 18 ++++++++++++++++-- include/linux/fs.h | 8 +++++++- 2 files changed, 23 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5e39f7a8efed..785782034e86 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -11,11 +11,10 @@ #include #include #include +#include #include #include -struct backing_dev_info *inode_to_bdi(struct inode *inode); - int __must_check bdi_init(struct backing_dev_info *bdi); void bdi_destroy(struct backing_dev_info *bdi); @@ -149,6 +148,21 @@ extern struct backing_dev_info noop_backing_dev_info; int writeback_in_progress(struct backing_dev_info *bdi); +static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) +{ + struct super_block *sb; + + if (!inode) + return &noop_backing_dev_info; + + sb = inode->i_sb; +#ifdef CONFIG_BLOCK + if (sb_is_blkdev_sb(sb)) + return blk_get_backing_dev_info(I_BDEV(inode)); +#endif + return sb->s_bdi; +} + static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) { if (bdi->congested_fn) diff --git a/include/linux/fs.h b/include/linux/fs.h index 1ef63900243c..ce100b87fba3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2240,7 +2240,13 @@ extern struct super_block *freeze_bdev(struct block_device *); extern void emergency_thaw_all(void); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); -extern int sb_is_blkdev_sb(struct super_block *sb); + +extern struct super_block *blockdev_superblock; + +static inline bool sb_is_blkdev_sb(struct super_block *sb) +{ + return sb == blockdev_superblock; +} #else static inline void bd_forget(struct inode *inode) {} static inline int sync_blockdev(struct block_device *bdev) { return 0; } -- cgit v1.2.3 From 4aa9c692e052cf6db99db62a8fe0543e5c455da7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:35 -0400 Subject: bdi: separate out congested state into a separate struct Currently, a wb's (bdi_writeback) congestion state is carried in its ->state field; however, cgroup writeback support will require multiple wb's sharing the same congestion state. This patch separates out congestion state into its own struct - struct bdi_writeback_congested. A new field wb field, wb_congested, points to its associated congested struct. The default wb, bdi->wb, always points to bdi->wb_congested. While this patch adds a layer of indirection, it doesn't introduce any behavior changes. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 14 ++++++++++++-- include/linux/backing-dev.h | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index aa18c4bd43c1..9e9eafa5f5aa 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -16,12 +16,15 @@ struct dentry; * Bits in bdi_writeback.state */ enum wb_state { - WB_async_congested, /* The async (write) queue is getting full */ - WB_sync_congested, /* The sync queue is getting full */ WB_registered, /* bdi_register() was done */ WB_writeback_running, /* Writeback is in progress */ }; +enum wb_congested_state { + WB_async_congested, /* The async (write) queue is getting full */ + WB_sync_congested, /* The sync queue is getting full */ +}; + typedef int (congested_fn)(void *, int); enum wb_stat_item { @@ -34,6 +37,10 @@ enum wb_stat_item { #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) +struct bdi_writeback_congested { + unsigned long state; /* WB_[a]sync_congested flags */ +}; + struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ @@ -48,6 +55,8 @@ struct bdi_writeback { struct percpu_counter stat[NR_WB_STAT_ITEMS]; + struct bdi_writeback_congested *congested; + unsigned long bw_time_stamp; /* last time write bw is updated */ unsigned long dirtied_stamp; unsigned long written_stamp; /* pages written at bw_time_stamp */ @@ -84,6 +93,7 @@ struct backing_dev_info { unsigned int max_ratio, max_prop_frac; struct bdi_writeback wb; /* default writeback info for this bdi */ + struct bdi_writeback_congested wb_congested; struct device *dev; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 785782034e86..bfdaa18ba0a1 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -167,7 +167,7 @@ static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) { if (bdi->congested_fn) return bdi->congested_fn(bdi->congested_data, bdi_bits); - return (bdi->wb.state & bdi_bits); + return (bdi->wb.congested->state & bdi_bits); } static inline int bdi_read_congested(struct backing_dev_info *bdi) -- cgit v1.2.3 From 89e9b9e07a390c50980d10aa37a04631db5a23ab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:36 -0400 Subject: writeback: add {CONFIG|BDI_CAP|FS}_CGROUP_WRITEBACK cgroup writeback requires support from both bdi and filesystem sides. Add BDI_CAP_CGROUP_WRITEBACK and FS_CGROUP_WRITEBACK to indicate support and enable BDI_CAP_CGROUP_WRITEBACK on block based bdi's by default. Also, define CONFIG_CGROUP_WRITEBACK which is enabled if both MEMCG and BLK_CGROUP are enabled. inode_cgwb_enabled() which determines whether a given inode's both bdi and fs support cgroup writeback is added. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 32 +++++++++++++++++++++++++++++++- include/linux/fs.h | 1 + 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index bfdaa18ba0a1..6bb31234e6a9 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -134,12 +134,15 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_NO_WRITEBACK: Don't write pages back * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. + * + * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback. */ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 #define BDI_CAP_NO_ACCT_WB 0x00000004 #define BDI_CAP_STABLE_WRITES 0x00000008 #define BDI_CAP_STRICTLIMIT 0x00000010 +#define BDI_CAP_CGROUP_WRITEBACK 0x00000020 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) @@ -229,4 +232,31 @@ static inline int bdi_sched_wait(void *word) return 0; } -#endif /* _LINUX_BACKING_DEV_H */ +#ifdef CONFIG_CGROUP_WRITEBACK + +/** + * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode + * @inode: inode of interest + * + * cgroup writeback requires support from both the bdi and filesystem. + * Test whether @inode has both. + */ +static inline bool inode_cgwb_enabled(struct inode *inode) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + + return bdi_cap_account_dirty(bdi) && + (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) && + (inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline bool inode_cgwb_enabled(struct inode *inode) +{ + return false; +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + +#endif /* _LINUX_BACKING_DEV_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index ce100b87fba3..74e0ae0626a8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1897,6 +1897,7 @@ struct file_system_type { #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */ +#define FS_CGROUP_WRITEBACK 32 /* Supports cgroup-aware writeback */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); -- cgit v1.2.3 From 52ebea749aaed195245701a8f90a23d672c7a933 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:37 -0400 Subject: writeback: make backing_dev_info host cgroup-specific bdi_writebacks For the planned cgroup writeback support, on each bdi (backing_dev_info), each memcg will be served by a separate wb (bdi_writeback). This patch updates bdi so that a bdi can host multiple wbs (bdi_writebacks). On the default hierarchy, blkcg implicitly enables memcg. This allows using memcg's page ownership for attributing writeback IOs, and every memcg - blkcg combination can be served by its own wb by assigning a dedicated wb to each memcg. This means that there may be multiple wb's of a bdi mapped to the same blkcg. As congested state is per blkcg - bdi combination, those wb's should share the same congested state. This is achieved by tracking congested state via bdi_writeback_congested structs which are keyed by blkcg. bdi->wb remains unchanged and will keep serving the root cgroup. cgwb's (cgroup wb's) for non-root cgroups are created on-demand or looked up while dirtying an inode according to the memcg of the page being dirtied or current task. Each cgwb is indexed on bdi->cgwb_tree by its memcg id. Once an inode is associated with its wb, it can be retrieved using inode_to_wb(). Currently, none of the filesystems has FS_CGROUP_WRITEBACK and all pages will keep being associated with bdi->wb. v3: inode_attach_wb() in account_page_dirtied() moved inside mapping_cap_account_dirty() block where it's known to be !NULL. Also, an unnecessary NULL check before kfree() removed. Both detected by the kbuild bot. v2: Updated so that wb association is per inode and wb is per memcg rather than blkcg. Signed-off-by: Tejun Heo Cc: kbuild test robot Cc: Dan Carpenter Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 59 +++++++++++- include/linux/backing-dev.h | 195 +++++++++++++++++++++++++++++++++++++++ include/linux/blk-cgroup.h | 4 + include/linux/fs.h | 4 + include/linux/memcontrol.h | 4 + 5 files changed, 263 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 9e9eafa5f5aa..a1e9c407a59a 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -2,8 +2,11 @@ #define __LINUX_BACKING_DEV_DEFS_H #include +#include +#include #include #include +#include #include #include #include @@ -37,10 +40,43 @@ enum wb_stat_item { #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) +/* + * For cgroup writeback, multiple wb's may map to the same blkcg. Those + * wb's can operate mostly independently but should share the congested + * state. To facilitate such sharing, the congested state is tracked using + * the following struct which is created on demand, indexed by blkcg ID on + * its bdi, and refcounted. + */ struct bdi_writeback_congested { unsigned long state; /* WB_[a]sync_congested flags */ + +#ifdef CONFIG_CGROUP_WRITEBACK + struct backing_dev_info *bdi; /* the associated bdi */ + atomic_t refcnt; /* nr of attached wb's and blkg */ + int blkcg_id; /* ID of the associated blkcg */ + struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */ +#endif }; +/* + * Each wb (bdi_writeback) can perform writeback operations, is measured + * and throttled, independently. Without cgroup writeback, each bdi + * (bdi_writeback) is served by its embedded bdi->wb. + * + * On the default hierarchy, blkcg implicitly enables memcg. This allows + * using memcg's page ownership for attributing writeback IOs, and every + * memcg - blkcg combination can be served by its own wb by assigning a + * dedicated wb to each memcg, which enables isolation across different + * cgroups and propagation of IO back pressure down from the IO layer upto + * the tasks which are generating the dirty pages to be written back. + * + * A cgroup wb is indexed on its bdi by the ID of the associated memcg, + * refcounted with the number of inodes attached to it, and pins the memcg + * and the corresponding blkcg. As the corresponding blkcg for a memcg may + * change as blkcg is disabled and enabled higher up in the hierarchy, a wb + * is tested for blkcg after lookup and removed from index on mismatch so + * that a new wb for the combination can be created. + */ struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ @@ -78,6 +114,19 @@ struct bdi_writeback { spinlock_t work_lock; /* protects work_list & dwork scheduling */ struct list_head work_list; struct delayed_work dwork; /* work item used for writeback */ + +#ifdef CONFIG_CGROUP_WRITEBACK + struct percpu_ref refcnt; /* used only for !root wb's */ + struct cgroup_subsys_state *memcg_css; /* the associated memcg */ + struct cgroup_subsys_state *blkcg_css; /* and blkcg */ + struct list_head memcg_node; /* anchored at memcg->cgwb_list */ + struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */ + + union { + struct work_struct release_work; + struct rcu_head rcu; + }; +#endif }; struct backing_dev_info { @@ -92,9 +141,13 @@ struct backing_dev_info { unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; - struct bdi_writeback wb; /* default writeback info for this bdi */ - struct bdi_writeback_congested wb_congested; - + struct bdi_writeback wb; /* the root writeback info for this bdi */ + struct bdi_writeback_congested wb_congested; /* its congested state */ +#ifdef CONFIG_CGROUP_WRITEBACK + struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ + struct rb_root cgwb_congested_tree; /* their congested states */ + atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */ +#endif struct device *dev; struct timer_list laptop_mode_wb_timer; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 6bb31234e6a9..8ae59df2e3d1 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -13,6 +13,7 @@ #include #include #include +#include #include int __must_check bdi_init(struct backing_dev_info *bdi); @@ -234,6 +235,16 @@ static inline int bdi_sched_wait(void *word) #ifdef CONFIG_CGROUP_WRITEBACK +struct bdi_writeback_congested * +wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp); +void wb_congested_put(struct bdi_writeback_congested *congested); +struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css, + gfp_t gfp); +void __inode_attach_wb(struct inode *inode, struct page *page); +void wb_memcg_offline(struct mem_cgroup *memcg); +void wb_blkcg_offline(struct blkcg *blkcg); + /** * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode * @inode: inode of interest @@ -250,6 +261,135 @@ static inline bool inode_cgwb_enabled(struct inode *inode) (inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK); } +/** + * wb_tryget - try to increment a wb's refcount + * @wb: bdi_writeback to get + */ +static inline bool wb_tryget(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + return percpu_ref_tryget(&wb->refcnt); + return true; +} + +/** + * wb_get - increment a wb's refcount + * @wb: bdi_writeback to get + */ +static inline void wb_get(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + percpu_ref_get(&wb->refcnt); +} + +/** + * wb_put - decrement a wb's refcount + * @wb: bdi_writeback to put + */ +static inline void wb_put(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + percpu_ref_put(&wb->refcnt); +} + +/** + * wb_find_current - find wb for %current on a bdi + * @bdi: bdi of interest + * + * Find the wb of @bdi which matches both the memcg and blkcg of %current. + * Must be called under rcu_read_lock() which protects the returend wb. + * NULL if not found. + */ +static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) +{ + struct cgroup_subsys_state *memcg_css; + struct bdi_writeback *wb; + + memcg_css = task_css(current, memory_cgrp_id); + if (!memcg_css->parent) + return &bdi->wb; + + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + + /* + * %current's blkcg equals the effective blkcg of its memcg. No + * need to use the relatively expensive cgroup_get_e_css(). + */ + if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id))) + return wb; + return NULL; +} + +/** + * wb_get_create_current - get or create wb for %current on a bdi + * @bdi: bdi of interest + * @gfp: allocation mask + * + * Equivalent to wb_get_create() on %current's memcg. This function is + * called from a relatively hot path and optimizes the common cases using + * wb_find_current(). + */ +static inline struct bdi_writeback * +wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) +{ + struct bdi_writeback *wb; + + rcu_read_lock(); + wb = wb_find_current(bdi); + if (wb && unlikely(!wb_tryget(wb))) + wb = NULL; + rcu_read_unlock(); + + if (unlikely(!wb)) { + struct cgroup_subsys_state *memcg_css; + + memcg_css = task_get_css(current, memory_cgrp_id); + wb = wb_get_create(bdi, memcg_css, gfp); + css_put(memcg_css); + } + return wb; +} + +/** + * inode_attach_wb - associate an inode with its wb + * @inode: inode of interest + * @page: page being dirtied (may be NULL) + * + * If @inode doesn't have its wb, associate it with the wb matching the + * memcg of @page or, if @page is NULL, %current. May be called w/ or w/o + * @inode->i_lock. + */ +static inline void inode_attach_wb(struct inode *inode, struct page *page) +{ + if (!inode->i_wb) + __inode_attach_wb(inode, page); +} + +/** + * inode_detach_wb - disassociate an inode from its wb + * @inode: inode of interest + * + * @inode is being freed. Detach from its wb. + */ +static inline void inode_detach_wb(struct inode *inode) +{ + if (inode->i_wb) { + wb_put(inode->i_wb); + inode->i_wb = NULL; + } +} + +/** + * inode_to_wb - determine the wb of an inode + * @inode: inode of interest + * + * Returns the wb @inode is currently associated with. + */ +static inline struct bdi_writeback *inode_to_wb(struct inode *inode) +{ + return inode->i_wb; +} + #else /* CONFIG_CGROUP_WRITEBACK */ static inline bool inode_cgwb_enabled(struct inode *inode) @@ -257,6 +397,61 @@ static inline bool inode_cgwb_enabled(struct inode *inode) return false; } +static inline struct bdi_writeback_congested * +wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) +{ + return bdi->wb.congested; +} + +static inline void wb_congested_put(struct bdi_writeback_congested *congested) +{ +} + +static inline bool wb_tryget(struct bdi_writeback *wb) +{ + return true; +} + +static inline void wb_get(struct bdi_writeback *wb) +{ +} + +static inline void wb_put(struct bdi_writeback *wb) +{ +} + +static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) +{ + return &bdi->wb; +} + +static inline struct bdi_writeback * +wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) +{ + return &bdi->wb; +} + +static inline void inode_attach_wb(struct inode *inode, struct page *page) +{ +} + +static inline void inode_detach_wb(struct inode *inode) +{ +} + +static inline struct bdi_writeback *inode_to_wb(struct inode *inode) +{ + return &inode_to_bdi(inode)->wb; +} + +static inline void wb_memcg_offline(struct mem_cgroup *memcg) +{ +} + +static inline void wb_blkcg_offline(struct blkcg *blkcg) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* _LINUX_BACKING_DEV_H */ diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 4dc643f2046e..3033eb173eb4 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -53,6 +53,10 @@ struct blkcg { /* TODO: per-policy storage in blkcg */ unsigned int cfq_weight; /* belongs to cfq */ unsigned int cfq_leaf_weight; + +#ifdef CONFIG_CGROUP_WRITEBACK + struct list_head cgwb_list; +#endif }; struct blkg_stat { diff --git a/include/linux/fs.h b/include/linux/fs.h index 74e0ae0626a8..67a42ec95065 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -35,6 +35,7 @@ #include struct backing_dev_info; +struct bdi_writeback; struct export_operations; struct hd_geometry; struct iovec; @@ -635,6 +636,9 @@ struct inode { struct hlist_node i_hash; struct list_head i_wb_list; /* backing dev IO list */ +#ifdef CONFIG_CGROUP_WRITEBACK + struct bdi_writeback *i_wb; /* the associated cgroup wb */ +#endif struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; union { diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 637ef626008e..662a953ea8ad 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -388,6 +388,10 @@ enum { OVER_LIMIT, }; +#ifdef CONFIG_CGROUP_WRITEBACK +struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); +#endif + struct sock; #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) void sock_update_memcg(struct sock *sk); -- cgit v1.2.3 From ce7acfeaf0363c8b75810908448f61af04d38f91 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:38 -0400 Subject: writeback, blkcg: associate each blkcg_gq with the corresponding bdi_writeback_congested A blkg (blkcg_gq) can be congested and decongested independently from other blkgs on the same request_queue. Accordingly, for cgroup writeback support, the congestion status at bdi (backing_dev_info) should be split and updated separately from matching blkg's. This patch prepares by adding blkg->wb_congested and associating a blkg with its matching per-blkcg bdi_writeback_congested on creation. v2: Updated to associate bdi_writeback_congested instead of bdi_writeback. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 3033eb173eb4..07a32b813ed8 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -99,6 +99,12 @@ struct blkcg_gq { struct hlist_node blkcg_node; struct blkcg *blkcg; + /* + * Each blkg gets congested separately and the congestion state is + * propagated to the matching bdi_writeback_congested. + */ + struct bdi_writeback_congested *wb_congested; + /* all non-root blkcg_gq's are guaranteed to have access to parent */ struct blkcg_gq *parent; -- cgit v1.2.3 From ec8a6f2643923ee5b74d24fa8d134240379f436b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:41 -0400 Subject: writeback: make congestion functions per bdi_writeback Currently, all congestion functions take bdi (backing_dev_info) and always operate on the root wb (bdi->wb) and the congestion state from the block layer is propagated only for the root blkcg. This patch introduces {set|clear}_wb_congested() and wb_congested() which take a bdi_writeback_congested and bdi_writeback respectively. The bdi counteparts are now wrappers invoking the wb based functions on @bdi->wb. While converting clear_bdi_congested() to clear_wb_congested(), the local variable declaration order between @wqh and @bit is swapped for cosmetic reason. This patch just adds the new wb based functions. The following patches will apply them. v2: Updated for bdi_writeback_congested. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 14 +++++++++++-- include/linux/backing-dev.h | 45 +++++++++++++++++++++++----------------- 2 files changed, 38 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index a1e9c407a59a..eb386766b5f3 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -163,7 +163,17 @@ enum { BLK_RW_SYNC = 1, }; -void clear_bdi_congested(struct backing_dev_info *bdi, int sync); -void set_bdi_congested(struct backing_dev_info *bdi, int sync); +void clear_wb_congested(struct bdi_writeback_congested *congested, int sync); +void set_wb_congested(struct bdi_writeback_congested *congested, int sync); + +static inline void clear_bdi_congested(struct backing_dev_info *bdi, int sync) +{ + clear_wb_congested(bdi->wb.congested, sync); +} + +static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync) +{ + set_wb_congested(bdi->wb.congested, sync); +} #endif /* __LINUX_BACKING_DEV_DEFS_H */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 8ae59df2e3d1..2c498a2a8268 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -167,27 +167,13 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) return sb->s_bdi; } -static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) +static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) { - if (bdi->congested_fn) - return bdi->congested_fn(bdi->congested_data, bdi_bits); - return (bdi->wb.congested->state & bdi_bits); -} - -static inline int bdi_read_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, 1 << WB_sync_congested); -} - -static inline int bdi_write_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, 1 << WB_async_congested); -} + struct backing_dev_info *bdi = wb->bdi; -static inline int bdi_rw_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, (1 << WB_sync_congested) | - (1 << WB_async_congested)); + if (bdi->congested_fn) + return bdi->congested_fn(bdi->congested_data, cong_bits); + return wb->congested->state & cong_bits; } long congestion_wait(int sync, long timeout); @@ -454,4 +440,25 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) #endif /* CONFIG_CGROUP_WRITEBACK */ +static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits) +{ + return wb_congested(&bdi->wb, cong_bits); +} + +static inline int bdi_read_congested(struct backing_dev_info *bdi) +{ + return bdi_congested(bdi, 1 << WB_sync_congested); +} + +static inline int bdi_write_congested(struct backing_dev_info *bdi) +{ + return bdi_congested(bdi, 1 << WB_async_congested); +} + +static inline int bdi_rw_congested(struct backing_dev_info *bdi) +{ + return bdi_congested(bdi, (1 << WB_sync_congested) | + (1 << WB_async_congested)); +} + #endif /* _LINUX_BACKING_DEV_H */ -- cgit v1.2.3 From d40f75a06dd675808eed385d490ba9468200b23f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:42 -0400 Subject: writeback, blkcg: restructure blk_{set|clear}_queue_congested() blk_{set|clear}_queue_congested() take @q and set or clear, respectively, the congestion state of its bdi's root wb. Because bdi used to be able to handle congestion state only on the root wb, the callers of those functions tested whether the congestion is on the root blkcg and skipped if not. This is cumbersome and makes implementation of per cgroup bdi_writeback congestion state propagation difficult. This patch renames blk_{set|clear}_queue_congested() to blk_{set|clear}_congested(), and makes them take request_list instead of request_queue and test whether the specified request_list is the root one before updating bdi_writeback congestion state. This makes the tests in the callers unnecessary and simplifies them. As there are no external users of these functions, the definitions are moved from include/linux/blkdev.h to block/blk-core.c. This patch doesn't introduce any noticeable behavior difference. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 60d2726a6b62..ab4a27852f1b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -790,25 +790,6 @@ extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern void blk_queue_bio(struct request_queue *q, struct bio *bio); -/* - * A queue has just exitted congestion. Note this in the global counter of - * congested queues, and wake up anyone who was waiting for requests to be - * put back. - */ -static inline void blk_clear_queue_congested(struct request_queue *q, int sync) -{ - clear_bdi_congested(&q->backing_dev_info, sync); -} - -/* - * A queue has just entered congestion. Flag that in the queue's VM-visible - * state flags and increment the global gounter of congested queues. - */ -static inline void blk_set_queue_congested(struct request_queue *q, int sync) -{ - set_bdi_congested(&q->backing_dev_info, sync); -} - extern void blk_start_queue(struct request_queue *q); extern void blk_stop_queue(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); -- cgit v1.2.3 From 703c270887bb5106c4c46a00cc7477d30d5e04f5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:44 -0400 Subject: writeback: implement and use inode_congested() In several places, bdi_congested() and its wrappers are used to determine whether more IOs should be issued. With cgroup writeback support, this question can't be answered solely based on the bdi (backing_dev_info). It's dependent on whether the filesystem and bdi support cgroup writeback and the blkcg the inode is associated with. This patch implements inode_congested() and its wrappers which take @inode and determines the congestion state considering cgroup writeback. The new functions replace bdi_*congested() calls in places where the query is about specific inode and task. There are several filesystem users which also fit this criteria but they should be updated when each filesystem implements cgroup writeback support. v2: Now that a given inode is associated with only one wb, congestion state can be determined independent from the asking task. Drop @task. Spotted by Vivek. Also, converted to take @inode instead of @mapping and renamed to inode_congested(). Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 2c498a2a8268..6f0882105f95 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -230,6 +230,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, void __inode_attach_wb(struct inode *inode, struct page *page); void wb_memcg_offline(struct mem_cgroup *memcg); void wb_blkcg_offline(struct blkcg *blkcg); +int inode_congested(struct inode *inode, int cong_bits); /** * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode @@ -438,8 +439,29 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) { } +static inline int inode_congested(struct inode *inode, int cong_bits) +{ + return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); +} + #endif /* CONFIG_CGROUP_WRITEBACK */ +static inline int inode_read_congested(struct inode *inode) +{ + return inode_congested(inode, 1 << WB_sync_congested); +} + +static inline int inode_write_congested(struct inode *inode) +{ + return inode_congested(inode, 1 << WB_async_congested); +} + +static inline int inode_rw_congested(struct inode *inode) +{ + return inode_congested(inode, (1 << WB_sync_congested) | + (1 << WB_async_congested)); +} + static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits) { return wb_congested(&bdi->wb, cong_bits); -- cgit v1.2.3 From d6c10f1fc8626dc55946f4768ae322b4c57b07dd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:45 -0400 Subject: writeback: implement WB_has_dirty_io wb_state flag Currently, wb_has_dirty_io() determines whether a wb (bdi_writeback) has any dirty inode by testing all three IO lists on each invocation without actively keeping track. For cgroup writeback support, a single bdi will host multiple wb's each of which will host dirty inodes separately and we'll need to make bdi_has_dirty_io(), which currently only represents the root wb, aggregate has_dirty_io from all member wb's, which requires tracking transitions in has_dirty_io state on each wb. This patch introduces inode_wb_list_{move|del}_locked() to consolidate IO list operations leaving queue_io() the only other function which directly manipulates IO lists (via move_expired_inodes()). All three functions are updated to call wb_io_lists_[de]populated() which keep track of whether the wb has dirty inodes or not and record it using the new WB_has_dirty_io flag. inode_wb_list_moved_locked()'s return value indicates whether the wb had no dirty inodes before. mark_inode_dirty() is restructured so that the return value of inode_wb_list_move_locked() can be used for deciding whether to wake up the wb. While at it, change {bdi|wb}_has_dirty_io()'s return values to bool. These functions were returning 0 and 1 before. Also, add a comment explaining the synchronization of wb_state flags. v2: Updated to accommodate b_dirty_time. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 1 + include/linux/backing-dev.h | 8 +++----- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index eb386766b5f3..7a94b7850b7c 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -21,6 +21,7 @@ struct dentry; enum wb_state { WB_registered, /* bdi_register() was done */ WB_writeback_running, /* Writeback is in progress */ + WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ }; enum wb_congested_state { diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 6f0882105f95..3c8403c012ce 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -29,7 +29,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); void wb_workfn(struct work_struct *work); -int bdi_has_dirty_io(struct backing_dev_info *bdi); +bool bdi_has_dirty_io(struct backing_dev_info *bdi); void wb_wakeup_delayed(struct bdi_writeback *wb); extern spinlock_t bdi_lock; @@ -37,11 +37,9 @@ extern struct list_head bdi_list; extern struct workqueue_struct *bdi_wq; -static inline int wb_has_dirty_io(struct bdi_writeback *wb) +static inline bool wb_has_dirty_io(struct bdi_writeback *wb) { - return !list_empty(&wb->b_dirty) || - !list_empty(&wb->b_io) || - !list_empty(&wb->b_more_io); + return test_bit(WB_has_dirty_io, &wb->state); } static inline void __add_wb_stat(struct bdi_writeback *wb, -- cgit v1.2.3 From 766a9d6e60578f1ef6de71f89f022084f8bffc82 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:46 -0400 Subject: writeback: implement backing_dev_info->tot_write_bandwidth cgroup writeback support needs to keep track of the sum of avg_write_bandwidth of all wb's (bdi_writeback's) with dirty inodes to distribute write workload. This patch adds bdi->tot_write_bandwidth and updates inode_wb_list_move_locked(), inode_wb_list_del_locked() and wb_update_write_bandwidth() to adjust it as wb's gain and lose dirty inodes and its avg_write_bandwidth gets updated. As the update events are not synchronized with each other, bdi->tot_write_bandwidth is an atomic_long_t. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 7a94b7850b7c..d631a61f4023 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -142,6 +142,8 @@ struct backing_dev_info { unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; + atomic_long_t tot_write_bandwidth; /* sum of active avg_write_bw */ + struct bdi_writeback wb; /* the root writeback info for this bdi */ struct bdi_writeback_congested wb_congested; /* its congested state */ #ifdef CONFIG_CGROUP_WRITEBACK -- cgit v1.2.3 From 95a46c65e3c09edb9f17dabf2dc16670cd328739 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:47 -0400 Subject: writeback: make bdi_has_dirty_io() take multiple bdi_writeback's into account bdi_has_dirty_io() used to only reflect whether the root wb (bdi_writeback) has dirty inodes. For cgroup writeback support, it needs to take all active wb's into account. If any wb on the bdi has dirty inodes, bdi_has_dirty_io() should return true. To achieve that, as inode_wb_list_{move|del}_locked() now keep track of the dirty state transition of each wb, the number of dirty wbs can be counted in the bdi; however, bdi is already aggregating wb->avg_write_bandwidth which can easily be guaranteed to be > 0 when there are any dirty inodes by ensuring wb->avg_write_bandwidth can't dip below 1. bdi_has_dirty_io() can simply test whether bdi->tot_write_bandwidth is zero or not. While this bumps the value of wb->avg_write_bandwidth to one when it used to be zero, this shouldn't cause any meaningful behavior difference. bdi_has_dirty_io() is made an inline function which tests whether ->tot_write_bandwidth is non-zero. Also, WARN_ON_ONCE()'s on its value are added to inode_wb_list_{move|del}_locked(). Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 8 ++++++-- include/linux/backing-dev.h | 10 +++++++++- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index d631a61f4023..8c857d723023 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -98,7 +98,7 @@ struct bdi_writeback { unsigned long dirtied_stamp; unsigned long written_stamp; /* pages written at bw_time_stamp */ unsigned long write_bandwidth; /* the estimated write bandwidth */ - unsigned long avg_write_bandwidth; /* further smoothed write bw */ + unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */ /* * The base dirty throttle rate, re-calculated on every 200ms. @@ -142,7 +142,11 @@ struct backing_dev_info { unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; - atomic_long_t tot_write_bandwidth; /* sum of active avg_write_bw */ + /* + * Sum of avg_write_bw of wbs with dirty inodes. > 0 if there are + * any dirty wbs, which is depended upon by bdi_has_dirty(). + */ + atomic_long_t tot_write_bandwidth; struct bdi_writeback wb; /* the root writeback info for this bdi */ struct bdi_writeback_congested wb_congested; /* its congested state */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 3c8403c012ce..0839e44105bd 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -29,7 +29,6 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); void wb_workfn(struct work_struct *work); -bool bdi_has_dirty_io(struct backing_dev_info *bdi); void wb_wakeup_delayed(struct bdi_writeback *wb); extern spinlock_t bdi_lock; @@ -42,6 +41,15 @@ static inline bool wb_has_dirty_io(struct bdi_writeback *wb) return test_bit(WB_has_dirty_io, &wb->state); } +static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi) +{ + /* + * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are + * any dirty wbs. See wb_update_write_bandwidth(). + */ + return atomic_long_read(&bdi->tot_write_bandwidth); +} + static inline void __add_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item, s64 amount) { -- cgit v1.2.3 From ebe41ab0c79d5633123f6faa3265a1a63c5f22d8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:50 -0400 Subject: writeback: implement bdi_for_each_wb() This will be used to implement bdi-wide operations which should be distributed across all its cgroup bdi_writebacks. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 63 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0839e44105bd..c7979806baee 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -383,6 +383,61 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return inode->i_wb; } +struct wb_iter { + int start_blkcg_id; + struct radix_tree_iter tree_iter; + void **slot; +}; + +static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter, + struct backing_dev_info *bdi) +{ + struct radix_tree_iter *titer = &iter->tree_iter; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (iter->start_blkcg_id >= 0) { + iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id); + iter->start_blkcg_id = -1; + } else { + iter->slot = radix_tree_next_slot(iter->slot, titer, 0); + } + + if (!iter->slot) + iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0); + if (iter->slot) + return *iter->slot; + return NULL; +} + +static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter, + struct backing_dev_info *bdi, + int start_blkcg_id) +{ + iter->start_blkcg_id = start_blkcg_id; + + if (start_blkcg_id) + return __wb_iter_next(iter, bdi); + else + return &bdi->wb; +} + +/** + * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order + * @wb_cur: cursor struct bdi_writeback pointer + * @bdi: bdi to walk wb's of + * @iter: pointer to struct wb_iter to be used as iteration buffer + * @start_blkcg_id: blkcg ID to start iteration from + * + * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending + * blkcg ID order starting from @start_blkcg_id. @iter is struct wb_iter + * to be used as temp storage during iteration. rcu_read_lock() must be + * held throughout iteration. + */ +#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \ + for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id); \ + (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi)) + #else /* CONFIG_CGROUP_WRITEBACK */ static inline bool inode_cgwb_enabled(struct inode *inode) @@ -445,6 +500,14 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) { } +struct wb_iter { + int next_id; +}; + +#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \ + for ((iter)->next_id = (start_blkcg_id); \ + ({ (wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); ) + static inline int inode_congested(struct inode *inode, int cong_bits) { return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); -- cgit v1.2.3 From c00ddad39f512b1a81e25b7892217ce10efab0f1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:51 -0400 Subject: writeback: remove bdi_start_writeback() bdi_start_writeback() is a thin wrapper on top of __wb_start_writeback() which is used only by laptop_mode_timer_fn(). This patches removes bdi_start_writeback(), renames __wb_start_writeback() to wb_start_writeback() and makes laptop_mode_timer_fn() use it instead. This doesn't cause any functional difference and will ease making laptop_mode_timer_fn() cgroup writeback aware. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c7979806baee..0ff40c228bee 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -25,8 +25,8 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); void bdi_unregister(struct backing_dev_info *bdi); int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); -void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - enum wb_reason reason); +void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, + bool range_cyclic, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); void wb_workfn(struct work_struct *work); void wb_wakeup_delayed(struct bdi_writeback *wb); -- cgit v1.2.3 From bc05873dccd27d75d6acdf812c3edfb181f1ba17 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:53 -0400 Subject: writeback: make writeback_in_progress() take bdi_writeback instead of backing_dev_info writeback_in_progress() currently takes @bdi and returns whether writeback is in progress on its root wb (bdi_writeback). In preparation for cgroup writeback support, make it take wb instead. While at it, make it an inline function. This patch doesn't make any functional difference. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0ff40c228bee..f04956c900ec 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -156,7 +156,17 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); extern struct backing_dev_info noop_backing_dev_info; -int writeback_in_progress(struct backing_dev_info *bdi); +/** + * writeback_in_progress - determine whether there is writeback in progress + * @wb: bdi_writeback of interest + * + * Determine whether there is writeback waiting to be handled against a + * bdi_writeback. + */ +static inline bool writeback_in_progress(struct bdi_writeback *wb) +{ + return test_bit(WB_writeback_running, &wb->state); +} static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) { -- cgit v1.2.3 From 9ecf4866c018aeb304a7b49216c4d183665becb7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:54 -0400 Subject: writeback: make bdi_start_background_writeback() take bdi_writeback instead of backing_dev_info bdi_start_background_writeback() currently takes @bdi and kicks the root wb (bdi_writeback). In preparation for cgroup writeback support, make it take wb instead. This patch doesn't make any functional difference. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index f04956c900ec..9cc11e5b97ca 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -27,7 +27,7 @@ void bdi_unregister(struct backing_dev_info *bdi); int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, bool range_cyclic, enum wb_reason reason); -void bdi_start_background_writeback(struct backing_dev_info *bdi); +void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); void wb_wakeup_delayed(struct bdi_writeback *wb); -- cgit v1.2.3 From cc395d7f1f7b9c740ab6d367ef1f6eb248595dff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:58 -0400 Subject: writeback: implement bdi_wait_for_completion() If the completion of a wb_writeback_work can be waited upon by setting its ->done to a struct completion and waiting on it; however, for cgroup writeback support, it's necessary to issue multiple work items to multiple bdi_writebacks and wait for the completion of all. This patch implements wb_completion which can wait for multiple work items and replaces the struct completion with it. It can be defined using DEFINE_WB_COMPLETION_ONSTACK(), used for multiple work items and waited for by wb_wait_for_completion(). Nobody currently issues multiple work items and this patch doesn't introduce any behavior changes. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 8c857d723023..97a92fa0cdb5 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -155,6 +155,8 @@ struct backing_dev_info { struct rb_root cgwb_congested_tree; /* their congested states */ atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */ #endif + wait_queue_head_t wb_waitq; + struct device *dev; struct timer_list laptop_mode_wb_timer; -- cgit v1.2.3 From f30a7d0cc8d9096d6728fadd0ab024e648010ec0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:14:00 -0400 Subject: writeback: restructure try_writeback_inodes_sb[_nr]() try_writeback_inodes_sb_nr() wraps writeback_inodes_sb_nr() so that it handles s_umount locking and skips if writeback is already in progress. The in progress test is performed on the root wb (bdi_writeback) which isn't sufficient for cgroup writeback support. The test must be done per-wb. To prepare for the change, this patch factors out __writeback_inodes_sb_nr() from writeback_inodes_sb_nr() and adds @skip_if_busy and moves the in progress test right before queueing the wb_writeback_work. try_writeback_inodes_sb_nr() now just grabs s_umount and invokes __writeback_inodes_sb_nr() with asserted @skip_if_busy. This way, later addition of multiple wb handling can skip only the wb's which already have writeback in progress. This swaps the order between in progress test and s_umount test which can flip the return value when writeback is in progress and s_umount is being held by someone else but this shouldn't cause any meaningful difference. It's a fringe condition and the return value is an unsynchronized hint anyway. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/writeback.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a6b9db7fcee8..23af355d5471 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -93,9 +93,9 @@ struct bdi_writeback; void writeback_inodes_sb(struct super_block *, enum wb_reason reason); void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); -int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); -int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, - enum wb_reason reason); +bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); +bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, + enum wb_reason reason); void sync_inodes_sb(struct super_block *); void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); -- cgit v1.2.3 From bafc0dba1e20d84578d7098d32caf63441e5743d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Jun 2015 08:37:23 -0600 Subject: buffer, writeback: make __block_write_full_page() honor cgroup writeback [__]block_write_full_page() is used to implement ->writepage in various filesystems. All writeback logic is now updated to handle cgroup writeback and the block cgroup to issue IOs for is encoded in writeback_control and can be retrieved from the inode; however, [__]block_write_full_page() currently ignores the blkcg indicated by inode and issues all bio's without explicit blkcg association. This patch adds submit_bh_blkcg() which associates the bio with the specified blkio cgroup before issuing and uses it in __block_write_full_page() so that the issued bio's are associated with inode_to_wb_blkcg_css(inode). v2: Updated for per-inode wb association. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Andrew Morton Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 9cc11e5b97ca..e9d7373f5f93 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -393,6 +393,12 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return inode->i_wb; } +static inline struct cgroup_subsys_state * +inode_to_wb_blkcg_css(struct inode *inode) +{ + return inode_to_wb(inode)->blkcg_css; +} + struct wb_iter { int start_blkcg_id; struct radix_tree_iter tree_iter; @@ -510,6 +516,12 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) { } +static inline struct cgroup_subsys_state * +inode_to_wb_blkcg_css(struct inode *inode) +{ + return blkcg_root_css; +} + struct wb_iter { int next_id; }; -- cgit v1.2.3 From 0d960a383ae7aa791b2833e122ba7519d264cf92 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:19 -0400 Subject: writeback: clean up wb_dirty_limit() The function name wb_dirty_limit(), its argument @dirty and the local variable @wb_dirty are mortally confusing given that the function calculates per-wb threshold value not dirty pages, especially given that @dirty and @wb_dirty are used elsewhere for dirty pages. Let's rename the function to wb_calc_thresh() and wb_dirty to wb_thresh. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/writeback.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 23af355d5471..0435c85d4cfa 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -155,7 +155,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); -unsigned long wb_dirty_limit(struct bdi_writeback *wb, unsigned long dirty); +unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); void __wb_update_bandwidth(struct bdi_writeback *wb, unsigned long thresh, -- cgit v1.2.3 From 8a73179956e649df0d4b3250db17734f272d8266 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:20 -0400 Subject: writeback: reorganize [__]wb_update_bandwidth() __wb_update_bandwidth() is called from two places - fs/fs-writeback.c::balance_dirty_pages() and mm/page-writeback.c::wb_writeback(). The latter updates only the write bandwidth while the former also deals with the dirty ratelimit. The two callsites are distinguished by whether @thresh parameter is zero or not, which is cryptic. In addition, the two files define their own different versions of wb_update_bandwidth() on top of __wb_update_bandwidth(), which is confusing to say the least. This patch cleans up [__]wb_update_bandwidth() in the following ways. * __wb_update_bandwidth() now takes explicit @update_ratelimit parameter to gate dirty ratelimit handling. * mm/page-writeback.c::wb_update_bandwidth() is flattened into its caller - balance_dirty_pages(). * fs/fs-writeback.c::wb_update_bandwidth() is moved to mm/page-writeback.c and __wb_update_bandwidth() is made static. * While at it, add a lockdep assertion to __wb_update_bandwidth(). Except for the lockdep addition, this is pure reorganization and doesn't introduce any behavioral changes. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/writeback.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 0435c85d4cfa..80adf3d88d9d 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -157,14 +157,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int, void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); -void __wb_update_bandwidth(struct bdi_writeback *wb, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long start_time); - +void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time); void page_writeback_init(void); void balance_dirty_pages_ratelimited(struct address_space *mapping); -- cgit v1.2.3 From 380c27ca33ebecc9da35aa90c8b3a9154f90aac2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:21 -0400 Subject: writeback: implement wb_domain Dirtyable memory is distributed to a wb (bdi_writeback) according to the relative bandwidth the wb is writing out in the whole system. This distribution is global - each wb is measured against all other wb's and gets the proportinately sized portion of the memory in the whole system. For cgroup writeback, the amount of dirtyable memory is scoped by memcg and thus each wb would need to be measured and controlled in its memcg. IOW, a wb will belong to two writeback domains - the global and memcg domains. Currently, what constitutes the global writeback domain are scattered across a number of global states. This patch starts collecting them into struct wb_domain. * fprop_global which serves as the basis for proportional bandwidth measurement and its period timer are moved into struct wb_domain. * global_wb_domain hosts the states for the global domain. * While at it, flatten wb_writeout_fraction() into its callers. This thin wrapper doesn't provide any actual benefits while getting in the way. This is pure reorganization and doesn't introduce any behavioral changes. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/writeback.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 80adf3d88d9d..3148db1296a2 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -7,6 +7,7 @@ #include #include #include +#include DECLARE_PER_CPU(int, dirty_throttle_leaks); @@ -86,6 +87,36 @@ struct writeback_control { unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ }; +/* + * A wb_domain represents a domain that wb's (bdi_writeback's) belong to + * and are measured against each other in. There always is one global + * domain, global_wb_domain, that every wb in the system is a member of. + * This allows measuring the relative bandwidth of each wb to distribute + * dirtyable memory accordingly. + */ +struct wb_domain { + /* + * Scale the writeback cache size proportional to the relative + * writeout speed. + * + * We do this by keeping a floating proportion between BDIs, based + * on page writeback completions [end_page_writeback()]. Those + * devices that write out pages fastest will get the larger share, + * while the slower will get a smaller share. + * + * We use page writeout completions because we are interested in + * getting rid of dirty pages. Having them written out is the + * primary goal. + * + * We introduce a concept of time, a period over which we measure + * these events, because demand can/will vary over time. The length + * of this period itself is measured in page writeback completions. + */ + struct fprop_global completions; + struct timer_list period_timer; /* timer for aging of completions */ + unsigned long period_time; +}; + /* * fs/fs-writeback.c */ @@ -120,6 +151,7 @@ static inline void laptop_sync_completion(void) { } #endif void throttle_vm_writeout(gfp_t gfp_mask); bool zone_dirty_ok(struct zone *zone); +int wb_domain_init(struct wb_domain *dom, gfp_t gfp); extern unsigned long global_dirty_limit; -- cgit v1.2.3 From dcc25ae76eb7b8ff883eaaab57e30e8f2f085be3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:22 -0400 Subject: writeback: move global_dirty_limit into wb_domain This patch is a part of the series to define wb_domain which represents a domain that wb's (bdi_writeback's) belong to and are measured against each other in. This will enable IO backpressure propagation for cgroup writeback. global_dirty_limit exists to regulate the global dirty threshold which is a property of the wb_domain. This patch moves hard_dirty_limit, dirty_lock, and update_time into wb_domain. This is pure reorganization and doesn't introduce any behavioral changes. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/writeback.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 3148db1296a2..5fdd4e1805e6 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -95,6 +95,8 @@ struct writeback_control { * dirtyable memory accordingly. */ struct wb_domain { + spinlock_t lock; + /* * Scale the writeback cache size proportional to the relative * writeout speed. @@ -115,6 +117,19 @@ struct wb_domain { struct fprop_global completions; struct timer_list period_timer; /* timer for aging of completions */ unsigned long period_time; + + /* + * The dirtyable memory and dirty threshold could be suddenly + * knocked down by a large amount (eg. on the startup of KVM in a + * swapless system). This may throw the system into deep dirty + * exceeded state and throttle heavy/light dirtiers alike. To + * retain good responsiveness, maintain global_dirty_limit for + * tracking slowly down to the knocked down dirty threshold. + * + * Both fields are protected by ->lock. + */ + unsigned long dirty_limit_tstamp; + unsigned long dirty_limit; }; /* @@ -153,7 +168,7 @@ void throttle_vm_writeout(gfp_t gfp_mask); bool zone_dirty_ok(struct zone *zone); int wb_domain_init(struct wb_domain *dom, gfp_t gfp); -extern unsigned long global_dirty_limit; +extern struct wb_domain global_wb_domain; /* These are exported to sysctl. */ extern int dirty_background_ratio; -- cgit v1.2.3 From aa661bbe1e61ce80ca4ae98804f673ede94b0827 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:31 -0400 Subject: writeback: move over_bground_thresh() to mm/page-writeback.c and rename it to wb_over_bg_thresh(). The function is closely tied to the dirty throttling mechanism implemented in page-writeback.c. This relocation will allow future updates necessary for cgroup writeback support. While at it, add function comment. This is pure reorganization and doesn't introduce any behavioral changes. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/writeback.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 5fdd4e1805e6..b57c2786b5aa 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -207,6 +207,7 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time); void page_writeback_init(void); void balance_dirty_pages_ratelimited(struct address_space *mapping); +bool wb_over_bg_thresh(struct bdi_writeback *wb); typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, void *data); -- cgit v1.2.3 From 841710aa6e4acd066ab9fe8c8cb6f4e4e6709d83 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:33 -0400 Subject: writeback: implement memcg wb_domain Dirtyable memory is distributed to a wb (bdi_writeback) according to the relative bandwidth the wb is writing out in the whole system. This distribution is global - each wb is measured against all other wb's and gets the proportinately sized portion of the memory in the whole system. For cgroup writeback, the amount of dirtyable memory is scoped by memcg and thus each wb would need to be measured and controlled in its memcg. IOW, a wb will belong to two writeback domains - the global and memcg domains. The previous patches laid the groundwork to support the two wb_domains and this patch implements memcg wb_domain. memcg->cgwb_domain is initialized on css online and destroyed on css release, wb->memcg_completions is added, and __wb_writeout_inc() is updated to increment completions against both global and memcg wb_domains. The following patches will update balance_dirty_pages() and its subroutines to actually consider memcg wb_domain for throttling. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 1 + include/linux/memcontrol.h | 12 +++++++++++- include/linux/writeback.h | 3 +++ 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 97a92fa0cdb5..8d470b73824f 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -118,6 +118,7 @@ struct bdi_writeback { #ifdef CONFIG_CGROUP_WRITEBACK struct percpu_ref refcnt; /* used only for !root wb's */ + struct fprop_local_percpu memcg_completions; struct cgroup_subsys_state *memcg_css; /* the associated memcg */ struct cgroup_subsys_state *blkcg_css; /* and blkcg */ struct list_head memcg_node; /* anchored at memcg->cgwb_list */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 662a953ea8ad..e3177bed23ea 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -389,8 +389,18 @@ enum { }; #ifdef CONFIG_CGROUP_WRITEBACK + struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); -#endif +struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) +{ + return NULL; +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ struct sock; #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index b57c2786b5aa..04a3786c456f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -167,6 +167,9 @@ static inline void laptop_sync_completion(void) { } void throttle_vm_writeout(gfp_t gfp_mask); bool zone_dirty_ok(struct zone *zone); int wb_domain_init(struct wb_domain *dom, gfp_t gfp); +#ifdef CONFIG_CGROUP_WRITEBACK +void wb_domain_exit(struct wb_domain *dom); +#endif extern struct wb_domain global_wb_domain; -- cgit v1.2.3 From 2529bb3aadc40a93e642f5f3650f63379a964467 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:34 -0400 Subject: writeback: reset wb_domain->dirty_limit[_tstmp] when memcg domain size changes The amount of available memory to a memcg wb_domain can change as memcg configuration changes. A domain's ->dirty_limit exists to smooth out sudden drops in dirty threshold; however, when a domain's size actually drops significantly, it hinders the dirty throttling from adjusting to the new configuration leading to unexpected behaviors including unnecessary OOM kills. This patch resolves the issue by adding wb_domain_size_changed() which resets ->dirty_limit[_tstmp] and making memcg call it on configuration changes. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/writeback.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 04a3786c456f..3b73e97ecfc7 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -132,6 +132,26 @@ struct wb_domain { unsigned long dirty_limit; }; +/** + * wb_domain_size_changed - memory available to a wb_domain has changed + * @dom: wb_domain of interest + * + * This function should be called when the amount of memory available to + * @dom has changed. It resets @dom's dirty limit parameters to prevent + * the past values which don't match the current configuration from skewing + * dirty throttling. Without this, when memory size of a wb_domain is + * greatly reduced, the dirty throttling logic may allow too many pages to + * be dirtied leading to consecutive unnecessary OOMs and may get stuck in + * that situation. + */ +static inline void wb_domain_size_changed(struct wb_domain *dom) +{ + spin_lock(&dom->lock); + dom->dirty_limit_tstamp = jiffies; + dom->dirty_limit = 0; + spin_unlock(&dom->lock); +} + /* * fs/fs-writeback.c */ -- cgit v1.2.3 From c2aa723a6093633ae4ec15b08a4db276643cab3e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:35 -0400 Subject: writeback: implement memcg writeback domain based throttling While cgroup writeback support now connects memcg and blkcg so that writeback IOs are properly attributed and controlled, the IO back pressure propagation mechanism implemented in balance_dirty_pages() and its subroutines wasn't aware of cgroup writeback. Processes belonging to a memcg may have access to only subset of total memory available in the system and not factoring this into dirty throttling rendered it completely ineffective for processes under memcg limits and memcg ended up building a separate ad-hoc degenerate mechanism directly into vmscan code to limit page dirtying. The previous patches updated balance_dirty_pages() and its subroutines so that they can deal with multiple wb_domain's (writeback domains) and defined per-memcg wb_domain. Processes belonging to a non-root memcg are bound to two wb_domains, global wb_domain and memcg wb_domain, and should be throttled according to IO pressures from both domains. This patch updates dirty throttling code so that it repeats similar calculations for the two domains - the differences between the two are few and minor - and applies the lower of the two sets of resulting constraints. wb_over_bg_thresh(), which controls when background writeback terminates, is also updated to consider both global and memcg wb_domains. It returns true if dirty is over bg_thresh for either domain. This makes the dirty throttling mechanism operational for memcg domains including writeback-bandwidth-proportional dirty page distribution inside them but the ad-hoc memcg throttling mechanism in vmscan is still in place. The next patch will rip it out. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/memcontrol.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e3177bed23ea..c3eb19e2bc1c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -392,6 +392,8 @@ enum { struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); +void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail, + unsigned long *pdirty, unsigned long *pwriteback); #else /* CONFIG_CGROUP_WRITEBACK */ @@ -400,6 +402,13 @@ static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) return NULL; } +static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb, + unsigned long *pavail, + unsigned long *pdirty, + unsigned long *pwriteback) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ struct sock; -- cgit v1.2.3 From 21c6321fbb3a3787af07f1bc031d713a707fb69c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 May 2015 14:50:49 -0400 Subject: writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb() Currently, majority of cgroup writeback support including all the above functions are implemented in include/linux/backing-dev.h and mm/backing-dev.c; however, the portion closely related to writeback logic implemented in include/linux/writeback.h and mm/page-writeback.c will expand to support foreign writeback detection and correction. This patch moves wb[_try]_get() and wb_put() to include/linux/backing-dev-defs.h so that they can be used from writeback.h and inode_{attach|detach}_wb() to writeback.h and page-writeback.c. This is pure reorganization and doesn't introduce any functional changes. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 50 ++++++++++++++++++++++++ include/linux/backing-dev.h | 82 ---------------------------------------- include/linux/writeback.h | 46 ++++++++++++++++++++++ 3 files changed, 96 insertions(+), 82 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 8d470b73824f..e047b496a0b9 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -186,4 +186,54 @@ static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync) set_wb_congested(bdi->wb.congested, sync); } +#ifdef CONFIG_CGROUP_WRITEBACK + +/** + * wb_tryget - try to increment a wb's refcount + * @wb: bdi_writeback to get + */ +static inline bool wb_tryget(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + return percpu_ref_tryget(&wb->refcnt); + return true; +} + +/** + * wb_get - increment a wb's refcount + * @wb: bdi_writeback to get + */ +static inline void wb_get(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + percpu_ref_get(&wb->refcnt); +} + +/** + * wb_put - decrement a wb's refcount + * @wb: bdi_writeback to put + */ +static inline void wb_put(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + percpu_ref_put(&wb->refcnt); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline bool wb_tryget(struct bdi_writeback *wb) +{ + return true; +} + +static inline void wb_get(struct bdi_writeback *wb) +{ +} + +static inline void wb_put(struct bdi_writeback *wb) +{ +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + #endif /* __LINUX_BACKING_DEV_DEFS_H */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e9d7373f5f93..5c978a924157 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -243,7 +243,6 @@ void wb_congested_put(struct bdi_writeback_congested *congested); struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp); -void __inode_attach_wb(struct inode *inode, struct page *page); void wb_memcg_offline(struct mem_cgroup *memcg); void wb_blkcg_offline(struct blkcg *blkcg); int inode_congested(struct inode *inode, int cong_bits); @@ -264,37 +263,6 @@ static inline bool inode_cgwb_enabled(struct inode *inode) (inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK); } -/** - * wb_tryget - try to increment a wb's refcount - * @wb: bdi_writeback to get - */ -static inline bool wb_tryget(struct bdi_writeback *wb) -{ - if (wb != &wb->bdi->wb) - return percpu_ref_tryget(&wb->refcnt); - return true; -} - -/** - * wb_get - increment a wb's refcount - * @wb: bdi_writeback to get - */ -static inline void wb_get(struct bdi_writeback *wb) -{ - if (wb != &wb->bdi->wb) - percpu_ref_get(&wb->refcnt); -} - -/** - * wb_put - decrement a wb's refcount - * @wb: bdi_writeback to put - */ -static inline void wb_put(struct bdi_writeback *wb) -{ - if (wb != &wb->bdi->wb) - percpu_ref_put(&wb->refcnt); -} - /** * wb_find_current - find wb for %current on a bdi * @bdi: bdi of interest @@ -353,35 +321,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return wb; } -/** - * inode_attach_wb - associate an inode with its wb - * @inode: inode of interest - * @page: page being dirtied (may be NULL) - * - * If @inode doesn't have its wb, associate it with the wb matching the - * memcg of @page or, if @page is NULL, %current. May be called w/ or w/o - * @inode->i_lock. - */ -static inline void inode_attach_wb(struct inode *inode, struct page *page) -{ - if (!inode->i_wb) - __inode_attach_wb(inode, page); -} - -/** - * inode_detach_wb - disassociate an inode from its wb - * @inode: inode of interest - * - * @inode is being freed. Detach from its wb. - */ -static inline void inode_detach_wb(struct inode *inode) -{ - if (inode->i_wb) { - wb_put(inode->i_wb); - inode->i_wb = NULL; - } -} - /** * inode_to_wb - determine the wb of an inode * @inode: inode of interest @@ -471,19 +410,6 @@ static inline void wb_congested_put(struct bdi_writeback_congested *congested) { } -static inline bool wb_tryget(struct bdi_writeback *wb) -{ - return true; -} - -static inline void wb_get(struct bdi_writeback *wb) -{ -} - -static inline void wb_put(struct bdi_writeback *wb) -{ -} - static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) { return &bdi->wb; @@ -495,14 +421,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return &bdi->wb; } -static inline void inode_attach_wb(struct inode *inode, struct page *page) -{ -} - -static inline void inode_detach_wb(struct inode *inode) -{ -} - static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { return &inode_to_bdi(inode)->wb; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 3b73e97ecfc7..6726b7e56beb 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -8,6 +8,7 @@ #include #include #include +#include DECLARE_PER_CPU(int, dirty_throttle_leaks); @@ -173,6 +174,51 @@ static inline void wait_on_inode(struct inode *inode) wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE); } +#ifdef CONFIG_CGROUP_WRITEBACK + +void __inode_attach_wb(struct inode *inode, struct page *page); + +/** + * inode_attach_wb - associate an inode with its wb + * @inode: inode of interest + * @page: page being dirtied (may be NULL) + * + * If @inode doesn't have its wb, associate it with the wb matching the + * memcg of @page or, if @page is NULL, %current. May be called w/ or w/o + * @inode->i_lock. + */ +static inline void inode_attach_wb(struct inode *inode, struct page *page) +{ + if (!inode->i_wb) + __inode_attach_wb(inode, page); +} + +/** + * inode_detach_wb - disassociate an inode from its wb + * @inode: inode of interest + * + * @inode is being freed. Detach from its wb. + */ +static inline void inode_detach_wb(struct inode *inode) +{ + if (inode->i_wb) { + wb_put(inode->i_wb); + inode->i_wb = NULL; + } +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline void inode_attach_wb(struct inode *inode, struct page *page) +{ +} + +static inline void inode_detach_wb(struct inode *inode) +{ +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + /* * mm/page-writeback.c */ -- cgit v1.2.3 From b16b1deb553adcd7b3b7ce3e6d6fd1b923f314da Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Jun 2015 08:39:48 -0600 Subject: writeback: make writeback_control track the inode being written back Currently, for cgroup writeback, the IO submission paths directly associate the bio's with the blkcg from inode_to_wb_blkcg_css(); however, it'd be necessary to keep more writeback context to implement foreign inode writeback detection. wbc (writeback_control) is the natural fit for the extra context - it persists throughout the writeback of each inode and is passed all the way down to IO submission paths. This patch adds wbc_attach_and_unlock_inode(), wbc_detach_inode(), and wbc_attach_fdatawrite_inode() which are used to associate wbc with the inode being written back. IO submission paths now use wbc_init_bio() instead of directly associating bio's with blkcg themselves. This leaves inode_to_wb_blkcg_css() w/o any user. The function is removed. wbc currently only tracks the associated wb (bdi_writeback). Future patches will add more for foreign inode detection. The association is established under i_lock which will be depended upon when migrating foreign inodes to other wb's. As currently, once established, inode to wb association never changes, going through wbc when initializing bio's doesn't cause any behavior changes. v2: submit_blk_blkcg() now checks whether the wbc is associated with a wb before dereferencing it. This can happen when pageout() is writing pages directly without going through the usual writeback path. As pageout() path is single-threaded, we don't want it to be blocked behind a slow cgroup and ultimately want it to delegate actual writing to the usual writeback path. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 12 -------- include/linux/writeback.h | 68 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5c978a924157..b1d2489a6536 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -332,12 +332,6 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return inode->i_wb; } -static inline struct cgroup_subsys_state * -inode_to_wb_blkcg_css(struct inode *inode) -{ - return inode_to_wb(inode)->blkcg_css; -} - struct wb_iter { int start_blkcg_id; struct radix_tree_iter tree_iter; @@ -434,12 +428,6 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) { } -static inline struct cgroup_subsys_state * -inode_to_wb_blkcg_css(struct inode *inode) -{ - return blkcg_root_css; -} - struct wb_iter { int next_id; }; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 6726b7e56beb..8f964e558af5 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -86,6 +86,9 @@ struct writeback_control { unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ +#ifdef CONFIG_CGROUP_WRITEBACK + struct bdi_writeback *wb; /* wb this writeback is issued under */ +#endif }; /* @@ -176,7 +179,14 @@ static inline void wait_on_inode(struct inode *inode) #ifdef CONFIG_CGROUP_WRITEBACK +#include +#include + void __inode_attach_wb(struct inode *inode, struct page *page); +void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) + __releases(&inode->i_lock); +void wbc_detach_inode(struct writeback_control *wbc); /** * inode_attach_wb - associate an inode with its wb @@ -207,6 +217,44 @@ static inline void inode_detach_wb(struct inode *inode) } } +/** + * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite + * @wbc: writeback_control of interest + * @inode: target inode + * + * This function is to be used by __filemap_fdatawrite_range(), which is an + * alternative entry point into writeback code, and first ensures @inode is + * associated with a bdi_writeback and attaches it to @wbc. + */ +static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, + struct inode *inode) +{ + spin_lock(&inode->i_lock); + inode_attach_wb(inode, NULL); + wbc_attach_and_unlock_inode(wbc, inode); +} + +/** + * wbc_init_bio - writeback specific initializtion of bio + * @wbc: writeback_control for the writeback in progress + * @bio: bio to be initialized + * + * @bio is a part of the writeback in progress controlled by @wbc. Perform + * writeback specific initialization. This is used to apply the cgroup + * writeback context. + */ +static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) +{ + /* + * pageout() path doesn't attach @wbc to the inode being written + * out. This is intentional as we don't want the function to block + * behind a slow cgroup. Ultimately, we want pageout() to kick off + * regular writeback instead of writing things out itself. + */ + if (wbc->wb) + bio_associate_blkcg(bio, wbc->wb->blkcg_css); +} + #else /* CONFIG_CGROUP_WRITEBACK */ static inline void inode_attach_wb(struct inode *inode, struct page *page) @@ -217,6 +265,26 @@ static inline void inode_detach_wb(struct inode *inode) { } +static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) + __releases(&inode->i_lock) +{ + spin_unlock(&inode->i_lock); +} + +static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, + struct inode *inode) +{ +} + +static inline void wbc_detach_inode(struct writeback_control *wbc) +{ +} + +static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ /* -- cgit v1.2.3 From 2a81490811d0296d390c571bb64eaa93e5ed7def Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 May 2015 14:50:51 -0400 Subject: writeback: implement foreign cgroup inode detection As concurrent write sharing of an inode is expected to be very rare and memcg only tracks page ownership on first-use basis severely confining the usefulness of such sharing, cgroup writeback tracks ownership per-inode. While the support for concurrent write sharing of an inode is deemed unnecessary, an inode being written to by different cgroups at different points in time is a lot more common, and, more importantly, charging only by first-use can too readily lead to grossly incorrect behaviors (single foreign page can lead to gigabytes of writeback to be incorrectly attributed). To resolve this issue, cgroup writeback detects the majority dirtier of an inode and will transfer the ownership to it. To avoid unnnecessary oscillation, the detection mechanism keeps track of history and gives out the switch verdict only if the foreign usage pattern is stable over a certain amount of time and/or writeback attempts. The detection mechanism has fairly low space and computation overhead. It adds 8 bytes to struct inode (one int and two u16's) and minimal amount of calculation per IO. The detection mechanism converges to the correct answer usually in several seconds of IO time when there's a clear majority dirtier. Even when there isn't, it can reach an acceptable answer fairly quickly under most circumstances. Please see wb_detach_inode() for more details. This patch only implements detection. Following patches will implement actual switching. v2: wbc_account_io() now checks whether the wbc is associated with a wb before dereferencing it. This can happen when pageout() is writing pages directly without going through the usual writeback path. As pageout() path is single-threaded, we don't want it to be blocked behind a slow cgroup and ultimately want it to delegate actual writing to the usual writeback path. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/fs.h | 5 +++++ include/linux/writeback.h | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 67a42ec95065..740126d7c44e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -638,6 +638,11 @@ struct inode { struct list_head i_wb_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ + + /* foreign inode detection, see wbc_detach_inode() */ + int i_wb_frn_winner; + u16 i_wb_frn_avg_time; + u16 i_wb_frn_history; #endif struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 8f964e558af5..b333c945e571 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -88,6 +88,15 @@ struct writeback_control { unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb; /* wb this writeback is issued under */ + struct inode *inode; /* inode being written out */ + + /* foreign inode detection, see wbc_detach_inode() */ + int wb_id; /* current wb id */ + int wb_lcand_id; /* last foreign candidate wb id */ + int wb_tcand_id; /* this foreign candidate wb id */ + size_t wb_bytes; /* bytes written by current wb */ + size_t wb_lcand_bytes; /* bytes written by last candidate */ + size_t wb_tcand_bytes; /* bytes written by this candidate */ #endif }; @@ -187,6 +196,8 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) __releases(&inode->i_lock); void wbc_detach_inode(struct writeback_control *wbc); +void wbc_account_io(struct writeback_control *wbc, struct page *page, + size_t bytes); /** * inode_attach_wb - associate an inode with its wb @@ -285,6 +296,11 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { } +static inline void wbc_account_io(struct writeback_control *wbc, + struct page *page, size_t bytes) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ /* -- cgit v1.2.3 From 682aa8e1a6a1504a4caaa62e6c2c9daae3757210 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 May 2015 14:50:53 -0400 Subject: writeback: implement unlocked_inode_to_wb transaction and use it for stat updates The mechanism for detecting whether an inode should switch its wb (bdi_writeback) association is now in place. This patch build the framework for the actual switching. This patch adds a new inode flag I_WB_SWITCHING, which has two functions. First, the easy one, it ensures that there's only one switching in progress for a give inode. Second, it's used as a mechanism to synchronize wb stat updates. The two stats, WB_RECLAIMABLE and WB_WRITEBACK, aren't event counters but track the current number of dirty pages and pages under writeback respectively. As such, when an inode is moved from one wb to another, the inode's portion of those stats have to be transferred together; unfortunately, this is a bit tricky as those stat updates are percpu operations which are performed without holding any lock in some places. This patch solves the problem in a similar way as memcg. Each such lockless stat updates are wrapped in transaction surrounded by unlocked_inode_to_wb_begin/end(). During normal operation, they map to rcu_read_lock/unlock(); however, if I_WB_SWITCHING is asserted, mapping->tree_lock is grabbed across the transaction. In turn, the switching path sets I_WB_SWITCHING and waits for a RCU grace period to pass before actually starting to switch, which guarantees that all stat update paths are synchronizing against mapping->tree_lock. This patch still doesn't implement the actual switching. v3: Updated on top of the recent cancel_dirty_page() updates. unlocked_inode_to_wb_begin() now nests inside mem_cgroup_begin_page_stat() to match the locking order. v2: The i_wb access transaction will be used for !stat accesses too. Function names and comments updated accordingly. s/inode_wb_stat_unlocked_{begin|end}/unlocked_inode_to_wb_{begin|end}/ s/switch_wb/switch_wbs/ Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 54 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 6 +++++ include/linux/mm.h | 3 ++- 3 files changed, 62 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index b1d2489a6536..73ffa32e58ee 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -332,6 +332,50 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return inode->i_wb; } +/** + * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction + * @inode: target inode + * @lockedp: temp bool output param, to be passed to the end function + * + * The caller wants to access the wb associated with @inode but isn't + * holding inode->i_lock, mapping->tree_lock or wb->list_lock. This + * function determines the wb associated with @inode and ensures that the + * association doesn't change until the transaction is finished with + * unlocked_inode_to_wb_end(). + * + * The caller must call unlocked_inode_to_wb_end() with *@lockdep + * afterwards and can't sleep during transaction. IRQ may or may not be + * disabled on return. + */ +static inline struct bdi_writeback * +unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) +{ + rcu_read_lock(); + + /* + * Paired with store_release in inode_switch_wb_work_fn() and + * ensures that we see the new wb if we see cleared I_WB_SWITCH. + */ + *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; + + if (unlikely(*lockedp)) + spin_lock_irq(&inode->i_mapping->tree_lock); + return inode_to_wb(inode); +} + +/** + * unlocked_inode_to_wb_end - end inode wb access transaction + * @inode: target inode + * @locked: *@lockedp from unlocked_inode_to_wb_begin() + */ +static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) +{ + if (unlikely(locked)) + spin_unlock_irq(&inode->i_mapping->tree_lock); + + rcu_read_unlock(); +} + struct wb_iter { int start_blkcg_id; struct radix_tree_iter tree_iter; @@ -420,6 +464,16 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return &inode_to_bdi(inode)->wb; } +static inline struct bdi_writeback * +unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) +{ + return inode_to_wb(inode); +} + +static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) +{ +} + static inline void wb_memcg_offline(struct mem_cgroup *memcg) { } diff --git a/include/linux/fs.h b/include/linux/fs.h index 740126d7c44e..b5e1dcfbc5e3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1815,6 +1815,11 @@ struct super_operations { * * I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit(). * + * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to + * synchronize competing switching instances and to tell + * wb stat updates to grab mapping->tree_lock. See + * inode_switch_wb_work_fn() for details. + * * Q: What is the difference between I_WILL_FREE and I_FREEING? */ #define I_DIRTY_SYNC (1 << 0) @@ -1834,6 +1839,7 @@ struct super_operations { #define I_DIRTY_TIME (1 << 11) #define __I_DIRTY_TIME_EXPIRED 12 #define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED) +#define I_WB_SWITCH (1 << 13) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) diff --git a/include/linux/mm.h b/include/linux/mm.h index f48d979ced4b..4024543b4203 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -27,6 +27,7 @@ struct anon_vma_chain; struct file_ra_state; struct user_struct; struct writeback_control; +struct bdi_writeback; #ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; @@ -1214,7 +1215,7 @@ int redirty_page_for_writepage(struct writeback_control *wbc, void account_page_dirtied(struct page *page, struct address_space *mapping, struct mem_cgroup *memcg); void account_page_cleaned(struct page *page, struct address_space *mapping, - struct mem_cgroup *memcg); + struct mem_cgroup *memcg, struct bdi_writeback *wb); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); void cancel_dirty_page(struct page *page); -- cgit v1.2.3 From aaa2cacf8184e2a92accb8e443b1608d65f9a13f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 May 2015 14:50:55 -0400 Subject: writeback: add lockdep annotation to inode_to_wb() With the previous three patches, all operations which acquire wb from inode are either under one of inode->i_lock, mapping->tree_lock or wb->list_lock or protected by unlocked_inode_to_wb transaction. This will be depended upon by foreign inode wb switching. This patch adds lockdep assertion to inode_to_wb() so that usages outside the above list locks can be caught easily. There are three exceptions. * locked_inode_to_wb_and_lock_list() is holding wb->list_lock but the wb may not be the inode's. Ensuring that is the function's role after all. Updated to deref inode->i_wb directly. * inode_wb_stat_unlocked_begin() is usually protected by combination of !I_WB_SWITCH and rcu_read_lock(). Updated to deref inode->i_wb directly. * inode_congested() wants to test whether inode->i_wb is set before starting the transaction. Added inode_to_wb_is_valid() which tests inode->i_wb directly. v5: might_lock() removed. It annotates that the lock is grabbed w/ irq enabled which isn't the case and triggering lockdep warning spuriously. v4: might_lock() added to unlocked_inode_to_wb_begin(). v3: inode_congested() conversion added. v2: locked_inode_to_wb_and_lock_list() was missing in the first version. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 73ffa32e58ee..dfce80869145 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -321,14 +321,34 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return wb; } +/** + * inode_to_wb_is_valid - test whether an inode has a wb associated + * @inode: inode of interest + * + * Returns %true if @inode has a wb associated. May be called without any + * locking. + */ +static inline bool inode_to_wb_is_valid(struct inode *inode) +{ + return inode->i_wb; +} + /** * inode_to_wb - determine the wb of an inode * @inode: inode of interest * - * Returns the wb @inode is currently associated with. + * Returns the wb @inode is currently associated with. The caller must be + * holding either @inode->i_lock, @inode->i_mapping->tree_lock, or the + * associated wb's list_lock. */ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { +#ifdef CONFIG_LOCKDEP + WARN_ON_ONCE(debug_locks && + (!lockdep_is_held(&inode->i_lock) && + !lockdep_is_held(&inode->i_mapping->tree_lock) && + !lockdep_is_held(&inode->i_wb->list_lock))); +#endif return inode->i_wb; } @@ -360,7 +380,12 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) if (unlikely(*lockedp)) spin_lock_irq(&inode->i_mapping->tree_lock); - return inode_to_wb(inode); + + /* + * Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock. + * inode_to_wb() will bark. Deref directly. + */ + return inode->i_wb; } /** @@ -459,6 +484,11 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return &bdi->wb; } +static inline bool inode_to_wb_is_valid(struct inode *inode) +{ + return true; +} + static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { return &inode_to_bdi(inode)->wb; -- cgit v1.2.3 From e8a7abf5a5bd302a1e06a3c21a629eaa4cba57d6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 May 2015 14:50:57 -0400 Subject: writeback: disassociate inodes from dying bdi_writebacks For the purpose of foreign inode detection, wb's (bdi_writeback's) are identified by the associated memcg ID. As we create a separate wb for each memcg, this is enough to identify the active wb's; however, when blkcg is enabled or disabled higher up in the hierarchy, the mapping between memcg and blkcg changes which in turn creates a new wb to service the new mapping. The old wb is unlinked from index and released after all references are drained. The foreign inode detection logic can't detect this condition because both the old and new wb's point to the same memcg and thus never decides to move inodes attached to the old wb to the new one. This patch adds logic to initiate switching immediately in wbc_attach_and_unlock_inode() if the associated wb is dying. We can make the usual foreign detection logic to distinguish the different wb's mapped to the memcg but the dying wb is never gonna be in active service again and there's no point in tracking the usage history and reaching the switch verdict after enough data points are collected. It's already known that the wb has to be switched. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index e047b496a0b9..a48d90e3bcbb 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -219,6 +219,17 @@ static inline void wb_put(struct bdi_writeback *wb) percpu_ref_put(&wb->refcnt); } +/** + * wb_dying - is a wb dying? + * @wb: bdi_writeback of interest + * + * Returns whether @wb is unlinked and being drained. + */ +static inline bool wb_dying(struct bdi_writeback *wb) +{ + return percpu_ref_is_dying(&wb->refcnt); +} + #else /* CONFIG_CGROUP_WRITEBACK */ static inline bool wb_tryget(struct bdi_writeback *wb) @@ -234,6 +245,11 @@ static inline void wb_put(struct bdi_writeback *wb) { } +static inline bool wb_dying(struct bdi_writeback *wb) +{ + return false; +} + #endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* __LINUX_BACKING_DEV_DEFS_H */ -- cgit v1.2.3 From 632dda833e28fe43049a01b4bc53e409176e7843 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 11 May 2015 14:04:50 -0400 Subject: SUNRPC: Clean up bc_send() Clean up: Merge bc_send() into bc_svc_process(). Note: even thought this touches svc.c, it is a client-side change. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/bc_xprt.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h index 2ca67b55e0fe..8df43c9f11dc 100644 --- a/include/linux/sunrpc/bc_xprt.h +++ b/include/linux/sunrpc/bc_xprt.h @@ -37,7 +37,6 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied); void xprt_free_bc_request(struct rpc_rqst *req); int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs); void xprt_destroy_backchannel(struct rpc_xprt *, unsigned int max_reqs); -int bc_send(struct rpc_rqst *req); /* * Determine if a shared backchannel is in use -- cgit v1.2.3 From da7049f834c3582c1ed1a04889bda5b4121973c0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 26 May 2015 13:49:07 -0400 Subject: svcrdma: Remove svc_rdma_xdr_decode_deferred_req() svc_rdma_xdr_decode_deferred_req() indexes an array with an un-byte-swapped value off the wire. Fortunately this function isn't used anywhere, so simply remove it. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index df8edf8ec914..8ad9b6d9d4e0 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -182,7 +182,6 @@ struct svcxprt_rdma { /* svc_rdma_marshal.c */ extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *); -extern int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *); extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, struct rpcrdma_msg *, enum rpcrdma_errcode, u32 *); -- cgit v1.2.3 From e842f2903908934187af7232fb5b21da527d1757 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 4 Jun 2015 09:18:18 +1000 Subject: dax: don't abuse get_block mapping for endio callbacks dax_fault() currently relies on the get_block callback to attach an io completion callback to the mapping buffer head so that it can run unwritten extent conversion after zeroing allocated blocks. Instead of this hack, pass the conversion callback directly into dax_fault() similar to the get_block callback. When the filesystem allocates unwritten extents, it will set the buffer_unwritten() flag, and hence the dax_fault code can call the completion function in the contexts where it is necessary without overloading the mapping buffer head. Note: The changes to ext4 to use this interface are suspect at best. In fact, the way ext4 did this end_io assignment in the first place looks suspect because it only set a completion callback when there wasn't already some other write() call taking place on the same inode. The ext4 end_io code looks rather intricate and fragile with all it's reference counting and passing to different contexts for modification via inode private pointers that aren't protected by locks... Signed-off-by: Dave Chinner Acked-by: Jan Kara Signed-off-by: Dave Chinner --- include/linux/fs.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 35ec87e490b1..c9b4cca9e08d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -70,6 +70,7 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, ssize_t bytes, void *private); +typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate); #define MAY_EXEC 0x00000001 #define MAY_WRITE 0x00000002 @@ -2627,9 +2628,10 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t, int dax_clear_blocks(struct inode *, sector_t block, long size); int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); -int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); +int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, + dax_iodone_t); int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); -#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) +#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod) #ifdef CONFIG_BLOCK typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, -- cgit v1.2.3 From ce5c5d554dc47a4fb4360c84b72231fea081e7a0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 4 Jun 2015 09:18:18 +1000 Subject: dax: expose __dax_fault for filesystems with locking constraints Some filesystems cannot call dax_fault() directly because they have different locking and/or allocation constraints in the page fault IO path. To handle this, we need to follow the same model as the generic block_page_mkwrite code, where the internals are exposed via __block_page_mkwrite() so that filesystems can wrap the correct locking and operations around the outside. This is loosely based on a patch originally from Matthew Willcox. Unlike the original patch, it does not change ext4 code, error returns or unwritten extent conversion handling. It also adds a __dax_mkwrite() wrapper for .page_mkwrite implementations to do the right thing, too. Signed-off-by: Dave Chinner Reviewed-by: Jan Kara Signed-off-by: Dave Chinner --- include/linux/fs.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c9b4cca9e08d..5784377e7c56 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2630,8 +2630,11 @@ int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, dax_iodone_t); +int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, + dax_iodone_t); int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); -#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod) +#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod) +#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod) #ifdef CONFIG_BLOCK typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, -- cgit v1.2.3 From c8fff7bc5bba6bd59cad40441c189c4efe7190f6 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 8 Apr 2015 19:59:20 +0300 Subject: of: return NUMA_NO_NODE from fallback of_node_to_nid() Node 0 might be offline as well as any other numa node, in this case kernel cannot handle memory allocation and crashes. Signed-off-by: Konstantin Khlebnikov Fixes: 0c3f061c195c ("of: implement of_node_to_nid as a weak function") Signed-off-by: Grant Likely --- include/linux/of.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index ddeaae6d2083..ab071742c0c4 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -667,7 +667,10 @@ static inline void of_property_clear_flag(struct property *p, unsigned long flag #if defined(CONFIG_OF) && defined(CONFIG_NUMA) extern int of_node_to_nid(struct device_node *np); #else -static inline int of_node_to_nid(struct device_node *device) { return 0; } +static inline int of_node_to_nid(struct device_node *device) +{ + return NUMA_NO_NODE; +} #endif static inline struct device_node *of_find_matching_node( -- cgit v1.2.3 From 9e7c8f8c62c1e1cda203b5bfaba4575b141e42e7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 4 Jun 2015 16:22:16 -0400 Subject: signals: don't abuse __flush_signals() in selinux_bprm_committed_creds() selinux_bprm_committed_creds()->__flush_signals() is not right, we shouldn't clear TIF_SIGPENDING unconditionally. There can be other reasons for signal_pending(): freezing(), JOBCTL_PENDING_MASK, and potentially more. Also change this code to check fatal_signal_pending() rather than SIGNAL_GROUP_EXIT, it looks a bit better. Now we can kill __flush_signals() before it finds another buggy user. Note: this code looks racy, we can flush a signal which was sent after the task SID has been updated. Signed-off-by: Oleg Nesterov Signed-off-by: Paul Moore --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8222ae40ecb0..4f84aade8b4d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2373,7 +2373,6 @@ extern void sched_dead(struct task_struct *p); extern void proc_caches_init(void); extern void flush_signals(struct task_struct *); -extern void __flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); -- cgit v1.2.3 From 30b7e246a6222f1fbad39b1451273375306fe1e2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 4 Jun 2015 11:21:10 -0400 Subject: svcrdma: Keep rpcrdma_msg fields in network byte-order Fields in struct rpcrdma_msg are __be32. Don't byte-swap these fields when decoding RPC calls and then swap them back for the reply. For the most part, they can be left alone. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 8ad9b6d9d4e0..c03ca0a1b743 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -184,7 +184,7 @@ struct svcxprt_rdma { extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *); extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, struct rpcrdma_msg *, - enum rpcrdma_errcode, u32 *); + enum rpcrdma_errcode, __be32 *); extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int); extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int); extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int, -- cgit v1.2.3 From b7e0b9a965a116341b4ef86ab98ea2843b218271 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 4 Jun 2015 11:21:20 -0400 Subject: svcrdma: Replace GFP_KERNEL in a loop with GFP_NOFAIL At the 2015 LSF/MM, it was requested that memory allocation call sites that request GFP_KERNEL allocations in a loop should be annotated with __GFP_NOFAIL. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index c03ca0a1b743..d26384b22126 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -211,7 +211,6 @@ extern int svc_rdma_sendto(struct svc_rqst *); extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *); extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, enum rpcrdma_errcode); -struct page *svc_rdma_get_page(void); extern int svc_rdma_post_recv(struct svcxprt_rdma *); extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); -- cgit v1.2.3 From 0380a3f37540ad0582b3c749a74fc127af914689 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 4 Jun 2015 11:21:32 -0400 Subject: svcrdma: Add a separate "max data segs macro for svcrdma The server and client maximum are architecturally independent. Allow changing one without affecting the other. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index d26384b22126..cb94ee4181d4 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -172,6 +172,13 @@ struct svcxprt_rdma { #define RDMAXPRT_SQ_PENDING 2 #define RDMAXPRT_CONN_PENDING 3 +#define RPCRDMA_MAX_SVC_SEGS (64) /* server max scatter/gather */ +#if RPCSVC_MAXPAYLOAD < (RPCRDMA_MAX_SVC_SEGS << PAGE_SHIFT) +#define RPCRDMA_MAXPAYLOAD RPCSVC_MAXPAYLOAD +#else +#define RPCRDMA_MAXPAYLOAD (RPCRDMA_MAX_SVC_SEGS << PAGE_SHIFT) +#endif + #define RPCRDMA_LISTEN_BACKLOG 10 /* The default ORD value is based on two outstanding full-size writes with a * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ. */ -- cgit v1.2.3 From 0f41979164a52a1ca0f0a601f90000fc18e3a396 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 1 Jun 2015 22:59:08 -0400 Subject: SUNRPC: Remove unused argument 'tk_ops' in rpc_run_bc_task Signed-off-by: Trond Myklebust --- include/linux/sunrpc/sched.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 5f1e6bd4c316..2aa68976a482 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -205,8 +205,7 @@ struct rpc_wait_queue { */ struct rpc_task *rpc_new_task(const struct rpc_task_setup *); struct rpc_task *rpc_run_task(const struct rpc_task_setup *); -struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req, - const struct rpc_call_ops *ops); +struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req); void rpc_put_task(struct rpc_task *); void rpc_put_task_async(struct rpc_task *); void rpc_exit_task(struct rpc_task *); -- cgit v1.2.3 From 0d2a970d0ae55086520e1e58e572a7acd519429c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 4 Jun 2015 15:37:10 -0400 Subject: SUNRPC: Fix a backchannel race We need to allow the server to send a new request immediately after we've replied to the previous one. Right now, there is a window between the send and the release of the old request in rpc_put_task(), where the server could send us a new backchannel RPC call, and we have no request to service it. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 8b93ef53df3c..bc9c39d6d30d 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -212,7 +212,8 @@ struct rpc_xprt { #if defined(CONFIG_SUNRPC_BACKCHANNEL) struct svc_serv *bc_serv; /* The RPC service which will */ /* process the callback */ - unsigned int bc_alloc_count; /* Total number of preallocs */ + int bc_alloc_count; /* Total number of preallocs */ + atomic_t bc_free_slots; spinlock_t bc_pa_lock; /* Protects the preallocated * items */ struct list_head bc_pa_list; /* List of preallocated -- cgit v1.2.3 From 3f21c265cd5f7ae867cc0e86a1f6d5093f1963cc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Jun 2015 10:57:37 -0600 Subject: block: add blk_set_queue_dying() to blkdev.h We export this function and NVMe wants to use it, but for some reason it was never added to the block header. Do that. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ccaa9aecd593..a31380c35918 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1024,6 +1024,7 @@ bool __must_check blk_get_queue(struct request_queue *); struct request_queue *blk_alloc_queue(gfp_t); struct request_queue *blk_alloc_queue_node(gfp_t, int); extern void blk_put_queue(struct request_queue *); +extern void blk_set_queue_dying(struct request_queue *); /* * block layer runtime pm functions -- cgit v1.2.3 From a5768aa887fb636f0cc4c83a2f1242506aaf50f6 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 1 Jun 2015 14:28:14 -0600 Subject: NVMe: Automatic namespace rescan Namespaces may be dynamically allocated and deleted or attached and detached. This has the driver rescan the device for namespace changes after each device reset or namespace change asynchronous event. There could potentially be many detached namespaces that we don't want polluting /dev/ with unusable block handles, so this will delete disks if the namespace is not active as indicated by the response from identify namespace. This also skips adding the disk if no capacity is provisioned to the namespace in the first place. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 986bf8ad8e93..c0d94ed8ce9a 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -92,6 +92,7 @@ struct nvme_dev { work_func_t reset_workfn; struct work_struct reset_work; struct work_struct probe_work; + struct work_struct scan_work; char name[12]; char serial[20]; char model[40]; -- cgit v1.2.3 From 2e61dfb3602b904966491f260f62c01b9895936a Mon Sep 17 00:00:00 2001 From: Dinh Nguyen Date: Fri, 5 Jun 2015 11:26:13 -0500 Subject: clk: of: helper for filling parent clock array and return num of parents Sprinkled all through the platform clock drivers are code like this to fill the clock parent array: for (i = 0; i < num_parents; ++i) parent_names[i] = of_clk_get_parent_name(np, i); The of_clk_parent_fill() will do the same as the code above, and while at it, return the number of parents as well since the logic of the function is to the walk the clock node to look for the parent. Signed-off-by: Dinh Nguyen [sboyd@codeaurora.org: Fixed kernel-doc] Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 5378c2aba4d2..2e5df069ca34 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -626,6 +626,8 @@ struct clk *of_clk_src_simple_get(struct of_phandle_args *clkspec, void *data); struct clk *of_clk_src_onecell_get(struct of_phandle_args *clkspec, void *data); int of_clk_get_parent_count(struct device_node *np); +int of_clk_parent_fill(struct device_node *np, const char **parents, + unsigned int size); const char *of_clk_get_parent_name(struct device_node *np, int index); void of_clk_init(const struct of_device_id *matches); -- cgit v1.2.3 From cb4a316752709be4a644f070440a8be470d92b7d Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 6 Jun 2015 10:02:14 +1000 Subject: cgroup: use bitmask to filter for_each_subsys Add a new macro for_each_subsys_which that allows all enabled cgroup subsystems to be filtered by a bitmask, such that mask & (1 << ssid) determines if the subsystem is to be processed in the loop body (where ssid is the unique id of the subsystem). Also replace the need_forkexit_callback with two separate bitmasks for each callback to make (ss->{fork,exit}) checks unnecessary. tj: add a short comment for "if (!CGROUP_SUBSYS_COUNT)". Signed-off-by: Aleksa Sarai --- include/linux/cgroup-defs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 26d1cea7929f..c5588c438448 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -490,6 +490,8 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) #else /* CONFIG_CGROUPS */ +#define CGROUP_SUBSYS_COUNT 0 + static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {} static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} -- cgit v1.2.3 From 5179f0ce2f96f155e3bda93b3b82f912dbaddad2 Mon Sep 17 00:00:00 2001 From: Steve Twiss Date: Mon, 8 Jun 2015 16:26:20 -0700 Subject: Input: add OnKey driver for DA9063 MFD part This adds OnKey driver support for DA9063. Signed-off-by: Steve Twiss Signed-off-by: Dmitry Torokhov --- include/linux/mfd/da9063/pdata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mfd/da9063/pdata.h b/include/linux/mfd/da9063/pdata.h index 95c8742215a7..612383bd80ae 100644 --- a/include/linux/mfd/da9063/pdata.h +++ b/include/linux/mfd/da9063/pdata.h @@ -103,6 +103,7 @@ struct da9063; struct da9063_pdata { int (*init)(struct da9063 *da9063); int irq_base; + bool key_power; unsigned flags; struct da9063_regulators_pdata *regulators_pdata; struct led_platform_data *leds_pdata; -- cgit v1.2.3 From b1999477ed91c3c33891acfe0e18a4457e5c4915 Mon Sep 17 00:00:00 2001 From: Guo Zeng Date: Tue, 14 Apr 2015 11:55:55 +0000 Subject: ARM: prima2: move to use REGMAP APIs for rtciobrg all devices behind rtciobrg needs a special way to access. currently they are using a platform-specific API. this patch moves to REGMAP, then clients can use regmap APIs to read/write. for the moment, old APIs are still kept, once all clients move to regmap, old APIs will be dropped. this patch also does minor clean for comments, authors statement. Signed-off-by: Guo Zeng Signed-off-by: Barry Song --- include/linux/rtc/sirfsoc_rtciobrg.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rtc/sirfsoc_rtciobrg.h b/include/linux/rtc/sirfsoc_rtciobrg.h index 2c92e1c8e055..aefd997262e4 100644 --- a/include/linux/rtc/sirfsoc_rtciobrg.h +++ b/include/linux/rtc/sirfsoc_rtciobrg.h @@ -9,10 +9,14 @@ #ifndef _SIRFSOC_RTC_IOBRG_H_ #define _SIRFSOC_RTC_IOBRG_H_ +struct regmap_config; + extern void sirfsoc_rtc_iobrg_besyncing(void); extern u32 sirfsoc_rtc_iobrg_readl(u32 addr); extern void sirfsoc_rtc_iobrg_writel(u32 val, u32 addr); +struct regmap *devm_regmap_init_iobg(struct device *dev, + const struct regmap_config *config); #endif -- cgit v1.2.3 From 0bb979472a7401022109e81dd89d777adea58710 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Jun 2015 08:01:20 -0600 Subject: cfq-iosched: fix the setting of IOPS mode on SSDs A previous commit wanted to make CFQ default to IOPS mode on non-rotational storage, however it did so when the queue was initialized and the non-rotational flag is only set later on in the probe. Add an elevator hook that gets called off the add_disk() path, at that point we know that feature probing has finished, and we can reliably check for the various flags that drivers can set. Fixes: 41c0126b ("block: Make CFQ default to IOPS mode on SSDs") Tested-by: Romain Francoise Signed-off-by: Jens Axboe --- include/linux/elevator.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 45a91474487d..638b324f0291 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -39,6 +39,7 @@ typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct reques typedef int (elevator_init_fn) (struct request_queue *, struct elevator_type *e); typedef void (elevator_exit_fn) (struct elevator_queue *); +typedef void (elevator_registered_fn) (struct request_queue *); struct elevator_ops { @@ -68,6 +69,7 @@ struct elevator_ops elevator_init_fn *elevator_init_fn; elevator_exit_fn *elevator_exit_fn; + elevator_registered_fn *elevator_registered_fn; }; #define ELV_NAME_MAX (16) -- cgit v1.2.3 From 3037e9ea780027d41baaaabb68a749e49e7c8260 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Wed, 10 Jun 2015 21:04:54 +0100 Subject: clk: fixed: Add comment to clk_fixed_set_rate Currently it is not made explicit why clk_fixed_set_rate() can ignore its arguments and unconditionally return success. Add a comment to explain this. We also mark the clk_ops table const since it should never be modified at runtime. Suggested-by: Stephen Boyd Signed-off-by: Daniel Thompson Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 2e5df069ca34..4a943d13625b 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -459,7 +459,7 @@ struct clk_fixed_factor { unsigned int div; }; -extern struct clk_ops clk_fixed_factor_ops; +extern const struct clk_ops clk_fixed_factor_ops; struct clk *clk_register_fixed_factor(struct device *dev, const char *name, const char *parent_name, unsigned long flags, unsigned int mult, unsigned int div); -- cgit v1.2.3 From 3c87ef6efb40f0e339d705c194b2224f854ec38e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 3 Jun 2015 16:14:25 -0400 Subject: sunrpc: keep a count of swapfiles associated with the rpc_clnt Jerome reported seeing a warning pop when working with a swapfile on NFS. The nfs_swap_activate can end up calling sk_set_memalloc while holding the rcu_read_lock and that function can sleep. To fix that, we need to take a reference to the xprt while holding the rcu_read_lock, set the socket up for swapping and then drop that reference. But, xprt_put is not exported and having NFS deal with the underlying xprt is a bit of layering violation anyway. Fix this by adding a set of activate/deactivate functions that take a rpc_clnt pointer instead of an rpc_xprt, and have nfs_swap_activate and nfs_swap_deactivate call those. Also, add a per-rpc_clnt atomic counter to keep track of the number of active swapfiles associated with it. When the counter does a 0->1 transition, we enable swapping on the xprt, when we do a 1->0 transition we disable swapping on it. This also allows us to be a bit more selective with the RPC_TASK_SWAPPER flag. If non-swapper and swapper clnts are sharing a xprt, then we only need to flag the tasks from the swapper clnt with that flag. Acked-by: Mel Gorman Reported-by: Jerome Marchand Signed-off-by: Jeff Layton Reviewed-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 1 + include/linux/sunrpc/sched.h | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 598ba80ec30c..131032f15cc1 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -56,6 +56,7 @@ struct rpc_clnt { struct rpc_rtt * cl_rtt; /* RTO estimator data */ const struct rpc_timeout *cl_timeout; /* Timeout strategy */ + atomic_t cl_swapper; /* swapfile count */ int cl_nodelen; /* nodename length */ char cl_nodename[UNX_MAXNODENAME+1]; struct rpc_pipe_dir_head cl_pipedir_objects; diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 2aa68976a482..d703f0ef37d8 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -268,4 +268,20 @@ static inline void rpc_assign_waitqueue_name(struct rpc_wait_queue *q, } #endif +#if IS_ENABLED(CONFIG_SUNRPC_SWAP) +int rpc_clnt_swap_activate(struct rpc_clnt *clnt); +void rpc_clnt_swap_deactivate(struct rpc_clnt *clnt); +#else +static inline int +rpc_clnt_swap_activate(struct rpc_clnt *clnt) +{ + return -EINVAL; +} + +static inline void +rpc_clnt_swap_deactivate(struct rpc_clnt *clnt) +{ +} +#endif /* CONFIG_SUNRPC_SWAP */ + #endif /* _LINUX_SUNRPC_SCHED_H_ */ -- cgit v1.2.3 From 8e2281330f9930bccf77cf04027ec60b6cc0fb34 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 3 Jun 2015 16:14:26 -0400 Subject: sunrpc: make xprt->swapper an atomic_t Split xs_swapper into enable/disable functions and eliminate the "enable" flag. Currently, it's racy if you have multiple swapon/swapoff operations running in parallel over the same xprt. Also fix it so that we only set it to a memalloc socket on a 0->1 transition and only clear it on a 1->0 transition. Cc: Mel Gorman Signed-off-by: Jeff Layton Reviewed-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index bc9c39d6d30d..9a3308703c15 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -180,7 +180,7 @@ struct rpc_xprt { atomic_t num_reqs; /* total slots */ unsigned long state; /* transport state */ unsigned char resvport : 1; /* use a reserved port */ - unsigned int swapper; /* we're swapping over this + atomic_t swapper; /* we're swapping over this transport */ unsigned int bind_index; /* bind function index */ @@ -346,7 +346,8 @@ void xprt_release_rqst_cong(struct rpc_task *task); void xprt_disconnect_done(struct rpc_xprt *xprt); void xprt_force_disconnect(struct rpc_xprt *xprt); void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); -int xs_swapper(struct rpc_xprt *xprt, int enable); +int xs_swapper_enable(struct rpc_xprt *xprt); +void xs_swapper_disable(struct rpc_xprt *xprt); bool xprt_lock_connect(struct rpc_xprt *, struct rpc_task *, void *); void xprt_unlock_connect(struct rpc_xprt *, void *); -- cgit v1.2.3 From d67fa4d85a2143b46052b2e9ccc6749a4c97b2de Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 3 Jun 2015 16:14:29 -0400 Subject: sunrpc: turn swapper_enable/disable functions into rpc_xprt_ops RDMA xprts don't have a sock_xprt, but an rdma_xprt, so the xs_swapper_enable/disable functions will likely oops when fed an RDMA xprt. Turn these functions into rpc_xprt_ops so that that doesn't occur. For now the RDMA versions are no-ops that just return -EINVAL on an attempt to swapon. Cc: Chuck Lever Signed-off-by: Jeff Layton Reviewed-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 9a3308703c15..b6096592b1d4 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -133,6 +133,8 @@ struct rpc_xprt_ops { void (*close)(struct rpc_xprt *xprt); void (*destroy)(struct rpc_xprt *xprt); void (*print_stats)(struct rpc_xprt *xprt, struct seq_file *seq); + int (*enable_swap)(struct rpc_xprt *xprt); + void (*disable_swap)(struct rpc_xprt *xprt); }; /* @@ -328,6 +330,18 @@ static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 * return p + xprt->tsh_size; } +static inline int +xprt_enable_swap(struct rpc_xprt *xprt) +{ + return xprt->ops->enable_swap(xprt); +} + +static inline void +xprt_disable_swap(struct rpc_xprt *xprt) +{ + xprt->ops->disable_swap(xprt); +} + /* * Transport switch helper functions */ @@ -346,8 +360,6 @@ void xprt_release_rqst_cong(struct rpc_task *task); void xprt_disconnect_done(struct rpc_xprt *xprt); void xprt_force_disconnect(struct rpc_xprt *xprt); void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); -int xs_swapper_enable(struct rpc_xprt *xprt); -void xs_swapper_disable(struct rpc_xprt *xprt); bool xprt_lock_connect(struct rpc_xprt *, struct rpc_task *, void *); void xprt_unlock_connect(struct rpc_xprt *, void *); -- cgit v1.2.3 From 11598b8ff2b97cf034d0c025cf125c92b574bafc Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 10 Jun 2015 16:54:28 -0400 Subject: NFS: Remove unused nfs_rw_ops->rw_release() function This was only ever set to nfs_writeback_release_common(), a function which is completely empty. Let's just drop this function pointer and simplify the code a bit. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- include/linux/nfs_page.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 3eb072dbce83..f2f650f136ee 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -67,7 +67,6 @@ struct nfs_rw_ops { const fmode_t rw_mode; struct nfs_pgio_header *(*rw_alloc_header)(void); void (*rw_free_header)(struct nfs_pgio_header *); - void (*rw_release)(struct nfs_pgio_header *); int (*rw_done)(struct rpc_task *, struct nfs_pgio_header *, struct inode *); void (*rw_result)(struct rpc_task *, struct nfs_pgio_header *); -- cgit v1.2.3 From 4a06825839889cc1756d0dd8a52d6b1071ee0263 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 11 May 2015 14:02:25 -0400 Subject: SUNRPC: Transport fault injection It has been exceptionally useful to exercise the logic that handles local immediate errors and RDMA connection loss. To enable developers to test this regularly and repeatably, add logic to simulate connection loss every so often. Fault injection is disabled by default. It is enabled with $ sudo echo xxx > /sys/kernel/debug/sunrpc/inject_fault/disconnect where "xxx" is a large positive number of transport method calls before a disconnect. A value of several thousand is usually a good number that allows reasonable forward progress while still causing a lot of connection drops. These hooks are disabled when SUNRPC_DEBUG is turned off. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index b6096592b1d4..0fb9acbb4780 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -135,6 +135,7 @@ struct rpc_xprt_ops { void (*print_stats)(struct rpc_xprt *xprt, struct seq_file *seq); int (*enable_swap)(struct rpc_xprt *xprt); void (*disable_swap)(struct rpc_xprt *xprt); + void (*inject_disconnect)(struct rpc_xprt *xprt); }; /* @@ -244,6 +245,7 @@ struct rpc_xprt { const char *address_strings[RPC_DISPLAY_MAX]; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) struct dentry *debugfs; /* debugfs directory */ + atomic_t inject_disconnect; #endif }; @@ -445,6 +447,23 @@ static inline int xprt_test_and_set_binding(struct rpc_xprt *xprt) return test_and_set_bit(XPRT_BINDING, &xprt->state); } +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +extern unsigned int rpc_inject_disconnect; +static inline void xprt_inject_disconnect(struct rpc_xprt *xprt) +{ + if (!rpc_inject_disconnect) + return; + if (atomic_dec_return(&xprt->inject_disconnect)) + return; + atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect); + xprt->ops->inject_disconnect(xprt); +} +#else +static inline void xprt_inject_disconnect(struct rpc_xprt *xprt) +{ +} +#endif + #endif /* __KERNEL__*/ #endif /* _LINUX_SUNRPC_XPRT_H */ -- cgit v1.2.3 From dfabde206aa10ae71a89ba75e68b1f58a6336a05 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Mon, 11 May 2015 17:08:50 +0100 Subject: mailbox: Add ability for clients to request channels by name This patch supplies a new framework API; mbox_request_channel_byname(). It works by supplying the usual client pointer as the first argument and a string as the second. The API will search the client's node for a 'mbox-names' property then request a channel in the normal way using the requested string's index as the expected second 'index' argument. Signed-off-by: Lee Jones Signed-off-by: Jassi Brar --- include/linux/mailbox_client.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mailbox_client.h b/include/linux/mailbox_client.h index 1726ccbd8009..44348710953f 100644 --- a/include/linux/mailbox_client.h +++ b/include/linux/mailbox_client.h @@ -40,6 +40,8 @@ struct mbox_client { void (*tx_done)(struct mbox_client *cl, void *mssg, int r); }; +struct mbox_chan *mbox_request_channel_byname(struct mbox_client *cl, + const char *name); struct mbox_chan *mbox_request_channel(struct mbox_client *cl, int index); int mbox_send_message(struct mbox_chan *chan, void *mssg); void mbox_client_txdone(struct mbox_chan *chan, int r); /* atomic */ -- cgit v1.2.3 From 87d001ef5366c4a24f7a1340246c4ce68190581c Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 27 May 2015 16:01:52 +0200 Subject: dmaengine: Move icg helpers to global header Now that we can have ICGs set for both the source and destination (using the icg field of struct data_chunk) or for only the source or the destination (using the dst_icg or src_icg respectively), and that these fields can be ignored depending on other parameters (src_inc, src_sgl, etc.), the logic to get the actual ICG value can be quite tricky. The XDMAC driver was already implementing it, but since we will need it in other drivers, we can move it to the main header file. Signed-off-by: Maxime Ripard Acked-by: Ludovic Desroches Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index ad419757241f..499c530bcbaa 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -874,6 +874,33 @@ static inline int dma_maxpq(struct dma_device *dma, enum dma_ctrl_flags flags) BUG(); } +static inline size_t dmaengine_get_icg(bool inc, bool sgl, size_t icg, + size_t dir_icg) +{ + if (inc) { + if (dir_icg) + return dir_icg; + else if (sgl) + return icg; + } + + return 0; +} + +static inline size_t dmaengine_get_dst_icg(struct dma_interleaved_template *xt, + struct data_chunk *chunk) +{ + return dmaengine_get_icg(xt->dst_inc, xt->dst_sgl, + chunk->icg, chunk->dst_icg); +} + +static inline size_t dmaengine_get_src_icg(struct dma_interleaved_template *xt, + struct data_chunk *chunk) +{ + return dmaengine_get_icg(xt->src_inc, xt->src_sgl, + chunk->icg, chunk->src_icg); +} + /* --- public DMA engine API --- */ #ifdef CONFIG_DMA_ENGINE -- cgit v1.2.3 From 4983a501afede12f95d26e1e213f8f2e9eda1871 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 18 May 2015 13:46:15 +0200 Subject: dmaengine: Revert "drivers/dma: remove unused support for MEMSET operations" This reverts commit 48a9db462d99494583dad829969616ac90a8df4e. Some platforms actually need support for the memset operations. Bring it back. Signed-off-by: Maxime Ripard Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index ad419757241f..19face3168b4 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -65,6 +65,7 @@ enum dma_transaction_type { DMA_PQ, DMA_XOR_VAL, DMA_PQ_VAL, + DMA_MEMSET, DMA_INTERRUPT, DMA_SG, DMA_PRIVATE, @@ -570,6 +571,7 @@ struct dma_tx_state { * @copy_align: alignment shift for memcpy operations * @xor_align: alignment shift for xor operations * @pq_align: alignment shift for pq operations + * @fill_align: alignment shift for memset operations * @dev_id: unique device ID * @dev: struct device reference for dma mapping api * @src_addr_widths: bit mask of src addr widths the device supports @@ -588,6 +590,7 @@ struct dma_tx_state { * @device_prep_dma_xor_val: prepares a xor validation operation * @device_prep_dma_pq: prepares a pq operation * @device_prep_dma_pq_val: prepares a pqzero_sum operation + * @device_prep_dma_memset: prepares a memset operation * @device_prep_dma_interrupt: prepares an end of chain interrupt operation * @device_prep_slave_sg: prepares a slave dma operation * @device_prep_dma_cyclic: prepare a cyclic dma operation suitable for audio. @@ -620,6 +623,7 @@ struct dma_device { u8 copy_align; u8 xor_align; u8 pq_align; + u8 fill_align; #define DMA_HAS_PQ_CONTINUE (1 << 15) int dev_id; @@ -650,6 +654,9 @@ struct dma_device { struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src, unsigned int src_cnt, const unsigned char *scf, size_t len, enum sum_check_flags *pqres, unsigned long flags); + struct dma_async_tx_descriptor *(*device_prep_dma_memset)( + struct dma_chan *chan, dma_addr_t dest, int value, size_t len, + unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_interrupt)( struct dma_chan *chan, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_sg)( @@ -745,6 +752,17 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_interleaved_dma( return chan->device->device_prep_interleaved_dma(chan, xt, flags); } +static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memset( + struct dma_chan *chan, dma_addr_t dest, int value, size_t len, + unsigned long flags) +{ + if (!chan || !chan->device) + return NULL; + + return chan->device->device_prep_dma_memset(chan, dest, value, + len, flags); +} + static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_sg( struct dma_chan *chan, struct scatterlist *dst_sg, unsigned int dst_nents, @@ -820,6 +838,12 @@ static inline bool is_dma_pq_aligned(struct dma_device *dev, size_t off1, return dmaengine_check_align(dev->pq_align, off1, off2, len); } +static inline bool is_dma_fill_aligned(struct dma_device *dev, size_t off1, + size_t off2, size_t len) +{ + return dmaengine_check_align(dev->fill_align, off1, off2, len); +} + static inline void dma_set_maxpq(struct dma_device *dma, int maxpq, int has_pq_continue) { -- cgit v1.2.3 From 7e53df111beea8db2543424d07bdee2a630698c3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 26 May 2015 11:53:04 -0400 Subject: xprtrdma: Remove rpcrdma_ia::ri_memreg_strategy Clean up: This field is no longer used. Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Reviewed-by: Sagi Grimberg Reviewed-by: Devesh Sharma Tested-By: Devesh Sharma Reviewed-by: Doug Ledford Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprtrdma.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h index c984c85981ea..b17613052cc3 100644 --- a/include/linux/sunrpc/xprtrdma.h +++ b/include/linux/sunrpc/xprtrdma.h @@ -56,7 +56,8 @@ #define RPCRDMA_INLINE_PAD_THRESH (512)/* payload threshold to pad (bytes) */ -/* memory registration strategies */ +/* Memory registration strategies, by number. + * This is part of a kernel / user space API. Do not remove. */ enum rpcrdma_memreg { RPCRDMA_BOUNCEBUFFERS = 0, RPCRDMA_REGISTER, -- cgit v1.2.3 From 73b6ecdb93e8e77752cae9077c424fcdc6f23c39 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Fri, 12 Jun 2015 11:10:06 +0900 Subject: extcon: Redefine the unique id of supported external connectors without 'enum extcon' type This patch just redefine the unique id of supported external connectors without 'enum extcon' type. Because unique id would be used on devictree file(*.dts) to indicate the specific external connectors like key number of input framework. So, I have the plan to move this definitions to following header file which includes the unique id of supported external connectors. - include/dt-bindings/extcon/extcon.h Fixes: 2a9de9c0f08d ("extcon: Use the unique id for external connector instead of string") Signed-off-by: Chanwoo Choi Signed-off-by: Greg Kroah-Hartman --- include/linux/extcon.h | 90 +++++++++++++++++++++++--------------------------- 1 file changed, 42 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/extcon.h b/include/linux/extcon.h index a7b224b20ecc..b16d929fa75f 100644 --- a/include/linux/extcon.h +++ b/include/linux/extcon.h @@ -30,41 +30,35 @@ #include #include -enum extcon { - EXTCON_NONE = 0x0, - - /* USB external connector */ - EXTCON_USB = 0x1, - EXTCON_USB_HOST = 0x2, - - /* Charger external connector */ - EXTCON_TA = 0x10, - EXTCON_FAST_CHARGER = 0x11, - EXTCON_SLOW_CHARGER = 0x12, - EXTCON_CHARGE_DOWNSTREAM = 0x13, - - /* Audio/Video external connector */ - EXTCON_LINE_IN = 0x20, - EXTCON_LINE_OUT = 0x21, - EXTCON_MICROPHONE = 0x22, - EXTCON_HEADPHONE = 0x23, - - EXTCON_HDMI = 0x30, - EXTCON_MHL = 0x31, - EXTCON_DVI = 0x32, - EXTCON_VGA = 0x33, - EXTCON_SPDIF_IN = 0x34, - EXTCON_SPDIF_OUT = 0x35, - EXTCON_VIDEO_IN = 0x36, - EXTCON_VIDEO_OUT = 0x37, - - /* Etc external connector */ - EXTCON_DOCK = 0x50, - EXTCON_JIG = 0x51, - EXTCON_MECHANICAL = 0x52, - - EXTCON_END, -}; +/* + * Define the unique id of supported external connectors + */ +#define EXTCON_NONE 0 + +#define EXTCON_USB 1 /* USB connector */ +#define EXTCON_USB_HOST 2 + +#define EXTCON_TA 3 /* Charger connector */ +#define EXTCON_FAST_CHARGER 4 +#define EXTCON_SLOW_CHARGER 5 +#define EXTCON_CHARGE_DOWNSTREAM 6 + +#define EXTCON_LINE_IN 7 /* Audio/Video connector */ +#define EXTCON_LINE_OUT 8 +#define EXTCON_MICROPHONE 9 +#define EXTCON_HEADPHONE 10 +#define EXTCON_HDMI 11 +#define EXTCON_MHL 12 +#define EXTCON_DVI 13 +#define EXTCON_VGA 14 +#define EXTCON_SPDIF_IN 15 +#define EXTCON_SPDIF_OUT 16 +#define EXTCON_VIDEO_IN 17 +#define EXTCON_VIDEO_OUT 18 + +#define EXTCON_DOCK 19 /* Misc connector */ +#define EXTCON_JIG 20 +#define EXTCON_MECHANICAL 21 struct extcon_cable; @@ -105,7 +99,7 @@ struct extcon_cable; struct extcon_dev { /* Optional user initializing data */ const char *name; - const enum extcon *supported_cable; + const unsigned int *supported_cable; const u32 *mutually_exclusive; /* Optional callbacks to override class functions */ @@ -182,10 +176,10 @@ extern struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name); /* * Following APIs control the memory of extcon device. */ -extern struct extcon_dev *extcon_dev_allocate(const enum extcon *cable); +extern struct extcon_dev *extcon_dev_allocate(const unsigned int *cable); extern void extcon_dev_free(struct extcon_dev *edev); extern struct extcon_dev *devm_extcon_dev_allocate(struct device *dev, - const enum extcon *cable); + const unsigned int *cable); extern void devm_extcon_dev_free(struct device *dev, struct extcon_dev *edev); /* @@ -206,8 +200,8 @@ extern int extcon_update_state(struct extcon_dev *edev, u32 mask, u32 state); * get/set_cable_state access each bit of the 32b encoded state value. * They are used to access the status of each cable based on the cable_name. */ -extern int extcon_get_cable_state_(struct extcon_dev *edev, enum extcon id); -extern int extcon_set_cable_state_(struct extcon_dev *edev, enum extcon id, +extern int extcon_get_cable_state_(struct extcon_dev *edev, unsigned int id); +extern int extcon_set_cable_state_(struct extcon_dev *edev, unsigned int id, bool cable_state); extern int extcon_get_cable_state(struct extcon_dev *edev, @@ -234,9 +228,9 @@ extern int extcon_unregister_interest(struct extcon_specific_cable_nb *nb); * we do not recommend to use this for normal 'notifiee' device drivers who * want to be notified by a specific external port of the notifier. */ -extern int extcon_register_notifier(struct extcon_dev *edev, enum extcon id, +extern int extcon_register_notifier(struct extcon_dev *edev, unsigned int id, struct notifier_block *nb); -extern int extcon_unregister_notifier(struct extcon_dev *edev, enum extcon id, +extern int extcon_unregister_notifier(struct extcon_dev *edev, unsigned int id, struct notifier_block *nb); /* @@ -266,7 +260,7 @@ static inline int devm_extcon_dev_register(struct device *dev, static inline void devm_extcon_dev_unregister(struct device *dev, struct extcon_dev *edev) { } -static inline struct extcon_dev *extcon_dev_allocate(const enum extcon *cable) +static inline struct extcon_dev *extcon_dev_allocate(const unsigned int *cable) { return ERR_PTR(-ENOSYS); } @@ -274,7 +268,7 @@ static inline struct extcon_dev *extcon_dev_allocate(const enum extcon *cable) static inline void extcon_dev_free(struct extcon_dev *edev) { } static inline struct extcon_dev *devm_extcon_dev_allocate(struct device *dev, - const enum extcon *cable) + const unsigned int *cable) { return ERR_PTR(-ENOSYS); } @@ -298,13 +292,13 @@ static inline int extcon_update_state(struct extcon_dev *edev, u32 mask, } static inline int extcon_get_cable_state_(struct extcon_dev *edev, - enum extcon id) + unsigned int id) { return 0; } static inline int extcon_set_cable_state_(struct extcon_dev *edev, - enum extcon id, bool cable_state) + unsigned int id, bool cable_state) { return 0; } @@ -327,14 +321,14 @@ static inline struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name) } static inline int extcon_register_notifier(struct extcon_dev *edev, - enum extcon id, + unsigned int id, struct notifier_block *nb) { return 0; } static inline int extcon_unregister_notifier(struct extcon_dev *edev, - enum extcon id, + unsigned int id, struct notifier_block *nb) { return 0; -- cgit v1.2.3 From ef73f886b53548d83d71a439f51a0c13ea6c1dae Mon Sep 17 00:00:00 2001 From: Dmitry Kalinkin Date: Thu, 28 May 2015 15:07:04 +0300 Subject: vme: export vme_check_window() Signed-off-by: Dmitry Kalinkin Cc: Igor Alekseev Signed-off-by: Greg Kroah-Hartman --- include/linux/vme.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vme.h b/include/linux/vme.h index 79242e9c06b8..c0131358f351 100644 --- a/include/linux/vme.h +++ b/include/linux/vme.h @@ -120,6 +120,8 @@ void vme_free_consistent(struct vme_resource *, size_t, void *, dma_addr_t); size_t vme_get_size(struct vme_resource *); +int vme_check_window(u32 aspace, unsigned long long vme_base, + unsigned long long size); struct vme_resource *vme_slave_request(struct vme_dev *, u32, u32); int vme_slave_set(struct vme_resource *, int, unsigned long long, -- cgit v1.2.3 From 6f6a6fda294506dfe0e3e0a253bb2d2923f28f0a Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 15 Jun 2015 14:36:01 -0400 Subject: jbd2: fix ocfs2 corrupt when updating journal superblock fails If updating journal superblock fails after journal data has been flushed, the error is omitted and this will mislead the caller as a normal case. In ocfs2, the checkpoint will be treated successfully and the other node can get the lock to update. Since the sb_start is still pointing to the old log block, it will rewrite the journal data during journal recovery by the other node. Thus the new updates will be overwritten and ocfs2 corrupts. So in above case we have to return the error, and ocfs2_commit_cache will take care of the error and prevent the other node to do update first. And only after recovering journal it can do the new updates. The issue discussion mail can be found at: https://oss.oracle.com/pipermail/ocfs2-devel/2015-June/010856.html http://comments.gmane.org/gmane.comp.file-systems.ext4/48841 [ Fixed bug in patch which allowed a non-negative error return from jbd2_cleanup_journal_tail() to leak out of jbd2_fjournal_flush(); this was causing xfstests ext4/306 to fail. -- Ted ] Reported-by: Yiwen Jiang Signed-off-by: Joseph Qi Signed-off-by: Theodore Ts'o Tested-by: Yiwen Jiang Cc: Junxiao Bi Cc: stable@vger.kernel.org --- include/linux/jbd2.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 20e7f78041c8..edb640ae9a94 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1035,7 +1035,7 @@ struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal); int jbd2_journal_next_log_block(journal_t *, unsigned long long *); int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, unsigned long *block); -void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); +int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); /* Commit management */ @@ -1157,7 +1157,7 @@ extern int jbd2_journal_recover (journal_t *journal); extern int jbd2_journal_wipe (journal_t *, int); extern int jbd2_journal_skip_recovery (journal_t *); extern void jbd2_journal_update_sb_errno(journal_t *); -extern void jbd2_journal_update_sb_log_tail (journal_t *, tid_t, +extern int jbd2_journal_update_sb_log_tail (journal_t *, tid_t, unsigned long, int); extern void __jbd2_journal_abort_hard (journal_t *); extern void jbd2_journal_abort (journal_t *, int); -- cgit v1.2.3 From 764ad8ba8cd4c6f836fca9378f8c5121aece0842 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 9 Jun 2015 19:43:56 -0400 Subject: nfs: increase size of EXCHANGE_ID name string buffer The current buffer is much too small if you have a relatively long hostname. Bring it up to the size of the one that SETCLIENTID has. Cc: Reported-by: Michael Skralivetsky Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 93ab6071bbe9..e9e9a8dcfb47 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1142,7 +1142,7 @@ struct nfs41_state_protection { struct nfs4_op_map allow; }; -#define NFS4_EXCHANGE_ID_LEN (48) +#define NFS4_EXCHANGE_ID_LEN (127) struct nfs41_exchange_id_args { struct nfs_client *client; nfs4_verifier *verifier; -- cgit v1.2.3 From 3a6bb738792500e8b4534c0350c13a132bac0492 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 9 Jun 2015 19:43:57 -0400 Subject: nfs: convert setclientid and exchange_id encoders to use clp->cl_owner_id ...instead of buffers that are part of their arg structs. We already hold a reference to the client, so we might as well use the allocated buffer. In the event that we can't allocate the clp->cl_owner_id, then just return -ENOMEM. Note too that we switch from a GFP_KERNEL allocation here to GFP_NOFS. It's possible we could end up trying to do a SETCLIENTID or EXCHANGE_ID in order to reclaim some memory, and the GFP_KERNEL allocations in the existing code could cause recursion back into NFS reclaim. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index e9e9a8dcfb47..6c0b423d781b 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -994,7 +994,7 @@ struct nfs4_setclientid { char sc_netid[RPCBIND_MAXNETIDLEN + 1]; unsigned int sc_uaddr_len; char sc_uaddr[RPCBIND_MAXUADDRLEN + 1]; - u32 sc_cb_ident; + struct nfs_client *sc_clnt; struct rpc_cred *sc_cred; }; -- cgit v1.2.3 From 873e385116b2cc5c7daca8f51881371fadb90970 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 9 Jun 2015 19:44:00 -0400 Subject: nfs: make nfs4_init_uniform_client_string use a dynamically allocated buffer Change the uniform client string generator to dynamically allocate the NFSv4 client name string buffer. With this patch, we can eliminate the buffers that are embedded within the "args" structs and simply use the name string that is hanging off the client. This uniform string case is a little simpler than the nonuniform since we don't need to deal with RCU, but we do have two different cases, depending on whether there is a uniquifier or not. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 6c0b423d781b..c959e7eb5bd4 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -984,11 +984,8 @@ struct nfs4_readlink_res { struct nfs4_sequence_res seq_res; }; -#define NFS4_SETCLIENTID_NAMELEN (127) struct nfs4_setclientid { const nfs4_verifier * sc_verifier; - unsigned int sc_name_len; - char sc_name[NFS4_SETCLIENTID_NAMELEN + 1]; u32 sc_prog; unsigned int sc_netid_len; char sc_netid[RPCBIND_MAXNETIDLEN + 1]; @@ -1142,12 +1139,9 @@ struct nfs41_state_protection { struct nfs4_op_map allow; }; -#define NFS4_EXCHANGE_ID_LEN (127) struct nfs41_exchange_id_args { struct nfs_client *client; nfs4_verifier *verifier; - unsigned int id_len; - char id[NFS4_EXCHANGE_ID_LEN]; u32 flags; struct nfs41_state_protection state_protect; }; -- cgit v1.2.3 From 22c1587adfed1977d26a57ac2831d03e37f8f805 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 27 Apr 2015 17:37:53 -0400 Subject: init: delete the __cpuinit related stubs The __cpuinit support was removed several releases ago in 3.11-rc1 with commit 22f0a27367742f65130c0fb25ef00f7297e032c1 ("init.h: remove __cpuinit sections from the kernel") People have had a chance to update their out of tree code, so now we remove the no-op stubs to ensure no more new use cases can creep back in. Also delete the mention of __cpuinitdata from the tag script. Signed-off-by: Paul Gortmaker --- include/linux/init.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index 21b6d768edd7..7c68c36d3fd8 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -91,14 +91,6 @@ #define __exit __section(.exit.text) __exitused __cold notrace -/* temporary, until all users are removed */ -#define __cpuinit -#define __cpuinitdata -#define __cpuinitconst -#define __cpuexit -#define __cpuexitdata -#define __cpuexitconst - /* Used for MEMORY_HOTPLUG */ #define __meminit __section(.meminit.text) __cold notrace #define __meminitdata __section(.meminit.data) @@ -116,9 +108,6 @@ #define __INITRODATA .section ".init.rodata","a",%progbits #define __FINITDATA .previous -/* temporary, until all users are removed */ -#define __CPUINIT - #define __MEMINIT .section ".meminit.text", "ax" #define __MEMINITDATA .section ".meminit.data", "aw" #define __MEMINITRODATA .section ".meminit.rodata", "a" -- cgit v1.2.3 From f309d4443130bf814e991f836e919dca22df37ae Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 1 May 2015 20:10:57 -0400 Subject: platform_device: better support builtin boilerplate avoidance We have macros that help reduce the boilerplate for modules that register with no extra init/exit complexity other than the most standard use case. However we see an increasing number of non-modular drivers using these modular_driver() type register functions. There are several downsides to this: 1) The code can appear modular to a reader of the code, and they won't know if the code really is modular without checking the Makefile and Kconfig to see if compilation is governed by a bool or tristate. 2) Coders of drivers may be tempted to code up an __exit function that is never used, just in order to satisfy the required three args of the modular registration function. 3) Non-modular code ends up including the which increases CPP overhead that they don't need. 4) It hinders us from performing better separation of the module init code and the generic init code. Here we introduce similar macros, with the mapping from module_driver to builtin_driver and similar, so that simple changes of: module_platform_driver() ---> builtin_platform_driver() module_platform_driver_probe() ---> builtin_platform_driver_probe(). can help us avoid #3 above, without having to code up the same __init functions and device_initcall() boilerplate. For non modular code, module_init becomes __initcall. But direct use of __initcall is discouraged, vs. one of the priority categorized subgroups. As __initcall gets mapped onto device_initcall, our use of device_initcall directly in this change means that the runtime impact is zero -- drivers will remain at level 6 in the initcall ordering. Cc: Greg Kroah-Hartman Signed-off-by: Paul Gortmaker --- include/linux/device.h | 22 ++++++++++++++++++++++ include/linux/platform_device.h | 23 +++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 6558af90c8fe..c2d6167cb4a3 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1269,4 +1269,26 @@ static void __exit __driver##_exit(void) \ } \ module_exit(__driver##_exit); +/** + * builtin_driver() - Helper macro for drivers that don't do anything + * special in init and have no exit. This eliminates some boilerplate. + * Each driver may only use this macro once, and calling it replaces + * device_initcall (or in some cases, the legacy __initcall). This is + * meant to be a direct parallel of module_driver() above but without + * the __exit stuff that is not used for builtin cases. + * + * @__driver: driver name + * @__register: register function for this driver type + * @...: Additional arguments to be passed to __register + * + * Use this macro to construct bus specific macros for registering + * drivers, and do not use it on its own. + */ +#define builtin_driver(__driver, __register, ...) \ +static int __init __driver##_init(void) \ +{ \ + return __register(&(__driver) , ##__VA_ARGS__); \ +} \ +device_initcall(__driver##_init); + #endif /* _DEVICE_H_ */ diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 58f1e75ba105..bba08f44cc97 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -222,6 +222,15 @@ static inline void platform_set_drvdata(struct platform_device *pdev, module_driver(__platform_driver, platform_driver_register, \ platform_driver_unregister) +/* builtin_platform_driver() - Helper macro for builtin drivers that + * don't do anything special in driver init. This eliminates some + * boilerplate. Each driver may only use this macro once, and + * calling it replaces device_initcall(). Note this is meant to be + * a parallel of module_platform_driver() above, but w/o _exit stuff. + */ +#define builtin_platform_driver(__platform_driver) \ + builtin_driver(__platform_driver, platform_driver_register) + /* module_platform_driver_probe() - Helper macro for drivers that don't do * anything special in module init/exit. This eliminates a lot of * boilerplate. Each module may only use this macro once, and @@ -240,6 +249,20 @@ static void __exit __platform_driver##_exit(void) \ } \ module_exit(__platform_driver##_exit); +/* builtin_platform_driver_probe() - Helper macro for drivers that don't do + * anything special in device init. This eliminates some boilerplate. Each + * driver may only use this macro once, and using it replaces device_initcall. + * This is meant to be a parallel of module_platform_driver_probe above, but + * without the __exit parts. + */ +#define builtin_platform_driver_probe(__platform_driver, __platform_probe) \ +static int __init __platform_driver##_init(void) \ +{ \ + return platform_driver_probe(&(__platform_driver), \ + __platform_probe); \ +} \ +device_initcall(__platform_driver##_init); \ + #define platform_create_bundle(driver, probe, res, n_res, data, size) \ __platform_create_bundle(driver, probe, res, n_res, data, size, THIS_MODULE) extern struct platform_device *__platform_create_bundle( -- cgit v1.2.3 From fec47d863587c272d6fbf4e50066209c953d5e60 Mon Sep 17 00:00:00 2001 From: Dave Gerlach Date: Fri, 22 May 2015 15:45:27 -0500 Subject: remoteproc: introduce rproc_get_by_phandle API Allow users of remoteproc the ability to get a handle to an rproc by passing a phandle supplied in the user's device tree node. This is useful in situations that require manual booting of the rproc. This patch uses the code removed by commit 40e575b1d0b3 ("remoteproc: remove the get_by_name/put API") for the ref counting but is modified to use a simple list and locking mechanism and has rproc_get_by_name replaced with an rproc_get_by_phandle API. Signed-off-by: Dave Gerlach Signed-off-by: Suman Anna [fix order of Signed-off-by tags] Signed-off-by: Ohad Ben-Cohen --- include/linux/remoteproc.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 78b8a9b9d40a..56739e5df1e6 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -36,11 +36,11 @@ #define REMOTEPROC_H #include -#include #include #include #include #include +#include /** * struct resource_table - firmware resource table header @@ -375,7 +375,7 @@ enum rproc_crash_type { /** * struct rproc - represents a physical remote processor device - * @node: klist node of this rproc object + * @node: list node of this rproc object * @domain: iommu domain * @name: human readable name of the rproc * @firmware: name of firmware file to be loaded @@ -407,7 +407,7 @@ enum rproc_crash_type { * @has_iommu: flag to indicate if remote processor is behind an MMU */ struct rproc { - struct klist_node node; + struct list_head node; struct iommu_domain *domain; const char *name; const char *firmware; @@ -481,6 +481,7 @@ struct rproc_vdev { u32 rsc_offset; }; +struct rproc *rproc_get_by_phandle(phandle phandle); struct rproc *rproc_alloc(struct device *dev, const char *name, const struct rproc_ops *ops, const char *firmware, int len); -- cgit v1.2.3 From 9f3520c3115b451ac1301779fc3c769d94907a70 Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Fri, 8 May 2015 18:19:05 +1000 Subject: wait: introduce wait_event_exclusive_cmd It's just a variant of wait_event_cmd(), with exclusive flag being set. For cases like RAID5, which puts many processes to sleep until 1/4 resources are free, a wake_up wakes up all processes to run, but there is one process being able to get the resource as it's protected by a spin lock. That ends up introducing heavy lock contentions, and hurts performance badly. Here introduce wait_event_exclusive_cmd to relieve the lock contention naturally by letting wake_up just wake up one process. Cc: Ingo Molnar Cc: Peter Zijlstra v2: its assumed that wait*() and __wait*() have the same arguments - peterz Acked-by: Peter Zijlstra (Intel) Signed-off-by: Yuanhan Liu Signed-off-by: NeilBrown --- include/linux/wait.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 2db83349865b..db78c7204947 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -358,6 +358,19 @@ do { \ __ret; \ }) +#define __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2) \ + (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 1, 0, \ + cmd1; schedule(); cmd2) +/* + * Just like wait_event_cmd(), except it sets exclusive flag + */ +#define wait_event_exclusive_cmd(wq, condition, cmd1, cmd2) \ +do { \ + if (condition) \ + break; \ + __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2); \ +} while (0) + #define __wait_event_cmd(wq, condition, cmd1, cmd2) \ (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ cmd1; schedule(); cmd2) -- cgit v1.2.3 From a01f7cd657c95941079d548ef7fbf0cc370c1259 Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Fri, 22 May 2015 15:45:28 -0500 Subject: remoteproc: add a rproc ops for performing address translation The rproc_da_to_va API is currently used to perform any device to kernel address translations to meet the different needs of the remoteproc core/drivers (eg: loading). The functionality is achieved within the remoteproc core, and is limited only for carveouts allocated within the core. A new rproc ops, da_to_va, is added to provide flexibility to platform implementations to perform the address translation themselves when the above conditions cannot be met by the implementations. The rproc_da_to_va() API is extended to invoke this ops if present, and fallback to regular processing if the platform implementation cannot provide the translation. This will allow any remoteproc implementations to translate addresses for dedicated memories like internal memories. While at this, also update the rproc_da_to_va() documentation since it is an exported function. Signed-off-by: Suman Anna Signed-off-by: Dave Gerlach Signed-off-by: Ohad Ben-Cohen --- include/linux/remoteproc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 56739e5df1e6..9c4e1384f636 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -330,11 +330,13 @@ struct rproc; * @start: power on the device and boot it * @stop: power off the device * @kick: kick a virtqueue (virtqueue id given as a parameter) + * @da_to_va: optional platform hook to perform address translations */ struct rproc_ops { int (*start)(struct rproc *rproc); int (*stop)(struct rproc *rproc); void (*kick)(struct rproc *rproc, int vqid); + void * (*da_to_va)(struct rproc *rproc, u64 da, int len); }; /** -- cgit v1.2.3 From a01bc0d5f557bd8becd0ba75d09c39192004697e Mon Sep 17 00:00:00 2001 From: Dave Gerlach Date: Fri, 22 May 2015 15:45:30 -0500 Subject: remoteproc/wkup_m3: add a remoteproc driver for TI Wakeup M3 Add a remoteproc driver to load the firmware and boot a small Wakeup M3 processor present on TI AM33xx and AM43xx SoCs. This Wakeup M3 remote processor is an integrated Cortex M3 that allows the SoC to enter the lowest possible power state by taking control from the MPU after it has gone into its own low power state and shutting off any additional peripherals. The Wakeup M3 processor has two internal memory regions - 16 kB of unified instruction memory called UMEM used to store executable code, and 8 kB of data memory called DMEM used for all data sections. The Wakeup M3 processor executes its code entirely from within the UMEM and uses the DMEM for any data. It does not use any external memory or any other external resources. The device address view has the UMEM at address 0x0 and DMEM at address 0x80000, and these are computed automatically within the driver based on relative address calculation from the corresponding device tree IOMEM resources. These device addresses are used to aid the core remoteproc ELF loader code to properly translate and load the firmware segments through the .rproc_da_to_va ops. Signed-off-by: Dave Gerlach Signed-off-by: Suman Anna Signed-off-by: Ohad Ben-Cohen --- include/linux/platform_data/wkup_m3.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 include/linux/platform_data/wkup_m3.h (limited to 'include/linux') diff --git a/include/linux/platform_data/wkup_m3.h b/include/linux/platform_data/wkup_m3.h new file mode 100644 index 000000000000..3f1d77effd71 --- /dev/null +++ b/include/linux/platform_data/wkup_m3.h @@ -0,0 +1,30 @@ +/* + * TI Wakeup M3 remote processor platform data + * + * Copyright (C) 2014-2015 Texas Instruments, Inc. + * + * Dave Gerlach + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _LINUX_PLATFORM_DATA_WKUP_M3_H +#define _LINUX_PLATFORM_DATA_WKUP_M3_H + +struct platform_device; + +struct wkup_m3_platform_data { + const char *reset_name; + + int (*assert_reset)(struct platform_device *pdev, const char *name); + int (*deassert_reset)(struct platform_device *pdev, const char *name); +}; + +#endif /* _LINUX_PLATFORM_DATA_WKUP_M3_H */ -- cgit v1.2.3 From 46b15caa7cb19b0f6e3bc8ebaee5bc1bb2e35110 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Jun 2015 18:48:31 -0400 Subject: vfs, writeback: replace FS_CGROUP_WRITEBACK with SB_I_CGROUPWB FS_CGROUP_WRITEBACK indicates whether a file_system_type supports cgroup writeback; however, different super_blocks of the same file_system_type may or may not support cgroup writeback depending on filesystem options. This patch replaces FS_CGROUP_WRITEBACK with a per-super_block flag. super_block->s_flags carries some internal flags in the high bits but it's exposd to userland through uapi header and running out of space anyway. This patch adds a new field super_block->s_iflags to carry kernel-internal flags. It is currently only used by the new SB_I_CGROUPWB flag whose concatenated and abbreviated name is for consistency with other super_block flags. ext2_fill_super() is updated to set SB_I_CGROUPWB. v2: Added super_block->s_iflags instead of stealing another high bit from sb->s_flags as suggested by Christoph and Jan. Signed-off-by: Tejun Heo Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Cc: Christoph Hellwig Cc: Jan Kara Cc: linux-ext4@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 2 +- include/linux/fs.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index dfce80869145..a13181a42b9a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -260,7 +260,7 @@ static inline bool inode_cgwb_enabled(struct inode *inode) return bdi_cap_account_dirty(bdi) && (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) && - (inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK); + (inode->i_sb->s_iflags & SB_I_CGROUPWB); } /** diff --git a/include/linux/fs.h b/include/linux/fs.h index b5e1dcfbc5e3..2c5e33a5b2af 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1241,6 +1241,8 @@ struct mm_struct; #define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */ #define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */ +/* sb->s_iflags */ +#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ /* Possible states of 'frozen' field */ enum { @@ -1279,6 +1281,7 @@ struct super_block { const struct quotactl_ops *s_qcop; const struct export_operations *s_export_op; unsigned long s_flags; + unsigned long s_iflags; /* internal SB_I_* flags */ unsigned long s_magic; struct dentry *s_root; struct rw_semaphore s_umount; @@ -1912,7 +1915,6 @@ struct file_system_type { #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */ -#define FS_CGROUP_WRITEBACK 32 /* Supports cgroup-aware writeback */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); -- cgit v1.2.3 From fb02915f47181e824339d91f8e385fd4bd746d6a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Jun 2015 16:54:28 -0400 Subject: kernfs: make kernfs_get_inode() public Move kernfs_get_inode() prototype from fs/kernfs/kernfs-internal.h to include/linux/kernfs.h. It obtains the matching inode for a kernfs_node. It will be used by cgroup for inode based permission checks for now but is generally useful. Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman --- include/linux/kernfs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 71ecdab1671b..e6b2f7db9c0c 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -277,6 +277,7 @@ void kernfs_put(struct kernfs_node *kn); struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry); struct kernfs_root *kernfs_root_from_sb(struct super_block *sb); +struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn); struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv); @@ -352,6 +353,10 @@ static inline struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) { return NULL; } +static inline struct inode * +kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) +{ return NULL; } + static inline struct kernfs_root * kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv) -- cgit v1.2.3 From 187fe84067bd377047cfcb7f2bbc7c9dc12d290c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Jun 2015 16:54:28 -0400 Subject: cgroup: require write perm on common ancestor when moving processes on the default hierarchy On traditional hierarchies, if a task has write access to "tasks" or "cgroup.procs" file of a cgroup and its euid agrees with the target, it can move the target to the cgroup; however, consider the following scenario. The owner of each cgroup is in the parentheses. R (root) - 0 (root) - 00 (user1) - 000 (user1) | \ 001 (user1) \ 1 (root) - 10 (user1) The subtrees of 00 and 10 are delegated to user1; however, while both subtrees may belong to the same user, it is clear that the two subtrees are to be isolated - they're under completely separate resource limits imposed by 0 and 1, respectively. Note that 0 and 1 aren't strictly necessary but added to ease illustrating the issue. If user1 is allowed to move processes between the two subtrees, the intention of the hierarchy - keeping a given group of processes under a subtree with certain resource restrictions while delegating management of the subtree - can be circumvented by user1. This happens because migration permission check doesn't consider the hierarchical nature of cgroups. To fix the issue, this patch adds an extra permission requirement when userland tries to migrate a process in the default hierarchy - the issuing task must have write access to the common ancestor of "cgroup.procs" file of the ancestor in addition to the destination's. Conceptually, the issuer must be able to move the target process from the source cgroup to the common ancestor of source and destination cgroups and then to the destination. As long as delegation is done in a proper top-down way, this guarantees that a delegatee can't smuggle processes across disjoint delegation domains. The next patch will add documentation on the delegation model on the default hierarchy. v2: Fixed missing !ret test. Spotted by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Johannes Weiner Cc: Li Zefan --- include/linux/cgroup-defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index c5588c438448..93755a629299 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -220,6 +220,7 @@ struct cgroup { int populated_cnt; struct kernfs_node *kn; /* cgroup kernfs entry */ + struct kernfs_node *procs_kn; /* kn for "cgroup.procs" */ struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ /* -- cgit v1.2.3 From 4bacc9c9234c7c8eec44f5ed4e960d9f96fa0f01 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 18 Jun 2015 14:32:31 +0100 Subject: overlayfs: Make f_path always point to the overlay and f_inode to the underlay Make file->f_path always point to the overlay dentry so that the path in /proc/pid/fd is correct and to ensure that label-based LSMs have access to the overlay as well as the underlay (path-based LSMs probably don't need it). Using my union testsuite to set things up, before the patch I see: [root@andromeda union-testsuite]# bash 5 /a/foo107 [root@andromeda union-testsuite]# stat /mnt/a/foo107 ... Device: 23h/35d Inode: 13381 Links: 1 ... [root@andromeda union-testsuite]# stat -L /proc/$$/fd/5 ... Device: 23h/35d Inode: 13381 Links: 1 ... After the patch: [root@andromeda union-testsuite]# bash 5 /mnt/a/foo107 [root@andromeda union-testsuite]# stat /mnt/a/foo107 ... Device: 23h/35d Inode: 40346 Links: 1 ... [root@andromeda union-testsuite]# stat -L /proc/$$/fd/5 ... Device: 23h/35d Inode: 40346 Links: 1 ... Note the change in where /proc/$$/fd/5 points to in the ls command. It was pointing to /a/foo107 (which doesn't exist) and now points to /mnt/a/foo107 (which is correct). The inode accessed, however, is the lower layer. The union layer is on device 25h/37d and the upper layer on 24h/36d. Signed-off-by: David Howells Signed-off-by: Al Viro --- include/linux/dcache.h | 2 ++ include/linux/fs.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index df334cbacc6d..167ec0934049 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -160,6 +160,7 @@ struct dentry_operations { char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(struct dentry *, bool); + struct inode *(*d_select_inode)(struct dentry *, unsigned); } ____cacheline_aligned; /* @@ -225,6 +226,7 @@ struct dentry_operations { #define DCACHE_MAY_FREE 0x00800000 #define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */ +#define DCACHE_OP_SELECT_INODE 0x02000000 /* Unioned entry: dcache op selects inode */ extern seqlock_t rename_lock; diff --git a/include/linux/fs.h b/include/linux/fs.h index b577e801b4af..2bd77e10e8e5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1641,7 +1641,6 @@ struct inode_operations { int (*set_acl)(struct inode *, struct posix_acl *, int); /* WARNING: probably going away soon, do not use! */ - int (*dentry_open)(struct dentry *, struct file *, const struct cred *); } ____cacheline_aligned; ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, @@ -2194,7 +2193,6 @@ extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(struct dentry *, struct vfsmount *, const char *, int); -extern int vfs_open(const struct path *, struct file *, const struct cred *); extern struct file * dentry_open(const struct path *, int, const struct cred *); extern int filp_close(struct file *, fl_owner_t id); -- cgit v1.2.3 From 1796dcce2daacc125f2d60afc3f631ca29e36684 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 3 May 2015 18:57:10 +0900 Subject: rtc: interface: Fix coding style violations Fix issues reported by checkpatch: ERROR: open brace '{' following struct go on the same line ERROR: "foo* bar" should be "foo *bar" Additionally adjust alignment of wrapped function arguments. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Alexandre Belloni --- include/linux/rtc.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 8dcf6825fa88..b0709f80dfb0 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -101,8 +101,7 @@ struct rtc_timer { /* flags */ #define RTC_DEV_BUSY 0 -struct rtc_device -{ +struct rtc_device { struct device dev; struct module *owner; @@ -198,10 +197,10 @@ int rtc_register(rtc_task_t *task); int rtc_unregister(rtc_task_t *task); int rtc_control(rtc_task_t *t, unsigned int cmd, unsigned long arg); -void rtc_timer_init(struct rtc_timer *timer, void (*f)(void* p), void* data); -int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer* timer, - ktime_t expires, ktime_t period); -int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer* timer); +void rtc_timer_init(struct rtc_timer *timer, void (*f)(void *p), void *data); +int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer *timer, + ktime_t expires, ktime_t period); +int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer *timer); void rtc_timer_do_work(struct work_struct *work); static inline bool is_leap_year(unsigned int year) -- cgit v1.2.3 From 73744a64aab872703b851b9678a7f488b507eb81 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 3 May 2015 18:57:11 +0900 Subject: rtc: interface: Remove unused return value from rtc_timer_cancel() The rtc_timer_cancel() always returns 0 and cannot fail (calls only other void-returning functions). Signed-off-by: Krzysztof Kozlowski Signed-off-by: Alexandre Belloni --- include/linux/rtc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index b0709f80dfb0..587017e7939c 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -200,7 +200,7 @@ int rtc_control(rtc_task_t *t, unsigned int cmd, unsigned long arg); void rtc_timer_init(struct rtc_timer *timer, void (*f)(void *p), void *data); int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer *timer, ktime_t expires, ktime_t period); -int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer *timer); +void rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer *timer); void rtc_timer_do_work(struct work_struct *work); static inline bool is_leap_year(unsigned int year) -- cgit v1.2.3 From d8d919879e9a645061a560a0a26abb9f3bca97df Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 3 Apr 2015 18:43:44 +0200 Subject: clk: add CLK_RECALC_NEW_RATES clock flag for Exynos cpu clock support This flag is needed to fix the issue with wrong dividers being setup by Common Clock Framework when using the new Exynos cpu clock support. The issue happens because clk_core_set_rate_nolock() calls clk_calc_new_rates(clk, rate) before both pre/post clock notifiers have a chance to run. In case of Exynos cpu clock support pre/post clock notifiers are registered for mout_apll clock which is a parent of armclk cpu clock and dividers are modified in both pre and post clock notifier. This results in wrong dividers values being later programmed by clk_change_rate(top). To workaround the problem CLK_RECALC_NEW_RATES flag is added and it is set for mout_apll clock later so the correct divider values are re-calculated after both pre and post clock notifiers had run. For example when using "performance" governor on Exynos4210 Origen board the cpufreq-dt driver requests to change the frequency from 1000MHz to 1200MHz and after the change state of the relevant clocks is following: Without use of CLK_GET_RATE_NOCACHE flag: fout_apll rate: 1200000000 fout_apll_div_2 rate: 600000000 mout_clkout_cpu rate: 600000000 div_clkout_cpu rate: 600000000 clkout_cpu rate: 600000000 mout_apll rate: 1200000000 armclk rate: 1200000000 mout_hpm rate: 1200000000 div_copy rate: 300000000 div_hpm rate: 300000000 mout_core rate: 1200000000 div_core rate: 1200000000 div_core2 rate: 1200000000 arm_clk_div_2 rate: 600000000 div_corem0 rate: 300000000 div_corem1 rate: 150000000 div_periph rate: 300000000 div_atb rate: 300000000 div_pclk_dbg rate: 150000000 sclk_apll rate: 1200000000 sclk_apll_div_2 rate: 600000000 With use of CLK_GET_RATE_NOCACHE flag: fout_apll rate: 1200000000 fout_apll_div_2 rate: 600000000 mout_clkout_cpu rate: 600000000 div_clkout_cpu rate: 600000000 clkout_cpu rate: 600000000 mout_apll rate: 1200000000 armclk rate: 1200000000 mout_hpm rate: 1200000000 div_copy rate: 200000000 div_hpm rate: 200000000 mout_core rate: 1200000000 div_core rate: 1200000000 div_core2 rate: 1200000000 arm_clk_div_2 rate: 600000000 div_corem0 rate: 300000000 div_corem1 rate: 150000000 div_periph rate: 300000000 div_atb rate: 240000000 div_pclk_dbg rate: 120000000 sclk_apll rate: 150000000 sclk_apll_div_2 rate: 75000000 Without this change cpufreq-dt driver showed ~10 mA larger energy consumption when compared to cpufreq-exynos one when "performance" cpufreq governor was used on Exynos4210 SoC based Origen board. This issue was probably meant to be workarounded by use of CLK_GET_RATE_NOCACHE and CLK_DIVIDER_READ_ONLY clock flags in the original Exynos cpu clock patchset (in "[PATCH v12 6/6] clk: samsung: remove unused clock aliases and update clock flags" patch) but usage of these flags is not sufficient to fix the issue observed. Cc: Thomas Abraham Cc: Tomasz Figa Cc: Mike Turquette Cc: Javier Martinez Canillas Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Michael Turquette --- include/linux/clk-provider.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index df695313f975..82f59ca8188a 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -31,6 +31,7 @@ #define CLK_GET_RATE_NOCACHE BIT(6) /* do not use the cached clk rate */ #define CLK_SET_RATE_NO_REPARENT BIT(7) /* don't re-parent on rate change */ #define CLK_GET_ACCURACY_NOCACHE BIT(8) /* do not use the cached clk accuracy */ +#define CLK_RECALC_NEW_RATES BIT(9) /* recalc rates after notifications */ struct clk_hw; struct clk_core; -- cgit v1.2.3 From 1387fe7d292b66677dae31d25a8e3c953bf21748 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Tue, 12 May 2015 11:31:02 +0200 Subject: MIPS: BCM47xx: Extract all boardflags to new u32 fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For years we planned to get rid of old u16 fields, let's start doing it with MIPS code. This process will take some time, it requires doing the same in ssb/bcma and then switching all drivers to new fields. This will be handled in separated patches submitted to appropriate trees. Signed-off-by: Rafał Miłecki Cc: linux-mips@linux-mips.org Cc: Hauke Mehrtens Patchwork: https://patchwork.linux-mips.org/patch/10026/ Signed-off-by: Ralf Baechle --- include/linux/ssb/ssb.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h index 4568a5cc9ab8..ee90e32a0607 100644 --- a/include/linux/ssb/ssb.h +++ b/include/linux/ssb/ssb.h @@ -88,11 +88,14 @@ struct ssb_sprom { u32 ofdm5glpo; /* 5.2GHz OFDM power offset */ u32 ofdm5gpo; /* 5.3GHz OFDM power offset */ u32 ofdm5ghpo; /* 5.8GHz OFDM power offset */ + u32 boardflags; + u32 boardflags2; + u32 boardflags3; + /* TODO: Switch all drivers to new u32 fields and drop below ones */ u16 boardflags_lo; /* Board flags (bits 0-15) */ u16 boardflags_hi; /* Board flags (bits 16-31) */ u16 boardflags2_lo; /* Board flags (bits 32-47) */ u16 boardflags2_hi; /* Board flags (bits 48-63) */ - /* TODO store board flags in a single u64 */ struct ssb_sprom_core_pwr_info core_pwr_info[4]; -- cgit v1.2.3 From 6e122ac0053d071976686dd04cdd60ea8039bb7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Tue, 12 May 2015 11:54:48 +0200 Subject: MIPS: BCM47xx: Extract info about et2 interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New devices may have more than 1 Ethernet core (device). We should extract info about them to make it available to Ethernet drivers. Signed-off-by: Rafał Miłecki Cc: linux-mips@linux-mips.org Cc: Hauke Mehrtens Cc: Hante Meuleman Cc: Ian Kent Patchwork: https://patchwork.linux-mips.org/patch/10027/ Signed-off-by: Ralf Baechle --- include/linux/ssb/ssb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h index ee90e32a0607..c3d1a525bacc 100644 --- a/include/linux/ssb/ssb.h +++ b/include/linux/ssb/ssb.h @@ -29,10 +29,13 @@ struct ssb_sprom { u8 il0mac[6] __aligned(sizeof(u16)); /* MAC address for 802.11b/g */ u8 et0mac[6] __aligned(sizeof(u16)); /* MAC address for Ethernet */ u8 et1mac[6] __aligned(sizeof(u16)); /* MAC address for 802.11a */ + u8 et2mac[6] __aligned(sizeof(u16)); /* MAC address for extra Ethernet */ u8 et0phyaddr; /* MII address for enet0 */ u8 et1phyaddr; /* MII address for enet1 */ + u8 et2phyaddr; /* MII address for enet2 */ u8 et0mdcport; /* MDIO for enet0 */ u8 et1mdcport; /* MDIO for enet1 */ + u8 et2mdcport; /* MDIO for enet2 */ u16 dev_id; /* Device ID overriding e.g. PCI ID */ u16 board_rev; /* Board revision number from SPROM. */ u16 board_num; /* Board number from SPROM. */ -- cgit v1.2.3 From 44e08e7099c8de226606cfc989b45d6fa27f507f Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sun, 24 May 2015 16:11:31 +0100 Subject: MIPS/IRQCHIP: Move Ingenic SoC intc driver to drivers/irqchip Move the driver for Ingenic SoC interrupt controllers into drivers/irqchip where it belongs. Signed-off-by: Paul Burton Cc: Lars-Peter Clausen Cc: Thomas Gleixner Cc: Jason Cooper Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: Brian Norris Patchwork: https://patchwork.linux-mips.org/patch/10147/ Signed-off-by: Ralf Baechle --- include/linux/irqchip/ingenic.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 include/linux/irqchip/ingenic.h (limited to 'include/linux') diff --git a/include/linux/irqchip/ingenic.h b/include/linux/irqchip/ingenic.h new file mode 100644 index 000000000000..0ee319a4029d --- /dev/null +++ b/include/linux/irqchip/ingenic.h @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2010, Lars-Peter Clausen + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef __LINUX_IRQCHIP_INGENIC_H__ +#define __LINUX_IRQCHIP_INGENIC_H__ + +#include + +extern void ingenic_intc_irq_suspend(struct irq_data *data); +extern void ingenic_intc_irq_resume(struct irq_data *data); + +#endif -- cgit v1.2.3 From 55cab93bcf1422ab4298edc65c349c4304b4884e Mon Sep 17 00:00:00 2001 From: Hante Meuleman Date: Thu, 21 May 2015 15:27:23 +0200 Subject: mips: bcm47xx: allow retrieval of complete nvram contents Host platforms such as routers supported by OpenWrt can support NVRAM reading directly from internal NVRAM store. The brcmfmac for one requires the complete nvram contents to select what needs to be sent to wireless device. Signed-off-by: Arend van Spriel Signed-off-by: Hante Meuleman Reviewed-by: Arend Van Spriel Reviewed-by: Franky (Zhenhui) Lin Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Daniel (Deognyoun) Kim Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/10093/ Signed-off-by: Ralf Baechle --- include/linux/bcm47xx_nvram.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bcm47xx_nvram.h b/include/linux/bcm47xx_nvram.h index b12b07e75929..c73927c66c2c 100644 --- a/include/linux/bcm47xx_nvram.h +++ b/include/linux/bcm47xx_nvram.h @@ -10,11 +10,17 @@ #include #include +#include #ifdef CONFIG_BCM47XX int bcm47xx_nvram_init_from_mem(u32 base, u32 lim); int bcm47xx_nvram_getenv(const char *name, char *val, size_t val_len); int bcm47xx_nvram_gpio_pin(const char *name); +char *bcm47xx_nvram_get_contents(size_t *val_len); +static inline void bcm47xx_nvram_release_contents(char *nvram) +{ + vfree(nvram); +}; #else static inline int bcm47xx_nvram_init_from_mem(u32 base, u32 lim) { @@ -29,6 +35,15 @@ static inline int bcm47xx_nvram_gpio_pin(const char *name) { return -ENOTSUPP; }; + +static inline char *bcm47xx_nvram_get_contents(size_t *val_len) +{ + return NULL; +}; + +static inline void bcm47xx_nvram_release_contents(char *nvram) +{ +}; #endif #endif /* __BCM47XX_NVRAM_H */ -- cgit v1.2.3 From 2ddf3a792218cddd30140b1f8b32cb6e2d67921f Mon Sep 17 00:00:00 2001 From: Alban Bedel Date: Sun, 31 May 2015 02:18:24 +0200 Subject: MIPS: ath79: Add OF support to the GPIO driver Replace the simple GPIO chip registration by a platform driver and make ath79_gpio_init() just register the device. Signed-off-by: Alban Bedel Cc: linux-mips@linux-mips.org Signed-off-by: Ralf Baechle --- include/linux/platform_data/gpio-ath79.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 include/linux/platform_data/gpio-ath79.h (limited to 'include/linux') diff --git a/include/linux/platform_data/gpio-ath79.h b/include/linux/platform_data/gpio-ath79.h new file mode 100644 index 000000000000..88b0db7bee74 --- /dev/null +++ b/include/linux/platform_data/gpio-ath79.h @@ -0,0 +1,19 @@ +/* + * Atheros AR7XXX/AR9XXX GPIO controller platform data + * + * Copyright (C) 2015 Alban Bedel + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __LINUX_PLATFORM_DATA_GPIO_ATH79_H +#define __LINUX_PLATFORM_DATA_GPIO_ATH79_H + +struct ath79_gpio_platform_data { + unsigned ngpios; + bool oe_inverted; +}; + +#endif -- cgit v1.2.3 From f6e734a8c162297953d7bfc0f3f6bf4f8c33d72f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Wed, 10 Jun 2015 23:05:08 +0200 Subject: MIPS: BCM47xx: Move NVRAM driver to the drivers/firmware/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After Broadcom switched from MIPS to ARM for their home routers we need to have NVRAM driver in some common place (not arch/mips/). As explained in Kconfig, this driver is responsible for parsing SoC configuration data that is passed to the kernel in flash from the bootloader firmware called "CFE". We were thinking about putting it in bus directory, however there are two possible buses for MIPS: drivers/ssb/ and drivers/bcma/. So this won't fit there and this is why I would like to move this driver to the drivers/firmware/. Signed-off-by: Rafał Miłecki Reviewed-by: Paul Walmsley Cc: linux-mips@linux-mips.org Cc: Hauke Mehrtens Cc: Seiji Aguchi Cc: Greg Kroah-Hartman Cc: Ard Biesheuvel Cc: Mike Waychison Cc: Roy Franz Cc: Matt Fleming Cc: Linus Torvalds Patchwork: https://patchwork.linux-mips.org/patch/10544/ Signed-off-by: Ralf Baechle --- include/linux/bcm47xx_nvram.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bcm47xx_nvram.h b/include/linux/bcm47xx_nvram.h index c73927c66c2c..2793652fbf66 100644 --- a/include/linux/bcm47xx_nvram.h +++ b/include/linux/bcm47xx_nvram.h @@ -12,7 +12,7 @@ #include #include -#ifdef CONFIG_BCM47XX +#ifdef CONFIG_BCM47XX_NVRAM int bcm47xx_nvram_init_from_mem(u32 base, u32 lim); int bcm47xx_nvram_getenv(const char *name, char *val, size_t val_len); int bcm47xx_nvram_gpio_pin(const char *name); -- cgit v1.2.3 From 8ccd0d0ca04147e91890c373677f1e741dda2631 Mon Sep 17 00:00:00 2001 From: Hyungwon Hwang Date: Fri, 12 Jun 2015 21:59:01 +0900 Subject: of: add helper for getting endpoint node of specific identifiers When there are multiple ports or multiple endpoints in a port, they have to be distinguished by the value of reg property. It is common. The drivers can get the specific endpoint in the specific port via this function. Now the drivers have to implement this code in themselves or have to force the order of dt nodes to get the right node. Signed-off-by: Hyungwon Hwang Acked-by: Rob Herring Signed-off-by: Inki Dae --- include/linux/of_graph.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_graph.h b/include/linux/of_graph.h index 7bc92e050608..1c1d5b901a72 100644 --- a/include/linux/of_graph.h +++ b/include/linux/of_graph.h @@ -45,6 +45,8 @@ int of_graph_parse_endpoint(const struct device_node *node, struct device_node *of_graph_get_port_by_id(struct device_node *node, u32 id); struct device_node *of_graph_get_next_endpoint(const struct device_node *parent, struct device_node *previous); +struct device_node *of_graph_get_endpoint_by_regs( + const struct device_node *parent, int port_reg, int reg); struct device_node *of_graph_get_remote_port_parent( const struct device_node *node); struct device_node *of_graph_get_remote_port(const struct device_node *node); @@ -69,6 +71,12 @@ static inline struct device_node *of_graph_get_next_endpoint( return NULL; } +struct device_node *of_graph_get_endpoint_by_regs( + const struct device_node *parent, int port_reg, int reg) +{ + return NULL; +} + static inline struct device_node *of_graph_get_remote_port_parent( const struct device_node *node) { -- cgit v1.2.3 From 5104b7d7678b0029417f6ac08243773a77259ac6 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 17 Jun 2015 06:17:52 +0930 Subject: module: make perm const Change the struct kernel_param.perm field to a const, as it should never be changed. Signed-off-by: Dan Streetman Signed-off-by: Rusty Russell (cut from larger patch) --- include/linux/moduleparam.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 7e0079936396..ab5031453807 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -68,7 +68,7 @@ enum { struct kernel_param { const char *name; const struct kernel_param_ops *ops; - u16 perm; + const u16 perm; s8 level; u8 flags; union { -- cgit v1.2.3 From b51d23e4e9fea6f264d39535c2a62d1f51e7ccc3 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 17 Jun 2015 06:18:52 +0930 Subject: module: add per-module param_lock Add a "param_lock" mutex to each module, and update params.c to use the correct built-in or module mutex while locking kernel params. Remove the kparam_block_sysfs_r/w() macros, replace them with direct calls to kernel_param_[un]lock(module). The kernel param code currently uses a single mutex to protect modification of any and all kernel params. While this generally works, there is one specific problem with it; a module callback function cannot safely load another module, i.e. with request_module() or even with indirect calls such as crypto_has_alg(). If the module to be loaded has any of its params configured (e.g. with a /etc/modprobe.d/* config file), then the attempt will result in a deadlock between the first module param callback waiting for modprobe, and modprobe trying to lock the single kernel param mutex to set the new module's param. This fixes that by using per-module mutexes, so that each individual module is protected against concurrent changes in its own kernel params, but is not blocked by changes to other module params. All built-in modules continue to use the built-in mutex, since they will always be loaded at runtime and references (e.g. request_module(), crypto_has_alg()) to them will never cause load-time param changing. This also simplifies the interface used by modules to block sysfs access to their params; while there are currently functions to block and unblock sysfs param access which are split up by read and write and expect a single kernel param to be passed, their actual operation is identical and applies to all params, not just the one passed to them; they simply lock and unlock the global param mutex. They are replaced with direct calls to kernel_param_[un]lock(THIS_MODULE), which locks THIS_MODULE's param_lock, or if the module is built-in, it locks the built-in mutex. Suggested-by: Rusty Russell Signed-off-by: Dan Streetman Signed-off-by: Rusty Russell --- include/linux/module.h | 1 + include/linux/moduleparam.h | 61 +++++++-------------------------------------- 2 files changed, 10 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 4c1b02e1361d..6ba0e87fa804 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -240,6 +240,7 @@ struct module { unsigned int num_syms; /* Kernel parameters. */ + struct mutex param_lock; struct kernel_param *kp; unsigned int num_kp; diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index ab5031453807..f1fdc50520d8 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -67,6 +67,7 @@ enum { struct kernel_param { const char *name; + struct module *mod; const struct kernel_param_ops *ops; const u16 perm; s8 level; @@ -108,7 +109,7 @@ struct kparam_array * * @perm is 0 if the the variable is not to appear in sysfs, or 0444 * for world-readable, 0644 for root-writable, etc. Note that if it - * is writable, you may need to use kparam_block_sysfs_write() around + * is writable, you may need to use kernel_param_lock() around * accesses (esp. charp, which can be kfreed when it changes). * * The @type is simply pasted to refer to a param_ops_##type and a @@ -216,12 +217,12 @@ struct kparam_array parameters. */ #define __module_param_call(prefix, name, ops, arg, perm, level, flags) \ /* Default value instead of permissions? */ \ - static const char __param_str_##name[] = prefix #name; \ + static const char __param_str_##name[] = prefix #name; \ static struct kernel_param __moduleparam_const __param_##name \ __used \ __attribute__ ((unused,__section__ ("__param"),aligned(sizeof(void *)))) \ - = { __param_str_##name, ops, VERIFY_OCTAL_PERMISSIONS(perm), \ - level, flags, { arg } } + = { __param_str_##name, THIS_MODULE, ops, \ + VERIFY_OCTAL_PERMISSIONS(perm), level, flags, { arg } } /* Obsolete - use module_param_cb() */ #define module_param_call(name, set, get, arg, perm) \ @@ -238,58 +239,14 @@ __check_old_set_param(int (*oldset)(const char *, struct kernel_param *)) return 0; } -/** - * kparam_block_sysfs_write - make sure a parameter isn't written via sysfs. - * @name: the name of the parameter - * - * There's no point blocking write on a paramter that isn't writable via sysfs! - */ -#define kparam_block_sysfs_write(name) \ - do { \ - BUG_ON(!(__param_##name.perm & 0222)); \ - __kernel_param_lock(); \ - } while (0) - -/** - * kparam_unblock_sysfs_write - allows sysfs to write to a parameter again. - * @name: the name of the parameter - */ -#define kparam_unblock_sysfs_write(name) \ - do { \ - BUG_ON(!(__param_##name.perm & 0222)); \ - __kernel_param_unlock(); \ - } while (0) - -/** - * kparam_block_sysfs_read - make sure a parameter isn't read via sysfs. - * @name: the name of the parameter - * - * This also blocks sysfs writes. - */ -#define kparam_block_sysfs_read(name) \ - do { \ - BUG_ON(!(__param_##name.perm & 0444)); \ - __kernel_param_lock(); \ - } while (0) - -/** - * kparam_unblock_sysfs_read - allows sysfs to read a parameter again. - * @name: the name of the parameter - */ -#define kparam_unblock_sysfs_read(name) \ - do { \ - BUG_ON(!(__param_##name.perm & 0444)); \ - __kernel_param_unlock(); \ - } while (0) - #ifdef CONFIG_SYSFS -extern void __kernel_param_lock(void); -extern void __kernel_param_unlock(void); +extern void kernel_param_lock(struct module *mod); +extern void kernel_param_unlock(struct module *mod); #else -static inline void __kernel_param_lock(void) +static inline void kernel_param_lock(struct module *mod) { } -static inline void __kernel_param_unlock(void) +static inline void kernel_param_unlock(struct module *mod) { } #endif -- cgit v1.2.3 From cca0ba2df3d4000bacd9b7807d46ffafac62d53a Mon Sep 17 00:00:00 2001 From: Hyungwon Hwang Date: Thu, 28 May 2015 16:25:15 +0900 Subject: backlight: Change the return type of backlight_update_status() to int Backlight device returns the result of update_status(), but backlight_update_status() ignores it. So the consumers cannot confirm the result of their function call. This patch makes the result to be returned back for consumers. Signed-off-by: Hyungwon Hwang Acked-by: Jingoo Han Signed-off-by: Lee Jones --- include/linux/backlight.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backlight.h b/include/linux/backlight.h index adb14a8616df..1e7a69adbe6f 100644 --- a/include/linux/backlight.h +++ b/include/linux/backlight.h @@ -117,12 +117,16 @@ struct backlight_device { int use_count; }; -static inline void backlight_update_status(struct backlight_device *bd) +static inline int backlight_update_status(struct backlight_device *bd) { + int ret = -ENOENT; + mutex_lock(&bd->update_lock); if (bd->ops && bd->ops->update_status) - bd->ops->update_status(bd); + ret = bd->ops->update_status(bd); mutex_unlock(&bd->update_lock); + + return ret; } extern struct backlight_device *backlight_device_register(const char *name, -- cgit v1.2.3 From ce16b9d2356125eb791bd920c710b8512eecce54 Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Wed, 17 Jun 2015 11:53:53 -0500 Subject: of: define of_find_node_by_phandle for !CONFIG_OF Define stub implementation for of_find_node_by_phandle() API so that users of this API can build properly even when CONFIG_OF is not defined. Fixes x86 randconfig build failure of remoteproc. Signed-off-by: Suman Anna [robh: add details on fixing remoteproc compile] Signed-off-by: Rob Herring --- include/linux/of.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index a3eb9d1e5800..54f858798e8c 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -422,6 +422,11 @@ static inline struct device_node *of_find_node_opts_by_path(const char *path, return NULL; } +static inline struct device_node *of_find_node_by_phandle(phandle handle) +{ + return NULL; +} + static inline struct device_node *of_get_parent(const struct device_node *node) { return NULL; -- cgit v1.2.3 From 9bf39ab2adafd7cf8740859cb49e7b7952813a5d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 19 Jun 2015 10:29:13 +0200 Subject: vfs: add file_path() helper Turn d_path(&file->f_path, ...); into file_path(file, ...); Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 2bd77e10e8e5..2c135ad741a9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2500,6 +2500,8 @@ extern struct file * open_exec(const char *); extern int is_subdir(struct dentry *, struct dentry *); extern int path_is_under(struct path *, struct path *); +extern char *file_path(struct file *, char *, int); + #include /* needed for stackable file system support */ -- cgit v1.2.3 From 2726d56620ce71f40dd583d51391b86e1ab8cc57 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 19 Jun 2015 10:30:28 +0200 Subject: vfs: add seq_file_path() helper Turn seq_path(..., &file->f_path, ...); into seq_file_path(..., file, ...); Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- include/linux/seq_file.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index afbb1fd77c77..912a7c482649 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -123,6 +123,7 @@ __printf(2, 3) int seq_printf(struct seq_file *, const char *, ...); __printf(2, 0) int seq_vprintf(struct seq_file *, const char *, va_list args); int seq_path(struct seq_file *, const struct path *, const char *); +int seq_file_path(struct seq_file *, struct file *, const char *); int seq_dentry(struct seq_file *, struct dentry *, const char *); int seq_path_root(struct seq_file *m, const struct path *path, const struct path *root, const char *esc); -- cgit v1.2.3 From 5fa8e0a1c6a762857ae67d1628c58b9a02362003 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 21 May 2015 16:05:53 +0200 Subject: fs: Rename file_remove_suid() to file_remove_privs() file_remove_suid() is a misnomer since it removes also file capabilities stored in xattrs and sets S_NOSEC flag. Also should_remove_suid() tells something else than whether file_remove_suid() call is necessary which leads to bugs. Signed-off-by: Jan Kara Signed-off-by: Al Viro --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 2c135ad741a9..641e68d850cf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2553,7 +2553,7 @@ extern struct inode *new_inode_pseudo(struct super_block *sb); extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int should_remove_suid(struct dentry *); -extern int file_remove_suid(struct file *); +extern int file_remove_privs(struct file *); extern void __insert_inode_hash(struct inode *, unsigned long hashval); static inline void insert_inode_hash(struct inode *inode) -- cgit v1.2.3 From dbfae0cdcd87602737101d4417811f4323156b54 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 21 May 2015 16:05:54 +0200 Subject: fs: Provide function telling whether file_remove_privs() will do anything Provide function telling whether file_remove_privs() will do anything. Currently we only have should_remove_suid() and that does something slightly different. Signed-off-by: Jan Kara Signed-off-by: Al Viro --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 641e68d850cf..ee60e8ab210f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2554,6 +2554,7 @@ extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int should_remove_suid(struct dentry *); extern int file_remove_privs(struct file *); +extern int file_needs_remove_privs(struct file *file); extern void __insert_inode_hash(struct inode *, unsigned long hashval); static inline void insert_inode_hash(struct inode *inode) -- cgit v1.2.3 From 45f147a1bc97c743c6101a8d2741c69a51f583e4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 21 May 2015 16:05:55 +0200 Subject: fs: Call security_ops->inode_killpriv on truncate Comment in include/linux/security.h says that ->inode_killpriv() should be called when setuid bit is being removed and that similar security labels (in fact this applies only to file capabilities) should be removed at this time as well. However we don't call ->inode_killpriv() when we remove suid bit on truncate. We fix the problem by calling ->inode_need_killpriv() and subsequently ->inode_killpriv() on truncate the same way as we do it on file write. After this patch there's only one user of should_remove_suid() - ocfs2 - and indeed it's buggy because it doesn't call ->inode_killpriv() on write. However fixing it is difficult because of special locking constraints. Signed-off-by: Jan Kara Signed-off-by: Al Viro --- include/linux/fs.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ee60e8ab210f..1e658b11c265 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2554,7 +2554,11 @@ extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int should_remove_suid(struct dentry *); extern int file_remove_privs(struct file *); -extern int file_needs_remove_privs(struct file *file); +extern int dentry_needs_remove_privs(struct dentry *dentry); +static inline int file_needs_remove_privs(struct file *file) +{ + return dentry_needs_remove_privs(file->f_path.dentry); +} extern void __insert_inode_hash(struct inode *, unsigned long hashval); static inline void insert_inode_hash(struct inode *inode) -- cgit v1.2.3 From b57c2cb9ea1a02c2ae08e16de8c20cc13ffbf85a Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Sun, 24 May 2015 17:19:41 +0200 Subject: pagemap.h: move dir_pages() over there That function was declared in a lot of filesystems to calculate directory pages. Signed-off-by: Fabian Frederick Signed-off-by: Al Viro --- include/linux/pagemap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4b3736f7065c..808942d31062 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -670,4 +670,10 @@ static inline int add_to_page_cache(struct page *page, return error; } +static inline unsigned long dir_pages(struct inode *inode) +{ + return (unsigned long)(inode->i_size + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; +} + #endif /* _LINUX_PAGEMAP_H */ -- cgit v1.2.3 From dc3f4198eac14e52a98dfc79cd84b45e280f59cd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 18 May 2015 10:10:34 -0400 Subject: make simple_positive() public Signed-off-by: Al Viro --- include/linux/dcache.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 167ec0934049..d2d50249b7b2 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -507,6 +507,11 @@ static inline bool d_really_is_positive(const struct dentry *dentry) return dentry->d_inode != NULL; } +static inline int simple_positive(struct dentry *dentry) +{ + return d_really_is_positive(dentry) && !d_unhashed(dentry); +} + extern void d_set_fallthru(struct dentry *dentry); static inline bool d_is_fallthru(const struct dentry *dentry) -- cgit v1.2.3 From 38183b9c31cf21d8996d6eee2e3a14508b20c418 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 28 May 2015 17:20:58 +1000 Subject: rcu: merge fix for Convert ACCESS_ONCE() to READ_ONCE() and WRITE_ONCE() This mirrors the change introduced by 7d0ae8086b8 of same title in Linus' tree; it's not obvious as a merge resolution since we moved the function. Signed-off-by: Stephen Rothwell Signed-off-by: Rusty Russell --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index eae42c21d5fd..52bdec710ed7 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -467,7 +467,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s */ #define lockless_dereference(p) \ ({ \ - typeof(p) _________p1 = ACCESS_ONCE(p); \ + typeof(p) _________p1 = READ_ONCE(p); \ smp_read_barrier_depends(); /* Dependency order vs. p above. */ \ (_________p1); \ }) -- cgit v1.2.3 From ce0bdb849ad46e4b4e4cae6913b447ae9938bdcf Mon Sep 17 00:00:00 2001 From: Inki Dae Date: Tue, 23 Jun 2015 16:06:44 +0900 Subject: of: fix a build error to of_graph_get_endpoint_by_regs function This patch fixes the below build error reported by Stephen, Stephen reported: After merging the drm-exynos tree, today's linux-next build (x86_64 allmodconfig) failed like this: drivers/media/i2c/adv7604.o: In function `of_graph_get_endpoint_by_regs': adv7604.c:(.text+0x586c): multiple definition of `of_graph_get_endpoint_by_regs' drivers/media/i2c/adv7343.o:adv7343.c:(.text+0xa13): first defined here drivers/media/platform/soc_camera/atmel-isi.o: In function `of_graph_get_endpoint_by_regs': atmel-isi.c:(.text+0x1ec9): multiple definition of `of_graph_get_endpoint_by_regs' drivers/media/platform/soc_camera/soc_camera.o:soc_camera.c:(.text+0x2ce3): first defined here drivers/media/platform/soc_camera/rcar_vin.o: In function `of_graph_get_endpoint_by_regs': rcar_vin.c:(.text+0x307c): multiple definition of `of_graph_get_endpoint_by_regs' drivers/media/platform/soc_camera/soc_camera.o:soc_camera.c:(.text+0x2ce3): first defined here Caused by commit: a0f7001c18ca ("of: add helper for getting endpoint node of specific identifiers") To fix the error, this patch declares of_graph_get_endpoint_by_regs function with "static inline". Signed-off-by: Inki Dae Signed-off-by: Dave Airlie --- include/linux/of_graph.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of_graph.h b/include/linux/of_graph.h index 1c1d5b901a72..f8bcd0e21a26 100644 --- a/include/linux/of_graph.h +++ b/include/linux/of_graph.h @@ -71,7 +71,7 @@ static inline struct device_node *of_graph_get_next_endpoint( return NULL; } -struct device_node *of_graph_get_endpoint_by_regs( +static inline struct device_node *of_graph_get_endpoint_by_regs( const struct device_node *parent, int port_reg, int reg) { return NULL; -- cgit v1.2.3 From be3a5d233922d73f27002ce2767f6ec03c3f473d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 23 Jun 2015 19:51:55 +0800 Subject: NFSv.2/pnfs Add a LAYOUTSTATS rpc function Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- include/linux/nfs4.h | 1 + include/linux/nfs_xdr.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 32201c269890..b8e72aad919c 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -500,6 +500,7 @@ enum { NFSPROC4_CLNT_SEEK, NFSPROC4_CLNT_ALLOCATE, NFSPROC4_CLNT_DEALLOCATE, + NFSPROC4_CLNT_LAYOUTSTATS, }; /* nfs41 types */ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index c959e7eb5bd4..7bbe50504211 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -316,6 +316,49 @@ struct nfs4_layoutreturn { int rpc_status; }; +#define PNFS_LAYOUTSTATS_MAXSIZE 256 + +struct nfs42_layoutstat_args; +struct nfs42_layoutstat_devinfo; +typedef void (*layoutstats_encode_t)(struct xdr_stream *, + struct nfs42_layoutstat_args *, + struct nfs42_layoutstat_devinfo *); + +/* Per file per deviceid layoutstats */ +struct nfs42_layoutstat_devinfo { + struct nfs4_deviceid dev_id; + __u64 offset; + __u64 length; + __u64 read_count; + __u64 read_bytes; + __u64 write_count; + __u64 write_bytes; + __u32 layout_type; + layoutstats_encode_t layoutstats_encode; + void *layout_private; +}; + +struct nfs42_layoutstat_args { + struct nfs4_sequence_args seq_args; + struct nfs_fh *fh; + struct inode *inode; + nfs4_stateid stateid; + int num_dev; + struct nfs42_layoutstat_devinfo *devinfo; +}; + +struct nfs42_layoutstat_res { + struct nfs4_sequence_res seq_res; + int num_dev; + int rpc_status; +}; + +struct nfs42_layoutstat_data { + struct inode *inode; + struct nfs42_layoutstat_args args; + struct nfs42_layoutstat_res res; +}; + struct stateowner_id { __u64 create_time; __u32 uniquifier; -- cgit v1.2.3 From 1bfe3b259ff2e579b2acb190f874397d53274497 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Tue, 23 Jun 2015 19:52:03 +0800 Subject: nfs42: serialize LAYOUTSTATS calls of the same file There is no need to report concurrently. Reviewed-by: Jeff Layton Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index b95f914ce083..f91b5ade30c9 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -219,6 +219,7 @@ struct nfs_inode { #define NFS_INO_COMMIT (7) /* inode is committing unstable writes */ #define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ #define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */ +#define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { -- cgit v1.2.3 From c181fb3e723351e2f7a1f76b6c0627a4b8ad1723 Mon Sep 17 00:00:00 2001 From: Alexander Sverdlin Date: Mon, 22 Jun 2015 22:38:53 +0200 Subject: ACPI / OF: Rename of_node() and acpi_node() to to_of_node() and to_acpi_node() Commit 8a0662d9 introduced of_node and acpi_node symbols in global namespace but there were already ~63 of_node local variables or function parameters (no single acpi_node though, but anyway). After debugging undefined but used of_node local varible (which turned out to reference static function of_node() instead) it became clear that the names for the functions are too short and too generic for global scope. Signed-off-by: Alexander Sverdlin Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 4 ++-- include/linux/of.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index e4da5e35e29c..ec3c98ed9dca 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -53,7 +53,7 @@ static inline acpi_handle acpi_device_handle(struct acpi_device *adev) return adev ? adev->handle : NULL; } -#define ACPI_COMPANION(dev) acpi_node((dev)->fwnode) +#define ACPI_COMPANION(dev) to_acpi_node((dev)->fwnode) #define ACPI_COMPANION_SET(dev, adev) set_primary_fwnode(dev, (adev) ? \ acpi_fwnode_handle(adev) : NULL) #define ACPI_HANDLE(dev) acpi_device_handle(ACPI_COMPANION(dev)) @@ -473,7 +473,7 @@ static inline bool is_acpi_node(struct fwnode_handle *fwnode) return false; } -static inline struct acpi_device *acpi_node(struct fwnode_handle *fwnode) +static inline struct acpi_device *to_acpi_node(struct fwnode_handle *fwnode) { return NULL; } diff --git a/include/linux/of.h b/include/linux/of.h index b871ff9d81d7..f05fdcea4e66 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -128,7 +128,7 @@ static inline bool is_of_node(struct fwnode_handle *fwnode) return fwnode && fwnode->type == FWNODE_OF; } -static inline struct device_node *of_node(struct fwnode_handle *fwnode) +static inline struct device_node *to_of_node(struct fwnode_handle *fwnode) { return fwnode ? container_of(fwnode, struct device_node, fwnode) : NULL; } @@ -387,7 +387,7 @@ static inline bool is_of_node(struct fwnode_handle *fwnode) return false; } -static inline struct device_node *of_node(struct fwnode_handle *fwnode) +static inline struct device_node *to_of_node(struct fwnode_handle *fwnode) { return NULL; } -- cgit v1.2.3 From 9200025724619d83f9fc366281f0bde36afe6e5a Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Fri, 12 Jun 2015 10:04:10 +0800 Subject: rtc: Introduce rtc_tm_sub() helper function There're many sites need comparing the two rtc_time variants for many rtc drivers, especially in the instances of rtc_class_ops::set_alarm(). So add this common helper function to make things easy. Suggested-by: Arnd Bergmann Signed-off-by: Xunlei Pang Signed-off-by: Alexandre Belloni --- include/linux/rtc.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 587017e7939c..b36160321458 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -24,6 +24,14 @@ extern void rtc_time64_to_tm(time64_t time, struct rtc_time *tm); ktime_t rtc_tm_to_ktime(struct rtc_time tm); struct rtc_time rtc_ktime_to_tm(ktime_t kt); +/* + * rtc_tm_sub - Return the difference in seconds. + */ +static inline time64_t rtc_tm_sub(struct rtc_time *lhs, struct rtc_time *rhs) +{ + return rtc_tm_to_time64(lhs) - rtc_tm_to_time64(rhs); +} + /** * Deprecated. Use rtc_time64_to_tm(). */ -- cgit v1.2.3 From c86a6c28957a9e8e9a71582a32e96971ad411ffe Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Fri, 12 Jun 2015 11:10:18 +0800 Subject: rtc: interface: Remove rtc_set_mmss() Now rtc_set_mmss() has no users, just remove it. We still have rtc_set_time() doing similar things. Signed-off-by: Xunlei Pang Signed-off-by: Alexandre Belloni --- include/linux/rtc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index b36160321458..3359f0422c6b 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -168,7 +168,6 @@ extern void devm_rtc_device_unregister(struct device *dev, extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm); extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm); -extern int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs); extern int rtc_set_ntp_time(struct timespec64 now); int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm); extern int rtc_read_alarm(struct rtc_device *rtc, -- cgit v1.2.3 From c3cddc4c296c3a1498df7181015cbcea178a0546 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 24 Jun 2015 16:54:45 -0700 Subject: fsnotify: remove obsolete documentation should_send_event is no longer part of struct fsnotify_ops, so remove it. Signed-off-by: Nikolay Borisov Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fsnotify_backend.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 0f313f93c586..65a517dd32f7 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -84,8 +84,6 @@ struct fsnotify_fname; * Each group much define these ops. The fsnotify infrastructure will call * these operations for each relevant group. * - * should_send_event - given a group, inode, and mask this function determines - * if the group is interested in this event. * handle_event - main call for a group to handle an fs event * free_group_priv - called when a group refcnt hits 0 to clean up the private union * freeing_mark - called when a mark is being destroyed for some reason. The group -- cgit v1.2.3 From 5286d20c4eb7b0c29217f8756652609df74f5489 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 24 Jun 2015 16:54:51 -0700 Subject: configfs: unexport/make static config_item_init() config_item_init() is only used in item.c Signed-off-by: Fabian Frederick Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/configfs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/configfs.h b/include/linux/configfs.h index 34025df61829..c9e5c57e4edf 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -71,7 +71,6 @@ static inline char *config_item_name(struct config_item * item) return item->ci_name; } -extern void config_item_init(struct config_item *); extern void config_item_init_type_name(struct config_item *item, const char *name, struct config_item_type *type); -- cgit v1.2.3 From b5242e98c1cb834feb1e84026f09a4796b49eb4d Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Wed, 24 Jun 2015 16:55:42 -0700 Subject: smpboot: allow excluding cpus from the smpboot threads This patch series allows the watchdog to run by default only on the housekeeping cores when nohz_full is in effect; this seems to be a good compromise short of turning it off completely (since the nohz_full cores can't tolerate a watchdog). To provide customizability, we add /proc/sys/kernel/watchdog_cpumask so that the set of cores running the watchdog can be tuned to different values after bootup. To implement this customizability, we add a new smpboot_update_cpumask_percpu_thread() API to the smpboot_thread subsystem that lets us park or unpark "unwanted" threads. And now that threads can be parked for long periods of time, we tweak the /proc//stat and /proc//status code so parked threads aren't reported as running, which is otherwise confusing. This patch (of 3): This change allows some cores to be excluded from running the smp_hotplug_thread tasks. The following commit to update kernel/watchdog.c to use this functionality is the motivating example, and more information on the motivation is provided there. A new smp_hotplug_thread field is introduced, "cpumask", which is cpumask field managed by the smpboot subsystem that indicates whether or not the given smp_hotplug_thread should run on that core; the cpumask is checked when deciding whether to unpark the thread. To limit the cpumask to less than cpu_possible, you must call smpboot_update_cpumask_percpu_thread() after registering. Signed-off-by: Chris Metcalf Cc: Don Zickus Cc: Ingo Molnar Cc: Ulrich Obergfell Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smpboot.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h index d600afb21926..da3c593f9845 100644 --- a/include/linux/smpboot.h +++ b/include/linux/smpboot.h @@ -27,6 +27,8 @@ struct smpboot_thread_data; * @pre_unpark: Optional unpark function, called before the thread is * unparked (cpu online). This is not guaranteed to be * called on the target cpu of the thread. Careful! + * @cpumask: Internal state. To update which threads are unparked, + * call smpboot_update_cpumask_percpu_thread(). * @selfparking: Thread is not parked by the park function. * @thread_comm: The base name of the thread */ @@ -41,11 +43,14 @@ struct smp_hotplug_thread { void (*park)(unsigned int cpu); void (*unpark)(unsigned int cpu); void (*pre_unpark)(unsigned int cpu); + cpumask_var_t cpumask; bool selfparking; const char *thread_comm; }; int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread); void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); +int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, + const struct cpumask *); #endif -- cgit v1.2.3 From fe4ba3c34352b7e8068b7f18eb233444aed17011 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Wed, 24 Jun 2015 16:55:45 -0700 Subject: watchdog: add watchdog_cpumask sysctl to assist nohz Change the default behavior of watchdog so it only runs on the housekeeping cores when nohz_full is enabled at build and boot time. Allow modifying the set of cores the watchdog is currently running on with a new kernel.watchdog_cpumask sysctl. In the current system, the watchdog subsystem runs a periodic timer that schedules the watchdog kthread to run. However, nohz_full cores are designed to allow userspace application code running on those cores to have 100% access to the CPU. So the watchdog system prevents the nohz_full application code from being able to run the way it wants to, thus the motivation to suppress the watchdog on nohz_full cores, which this patchset provides by default. However, if we disable the watchdog globally, then the housekeeping cores can't benefit from the watchdog functionality. So we allow disabling it only on some cores. See Documentation/lockup-watchdogs.txt for more information. [jhubbard@nvidia.com: fix a watchdog crash in some configurations] Signed-off-by: Chris Metcalf Acked-by: Don Zickus Cc: Ingo Molnar Cc: Ulrich Obergfell Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nmi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 3d46fb4708e0..f94da0e65dea 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -67,6 +67,7 @@ extern int nmi_watchdog_enabled; extern int soft_watchdog_enabled; extern int watchdog_user_enabled; extern int watchdog_thresh; +extern unsigned long *watchdog_cpumask_bits; extern int sysctl_softlockup_all_cpu_backtrace; struct ctl_table; extern int proc_watchdog(struct ctl_table *, int , @@ -77,6 +78,8 @@ extern int proc_soft_watchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); extern int proc_watchdog_thresh(struct ctl_table *, int , void __user *, size_t *, loff_t *); +extern int proc_watchdog_cpumask(struct ctl_table *, int, + void __user *, size_t *, loff_t *); #endif #ifdef CONFIG_HAVE_ACPI_APEI_NMI -- cgit v1.2.3 From 4066c33d0308f87e9a3b0c7fafb9141c0bfbfa77 Mon Sep 17 00:00:00 2001 From: Gavin Guo Date: Wed, 24 Jun 2015 16:55:54 -0700 Subject: mm/slab_common: support the slub_debug boot option on specific object size The slub_debug=PU,kmalloc-xx cannot work because in the create_kmalloc_caches() the s->name is created after the create_kmalloc_cache() is called. The name is NULL in the create_kmalloc_cache() so the kmem_cache_flags() would not set the slub_debug flags to the s->flags. The fix here set up a kmalloc_names string array for the initialization purpose and delete the dynamic name creation of kmalloc_caches. [akpm@linux-foundation.org: s/kmalloc_names/kmalloc_info/, tweak comment text] Signed-off-by: Gavin Guo Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index ffd24c830151..96f0ea506b5c 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -153,8 +153,30 @@ size_t ksize(const void *); #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN #define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN #define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN) +/* + * The KMALLOC_LOOP_LOW is the definition for the for loop index start number + * to create the kmalloc_caches object in create_kmalloc_caches(). The first + * and the second are 96 and 192. You can see that in the kmalloc_index(), if + * the KMALLOC_MIN_SIZE <= 32, then return 1 (96). If KMALLOC_MIN_SIZE <= 64, + * then return 2 (192). If the KMALLOC_MIN_SIZE is bigger than 64, we don't + * need to initialize 96 and 192. Go directly to start the KMALLOC_SHIFT_LOW. + */ +#if KMALLOC_MIN_SIZE <= 32 +#define KMALLOC_LOOP_LOW 1 +#elif KMALLOC_MIN_SIZE <= 64 +#define KMALLOC_LOOP_LOW 2 +#else +#define KMALLOC_LOOP_LOW KMALLOC_SHIFT_LOW +#endif + #else #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) +/* + * The KMALLOC_MIN_SIZE of slub/slab/slob is 2^3/2^5/2^3. So, even slab is used. + * The KMALLOC_MIN_SIZE <= 32. The kmalloc-96 and kmalloc-192 should also be + * initialized. + */ +#define KMALLOC_LOOP_LOW 1 #endif /* -- cgit v1.2.3 From 1ed58b6051b67e5cfe9e465fb60bf7d5f55e0a64 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 24 Jun 2015 16:55:59 -0700 Subject: linux/slab.h: fix three off-by-one typos in comment The first is a keyboard-off-by-one, the other two the ordinary mathy kind. Signed-off-by: Rasmus Villemoes Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 96f0ea506b5c..9de2fdc8b5e4 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -262,8 +262,8 @@ extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; * belongs to. * 0 = zero alloc * 1 = 65 .. 96 bytes - * 2 = 120 .. 192 bytes - * n = 2^(n-1) .. 2^n -1 + * 2 = 129 .. 192 bytes + * n = 2^(n-1)+1 .. 2^n */ static __always_inline int kmalloc_index(size_t size) { -- cgit v1.2.3 From 2ae416b142b625c58c9ccb039aa3ef48ad0e9bae Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Wed, 24 Jun 2015 16:56:16 -0700 Subject: mm: new mm hook framework CRIU is recreating the process memory layout by remapping the checkpointee memory area on top of the current process (criu). This includes remapping the vDSO to the place it has at checkpoint time. However some architectures like powerpc are keeping a reference to the vDSO base address to build the signal return stack frame by calling the vDSO sigreturn service. So once the vDSO has been moved, this reference is no more valid and the signal frame built later are not usable. This patch serie is introducing a new mm hook framework, and a new arch_remap hook which is called when mremap is done and the mm lock still hold. The next patch is adding the vDSO remap and unmap tracking to the powerpc architecture. This patch (of 3): This patch introduces a new set of header file to manage mm hooks: - per architecture empty header file (arch/x/include/asm/mm-arch-hooks.h) - a generic header (include/linux/mm-arch-hooks.h) The architecture which need to overwrite a hook as to redefine it in its header file, while architecture which doesn't need have nothing to do. The default hooks are defined in the generic header and are used in the case the architecture is not defining it. In a next step, mm hooks defined in include/asm-generic/mm_hooks.h should be moved here. Signed-off-by: Laurent Dufour Suggested-by: Andrew Morton Cc: "Kirill A. Shutemov" Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Cc: Pavel Emelyanov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm-arch-hooks.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 include/linux/mm-arch-hooks.h (limited to 'include/linux') diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h new file mode 100644 index 000000000000..63005e367abd --- /dev/null +++ b/include/linux/mm-arch-hooks.h @@ -0,0 +1,16 @@ +/* + * Generic mm no-op hooks. + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef _LINUX_MM_ARCH_HOOKS_H +#define _LINUX_MM_ARCH_HOOKS_H + +#include + +#endif /* _LINUX_MM_ARCH_HOOKS_H */ -- cgit v1.2.3 From 4abad2ca4a4dbdd4a218c12451231ab628f2e60c Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Wed, 24 Jun 2015 16:56:19 -0700 Subject: mm: new arch_remap() hook Some architectures would like to be triggered when a memory area is moved through the mremap system call. This patch introduces a new arch_remap() mm hook which is placed in the path of mremap, and is called before the old area is unmapped (and the arch_unmap() hook is called). Signed-off-by: Laurent Dufour Cc: "Kirill A. Shutemov" Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Cc: Pavel Emelyanov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm-arch-hooks.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h index 63005e367abd..4efc3f56e6df 100644 --- a/include/linux/mm-arch-hooks.h +++ b/include/linux/mm-arch-hooks.h @@ -13,4 +13,13 @@ #include +#ifndef arch_remap +static inline void arch_remap(struct mm_struct *mm, + unsigned long old_start, unsigned long old_end, + unsigned long new_start, unsigned long new_end) +{ +} +#define arch_remap arch_remap +#endif + #endif /* _LINUX_MM_ARCH_HOOKS_H */ -- cgit v1.2.3 From a9919c79359df9a706ff9e14fc0a93cd15791c9b Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 24 Jun 2015 16:56:28 -0700 Subject: mm: only define hashdist variable when needed For !CONFIG_NUMA, hashdist will always be 0, since it's setter is otherwise compiled out. So we can save 4 bytes of data and some .text (although mostly in __init functions) by only defining it for CONFIG_NUMA. Signed-off-by: Rasmus Villemoes Acked-by: David Rientjes Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 0995c2de8162..f589222bfa87 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -357,12 +357,12 @@ extern void *alloc_large_system_hash(const char *tablename, /* Only NUMA needs hash distribution. 64bit NUMA architectures have * sufficient vmalloc space. */ -#if defined(CONFIG_NUMA) && defined(CONFIG_64BIT) -#define HASHDIST_DEFAULT 1 +#ifdef CONFIG_NUMA +#define HASHDIST_DEFAULT IS_ENABLED(CONFIG_64BIT) +extern int hashdist; /* Distribute hashes across NUMA nodes? */ #else -#define HASHDIST_DEFAULT 0 +#define hashdist (0) #endif -extern int hashdist; /* Distribute hashes across NUMA nodes? */ #endif /* _LINUX_BOOTMEM_H */ -- cgit v1.2.3 From c761471b58e6138938ebc6eafec20b2f60cb3397 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 24 Jun 2015 16:56:33 -0700 Subject: mm: avoid tail page refcounting on non-THP compound pages Reintroduce 8d63d99a5dfb ("mm: avoid tail page refcounting on non-THP compound pages") after removing bogus VM_BUG_ON_PAGE() in put_unrefcounted_compound_page(). THP uses tail page refcounting to be able to split huge pages at any time. Tail page refcounting is not needed for other users of compound pages and it's harmful because of overhead. We try to exclude non-THP pages from tail page refcounting using __compound_tail_refcounted() check. It excludes most common non-THP compound pages: SL*B and hugetlb, but it doesn't catch rest of __GFP_COMP users -- drivers. And it's not only about overhead. Drivers might want to use compound pages to get refcounting semantics suitable for mapping high-order pages to userspace. But tail page refcounting breaks it. Tail page refcounting uses ->_mapcount in tail pages to store GUP pins on them. It means GUP pins would affect page_mapcount() for tail pages. It's not a problem for THP, because it never maps tail pages. But unlike THP, drivers map parts of compound pages with PTEs and it makes page_mapcount() be called for tail pages. In particular, GUP pins would shift PSS up and affect /proc/kpagecount for such pages. But, I'm not aware about anything which can lead to crash or other serious misbehaviour. Since currently all THP pages are anonymous and all drivers pages are not, we can fix the __compound_tail_refcounted() check by requiring PageAnon() to enable tail page refcounting. Signed-off-by: Kirill A. Shutemov Acked-by: Hugh Dickins Reviewed-by: Andrea Arcangeli Reported-by: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0755b9fd03a7..8b086070c3a5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -499,7 +499,7 @@ static inline int page_count(struct page *page) static inline bool __compound_tail_refcounted(struct page *page) { - return !PageSlab(page) && !PageHeadHuge(page); + return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page); } /* -- cgit v1.2.3 From ead07f6a867b5b1b41cf703735e8b39094987a7d Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 24 Jun 2015 16:56:48 -0700 Subject: mm/memory-failure: introduce get_hwpoison_page() for consistent refcount handling memory_failure() can run in 2 different mode (specified by MF_COUNT_INCREASED) in page refcount perspective. When MF_COUNT_INCREASED is set, memory_failure() assumes that the caller takes a refcount of the target page. And if cleared, memory_failure() takes it in it's own. In current code, however, refcounting is done differently in each caller. For example, madvise_hwpoison() uses get_user_pages_fast() and hwpoison_inject() uses get_page_unless_zero(). So this inconsistent refcounting causes refcount failure especially for thp tail pages. Typical user visible effects are like memory leak or VM_BUG_ON_PAGE(!page_count(page)) in isolate_lru_page(). To fix this refcounting issue, this patch introduces get_hwpoison_page() to handle thp tail pages in the same manner for each caller of hwpoison code. memory_failure() might fail to split thp and in such case it returns without completing page isolation. This is not good because PageHWPoison on the thp is still set and there's no easy way to unpoison such thps. So this patch try to roll back any action to the thp in "non anonymous thp" case and "thp split failed" case, expecting an MCE(SRAR) generated by later access afterward will properly free such thps. [akpm@linux-foundation.org: fix CONFIG_HWPOISON_INJECT=m] Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8b086070c3a5..cd9df66138a1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2146,6 +2146,7 @@ enum mf_flags { extern int memory_failure(unsigned long pfn, int trapno, int flags); extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); extern int unpoison_memory(unsigned long pfn); +extern int get_hwpoison_page(struct page *page); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p, int access); -- cgit v1.2.3 From 16e951966f05da5ccd650104176f6ba289f7fa20 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 24 Jun 2015 16:57:07 -0700 Subject: mm: oom_kill: clean up victim marking and exiting interfaces Rename unmark_oom_victim() to exit_oom_victim(). Marking and unmarking are related in functionality, but the interface is not symmetrical at all: one is an internal OOM killer function used during the killing, the other is for an OOM victim to signal its own death on exit later on. This has locking implications, see follow-up changes. While at it, rename mark_tsk_oom_victim() to mark_oom_victim(), which is easier on the eye. Signed-off-by: Johannes Weiner Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Tetsuo Handa Cc: Andrea Arcangeli Cc: Dave Chinner Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 44b2f6f7bbd8..a8e6a498cbcb 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -47,9 +47,7 @@ static inline bool oom_task_origin(const struct task_struct *p) return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN); } -extern void mark_tsk_oom_victim(struct task_struct *tsk); - -extern void unmark_oom_victim(void); +extern void mark_oom_victim(struct task_struct *tsk); extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, @@ -75,6 +73,9 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *mask, bool force_kill); + +extern void exit_oom_victim(void); + extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); -- cgit v1.2.3 From dc56401fc9f25e8f93899991ec858c98a331d88c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 24 Jun 2015 16:57:19 -0700 Subject: mm: oom_kill: simplify OOM killer locking The zonelist locking and the oom_sem are two overlapping locks that are used to serialize global OOM killing against different things. The historical zonelist locking serializes OOM kills from allocations with overlapping zonelists against each other to prevent killing more tasks than necessary in the same memory domain. Only when neither tasklists nor zonelists from two concurrent OOM kills overlap (tasks in separate memcgs bound to separate nodes) are OOM kills allowed to execute in parallel. The younger oom_sem is a read-write lock to serialize OOM killing against the PM code trying to disable the OOM killer altogether. However, the OOM killer is a fairly cold error path, there is really no reason to optimize for highly performant and concurrent OOM kills. And the oom_sem is just flat-out redundant. Replace both locking schemes with a single global mutex serializing OOM kills regardless of context. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: David Rientjes Cc: Tetsuo Handa Cc: Andrea Arcangeli Cc: Dave Chinner Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index a8e6a498cbcb..7deecb7bca5e 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -32,6 +32,8 @@ enum oom_scan_t { /* Thread is the potential origin of an oom condition; kill first on oom */ #define OOM_FLAG_ORIGIN ((__force oom_flags_t)0x1) +extern struct mutex oom_lock; + static inline void set_current_oom_origin(void) { current->signal->oom_flags |= OOM_FLAG_ORIGIN; @@ -60,9 +62,6 @@ extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct mem_cgroup *memcg, nodemask_t *nodemask, const char *message); -extern bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_flags); -extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags); - extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, int order, const nodemask_t *nodemask, struct mem_cgroup *memcg); -- cgit v1.2.3 From cc637b1704d78b068c2eb700eec384c69ea56cdf Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Wed, 24 Jun 2015 16:57:30 -0700 Subject: memory-failure: export page_type and action result Export 'outcome' and 'action_page_type' to mm.h, so we could use this emnus outside. This patch is preparation for adding trace events for memory-failure recovery action. Signed-off-by: Xie XiuQi Acked-by: Naoya Horiguchi Cc: Chen Gong Cc: Jim Davis Cc: Steven Rostedt Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index cd9df66138a1..986a9a221ee0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2153,6 +2153,40 @@ extern void shake_page(struct page *p, int access); extern atomic_long_t num_poisoned_pages; extern int soft_offline_page(struct page *page, int flags); + +/* + * Error handlers for various types of pages. + */ +enum mf_outcome { + MF_IGNORED, /* Error: cannot be handled */ + MF_FAILED, /* Error: handling failed */ + MF_DELAYED, /* Will be handled later */ + MF_RECOVERED, /* Successfully recovered */ +}; + +enum mf_action_page_type { + MF_MSG_KERNEL, + MF_MSG_KERNEL_HIGH_ORDER, + MF_MSG_SLAB, + MF_MSG_DIFFERENT_COMPOUND, + MF_MSG_POISONED_HUGE, + MF_MSG_HUGE, + MF_MSG_FREE_HUGE, + MF_MSG_UNMAP_FAILED, + MF_MSG_DIRTY_SWAPCACHE, + MF_MSG_CLEAN_SWAPCACHE, + MF_MSG_DIRTY_MLOCKED_LRU, + MF_MSG_CLEAN_MLOCKED_LRU, + MF_MSG_DIRTY_UNEVICTABLE_LRU, + MF_MSG_CLEAN_UNEVICTABLE_LRU, + MF_MSG_DIRTY_LRU, + MF_MSG_CLEAN_LRU, + MF_MSG_TRUNCATED_LRU, + MF_MSG_BUDDY, + MF_MSG_BUDDY_2ND, + MF_MSG_UNKNOWN, +}; + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) extern void clear_huge_page(struct page *page, unsigned long addr, -- cgit v1.2.3 From cc3e2af42e7b7e0457b93bf17c19b44c635cd40c Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Wed, 24 Jun 2015 16:57:33 -0700 Subject: memory-failure: change type of action_result's param 3 to enum Change type of action_result's param 3 to enum for type consistency, and rename mf_outcome to mf_result for clearly. Signed-off-by: Xie XiuQi Acked-by: Naoya Horiguchi Cc: Chen Gong Cc: Jim Davis Cc: Steven Rostedt Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 986a9a221ee0..24ad583596d1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2157,7 +2157,7 @@ extern int soft_offline_page(struct page *page, int flags); /* * Error handlers for various types of pages. */ -enum mf_outcome { +enum mf_result { MF_IGNORED, /* Error: cannot be handled */ MF_FAILED, /* Error: handling failed */ MF_DELAYED, /* Will be handled later */ -- cgit v1.2.3 From 8809aa2d28d74111ff2f1928edaa4e9845c97a7d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 24 Jun 2015 16:57:44 -0700 Subject: mm: clarify that the function operates on hugepage pte We have confusing functions to clear pmd, pmd_clear_* and pmd_clear. Add _huge_ to pmdp_clear functions so that we are clear that they operate on hugepage pte. We don't bother about other functions like pmdp_set_wrprotect, pmdp_clear_flush_young, because they operate on PTE bits and hence indicate they are operating on hugepage ptes Signed-off-by: Aneesh Kumar K.V Acked-by: Kirill A. Shutemov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Andrea Arcangeli Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 95243d28a0ee..61cd67f4d788 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -324,25 +324,25 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) ___pte; \ }) -#define pmdp_clear_flush_notify(__vma, __haddr, __pmd) \ +#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd) \ ({ \ unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ struct mm_struct *___mm = (__vma)->vm_mm; \ pmd_t ___pmd; \ \ - ___pmd = pmdp_clear_flush(__vma, __haddr, __pmd); \ + ___pmd = pmdp_huge_clear_flush(__vma, __haddr, __pmd); \ mmu_notifier_invalidate_range(___mm, ___haddr, \ ___haddr + HPAGE_PMD_SIZE); \ \ ___pmd; \ }) -#define pmdp_get_and_clear_notify(__mm, __haddr, __pmd) \ +#define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd) \ ({ \ unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ pmd_t ___pmd; \ \ - ___pmd = pmdp_get_and_clear(__mm, __haddr, __pmd); \ + ___pmd = pmdp_huge_get_and_clear(__mm, __haddr, __pmd); \ mmu_notifier_invalidate_range(__mm, ___haddr, \ ___haddr + HPAGE_PMD_SIZE); \ \ @@ -428,8 +428,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_flush_notify ptep_clear_flush -#define pmdp_clear_flush_notify pmdp_clear_flush -#define pmdp_get_and_clear_notify pmdp_get_and_clear +#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush +#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear #define set_pte_at_notify set_pte_at #endif /* CONFIG_MMU_NOTIFIER */ -- cgit v1.2.3 From fc6daaf93151877748f8096af6b3fddb147f22d6 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 24 Jun 2015 16:58:09 -0700 Subject: mm/memblock: add extra "flags" to memblock to allow selection of memory based on attribute Some high end Intel Xeon systems report uncorrectable memory errors as a recoverable machine check. Linux has included code for some time to process these and just signal the affected processes (or even recover completely if the error was in a read only page that can be replaced by reading from disk). But we have no recovery path for errors encountered during kernel code execution. Except for some very specific cases were are unlikely to ever be able to recover. Enter memory mirroring. Actually 3rd generation of memory mirroing. Gen1: All memory is mirrored Pro: No s/w enabling - h/w just gets good data from other side of the mirror Con: Halves effective memory capacity available to OS/applications Gen2: Partial memory mirror - just mirror memory begind some memory controllers Pro: Keep more of the capacity Con: Nightmare to enable. Have to choose between allocating from mirrored memory for safety vs. NUMA local memory for performance Gen3: Address range partial memory mirror - some mirror on each memory controller Pro: Can tune the amount of mirror and keep NUMA performance Con: I have to write memory management code to implement The current plan is just to use mirrored memory for kernel allocations. This has been broken into two phases: 1) This patch series - find the mirrored memory, use it for boot time allocations 2) Wade into mm/page_alloc.c and define a ZONE_MIRROR to pick up the unused mirrored memory from mm/memblock.c and only give it out to select kernel allocations (this is still being scoped because page_alloc.c is scary). This patch (of 3): Add extra "flags" to memblock to allow selection of memory based on attribute. No functional changes Signed-off-by: Tony Luck Cc: Xishi Qiu Cc: Hanjun Guo Cc: Xiexiuqi Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Yinghai Lu Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 9497ec7c77ea..7aeec0cb4c27 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -21,7 +21,10 @@ #define INIT_PHYSMEM_REGIONS 4 /* Definition of memblock flags. */ -#define MEMBLOCK_HOTPLUG 0x1 /* hotpluggable region */ +enum { + MEMBLOCK_NONE = 0x0, /* No special request */ + MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */ +}; struct memblock_region { phys_addr_t base; @@ -61,7 +64,7 @@ extern bool movable_node_enabled; phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end, - int nid); + int nid, ulong flags); phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align); phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); @@ -85,11 +88,13 @@ int memblock_remove_range(struct memblock_type *type, phys_addr_t base, phys_addr_t size); -void __next_mem_range(u64 *idx, int nid, struct memblock_type *type_a, +void __next_mem_range(u64 *idx, int nid, ulong flags, + struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, phys_addr_t *out_end, int *out_nid); -void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a, +void __next_mem_range_rev(u64 *idx, int nid, ulong flags, + struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, phys_addr_t *out_end, int *out_nid); @@ -100,16 +105,17 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a, * @type_a: ptr to memblock_type to iterate * @type_b: ptr to memblock_type which excludes from the iteration * @nid: node selector, %NUMA_NO_NODE for all nodes + * @flags: pick from blocks based on memory attributes * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL */ -#define for_each_mem_range(i, type_a, type_b, nid, \ +#define for_each_mem_range(i, type_a, type_b, nid, flags, \ p_start, p_end, p_nid) \ - for (i = 0, __next_mem_range(&i, nid, type_a, type_b, \ + for (i = 0, __next_mem_range(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid); \ i != (u64)ULLONG_MAX; \ - __next_mem_range(&i, nid, type_a, type_b, \ + __next_mem_range(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid)) /** @@ -119,17 +125,18 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a, * @type_a: ptr to memblock_type to iterate * @type_b: ptr to memblock_type which excludes from the iteration * @nid: node selector, %NUMA_NO_NODE for all nodes + * @flags: pick from blocks based on memory attributes * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL */ -#define for_each_mem_range_rev(i, type_a, type_b, nid, \ +#define for_each_mem_range_rev(i, type_a, type_b, nid, flags, \ p_start, p_end, p_nid) \ for (i = (u64)ULLONG_MAX, \ - __next_mem_range_rev(&i, nid, type_a, type_b, \ + __next_mem_range_rev(&i, nid, flags, type_a, type_b,\ p_start, p_end, p_nid); \ i != (u64)ULLONG_MAX; \ - __next_mem_range_rev(&i, nid, type_a, type_b, \ + __next_mem_range_rev(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid)) #ifdef CONFIG_MOVABLE_NODE @@ -181,13 +188,14 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL + * @flags: pick from blocks based on memory attributes * * Walks over free (memory && !reserved) areas of memblock. Available as * soon as memblock is initialized. */ -#define for_each_free_mem_range(i, nid, p_start, p_end, p_nid) \ +#define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid) \ for_each_mem_range(i, &memblock.memory, &memblock.reserved, \ - nid, p_start, p_end, p_nid) + nid, flags, p_start, p_end, p_nid) /** * for_each_free_mem_range_reverse - rev-iterate through free memblock areas @@ -196,13 +204,15 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL + * @flags: pick from blocks based on memory attributes * * Walks over free (memory && !reserved) areas of memblock in reverse * order. Available as soon as memblock is initialized. */ -#define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid) \ +#define for_each_free_mem_range_reverse(i, nid, flags, p_start, p_end, \ + p_nid) \ for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ - nid, p_start, p_end, p_nid) + nid, flags, p_start, p_end, p_nid) static inline void memblock_set_region_flags(struct memblock_region *r, unsigned long flags) @@ -273,7 +283,8 @@ static inline bool memblock_bottom_up(void) { return false; } #define MEMBLOCK_ALLOC_ACCESSIBLE 0 phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, - phys_addr_t start, phys_addr_t end); + phys_addr_t start, phys_addr_t end, + ulong flags); phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr); phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align, -- cgit v1.2.3 From a3f5bafcc04aaf62990e0cf3ced1cc6d8dc6fe95 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 24 Jun 2015 16:58:12 -0700 Subject: mm/memblock: allocate boot time data structures from mirrored memory Try to allocate all boot time kernel data structures from mirrored memory. If we run out of mirrored memory print warnings, but fall back to using non-mirrored memory to make sure that we still boot. By number of bytes, most of what we allocate at boot time is the page structures. 64 bytes per 4K page on x86_64 ... or about 1.5% of total system memory. For workloads where the bulk of memory is allocated to applications this may represent a useful improvement to system availability since 1.5% of total memory might be a third of the memory allocated to the kernel. Signed-off-by: Tony Luck Cc: Xishi Qiu Cc: Hanjun Guo Cc: Xiexiuqi Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Yinghai Lu Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 7aeec0cb4c27..0215ffd63069 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -24,6 +24,7 @@ enum { MEMBLOCK_NONE = 0x0, /* No special request */ MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */ + MEMBLOCK_MIRROR = 0x2, /* mirrored region */ }; struct memblock_region { @@ -78,6 +79,8 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size); void memblock_trim_memory(phys_addr_t align); int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); +int memblock_mark_mirror(phys_addr_t base, phys_addr_t size); +ulong choose_memblock_flags(void); /* Low level functions */ int memblock_add_range(struct memblock_type *type, @@ -160,6 +163,11 @@ static inline bool movable_node_is_enabled(void) } #endif +static inline bool memblock_is_mirror(struct memblock_region *m) +{ + return m->flags & MEMBLOCK_MIRROR; +} + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); -- cgit v1.2.3 From b05b9f5f9dcf593a0e9327676b78e6c17b4218e8 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 24 Jun 2015 16:58:15 -0700 Subject: x86, mirror: x86 enabling - find mirrored memory ranges UEFI GetMemoryMap() uses a new attribute bit to mark mirrored memory address ranges. See UEFI 2.5 spec pages 157-158: http://www.uefi.org/sites/default/files/resources/UEFI%202_5.pdf On EFI enabled systems scan the memory map and tell memblock about any mirrored ranges. Signed-off-by: Tony Luck Cc: Xishi Qiu Cc: Hanjun Guo Cc: Xiexiuqi Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Yinghai Lu Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/efi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 2092965afca3..5f19efe4eb3f 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -96,6 +96,8 @@ typedef struct { #define EFI_MEMORY_WP ((u64)0x0000000000001000ULL) /* write-protect */ #define EFI_MEMORY_RP ((u64)0x0000000000002000ULL) /* read-protect */ #define EFI_MEMORY_XP ((u64)0x0000000000004000ULL) /* execute-protect */ +#define EFI_MEMORY_MORE_RELIABLE \ + ((u64)0x0000000000010000ULL) /* higher reliability */ #define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */ #define EFI_MEMORY_DESCRIPTOR_VERSION 1 @@ -868,6 +870,7 @@ extern void efi_enter_virtual_mode (void); /* switch EFI to virtual mode, if pos extern void efi_late_init(void); extern void efi_free_boot_services(void); extern efi_status_t efi_query_variable_store(u32 attributes, unsigned long size); +extern void efi_find_mirror(void); #else static inline void efi_late_init(void) {} static inline void efi_free_boot_services(void) {} -- cgit v1.2.3 From d1dc6f1bcf1e998e7ce65fc120da371ab047a999 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 24 Jun 2015 16:58:18 -0700 Subject: frontswap: allow multiple backends Change frontswap single pointer to a singly linked list of frontswap implementations. Update Xen tmem implementation as register no longer returns anything. Frontswap only keeps track of a single implementation; any implementation that registers second (or later) will replace the previously registered implementation, and gets a pointer to the previous implementation that the new implementation is expected to pass all frontswap functions to if it can't handle the function itself. However that method doesn't really make much sense, as passing that work on to every implementation adds unnecessary work to implementations; instead, frontswap should simply keep a list of all registered implementations and try each implementation for any function. Most importantly, neither of the two currently existing frontswap implementations in the kernel actually do anything with any previous frontswap implementation that they replace when registering. This allows frontswap to successfully manage multiple implementations by keeping a list of them all. Signed-off-by: Dan Streetman Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: David Vrabel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 8293262401de..e65ef959546c 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -6,16 +6,16 @@ #include struct frontswap_ops { - void (*init)(unsigned); - int (*store)(unsigned, pgoff_t, struct page *); - int (*load)(unsigned, pgoff_t, struct page *); - void (*invalidate_page)(unsigned, pgoff_t); - void (*invalidate_area)(unsigned); + void (*init)(unsigned); /* this swap type was just swapon'ed */ + int (*store)(unsigned, pgoff_t, struct page *); /* store a page */ + int (*load)(unsigned, pgoff_t, struct page *); /* load a page */ + void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */ + void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */ + struct frontswap_ops *next; /* private pointer to next ops */ }; extern bool frontswap_enabled; -extern struct frontswap_ops * - frontswap_register_ops(struct frontswap_ops *ops); +extern void frontswap_register_ops(struct frontswap_ops *ops); extern void frontswap_shrink(unsigned long); extern unsigned long frontswap_curr_pages(void); extern void frontswap_writethrough(bool); -- cgit v1.2.3 From 8a8c35fadfaf55629a37ef1a8ead1b8fb32581d2 Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Wed, 24 Jun 2015 16:58:51 -0700 Subject: mm: kmemleak_alloc_percpu() should follow the gfp from per_alloc() Beginning at commit d52d3997f843 ("ipv6: Create percpu rt6_info"), the following INFO splat is logged: =============================== [ INFO: suspicious RCU usage. ] 4.1.0-rc7-next-20150612 #1 Not tainted ------------------------------- kernel/sched/core.c:7318 Illegal context switch in RCU-bh read-side critical section! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 3 locks held by systemd/1: #0: (rtnl_mutex){+.+.+.}, at: [] rtnetlink_rcv+0x1f/0x40 #1: (rcu_read_lock_bh){......}, at: [] ipv6_add_addr+0x62/0x540 #2: (addrconf_hash_lock){+...+.}, at: [] ipv6_add_addr+0x184/0x540 stack backtrace: CPU: 0 PID: 1 Comm: systemd Not tainted 4.1.0-rc7-next-20150612 #1 Hardware name: TOSHIBA TECRA A50-A/TECRA A50-A, BIOS Version 4.20 04/17/2014 Call Trace: dump_stack+0x4c/0x6e lockdep_rcu_suspicious+0xe7/0x120 ___might_sleep+0x1d5/0x1f0 __might_sleep+0x4d/0x90 kmem_cache_alloc+0x47/0x250 create_object+0x39/0x2e0 kmemleak_alloc_percpu+0x61/0xe0 pcpu_alloc+0x370/0x630 Additional backtrace lines are truncated. In addition, the above splat is followed by several "BUG: sleeping function called from invalid context at mm/slub.c:1268" outputs. As suggested by Martin KaFai Lau, these are the clue to the fix. Routine kmemleak_alloc_percpu() always uses GFP_KERNEL for its allocations, whereas it should follow the gfp from its callers. Reviewed-by: Catalin Marinas Reviewed-by: Kamalesh Babulal Acked-by: Martin KaFai Lau Signed-off-by: Larry Finger Cc: Martin KaFai Lau Cc: Catalin Marinas Cc: Tejun Heo Cc: Christoph Lameter Cc: [3.18+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmemleak.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index e705467ddb47..d0a1f99e24e3 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -28,7 +28,8 @@ extern void kmemleak_init(void) __ref; extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) __ref; -extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) __ref; +extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, + gfp_t gfp) __ref; extern void kmemleak_free(const void *ptr) __ref; extern void kmemleak_free_part(const void *ptr, size_t size) __ref; extern void kmemleak_free_percpu(const void __percpu *ptr) __ref; @@ -71,7 +72,8 @@ static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, gfp_t gfp) { } -static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) +static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, + gfp_t gfp) { } static inline void kmemleak_free(const void *ptr) -- cgit v1.2.3 From b94d5230d06eb930be82e67fb1a9a58271e78297 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 19 May 2015 22:54:31 -0400 Subject: libnvdimm, nfit: initial libnvdimm infrastructure and NFIT support A struct nvdimm_bus is the anchor device for registering nvdimm resources and interfaces, for example, a character control device, nvdimm devices, and I/O region devices. The ACPI NFIT (NVDIMM Firmware Interface Table) is one possible platform description for such non-volatile memory resources in a system. The nfit.ko driver attaches to the "ACPI0012" device that indicates the presence of the NFIT and parses the table to register a struct nvdimm_bus instance. Cc: Cc: Lv Zheng Cc: Robert Moore Cc: Rafael J. Wysocki Acked-by: Jeff Moyer Acked-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Tested-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 include/linux/libnvdimm.h (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h new file mode 100644 index 000000000000..2b3c63950c91 --- /dev/null +++ b/include/linux/libnvdimm.h @@ -0,0 +1,34 @@ +/* + * libnvdimm - Non-volatile-memory Devices Subsystem + * + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __LIBNVDIMM_H__ +#define __LIBNVDIMM_H__ +struct nvdimm; +struct nvdimm_bus_descriptor; +typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, + struct nvdimm *nvdimm, unsigned int cmd, void *buf, + unsigned int buf_len); + +struct nvdimm_bus_descriptor { + unsigned long dsm_mask; + char *provider_name; + ndctl_fn ndctl; +}; + +struct device; +struct nvdimm_bus; +struct nvdimm_bus *nvdimm_bus_register(struct device *parent, + struct nvdimm_bus_descriptor *nfit_desc); +void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); +#endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3 From 45def22c1fab85764646746ce38d45b2f3281fa5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 26 Apr 2015 19:26:48 -0400 Subject: libnvdimm: control character device and nvdimm_bus sysfs attributes The control device for a nvdimm_bus is registered as an "nd" class device. The expectation is that there will usually only be one "nd" bus registered under /sys/class/nd. However, we allow for the possibility of multiple buses and they will listed in discovery order as ndctl0...ndctlN. This character device hosts the ioctl for passing control messages. The initial command set has a 1:1 correlation with the commands listed in the by the "NFIT DSM Example" document [1], but this scheme is extensible to future command sets. Note, nd_ioctl() and the backing ->ndctl() implementation are defined in a subsequent patch. This is simply the initial registrations and sysfs attributes. [1]: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf Cc: Neil Brown Cc: Greg KH Cc: Cc: Robert Moore Cc: Rafael J. Wysocki Acked-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Tested-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 2b3c63950c91..d375cdc4abd5 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -14,6 +14,8 @@ */ #ifndef __LIBNVDIMM_H__ #define __LIBNVDIMM_H__ +extern struct attribute_group nvdimm_bus_attribute_group; + struct nvdimm; struct nvdimm_bus_descriptor; typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, @@ -21,14 +23,16 @@ typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, unsigned int buf_len); struct nvdimm_bus_descriptor { + const struct attribute_group **attr_groups; unsigned long dsm_mask; char *provider_name; ndctl_fn ndctl; }; struct device; -struct nvdimm_bus; struct nvdimm_bus *nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc); void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); +struct nvdimm_bus *to_nvdimm_bus(struct device *dev); +struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); #endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3 From e6dfb2de47768efe8cc37c9a1863d2aff81440fb Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 25 Apr 2015 03:56:17 -0400 Subject: libnvdimm, nfit: dimm/memory-devices Enable nvdimm devices to be registered on a nvdimm_bus. The kernel assigned device id for nvdimm devicesis dynamic. If userspace needs a more static identifier it should consult a provider-specific attribute. In the case where NFIT is the provider, the 'nmemX/nfit/handle' or 'nmemX/nfit/serial' attributes may be used for this purpose. Cc: Neil Brown Cc: Cc: Greg KH Cc: Robert Moore Cc: Rafael J. Wysocki Acked-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Tested-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index d375cdc4abd5..07787f0dd7de 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -14,6 +14,12 @@ */ #ifndef __LIBNVDIMM_H__ #define __LIBNVDIMM_H__ + +enum { + /* when a dimm supports both PMEM and BLK access a label is required */ + NDD_ALIASING = 1 << 0, +}; + extern struct attribute_group nvdimm_bus_attribute_group; struct nvdimm; @@ -34,5 +40,10 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc); void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); struct nvdimm_bus *to_nvdimm_bus(struct device *dev); +struct nvdimm *to_nvdimm(struct device *dev); struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); +const char *nvdimm_name(struct nvdimm *nvdimm); +void *nvdimm_provider_data(struct nvdimm *nvdimm); +struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, + const struct attribute_group **groups, unsigned long flags); #endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3 From 62232e45f4a265abb43f0acf16e58f5d0b6e1ec9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 8 Jun 2015 14:27:06 -0400 Subject: libnvdimm: control (ioctl) messages for nvdimm_bus and nvdimm devices Most discovery/configuration of the nvdimm-subsystem is done via sysfs attributes. However, some nvdimm_bus instances, particularly the ACPI.NFIT bus, define a small set of messages that can be passed to the platform. For convenience we derive the initial libnvdimm-ioctl command formats directly from the NFIT DSM Interface Example formats. ND_CMD_SMART: media health and diagnostics ND_CMD_GET_CONFIG_SIZE: size of the label space ND_CMD_GET_CONFIG_DATA: read label space ND_CMD_SET_CONFIG_DATA: write label space ND_CMD_VENDOR: vendor-specific command passthrough ND_CMD_ARS_CAP: report address-range-scrubbing capabilities ND_CMD_ARS_START: initiate scrubbing ND_CMD_ARS_STATUS: report on scrubbing state ND_CMD_SMART_THRESHOLD: configure alarm thresholds for smart events If a platform later defines different commands than this set it is straightforward to extend support to those formats. Most of the commands target a specific dimm. However, the address-range-scrubbing commands target the bus. The 'commands' attribute in sysfs of an nvdimm_bus, or nvdimm, enumerate the supported commands for that object. Cc: Cc: Robert Moore Cc: Rafael J. Wysocki Reported-by: Nicholas Moulin Acked-by: Christoph Hellwig Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 07787f0dd7de..a39235819af3 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -14,13 +14,22 @@ */ #ifndef __LIBNVDIMM_H__ #define __LIBNVDIMM_H__ +#include +#include enum { /* when a dimm supports both PMEM and BLK access a label is required */ NDD_ALIASING = 1 << 0, + + /* need to set a limit somewhere, but yes, this is likely overkill */ + ND_IOCTL_MAX_BUFLEN = SZ_4M, + ND_CMD_MAX_ELEM = 4, + ND_CMD_MAX_ENVELOPE = 16, + ND_CMD_ARS_STATUS_MAX = SZ_4K, }; extern struct attribute_group nvdimm_bus_attribute_group; +extern struct attribute_group nvdimm_attribute_group; struct nvdimm; struct nvdimm_bus_descriptor; @@ -35,6 +44,14 @@ struct nvdimm_bus_descriptor { ndctl_fn ndctl; }; +struct nd_cmd_desc { + int in_num; + int out_num; + u32 in_sizes[ND_CMD_MAX_ELEM]; + int out_sizes[ND_CMD_MAX_ELEM]; +}; + +struct nvdimm_bus; struct device; struct nvdimm_bus *nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc); @@ -45,5 +62,13 @@ struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); const char *nvdimm_name(struct nvdimm *nvdimm); void *nvdimm_provider_data(struct nvdimm *nvdimm); struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, - const struct attribute_group **groups, unsigned long flags); + const struct attribute_group **groups, unsigned long flags, + unsigned long *dsm_mask); +const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd); +const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd); +u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd, + const struct nd_cmd_desc *desc, int idx, void *buf); +u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, + const struct nd_cmd_desc *desc, int idx, const u32 *in_field, + const u32 *out_field); #endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3 From 4d88a97aa9e8cfa6460aab119c5da60ad2267423 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 31 May 2015 14:41:48 -0400 Subject: libnvdimm, nvdimm: dimm driver and base libnvdimm device-driver infrastructure * Implement the device-model infrastructure for loading modules and attaching drivers to nvdimm devices. This is a simple association of a nd-device-type number with a driver that has a bitmask of supported device types. To facilitate userspace bind/unbind operations 'modalias' and 'devtype', that also appear in the uevent, are added as generic sysfs attributes for all nvdimm devices. The reason for the device-type number is to support sub-types within a given parent devtype, be it a vendor-specific sub-type or otherwise. * The first consumer of this infrastructure is the driver for dimm devices. It simply uses control messages to retrieve and store the configuration-data image (label set) from each dimm. Note: nd_device_register() arranges for asynchronous registration of nvdimm bus devices by default. Cc: Greg KH Cc: Neil Brown Acked-by: Christoph Hellwig Tested-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 2 ++ include/linux/nd.h | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 include/linux/nd.h (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index a39235819af3..d3ebccf4ea8b 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -30,6 +30,7 @@ enum { extern struct attribute_group nvdimm_bus_attribute_group; extern struct attribute_group nvdimm_attribute_group; +extern struct attribute_group nd_device_attribute_group; struct nvdimm; struct nvdimm_bus_descriptor; @@ -71,4 +72,5 @@ u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd, u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, const struct nd_cmd_desc *desc, int idx, const u32 *in_field, const u32 *out_field); +int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count); #endif /* __LIBNVDIMM_H__ */ diff --git a/include/linux/nd.h b/include/linux/nd.h new file mode 100644 index 000000000000..e074f67e53a3 --- /dev/null +++ b/include/linux/nd.h @@ -0,0 +1,39 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __LINUX_ND_H__ +#define __LINUX_ND_H__ +#include +#include + +struct nd_device_driver { + struct device_driver drv; + unsigned long type; + int (*probe)(struct device *dev); + int (*remove)(struct device *dev); +}; + +static inline struct nd_device_driver *to_nd_device_driver( + struct device_driver *drv) +{ + return container_of(drv, struct nd_device_driver, drv); +} + +#define MODULE_ALIAS_ND_DEVICE(type) \ + MODULE_ALIAS("nd:t" __stringify(type) "*") +#define ND_DEVICE_MODALIAS_FMT "nd:t%d" + +int __must_check __nd_driver_register(struct nd_device_driver *nd_drv, + struct module *module, const char *mod_name); +#define nd_driver_register(driver) \ + __nd_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) +#endif /* __LINUX_ND_H__ */ -- cgit v1.2.3 From 1f7df6f88b9245a7f2d0f8ecbc97dc88c8d0d8e1 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 9 Jun 2015 20:13:14 -0400 Subject: libnvdimm, nfit: regions (block-data-window, persistent memory, volatile memory) A "region" device represents the maximum capacity of a BLK range (mmio block-data-window(s)), or a PMEM range (DAX-capable persistent memory or volatile memory), without regard for aliasing. Aliasing, in the dimm-local address space (DPA), is resolved by metadata on a dimm to designate which exclusive interface will access the aliased DPA ranges. Support for the per-dimm metadata/label arrvies is in a subsequent patch. The name format of "region" devices is "regionN" where, like dimms, N is a global ida index assigned at discovery time. This id is not reliable across reboots nor in the presence of hotplug. Look to attributes of the region or static id-data of the sub-namespace to generate a persistent name. However, if the platform configuration does not change it is reasonable to expect the same region id to be assigned at the next boot. "region"s have 2 generic attributes "size", and "mapping"s where: - size: the BLK accessible capacity or the span of the system physical address range in the case of PMEM. - mappingN: a tuple describing a dimm's contribution to the region's capacity in the format (,,). For a PMEM-region there will be at least one mapping per dimm in the interleave set. For a BLK-region there is only "mapping0" listing the starting DPA of the BLK-region and the available DPA capacity of that space (matches "size" above). The max number of mappings per "region" is hard coded per the constraints of sysfs attribute groups. That said the number of mappings per region should never exceed the maximum number of possible dimms in the system. If the current number turns out to not be enough then the "mappings" attribute clarifies how many there are supposed to be. "32 should be enough for anybody...". Cc: Neil Brown Cc: Cc: Greg KH Cc: Robert Moore Cc: Rafael J. Wysocki Acked-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Tested-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index d3ebccf4ea8b..39e7e606092a 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -26,11 +26,14 @@ enum { ND_CMD_MAX_ELEM = 4, ND_CMD_MAX_ENVELOPE = 16, ND_CMD_ARS_STATUS_MAX = SZ_4K, + ND_MAX_MAPPINGS = 32, }; extern struct attribute_group nvdimm_bus_attribute_group; extern struct attribute_group nvdimm_attribute_group; extern struct attribute_group nd_device_attribute_group; +extern struct attribute_group nd_region_attribute_group; +extern struct attribute_group nd_mapping_attribute_group; struct nvdimm; struct nvdimm_bus_descriptor; @@ -38,6 +41,12 @@ typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf, unsigned int buf_len); +struct nd_mapping { + struct nvdimm *nvdimm; + u64 start; + u64 size; +}; + struct nvdimm_bus_descriptor { const struct attribute_group **attr_groups; unsigned long dsm_mask; @@ -52,6 +61,14 @@ struct nd_cmd_desc { int out_sizes[ND_CMD_MAX_ELEM]; }; +struct nd_region_desc { + struct resource *res; + struct nd_mapping *nd_mapping; + u16 num_mappings; + const struct attribute_group **attr_groups; + void *provider_data; +}; + struct nvdimm_bus; struct device; struct nvdimm_bus *nvdimm_bus_register(struct device *parent, @@ -59,9 +76,11 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent, void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); struct nvdimm_bus *to_nvdimm_bus(struct device *dev); struct nvdimm *to_nvdimm(struct device *dev); +struct nd_region *to_nd_region(struct device *dev); struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); const char *nvdimm_name(struct nvdimm *nvdimm); void *nvdimm_provider_data(struct nvdimm *nvdimm); +void *nd_region_provider_data(struct nd_region *nd_region); struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, const struct attribute_group **groups, unsigned long flags, unsigned long *dsm_mask); @@ -73,4 +92,10 @@ u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, const struct nd_cmd_desc *desc, int idx, const u32 *in_field, const u32 *out_field); int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count); +struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc); +struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc); +struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc); #endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3 From 3d88002e4a7bd40f355550284c6cd140e6fe29dc Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 31 May 2015 15:02:11 -0400 Subject: libnvdimm: support for legacy (non-aliasing) nvdimms The libnvdimm region driver is an intermediary driver that translates non-volatile "region"s into "namespace" sub-devices that are surfaced by persistent memory block-device drivers (PMEM and BLK). ACPI 6 introduces the concept that a given nvdimm may simultaneously offer multiple access modes to its media through direct PMEM load/store access, or windowed BLK mode. Existing nvdimms mostly implement a PMEM interface, some offer a BLK-like mode, but never both as ACPI 6 defines. If an nvdimm is single interfaced, then there is no need for dimm metadata labels. For these devices we can take the region boundaries directly to create a child namespace device (nd_namespace_io). Acked-by: Christoph Hellwig Tested-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 7 +++++-- include/linux/nd.h | 10 ++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 39e7e606092a..37f966aff386 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -71,8 +71,11 @@ struct nd_region_desc { struct nvdimm_bus; struct device; -struct nvdimm_bus *nvdimm_bus_register(struct device *parent, - struct nvdimm_bus_descriptor *nfit_desc); +struct module; +struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, + struct nvdimm_bus_descriptor *nfit_desc, struct module *module); +#define nvdimm_bus_register(parent, desc) \ + __nvdimm_bus_register(parent, desc, THIS_MODULE) void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); struct nvdimm_bus *to_nvdimm_bus(struct device *dev); struct nvdimm *to_nvdimm(struct device *dev); diff --git a/include/linux/nd.h b/include/linux/nd.h index e074f67e53a3..da70e9962197 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -26,6 +26,16 @@ static inline struct nd_device_driver *to_nd_device_driver( struct device_driver *drv) { return container_of(drv, struct nd_device_driver, drv); +}; + +struct nd_namespace_io { + struct device dev; + struct resource res; +}; + +static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev) +{ + return container_of(dev, struct nd_namespace_io, dev); } #define MODULE_ALIAS_ND_DEVICE(type) \ -- cgit v1.2.3 From eaf961536e1622ad21247ac8d44acd48ba65566e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 1 May 2015 13:11:27 -0400 Subject: libnvdimm, nfit: add interleave-set state-tracking infrastructure On platforms that have firmware support for reading/writing per-dimm label space, a portion of the dimm may be accessible via an interleave set PMEM mapping in addition to the dimm's BLK (block-data-window aperture(s)) interface. A label, stored in a "configuration data region" on the dimm, disambiguates which dimm addresses are accessed through which exclusive interface. Add infrastructure that allows the kernel to block modifications to a label in the set while any member dimm is active. Note that this is meant only for enforcing "no modifications of active labels" via the coarse ioctl command. Adding/deleting namespaces from an active interleave set is always possible via sysfs. Another aspect of tracking interleave sets is tracking their integrity when DIMMs in a set are physically re-ordered. For this purpose we generate an "interleave-set cookie" that can be recorded in a label and validated against the current configuration. It is the bus provider implementation's responsibility to calculate the interleave set cookie and attach it to a given region. Cc: Neil Brown Cc: Cc: Greg KH Cc: Robert Moore Cc: Rafael J. Wysocki Acked-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 37f966aff386..1b627b109360 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -61,11 +61,16 @@ struct nd_cmd_desc { int out_sizes[ND_CMD_MAX_ELEM]; }; +struct nd_interleave_set { + u64 cookie; +}; + struct nd_region_desc { struct resource *res; struct nd_mapping *nd_mapping; u16 num_mappings; const struct attribute_group **attr_groups; + struct nd_interleave_set *nd_set; void *provider_data; }; @@ -101,4 +106,5 @@ struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc); struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc); +u64 nd_fletcher64(void *addr, size_t len, bool le); #endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3 From bf9bccc14c05dae8caba29df6187c731710f5380 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 17 Jun 2015 17:14:46 -0400 Subject: libnvdimm: pmem label sets and namespace instantiation. A complete label set is a PMEM-label per-dimm per-interleave-set where all the UUIDs match and the interleave set cookie matches the hosting interleave set. Present sysfs attributes for manipulation of a PMEM-namespace's 'alt_name', 'uuid', and 'size' attributes. A later patch will make these settings persistent by writing back the label. Note that PMEM allocations grow forwards from the start of an interleave set (lowest dimm-physical-address (DPA)). BLK-namespaces that alias with a PMEM interleave set will grow allocations backward from the highest DPA. Cc: Greg KH Cc: Neil Brown Acked-by: Christoph Hellwig Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 10 ++++++++++ include/linux/nd.h | 24 ++++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 1b627b109360..c130972e08c4 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -41,10 +41,20 @@ typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf, unsigned int buf_len); +struct nd_namespace_label; +struct nvdimm_drvdata; struct nd_mapping { struct nvdimm *nvdimm; + struct nd_namespace_label **labels; u64 start; u64 size; + /* + * @ndd is for private use at region enable / disable time for + * get_ndd() + put_ndd(), all other nd_mapping to ndd + * conversions use to_ndd() which respects enabled state of the + * nvdimm. + */ + struct nvdimm_drvdata *ndd; }; struct nvdimm_bus_descriptor { diff --git a/include/linux/nd.h b/include/linux/nd.h index da70e9962197..255c38a83083 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -28,16 +28,40 @@ static inline struct nd_device_driver *to_nd_device_driver( return container_of(drv, struct nd_device_driver, drv); }; +/** + * struct nd_namespace_io - infrastructure for loading an nd_pmem instance + * @dev: namespace device created by the nd region driver + * @res: struct resource conversion of a NFIT SPA table + */ struct nd_namespace_io { struct device dev; struct resource res; }; +/** + * struct nd_namespace_pmem - namespace device for dimm-backed interleaved memory + * @nsio: device and system physical address range to drive + * @alt_name: namespace name supplied in the dimm label + * @uuid: namespace name supplied in the dimm label + */ +struct nd_namespace_pmem { + struct nd_namespace_io nsio; + char *alt_name; + u8 *uuid; +}; + static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev) { return container_of(dev, struct nd_namespace_io, dev); } +static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev) +{ + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + + return container_of(nsio, struct nd_namespace_pmem, nsio); +} + #define MODULE_ALIAS_ND_DEVICE(type) \ MODULE_ALIAS("nd:t" __stringify(type) "*") #define ND_DEVICE_MODALIAS_FMT "nd:t%d" -- cgit v1.2.3 From 1b40e09a1232de537b193fa1b6b3ef16d3a1e397 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 1 May 2015 13:34:01 -0400 Subject: libnvdimm: blk labels and namespace instantiation A blk label set describes a namespace comprised of one or more discontiguous dpa ranges on a single dimm. They may alias with one or more pmem interleave sets that include the given dimm. This is the runtime/volatile configuration infrastructure for sysfs manipulation of 'alt_name', 'uuid', 'size', and 'sector_size'. A later patch will make these settings persistent by writing back the label(s). Unlike pmem namespaces, multiple blk namespaces can be created per region. Once a blk namespace has been created a new seed device (unconfigured child of a parent blk region) is instantiated. As long as a region has 'available_size' != 0 new child namespaces may be created. Cc: Greg KH Cc: Neil Brown Acked-by: Christoph Hellwig Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 3 +++ include/linux/nd.h | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index c130972e08c4..a59dca17b3aa 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -27,6 +27,9 @@ enum { ND_CMD_MAX_ENVELOPE = 16, ND_CMD_ARS_STATUS_MAX = SZ_4K, ND_MAX_MAPPINGS = 32, + + /* mark newly adjusted resources as requiring a label update */ + DPA_RESOURCE_ADJUSTED = 1 << 0, }; extern struct attribute_group nvdimm_bus_attribute_group; diff --git a/include/linux/nd.h b/include/linux/nd.h index 255c38a83083..23276ea91690 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -50,6 +50,26 @@ struct nd_namespace_pmem { u8 *uuid; }; +/** + * struct nd_namespace_blk - namespace for dimm-bounded persistent memory + * @dev: namespace device creation by the nd region driver + * @alt_name: namespace name supplied in the dimm label + * @uuid: namespace name supplied in the dimm label + * @id: ida allocated id + * @lbasize: blk namespaces have a native sector size when btt not present + * @num_resources: number of dpa extents to claim + * @res: discontiguous dpa extents for given dimm + */ +struct nd_namespace_blk { + struct device dev; + char *alt_name; + u8 *uuid; + int id; + unsigned long lbasize; + int num_resources; + struct resource **res; +}; + static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev) { return container_of(dev, struct nd_namespace_io, dev); @@ -62,6 +82,11 @@ static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev) return container_of(nsio, struct nd_namespace_pmem, nsio); } +static inline struct nd_namespace_blk *to_nd_namespace_blk(struct device *dev) +{ + return container_of(dev, struct nd_namespace_blk, dev); +} + #define MODULE_ALIAS_ND_DEVICE(type) \ MODULE_ALIAS("nd:t" __stringify(type) "*") #define ND_DEVICE_MODALIAS_FMT "nd:t%d" -- cgit v1.2.3 From d7f96f97c4031fa4ffdb7801f9aae23e96170a6f Mon Sep 17 00:00:00 2001 From: Ivan Khoronzhuk Date: Thu, 25 Jun 2015 09:06:56 +0200 Subject: firmware: dmi_scan: add SBMIOS entry and DMI tables Some utils, like dmidecode and smbios, need to access SMBIOS entry table area in order to get information like SMBIOS version, size, etc. Currently it's done via /dev/mem. But for situation when /dev/mem usage is disabled, the utils have to use dmi sysfs instead, which doesn't represent SMBIOS entry and adds code/delay redundancy when direct access for table is needed. So this patch creates dmi/tables and adds SMBIOS entry point to allow utils in question to work correctly without /dev/mem. Also patch adds raw dmi table to simplify dmi table processing in user space, as proposed by Jean Delvare. Tested-by: Roy Franz Signed-off-by: Ivan Khoronzhuk Signed-off-by: Jean Delvare --- include/linux/dmi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmi.h b/include/linux/dmi.h index f820f0a336c9..2f9f98827c0a 100644 --- a/include/linux/dmi.h +++ b/include/linux/dmi.h @@ -2,6 +2,7 @@ #define __DMI_H__ #include +#include #include /* enum dmi_field is in mod_devicetable.h */ @@ -93,6 +94,7 @@ struct dmi_dev_onboard { int devfn; }; +extern struct kobject *dmi_kobj; extern int dmi_check_system(const struct dmi_system_id *list); const struct dmi_system_id *dmi_first_match(const struct dmi_system_id *list); extern const char * dmi_get_system_info(int field); -- cgit v1.2.3 From 9ea650c804404e55dbf17c928203141282e759ab Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Thu, 25 Jun 2015 09:06:57 +0200 Subject: firmware: dmi: struct dmi_header should be packed Apparently the compiler does fine without it, but it feels safer and clearer to add the missing attribute. Signed-off-by: Jean Delvare --- include/linux/dmi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmi.h b/include/linux/dmi.h index 2f9f98827c0a..5055ac34142d 100644 --- a/include/linux/dmi.h +++ b/include/linux/dmi.h @@ -75,7 +75,7 @@ struct dmi_header { u8 type; u8 length; u16 handle; -}; +} __packed; struct dmi_device { struct list_head list; -- cgit v1.2.3 From 8c2f7e8658df1d3b7cbfa62706941d14c715823a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 25 Jun 2015 04:20:04 -0400 Subject: libnvdimm: infrastructure for btt devices NVDIMM namespaces, in addition to accepting "struct bio" based requests, also have the capability to perform byte-aligned accesses. By default only the bio/block interface is used. However, if another driver can make effective use of the byte-aligned capability it can claim namespace interface and use the byte-aligned ->rw_bytes() interface. The BTT driver is the initial first consumer of this mechanism to allow adding atomic sector update semantics to a pmem or blk namespace. This patch is the sysfs infrastructure to allow configuring a BTT instance for a namespace. Enabling that BTT and performing i/o is in a subsequent patch. Cc: Greg KH Cc: Neil Brown Signed-off-by: Dan Williams --- include/linux/nd.h | 63 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nd.h b/include/linux/nd.h index 23276ea91690..507e47c86737 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -12,6 +12,7 @@ */ #ifndef __LINUX_ND_H__ #define __LINUX_ND_H__ +#include #include #include @@ -28,13 +29,33 @@ static inline struct nd_device_driver *to_nd_device_driver( return container_of(drv, struct nd_device_driver, drv); }; +/** + * struct nd_namespace_common - core infrastructure of a namespace + * @force_raw: ignore other personalities for the namespace (e.g. btt) + * @dev: device model node + * @claim: when set a another personality has taken ownership of the namespace + * @rw_bytes: access the raw namespace capacity with byte-aligned transfers + */ +struct nd_namespace_common { + int force_raw; + struct device dev; + struct device *claim; + int (*rw_bytes)(struct nd_namespace_common *, resource_size_t offset, + void *buf, size_t size, int rw); +}; + +static inline struct nd_namespace_common *to_ndns(struct device *dev) +{ + return container_of(dev, struct nd_namespace_common, dev); +} + /** * struct nd_namespace_io - infrastructure for loading an nd_pmem instance * @dev: namespace device created by the nd region driver * @res: struct resource conversion of a NFIT SPA table */ struct nd_namespace_io { - struct device dev; + struct nd_namespace_common common; struct resource res; }; @@ -52,7 +73,6 @@ struct nd_namespace_pmem { /** * struct nd_namespace_blk - namespace for dimm-bounded persistent memory - * @dev: namespace device creation by the nd region driver * @alt_name: namespace name supplied in the dimm label * @uuid: namespace name supplied in the dimm label * @id: ida allocated id @@ -61,7 +81,7 @@ struct nd_namespace_pmem { * @res: discontiguous dpa extents for given dimm */ struct nd_namespace_blk { - struct device dev; + struct nd_namespace_common common; char *alt_name; u8 *uuid; int id; @@ -72,7 +92,7 @@ struct nd_namespace_blk { static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev) { - return container_of(dev, struct nd_namespace_io, dev); + return container_of(dev, struct nd_namespace_io, common.dev); } static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev) @@ -84,7 +104,40 @@ static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev) static inline struct nd_namespace_blk *to_nd_namespace_blk(struct device *dev) { - return container_of(dev, struct nd_namespace_blk, dev); + return container_of(dev, struct nd_namespace_blk, common.dev); +} + +/** + * nvdimm_read_bytes() - synchronously read bytes from an nvdimm namespace + * @ndns: device to read + * @offset: namespace-relative starting offset + * @buf: buffer to fill + * @size: transfer length + * + * @buf is up-to-date upon return from this routine. + */ +static inline int nvdimm_read_bytes(struct nd_namespace_common *ndns, + resource_size_t offset, void *buf, size_t size) +{ + return ndns->rw_bytes(ndns, offset, buf, size, READ); +} + +/** + * nvdimm_write_bytes() - synchronously write bytes to an nvdimm namespace + * @ndns: device to read + * @offset: namespace-relative starting offset + * @buf: buffer to drain + * @size: transfer length + * + * NVDIMM Namepaces disks do not implement sectors internally. Depending on + * the @ndns, the contents of @buf may be in cpu cache, platform buffers, + * or on backing memory media upon return from this routine. Flushing + * to media is handled internal to the @ndns driver, if at all. + */ +static inline int nvdimm_write_bytes(struct nd_namespace_common *ndns, + resource_size_t offset, void *buf, size_t size) +{ + return ndns->rw_bytes(ndns, offset, buf, size, WRITE); } #define MODULE_ALIAS_ND_DEVICE(type) \ -- cgit v1.2.3 From 144cba1493fdd6e3e1980e439a31df877831ebcd Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 27 Apr 2015 11:09:54 +0800 Subject: libceph: allow setting osd_req_op's flags Signed-off-by: Yan, Zheng Reviewed-by: Alex Elder --- include/linux/ceph/osd_client.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 61b19c46bdb3..7506b485bb6d 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -249,7 +249,7 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); extern void osd_req_op_init(struct ceph_osd_request *osd_req, - unsigned int which, u16 opcode); + unsigned int which, u16 opcode, u32 flags); extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *, unsigned int which, -- cgit v1.2.3 From d50c97b566c5bbf990eff472e9feaa58fdebdd33 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 15 May 2015 11:52:20 +0300 Subject: libceph: nuke time_sub() Unused since ceph got merged into mainline I guess. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- include/linux/ceph/libceph.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 30f92cefaa72..85ae9a889a3f 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -93,15 +93,6 @@ enum { CEPH_MOUNT_SHUTDOWN, }; -/* - * subtract jiffies - */ -static inline unsigned long time_sub(unsigned long a, unsigned long b) -{ - BUG_ON(time_after(b, a)); - return (long)a - (long)b; -} - struct ceph_mds_client; /* -- cgit v1.2.3 From a319bf56a617354e62cf5f774d2ca4e1a8a3bff3 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 15 May 2015 12:02:17 +0300 Subject: libceph: store timeouts in jiffies, verify user input There are currently three libceph-level timeouts that the user can specify on mount: mount_timeout, osd_idle_ttl and osdkeepalive. All of these are in seconds and no checking is done on user input: negative values are accepted, we multiply them all by HZ which may or may not overflow, arbitrarily large jiffies then get added together, etc. There is also a bug in the way mount_timeout=0 is handled. It's supposed to mean "infinite timeout", but that's not how wait.h APIs treat it and so __ceph_open_session() for example will busy loop without much chance of being interrupted if none of ceph-mons are there. Fix all this by verifying user input, storing timeouts capped by msecs_to_jiffies() in jiffies and using the new ceph_timeout_jiffies() helper for all user-specified waits to handle infinite timeouts correctly. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- include/linux/ceph/libceph.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 85ae9a889a3f..d73a569f9bf5 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -43,9 +43,9 @@ struct ceph_options { int flags; struct ceph_fsid fsid; struct ceph_entity_addr my_addr; - int mount_timeout; - int osd_idle_ttl; - int osd_keepalive_timeout; + unsigned long mount_timeout; /* jiffies */ + unsigned long osd_idle_ttl; /* jiffies */ + unsigned long osd_keepalive_timeout; /* jiffies */ /* * any type that can't be simply compared or doesn't need need @@ -63,9 +63,9 @@ struct ceph_options { /* * defaults */ -#define CEPH_MOUNT_TIMEOUT_DEFAULT 60 -#define CEPH_OSD_KEEPALIVE_DEFAULT 5 -#define CEPH_OSD_IDLE_TTL_DEFAULT 60 +#define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000) +#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000) +#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000) #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) #define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) @@ -93,6 +93,11 @@ enum { CEPH_MOUNT_SHUTDOWN, }; +static inline unsigned long ceph_timeout_jiffies(unsigned long timeout) +{ + return timeout ?: MAX_SCHEDULE_TIMEOUT; +} + struct ceph_mds_client; /* -- cgit v1.2.3 From f66fd9f0952187d274c13c136b74548f792c1925 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 10 Jun 2015 17:26:13 +0800 Subject: ceph: pre-allocate data structure that tracks caps flushing Signed-off-by: Yan, Zheng --- include/linux/ceph/libceph.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index d73a569f9bf5..9ebee53d3bf5 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -174,6 +174,7 @@ static inline int calc_pages_for(u64 off, u64 len) extern struct kmem_cache *ceph_inode_cachep; extern struct kmem_cache *ceph_cap_cachep; +extern struct kmem_cache *ceph_cap_flush_cachep; extern struct kmem_cache *ceph_dentry_cachep; extern struct kmem_cache *ceph_file_cachep; -- cgit v1.2.3 From b459be739f97e2062b2ba77cfe8ea198dbd58904 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 12 Jun 2015 13:21:07 +0300 Subject: crush: sync up with userspace .. up to ceph.git commit 1db1abc8328d ("crush: eliminate ad hoc diff between kernel and userspace"). This fixes a bunch of recently pulled coding style issues and makes includes a bit cleaner. A patch "crush:Make the function crush_ln static" from Nicholas Krause is folded in as crush_ln() has been made static in userspace as well. Signed-off-by: Ilya Dryomov --- include/linux/crush/crush.h | 40 ++++++++++++++++++++++++++++++++++++++-- include/linux/crush/hash.h | 6 ++++++ include/linux/crush/mapper.h | 2 +- 3 files changed, 45 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 48a1a7d100f1..48b49305716b 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -1,7 +1,11 @@ #ifndef CEPH_CRUSH_CRUSH_H #define CEPH_CRUSH_CRUSH_H -#include +#ifdef __KERNEL__ +# include +#else +# include "crush_compat.h" +#endif /* * CRUSH is a pseudo-random data distribution algorithm that @@ -20,7 +24,11 @@ #define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */ #define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */ +#define CRUSH_MAX_RULESET (1<<8) /* max crush ruleset number */ +#define CRUSH_MAX_RULES CRUSH_MAX_RULESET /* should be the same as max rulesets */ +#define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u) +#define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u) #define CRUSH_ITEM_UNDEF 0x7ffffffe /* undefined result (internal use only) */ #define CRUSH_ITEM_NONE 0x7fffffff /* no result */ @@ -108,6 +116,15 @@ enum { }; extern const char *crush_bucket_alg_name(int alg); +/* + * although tree was a legacy algorithm, it has been buggy, so + * exclude it. + */ +#define CRUSH_LEGACY_ALLOWED_BUCKET_ALGS ( \ + (1 << CRUSH_BUCKET_UNIFORM) | \ + (1 << CRUSH_BUCKET_LIST) | \ + (1 << CRUSH_BUCKET_STRAW)) + struct crush_bucket { __s32 id; /* this'll be negative */ __u16 type; /* non-zero; type=0 is reserved for devices */ @@ -174,7 +191,7 @@ struct crush_map { /* choose local attempts using a fallback permutation before * re-descent */ __u32 choose_local_fallback_tries; - /* choose attempts before giving up */ + /* choose attempts before giving up */ __u32 choose_total_tries; /* attempt chooseleaf inner descent once for firstn mode; on * reject retry outer descent. Note that this does *not* @@ -187,6 +204,25 @@ struct crush_map { * that want to limit reshuffling, a value of 3 or 4 will make the * mappings line up a bit better with previous mappings. */ __u8 chooseleaf_vary_r; + +#ifndef __KERNEL__ + /* + * version 0 (original) of straw_calc has various flaws. version 1 + * fixes a few of them. + */ + __u8 straw_calc_version; + + /* + * allowed bucket algs is a bitmask, here the bit positions + * are CRUSH_BUCKET_*. note that these are *bits* and + * CRUSH_BUCKET_* values are not, so we need to or together (1 + * << CRUSH_BUCKET_WHATEVER). The 0th bit is not used to + * minimize confusion (bucket type values start at 1). + */ + __u32 allowed_bucket_algs; + + __u32 *choose_tries; +#endif }; diff --git a/include/linux/crush/hash.h b/include/linux/crush/hash.h index 91e884230d5d..d1d90258242e 100644 --- a/include/linux/crush/hash.h +++ b/include/linux/crush/hash.h @@ -1,6 +1,12 @@ #ifndef CEPH_CRUSH_HASH_H #define CEPH_CRUSH_HASH_H +#ifdef __KERNEL__ +# include +#else +# include "crush_compat.h" +#endif + #define CRUSH_HASH_RJENKINS1 0 #define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1 diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h index eab367446eea..5dfd5b1125d2 100644 --- a/include/linux/crush/mapper.h +++ b/include/linux/crush/mapper.h @@ -8,7 +8,7 @@ * LGPL2 */ -#include +#include "crush.h" extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size); extern int crush_do_rule(const struct crush_map *map, -- cgit v1.2.3 From daf7ab7c58ac5f6304b63cca47a06cdc213361d7 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 4 Jun 2015 12:13:11 +0800 Subject: genirq: Clean up outdated comments related to include/linux/irqdesc.h Seems we have little chance to move irqdesc.h from include/linux/ into kernel/irq/, so remove the outdated comments. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Jason Cooper Cc: Kevin Cernekee Cc: Arnd Bergmann Cc: Marc Zyngier Link: http://lkml.kernel.org/r/1433391238-19471-2-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 1 - include/linux/irqdesc.h | 3 --- 2 files changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 812149160d3b..92188b0225bb 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -407,7 +407,6 @@ enum { IRQCHIP_EOI_THREADED = (1 << 6), }; -/* This include will go away once we isolated irq_desc usage to core code */ #include /* diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index c52d1480f272..66c41b40ca50 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -3,9 +3,6 @@ /* * Core internal functions to deal with irq descriptors - * - * This include will move to kernel/irq once we cleaned up the tree. - * For now it's included from */ struct irq_affinity_notify; -- cgit v1.2.3 From 08dad486d98e55d49b760cbaa20b71f35566d4e6 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 4 Jun 2015 12:13:12 +0800 Subject: genirq: Remove irq_node() Macro irq_node() has no user. Remove it. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1433391238-19471-3-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- include/linux/irqnr.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h index fdd5cc16c9c4..9669bf9d4f48 100644 --- a/include/linux/irqnr.h +++ b/include/linux/irqnr.h @@ -23,12 +23,6 @@ unsigned int irq_get_next_irq(unsigned int offset); ; \ else -#ifdef CONFIG_SMP -#define irq_node(irq) (irq_get_irq_data(irq)->node) -#else -#define irq_node(irq) 0 -#endif - # define for_each_active_irq(irq) \ for (irq = irq_get_next_irq(0); irq < nr_irqs; \ irq = irq_get_next_irq(irq + 1)) -- cgit v1.2.3 From 7c494375b773497228502133879072a9e959c063 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 1 Jun 2015 10:35:16 -0700 Subject: Input: improve parsing OF parameters for touchscreens When applying touchscreen parameters specified in device tree let's make sure we keep whatever setup was done by the driver and not reset the missing values to zero. Reported-by: Pavel Machek Tested-by: Pavel Machek Signed-off-by: Dmitry Torokhov --- include/linux/input/touchscreen.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/input/touchscreen.h b/include/linux/input/touchscreen.h index 08a5ef6e8f25..eecc9ea6cd58 100644 --- a/include/linux/input/touchscreen.h +++ b/include/linux/input/touchscreen.h @@ -12,9 +12,10 @@ #include #ifdef CONFIG_OF -void touchscreen_parse_of_params(struct input_dev *dev); +void touchscreen_parse_of_params(struct input_dev *dev, bool multitouch); #else -static inline void touchscreen_parse_of_params(struct input_dev *dev) +static inline void touchscreen_parse_of_params(struct input_dev *dev, + bool multitouch) { } #endif -- cgit v1.2.3 From 479305fd7172503772575997eb6f1b0a2bb4a107 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Thu, 25 Jun 2015 15:00:40 -0700 Subject: zpool: remove zpool_evict() Remove zpool_evict() helper function. As zbud is currently the only zpool implementation that supports eviction, add zpool and zpool_ops references to struct zbud_pool and directly call zpool_ops->evict(zpool, handle) on eviction. Currently zpool provides the zpool_evict helper which locks the zpool list lock and searches through all pools to find the specific one matching the caller, and call the corresponding zpool_ops->evict function. However, this is unnecessary, as the zbud pool can simply keep a reference to the zpool that created it, as well as the zpool_ops, and directly call the zpool_ops->evict function, when it needs to evict a page. This avoids a spinlock and list search in zpool for each eviction. Signed-off-by: Dan Streetman Cc: Seth Jennings Cc: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zpool.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 56529b34dc63..d30eff3d84d5 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -81,7 +81,8 @@ struct zpool_driver { atomic_t refcount; struct list_head list; - void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops); + void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops, + struct zpool *zpool); void (*destroy)(void *pool); int (*malloc)(void *pool, size_t size, gfp_t gfp, @@ -102,6 +103,4 @@ void zpool_register_driver(struct zpool_driver *driver); int zpool_unregister_driver(struct zpool_driver *driver); -int zpool_evict(void *pool, unsigned long handle); - #endif -- cgit v1.2.3 From f6d133f877c8bb0a0934dc8c521c758ee771e901 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 25 Jun 2015 15:01:00 -0700 Subject: compiler-gcc.h: neatening - Move the inline and noinline blocks together - Comment neatening - Alignment of __attribute__ uses - Consistent naming of __must_be_array macro argument - Multiline macro neatening Signed-off-by: Joe Perches Cc: Andi Kleen Cc: Michal Marek Cc: Segher Boessenkool Cc: Sasha Levin Cc: Anton Blanchard Cc: Alan Modra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 85 +++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 371e560d13cf..5c2c14e3c647 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -5,9 +5,9 @@ /* * Common definitions for all gcc versions go here. */ -#define GCC_VERSION (__GNUC__ * 10000 \ - + __GNUC_MINOR__ * 100 \ - + __GNUC_PATCHLEVEL__) +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) /* Optimization barrier */ @@ -46,55 +46,63 @@ * the inline assembly constraint from =g to =r, in this particular * case either is valid. */ -#define RELOC_HIDE(ptr, off) \ - ({ unsigned long __ptr; \ - __asm__ ("" : "=r"(__ptr) : "0"(ptr)); \ - (typeof(ptr)) (__ptr + (off)); }) +#define RELOC_HIDE(ptr, off) \ +({ \ + unsigned long __ptr; \ + __asm__ ("" : "=r"(__ptr) : "0"(ptr)); \ + (typeof(ptr)) (__ptr + (off)); \ +}) /* Make the optimizer believe the variable can be manipulated arbitrarily. */ -#define OPTIMIZER_HIDE_VAR(var) __asm__ ("" : "=r" (var) : "0" (var)) +#define OPTIMIZER_HIDE_VAR(var) \ + __asm__ ("" : "=r" (var) : "0" (var)) #ifdef __CHECKER__ -#define __must_be_array(arr) 0 +#define __must_be_array(a) 0 #else /* &a[0] degrades to a pointer: a different type from an array */ -#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) +#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) #endif /* * Force always-inline if the user requests it so via the .config, * or if gcc is too old: */ -#if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ +#if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ !defined(CONFIG_OPTIMIZE_INLINING) || (__GNUC__ < 4) -# define inline inline __attribute__((always_inline)) notrace -# define __inline__ __inline__ __attribute__((always_inline)) notrace -# define __inline __inline __attribute__((always_inline)) notrace +#define inline inline __attribute__((always_inline)) notrace +#define __inline__ __inline__ __attribute__((always_inline)) notrace +#define __inline __inline __attribute__((always_inline)) notrace #else /* A lot of inline functions can cause havoc with function tracing */ -# define inline inline notrace -# define __inline__ __inline__ notrace -# define __inline __inline notrace +#define inline inline notrace +#define __inline__ __inline__ notrace +#define __inline __inline notrace #endif -#define __deprecated __attribute__((deprecated)) -#define __packed __attribute__((packed)) -#define __weak __attribute__((weak)) -#define __alias(symbol) __attribute__((alias(#symbol))) +#define __always_inline inline __attribute__((always_inline)) +#define noinline __attribute__((noinline)) + +#define __deprecated __attribute__((deprecated)) +#define __packed __attribute__((packed)) +#define __weak __attribute__((weak)) +#define __alias(symbol) __attribute__((alias(#symbol))) /* - * it doesn't make sense on ARM (currently the only user of __naked) to trace - * naked functions because then mcount is called without stack and frame pointer - * being set up and there is no chance to restore the lr register to the value - * before mcount was called. + * it doesn't make sense on ARM (currently the only user of __naked) + * to trace naked functions because then mcount is called without + * stack and frame pointer being set up and there is no chance to + * restore the lr register to the value before mcount was called. + * + * The asm() bodies of naked functions often depend on standard calling + * conventions, therefore they must be noinline and noclone. * - * The asm() bodies of naked functions often depend on standard calling conventions, - * therefore they must be noinline and noclone. GCC 4.[56] currently fail to enforce - * this, so we must do so ourselves. See GCC PR44290. + * GCC 4.[56] currently fail to enforce this, so we must do so ourselves. + * See GCC PR44290. */ -#define __naked __attribute__((naked)) noinline __noclone notrace +#define __naked __attribute__((naked)) noinline __noclone notrace -#define __noreturn __attribute__((noreturn)) +#define __noreturn __attribute__((noreturn)) /* * From the GCC manual: @@ -106,14 +114,13 @@ * would be. * [...] */ -#define __pure __attribute__((pure)) -#define __aligned(x) __attribute__((aligned(x))) -#define __printf(a, b) __attribute__((format(printf, a, b))) -#define __scanf(a, b) __attribute__((format(scanf, a, b))) -#define noinline __attribute__((noinline)) -#define __attribute_const__ __attribute__((__const__)) -#define __maybe_unused __attribute__((unused)) -#define __always_unused __attribute__((unused)) +#define __pure __attribute__((pure)) +#define __aligned(x) __attribute__((aligned(x))) +#define __printf(a, b) __attribute__((format(printf, a, b))) +#define __scanf(a, b) __attribute__((format(scanf, a, b))) +#define __attribute_const__ __attribute__((__const__)) +#define __maybe_unused __attribute__((unused)) +#define __always_unused __attribute__((unused)) #define __gcc_header(x) #x #define _gcc_header(x) __gcc_header(linux/compiler-gcc##x.h) @@ -129,5 +136,3 @@ * code */ #define uninitialized_var(x) x = x - -#define __always_inline inline __attribute__((always_inline)) -- cgit v1.2.3 From cb984d101b30eb7478d32df56a0023e4603cba7f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 25 Jun 2015 15:01:02 -0700 Subject: compiler-gcc: integrate the various compiler-gcc[345].h files As gcc major version numbers are going to advance rather rapidly in the future, there's no real value in separate files for each compiler version. Deduplicate some of the macros #defined in each file too. Neaten comments using normal kernel commenting style. Signed-off-by: Joe Perches Cc: Andi Kleen Cc: Michal Marek Cc: Segher Boessenkool Cc: Sasha Levin Cc: Anton Blanchard Cc: Alan Modra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 120 ++++++++++++++++++++++++++++++++++++++++-- include/linux/compiler-gcc3.h | 23 -------- include/linux/compiler-gcc4.h | 91 -------------------------------- include/linux/compiler-gcc5.h | 67 ----------------------- 4 files changed, 116 insertions(+), 185 deletions(-) delete mode 100644 include/linux/compiler-gcc3.h delete mode 100644 include/linux/compiler-gcc4.h delete mode 100644 include/linux/compiler-gcc5.h (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 5c2c14e3c647..dfaa7b3e9ae9 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -122,10 +122,122 @@ #define __maybe_unused __attribute__((unused)) #define __always_unused __attribute__((unused)) -#define __gcc_header(x) #x -#define _gcc_header(x) __gcc_header(linux/compiler-gcc##x.h) -#define gcc_header(x) _gcc_header(x) -#include gcc_header(__GNUC__) +/* gcc version specific checks */ + +#if GCC_VERSION < 30200 +# error Sorry, your compiler is too old - please upgrade it. +#endif + +#if GCC_VERSION < 30300 +# define __used __attribute__((__unused__)) +#else +# define __used __attribute__((__used__)) +#endif + +#ifdef CONFIG_GCOV_KERNEL +# if GCC_VERSION < 30400 +# error "GCOV profiling support for gcc versions below 3.4 not included" +# endif /* __GNUC_MINOR__ */ +#endif /* CONFIG_GCOV_KERNEL */ + +#if GCC_VERSION >= 30400 +#define __must_check __attribute__((warn_unused_result)) +#endif + +#if GCC_VERSION >= 40000 + +/* GCC 4.1.[01] miscompiles __weak */ +#ifdef __KERNEL__ +# if GCC_VERSION >= 40100 && GCC_VERSION <= 40101 +# error Your version of gcc miscompiles the __weak directive +# endif +#endif + +#define __used __attribute__((__used__)) +#define __compiler_offsetof(a, b) \ + __builtin_offsetof(a, b) + +#if GCC_VERSION >= 40100 && GCC_VERSION < 40600 +# define __compiletime_object_size(obj) __builtin_object_size(obj, 0) +#endif + +#if GCC_VERSION >= 40300 +/* Mark functions as cold. gcc will assume any path leading to a call + * to them will be unlikely. This means a lot of manual unlikely()s + * are unnecessary now for any paths leading to the usual suspects + * like BUG(), printk(), panic() etc. [but let's keep them for now for + * older compilers] + * + * Early snapshots of gcc 4.3 don't support this and we can't detect this + * in the preprocessor, but we can live with this because they're unreleased. + * Maketime probing would be overkill here. + * + * gcc also has a __attribute__((__hot__)) to move hot functions into + * a special section, but I don't see any sense in this right now in + * the kernel context + */ +#define __cold __attribute__((__cold__)) + +#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) + +#ifndef __CHECKER__ +# define __compiletime_warning(message) __attribute__((warning(message))) +# define __compiletime_error(message) __attribute__((error(message))) +#endif /* __CHECKER__ */ +#endif /* GCC_VERSION >= 40300 */ + +#if GCC_VERSION >= 40500 +/* + * Mark a position in code as unreachable. This can be used to + * suppress control flow warnings after asm blocks that transfer + * control elsewhere. + * + * Early snapshots of gcc 4.5 don't support this and we can't detect + * this in the preprocessor, but we can live with this because they're + * unreleased. Really, we need to have autoconf for the kernel. + */ +#define unreachable() __builtin_unreachable() + +/* Mark a function definition as prohibited from being cloned. */ +#define __noclone __attribute__((__noclone__)) + +#endif /* GCC_VERSION >= 40500 */ + +#if GCC_VERSION >= 40600 +/* + * Tell the optimizer that something else uses this function or variable. + */ +#define __visible __attribute__((externally_visible)) +#endif + +/* + * GCC 'asm goto' miscompiles certain code sequences: + * + * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 + * + * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. + * + * (asm goto is automatically volatile - the naming reflects this.) + */ +#define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0) + +#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP +#if GCC_VERSION >= 40400 +#define __HAVE_BUILTIN_BSWAP32__ +#define __HAVE_BUILTIN_BSWAP64__ +#endif +#if GCC_VERSION >= 40800 || (defined(__powerpc__) && GCC_VERSION >= 40600) +#define __HAVE_BUILTIN_BSWAP16__ +#endif +#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ + +#if GCC_VERSION >= 50000 +#define KASAN_ABI_VERSION 4 +#elif GCC_VERSION >= 40902 +#define KASAN_ABI_VERSION 3 +#endif + +#endif /* gcc version >= 40000 specific checks */ #if !defined(__noclone) #define __noclone /* not needed */ diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h deleted file mode 100644 index 7d89febe4d79..000000000000 --- a/include/linux/compiler-gcc3.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef __LINUX_COMPILER_H -#error "Please don't include directly, include instead." -#endif - -#if GCC_VERSION < 30200 -# error Sorry, your compiler is too old - please upgrade it. -#endif - -#if GCC_VERSION >= 30300 -# define __used __attribute__((__used__)) -#else -# define __used __attribute__((__unused__)) -#endif - -#if GCC_VERSION >= 30400 -#define __must_check __attribute__((warn_unused_result)) -#endif - -#ifdef CONFIG_GCOV_KERNEL -# if GCC_VERSION < 30400 -# error "GCOV profiling support for gcc versions below 3.4 not included" -# endif /* __GNUC_MINOR__ */ -#endif /* CONFIG_GCOV_KERNEL */ diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h deleted file mode 100644 index 769e19864632..000000000000 --- a/include/linux/compiler-gcc4.h +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef __LINUX_COMPILER_H -#error "Please don't include directly, include instead." -#endif - -/* GCC 4.1.[01] miscompiles __weak */ -#ifdef __KERNEL__ -# if GCC_VERSION >= 40100 && GCC_VERSION <= 40101 -# error Your version of gcc miscompiles the __weak directive -# endif -#endif - -#define __used __attribute__((__used__)) -#define __must_check __attribute__((warn_unused_result)) -#define __compiler_offsetof(a,b) __builtin_offsetof(a,b) - -#if GCC_VERSION >= 40100 && GCC_VERSION < 40600 -# define __compiletime_object_size(obj) __builtin_object_size(obj, 0) -#endif - -#if GCC_VERSION >= 40300 -/* Mark functions as cold. gcc will assume any path leading to a call - to them will be unlikely. This means a lot of manual unlikely()s - are unnecessary now for any paths leading to the usual suspects - like BUG(), printk(), panic() etc. [but let's keep them for now for - older compilers] - - Early snapshots of gcc 4.3 don't support this and we can't detect this - in the preprocessor, but we can live with this because they're unreleased. - Maketime probing would be overkill here. - - gcc also has a __attribute__((__hot__)) to move hot functions into - a special section, but I don't see any sense in this right now in - the kernel context */ -#define __cold __attribute__((__cold__)) - -#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) - -#ifndef __CHECKER__ -# define __compiletime_warning(message) __attribute__((warning(message))) -# define __compiletime_error(message) __attribute__((error(message))) -#endif /* __CHECKER__ */ -#endif /* GCC_VERSION >= 40300 */ - -#if GCC_VERSION >= 40500 -/* - * Mark a position in code as unreachable. This can be used to - * suppress control flow warnings after asm blocks that transfer - * control elsewhere. - * - * Early snapshots of gcc 4.5 don't support this and we can't detect - * this in the preprocessor, but we can live with this because they're - * unreleased. Really, we need to have autoconf for the kernel. - */ -#define unreachable() __builtin_unreachable() - -/* Mark a function definition as prohibited from being cloned. */ -#define __noclone __attribute__((__noclone__)) - -#endif /* GCC_VERSION >= 40500 */ - -#if GCC_VERSION >= 40600 -/* - * Tell the optimizer that something else uses this function or variable. - */ -#define __visible __attribute__((externally_visible)) -#endif - -/* - * GCC 'asm goto' miscompiles certain code sequences: - * - * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 - * - * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. - * - * (asm goto is automatically volatile - the naming reflects this.) - */ -#define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0) - -#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP -#if GCC_VERSION >= 40400 -#define __HAVE_BUILTIN_BSWAP32__ -#define __HAVE_BUILTIN_BSWAP64__ -#endif -#if GCC_VERSION >= 40800 || (defined(__powerpc__) && GCC_VERSION >= 40600) -#define __HAVE_BUILTIN_BSWAP16__ -#endif -#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ - -#if GCC_VERSION >= 40902 -#define KASAN_ABI_VERSION 3 -#endif diff --git a/include/linux/compiler-gcc5.h b/include/linux/compiler-gcc5.h deleted file mode 100644 index efee493714eb..000000000000 --- a/include/linux/compiler-gcc5.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef __LINUX_COMPILER_H -#error "Please don't include directly, include instead." -#endif - -#define __used __attribute__((__used__)) -#define __must_check __attribute__((warn_unused_result)) -#define __compiler_offsetof(a, b) __builtin_offsetof(a, b) - -/* Mark functions as cold. gcc will assume any path leading to a call - to them will be unlikely. This means a lot of manual unlikely()s - are unnecessary now for any paths leading to the usual suspects - like BUG(), printk(), panic() etc. [but let's keep them for now for - older compilers] - - Early snapshots of gcc 4.3 don't support this and we can't detect this - in the preprocessor, but we can live with this because they're unreleased. - Maketime probing would be overkill here. - - gcc also has a __attribute__((__hot__)) to move hot functions into - a special section, but I don't see any sense in this right now in - the kernel context */ -#define __cold __attribute__((__cold__)) - -#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) - -#ifndef __CHECKER__ -# define __compiletime_warning(message) __attribute__((warning(message))) -# define __compiletime_error(message) __attribute__((error(message))) -#endif /* __CHECKER__ */ - -/* - * Mark a position in code as unreachable. This can be used to - * suppress control flow warnings after asm blocks that transfer - * control elsewhere. - * - * Early snapshots of gcc 4.5 don't support this and we can't detect - * this in the preprocessor, but we can live with this because they're - * unreleased. Really, we need to have autoconf for the kernel. - */ -#define unreachable() __builtin_unreachable() - -/* Mark a function definition as prohibited from being cloned. */ -#define __noclone __attribute__((__noclone__)) - -/* - * Tell the optimizer that something else uses this function or variable. - */ -#define __visible __attribute__((externally_visible)) - -/* - * GCC 'asm goto' miscompiles certain code sequences: - * - * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 - * - * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. - * - * (asm goto is automatically volatile - the naming reflects this.) - */ -#define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0) - -#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP -#define __HAVE_BUILTIN_BSWAP32__ -#define __HAVE_BUILTIN_BSWAP64__ -#define __HAVE_BUILTIN_BSWAP16__ -#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ - -#define KASAN_ABI_VERSION 4 -- cgit v1.2.3 From b86a50c3b5414eafdbee7f34af4a201a4a7817c2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 Jun 2015 15:01:05 -0700 Subject: compiler-intel: fix wrong compiler barrier() macro Cleanup commit 73679e508201 ("compiler-intel.h: Remove duplicate definition") removed the double definition of __memory_barrier() intrinsics. However, in doing so, it also removed the preceding #undef barrier by accident, meaning, the actual barrier() macro from compiler-gcc.h with inline asm is still in place as __GNUC__ is provided. Subsequently, barrier() can never be defined as __memory_barrier() from compiler.h since it already has a definition in place and if we trust the comment in compiler-intel.h, ecc doesn't support gcc specific asm statements. I don't have an ecc at hand (unsure if that's still used in the field?) and only found this by accident during code review, a revert of that cleanup would be simplest option. Fixes: 73679e508201 ("compiler-intel.h: Remove duplicate definition") Signed-off-by: Daniel Borkmann Reviewed-by: Pranith Kumar Cc: Pranith Kumar Cc: H. Peter Anvin Cc: mancha security Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-intel.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-intel.h b/include/linux/compiler-intel.h index 0c9a2f2c2802..d4c71132d07f 100644 --- a/include/linux/compiler-intel.h +++ b/include/linux/compiler-intel.h @@ -13,10 +13,12 @@ /* Intel ECC compiler doesn't support gcc specific asm stmts. * It uses intrinsics to do the equivalent things. */ +#undef barrier #undef barrier_data #undef RELOC_HIDE #undef OPTIMIZER_HIDE_VAR +#define barrier() __memory_barrier() #define barrier_data(ptr) barrier() #define RELOC_HIDE(ptr, off) \ -- cgit v1.2.3 From 8c7fbe5795a016259445a61e072eb0118aaf6a61 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 25 Jun 2015 15:01:16 -0700 Subject: stddef.h: move offsetofend inside #ifndef/#endif guard, neaten Commit 3876488444e7 ("include/stddef.h: Move offsetofend() from vfio.h to a generic kernel header") added offsetofend outside the normal include #ifndef/#endif guard. Move it inside. Miscellanea: o remove unnecessary blank line o standardize offsetof macros whitespace style Signed-off-by: Joe Perches Cc: Denys Vlasenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/stddef.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stddef.h b/include/linux/stddef.h index 076af437284d..9c61c7cda936 100644 --- a/include/linux/stddef.h +++ b/include/linux/stddef.h @@ -3,7 +3,6 @@ #include - #undef NULL #define NULL ((void *)0) @@ -14,10 +13,9 @@ enum { #undef offsetof #ifdef __compiler_offsetof -#define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER) +#define offsetof(TYPE, MEMBER) __compiler_offsetof(TYPE, MEMBER) #else -#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) -#endif +#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) #endif /** @@ -28,3 +26,5 @@ enum { */ #define offsetofend(TYPE, MEMBER) \ (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) + +#endif -- cgit v1.2.3 From 3033f14ab78c326871a4902591c2518410add24a Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Thu, 25 Jun 2015 15:01:19 -0700 Subject: clone: support passing tls argument via C rather than pt_regs magic clone has some of the quirkiest syscall handling in the kernel, with a pile of special cases, historical curiosities, and architecture-specific calling conventions. In particular, clone with CLONE_SETTLS accepts a parameter "tls" that the C entry point completely ignores and some assembly entry points overwrite; instead, the low-level arch-specific code pulls the tls parameter out of the arch-specific register captured as part of pt_regs on entry to the kernel. That's a massive hack, and it makes the arch-specific code only work when called via the specific existing syscall entry points; because of this hack, any new clone-like system call would have to accept an identical tls argument in exactly the same arch-specific position, rather than providing a unified system call entry point across architectures. The first patch allows architectures to handle the tls argument via normal C parameter passing, if they opt in by selecting HAVE_COPY_THREAD_TLS. The second patch makes 32-bit and 64-bit x86 opt into this. These two patches came out of the clone4 series, which isn't ready for this merge window, but these first two cleanup patches were entirely uncontroversial and have acks. I'd like to go ahead and submit these two so that other architectures can begin building on top of this and opting into HAVE_COPY_THREAD_TLS. However, I'm also happy to wait and send these through the next merge window (along with v3 of clone4) if anyone would prefer that. This patch (of 2): clone with CLONE_SETTLS accepts an argument to set the thread-local storage area for the new thread. sys_clone declares an int argument tls_val in the appropriate point in the argument list (based on the various CLONE_BACKWARDS variants), but doesn't actually use or pass along that argument. Instead, sys_clone calls do_fork, which calls copy_process, which calls the arch-specific copy_thread, and copy_thread pulls the corresponding syscall argument out of the pt_regs captured at kernel entry (knowing what argument of clone that architecture passes tls in). Apart from being awful and inscrutable, that also only works because only one code path into copy_thread can pass the CLONE_SETTLS flag, and that code path comes from sys_clone with its architecture-specific argument-passing order. This prevents introducing a new version of the clone system call without propagating the same architecture-specific position of the tls argument. However, there's no reason to pull the argument out of pt_regs when sys_clone could just pass it down via C function call arguments. Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt into, and a new copy_thread_tls that accepts the tls parameter as an additional unsigned long (syscall-argument-sized) argument. Change sys_clone's tls argument to an unsigned long (which does not change the ABI), and pass that down to copy_thread_tls. Architectures that don't opt into copy_thread_tls will continue to ignore the C argument to sys_clone in favor of the pt_regs captured at kernel entry, and thus will be unable to introduce new versions of the clone syscall. Patch co-authored by Josh Triplett and Thiago Macieira. Signed-off-by: Josh Triplett Acked-by: Andy Lutomirski Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thiago Macieira Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 15 +++++++++++++++ include/linux/syscalls.h | 6 +++--- 2 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6633e83e608a..93ed0b682adb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2556,8 +2556,22 @@ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct */ extern void mm_release(struct task_struct *, struct mm_struct *); +#ifdef CONFIG_HAVE_COPY_THREAD_TLS +extern int copy_thread_tls(unsigned long, unsigned long, unsigned long, + struct task_struct *, unsigned long); +#else extern int copy_thread(unsigned long, unsigned long, unsigned long, struct task_struct *); + +/* Architectures that haven't opted into copy_thread_tls get the tls argument + * via pt_regs, so ignore the tls argument passed via C. */ +static inline int copy_thread_tls( + unsigned long clone_flags, unsigned long sp, unsigned long arg, + struct task_struct *p, unsigned long tls) +{ + return copy_thread(clone_flags, sp, arg, p); +} +#endif extern void flush_thread(void); extern void exit_thread(void); @@ -2576,6 +2590,7 @@ extern int do_execveat(int, struct filename *, const char __user * const __user *, const char __user * const __user *, int); +extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long); extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 76d1e38aabe1..bb51becf23f8 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -827,15 +827,15 @@ asmlinkage long sys_syncfs(int fd); asmlinkage long sys_fork(void); asmlinkage long sys_vfork(void); #ifdef CONFIG_CLONE_BACKWARDS -asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int, +asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, unsigned long, int __user *); #else #ifdef CONFIG_CLONE_BACKWARDS3 asmlinkage long sys_clone(unsigned long, unsigned long, int, int __user *, - int __user *, int); + int __user *, unsigned long); #else asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, - int __user *, int); + int __user *, unsigned long); #endif #endif -- cgit v1.2.3 From d43ff430f434d862db59582c0f1f02382a678036 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 25 Jun 2015 15:01:24 -0700 Subject: printk: guard the amount written per line by devkmsg_read() This patchset updates netconsole so that it can emit messages with the same header as used in /dev/kmsg which gives neconsole receiver full log information which enables things like structured logging and detection of lost messages. This patch (of 7): devkmsg_read() uses 8k buffer and assumes that the formatted output message won't overrun which seems safe given LOG_LINE_MAX, the current use of dict and the escaping method being used; however, we're planning to use devkmsg formatting wider and accounting for the buffer size properly isn't that complicated. This patch defines CONSOLE_EXT_LOG_MAX as 8192 and updates devkmsg_read() so that it limits output accordingly. Signed-off-by: Tejun Heo Cc: David Miller Cc: Kay Sievers Reviewed-by: Petr Mladek Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 9b30871c9149..58b1fec40d37 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -30,6 +30,8 @@ static inline const char *printk_skip_level(const char *buffer) return buffer; } +#define CONSOLE_EXT_LOG_MAX 8192 + /* printk's without a loglevel use this.. */ #define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT -- cgit v1.2.3 From 6fe29354befe4c46eb308b662155d4d8017358e1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 25 Jun 2015 15:01:30 -0700 Subject: printk: implement support for extended console drivers printk log_buf keeps various metadata for each message including its sequence number and timestamp. The metadata is currently available only through /dev/kmsg and stripped out before passed onto console drivers. We want this metadata to be available to console drivers too so that console consumers can get full information including the metadata and dictionary, which among other things can be used to detect whether messages got lost in transit. This patch implements support for extended console drivers. Consoles can indicate that they want extended messages by setting the new CON_EXTENDED flag and they'll be fed messages formatted the same way as /dev/kmsg. ",,,;\n" If extended consoles exist, in-kernel fragment assembly is disabled. This ensures that all messages emitted to consoles have full metadata including sequence number. The contflag carries enough information to reassemble the fragments from the reader side trivially. Note that this only affects /dev/kmsg. Regular console and /proc/kmsg outputs are not affected by this change. * Extended message formatting for console drivers is enabled iff there are registered extended consoles. * Comment describing /dev/kmsg message format updated to add missing contflag field and help distinguishing variable from verbatim terms. Signed-off-by: Tejun Heo Cc: David Miller Cc: Kay Sievers Reviewed-by: Petr Mladek Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/console.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 9f50fb413c11..bd194343c346 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -115,6 +115,7 @@ static inline int con_debug_leave(void) #define CON_BOOT (8) #define CON_ANYTIME (16) /* Safe to call when cpu is offline */ #define CON_BRL (32) /* Used for a braille device */ +#define CON_EXTENDED (64) /* Use the extended output format a la /dev/kmsg */ struct console { char name[16]; -- cgit v1.2.3 From 3ea4331c60be3eee4c97e5ddabad95399f879b76 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 25 Jun 2015 15:01:47 -0700 Subject: check_syslog_permissions() cleanup Patch fixes drawbacks in heck_syslog_permissions() noticed by AKPM: "from_file handling makes me cry. That's not a boolean - it's an enumerated value with two values currently defined. But the code in check_syslog_permissions() treats it as a boolean and also hardwires the knowledge that SYSLOG_FROM_PROC == 1 (or == `true`). And the name is wrong: it should be called from_proc to match SYSLOG_FROM_PROC." Signed-off-by: Vasily Averin Cc: Kees Cook Cc: Josh Boyer Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syslog.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/syslog.h b/include/linux/syslog.h index 4b7b875a7ce1..c3a7f0cc3a27 100644 --- a/include/linux/syslog.h +++ b/include/linux/syslog.h @@ -47,12 +47,12 @@ #define SYSLOG_FROM_READER 0 #define SYSLOG_FROM_PROC 1 -int do_syslog(int type, char __user *buf, int count, bool from_file); +int do_syslog(int type, char __user *buf, int count, int source); #ifdef CONFIG_PRINTK -int check_syslog_permissions(int type, bool from_file); +int check_syslog_permissions(int type, int source); #else -static inline int check_syslog_permissions(int type, bool from_file) +static inline int check_syslog_permissions(int type, int source) { return 0; } -- cgit v1.2.3 From 94df290404cd0da8016698bf3f398410f29d9a64 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 25 Jun 2015 15:02:22 -0700 Subject: lib/string.c: introduce strreplace() Strings are sometimes sanitized by replacing a certain character (often '/') by another (often '!'). In a few places, this is done the same way Schlemiel the Painter would do it. Others are slightly smarter but still do multiple strchr() calls. Introduce strreplace() to do this using a single function call and a single pass over the string. One would expect the return value to be one of three things: void, s, or the number of replacements made. I chose the fourth, returning a pointer to the end of the string. This is more likely to be useful (for example allowing the caller to avoid a strlen call). Signed-off-by: Rasmus Villemoes Cc: "Theodore Ts'o" Cc: Greg Kroah-Hartman Cc: Neil Brown Cc: Steven Rostedt Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index e40099e585c9..a8d90db9c4b0 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -111,6 +111,7 @@ extern int memcmp(const void *,const void *,__kernel_size_t); extern void * memchr(const void *,int,__kernel_size_t); #endif void *memchr_inv(const void *s, int c, size_t n); +char *strreplace(char *s, char old, char new); extern void kfree_const(const void *x); -- cgit v1.2.3 From 78d8e58a086b214dddf1fd463e20a7e1d82d7866 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 26 Jun 2015 10:01:13 -0400 Subject: Revert "block, dm: don't copy bios for request clones" This reverts commit 5f1b670d0bef508a5554d92525f5f6d00d640b38. Justification for revert as reported in this dm-devel post: https://www.redhat.com/archives/dm-devel/2015-June/msg00160.html this change should not be pushed to mainline yet. Firstly, Christoph has a newer version of the patch that fixes silent data corruption problem: https://www.redhat.com/archives/dm-devel/2015-May/msg00229.html And the new version still depends on LLDDs to always complete requests to the end when error happens, while block API doesn't enforce such a requirement. If the assumption is ever broken, the inconsistency between request and bio (e.g. rq->__sector and rq->bio) will cause silent data corruption: https://www.redhat.com/archives/dm-devel/2015-June/msg00022.html Reported-by: Junichi Nomura Signed-off-by: Mike Snitzer --- include/linux/blk_types.h | 2 -- include/linux/blkdev.h | 6 +++++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 6ab9d12d1f17..7303b3405520 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -192,7 +192,6 @@ enum rq_flag_bits { __REQ_HASHED, /* on IO scheduler merge hash */ __REQ_MQ_INFLIGHT, /* track inflight for MQ */ __REQ_NO_TIMEOUT, /* requests may never expire */ - __REQ_CLONE, /* cloned bios */ __REQ_NR_BITS, /* stops here */ }; @@ -247,6 +246,5 @@ enum rq_flag_bits { #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) #define REQ_NO_TIMEOUT (1ULL << __REQ_NO_TIMEOUT) -#define REQ_CLONE (1ULL << __REQ_CLONE) #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 776d2ee43ba6..41c0fb573dff 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -775,7 +775,11 @@ extern void blk_add_request_payload(struct request *rq, struct page *page, unsigned int len); extern int blk_rq_check_limits(struct request_queue *q, struct request *rq); extern int blk_lld_busy(struct request_queue *q); -extern void blk_rq_prep_clone(struct request *rq, struct request *rq_src); +extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, + struct bio_set *bs, gfp_t gfp_mask, + int (*bio_ctr)(struct bio *, struct bio *, void *), + void *data); +extern void blk_rq_unprep_clone(struct request *rq); extern int blk_insert_cloned_request(struct request_queue *q, struct request *rq); extern void blk_delay_queue(struct request_queue *, unsigned long); -- cgit v1.2.3 From 5212e11fde4d40fa627668b4f2222d20db488f71 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 25 Jun 2015 04:20:32 -0400 Subject: nd_btt: atomic sector updates BTT stands for Block Translation Table, and is a way to provide power fail sector atomicity semantics for block devices that have the ability to perform byte granularity IO. It relies on the capability of libnvdimm namespace devices to do byte aligned IO. The BTT works as a stacked blocked device, and reserves a chunk of space from the backing device for its accounting metadata. It is a bio-based driver because all IO is done synchronously, and there is no queuing or asynchronous completions at either the device or the driver level. The BTT uses 'lanes' to index into various 'on-disk' data structures, and lanes also act as a synchronization mechanism in case there are more CPUs than available lanes. We did a comparison between two lane lock strategies - first where we kept an atomic counter around that tracked which was the last lane that was used, and 'our' lane was determined by atomically incrementing that. That way, for the nr_cpus > nr_lanes case, theoretically, no CPU would be blocked waiting for a lane. The other strategy was to use the cpu number we're scheduled on to and hash it to a lane number. Theoretically, this could block an IO that could've otherwise run using a different, free lane. But some fio workloads showed that the direct cpu -> lane hash performed faster than tracking 'last lane' - my reasoning is the cache thrash caused by moving the atomic variable made that approach slower than simply waiting out the in-progress IO. This supports the conclusion that the driver can be a very simple bio-based one that does synchronous IOs instead of queuing. Cc: Andy Lutomirski Cc: Boaz Harrosh Cc: H. Peter Anvin Cc: Jens Axboe Cc: Ingo Molnar Cc: Christoph Hellwig Cc: Neil Brown Cc: Jeff Moyer Cc: Dave Chinner Cc: Greg KH [jmoyer: fix nmi watchdog timeout in btt_map_init] [jmoyer: move btt initialization to module load path] [jmoyer: fix memory leak in the btt initialization path] [jmoyer: Don't overwrite corrupted arenas] Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index a59dca17b3aa..531d99dfac68 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -85,6 +85,7 @@ struct nd_region_desc { const struct attribute_group **attr_groups; struct nd_interleave_set *nd_set; void *provider_data; + int num_lanes; }; struct nvdimm_bus; -- cgit v1.2.3 From 047fc8a1f9a6330eacc80374dff087e20dc2304b Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 25 Jun 2015 04:21:02 -0400 Subject: libnvdimm, nfit, nd_blk: driver for BLK-mode access persistent memory The libnvdimm implementation handles allocating dimm address space (DPA) between PMEM and BLK mode interfaces. After DPA has been allocated from a BLK-region to a BLK-namespace the nd_blk driver attaches to handle I/O as a struct bio based block device. Unlike PMEM, BLK is required to handle platform specific details like mmio register formats and memory controller interleave. For this reason the libnvdimm generic nd_blk driver calls back into the bus provider to carry out the I/O. This initial implementation handles the BLK interface defined by the ACPI 6 NFIT [1] and the NVDIMM DSM Interface Example [2] composed from DCR (dimm control region), BDW (block data window), IDT (interleave descriptor) NFIT structures and the hardware register format. [1]: http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf [2]: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf Cc: Andy Lutomirski Cc: Boaz Harrosh Cc: H. Peter Anvin Cc: Jens Axboe Cc: Ingo Molnar Cc: Christoph Hellwig Signed-off-by: Ross Zwisler Acked-by: Rafael J. Wysocki Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 531d99dfac68..7fc1b25bdb5d 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -14,6 +14,7 @@ */ #ifndef __LIBNVDIMM_H__ #define __LIBNVDIMM_H__ +#include #include #include @@ -89,8 +90,24 @@ struct nd_region_desc { }; struct nvdimm_bus; -struct device; struct module; +struct device; +struct nd_blk_region; +struct nd_blk_region_desc { + int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); + void (*disable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); + int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, + void *iobuf, u64 len, int rw); + struct nd_region_desc ndr_desc; +}; + +static inline struct nd_blk_region_desc *to_blk_region_desc( + struct nd_region_desc *ndr_desc) +{ + return container_of(ndr_desc, struct nd_blk_region_desc, ndr_desc); + +} + struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc, struct module *module); #define nvdimm_bus_register(parent, desc) \ @@ -99,10 +116,10 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); struct nvdimm_bus *to_nvdimm_bus(struct device *dev); struct nvdimm *to_nvdimm(struct device *dev); struct nd_region *to_nd_region(struct device *dev); +struct nd_blk_region *to_nd_blk_region(struct device *dev); struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); const char *nvdimm_name(struct nvdimm *nvdimm); void *nvdimm_provider_data(struct nvdimm *nvdimm); -void *nd_region_provider_data(struct nd_region *nd_region); struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, const struct attribute_group **groups, unsigned long flags, unsigned long *dsm_mask); @@ -120,5 +137,11 @@ struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc); struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc); +void *nd_region_provider_data(struct nd_region *nd_region); +void *nd_blk_region_provider_data(struct nd_blk_region *ndbr); +void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data); +struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr); +unsigned int nd_region_acquire_lane(struct nd_region *nd_region); +void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane); u64 nd_fletcher64(void *addr, size_t len, bool le); #endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3 From 581388209405902b56d055f644b4dd124a206112 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 23 Jun 2015 20:08:34 -0400 Subject: libnvdimm, nfit: handle unarmed dimms, mark namespaces read-only Upon detection of an unarmed dimm in a region, arrange for descendant BTT, PMEM, or BLK instances to be read-only. A dimm is primarily marked "unarmed" via flags passed by platform firmware (NFIT). The flags in the NFIT memory device sub-structure indicate the state of the data on the nvdimm relative to its energy source or last "flush to persistence". For the most part there is nothing the driver can do but advertise the state of these flags in sysfs and emit a message if firmware indicates that the contents of the device may be corrupted. However, for the case of ACPI_NFIT_MEM_ARMED, the driver can arrange for the block devices incorporating that nvdimm to be marked read-only. This is a safe default as the data is still available and new writes are held off until the administrator either forces read-write mode, or the energy source becomes armed. A 'read_only' attribute is added to REGION devices to allow for overriding the default read-only policy of all descendant block devices. Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 7fc1b25bdb5d..dc799a29ed1a 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -21,6 +21,8 @@ enum { /* when a dimm supports both PMEM and BLK access a label is required */ NDD_ALIASING = 1 << 0, + /* unarmed memory devices may not persist writes */ + NDD_UNARMED = 1 << 1, /* need to set a limit somewhere, but yes, this is likely overkill */ ND_IOCTL_MAX_BUFLEN = SZ_4M, -- cgit v1.2.3 From 99759869faf15471cfce251bc138848d8af7d162 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Fri, 19 Jun 2015 17:14:15 -0600 Subject: acpi: Add acpi_map_pxm_to_online_node() The kernel initializes CPU & memory's NUMA topology from ACPI SRAT table. Some other ACPI tables, such as NFIT and DMAR, also contain proximity IDs for their device's NUMA topology. This information can be used to improve performance of these devices. This patch introduces acpi_map_pxm_to_online_node(), which is similar to acpi_map_pxm_to_node(), but always returns an online node. When the mapped node from a given proximity ID is offline, it looks up the node distance table and returns the nearest online node. ACPI device drivers, which are called after the NUMA initialization has completed in the kernel, can call this interface to obtain their device NUMA topology from ACPI tables. Such drivers do not have to deal with offline nodes. A node may be offline when a device proximity ID is unique, SRAT memory entry does not exist, or NUMA is disabled, ex. "numa=off" on x86. This patch also moves the pxm range check from acpi_get_node() to acpi_map_pxm_to_node(). Signed-off-by: Toshi Kani Acked-by: Rafael J. Wysocki > Signed-off-by: Dan Williams --- include/linux/acpi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index e4da5e35e29c..1b3bbb11d11c 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -289,8 +289,13 @@ extern void acpi_dmi_osi_linux(int enable, const struct dmi_system_id *d); extern void acpi_osi_setup(char *str); #ifdef CONFIG_ACPI_NUMA +int acpi_map_pxm_to_online_node(int pxm); int acpi_get_node(acpi_handle handle); #else +static inline int acpi_map_pxm_to_online_node(int pxm) +{ + return 0; +} static inline int acpi_get_node(acpi_handle handle) { return 0; -- cgit v1.2.3 From 41d7a6d637e1440f5410cb43c25a3c41255540c5 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Fri, 19 Jun 2015 12:18:33 -0600 Subject: libnvdimm: Set numa_node to NVDIMM devices ACPI NFIT table has System Physical Address Range Structure entries that describe a proximity ID of each range when ACPI_NFIT_PROXIMITY_VALID is set in the flags. Change acpi_nfit_register_region() to map a proximity ID to its node ID, and set it to a new numa_node field of nd_region_desc, which is then conveyed to the nd_region device. The device core arranges for btt and namespace devices to inherit their node from their parent region. Signed-off-by: Toshi Kani [djbw: move set_dev_node() from region.c to bus.c] Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index dc799a29ed1a..30b3deaafd51 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -89,6 +89,7 @@ struct nd_region_desc { struct nd_interleave_set *nd_set; void *provider_data; int num_lanes; + int numa_node; }; struct nvdimm_bus; -- cgit v1.2.3 From 74ae66c3b14ffa94c8d2dea201cdf8e6203d13d5 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Fri, 19 Jun 2015 12:18:34 -0600 Subject: libnvdimm: Add sysfs numa_node to NVDIMM devices Add support of sysfs 'numa_node' to I/O-related NVDIMM devices under /sys/bus/nd/devices, regionN, namespaceN.0, and bttN.x. An example of numa_node values on a 2-socket system with a single NVDIMM range on each socket is shown below. /sys/bus/nd/devices |-- btt0.0/numa_node:0 |-- btt1.0/numa_node:1 |-- btt1.1/numa_node:1 |-- namespace0.0/numa_node:0 |-- namespace1.0/numa_node:1 |-- region0/numa_node:0 |-- region1/numa_node:1 These numa_node files are then linked under the block class of their device names. /sys/class/block/pmem0/device/numa_node:0 /sys/class/block/pmem1s/device/numa_node:1 This enables numactl(8) to accept 'block:' and 'file:' paths of pmem and btt devices as shown in the examples below. numactl --preferred block:pmem0 --show numactl --preferred file:/dev/pmem1s --show Signed-off-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 30b3deaafd51..75e3af01ee32 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -38,6 +38,7 @@ enum { extern struct attribute_group nvdimm_bus_attribute_group; extern struct attribute_group nvdimm_attribute_group; extern struct attribute_group nd_device_attribute_group; +extern struct attribute_group nd_numa_attribute_group; extern struct attribute_group nd_region_attribute_group; extern struct attribute_group nd_mapping_attribute_group; -- cgit v1.2.3 From 61031952f4c89dba1065f7a5b9419badb112554c Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 25 Jun 2015 03:08:39 -0400 Subject: arch, x86: pmem api for ensuring durability of persistent memory updates Based on an original patch by Ross Zwisler [1]. Writes to persistent memory have the potential to be posted to cpu cache, cpu write buffers, and platform write buffers (memory controller) before being committed to persistent media. Provide apis, memcpy_to_pmem(), wmb_pmem(), and memremap_pmem(), to write data to pmem and assert that it is durable in PMEM (a persistent linear address range). A '__pmem' attribute is added so sparse can track proper usage of pointers to pmem. This continues the status quo of pmem being x86 only for 4.2, but reworks to ioremap, and wider implementation of memremap() will enable other archs in 4.3. [1]: https://lists.01.org/pipermail/linux-nvdimm/2015-May/000932.html Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Ross Zwisler [djbw: various reworks] Signed-off-by: Dan Williams --- include/linux/compiler.h | 2 + include/linux/pmem.h | 153 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 155 insertions(+) create mode 100644 include/linux/pmem.h (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 867722591be2..9a528d945498 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -21,6 +21,7 @@ # define __rcu __attribute__((noderef, address_space(4))) #else # define __rcu +# define __pmem __attribute__((noderef, address_space(5))) #endif extern void __chk_user_ptr(const volatile void __user *); extern void __chk_io_ptr(const volatile void __iomem *); @@ -42,6 +43,7 @@ extern void __chk_io_ptr(const volatile void __iomem *); # define __cond_lock(x,c) (c) # define __percpu # define __rcu +# define __pmem #endif /* Indirect macros required for expanded argument pasting, eg. __LINE__. */ diff --git a/include/linux/pmem.h b/include/linux/pmem.h new file mode 100644 index 000000000000..f6481a0b1d4f --- /dev/null +++ b/include/linux/pmem.h @@ -0,0 +1,153 @@ +/* + * Copyright(c) 2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __PMEM_H__ +#define __PMEM_H__ + +#include + +#ifdef CONFIG_ARCH_HAS_PMEM_API +#include +#else +static inline void arch_wmb_pmem(void) +{ + BUG(); +} + +static inline bool __arch_has_wmb_pmem(void) +{ + return false; +} + +static inline void __pmem *arch_memremap_pmem(resource_size_t offset, + unsigned long size) +{ + return NULL; +} + +static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, + size_t n) +{ + BUG(); +} +#endif + +/* + * Architectures that define ARCH_HAS_PMEM_API must provide + * implementations for arch_memremap_pmem(), arch_memcpy_to_pmem(), + * arch_wmb_pmem(), and __arch_has_wmb_pmem(). + */ + +static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size) +{ + memcpy(dst, (void __force const *) src, size); +} + +static inline void memunmap_pmem(void __pmem *addr) +{ + iounmap((void __force __iomem *) addr); +} + +/** + * arch_has_wmb_pmem - true if wmb_pmem() ensures durability + * + * For a given cpu implementation within an architecture it is possible + * that wmb_pmem() resolves to a nop. In the case this returns + * false, pmem api users are unable to ensure durability and may want to + * fall back to a different data consistency model, or otherwise notify + * the user. + */ +static inline bool arch_has_wmb_pmem(void) +{ + if (IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API)) + return __arch_has_wmb_pmem(); + return false; +} + +static inline bool arch_has_pmem_api(void) +{ + return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && arch_has_wmb_pmem(); +} + +/* + * These defaults seek to offer decent performance and minimize the + * window between i/o completion and writes being durable on media. + * However, it is undefined / architecture specific whether + * default_memremap_pmem + default_memcpy_to_pmem is sufficient for + * making data durable relative to i/o completion. + */ +static void default_memcpy_to_pmem(void __pmem *dst, const void *src, + size_t size) +{ + memcpy((void __force *) dst, src, size); +} + +static void __pmem *default_memremap_pmem(resource_size_t offset, + unsigned long size) +{ + /* TODO: convert to ioremap_wt() */ + return (void __pmem __force *)ioremap_nocache(offset, size); +} + +/** + * memremap_pmem - map physical persistent memory for pmem api + * @offset: physical address of persistent memory + * @size: size of the mapping + * + * Establish a mapping of the architecture specific memory type expected + * by memcpy_to_pmem() and wmb_pmem(). For example, it may be + * the case that an uncacheable or writethrough mapping is sufficient, + * or a writeback mapping provided memcpy_to_pmem() and + * wmb_pmem() arrange for the data to be written through the + * cache to persistent media. + */ +static inline void __pmem *memremap_pmem(resource_size_t offset, + unsigned long size) +{ + if (arch_has_pmem_api()) + return arch_memremap_pmem(offset, size); + return default_memremap_pmem(offset, size); +} + +/** + * memcpy_to_pmem - copy data to persistent memory + * @dst: destination buffer for the copy + * @src: source buffer for the copy + * @n: length of the copy in bytes + * + * Perform a memory copy that results in the destination of the copy + * being effectively evicted from, or never written to, the processor + * cache hierarchy after the copy completes. After memcpy_to_pmem() + * data may still reside in cpu or platform buffers, so this operation + * must be followed by a wmb_pmem(). + */ +static inline void memcpy_to_pmem(void __pmem *dst, const void *src, size_t n) +{ + if (arch_has_pmem_api()) + arch_memcpy_to_pmem(dst, src, n); + else + default_memcpy_to_pmem(dst, src, n); +} + +/** + * wmb_pmem - synchronize writes to persistent memory + * + * After a series of memcpy_to_pmem() operations this drains data from + * cpu write buffers and any platform (memory controller) buffers to + * ensure that written data is durable on persistent memory media. + */ +static inline void wmb_pmem(void) +{ + if (arch_has_pmem_api()) + arch_wmb_pmem(); +} +#endif /* __PMEM_H__ */ -- cgit v1.2.3 From 304adf8a8fff972f633bf52b3d160682d3f3d5d2 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 4 Jun 2015 12:13:26 +0800 Subject: genirq: Introduce helper irq_desc_get_irq() Introduce helper irq_desc_get_irq() to retrieve the irq number from the irq descriptor. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Marc Zyngier Link: http://lkml.kernel.org/r/1433391238-19471-17-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- include/linux/irqdesc.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 66c41b40ca50..a72bfd94ea10 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -100,6 +100,11 @@ static inline struct irq_desc *irq_data_to_desc(struct irq_data *data) #endif } +static inline unsigned int irq_desc_get_irq(struct irq_desc *desc) +{ + return desc->irq_data.irq; +} + static inline struct irq_data *irq_desc_get_irq_data(struct irq_desc *desc) { return &desc->irq_data; -- cgit v1.2.3 From bbc9d21fc0071c245c19077155ea371092ff0db8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 23 Jun 2015 15:01:30 +0200 Subject: genirq: Implement irq_set_handler_locked()/irq_set_chip_handler_name_locked() The main use case for the exisiting __irq_set_*_locked() inlines is to replace the handler [,chip and name] of an interrupt from a region which has the irq descriptor lock held, e.g. from the irq_set_type() callback. The first argument is the irq number, so the functions need so perform a pointless lookup of the interrupt descriptor for those cases which have the irq_data pointer handy. Provide new functions which take an irq_data pointer instead of the interrupt number, so the lookup of the interrupt descriptor can be avoided. Signed-off-by: Thomas Gleixner Cc: Jiang Liu Conflicts: include/linux/irqdesc.h --- include/linux/irqdesc.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index a72bfd94ea10..624a668e61f1 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -190,6 +190,47 @@ __irq_set_chip_handler_name_locked(unsigned int irq, struct irq_chip *chip, desc->name = name; } +/** + * irq_set_handler_locked - Set irq handler from a locked region + * @data: Pointer to the irq_data structure which identifies the irq + * @handler: Flow control handler function for this interrupt + * + * Sets the handler in the irq descriptor associated to @data. + * + * Must be called with irq_desc locked and valid parameters. Typical + * call site is the irq_set_type() callback. + */ +static inline void irq_set_handler_locked(struct irq_data *data, + irq_flow_handler_t handler) +{ + struct irq_desc *desc = irq_data_to_desc(data); + + desc->handle_irq = handler; +} + +/** + * irq_set_chip_handler_name_locked - Set chip, handler and name from a locked region + * @data: Pointer to the irq_data structure for which the chip is set + * @chip: Pointer to the new irq chip + * @handler: Flow control handler function for this interrupt + * @name: Name of the interrupt + * + * Replace the irq chip at the proper hierarchy level in @data and + * sets the handler and name in the associated irq descriptor. + * + * Must be called with irq_desc locked and valid parameters. + */ +static inline void +irq_set_chip_handler_name_locked(struct irq_data *data, struct irq_chip *chip, + irq_flow_handler_t handler, const char *name) +{ + struct irq_desc *desc = irq_data_to_desc(data); + + desc->handle_irq = handler; + desc->name = name; + data->chip = chip; +} + static inline int irq_balancing_disabled(unsigned int irq) { struct irq_desc *desc; -- cgit v1.2.3 From 6c5a0d891543873aefc3aaf846c1e7afe0982ff9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 27 Jun 2015 11:45:46 -0400 Subject: NFSv4.2: LAYOUTSTATS is optional to implement Make it so, by checking the return value for NFS4ERR_MOTSUPP and caching the information as a server capability. Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 5e1273d4de14..a2ea1491d3df 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -237,5 +237,6 @@ struct nfs_server { #define NFS_CAP_SEEK (1U << 19) #define NFS_CAP_ALLOCATE (1U << 20) #define NFS_CAP_DEALLOCATE (1U << 21) +#define NFS_CAP_LAYOUTSTATS (1U << 22) #endif -- cgit v1.2.3 From cf2fde7b39e9446e2af015215d7fb695781af0c1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 26 Jun 2015 06:44:38 +0930 Subject: param: fix module param locks when !CONFIG_SYSFS. As Dan Streetman points out, the entire point of locking for is to stop sysfs accesses, so they're elided entirely in the !SYSFS case. Reported-by: Stephen Rothwell Signed-off-by: Rusty Russell --- include/linux/module.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 6ba0e87fa804..46efa1c9de60 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -240,7 +240,9 @@ struct module { unsigned int num_syms; /* Kernel parameters. */ +#ifdef CONFIG_SYSFS struct mutex param_lock; +#endif struct kernel_param *kp; unsigned int num_kp; -- cgit v1.2.3 From ef90174f821041313d42d99c1c8b35a3af64a910 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Theou Date: Tue, 9 Jun 2015 09:55:02 -0700 Subject: watchdog: watchdog_core: Add watchdog registration deferral mechanism Currently, watchdog subsystem require the misc subsystem to register a watchdog. This may not be the case in case of an early registration of a watchdog, which can be required when the watchdog cannot be disabled. This patch introduces a deferral mechanism to remove this requirement. Signed-off-by: Jean-Baptiste Theou Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- include/linux/watchdog.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index a746bf5216f8..f47feada5b42 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -65,6 +65,8 @@ struct watchdog_ops { * @driver-data:Pointer to the drivers private data. * @lock: Lock for watchdog core internal use only. * @status: Field that contains the devices internal status bits. + * @deferred: entry in wtd_deferred_reg_list which is used to + * register early initialized watchdogs. * * The watchdog_device structure contains all information about a * watchdog timer device. @@ -95,6 +97,7 @@ struct watchdog_device { #define WDOG_ALLOW_RELEASE 2 /* Did we receive the magic char ? */ #define WDOG_NO_WAY_OUT 3 /* Is 'nowayout' feature set ? */ #define WDOG_UNREGISTERED 4 /* Has the device been unregistered */ + struct list_head deferred; }; #define WATCHDOG_NOWAYOUT IS_BUILTIN(CONFIG_WATCHDOG_NOWAYOUT) -- cgit v1.2.3 From a9730fca9946f3697410479e0ef1bd759ba00a77 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 29 Jun 2015 09:28:08 -0500 Subject: Fix kmalloc slab creation sequence This patch restores the slab creation sequence that was broken by commit 4066c33d0308f8 and also reverts the portions that introduced the KMALLOC_LOOP_XXX macros. Those can never really work since the slab creation is much more complex than just going from a minimum to a maximum number. The latest upstream kernel boots cleanly on my machine with a 64 bit x86 configuration under KVM using either SLAB or SLUB. Fixes: 4066c33d0308f8 ("support the slub_debug boot option") Reported-by: Theodore Ts'o Signed-off-by: Christoph Lameter Signed-off-by: Linus Torvalds --- include/linux/slab.h | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 9de2fdc8b5e4..a99f0e5243e1 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -153,30 +153,8 @@ size_t ksize(const void *); #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN #define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN #define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN) -/* - * The KMALLOC_LOOP_LOW is the definition for the for loop index start number - * to create the kmalloc_caches object in create_kmalloc_caches(). The first - * and the second are 96 and 192. You can see that in the kmalloc_index(), if - * the KMALLOC_MIN_SIZE <= 32, then return 1 (96). If KMALLOC_MIN_SIZE <= 64, - * then return 2 (192). If the KMALLOC_MIN_SIZE is bigger than 64, we don't - * need to initialize 96 and 192. Go directly to start the KMALLOC_SHIFT_LOW. - */ -#if KMALLOC_MIN_SIZE <= 32 -#define KMALLOC_LOOP_LOW 1 -#elif KMALLOC_MIN_SIZE <= 64 -#define KMALLOC_LOOP_LOW 2 -#else -#define KMALLOC_LOOP_LOW KMALLOC_SHIFT_LOW -#endif - #else #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) -/* - * The KMALLOC_MIN_SIZE of slub/slab/slob is 2^3/2^5/2^3. So, even slab is used. - * The KMALLOC_MIN_SIZE <= 32. The kmalloc-96 and kmalloc-192 should also be - * initialized. - */ -#define KMALLOC_LOOP_LOW 1 #endif /* -- cgit v1.2.3 From 31f02455455d405320e2f749696bef4e02903b35 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 30 Jun 2015 12:07:17 -0400 Subject: sparse: fix misplaced __pmem definition Move the definition of __pmem outside of CONFIG_SPARSE_RCU_POINTER to fix: drivers/nvdimm/pmem.c:198:17: sparse: too many arguments for function __builtin_expect drivers/nvdimm/pmem.c:36:33: sparse: expected ; at end of declaration drivers/nvdimm/pmem.c:48:21: sparse: void declaration ...due to __pmem failing to be defined in some configurations when CONFIG_SPARSE_RCU_POINTER=y. Reported-by: kbuild test robot Reported-by: Dan Carpenter Signed-off-by: Dan Williams --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 26fc8bc77f85..d8fbd500330e 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -17,11 +17,11 @@ # define __release(x) __context__(x,-1) # define __cond_lock(x,c) ((c) ? ({ __acquire(x); 1; }) : 0) # define __percpu __attribute__((noderef, address_space(3))) +# define __pmem __attribute__((noderef, address_space(5))) #ifdef CONFIG_SPARSE_RCU_POINTER # define __rcu __attribute__((noderef, address_space(4))) #else # define __rcu -# define __pmem __attribute__((noderef, address_space(5))) #endif extern void __chk_user_ptr(const volatile void __user *); extern void __chk_io_ptr(const volatile void __iomem *); -- cgit v1.2.3 From 8e7a7f8619f1f93736d9bb7e31caf4721bdc739d Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Tue, 30 Jun 2015 14:56:41 -0700 Subject: memblock: introduce a for_each_reserved_mem_region iterator Struct page initialisation had been identified as one of the reasons why large machines take a long time to boot. Patches were posted a long time ago to defer initialisation until they were first used. This was rejected on the grounds it should not be necessary to hurt the fast paths. This series reuses much of the work from that time but defers the initialisation of memory to kswapd so that one thread per node initialises memory local to that node. After applying the series and setting the appropriate Kconfig variable I see this in the boot log on a 64G machine [ 7.383764] kswapd 0 initialised deferred memory in 188ms [ 7.404253] kswapd 1 initialised deferred memory in 208ms [ 7.411044] kswapd 3 initialised deferred memory in 216ms [ 7.411551] kswapd 2 initialised deferred memory in 216ms On a 1TB machine, I see [ 8.406511] kswapd 3 initialised deferred memory in 1116ms [ 8.428518] kswapd 1 initialised deferred memory in 1140ms [ 8.435977] kswapd 0 initialised deferred memory in 1148ms [ 8.437416] kswapd 2 initialised deferred memory in 1148ms Once booted the machine appears to work as normal. Boot times were measured from the time shutdown was called until ssh was available again. In the 64G case, the boot time savings are negligible. On the 1TB machine, the savings were 16 seconds. Nate Zimmer said: : On an older 8 TB box with lots and lots of cpus the boot time, as : measure from grub to login prompt, the boot time improved from 1484 : seconds to exactly 1000 seconds. Waiman Long said: : I ran a bootup timing test on a 12-TB 16-socket IvyBridge-EX system. From : grub menu to ssh login, the bootup time was 453s before the patch and 265s : after the patch - a saving of 188s (42%). Daniel Blueman said: : On a 7TB, 1728-core NumaConnect system with 108 NUMA nodes, we're seeing : stock 4.0 boot in 7136s. This drops to 2159s, or a 70% reduction with : this patchset. Non-temporal PMD init (https://lkml.org/lkml/2015/4/23/350) : drops this to 1045s. This patch (of 13): As part of initializing struct page's in 2MiB chunks, we noticed that at the end of free_all_bootmem(), there was nothing which had forced the reserved/allocated 4KiB pages to be initialized. This helper function will be used for that expansion. Signed-off-by: Robin Holt Signed-off-by: Nate Zimmer Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 0215ffd63069..cc4b01972060 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -101,6 +101,9 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags, struct memblock_type *type_b, phys_addr_t *out_start, phys_addr_t *out_end, int *out_nid); +void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, + phys_addr_t *out_end); + /** * for_each_mem_range - iterate through memblock areas from type_a and not * included in type_b. Or just type_a if type_b is NULL. @@ -142,6 +145,21 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags, __next_mem_range_rev(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid)) +/** + * for_each_reserved_mem_region - iterate over all reserved memblock areas + * @i: u64 used as loop variable + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * + * Walks over reserved areas of memblock. Available as soon as memblock + * is initialized. + */ +#define for_each_reserved_mem_region(i, p_start, p_end) \ + for (i = 0UL, \ + __next_reserved_mem_region(&i, p_start, p_end); \ + i != (u64)ULLONG_MAX; \ + __next_reserved_mem_region(&i, p_start, p_end)) + #ifdef CONFIG_MOVABLE_NODE static inline bool memblock_is_hotpluggable(struct memblock_region *m) { -- cgit v1.2.3 From 92923ca3aacef63c92dc297a75ad0c6dfe4eab37 Mon Sep 17 00:00:00 2001 From: Nathan Zimmer Date: Tue, 30 Jun 2015 14:56:48 -0700 Subject: mm: meminit: only set page reserved in the memblock region Currently each page struct is set as reserved upon initialization. This patch leaves the reserved bit clear and only sets the reserved bit when it is known the memory was allocated by the bootmem allocator. This makes it easier to distinguish between uninitialised struct pages and reserved struct pages in later patches. Signed-off-by: Robin Holt Signed-off-by: Nathan Zimmer Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 99959a34f4f1..d662af2d0d01 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1635,6 +1635,8 @@ extern void free_highmem_page(struct page *page); extern void adjust_managed_page_count(struct page *page, long count); extern void mem_init_print_info(const char *str); +extern void reserve_bootmem_region(unsigned long start, unsigned long end); + /* Free the reserved page into the buddy system, so it gets managed. */ static inline void __free_reserved_page(struct page *page) { -- cgit v1.2.3 From 8a942fdea560d4ac0e9d9fabcd5201ad20e0c382 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jun 2015 14:56:55 -0700 Subject: mm: meminit: make __early_pfn_to_nid SMP-safe and introduce meminit_pfn_in_nid __early_pfn_to_nid() use static variables to cache recent lookups as memblock lookups are very expensive but it assumes that memory initialisation is single-threaded. Parallel initialisation of struct pages will break that assumption so this patch makes __early_pfn_to_nid() SMP-safe by requiring the caller to cache recent search information. early_pfn_to_nid() keeps the same interface but is only safe to use early in boot due to the use of a global static variable. meminit_pfn_in_nid() is an SMP-safe version that callers must maintain their own state for. Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Nate Zimmer Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 ++++-- include/linux/mmzone.h | 16 +++++++++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d662af2d0d01..2e872f92dbac 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1726,7 +1726,8 @@ extern void sparse_memory_present_with_active_regions(int nid); #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) -static inline int __early_pfn_to_nid(unsigned long pfn) +static inline int __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) { return 0; } @@ -1734,7 +1735,8 @@ static inline int __early_pfn_to_nid(unsigned long pfn) /* please see mm/page_alloc.c */ extern int __meminit early_pfn_to_nid(unsigned long pfn); /* there is a per-arch backend function. */ -extern int __meminit __early_pfn_to_nid(unsigned long pfn); +extern int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state); #endif extern void set_dma_reserve(unsigned long new_dma_reserve); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 54d74f6eb233..b2473d822549 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1216,10 +1216,24 @@ void sparse_init(void); #define sparse_index_init(_sec, _nid) do {} while (0) #endif /* CONFIG_SPARSEMEM */ +/* + * During memory init memblocks map pfns to nids. The search is expensive and + * this caches recent lookups. The implementation of __early_pfn_to_nid + * may treat start/end as pfns or sections. + */ +struct mminit_pfnnid_cache { + unsigned long last_start; + unsigned long last_end; + int last_nid; +}; + #ifdef CONFIG_NODES_SPAN_OTHER_NODES bool early_pfn_in_nid(unsigned long pfn, int nid); +bool meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state); #else -#define early_pfn_in_nid(pfn, nid) (1) +#define early_pfn_in_nid(pfn, nid) (1) +#define meminit_pfn_in_nid(pfn, nid, state) (1) #endif #ifndef early_pfn_valid -- cgit v1.2.3 From 75a592a47129dcfc1aec40e7d3cdf239a767d441 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jun 2015 14:56:59 -0700 Subject: mm: meminit: inline some helper functions early_pfn_in_nid() and meminit_pfn_in_nid() are small functions that are unnecessarily visible outside memory initialisation. As well as unnecessary visibility, it's unnecessary function call overhead when initialising pages. This patch moves the helpers inline. [akpm@linux-foundation.org: fix build] [mhocko@suse.cz: fix build] Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Nate Zimmer Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b2473d822549..1e05dc7449cd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1227,15 +1227,6 @@ struct mminit_pfnnid_cache { int last_nid; }; -#ifdef CONFIG_NODES_SPAN_OTHER_NODES -bool early_pfn_in_nid(unsigned long pfn, int nid); -bool meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state); -#else -#define early_pfn_in_nid(pfn, nid) (1) -#define meminit_pfn_in_nid(pfn, nid, state) (1) -#endif - #ifndef early_pfn_valid #define early_pfn_valid(pfn) (1) #endif -- cgit v1.2.3 From 3a80a7fa7989fbb6aa56bb6ad31811b62cf99e60 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jun 2015 14:57:02 -0700 Subject: mm: meminit: initialise a subset of struct pages if CONFIG_DEFERRED_STRUCT_PAGE_INIT is set This patch initalises all low memory struct pages and 2G of the highest zone on each node during memory initialisation if CONFIG_DEFERRED_STRUCT_PAGE_INIT is set. That config option cannot be set but will be available in a later patch. Parallel initialisation of struct page depends on some features from memory hotplug and it is necessary to alter alter section annotations. Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Nate Zimmer Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1e05dc7449cd..754c25966a0a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -762,6 +762,14 @@ typedef struct pglist_data { /* Number of pages migrated during the rate limiting time interval */ unsigned long numabalancing_migrate_nr_pages; #endif + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* + * If memory initialisation on large machines is deferred then this + * is the first PFN that needs to be initialised. + */ + unsigned long first_deferred_pfn; +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) -- cgit v1.2.3 From 0e1cc95b4cc7293bb7b39175035e7f7e45c90977 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jun 2015 14:57:27 -0700 Subject: mm: meminit: finish initialisation of struct pages before basic setup Waiman Long reported that 24TB machines hit OOM during basic setup when struct page initialisation was deferred. One approach is to initialise memory on demand but it interferes with page allocator paths. This patch creates dedicated threads to initialise memory before basic setup. It then blocks on a rw_semaphore until completion as a wait_queue and counter is overkill. This may be slower to boot but it's simplier overall and also gets rid of a section mangling which existed so kswapd could do the initialisation. [akpm@linux-foundation.org: include rwsem.h, use DECLARE_RWSEM, fix comment, remove unneeded cast] Signed-off-by: Mel Gorman Cc: Waiman Long Cc: Dave Hansen Cc: Scott Norton Tested-by: Daniel J Blueman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 6ba7cf23748f..ad35f300b9a4 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -384,6 +384,14 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +void page_alloc_init_late(void); +#else +static inline void page_alloc_init_late(void) +{ +} +#endif + /* * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what * GFP flags are used before interrupts are enabled. Once interrupts are -- cgit v1.2.3 From 5375b708f2547f70cd2bee2fd8663ab7035f9551 Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke Date: Tue, 30 Jun 2015 14:57:46 -0700 Subject: kernel/panic/kexec: fix "crash_kexec_post_notifiers" option issue in oops path Commit f06e5153f4ae2e ("kernel/panic.c: add "crash_kexec_post_notifiers" option for kdump after panic_notifers") introduced "crash_kexec_post_notifiers" kernel boot option, which toggles wheather panic() calls crash_kexec() before panic_notifiers and dump kmsg or after. The problem is that the commit overlooks panic_on_oops kernel boot option. If it is enabled, crash_kexec() is called directly without going through panic() in oops path. To fix this issue, this patch adds a check to "crash_kexec_post_notifiers" in the condition of kexec_should_crash(). Also, put a comment in kexec_should_crash() to explain not obvious things on this patch. Signed-off-by: HATAYAMA Daisuke Acked-by: Baoquan He Tested-by: Hidehiro Kawai Reviewed-by: Masami Hiramatsu Cc: Vivek Goyal Cc: Ingo Molnar Cc: Hidehiro Kawai Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5acf5b70866d..0dfa4e31563d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -439,6 +439,9 @@ extern int panic_on_unrecovered_nmi; extern int panic_on_io_nmi; extern int panic_on_warn; extern int sysctl_panic_on_stackoverflow; + +extern bool crash_kexec_post_notifiers; + /* * Only to be used by arch init code. If the user over-wrote the default * CONFIG_PANIC_TIMEOUT, honor it. -- cgit v1.2.3 From 2a1bf8f93b33992bb0457512b28d046e279bbd7e Mon Sep 17 00:00:00 2001 From: Dave Gordon Date: Tue, 30 Jun 2015 14:58:54 -0700 Subject: lib/scatterlist: mark input buffer parameters as 'const' The 'buf' parameter of sg(p)copy_from_buffer() can and should be const-qualified, although because of the shared implementation of _to_buffer() and _from_buffer(), we have to cast this away internally. This means that callers who have a 'const' buffer containing the data to be copied to the sg-list no longer have to cast away the const-ness themselves. It also enables improved coverage by code analysis tools. Signed-off-by: Dave Gordon Cc: Akinobu Mita Cc: "Martin K. Petersen" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/scatterlist.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 50a8486c524b..505d0481df1e 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -266,12 +266,12 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, gfp_t gfp_mask); size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents, - void *buf, size_t buflen); + const void *buf, size_t buflen); size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen); size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents, - void *buf, size_t buflen, off_t skip); + const void *buf, size_t buflen, off_t skip); size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip); -- cgit v1.2.3 From 386ecb1216f9e38947ce6a2af22e5e1e47256a97 Mon Sep 17 00:00:00 2001 From: Dave Gordon Date: Tue, 30 Jun 2015 14:58:57 -0700 Subject: drivers/scsi/scsi_debug.c: resolve sg buffer const-ness issue do_device_access() takes a separate parameter to indicate the direction of data transfer, which it used to use to select the appropriate function out of sg_pcopy_{to,from}_buffer(). However these two functions now have So this patch makes it bypass these wrappers and call the underlying function sg_copy_buffer() directly; this has the same calling style as do_device_access() i.e. a separate direction-of-transfer parameter and no pointers-to-const, so skipping the wrappers not only eliminates the warning, it also make the code simpler :) [akpm@linux-foundation.org: fix very broken build] Signed-off-by: Dave Gordon Acked-by: Arnd Bergmann Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/scatterlist.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 505d0481df1e..9b1ef0c820a7 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -265,6 +265,9 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, unsigned long offset, unsigned long size, gfp_t gfp_mask); +size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, + size_t buflen, off_t skip, bool to_buffer); + size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents, const void *buf, size_t buflen); size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents, -- cgit v1.2.3 From 0030edf296db8a7afb13573eb12977b7d399cd40 Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Tue, 30 Jun 2015 15:00:03 -0700 Subject: genalloc: rename dev_get_gen_pool() to gen_pool_get() To be consistent with other genalloc interface namings, rename dev_get_gen_pool() to gen_pool_get(). The original omitted "dev_" prefix is removed, since it points to argument type of the function, and so it does not bring any useful information. [akpm@linux-foundation.org: update arch/arm/mach-socfpga/pm.c] Signed-off-by: Vladimir Zapolskiy Acked-by: Nicolas Ferre Cc: Philipp Zabel Cc: Shawn Guo Cc: Sascha Hauer Cc: Alexandre Belloni Cc: Russell King Cc: Mauro Carvalho Chehab Cc: Vinod Koul Cc: Takashi Iwai Cc: Jaroslav Kysela Cc: Mark Brown Cc: Nicolas Ferre Cc: Alan Tull Cc: Dinh Nguyen Cc: Kevin Hilman Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genalloc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 1ccaab44abcc..015d17068615 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -119,7 +119,7 @@ extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, extern struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, int nid); -extern struct gen_pool *dev_get_gen_pool(struct device *dev); +extern struct gen_pool *gen_pool_get(struct device *dev); bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, size_t size); -- cgit v1.2.3 From abdd4a7025282fbe3737e1bcb5f51afc8d8ea1b8 Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Tue, 30 Jun 2015 15:00:07 -0700 Subject: genalloc: rename of_get_named_gen_pool() to of_gen_pool_get() To be consistent with other kernel interface namings, rename of_get_named_gen_pool() to of_gen_pool_get(). In the original function name "_named" suffix references to a device tree property, which contains a phandle to a device and the corresponding device driver is assumed to register a gen_pool object. Due to a weak relation and to avoid any confusion (e.g. in future possible scenario if gen_pool objects are named) the suffix is removed. [sfr@canb.auug.org.au: crypto/marvell/cesa - fix up for of_get_named_gen_pool() rename] Signed-off-by: Vladimir Zapolskiy Cc: Nicolas Ferre Cc: Philipp Zabel Cc: Shawn Guo Cc: Sascha Hauer Cc: Alexandre Belloni Cc: Russell King Cc: Mauro Carvalho Chehab Cc: Vinod Koul Cc: Takashi Iwai Cc: Jaroslav Kysela Signed-off-by: Stephen Rothwell Cc: Herbert Xu Cc: Boris BREZILLON Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genalloc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 015d17068615..5383bb1394a1 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -125,10 +125,10 @@ bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, size_t size); #ifdef CONFIG_OF -extern struct gen_pool *of_get_named_gen_pool(struct device_node *np, +extern struct gen_pool *of_gen_pool_get(struct device_node *np, const char *propname, int index); #else -static inline struct gen_pool *of_get_named_gen_pool(struct device_node *np, +static inline struct gen_pool *of_gen_pool_get(struct device_node *np, const char *propname, int index) { return NULL; -- cgit v1.2.3 From 8a81252b774b53e628a8a0fe18e2b8fc236d92cc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Jun 2015 15:54:08 +0200 Subject: fs/file.c: don't acquire files->file_lock in fd_install() Mateusz Guzik reported : Currently obtaining a new file descriptor results in locking fdtable twice - once in order to reserve a slot and second time to fill it. Holding the spinlock in __fd_install() is needed in case a resize is done, or to prevent a resize. Mateusz provided an RFC patch and a micro benchmark : http://people.redhat.com/~mguzik/pipebench.c A resize is an unlikely operation in a process lifetime, as table size is at least doubled at every resize. We can use RCU instead of the spinlock. __fd_install() must wait if a resize is in progress. The resize must block new __fd_install() callers from starting, and wait that ongoing install are finished (synchronize_sched()) resize should be attempted by a single thread to not waste resources. rcu_sched variant is used, as __fd_install() and expand_fdtable() run from process context. It gives us a ~30% speedup using pipebench on a dual Intel(R) Xeon(R) CPU E5-2696 v2 @ 2.50GHz Signed-off-by: Eric Dumazet Reported-by: Mateusz Guzik Acked-by: Mateusz Guzik Tested-by: Mateusz Guzik Signed-off-by: Al Viro --- include/linux/fdtable.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 230f87bdf5ad..fbb88740634a 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -47,6 +47,9 @@ struct files_struct { * read mostly part */ atomic_t count; + bool resize_in_progress; + wait_queue_head_t resize_wait; + struct fdtable __rcu *fdt; struct fdtable fdtab; /* -- cgit v1.2.3 From fbabfd0f4ee2e8847bf56edf481249ad1bb8c44d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 9 May 2015 15:54:49 -0500 Subject: fs: Add helper functions for permanently empty directories. To ensure it is safe to mount proc and sysfs I need to check if filesystems that are mounted on top of them are mounted on truly empty directories. Given that some directories can gain entries over time, knowing that a directory is empty right now is insufficient. Therefore add supporting infrastructure for permantently empty directories that proc and sysfs can use when they create mount points for filesystems and fs_fully_visible can use to test for permanently empty directories to ensure that nothing will be gained by mounting a fresh copy of proc or sysfs. Cc: stable@vger.kernel.org Signed-off-by: "Eric W. Biederman" --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 2d24eeb8e59c..571aab91bfc0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2780,6 +2780,8 @@ extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned in extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); extern const struct file_operations simple_dir_operations; extern const struct inode_operations simple_dir_inode_operations; +extern void make_empty_dir_inode(struct inode *inode); +extern bool is_empty_dir_inode(struct inode *inode); struct tree_descr { char *name; const struct file_operations *ops; int mode; }; struct dentry *d_alloc_name(struct dentry *, const char *); extern int simple_fill_super(struct super_block *, unsigned long, struct tree_descr *); -- cgit v1.2.3 From f9bd6733d3f11e24f3949becf277507d422ee1eb Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 9 May 2015 22:09:14 -0500 Subject: sysctl: Allow creating permanently empty directories that serve as mountpoints. Add a magic sysctl table sysctl_mount_point that when used to create a directory forces that directory to be permanently empty. Update the code to use make_empty_dir_inode when accessing permanently empty directories. Update the code to not allow adding to permanently empty directories. Update /proc/sys/fs/binfmt_misc to be a permanently empty directory. Cc: stable@vger.kernel.org Signed-off-by: "Eric W. Biederman" --- include/linux/sysctl.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 795d5fea5697..fa7bc29925c9 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -188,6 +188,9 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init(void); + +extern struct ctl_table sysctl_mount_point[]; + #else /* CONFIG_SYSCTL */ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) { -- cgit v1.2.3 From ea015218f2f7ace2dad9cedd21ed95bdba2886d7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 13 May 2015 16:09:29 -0500 Subject: kernfs: Add support for always empty directories. Add a new function kernfs_create_empty_dir that can be used to create directory that can not be modified. Update the code to use make_empty_dir_inode when reporting a permanently empty directory to the vfs. Update the code to not allow adding to permanently empty directories. Cc: stable@vger.kernel.org Signed-off-by: "Eric W. Biederman" --- include/linux/kernfs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 71ecdab1671b..29d1896c3ba5 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -45,6 +45,7 @@ enum kernfs_node_flag { KERNFS_LOCKDEP = 0x0100, KERNFS_SUICIDAL = 0x0400, KERNFS_SUICIDED = 0x0800, + KERNFS_EMPTY_DIR = 0x1000, }; /* @flags for kernfs_create_root() */ @@ -285,6 +286,8 @@ void kernfs_destroy_root(struct kernfs_root *root); struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, void *priv, const void *ns); +struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, + const char *name); struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, loff_t size, -- cgit v1.2.3 From 87d2846fcf88113fae2341da1ca9a71f0d916f2c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 13 May 2015 16:31:40 -0500 Subject: sysfs: Add support for permanently empty directories to serve as mount points. Add two functions sysfs_create_mount_point and sysfs_remove_mount_point that hang a permanently empty directory off of a kobject or remove a permanently emptpy directory hanging from a kobject. Export these new functions so modular filesystems can use them. Cc: stable@vger.kernel.org Acked-by: Greg Kroah-Hartman Signed-off-by: "Eric W. Biederman" --- include/linux/sysfs.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 99382c0df17e..9f65758311a4 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -210,6 +210,10 @@ int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, int __must_check sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns); +int __must_check sysfs_create_mount_point(struct kobject *parent_kobj, + const char *name); +void sysfs_remove_mount_point(struct kobject *parent_kobj, + const char *name); int __must_check sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, @@ -298,6 +302,17 @@ static inline int sysfs_move_dir_ns(struct kobject *kobj, return 0; } +static inline int sysfs_create_mount_point(struct kobject *parent_kobj, + const char *name) +{ + return 0; +} + +static inline void sysfs_remove_mount_point(struct kobject *parent_kobj, + const char *name) +{ +} + static inline int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) -- cgit v1.2.3 From bd7ade3cd9b0850264306f5c2b79024a417b6396 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 2 Jul 2015 01:32:44 -0400 Subject: bufferhead: Add _gfp version for sb_getblk() sb_getblk() is used during ext4 (and possibly other FSes) writeback paths. Sometimes such path require allocating memory and guaranteeing that such allocation won't block. Currently, however, there is no way to provide user flags for sb_getblk which could lead to deadlocks. This patch implements a sb_getblk_gfp with the only difference it can accept user-provided GFP flags. Signed-off-by: Nikolay Borisov Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- include/linux/buffer_head.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 73b45225a7ca..e6797ded700e 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -317,6 +317,13 @@ sb_getblk(struct super_block *sb, sector_t block) return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE); } + +static inline struct buffer_head * +sb_getblk_gfp(struct super_block *sb, sector_t block, gfp_t gfp) +{ + return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, gfp); +} + static inline struct buffer_head * sb_find_get_block(struct super_block *sb, sector_t block) { -- cgit v1.2.3 From ec110bc7cc48d7806c9b65094e6afb19452d458f Mon Sep 17 00:00:00 2001 From: Allen Hubbe Date: Thu, 7 May 2015 06:45:21 -0400 Subject: NTB: Move files in preparation for NTB abstraction This patch only moves files to their new locations, before applying the next two patches adding the NTB Abstraction layer. Splitting this patch from the next is intended make distinct which code is changed only due to moving the files, versus which are substantial code changes in adding the NTB Abstraction layer. Signed-off-by: Allen Hubbe Signed-off-by: Jon Mason --- include/linux/ntb.h | 88 ------------------------------------------- include/linux/ntb_transport.h | 88 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 88 deletions(-) delete mode 100644 include/linux/ntb.h create mode 100644 include/linux/ntb_transport.h (limited to 'include/linux') diff --git a/include/linux/ntb.h b/include/linux/ntb.h deleted file mode 100644 index 9ac1a62fc6f5..000000000000 --- a/include/linux/ntb.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2012 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * BSD LICENSE - * - * Copyright(c) 2012 Intel Corporation. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copy - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * Intel PCIe NTB Linux driver - * - * Contact Information: - * Jon Mason - */ - -struct ntb_transport_qp; - -struct ntb_client { - struct device_driver driver; - int (*probe)(struct pci_dev *pdev); - void (*remove)(struct pci_dev *pdev); -}; - -enum { - NTB_LINK_DOWN = 0, - NTB_LINK_UP, -}; - -int ntb_register_client(struct ntb_client *drvr); -void ntb_unregister_client(struct ntb_client *drvr); -int ntb_register_client_dev(char *device_name); -void ntb_unregister_client_dev(char *device_name); - -struct ntb_queue_handlers { - void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data, - void *data, int len); - void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data, - void *data, int len); - void (*event_handler)(void *data, int status); -}; - -unsigned char ntb_transport_qp_num(struct ntb_transport_qp *qp); -unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp); -struct ntb_transport_qp * -ntb_transport_create_queue(void *data, struct pci_dev *pdev, - const struct ntb_queue_handlers *handlers); -void ntb_transport_free_queue(struct ntb_transport_qp *qp); -int ntb_transport_rx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data, - unsigned int len); -int ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data, - unsigned int len); -void *ntb_transport_rx_remove(struct ntb_transport_qp *qp, unsigned int *len); -void ntb_transport_link_up(struct ntb_transport_qp *qp); -void ntb_transport_link_down(struct ntb_transport_qp *qp); -bool ntb_transport_link_query(struct ntb_transport_qp *qp); diff --git a/include/linux/ntb_transport.h b/include/linux/ntb_transport.h new file mode 100644 index 000000000000..f78b64cc09d5 --- /dev/null +++ b/include/linux/ntb_transport.h @@ -0,0 +1,88 @@ +/* + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2012 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * BSD LICENSE + * + * Copyright(c) 2012 Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copy + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Intel PCIe NTB Linux driver + * + * Contact Information: + * Jon Mason + */ + +struct ntb_transport_qp; + +struct ntb_client { + struct device_driver driver; + int (*probe)(struct pci_dev *pdev); + void (*remove)(struct pci_dev *pdev); +}; + +enum { + NTB_LINK_DOWN = 0, + NTB_LINK_UP, +}; + +int ntb_transport_register_client(struct ntb_client *drvr); +void ntb_transport_unregister_client(struct ntb_client *drvr); +int ntb_register_client_dev(char *device_name); +void ntb_unregister_client_dev(char *device_name); + +struct ntb_queue_handlers { + void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data, + void *data, int len); + void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data, + void *data, int len); + void (*event_handler)(void *data, int status); +}; + +unsigned char ntb_transport_qp_num(struct ntb_transport_qp *qp); +unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp); +struct ntb_transport_qp * +ntb_transport_create_queue(void *data, struct pci_dev *pdev, + const struct ntb_queue_handlers *handlers); +void ntb_transport_free_queue(struct ntb_transport_qp *qp); +int ntb_transport_rx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data, + unsigned int len); +int ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data, + unsigned int len); +void *ntb_transport_rx_remove(struct ntb_transport_qp *qp, unsigned int *len); +void ntb_transport_link_up(struct ntb_transport_qp *qp); +void ntb_transport_link_down(struct ntb_transport_qp *qp); +bool ntb_transport_link_query(struct ntb_transport_qp *qp); -- cgit v1.2.3 From a13f35e8714009145e32ebe2bf25b84e1376e314 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 2 Jul 2015 08:44:34 -0600 Subject: writeback: don't embed root bdi_writeback_congested in bdi_writeback 52ebea749aae ("writeback: make backing_dev_info host cgroup-specific bdi_writebacks") made bdi (backing_dev_info) host per-cgroup wb's (bdi_writeback's). As the congested state needs to be per-wb and referenced from blkcg side and multiple wbs, the patch made all non-root cong's (bdi_writeback_congested's) reference counted and indexed on bdi. When a bdi is destroyed, cgwb_bdi_destroy() tries to drain all non-root cong's; however, this can hang indefinitely because wb's can also be referenced from blkcg_gq's which are destroyed after bdi destruction is complete. To fix the bug, bdi destruction will be updated to not wait for cong's to drain, which naturally means that cong's may outlive the associated bdi. This is fine for non-root cong's but is problematic for the root cong's which are embedded in their bdi's as they may end up getting dereferenced after the containing bdi's are freed. This patch makes root cong's behave the same as non-root cong's. They are no longer embedded in their bdi's but allocated separately during bdi initialization, indexed and reference counted the same way. * As cong handling is the same for all wb's, wb->congested initialization is moved into wb_init(). * When !CONFIG_CGROUP_WRITEBACK, there was no indexing or refcnting. bdi->wb_congested is now a pointer pointing to the root cong allocated during bdi init and minimal refcnting operations are implemented. * The above makes root wb init paths diverge depending on CONFIG_CGROUP_WRITEBACK. root wb init is moved to cgwb_bdi_init(). This patch in itself shouldn't cause any consequential behavior differences but prepares for the actual fix. Signed-off-by: Tejun Heo Reported-by: Jon Christopherson Link: https://bugzilla.kernel.org/show_bug.cgi?id=100681 Tested-by: Jon Christopherson Added include to backing-dev.h for kfree() definition. Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 5 +++-- include/linux/backing-dev.h | 6 +++++- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index a48d90e3bcbb..a23209b43842 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -50,10 +50,10 @@ enum wb_stat_item { */ struct bdi_writeback_congested { unsigned long state; /* WB_[a]sync_congested flags */ + atomic_t refcnt; /* nr of attached wb's and blkg */ #ifdef CONFIG_CGROUP_WRITEBACK struct backing_dev_info *bdi; /* the associated bdi */ - atomic_t refcnt; /* nr of attached wb's and blkg */ int blkcg_id; /* ID of the associated blkcg */ struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */ #endif @@ -150,11 +150,12 @@ struct backing_dev_info { atomic_long_t tot_write_bandwidth; struct bdi_writeback wb; /* the root writeback info for this bdi */ - struct bdi_writeback_congested wb_congested; /* its congested state */ #ifdef CONFIG_CGROUP_WRITEBACK struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ struct rb_root cgwb_congested_tree; /* their congested states */ atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */ +#else + struct bdi_writeback_congested *wb_congested; #endif wait_queue_head_t wb_waitq; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0e6d4828a77a..0fe9df983ab7 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -15,6 +15,7 @@ #include #include #include +#include int __must_check bdi_init(struct backing_dev_info *bdi); void bdi_destroy(struct backing_dev_info *bdi); @@ -465,11 +466,14 @@ static inline bool inode_cgwb_enabled(struct inode *inode) static inline struct bdi_writeback_congested * wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) { - return bdi->wb.congested; + atomic_inc(&bdi->wb_congested->refcnt); + return bdi->wb_congested; } static inline void wb_congested_put(struct bdi_writeback_congested *congested) { + if (atomic_dec_and_test(&congested->refcnt)) + kfree(congested); } static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) -- cgit v1.2.3 From 91e20b5040c67c51aad88cf87db4305c5bd7f79d Mon Sep 17 00:00:00 2001 From: Joel Porquet Date: Thu, 2 Jul 2015 15:32:00 -0400 Subject: irqchip: Move IRQCHIP_DECLARE macro to include/linux/irqchip.h At the moment the IRQCHIP_DECLARE macro is only declared locally in drivers/irqchip/irqchip.h. It prevents from using it directly in arch/* directories whenever irqchip drivers only exist there, which happens in a few cases (e.g. arc, arm, microblaze and mips). This patch makes the macro to be globally defined, i.e. in include/linux/irqchip.h, and thus usable for arch-specific declarations of irqchip drivers. In this way, it is very similar to what clocksource does (ie CLOCKSOURCE_OF_DECLARE is defined in include/linux/clocksource.h). For now, this patch only moves the declaration of the macro IRQCHIP_DECLARE to the global header 'include/linux/irqchip.h' and make 'drivers/irqchip/irqchip.h' include 'include/linux/irqchip.h'. Later, other patches will get rid of 'drivers/irqchip/irqchip.h' and modify all the impacted irqchip drivers. Signed-off-by: Joel Porquet Cc: Jason Cooper Link: http://lkml.kernel.org/r/1435865565-14114-1-git-send-email-joel@porquet.org Signed-off-by: Thomas Gleixner --- include/linux/irqchip.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h index 14d79131f53d..638887376e58 100644 --- a/include/linux/irqchip.h +++ b/include/linux/irqchip.h @@ -11,6 +11,20 @@ #ifndef _LINUX_IRQCHIP_H #define _LINUX_IRQCHIP_H +#include + +/* + * This macro must be used by the different irqchip drivers to declare + * the association between their DT compatible string and their + * initialization function. + * + * @name: name that must be unique accross all IRQCHIP_DECLARE of the + * same file. + * @compstr: compatible string of the irqchip driver + * @fn: initialization function + */ +#define IRQCHIP_DECLARE(name, compat, fn) OF_DECLARE_2(irqchip, name, compat, fn) + #ifdef CONFIG_IRQCHIP void irqchip_init(void); #else -- cgit v1.2.3 From 2ecd9d29abb171d6e97a4f3eb29d7456a11401b7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Jul 2015 18:53:58 +0200 Subject: sched, preempt_notifier: separate notifier registration from static_key inc/dec Commit 1cde2930e154 ("sched/preempt: Add static_key() to preempt_notifiers") had two problems. First, the preempt-notifier API needs to sleep with the addition of the static_key, we do however need to hold off preemption while modifying the preempt notifier list, otherwise a preemption could observe an inconsistent list state. KVM correctly registers and unregisters preempt notifiers with preemption disabled, so the sleep caused dmesg splats. Second, KVM registers and unregisters preemption notifiers very often (in vcpu_load/vcpu_put). With a single uniprocessor guest the static key would move between 0 and 1 continuously, hitting the slow path on every userspace exit. To fix this, wrap the static_key inc/dec in a new API, and call it from KVM. Fixes: 1cde2930e154 ("sched/preempt: Add static_key() to preempt_notifiers") Reported-by: Pontus Fuchs Reported-by: Takashi Iwai Tested-by: Takashi Iwai Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paolo Bonzini --- include/linux/preempt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 0f1534acaf60..84991f185173 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -293,6 +293,8 @@ struct preempt_notifier { struct preempt_ops *ops; }; +void preempt_notifier_inc(void); +void preempt_notifier_dec(void); void preempt_notifier_register(struct preempt_notifier *notifier); void preempt_notifier_unregister(struct preempt_notifier *notifier); -- cgit v1.2.3 From f6db8347993256b58bd4746b0c4c5b935c32210d Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 25 Jun 2015 23:53:37 +0530 Subject: sched/stat: Simplify the sched_info accounting dependency Both CONFIG_SCHEDSTATS=y and CONFIG_TASK_DELAY_ACCT=y track task sched_info, which results in ugly #if clauses. Simplify the code by introducing a synthethic CONFIG_SCHED_INFO switch, selected by both. Signed-off-by: Naveen N. Rao Cc: Balbir Singh Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Cc: a.p.zijlstra@chello.nl Cc: ricklind@us.ibm.com Link: http://lkml.kernel.org/r/8d19eef800811a94b0f91bcbeb27430a884d7433.1435255405.git.naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6633e83e608a..9bf4bc0e3b8b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -849,7 +849,7 @@ extern struct user_struct root_user; struct backing_dev_info; struct reclaim_state; -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +#ifdef CONFIG_SCHED_INFO struct sched_info { /* cumulative counters */ unsigned long pcount; /* # of times run on this cpu */ @@ -859,7 +859,7 @@ struct sched_info { unsigned long long last_arrival,/* when we last ran on a cpu */ last_queued; /* when we were last queued to run */ }; -#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ +#endif /* CONFIG_SCHED_INFO */ #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info { @@ -1408,7 +1408,7 @@ struct task_struct { int rcu_tasks_idle_cpu; #endif /* #ifdef CONFIG_TASKS_RCU */ -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +#ifdef CONFIG_SCHED_INFO struct sched_info sched_info; #endif -- cgit v1.2.3 From 6b55c9654fccf69ae7ace23ca101dc37b903181b Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 25 Jun 2015 22:51:41 +0530 Subject: sched/debug: Move print_cfs_rq() declaration to kernel/sched/sched.h Currently print_cfs_rq() is declared in include/linux/sched.h. However it's not used outside kernel/sched. Hence move the declaration to kernel/sched/sched.h Also some functions are only available for CONFIG_SCHED_DEBUG=y. Hence move the declarations to within the #ifdef. Signed-off-by: Srikar Dronamraju Acked-by: Rik van Riel Cc: Iulia Manda Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435252903-1081-2-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9bf4bc0e3b8b..3505352dd296 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -191,8 +191,6 @@ struct task_group; #ifdef CONFIG_SCHED_DEBUG extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); extern void proc_sched_set_task(struct task_struct *p); -extern void -print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); #endif /* -- cgit v1.2.3 From a1bd3baeb2f18b2b3d0f98ce5fdaa725149b950b Mon Sep 17 00:00:00 2001 From: Allen Hubbe Date: Thu, 9 Apr 2015 10:33:20 -0400 Subject: NTB: Add NTB hardware abstraction layer Abstract the NTB device behind a programming interface, so that it can support different hardware and client drivers. Signed-off-by: Allen Hubbe Signed-off-by: Jon Mason --- include/linux/ntb.h | 984 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 984 insertions(+) create mode 100644 include/linux/ntb.h (limited to 'include/linux') diff --git a/include/linux/ntb.h b/include/linux/ntb.h new file mode 100644 index 000000000000..b02f72bb8e32 --- /dev/null +++ b/include/linux/ntb.h @@ -0,0 +1,984 @@ +/* + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright (C) 2015 EMC Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright (C) 2015 EMC Corporation. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copy + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * PCIe NTB Linux driver + * + * Contact Information: + * Allen Hubbe + */ + +#ifndef _NTB_H_ +#define _NTB_H_ + +#include +#include + +struct ntb_client; +struct ntb_dev; +struct pci_dev; + +/** + * enum ntb_topo - NTB connection topology + * @NTB_TOPO_NONE: Topology is unknown or invalid. + * @NTB_TOPO_PRI: On primary side of local ntb. + * @NTB_TOPO_SEC: On secondary side of remote ntb. + * @NTB_TOPO_B2B_USD: On primary side of local ntb upstream of remote ntb. + * @NTB_TOPO_B2B_DSD: On primary side of local ntb downstream of remote ntb. + */ +enum ntb_topo { + NTB_TOPO_NONE = -1, + NTB_TOPO_PRI, + NTB_TOPO_SEC, + NTB_TOPO_B2B_USD, + NTB_TOPO_B2B_DSD, +}; + +static inline int ntb_topo_is_b2b(enum ntb_topo topo) +{ + switch ((int)topo) { + case NTB_TOPO_B2B_USD: + case NTB_TOPO_B2B_DSD: + return 1; + } + return 0; +} + +static inline char *ntb_topo_string(enum ntb_topo topo) +{ + switch (topo) { + case NTB_TOPO_NONE: return "NTB_TOPO_NONE"; + case NTB_TOPO_PRI: return "NTB_TOPO_PRI"; + case NTB_TOPO_SEC: return "NTB_TOPO_SEC"; + case NTB_TOPO_B2B_USD: return "NTB_TOPO_B2B_USD"; + case NTB_TOPO_B2B_DSD: return "NTB_TOPO_B2B_DSD"; + } + return "NTB_TOPO_INVALID"; +} + +/** + * enum ntb_speed - NTB link training speed + * @NTB_SPEED_AUTO: Request the max supported speed. + * @NTB_SPEED_NONE: Link is not trained to any speed. + * @NTB_SPEED_GEN1: Link is trained to gen1 speed. + * @NTB_SPEED_GEN2: Link is trained to gen2 speed. + * @NTB_SPEED_GEN3: Link is trained to gen3 speed. + */ +enum ntb_speed { + NTB_SPEED_AUTO = -1, + NTB_SPEED_NONE = 0, + NTB_SPEED_GEN1 = 1, + NTB_SPEED_GEN2 = 2, + NTB_SPEED_GEN3 = 3, +}; + +/** + * enum ntb_width - NTB link training width + * @NTB_WIDTH_AUTO: Request the max supported width. + * @NTB_WIDTH_NONE: Link is not trained to any width. + * @NTB_WIDTH_1: Link is trained to 1 lane width. + * @NTB_WIDTH_2: Link is trained to 2 lane width. + * @NTB_WIDTH_4: Link is trained to 4 lane width. + * @NTB_WIDTH_8: Link is trained to 8 lane width. + * @NTB_WIDTH_12: Link is trained to 12 lane width. + * @NTB_WIDTH_16: Link is trained to 16 lane width. + * @NTB_WIDTH_32: Link is trained to 32 lane width. + */ +enum ntb_width { + NTB_WIDTH_AUTO = -1, + NTB_WIDTH_NONE = 0, + NTB_WIDTH_1 = 1, + NTB_WIDTH_2 = 2, + NTB_WIDTH_4 = 4, + NTB_WIDTH_8 = 8, + NTB_WIDTH_12 = 12, + NTB_WIDTH_16 = 16, + NTB_WIDTH_32 = 32, +}; + +/** + * struct ntb_client_ops - ntb client operations + * @probe: Notify client of a new device. + * @remove: Notify client to remove a device. + */ +struct ntb_client_ops { + int (*probe)(struct ntb_client *client, struct ntb_dev *ntb); + void (*remove)(struct ntb_client *client, struct ntb_dev *ntb); +}; + +static inline int ntb_client_ops_is_valid(const struct ntb_client_ops *ops) +{ + /* commented callbacks are not required: */ + return + ops->probe && + ops->remove && + 1; +} + +/** + * struct ntb_ctx_ops - ntb driver context operations + * @link_event: See ntb_link_event(). + * @db_event: See ntb_db_event(). + */ +struct ntb_ctx_ops { + void (*link_event)(void *ctx); + void (*db_event)(void *ctx, int db_vector); +}; + +static inline int ntb_ctx_ops_is_valid(const struct ntb_ctx_ops *ops) +{ + /* commented callbacks are not required: */ + return + /* ops->link_event && */ + /* ops->db_event && */ + 1; +} + +/** + * struct ntb_ctx_ops - ntb device operations + * @mw_count: See ntb_mw_count(). + * @mw_get_range: See ntb_mw_get_range(). + * @mw_set_trans: See ntb_mw_set_trans(). + * @mw_clear_trans: See ntb_mw_clear_trans(). + * @link_is_up: See ntb_link_is_up(). + * @link_enable: See ntb_link_enable(). + * @link_disable: See ntb_link_disable(). + * @db_is_unsafe: See ntb_db_is_unsafe(). + * @db_valid_mask: See ntb_db_valid_mask(). + * @db_vector_count: See ntb_db_vector_count(). + * @db_vector_mask: See ntb_db_vector_mask(). + * @db_read: See ntb_db_read(). + * @db_set: See ntb_db_set(). + * @db_clear: See ntb_db_clear(). + * @db_read_mask: See ntb_db_read_mask(). + * @db_set_mask: See ntb_db_set_mask(). + * @db_clear_mask: See ntb_db_clear_mask(). + * @peer_db_addr: See ntb_peer_db_addr(). + * @peer_db_read: See ntb_peer_db_read(). + * @peer_db_set: See ntb_peer_db_set(). + * @peer_db_clear: See ntb_peer_db_clear(). + * @peer_db_read_mask: See ntb_peer_db_read_mask(). + * @peer_db_set_mask: See ntb_peer_db_set_mask(). + * @peer_db_clear_mask: See ntb_peer_db_clear_mask(). + * @spad_is_unsafe: See ntb_spad_is_unsafe(). + * @spad_count: See ntb_spad_count(). + * @spad_read: See ntb_spad_read(). + * @spad_write: See ntb_spad_write(). + * @peer_spad_addr: See ntb_peer_spad_addr(). + * @peer_spad_read: See ntb_peer_spad_read(). + * @peer_spad_write: See ntb_peer_spad_write(). + */ +struct ntb_dev_ops { + int (*mw_count)(struct ntb_dev *ntb); + int (*mw_get_range)(struct ntb_dev *ntb, int idx, + phys_addr_t *base, resource_size_t *size, + resource_size_t *align, resource_size_t *align_size); + int (*mw_set_trans)(struct ntb_dev *ntb, int idx, + dma_addr_t addr, resource_size_t size); + int (*mw_clear_trans)(struct ntb_dev *ntb, int idx); + + int (*link_is_up)(struct ntb_dev *ntb, + enum ntb_speed *speed, enum ntb_width *width); + int (*link_enable)(struct ntb_dev *ntb, + enum ntb_speed max_speed, enum ntb_width max_width); + int (*link_disable)(struct ntb_dev *ntb); + + int (*db_is_unsafe)(struct ntb_dev *ntb); + u64 (*db_valid_mask)(struct ntb_dev *ntb); + int (*db_vector_count)(struct ntb_dev *ntb); + u64 (*db_vector_mask)(struct ntb_dev *ntb, int db_vector); + + u64 (*db_read)(struct ntb_dev *ntb); + int (*db_set)(struct ntb_dev *ntb, u64 db_bits); + int (*db_clear)(struct ntb_dev *ntb, u64 db_bits); + + u64 (*db_read_mask)(struct ntb_dev *ntb); + int (*db_set_mask)(struct ntb_dev *ntb, u64 db_bits); + int (*db_clear_mask)(struct ntb_dev *ntb, u64 db_bits); + + int (*peer_db_addr)(struct ntb_dev *ntb, + phys_addr_t *db_addr, resource_size_t *db_size); + u64 (*peer_db_read)(struct ntb_dev *ntb); + int (*peer_db_set)(struct ntb_dev *ntb, u64 db_bits); + int (*peer_db_clear)(struct ntb_dev *ntb, u64 db_bits); + + u64 (*peer_db_read_mask)(struct ntb_dev *ntb); + int (*peer_db_set_mask)(struct ntb_dev *ntb, u64 db_bits); + int (*peer_db_clear_mask)(struct ntb_dev *ntb, u64 db_bits); + + int (*spad_is_unsafe)(struct ntb_dev *ntb); + int (*spad_count)(struct ntb_dev *ntb); + + u32 (*spad_read)(struct ntb_dev *ntb, int idx); + int (*spad_write)(struct ntb_dev *ntb, int idx, u32 val); + + int (*peer_spad_addr)(struct ntb_dev *ntb, int idx, + phys_addr_t *spad_addr); + u32 (*peer_spad_read)(struct ntb_dev *ntb, int idx); + int (*peer_spad_write)(struct ntb_dev *ntb, int idx, u32 val); +}; + +static inline int ntb_dev_ops_is_valid(const struct ntb_dev_ops *ops) +{ + /* commented callbacks are not required: */ + return + ops->mw_count && + ops->mw_get_range && + ops->mw_set_trans && + /* ops->mw_clear_trans && */ + ops->link_is_up && + ops->link_enable && + ops->link_disable && + /* ops->db_is_unsafe && */ + ops->db_valid_mask && + + /* both set, or both unset */ + (!ops->db_vector_count == !ops->db_vector_mask) && + + ops->db_read && + /* ops->db_set && */ + ops->db_clear && + /* ops->db_read_mask && */ + ops->db_set_mask && + ops->db_clear_mask && + ops->peer_db_addr && + /* ops->peer_db_read && */ + ops->peer_db_set && + /* ops->peer_db_clear && */ + /* ops->peer_db_read_mask && */ + /* ops->peer_db_set_mask && */ + /* ops->peer_db_clear_mask && */ + /* ops->spad_is_unsafe && */ + ops->spad_count && + ops->spad_read && + ops->spad_write && + ops->peer_spad_addr && + /* ops->peer_spad_read && */ + ops->peer_spad_write && + 1; +} + +/** + * struct ntb_client - client interested in ntb devices + * @drv: Linux driver object. + * @ops: See &ntb_client_ops. + */ +struct ntb_client { + struct device_driver drv; + const struct ntb_client_ops ops; +}; + +#define drv_ntb_client(__drv) container_of((__drv), struct ntb_client, drv) + +/** + * struct ntb_device - ntb device + * @dev: Linux device object. + * @pdev: Pci device entry of the ntb. + * @topo: Detected topology of the ntb. + * @ops: See &ntb_dev_ops. + * @ctx: See &ntb_ctx_ops. + * @ctx_ops: See &ntb_ctx_ops. + */ +struct ntb_dev { + struct device dev; + struct pci_dev *pdev; + enum ntb_topo topo; + const struct ntb_dev_ops *ops; + void *ctx; + const struct ntb_ctx_ops *ctx_ops; + + /* private: */ + + /* synchronize setting, clearing, and calling ctx_ops */ + spinlock_t ctx_lock; + /* block unregister until device is fully released */ + struct completion released; +}; + +#define dev_ntb(__dev) container_of((__dev), struct ntb_dev, dev) + +/** + * ntb_register_client() - register a client for interest in ntb devices + * @client: Client context. + * + * The client will be added to the list of clients interested in ntb devices. + * The client will be notified of any ntb devices that are not already + * associated with a client, or if ntb devices are registered later. + * + * Return: Zero if the client is registered, otherwise an error number. + */ +#define ntb_register_client(client) \ + __ntb_register_client((client), THIS_MODULE, KBUILD_MODNAME) + +int __ntb_register_client(struct ntb_client *client, struct module *mod, + const char *mod_name); + +/** + * ntb_unregister_client() - unregister a client for interest in ntb devices + * @client: Client context. + * + * The client will be removed from the list of clients interested in ntb + * devices. If any ntb devices are associated with the client, the client will + * be notified to remove those devices. + */ +void ntb_unregister_client(struct ntb_client *client); + +#define module_ntb_client(__ntb_client) \ + module_driver(__ntb_client, ntb_register_client, \ + ntb_unregister_client) + +/** + * ntb_register_device() - register a ntb device + * @ntb: NTB device context. + * + * The device will be added to the list of ntb devices. If any clients are + * interested in ntb devices, each client will be notified of the ntb device, + * until at most one client accepts the device. + * + * Return: Zero if the device is registered, otherwise an error number. + */ +int ntb_register_device(struct ntb_dev *ntb); + +/** + * ntb_register_device() - unregister a ntb device + * @ntb: NTB device context. + * + * The device will be removed from the list of ntb devices. If the ntb device + * is associated with a client, the client will be notified to remove the + * device. + */ +void ntb_unregister_device(struct ntb_dev *ntb); + +/** + * ntb_set_ctx() - associate a driver context with an ntb device + * @ntb: NTB device context. + * @ctx: Driver context. + * @ctx_ops: Driver context operations. + * + * Associate a driver context and operations with a ntb device. The context is + * provided by the client driver, and the driver may associate a different + * context with each ntb device. + * + * Return: Zero if the context is associated, otherwise an error number. + */ +int ntb_set_ctx(struct ntb_dev *ntb, void *ctx, + const struct ntb_ctx_ops *ctx_ops); + +/** + * ntb_clear_ctx() - disassociate any driver context from an ntb device + * @ntb: NTB device context. + * + * Clear any association that may exist between a driver context and the ntb + * device. + */ +void ntb_clear_ctx(struct ntb_dev *ntb); + +/** + * ntb_link_event() - notify driver context of a change in link status + * @ntb: NTB device context. + * + * Notify the driver context that the link status may have changed. The driver + * should call ntb_link_is_up() to get the current status. + */ +void ntb_link_event(struct ntb_dev *ntb); + +/** + * ntb_db_event() - notify driver context of a doorbell event + * @ntb: NTB device context. + * @vector: Interrupt vector number. + * + * Notify the driver context of a doorbell event. If hardware supports + * multiple interrupt vectors for doorbells, the vector number indicates which + * vector received the interrupt. The vector number is relative to the first + * vector used for doorbells, starting at zero, and must be less than + ** ntb_db_vector_count(). The driver may call ntb_db_read() to check which + * doorbell bits need service, and ntb_db_vector_mask() to determine which of + * those bits are associated with the vector number. + */ +void ntb_db_event(struct ntb_dev *ntb, int vector); + +/** + * ntb_mw_count() - get the number of memory windows + * @ntb: NTB device context. + * + * Hardware and topology may support a different number of memory windows. + * + * Return: the number of memory windows. + */ +static inline int ntb_mw_count(struct ntb_dev *ntb) +{ + return ntb->ops->mw_count(ntb); +} + +/** + * ntb_mw_get_range() - get the range of a memory window + * @ntb: NTB device context. + * @idx: Memory window number. + * @base: OUT - the base address for mapping the memory window + * @size: OUT - the size for mapping the memory window + * @align: OUT - the base alignment for translating the memory window + * @align_size: OUT - the size alignment for translating the memory window + * + * Get the range of a memory window. NULL may be given for any output + * parameter if the value is not needed. The base and size may be used for + * mapping the memory window, to access the peer memory. The alignment and + * size may be used for translating the memory window, for the peer to access + * memory on the local system. + * + * Return: Zero on success, otherwise an error number. + */ +static inline int ntb_mw_get_range(struct ntb_dev *ntb, int idx, + phys_addr_t *base, resource_size_t *size, + resource_size_t *align, resource_size_t *align_size) +{ + return ntb->ops->mw_get_range(ntb, idx, base, size, + align, align_size); +} + +/** + * ntb_mw_set_trans() - set the translation of a memory window + * @ntb: NTB device context. + * @idx: Memory window number. + * @addr: The dma address local memory to expose to the peer. + * @size: The size of the local memory to expose to the peer. + * + * Set the translation of a memory window. The peer may access local memory + * through the window starting at the address, up to the size. The address + * must be aligned to the alignment specified by ntb_mw_get_range(). The size + * must be aligned to the size alignment specified by ntb_mw_get_range(). + * + * Return: Zero on success, otherwise an error number. + */ +static inline int ntb_mw_set_trans(struct ntb_dev *ntb, int idx, + dma_addr_t addr, resource_size_t size) +{ + return ntb->ops->mw_set_trans(ntb, idx, addr, size); +} + +/** + * ntb_mw_clear_trans() - clear the translation of a memory window + * @ntb: NTB device context. + * @idx: Memory window number. + * + * Clear the translation of a memory window. The peer may no longer access + * local memory through the window. + * + * Return: Zero on success, otherwise an error number. + */ +static inline int ntb_mw_clear_trans(struct ntb_dev *ntb, int idx) +{ + if (!ntb->ops->mw_clear_trans) + return ntb->ops->mw_set_trans(ntb, idx, 0, 0); + + return ntb->ops->mw_clear_trans(ntb, idx); +} + +/** + * ntb_link_is_up() - get the current ntb link state + * @ntb: NTB device context. + * @speed: OUT - The link speed expressed as PCIe generation number. + * @width: OUT - The link width expressed as the number of PCIe lanes. + * + * Set the translation of a memory window. The peer may access local memory + * through the window starting at the address, up to the size. The address + * must be aligned to the alignment specified by ntb_mw_get_range(). The size + * must be aligned to the size alignment specified by ntb_mw_get_range(). + * + * Return: One if the link is up, zero if the link is down, otherwise a + * negative value indicating the error number. + */ +static inline int ntb_link_is_up(struct ntb_dev *ntb, + enum ntb_speed *speed, enum ntb_width *width) +{ + return ntb->ops->link_is_up(ntb, speed, width); +} + +/** + * ntb_link_enable() - enable the link on the secondary side of the ntb + * @ntb: NTB device context. + * @max_speed: The maximum link speed expressed as PCIe generation number. + * @max_width: The maximum link width expressed as the number of PCIe lanes. + * + * Enable the link on the secondary side of the ntb. This can only be done + * from the primary side of the ntb in primary or b2b topology. The ntb device + * should train the link to its maximum speed and width, or the requested speed + * and width, whichever is smaller, if supported. + * + * Return: Zero on success, otherwise an error number. + */ +static inline int ntb_link_enable(struct ntb_dev *ntb, + enum ntb_speed max_speed, + enum ntb_width max_width) +{ + return ntb->ops->link_enable(ntb, max_speed, max_width); +} + +/** + * ntb_link_disable() - disable the link on the secondary side of the ntb + * @ntb: NTB device context. + * + * Disable the link on the secondary side of the ntb. This can only be + * done from the primary side of the ntb in primary or b2b topology. The ntb + * device should disable the link. Returning from this call must indicate that + * a barrier has passed, though with no more writes may pass in either + * direction across the link, except if this call returns an error number. + * + * Return: Zero on success, otherwise an error number. + */ +static inline int ntb_link_disable(struct ntb_dev *ntb) +{ + return ntb->ops->link_disable(ntb); +} + +/** + * ntb_db_is_unsafe() - check if it is safe to use hardware doorbell + * @ntb: NTB device context. + * + * It is possible for some ntb hardware to be affected by errata. Hardware + * drivers can advise clients to avoid using doorbells. Clients may ignore + * this advice, though caution is recommended. + * + * Return: Zero if it is safe to use doorbells, or One if it is not safe. + */ +static inline int ntb_db_is_unsafe(struct ntb_dev *ntb) +{ + if (!ntb->ops->db_is_unsafe) + return 0; + + return ntb->ops->db_is_unsafe(ntb); +} + +/** + * ntb_db_valid_mask() - get a mask of doorbell bits supported by the ntb + * @ntb: NTB device context. + * + * Hardware may support different number or arrangement of doorbell bits. + * + * Return: A mask of doorbell bits supported by the ntb. + */ +static inline u64 ntb_db_valid_mask(struct ntb_dev *ntb) +{ + return ntb->ops->db_valid_mask(ntb); +} + +/** + * ntb_db_vector_count() - get the number of doorbell interrupt vectors + * @ntb: NTB device context. + * + * Hardware may support different number of interrupt vectors. + * + * Return: The number of doorbell interrupt vectors. + */ +static inline int ntb_db_vector_count(struct ntb_dev *ntb) +{ + if (!ntb->ops->db_vector_count) + return 1; + + return ntb->ops->db_vector_count(ntb); +} + +/** + * ntb_db_vector_mask() - get a mask of doorbell bits serviced by a vector + * @ntb: NTB device context. + * @vector: Doorbell vector number. + * + * Each interrupt vector may have a different number or arrangement of bits. + * + * Return: A mask of doorbell bits serviced by a vector. + */ +static inline u64 ntb_db_vector_mask(struct ntb_dev *ntb, int vector) +{ + if (!ntb->ops->db_vector_mask) + return ntb_db_valid_mask(ntb); + + return ntb->ops->db_vector_mask(ntb, vector); +} + +/** + * ntb_db_read() - read the local doorbell register + * @ntb: NTB device context. + * + * Read the local doorbell register, and return the bits that are set. + * + * Return: The bits currently set in the local doorbell register. + */ +static inline u64 ntb_db_read(struct ntb_dev *ntb) +{ + return ntb->ops->db_read(ntb); +} + +/** + * ntb_db_set() - set bits in the local doorbell register + * @ntb: NTB device context. + * @db_bits: Doorbell bits to set. + * + * Set bits in the local doorbell register, which may generate a local doorbell + * interrupt. Bits that were already set must remain set. + * + * This is unusual, and hardware may not support it. + * + * Return: Zero on success, otherwise an error number. + */ +static inline int ntb_db_set(struct ntb_dev *ntb, u64 db_bits) +{ + if (!ntb->ops->db_set) + return -EINVAL; + + return ntb->ops->db_set(ntb, db_bits); +} + +/** + * ntb_db_clear() - clear bits in the local doorbell register + * @ntb: NTB device context. + * @db_bits: Doorbell bits to clear. + * + * Clear bits in the local doorbell register, arming the bits for the next + * doorbell. + * + * Return: Zero on success, otherwise an error number. + */ +static inline int ntb_db_clear(struct ntb_dev *ntb, u64 db_bits) +{ + return ntb->ops->db_clear(ntb, db_bits); +} + +/** + * ntb_db_read_mask() - read the local doorbell mask + * @ntb: NTB device context. + * + * Read the local doorbell mask register, and return the bits that are set. + * + * This is unusual, though hardware is likely to support it. + * + * Return: The bits currently set in the local doorbell mask register. + */ +static inline u64 ntb_db_read_mask(struct ntb_dev *ntb) +{ + if (!ntb->ops->db_read_mask) + return 0; + + return ntb->ops->db_read_mask(ntb); +} + +/** + * ntb_db_set_mask() - set bits in the local doorbell mask + * @ntb: NTB device context. + * @db_bits: Doorbell mask bits to set. + * + * Set bits in the local doorbell mask register, preventing doorbell interrupts + * from being generated for those doorbell bits. Bits that were already set + * must remain set. + * + * Return: Zero on success, otherwise an error number. + */ +static inline int ntb_db_set_mask(struct ntb_dev *ntb, u64 db_bits) +{ + return ntb->ops->db_set_mask(ntb, db_bits); +} + +/** + * ntb_db_clear_mask() - clear bits in the local doorbell mask + * @ntb: NTB device context. + * @db_bits: Doorbell bits to clear. + * + * Clear bits in the local doorbell mask register, allowing doorbell interrupts + * from being generated for those doorbell bits. If a doorbell bit is already + * set at the time the mask is cleared, and the corresponding mask bit is + * changed from set to clear, then the ntb driver must ensure that + * ntb_db_event() is called. If the hardware does not generate the interrupt + * on clearing the mask bit, then the driver must call ntb_db_event() anyway. + * + * Return: Zero on success, otherwise an error number. + */ +static inline int ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits) +{ + return ntb->ops->db_clear_mask(ntb, db_bits); +} + +/** + * ntb_peer_db_addr() - address and size of the peer doorbell register + * @ntb: NTB device context. + * @db_addr: OUT - The address of the peer doorbell register. + * @db_size: OUT - The number of bytes to write the peer doorbell register. + * + * Return the address of the peer doorbell register. This may be used, for + * example, by drivers that offload memory copy operations to a dma engine. + * The drivers may wish to ring the peer doorbell at the completion of memory + * copy operations. For efficiency, and to simplify ordering of operations + * between the dma memory copies and the ringing doorbell, the driver may + * append one additional dma memory copy with the doorbell register as the + * destination, after the memory copy operations. + * + * Return: Zero on success, otherwise an error number. + */ +static inline int ntb_peer_db_addr(struct ntb_dev *ntb, + phys_addr_t *db_addr, + resource_size_t *db_size) +{ + return ntb->ops->peer_db_addr(ntb, db_addr, db_size); +} + +/** + * ntb_peer_db_read() - read the peer doorbell register + * @ntb: NTB device context. + * + * Read the peer doorbell register, and return the bits that are set. + * + * This is unusual, and hardware may not support it. + * + * Return: The bits currently set in the peer doorbell register. + */ +static inline u64 ntb_peer_db_read(struct ntb_dev *ntb) +{ + if (!ntb->ops->peer_db_read) + return 0; + + return ntb->ops->peer_db_read(ntb); +} + +/** + * ntb_peer_db_set() - set bits in the peer doorbell register + * @ntb: NTB device context. + * @db_bits: Doorbell bits to set. + * + * Set bits in the peer doorbell register, which may generate a peer doorbell + * interrupt. Bits that were already set must remain set. + * + * Return: Zero on success, otherwise an error number. + */ +static inline int ntb_peer_db_set(struct ntb_dev *ntb, u64 db_bits) +{ + return ntb->ops->peer_db_set(ntb, db_bits); +} + +/** + * ntb_peer_db_clear() - clear bits in the local doorbell register + * @ntb: NTB device context. + * @db_bits: Doorbell bits to clear. + * + * Clear bits in the peer doorbell register, arming the bits for the next + * doorbell. + * + * This is unusual, and hardware may not support it. + * + * Return: Zero on success, otherwise an error number. + */ +static inline int ntb_peer_db_clear(struct ntb_dev *ntb, u64 db_bits) +{ + if (!ntb->ops->db_clear) + return -EINVAL; + + return ntb->ops->peer_db_clear(ntb, db_bits); +} + +/** + * ntb_peer_db_read_mask() - read the peer doorbell mask + * @ntb: NTB device context. + * + * Read the peer doorbell mask register, and return the bits that are set. + * + * This is unusual, and hardware may not support it. + * + * Return: The bits currently set in the peer doorbell mask register. + */ +static inline u64 ntb_peer_db_read_mask(struct ntb_dev *ntb) +{ + if (!ntb->ops->db_read_mask) + return 0; + + return ntb->ops->peer_db_read_mask(ntb); +} + +/** + * ntb_peer_db_set_mask() - set bits in the peer doorbell mask + * @ntb: NTB device context. + * @db_bits: Doorbell mask bits to set. + * + * Set bits in the peer doorbell mask register, preventing doorbell interrupts + * from being generated for those doorbell bits. Bits that were already set + * must remain set. + * + * This is unusual, and hardware may not support it. + * + * Return: Zero on success, otherwise an error number. + */ +static inline int ntb_peer_db_set_mask(struct ntb_dev *ntb, u64 db_bits) +{ + if (!ntb->ops->db_set_mask) + return -EINVAL; + + return ntb->ops->peer_db_set_mask(ntb, db_bits); +} + +/** + * ntb_peer_db_clear_mask() - clear bits in the peer doorbell mask + * @ntb: NTB device context. + * @db_bits: Doorbell bits to clear. + * + * Clear bits in the peer doorbell mask register, allowing doorbell interrupts + * from being generated for those doorbell bits. If the hardware does not + * generate the interrupt on clearing the mask bit, then the driver should not + * implement this function! + * + * This is unusual, and hardware may not support it. + * + * Return: Zero on success, otherwise an error number. + */ +static inline int ntb_peer_db_clear_mask(struct ntb_dev *ntb, u64 db_bits) +{ + if (!ntb->ops->db_clear_mask) + return -EINVAL; + + return ntb->ops->peer_db_clear_mask(ntb, db_bits); +} + +/** + * ntb_spad_is_unsafe() - check if it is safe to use the hardware scratchpads + * @ntb: NTB device context. + * + * It is possible for some ntb hardware to be affected by errata. Hardware + * drivers can advise clients to avoid using scratchpads. Clients may ignore + * this advice, though caution is recommended. + * + * Return: Zero if it is safe to use scratchpads, or One if it is not safe. + */ +static inline int ntb_spad_is_unsafe(struct ntb_dev *ntb) +{ + if (!ntb->ops->spad_is_unsafe) + return 0; + + return ntb->ops->spad_is_unsafe(ntb); +} + +/** + * ntb_mw_count() - get the number of scratchpads + * @ntb: NTB device context. + * + * Hardware and topology may support a different number of scratchpads. + * + * Return: the number of scratchpads. + */ +static inline int ntb_spad_count(struct ntb_dev *ntb) +{ + return ntb->ops->spad_count(ntb); +} + +/** + * ntb_spad_read() - read the local scratchpad register + * @ntb: NTB device context. + * @idx: Scratchpad index. + * + * Read the local scratchpad register, and return the value. + * + * Return: The value of the local scratchpad register. + */ +static inline u32 ntb_spad_read(struct ntb_dev *ntb, int idx) +{ + return ntb->ops->spad_read(ntb, idx); +} + +/** + * ntb_spad_write() - write the local scratchpad register + * @ntb: NTB device context. + * @idx: Scratchpad index. + * @val: Scratchpad value. + * + * Write the value to the local scratchpad register. + * + * Return: Zero on success, otherwise an error number. + */ +static inline int ntb_spad_write(struct ntb_dev *ntb, int idx, u32 val) +{ + return ntb->ops->spad_write(ntb, idx, val); +} + +/** + * ntb_peer_spad_addr() - address of the peer scratchpad register + * @ntb: NTB device context. + * @idx: Scratchpad index. + * @spad_addr: OUT - The address of the peer scratchpad register. + * + * Return the address of the peer doorbell register. This may be used, for + * example, by drivers that offload memory copy operations to a dma engine. + * + * Return: Zero on success, otherwise an error number. + */ +static inline int ntb_peer_spad_addr(struct ntb_dev *ntb, int idx, + phys_addr_t *spad_addr) +{ + return ntb->ops->peer_spad_addr(ntb, idx, spad_addr); +} + +/** + * ntb_peer_spad_read() - read the peer scratchpad register + * @ntb: NTB device context. + * @idx: Scratchpad index. + * + * Read the peer scratchpad register, and return the value. + * + * Return: The value of the local scratchpad register. + */ +static inline u32 ntb_peer_spad_read(struct ntb_dev *ntb, int idx) +{ + return ntb->ops->peer_spad_read(ntb, idx); +} + +/** + * ntb_peer_spad_write() - write the peer scratchpad register + * @ntb: NTB device context. + * @idx: Scratchpad index. + * @val: Scratchpad value. + * + * Write the value to the peer scratchpad register. + * + * Return: Zero on success, otherwise an error number. + */ +static inline int ntb_peer_spad_write(struct ntb_dev *ntb, int idx, u32 val) +{ + return ntb->ops->peer_spad_write(ntb, idx, val); +} + +#endif -- cgit v1.2.3 From e26a5843f7f5014ae4460030ca4de029a3ac35d3 Mon Sep 17 00:00:00 2001 From: Allen Hubbe Date: Thu, 9 Apr 2015 10:33:20 -0400 Subject: NTB: Split ntb_hw_intel and ntb_transport drivers Change ntb_hw_intel to use the new NTB hardware abstraction layer. Split ntb_transport into its own driver. Change it to use the new NTB hardware abstraction layer. Signed-off-by: Allen Hubbe Signed-off-by: Jon Mason --- include/linux/ntb_transport.h | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ntb_transport.h b/include/linux/ntb_transport.h index f78b64cc09d5..2862861366a5 100644 --- a/include/linux/ntb_transport.h +++ b/include/linux/ntb_transport.h @@ -5,6 +5,7 @@ * GPL LICENSE SUMMARY * * Copyright(c) 2012 Intel Corporation. All rights reserved. + * Copyright (C) 2015 EMC Corporation. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -13,6 +14,7 @@ * BSD LICENSE * * Copyright(c) 2012 Intel Corporation. All rights reserved. + * Copyright (C) 2015 EMC Corporation. All Rights Reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -40,7 +42,7 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * Intel PCIe NTB Linux driver + * PCIe NTB Transport Linux driver * * Contact Information: * Jon Mason @@ -48,21 +50,16 @@ struct ntb_transport_qp; -struct ntb_client { +struct ntb_transport_client { struct device_driver driver; - int (*probe)(struct pci_dev *pdev); - void (*remove)(struct pci_dev *pdev); + int (*probe)(struct device *client_dev); + void (*remove)(struct device *client_dev); }; -enum { - NTB_LINK_DOWN = 0, - NTB_LINK_UP, -}; - -int ntb_transport_register_client(struct ntb_client *drvr); -void ntb_transport_unregister_client(struct ntb_client *drvr); -int ntb_register_client_dev(char *device_name); -void ntb_unregister_client_dev(char *device_name); +int ntb_transport_register_client(struct ntb_transport_client *drvr); +void ntb_transport_unregister_client(struct ntb_transport_client *drvr); +int ntb_transport_register_client_dev(char *device_name); +void ntb_transport_unregister_client_dev(char *device_name); struct ntb_queue_handlers { void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data, @@ -75,7 +72,7 @@ struct ntb_queue_handlers { unsigned char ntb_transport_qp_num(struct ntb_transport_qp *qp); unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp); struct ntb_transport_qp * -ntb_transport_create_queue(void *data, struct pci_dev *pdev, +ntb_transport_create_queue(void *data, struct device *client_dev, const struct ntb_queue_handlers *handlers); void ntb_transport_free_queue(struct ntb_transport_qp *qp); int ntb_transport_rx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data, -- cgit v1.2.3 From 0294112ee3135fbd15eaa70015af8283642dd970 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 4 Jul 2015 03:09:03 +0200 Subject: ACPI / PNP: Reserve ACPI resources at the fs_initcall_sync stage This effectively reverts the following three commits: 7bc10388ccdd ACPI / resources: free memory on error in add_region_before() 0f1b414d1907 ACPI / PNP: Avoid conflicting resource reservations b9a5e5e18fbf ACPI / init: Fix the ordering of acpi_reserve_resources() (commit b9a5e5e18fbf introduced regressions some of which, but not all, were addressed by commit 0f1b414d1907 and commit 7bc10388ccdd was a fixup on top of the latter) and causes ACPI fixed hardware resources to be reserved at the fs_initcall_sync stage of system initialization. The story is as follows. First, a boot regression was reported due to an apparent resource reservation ordering change after a commit that shouldn't lead to such changes. Investigation led to the conclusion that the problem happened because acpi_reserve_resources() was executed at the device_initcall() stage of system initialization which wasn't strictly ordered with respect to driver initialization (and with respect to the initialization of the pcieport driver in particular), so a random change causing the device initcalls to be run in a different order might break things. The response to that was to attempt to run acpi_reserve_resources() as soon as we knew that ACPI would be in use (commit b9a5e5e18fbf). However, that turned out to be too early, because it caused resource reservations made by the PNP system driver to fail on at least one system and that failure was addressed by commit 0f1b414d1907. That fix still turned out to be insufficient, though, because calling acpi_reserve_resources() before the fs_initcall stage of system initialization caused a boot regression to happen on the eCAFE EC-800-H20G/S netbook. That meant that we only could call acpi_reserve_resources() at the fs_initcall initialization stage or later, but then we might just as well call it after the PNP initalization in which case commit 0f1b414d1907 wouldn't be necessary any more. For this reason, the changes made by commit 0f1b414d1907 are reverted (along with a memory leak fixup on top of that commit), the changes made by commit b9a5e5e18fbf that went too far are reverted too and acpi_reserve_resources() is changed into fs_initcall_sync, which will cause it to be executed after the PNP subsystem initialization (which is an fs_initcall) and before device initcalls (including the pcieport driver initialization) which should avoid the initial issue. Link: https://bugzilla.kernel.org/show_bug.cgi?id=100581 Link: http://marc.info/?t=143092384600002&r=1&w=2 Link: https://bugzilla.kernel.org/show_bug.cgi?id=99831 Link: http://marc.info/?t=143389402600001&r=1&w=2 Fixes: b9a5e5e18fbf "ACPI / init: Fix the ordering of acpi_reserve_resources()" Reported-by: Roland Dreier Cc: All applicable Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index c471dfc93b71..fedfd1bea3bf 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -309,9 +309,6 @@ int acpi_check_region(resource_size_t start, resource_size_t n, int acpi_resources_are_enforced(void); -int acpi_reserve_region(u64 start, unsigned int length, u8 space_id, - unsigned long flags, char *desc); - #ifdef CONFIG_HIBERNATION void __init acpi_no_s4_hw_signature(void); #endif @@ -507,13 +504,6 @@ static inline int acpi_check_region(resource_size_t start, resource_size_t n, return 0; } -static inline int acpi_reserve_region(u64 start, unsigned int length, - u8 space_id, unsigned long flags, - char *desc) -{ - return -ENXIO; -} - struct acpi_table_header; static inline int acpi_table_parse(char *id, int (*handler)(struct acpi_table_header *)) -- cgit v1.2.3 From 26095a01d359827eeccec5459c28ddd976183179 Mon Sep 17 00:00:00 2001 From: "Suthikulpanit, Suravee" Date: Tue, 7 Jul 2015 01:55:20 +0200 Subject: ACPI / scan: Add support for ACPI _CLS device matching Device drivers typically use ACPI _HIDs/_CIDs listed in struct device_driver acpi_match_table to match devices. However, for generic drivers, we do not want to list _HID for all supported devices. Also, certain classes of devices do not have _CID (e.g. SATA, USB). Instead, we can leverage ACPI _CLS, which specifies PCI-defined class code (i.e. base-class, subclass and programming interface). This patch adds support for matching ACPI devices using the _CLS method. To support loadable module, current design uses _HID or _CID to match device's modalias. With the new way of matching with _CLS this would requires modification to the current ACPI modalias key to include _CLS. This patch appends PCI-defined class-code to the existing ACPI modalias as following. acpi::::..::: E.g: # cat /sys/devices/platform/AMDI0600:00/modalias acpi:AMDI0600:010601: where bb is th base-class code, ss is te sub-class code, and pp is the programming interface code Since there would not be _HID/_CID in the ACPI matching table of the driver, this patch adds a field to acpi_device_id to specify the matching _CLS. static const struct acpi_device_id ahci_acpi_match[] = { { ACPI_DEVICE_CLASS(PCI_CLASS_STORAGE_SATA_AHCI, 0xffffff) }, {}, }; In this case, the corresponded entry in modules.alias file would be: alias acpi*:010601:* ahci_platform Acked-by: Mika Westerberg Reviewed-by: Hanjun Guo Signed-off-by: Suravee Suthikulpanit Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 14 ++++++++++++++ include/linux/mod_devicetable.h | 2 ++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index c471dfc93b71..bdfdb9f65c23 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -58,6 +58,19 @@ static inline acpi_handle acpi_device_handle(struct acpi_device *adev) acpi_fwnode_handle(adev) : NULL) #define ACPI_HANDLE(dev) acpi_device_handle(ACPI_COMPANION(dev)) +/** + * ACPI_DEVICE_CLASS - macro used to describe an ACPI device with + * the PCI-defined class-code information + * + * @_cls : the class, subclass, prog-if triple for this device + * @_msk : the class mask for this device + * + * This macro is used to create a struct acpi_device_id that matches a + * specific PCI class. The .id and .driver_data fields will be left + * initialized with the default value. + */ +#define ACPI_DEVICE_CLASS(_cls, _msk) .cls = (_cls), .cls_msk = (_msk), + static inline bool has_acpi_companion(struct device *dev) { return is_acpi_node(dev->fwnode); @@ -446,6 +459,7 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *); #define ACPI_COMPANION(dev) (NULL) #define ACPI_COMPANION_SET(dev, adev) do { } while (0) #define ACPI_HANDLE(dev) (NULL) +#define ACPI_DEVICE_CLASS(_cls, _msk) .cls = (0), .cls_msk = (0), struct fwnode_handle; diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 8183d6640ca7..34f25b7bf642 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -189,6 +189,8 @@ struct css_device_id { struct acpi_device_id { __u8 id[ACPI_ID_LEN]; kernel_ulong_t driver_data; + __u32 cls; + __u32 cls_msk; }; #define PNP_ID_LEN 8 -- cgit v1.2.3 From f32dd117051185da6e923b35491a44d7debeeea5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jul 2015 16:29:38 +0200 Subject: tick/broadcast: Make idle check independent from mode and config Currently the broadcast busy check, which prevents the idle code from going into deep idle, works only in one shot mode. If NOHZ and HIGHRES are off (config or command line) there is no sanity check at all, so under certain conditions cpus are allowed to go into deep idle, where the local timer stops, and are not woken up again because there is no broadcast timer installed or a hrtimer based broadcast device is not evaluated. Move tick_broadcast_oneshot_control() into the common code and provide proper subfunctions for the various config combinations. The common check in tick_broadcast_oneshot_control() is for the C3STOP misfeature flag of the local clock event device. If its not set, idle can proceed. If set, further checks are necessary. Provide checks for the trivial cases: - If broadcast is disabled in the config, then return busy - If oneshot mode (NOHZ/HIGHES) is disabled in the config, return busy if the broadcast device is hrtimer based. - If oneshot mode is enabled in the config call the original tick_broadcast_oneshot_control() function. That function needs extra checks which will be implemented in seperate patches. [ Split out from a larger combo patch ] Reported-and-tested-by: Sudeep Holla Signed-off-by: Thomas Gleixner Cc: Suzuki Poulose Cc: Lorenzo Pieralisi Cc: Catalin Marinas Cc: Rafael J. Wysocki Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Ingo Molnar Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1507070929360.3916@nanos --- include/linux/tick.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index 3741ba1a652c..6916dcb61857 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -67,11 +67,7 @@ extern void tick_broadcast_control(enum tick_broadcast_mode mode); static inline void tick_broadcast_control(enum tick_broadcast_mode mode) { } #endif /* BROADCAST */ -#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) extern int tick_broadcast_oneshot_control(enum tick_broadcast_state state); -#else -static inline int tick_broadcast_oneshot_control(enum tick_broadcast_state state) { return 0; } -#endif static inline void tick_broadcast_enable(void) { -- cgit v1.2.3 From 37b64a42067a04a22468c4e52c12af00d72e462b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jul 2015 21:56:34 +0200 Subject: tick/broadcast: Unbreak CONFIG_GENERIC_CLOCKEVENTS=n build Making tick_broadcast_oneshot_control() independent from CONFIG_GENERIC_CLOCKEVENTS_BROADCAST broke the build for CONFIG_GENERIC_CLOCKEVENTS=n because the function is not defined there. Provide a proper stub inline. Fixes: f32dd1170511 'tick/broadcast: Make idle check independent from mode and config' Reported-by: kbuild test robot Signed-off-by: Thomas Gleixner --- include/linux/tick.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index 6916dcb61857..edbfc9a5293e 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -67,7 +67,14 @@ extern void tick_broadcast_control(enum tick_broadcast_mode mode); static inline void tick_broadcast_control(enum tick_broadcast_mode mode) { } #endif /* BROADCAST */ +#ifdef CONFIG_GENERIC_CLOCKEVENTS extern int tick_broadcast_oneshot_control(enum tick_broadcast_state state); +#else +static inline int tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + return 0; +} +#endif static inline void tick_broadcast_enable(void) { -- cgit v1.2.3 From a899418167264c7bac574b1a0f1b2c26c5b0995a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 5 Jul 2015 17:12:30 +0000 Subject: hotplug: Prevent alloc/free of irq descriptors during cpu up/down When a cpu goes up some architectures (e.g. x86) have to walk the irq space to set up the vector space for the cpu. While this needs extra protection at the architecture level we can avoid a few race conditions by preventing the concurrent allocation/free of irq descriptors and the associated data. When a cpu goes down it moves the interrupts which are targeted to this cpu away by reassigning the affinities. While this happens interrupts can be allocated and freed, which opens a can of race conditions in the code which reassignes the affinities because interrupt descriptors might be freed underneath. Example: CPU1 CPU2 cpu_up/down irq_desc = irq_to_desc(irq); remove_from_radix_tree(desc); raw_spin_lock(&desc->lock); free(desc); We could protect the irq descriptors with RCU, but that would require a full tree change of all accesses to interrupt descriptors. But fortunately these kind of race conditions are rather limited to a few things like cpu hotplug. The normal setup/teardown is very well serialized. So the simpler and obvious solution is: Prevent allocation and freeing of interrupt descriptors accross cpu hotplug. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: xiao jin Cc: Joerg Roedel Cc: Borislav Petkov Cc: Yanmin Zhang Link: http://lkml.kernel.org/r/20150705171102.063519515@linutronix.de --- include/linux/irqdesc.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 624a668e61f1..fcea4e48e21f 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -87,7 +87,12 @@ struct irq_desc { const char *name; } ____cacheline_internodealigned_in_smp; -#ifndef CONFIG_SPARSE_IRQ +#ifdef CONFIG_SPARSE_IRQ +extern void irq_lock_sparse(void); +extern void irq_unlock_sparse(void); +#else +static inline void irq_lock_sparse(void) { } +static inline void irq_unlock_sparse(void) { } extern struct irq_desc irq_desc[NR_IRQS]; #endif -- cgit v1.2.3 From 1f6823faa8c563431a94e614d2b63ce16bb6f658 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 9 Jul 2015 10:51:46 +0200 Subject: time: Get rid of do_posix_clock_monotonic_gettime All users gone. Remove it before we get another one. Signed-off-by: Thomas Gleixner --- include/linux/timekeeping.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 3aa72e648650..6e191e4e6ab6 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -145,7 +145,6 @@ static inline void getboottime(struct timespec *ts) } #endif -#define do_posix_clock_monotonic_gettime(ts) ktime_get_ts(ts) #define ktime_get_real_ts64(ts) getnstimeofday64(ts) /* -- cgit v1.2.3 From 757856d2b9568a701df9ea6a4be68effbb9d6f44 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 25 Jun 2015 17:47:45 +0300 Subject: libceph: enable ceph in a non-default network namespace Grab a reference on a network namespace of the 'rbd map' (in case of rbd) or 'mount' (in case of ceph) process and use that to open sockets instead of always using init_net and bailing if network namespace is anything but init_net. Be careful to not share struct ceph_client instances between different namespaces and don't add any code in the !CONFIG_NET_NS case. This is based on a patch from Hong Zhiguo . Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/messenger.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index e15499422fdc..37753278987a 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -56,6 +57,7 @@ struct ceph_messenger { struct ceph_entity_addr my_enc_addr; atomic_t stopping; + possible_net_t net; bool nocrc; bool tcp_nodelay; @@ -267,6 +269,7 @@ extern void ceph_messenger_init(struct ceph_messenger *msgr, u64 required_features, bool nocrc, bool tcp_nodelay); +extern void ceph_messenger_fini(struct ceph_messenger *msgr); extern void ceph_con_init(struct ceph_connection *con, void *private, const struct ceph_connection_operations *ops, -- cgit v1.2.3 From 4a0e3e989d66bb7204b163d9cfaa7fa96d0f2023 Mon Sep 17 00:00:00 2001 From: Enrico Mioso Date: Wed, 8 Jul 2015 13:05:57 +0200 Subject: cdc_ncm: Add support for moving NDP to end of NCM frame NCM specs are not actually mandating a specific position in the frame for the NDP (Network Datagram Pointer). However, some Huawei devices will ignore our aggregates if it is not placed after the datagrams it points to. Add support for doing just this, in a per-device configurable way. While at it, update NCM subdrivers, disabling this functionality in all of them, except in huawei_cdc_ncm where it is enabled instead. We aren't making any distinction between different Huawei NCM devices, based on what the vendor driver does. Standard NCM devices are left unaffected: if they are compliant, they should be always usable, still stay on the safe side. This change has been tested and working with a Huawei E3131 device (which works regardless of NDP position), a Huawei E3531 (also working both ways) and an E3372 (which mandates NDP to be after indexed datagrams). V1->V2: - corrected wrong NDP acronym definition - fixed possible NULL pointer dereference - patch cleanup V2->V3: - Properly account for the NDP size when writing new packets to SKB Signed-off-by: Enrico Mioso Signed-off-by: David S. Miller --- include/linux/usb/cdc_ncm.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h index 7c9b484735c5..1f6526c76ee8 100644 --- a/include/linux/usb/cdc_ncm.h +++ b/include/linux/usb/cdc_ncm.h @@ -80,6 +80,9 @@ #define CDC_NCM_TIMER_INTERVAL_MIN 5UL #define CDC_NCM_TIMER_INTERVAL_MAX (U32_MAX / NSEC_PER_USEC) +/* Driver flags */ +#define CDC_NCM_FLAG_NDP_TO_END 0x02 /* NDP is placed at end of frame */ + #define cdc_ncm_comm_intf_is_mbim(x) ((x)->desc.bInterfaceSubClass == USB_CDC_SUBCLASS_MBIM && \ (x)->desc.bInterfaceProtocol == USB_CDC_PROTO_NONE) #define cdc_ncm_data_intf_is_mbim(x) ((x)->desc.bInterfaceProtocol == USB_CDC_MBIM_PROTO_NTB) @@ -103,9 +106,11 @@ struct cdc_ncm_ctx { spinlock_t mtx; atomic_t stop; + int drvflags; u32 timer_interval; u32 max_ndp_size; + struct usb_cdc_ncm_ndp16 *delayed_ndp16; u32 tx_timer_pending; u32 tx_curr_frame_num; @@ -133,7 +138,7 @@ struct cdc_ncm_ctx { }; u8 cdc_ncm_select_altsetting(struct usb_interface *intf); -int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting); +int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting, int drvflags); void cdc_ncm_unbind(struct usbnet *dev, struct usb_interface *intf); struct sk_buff *cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign); int cdc_ncm_rx_verify_nth16(struct cdc_ncm_ctx *ctx, struct sk_buff *skb_in); -- cgit v1.2.3 From d3b58c47d330de8c29898fe9746f7530408f8a59 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 26 Jun 2015 11:58:19 +0200 Subject: can: replace timestamp as unique skb attribute Commit 514ac99c64b "can: fix multiple delivery of a single CAN frame for overlapping CAN filters" requires the skb->tstamp to be set to check for identical CAN skbs. Without timestamping to be required by user space applications this timestamp was not generated which lead to commit 36c01245eb8 "can: fix loss of CAN frames in raw_rcv" - which forces the timestamp to be set in all CAN related skbuffs by introducing several __net_timestamp() calls. This forces e.g. out of tree drivers which are not using alloc_can{,fd}_skb() to add __net_timestamp() after skbuff creation to prevent the frame loss fixed in mainline Linux. This patch removes the timestamp dependency and uses an atomic counter to create an unique identifier together with the skbuff pointer. Btw: the new skbcnt element introduced in struct can_skb_priv has to be initialized with zero in out-of-tree drivers which are not using alloc_can{,fd}_skb() too. Signed-off-by: Oliver Hartkopp Cc: linux-stable Signed-off-by: Marc Kleine-Budde --- include/linux/can/skb.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index b6a52a4b457a..51bb6532785c 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -27,10 +27,12 @@ /** * struct can_skb_priv - private additional data inside CAN sk_buffs * @ifindex: ifindex of the first interface the CAN frame appeared on + * @skbcnt: atomic counter to have an unique id together with skb pointer * @cf: align to the following CAN frame at skb->data */ struct can_skb_priv { int ifindex; + int skbcnt; struct can_frame cf[0]; }; -- cgit v1.2.3 From 2377799c084d86d22074cd4acd20edc32024d669 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 13 Jul 2015 12:17:25 +0200 Subject: average: provide macro to create static EWMA Having the EWMA parameters stored in the runtime struct imposes memory requirements for the constant values that could just be inlined in the code. This particularly makes sense if there are a lot of such structs, for example in mac80211 in the station table where each station has a number of these in an array, and there can be many stations. Provide a macro DECLARE_EWMA() that declares the necessary struct and inline functions to access it with the parameters hard-coded; using this also means the user no longer needs to 'select AVERAGE' as it's entirely self-contained. In the mac80211 case, on x86-64, this actually slightly *reduces* code size, while also saving 80 bytes of runtime memory per sta. Signed-off-by: Johannes Berg --- include/linux/average.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/average.h b/include/linux/average.h index c6028fd742c1..60f8226c5652 100644 --- a/include/linux/average.h +++ b/include/linux/average.h @@ -27,4 +27,43 @@ static inline unsigned long ewma_read(const struct ewma *avg) return avg->internal >> avg->factor; } +#define DECLARE_EWMA(name, _factor, _weight) \ + struct ewma_##name { \ + unsigned long internal; \ + }; \ + static inline void ewma_##name##_init(struct ewma_##name *e) \ + { \ + BUILD_BUG_ON(!__builtin_constant_p(_factor)); \ + BUILD_BUG_ON(!__builtin_constant_p(_weight)); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_factor); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_weight); \ + e->internal = 0; \ + } \ + static inline unsigned long \ + ewma_##name##_read(struct ewma_##name *e) \ + { \ + BUILD_BUG_ON(!__builtin_constant_p(_factor)); \ + BUILD_BUG_ON(!__builtin_constant_p(_weight)); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_factor); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_weight); \ + return e->internal >> ilog2(_factor); \ + } \ + static inline void ewma_##name##_add(struct ewma_##name *e, \ + unsigned long val) \ + { \ + unsigned long internal = ACCESS_ONCE(e->internal); \ + unsigned long weight = ilog2(_weight); \ + unsigned long factor = ilog2(_factor); \ + \ + BUILD_BUG_ON(!__builtin_constant_p(_factor)); \ + BUILD_BUG_ON(!__builtin_constant_p(_weight)); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_factor); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_weight); \ + \ + ACCESS_ONCE(e->internal) = internal ? \ + (((internal << weight) - internal) + \ + (val << factor)) >> weight : \ + (val << factor); \ + } + #endif /* _LINUX_AVERAGE_H */ -- cgit v1.2.3 From 8f9c98df949333f08b74e5df1caacf7e2c5e8552 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Sun, 19 Jul 2015 16:09:12 +0300 Subject: mac80211: fix BIT position for TDLS WIDE extended cap The bit was not according to ieee80211 specification. Fix that. Reviewed-by: Arik Nemtsov Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index b9c7897dc566..cfa906f28b7a 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2074,8 +2074,8 @@ enum ieee80211_tdls_actioncode { #define WLAN_EXT_CAPA5_TDLS_PROHIBITED BIT(6) #define WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED BIT(7) +#define WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED BIT(5) #define WLAN_EXT_CAPA8_OPMODE_NOTIF BIT(6) -#define WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED BIT(7) /* TDLS specific payload type in the LLC/SNAP header */ #define WLAN_TDLS_SNAP_RFTYPE 0x2 -- cgit v1.2.3