From c8a8585431efba0faaf41167f8f7c27c48307ca6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vianney=20le=20Cl=C3=A9ment=20de=20Saint-Marcq?= Date: Mon, 30 Mar 2015 10:34:58 +0200 Subject: iio: core: Introduce IIO_CHAN_INFO_CALIBEMISSIVITY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Contact-less IR temperature sensors measure the temperature of an object by using its thermal radiation. Surfaces with different emissivity ratios emit different amounts of energy at the same temperature. IIO_CHAN_INFO_CALIBEMISSIVITY allows the user to inform the sensor of the emissivity of the object in front of it, in order to effectively measure its temperature. A device providing such setting is Melexis's MLX90614: http://melexis.com/Assets/IR-sensor-thermometer-MLX90614-Datasheet-5152.aspx. Signed-off-by: Vianney le Clément de Saint-Marcq Cc: Arnout Vandecappelle (Essensium/Mind) Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index d86b753e9b30..b1e46ae89aa7 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -43,6 +43,7 @@ enum iio_chan_info_enum { IIO_CHAN_INFO_CALIBWEIGHT, IIO_CHAN_INFO_DEBOUNCE_COUNT, IIO_CHAN_INFO_DEBOUNCE_TIME, + IIO_CHAN_INFO_CALIBEMISSIVITY, }; enum iio_shared_by { -- cgit v1.2.3 From d0e83059a6c9b04f00264a74b8f6439948de4613 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 20 Apr 2015 13:39:03 +0800 Subject: crypto: rng - Convert crypto_rng to new style crypto_type This patch converts the top-level crypto_rng to the "new" style. It was the last algorithm type added before we switched over to the new way of doing things exemplified by shash. All users will automatically switch over to the new interface. Note that this patch does not touch the low-level interface to rng implementations. 
Signed-off-by: Herbert Xu --- include/linux/crypto.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 10df5d2d093a..781f7d546020 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -655,19 +655,12 @@ struct compress_tfm { u8 *dst, unsigned int *dlen); }; -struct rng_tfm { - int (*rng_gen_random)(struct crypto_rng *tfm, u8 *rdata, - unsigned int dlen); - int (*rng_reset)(struct crypto_rng *tfm, u8 *seed, unsigned int slen); -}; - #define crt_ablkcipher crt_u.ablkcipher #define crt_aead crt_u.aead #define crt_blkcipher crt_u.blkcipher #define crt_cipher crt_u.cipher #define crt_hash crt_u.hash #define crt_compress crt_u.compress -#define crt_rng crt_u.rng struct crypto_tfm { @@ -680,7 +673,6 @@ struct crypto_tfm { struct cipher_tfm cipher; struct hash_tfm hash; struct compress_tfm compress; - struct rng_tfm rng; } crt_u; void (*exit)(struct crypto_tfm *tfm); @@ -714,10 +706,6 @@ struct crypto_hash { struct crypto_tfm base; }; -struct crypto_rng { - struct crypto_tfm base; -}; - enum { CRYPTOA_UNSPEC, CRYPTOA_ALG, -- cgit v1.2.3 From acec27ff35af9caf34d76d16ee17ff3b292e7d83 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 21 Apr 2015 10:46:38 +0800 Subject: crypto: rng - Convert low-level crypto_rng to new style This patch converts the low-level crypto_rng interface to the "new" style. This allows existing implementations to be converted over one- by-one. Once that is complete we can then remove the old rng interface. 
Signed-off-by: Herbert Xu --- include/linux/crypto.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 781f7d546020..2fa9b05360f7 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -427,7 +427,7 @@ struct compress_alg { }; /** - * struct rng_alg - random number generator definition + * struct old_rng_alg - random number generator definition * @rng_make_random: The function defined by this variable obtains a random * number. The random number generator transform must generate * the random number out of the context provided with this @@ -445,7 +445,7 @@ struct compress_alg { * seeding is implemented internally without the need of support by * the consumer. In this case, the seed size is set to zero. */ -struct rng_alg { +struct old_rng_alg { int (*rng_make_random)(struct crypto_rng *tfm, u8 *rdata, unsigned int dlen); int (*rng_reset)(struct crypto_rng *tfm, u8 *seed, unsigned int slen); @@ -559,7 +559,7 @@ struct crypto_alg { struct blkcipher_alg blkcipher; struct cipher_alg cipher; struct compress_alg compress; - struct rng_alg rng; + struct old_rng_alg rng; } cra_u; int (*cra_init)(struct crypto_tfm *tfm); -- cgit v1.2.3 From 94f1bb15bed84ad6c893916b7e7b9db6f1d7eec6 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 21 Apr 2015 10:46:46 +0800 Subject: crypto: rng - Remove old low-level rng interface Now that all rng implementations have switched over to the new interface, we can remove the old low-level interface. 
Signed-off-by: Herbert Xu --- include/linux/crypto.h | 30 ------------------------------ 1 file changed, 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 2fa9b05360f7..ee14140f8893 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -138,7 +138,6 @@ struct crypto_async_request; struct crypto_aead; struct crypto_blkcipher; struct crypto_hash; -struct crypto_rng; struct crypto_tfm; struct crypto_type; struct aead_givcrypt_request; @@ -426,40 +425,12 @@ struct compress_alg { unsigned int slen, u8 *dst, unsigned int *dlen); }; -/** - * struct old_rng_alg - random number generator definition - * @rng_make_random: The function defined by this variable obtains a random - * number. The random number generator transform must generate - * the random number out of the context provided with this - * call. - * @rng_reset: Reset of the random number generator by clearing the entire state. - * With the invocation of this function call, the random number - * generator shall completely reinitialize its state. If the random - * number generator requires a seed for setting up a new state, - * the seed must be provided by the consumer while invoking this - * function. The required size of the seed is defined with - * @seedsize . - * @seedsize: The seed size required for a random number generator - * initialization defined with this variable. Some random number - * generators like the SP800-90A DRBG does not require a seed as the - * seeding is implemented internally without the need of support by - * the consumer. In this case, the seed size is set to zero. 
- */ -struct old_rng_alg { - int (*rng_make_random)(struct crypto_rng *tfm, u8 *rdata, - unsigned int dlen); - int (*rng_reset)(struct crypto_rng *tfm, u8 *seed, unsigned int slen); - - unsigned int seedsize; -}; - #define cra_ablkcipher cra_u.ablkcipher #define cra_aead cra_u.aead #define cra_blkcipher cra_u.blkcipher #define cra_cipher cra_u.cipher #define cra_compress cra_u.compress -#define cra_rng cra_u.rng /** * struct crypto_alg - definition of a cryptograpic cipher algorithm @@ -559,7 +530,6 @@ struct crypto_alg { struct blkcipher_alg blkcipher; struct cipher_alg cipher; struct compress_alg compress; - struct old_rng_alg rng; } cra_u; int (*cra_init)(struct crypto_tfm *tfm); -- cgit v1.2.3 From 4796cf9b02b5bea141632e21d64556a7eb883a65 Mon Sep 17 00:00:00 2001 From: Yingjoe Chen Date: Fri, 10 Apr 2015 21:55:50 +0800 Subject: time: Remove nonexistent function prototype The function clocksource_get_next() was removed in commit 75c5158f70 (timekeeping: Update clocksource with stop_machine), but the prototype was not removed with it. Remove the prototype. 
Signed-off-by: Yingjoe Chen Cc: Cc: Martin Schwidefsky Cc: Cc: John Stultz Link: http://lkml.kernel.org/r/1428674150-1780-1-git-send-email-yingjoe.chen@mediatek.com Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 135509821c39..a25fc6e873b8 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -181,7 +181,6 @@ static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift) extern int clocksource_unregister(struct clocksource*); extern void clocksource_touch_watchdog(void); -extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_suspend(void); extern void clocksource_resume(void); -- cgit v1.2.3 From 91e5a2170e795989da9f90c18ba18984f23acc5b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 13 Apr 2015 21:02:22 +0000 Subject: hrtimer: Document hrtimer_forward[_now]() proper Document the calling context conditions. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20150413210035.178751779@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 05f6df1fdf5b..7770676c387a 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -418,7 +418,22 @@ static inline int hrtimer_callback_running(struct hrtimer *timer) extern u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); -/* Forward a hrtimer so it expires after the hrtimer's current now */ +/** + * hrtimer_forward_now - forward the timer expiry so it expires after now + * @timer: hrtimer to forward + * @interval: the interval to forward + * + * Forward the timer expiry so it will expire after the current time + * of the hrtimer clock base. Returns the number of overruns. + * + * Can be safely called from the callback function of @timer. If + * called from other contexts @timer must neither be enqueued nor + * running the callback and the caller needs to take care of + * serialization. + * + * Note: This only updates the timer expiry value and does not requeue + * the timer. + */ static inline u64 hrtimer_forward_now(struct hrtimer *timer, ktime_t interval) { -- cgit v1.2.3 From 398ca17fb54b212cdc9da7ff4a17a35c48dd2103 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:27 +0000 Subject: hrtimer: Get rid of the resolution field in hrtimer_clock_base The field has no value because all clock bases have the same resolution. The resolution only changes when we switch to high resolution timer mode. We can evaluate that from a single static variable as well. In the !HIGHRES case its simply a constant. Export the variable, so we can simplify the usage sites. 
Signed-off-by: Thomas Gleixner Reviewed-by: Preeti U Murthy Acked-by: Peter Zijlstra Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203500.645454122@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 7770676c387a..bc6f91b5443b 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -137,7 +137,6 @@ struct hrtimer_sleeper { * timer to a base on another cpu. * @clockid: clock id for per_cpu support * @active: red black tree root node for the active timers - * @resolution: the resolution of the clock, in nanoseconds * @get_time: function to retrieve the current time of the clock * @softirq_time: the time when running the hrtimer queue in the softirq * @offset: offset of this clock to the monotonic base @@ -147,7 +146,6 @@ struct hrtimer_clock_base { int index; clockid_t clockid; struct timerqueue_head active; - ktime_t resolution; ktime_t (*get_time)(void); ktime_t softirq_time; ktime_t offset; @@ -295,11 +293,15 @@ extern void hrtimer_peek_ahead_timers(void); extern void clock_was_set_delayed(void); +extern unsigned int hrtimer_resolution; + #else # define MONOTONIC_RES_NSEC LOW_RES_NSEC # define KTIME_MONOTONIC_RES KTIME_LOW_RES +#define hrtimer_resolution LOW_RES_NSEC + static inline void hrtimer_peek_ahead_timers(void) { } /* -- cgit v1.2.3 From 056a3cacbc46e5aca27b350ce4ecb3b33ebb0700 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:32 +0000 Subject: hrtimer: Get rid of hrtimer_get_res() The resolution is directly accessible now. So it's simpler just to fill in the values of the timespec and be done with it. 
Text size reduction (combined with "hrtimer: Get rid of the resolution field in hrtimer_clock_base"): x8664 -61, i386 -221, ARM -60, power64 -48 Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203500.879888080@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index bc6f91b5443b..8025156c8fa1 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -385,7 +385,6 @@ static inline int hrtimer_restart(struct hrtimer *timer) /* Query timers: */ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); -extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp); extern ktime_t hrtimer_get_next_event(void); -- cgit v1.2.3 From a6ffebce7f89f6f97cc22838a5d4383b15d6774f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:34 +0000 Subject: hrtimer: Make the statistics fields smaller No point in having unsigned long for /proc/timer_list statistics. Make them unsigned int. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203500.959773467@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 8025156c8fa1..d39f2847754c 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -187,10 +187,10 @@ struct hrtimer_cpu_base { int in_hrtirq; int hres_active; int hang_detected; - unsigned long nr_events; - unsigned long nr_retries; - unsigned long nr_hangs; - ktime_t max_hang_time; + unsigned int nr_events; + unsigned int nr_retries; + unsigned int nr_hangs; + unsigned int max_hang_time; #endif struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; }; -- cgit v1.2.3 From 21d6d52a1b7028e6a6840bd82e354aefa9a5e203 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:35 +0000 Subject: hrtimer: Get rid of softirq time The softirq time field in the clock bases is an optimization from the early days of hrtimers. It provides a coarse "jiffies" like time mostly for self rearming timers. But that comes with a price: - Larger code size - Extra storage space - Duplicated functions with really small differences The benefit of this optimization is marginal for contemporary systems. Consolidate everything on the high resolution timer implementation. This makes further optimizations possible. 
Text size reduction: x8664 -95, i386 -356, ARM -148, ARM64 -40, power64 -16 Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.039977424@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index d39f2847754c..e292830b58f0 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -138,7 +138,6 @@ struct hrtimer_sleeper { * @clockid: clock id for per_cpu support * @active: red black tree root node for the active timers * @get_time: function to retrieve the current time of the clock - * @softirq_time: the time when running the hrtimer queue in the softirq * @offset: offset of this clock to the monotonic base */ struct hrtimer_clock_base { @@ -147,7 +146,6 @@ struct hrtimer_clock_base { clockid_t clockid; struct timerqueue_head active; ktime_t (*get_time)(void); - ktime_t softirq_time; ktime_t offset; }; @@ -260,19 +258,16 @@ static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) return ktime_sub(timer->node.expires, timer->base->get_time()); } -#ifdef CONFIG_HIGH_RES_TIMERS -struct clock_event_device; - -extern void hrtimer_interrupt(struct clock_event_device *dev); - -/* - * In high resolution mode the time reference must be read accurate - */ static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) { return timer->base->get_time(); } +#ifdef CONFIG_HIGH_RES_TIMERS +struct clock_event_device; + +extern void hrtimer_interrupt(struct clock_event_device *dev); + static inline int hrtimer_is_hres_active(struct hrtimer *timer) { return timer->base->cpu_base->hres_active; @@ -304,15 +299,6 @@ extern unsigned int hrtimer_resolution; static inline void hrtimer_peek_ahead_timers(void) { } -/* - * In non high 
resolution mode the time reference is taken from - * the base softirq time variable. - */ -static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) -{ - return timer->base->softirq_time; -} - static inline int hrtimer_is_hres_active(struct hrtimer *timer) { return 0; -- cgit v1.2.3 From 868a3e915f7f5eba8f8cb4f7da2276760807c51c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:37 +0000 Subject: hrtimer: Make offset update smarter On every tick/hrtimer interrupt we update the offset variables of the clock bases. That's silly because these offsets change very seldom. Add a sequence counter to the time keeping code which keeps track of the offset updates (clock_was_set()). Have a sequence cache in the hrtimer cpu bases to evaluate whether the offsets must be updated or not. This allows us later to avoid pointless cacheline pollution. Signed-off-by: Thomas Gleixner Reviewed-by: Preeti U Murthy Acked-by: Peter Zijlstra Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: John Stultz Link: http://lkml.kernel.org/r/20150414203501.132820245@linutronix.de Signed-off-by: Thomas Gleixner Cc: John Stultz --- include/linux/hrtimer.h | 4 ++-- include/linux/timekeeper_internal.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index e292830b58f0..5e04f8fc26f6 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -163,7 +163,7 @@ enum hrtimer_base_type { * and timers * @cpu: cpu number * @active_bases: Bitfield to mark bases with active timers - * @clock_was_set: Indicates that clock was set from irq context. 
+ * @clock_was_set_seq: Sequence counter of clock was set events * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() * @in_hrtirq: hrtimer_interrupt() is currently executing @@ -179,7 +179,7 @@ struct hrtimer_cpu_base { raw_spinlock_t lock; unsigned int cpu; unsigned int active_bases; - unsigned int clock_was_set; + unsigned int clock_was_set_seq; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires_next; int in_hrtirq; diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index fb86963859c7..6f8276ae579c 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -49,6 +49,7 @@ struct tk_read_base { * @offs_boot: Offset clock monotonic -> clock boottime * @offs_tai: Offset clock monotonic -> clock tai * @tai_offset: The current UTC to TAI offset in seconds + * @clock_was_set_seq: The sequence number of clock was set events * @raw_time: Monotonic raw base time in timespec64 format * @cycle_interval: Number of clock cycles in one NTP interval * @xtime_interval: Number of clock shifted nano seconds in one NTP @@ -85,6 +86,7 @@ struct timekeeper { ktime_t offs_boot; ktime_t offs_tai; s32 tai_offset; + unsigned int clock_was_set_seq; struct timespec64 raw_time; /* The following members are for timekeeping internal use */ -- cgit v1.2.3 From e19ffe8be2cd0a1f726b235443eba21e64f6be5e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:39 +0000 Subject: hrtimer: Use bits for various boolean indicators No point in wasting 12 byte storage space. Generates better code as well. 
Text size reduction: x8664 -64, i386 -16, ARM -132, ARM64 -0, power64 -48 Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.227955358@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 5e04f8fc26f6..17a59ddcc79a 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -181,10 +181,10 @@ struct hrtimer_cpu_base { unsigned int active_bases; unsigned int clock_was_set_seq; #ifdef CONFIG_HIGH_RES_TIMERS + unsigned int in_hrtirq : 1, + hres_active : 1, + hang_detected : 1; ktime_t expires_next; - int in_hrtirq; - int hres_active; - int hang_detected; unsigned int nr_events; unsigned int nr_retries; unsigned int nr_hangs; -- cgit v1.2.3 From 6d9a1411393d51f17bee3fe163430b21b2cb2de9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:42 +0000 Subject: hrtimer: Cache line align the hrtimer cpu base We really want that data structure to start at a cache line boundary. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.417597627@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 17a59ddcc79a..0853f52f8ffb 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -191,7 +191,7 @@ struct hrtimer_cpu_base { unsigned int max_hang_time; #endif struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; -}; +} ____cacheline_aligned; static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) { -- cgit v1.2.3 From b8e38413ac2c33c497e72895fcd5da709fd1b908 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:44 +0000 Subject: hrtimer: Align the hrtimer clock bases as well We don't use cacheline_align here because that might waste lot of space on 32bit machine with 64 bytes cachelines and on 64bit machines with 128 bytes cachelines. The size of struct hrtimer_clock_base is 64byte on 64bit and 32byte on 32bit machines. So we utilize the cache lines proper. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.498165771@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 0853f52f8ffb..e5c22d611850 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -130,6 +130,12 @@ struct hrtimer_sleeper { struct task_struct *task; }; +#ifdef CONFIG_64BIT +# define HRTIMER_CLOCK_BASE_ALIGN 64 +#else +# define HRTIMER_CLOCK_BASE_ALIGN 32 +#endif + /** * struct hrtimer_clock_base - the timer base for a specific clock * @cpu_base: per cpu clock base @@ -147,7 +153,7 @@ struct hrtimer_clock_base { struct timerqueue_head active; ktime_t (*get_time)(void); ktime_t offset; -}; +} __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN))); enum hrtimer_base_type { HRTIMER_BASE_MONOTONIC, @@ -195,6 +201,8 @@ struct hrtimer_cpu_base { static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) { + BUILD_BUG_ON(sizeof(struct hrtimer_clock_base) > HRTIMER_CLOCK_BASE_ALIGN); + timer->node.expires = time; timer->_softexpires = time; } -- cgit v1.2.3 From c320642e1ced3b81592610e374894fea995f475b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:46 +0000 Subject: timerqueue: Let timerqueue_add/del return information The hrtimer code is interested whether the added timer is the first one to expire and whether the removed timer was the last one in the tree. The add/del routines have that information already. So we can return it right away instead of reevaluating it at the call site. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: John Stultz Link: http://lkml.kernel.org/r/20150414203501.579063647@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/timerqueue.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index a520fd70a59f..7eec17ad7fa1 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -16,10 +16,10 @@ struct timerqueue_head { }; -extern void timerqueue_add(struct timerqueue_head *head, - struct timerqueue_node *node); -extern void timerqueue_del(struct timerqueue_head *head, - struct timerqueue_node *node); +extern bool timerqueue_add(struct timerqueue_head *head, + struct timerqueue_node *node); +extern bool timerqueue_del(struct timerqueue_head *head, + struct timerqueue_node *node); extern struct timerqueue_node *timerqueue_iterate_next( struct timerqueue_node *node); -- cgit v1.2.3 From 895bdfa793f6e912d1a58fc445b3dd4d686f7bd3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:49 +0000 Subject: hrtimer: Keep pointer to first timer and simplify __remove_hrtimer() __remove_hrtimer() needs to evaluate the expiry time to figure out whether the timer which is removed is eventually the first expiring timer on the cpu. Keep a pointer to it, which is lazily updated, so we can avoid the evaluation dance and retrieve the information from there. Generates slightly better code. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.752838019@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index e5c22d611850..d194c1dacdaa 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -172,6 +172,7 @@ enum hrtimer_base_type { * @clock_was_set_seq: Sequence counter of clock was set events * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() + * @next_timer: Pointer to the first expiring timer * @in_hrtirq: hrtimer_interrupt() is currently executing * @hres_active: State of high resolution mode * @hang_detected: The last hrtimer interrupt detected a hang @@ -180,6 +181,10 @@ enum hrtimer_base_type { * @nr_hangs: Total number of hrtimer interrupt hangs * @max_hang_time: Maximum time spent in hrtimer_interrupt * @clock_base: array of clock bases for this cpu + * + * Note: next_timer is just an optimization for __remove_hrtimer(). + * Do not dereference the pointer because it is not reliable on + * cross cpu removals. */ struct hrtimer_cpu_base { raw_spinlock_t lock; @@ -191,6 +196,7 @@ struct hrtimer_cpu_base { hres_active : 1, hang_detected : 1; ktime_t expires_next; + struct hrtimer *next_timer; unsigned int nr_events; unsigned int nr_retries; unsigned int nr_hangs; -- cgit v1.2.3 From c6eb3f70d4482806dc2d3e1e3c7736f497b1d418 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:51 +0000 Subject: hrtimer: Get rid of hrtimer softirq hrtimer softirq is a leftover from the initial implementation and serves only the purpose to handle the enqueueing of already expired timers in the high resolution timer mode. 
We discussed whether we change the return value and force all start sites to handle that the timer is already expired, but that would be a Herculean task and I'm not sure whether its a good idea to enforce that handling on everyone. A simpler solution is to enforce a timer interrupt instead of raising and scheduling a softirq. Just use the existing infrastructure to do so and remove all the softirq leftovers. The HRTIMER softirq enum is now unused, but kept around because trace parsers rely on the existing numbering. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.840834708@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 1 - include/linux/interrupt.h | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index d194c1dacdaa..048270a27bc5 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -459,7 +459,6 @@ extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); /* Soft interrupt function to run the hrtimer queues: */ extern void hrtimer_run_queues(void); -extern void hrtimer_run_pending(void); /* Bootup initialization: */ extern void __init hrtimers_init(void); diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 950ae4501826..6bf15a66bce7 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -413,7 +413,8 @@ enum BLOCK_IOPOLL_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ, - HRTIMER_SOFTIRQ, + HRTIMER_SOFTIRQ, /* Unused, but kept as tools rely on the + numbering. Sigh! 
*/ RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ NR_SOFTIRQS -- cgit v1.2.3 From c1ad348b452aacd784fb97403d03d71723c72ee1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:58 +0000 Subject: tick: Nohz: Rework next timer evaluation The evaluation of the next timer in the nohz code is based on jiffies while all the tick internals are nano seconds based. We have also to convert hrtimer nanoseconds to jiffies in the !highres case. That's just wrong and introduces interesting corner cases. Turn it around and convert the next timer wheel timer expiry and the rcu event to clock monotonic and base all calculations on nanoseconds. That identifies the case where no timer is pending clearly with an absolute expiry value of KTIME_MAX. Makes the code more readable and gets rid of the jiffies magic in the nohz code. Signed-off-by: Thomas Gleixner Reviewed-by: Paul E. McKenney Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: Josh Triplett Cc: Lai Jiangshan Cc: John Stultz Cc: Marcelo Tosatti Link: http://lkml.kernel.org/r/20150414203502.184198593@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 2 +- include/linux/rcupdate.h | 6 ++++-- include/linux/rcutree.h | 2 +- include/linux/timer.h | 7 ------- 4 files changed, 6 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 048270a27bc5..2c68f71ffd24 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -386,7 +386,7 @@ static inline int hrtimer_restart(struct hrtimer *timer) /* Query timers: */ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); -extern ktime_t hrtimer_get_next_event(void); +extern u64 hrtimer_get_next_event(void); /* * A timer is active, when it is enqueued into the rbtree or the diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 573a5afd5ed8..0627a447c589 
100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -44,6 +44,8 @@ #include #include #include +#include + #include extern int rcu_expedited; /* for sysctl */ @@ -1154,9 +1156,9 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) -static inline int rcu_needs_cpu(unsigned long *delta_jiffies) +static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt) { - *delta_jiffies = ULONG_MAX; + *nextevt = KTIME_MAX; return 0; } #endif /* #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index d2e583a6aaca..db2e31beaae7 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -32,7 +32,7 @@ void rcu_note_context_switch(void); #ifndef CONFIG_RCU_NOCB_CPU_ALL -int rcu_needs_cpu(unsigned long *delta_jiffies); +int rcu_needs_cpu(u64 basem, u64 *nextevt); #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ void rcu_cpu_stall_reset(void); diff --git a/include/linux/timer.h b/include/linux/timer.h index 8c5a197e1587..fbb80e0030bf 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -187,13 +187,6 @@ extern void set_timer_slack(struct timer_list *time, int slack_hz); */ #define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1) -/* - * Return when the next timer-wheel timeout occurs (in absolute jiffies), - * locks the timer base and does the comparison against the given - * jiffie. - */ -extern unsigned long get_next_timer_interrupt(unsigned long now); - /* * Timer-statistics info: */ -- cgit v1.2.3 From 58f1f803f1d6ef9ab280de13246d65970a09cb95 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:08 +0000 Subject: hrtimer: Get rid of __hrtimer_start_range_ns() No more callers. Remove the leftovers. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203502.707871492@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 2c68f71ffd24..a80baa86bb24 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -359,10 +359,6 @@ extern int hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode); extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long range_ns, const enum hrtimer_mode mode); -extern int -__hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, - const enum hrtimer_mode mode, int wakeup); extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); -- cgit v1.2.3 From 02a171af1a46966dcdb5b38cdc33e4f43e92c778 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:10 +0000 Subject: hrtimer: Make hrtimer_start() a inline wrapper No point for an extra export just to set the extra argument of hrtimer_start_range_ns() to 0. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203502.808544539@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index a80baa86bb24..42074ab3d5c3 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -355,11 +355,26 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } #endif /* Basic timer operations: */ -extern int hrtimer_start(struct hrtimer *timer, ktime_t tim, - const enum hrtimer_mode mode); extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long range_ns, const enum hrtimer_mode mode); +/** + * hrtimer_start - (re)start an hrtimer on the current CPU + * @timer: the timer to be added + * @tim: expiry time + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) + * + * Returns: + * 0 on success + * 1 when the timer was active + */ +static inline int hrtimer_start(struct hrtimer *timer, ktime_t tim, + const enum hrtimer_mode mode) +{ + return hrtimer_start_range_ns(timer, tim, 0, mode); +} + extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); -- cgit v1.2.3 From b193217e6dc3f88b599b573b53e0e0f6671d969a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:18 +0000 Subject: alarmtimer: Get rid of unused return value We want to get rid of the hrtimer_start() return value and the alarm timer return value is nowhere used. Remove it. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: John Stultz Link: http://lkml.kernel.org/r/20150414203503.243910615@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/alarmtimer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index a899402a5a0e..52f3b7da4f2d 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -43,8 +43,8 @@ struct alarm { void alarm_init(struct alarm *alarm, enum alarmtimer_type type, enum alarmtimer_restart (*function)(struct alarm *, ktime_t)); -int alarm_start(struct alarm *alarm, ktime_t start); -int alarm_start_relative(struct alarm *alarm, ktime_t start); +void alarm_start(struct alarm *alarm, ktime_t start); +void alarm_start_relative(struct alarm *alarm, ktime_t start); void alarm_restart(struct alarm *alarm); int alarm_try_to_cancel(struct alarm *alarm); int alarm_cancel(struct alarm *alarm); -- cgit v1.2.3 From 61699e13072a89880aa584dcc64c6da465fb2ccc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:23 +0000 Subject: hrtimer: Remove hrtimer_start() return value No user was ever interested whether the timer was active or not when it was started. All abusers of the return value are gone, so get rid of it. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203503.483556394@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 22 +++++++++------------- include/linux/interrupt.h | 6 +++--- 2 files changed, 12 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 42074ab3d5c3..470d876c2eda 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -355,7 +355,7 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } #endif /* Basic timer operations: */ -extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, +extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long range_ns, const enum hrtimer_mode mode); /** @@ -364,34 +364,30 @@ extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * @tim: expiry time * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or * relative (HRTIMER_MODE_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active */ -static inline int hrtimer_start(struct hrtimer *timer, ktime_t tim, - const enum hrtimer_mode mode) +static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim, + const enum hrtimer_mode mode) { - return hrtimer_start_range_ns(timer, tim, 0, mode); + hrtimer_start_range_ns(timer, tim, 0, mode); } extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); -static inline int hrtimer_start_expires(struct hrtimer *timer, - enum hrtimer_mode mode) +static inline void hrtimer_start_expires(struct hrtimer *timer, + enum hrtimer_mode mode) { unsigned long delta; ktime_t soft, hard; soft = hrtimer_get_softexpires(timer); hard = hrtimer_get_expires(timer); delta = ktime_to_ns(ktime_sub(hard, soft)); - return hrtimer_start_range_ns(timer, soft, delta, mode); + 
hrtimer_start_range_ns(timer, soft, delta, mode); } -static inline int hrtimer_restart(struct hrtimer *timer) +static inline void hrtimer_restart(struct hrtimer *timer) { - return hrtimer_start_expires(timer, HRTIMER_MODE_ABS); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } /* Query timers: */ diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 6bf15a66bce7..be7e75c945e9 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -593,10 +593,10 @@ tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, clockid_t which_clock, enum hrtimer_mode mode); static inline -int tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time, - const enum hrtimer_mode mode) +void tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time, + const enum hrtimer_mode mode) { - return hrtimer_start(&ttimer->timer, time, mode); + hrtimer_start(&ttimer->timer, time, mode); } static inline -- cgit v1.2.3 From 59afdc7b32143528524455039e7557a46b60e4c8 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 22 Apr 2015 11:28:46 +0800 Subject: crypto: api - Move module sig ifdef into accessor function Currently we're hiding mod->sig_ok under an ifdef in open code. This patch adds a module_sig_ok accessor function and removes that ifdef. 
Signed-off-by: Herbert Xu Acked-by: Rusty Russell --- include/linux/module.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index c883b86ea964..1e5436042eb0 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -655,4 +655,16 @@ static inline void module_bug_finalize(const Elf_Ehdr *hdr, static inline void module_bug_cleanup(struct module *mod) {} #endif /* CONFIG_GENERIC_BUG */ +#ifdef CONFIG_MODULE_SIG +static inline bool module_sig_ok(struct module *module) +{ + return module->sig_ok; +} +#else /* !CONFIG_MODULE_SIG */ +static inline bool module_sig_ok(struct module *module) +{ + return true; +} +#endif /* CONFIG_MODULE_SIG */ + #endif /* _LINUX_MODULE_H */ -- cgit v1.2.3 From af87baedf2c23b1181f51323339210a26a64f7fc Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:28 +0800 Subject: x86/htirq: Use new irqdomain interfaces to allocate/free IRQ Use new irqdomain interfaces to allocate/free IRQ for HTIRQ, so we can remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ later. This patch changes the interfaces between arch independent PCI driver and arch specific code. Currently HT_IRQ is only enabled on x86, so it does not affect other architectures. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-7-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- include/linux/htirq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/htirq.h b/include/linux/htirq.h index 70a1dbbf2093..5caa51b7b95c 100644 --- a/include/linux/htirq.h +++ b/include/linux/htirq.h @@ -15,6 +15,8 @@ void unmask_ht_irq(struct irq_data *data); /* The arch hook for getting things started */ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev); +int arch_alloc_ht_irq(struct pci_dev *dev); +void arch_free_ht_irq(int irq); /* For drivers of buggy hardware */ typedef void (ht_irq_update_t)(struct pci_dev *dev, int irq, -- cgit v1.2.3 From b106ee63abccbba5f5a52d6e43168a6a30c6d98a Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:32 +0800 Subject: irq_remapping/vt-d: Enhance Intel IR driver to support hierarchical irqdomains Enhance Intel interrupt remapping driver to support hierarchical irqdomains. Implement intel_ir_chip to support stacked irq_chip. Signed-off-by: Jiang Liu Acked-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Cc: David Woodhouse Link: http://lkml.kernel.org/r/1428905519-23704-11-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- include/linux/intel-iommu.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index a65208a8fe18..ecaf3a937845 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -286,6 +286,8 @@ struct q_inval { #define INTR_REMAP_TABLE_ENTRIES 65536 +struct irq_domain; + struct ir_table { struct irte *base; unsigned long *bitmap; @@ -335,6 +337,8 @@ struct intel_iommu { #ifdef CONFIG_IRQ_REMAP struct ir_table *ir_table; /* Interrupt remapping info */ + struct irq_domain *ir_domain; + struct irq_domain *ir_msi_domain; #endif struct device *iommu_dev; /* IOMMU-sysfs device */ int node; -- cgit v1.2.3 From 34742db8eaf9ff364034f214ee5827701e131d4b Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:41 +0800 Subject: iommu/vt-d: Refine the interfaces to create IRQ for DMAR unit Refine the interfaces to create IRQ for DMAR unit. It's a preparation for converting DMAR IRQ to hierarchical irqdomain on x86. It also moves dmar_alloc_hwirq()/dmar_free_hwirq() from irq_remapping.h to dmar.h. They are not irq_remapping specific. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Vinod Koul Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Tony Luck Cc: Fenghua Yu Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428905519-23704-20-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- include/linux/dmar.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 30624954dec5..84737565c1fd 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -227,6 +227,7 @@ extern void dmar_msi_read(int irq, struct msi_msg *msg); extern void dmar_msi_write(int irq, struct msi_msg *msg); extern int dmar_set_interrupt(struct intel_iommu *iommu); extern irqreturn_t dmar_fault(int irq, void *dev_id); -extern int arch_setup_dmar_msi(unsigned int irq); +extern int dmar_alloc_hwirq(int id, int node, void *arg); +extern void dmar_free_hwirq(int irq); #endif /* __DMAR_H__ */ -- cgit v1.2.3 From 49e07d8f28c05347f237146a9ec66f6d958db83e Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:43 +0800 Subject: x86/htirq: Use hierarchical irqdomain to manage Hypertransport interrupts We have slightly changed the architecture interfaces to support htirq PCI driver. It's safe because currently Hypertransport interrupt is only enabled on x86 platforms. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-22-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- include/linux/htirq.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/htirq.h b/include/linux/htirq.h index 5caa51b7b95c..d4a527e58434 100644 --- a/include/linux/htirq.h +++ b/include/linux/htirq.h @@ -1,26 +1,38 @@ #ifndef LINUX_HTIRQ_H #define LINUX_HTIRQ_H +struct pci_dev; +struct irq_data; + struct ht_irq_msg { u32 address_lo; /* low 32 bits of the ht irq message */ u32 address_hi; /* high 32 bits of the it irq message */ }; +typedef void (ht_irq_update_t)(struct pci_dev *dev, int irq, + struct ht_irq_msg *msg); + +struct ht_irq_cfg { + struct pci_dev *dev; + /* Update callback used to cope with buggy hardware */ + ht_irq_update_t *update; + unsigned pos; + unsigned idx; + struct ht_irq_msg msg; +}; + /* Helper functions.. */ void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg); void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg); -struct irq_data; void mask_ht_irq(struct irq_data *data); void unmask_ht_irq(struct irq_data *data); /* The arch hook for getting things started */ -int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev); -int arch_alloc_ht_irq(struct pci_dev *dev); -void arch_free_ht_irq(int irq); +int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev, + ht_irq_update_t *update); +void arch_teardown_ht_irq(unsigned int irq); /* For drivers of buggy hardware */ -typedef void (ht_irq_update_t)(struct pci_dev *dev, int irq, - struct ht_irq_msg *msg); int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update); #endif /* LINUX_HTIRQ_H */ -- cgit v1.2.3 From 591fc116f3302da915bb57d4474a61a5e8884cec Mon Sep 17 00:00:00 2001 From: "Ivan T. 
Ivanov" Date: Thu, 9 Apr 2015 11:34:22 +0300 Subject: usb: phy: msm: Use extcon framework for VBUS and ID detection On recent Qualcomm platforms VBUS and ID lines are not routed to USB PHY LINK controller. Use extcon framework to receive connect and disconnect ID and VBUS notification. Signed-off-by: Ivan T. Ivanov Signed-off-by: Felipe Balbi --- include/linux/usb/msm_hsusb.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/msm_hsusb.h b/include/linux/usb/msm_hsusb.h index 7dbecf9a4656..c4d956e50d09 100644 --- a/include/linux/usb/msm_hsusb.h +++ b/include/linux/usb/msm_hsusb.h @@ -18,6 +18,7 @@ #ifndef __ASM_ARCH_MSM_HSUSB_H #define __ASM_ARCH_MSM_HSUSB_H +#include #include #include #include @@ -119,6 +120,17 @@ struct msm_otg_platform_data { void (*setup_gpio)(enum usb_otg_state state); }; +/** + * struct msm_usb_cable - structure for exteternal connector cable + * state tracking + * @nb: hold event notification callback + * @conn: used for notification registration + */ +struct msm_usb_cable { + struct notifier_block nb; + struct extcon_specific_cable_nb conn; +}; + /** * struct msm_otg: OTG driver data. Shared by HCD and DCD. * @otg: USB OTG Transceiver structure. @@ -138,6 +150,8 @@ struct msm_otg_platform_data { * @chg_type: The type of charger attached. * @dcd_retires: The retry count used to track Data contact * detection process. + * @vbus: VBUS signal state trakining, using extcon framework + * @id: ID signal state trakining, using extcon framework */ struct msm_otg { struct usb_phy phy; @@ -166,6 +180,9 @@ struct msm_otg { struct reset_control *phy_rst; struct reset_control *link_rst; int vdd_levels[3]; + + struct msm_usb_cable vbus; + struct msm_usb_cable id; }; #endif -- cgit v1.2.3 From 44e42ae3a398b559c768b9b3c324d72b0b0b4479 Mon Sep 17 00:00:00 2001 From: "Ivan T. 
Ivanov" Date: Thu, 9 Apr 2015 11:34:33 +0300 Subject: usb: phy: msm: Manual PHY and LINK controller VBUS change notification VBUS is not routed to USB PHY on recent Qualcomm platforms. USB controller must see VBUS in order to pull-up DP when setting RS bit. Hence configure USB PHY and LINK registers to sense VBUS and enable manual pullup on D+ line. Cc: Vamsi Krishna Cc: Mayank Rana Signed-off-by: Ivan T. Ivanov Signed-off-by: Felipe Balbi --- include/linux/usb/msm_hsusb.h | 5 +++++ include/linux/usb/msm_hsusb_hw.h | 9 +++++++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/msm_hsusb.h b/include/linux/usb/msm_hsusb.h index c4d956e50d09..e55a1504266e 100644 --- a/include/linux/usb/msm_hsusb.h +++ b/include/linux/usb/msm_hsusb.h @@ -150,6 +150,9 @@ struct msm_usb_cable { * @chg_type: The type of charger attached. * @dcd_retires: The retry count used to track Data contact * detection process. + * @manual_pullup: true if VBUS is not routed to USB controller/phy + * and controller driver therefore enables pull-up explicitly before + * starting controller using usbcmd run/stop bit.
* @vbus: VBUS signal state trakining, using extcon framework * @id: ID signal state trakining, using extcon framework */ @@ -181,6 +184,8 @@ struct msm_otg { struct reset_control *link_rst; int vdd_levels[3]; + bool manual_pullup; + struct msm_usb_cable vbus; struct msm_usb_cable id; }; diff --git a/include/linux/usb/msm_hsusb_hw.h b/include/linux/usb/msm_hsusb_hw.h index a29f6030afb1..e159b39f67a2 100644 --- a/include/linux/usb/msm_hsusb_hw.h +++ b/include/linux/usb/msm_hsusb_hw.h @@ -21,6 +21,8 @@ #define USB_AHBBURST (MSM_USB_BASE + 0x0090) #define USB_AHBMODE (MSM_USB_BASE + 0x0098) +#define USB_GENCONFIG_2 (MSM_USB_BASE + 0x00a0) + #define USB_CAPLENGTH (MSM_USB_BASE + 0x0100) /* 8 bit */ #define USB_USBCMD (MSM_USB_BASE + 0x0140) @@ -30,6 +32,9 @@ #define USB_PHY_CTRL (MSM_USB_BASE + 0x0240) #define USB_PHY_CTRL2 (MSM_USB_BASE + 0x0278) +#define GENCONFIG_2_SESS_VLD_CTRL_EN BIT(7) +#define USBCMD_SESS_VLD_CTRL BIT(25) + #define USBCMD_RESET 2 #define USB_USBINTR (MSM_USB_BASE + 0x0148) @@ -50,6 +55,10 @@ #define ULPI_PWR_CLK_MNG_REG 0x88 #define OTG_COMP_DISABLE BIT(0) +#define ULPI_MISC_A 0x96 +#define ULPI_MISC_A_VBUSVLDEXTSEL BIT(1) +#define ULPI_MISC_A_VBUSVLDEXT BIT(0) + #define ASYNC_INTR_CTRL (1 << 29) /* Enable async interrupt */ #define ULPI_STP_CTRL (1 << 30) /* Block communication with PHY */ #define PHY_RETEN (1 << 1) /* PHY retention enable/disable */ -- cgit v1.2.3 From b189a2117223edbe40e0a187ae5c606cbdd6447c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 28 Apr 2015 14:04:07 +0200 Subject: usb: phy: Remove the phy-rcar-gen2-usb driver The phy-rcar-gen2-usb driver, which supports legacy platform data only, is no longer used since commit a483dcbfa21f919c ("ARM: shmobile: lager: Remove legacy board support"). This driver was superseded by the DT-only phy-rcar-gen2 driver, which was introduced in commit 1233f59f745b237d ("phy: Renesas R-Car Gen2 PHY driver"). 
Signed-off-by: Geert Uytterhoeven Signed-off-by: Felipe Balbi --- include/linux/platform_data/usb-rcar-gen2-phy.h | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 include/linux/platform_data/usb-rcar-gen2-phy.h (limited to 'include/linux') diff --git a/include/linux/platform_data/usb-rcar-gen2-phy.h b/include/linux/platform_data/usb-rcar-gen2-phy.h deleted file mode 100644 index dd3ba46c0d90..000000000000 --- a/include/linux/platform_data/usb-rcar-gen2-phy.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (C) 2013 Renesas Solutions Corp. - * Copyright (C) 2013 Cogent Embedded, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef __USB_RCAR_GEN2_PHY_H -#define __USB_RCAR_GEN2_PHY_H - -#include - -struct rcar_gen2_phy_platform_data { - /* USB channel 0 configuration */ - bool chan0_pci:1; /* true: PCI USB host 0, false: USBHS */ - /* USB channel 2 configuration */ - bool chan2_pci:1; /* true: PCI USB host 2, false: USBSS */ -}; - -#endif -- cgit v1.2.3 From 042f7df15a4fff8eec42873f755aea848dcdedd1 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 30 Apr 2015 17:16:12 +0800 Subject: workqueue: Allow modifying low level unbound workqueue cpumask Allow to modify the low-level unbound workqueues cpumask through sysfs. This is performed by traversing the entire workqueue list and calling apply_wqattrs_prepare() on the unbound workqueues with the new low level mask. Only after all the preparation are done, we commit them all together. Ordered workqueues are ignored from the low level unbound workqueue cpumask, it will be handled in near future. All the (default & per-node) pwqs are mandatorily controlled by the low level cpumask. If the user configured cpumask doesn't overlap with the low level cpumask, the low level cpumask will be used for the wq instead. 
The comment of wq_calc_node_cpumask() is updated and explicitly requires that its first argument should be the attrs of the default pwq. The default wq_unbound_cpumask is cpu_possible_mask. The workqueue subsystem doesn't know its best default value, let the system manager or the other subsystem set it when needed. Changed from V8: merge the calculating code for the attrs of the default pwq together. minor change the code&comments for saving the user configured attrs. remove unnecessary list_del(). minor update the comment of wq_calc_node_cpumask(). update the comment of workqueue_set_unbound_cpumask(); Cc: Christoph Lameter Cc: Kevin Hilman Cc: Lai Jiangshan Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Tejun Heo Cc: Viresh Kumar Cc: Frederic Weisbecker Original-patch-by: Frederic Weisbecker Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index deee212af8e0..4618dd672d1b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -424,6 +424,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask); void free_workqueue_attrs(struct workqueue_attrs *attrs); int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs); +int workqueue_set_unbound_cpumask(cpumask_var_t cpumask); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); -- cgit v1.2.3 From 0bb549052d33f8992544764a6cf1299d06ba7e2f Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Tue, 28 Apr 2015 18:44:31 -0400 Subject: efi: Add esrt support Add sysfs files for the EFI System Resource Table (ESRT) under /sys/firmware/efi/esrt and for each EFI System Resource Entry under entries/ as a subdir. 
The EFI System Resource Table (ESRT) provides a read-only catalog of system components for which the system accepts firmware upgrades via UEFI's "Capsule Update" feature. This module allows userland utilities to evaluate what firmware updates can be applied to this system, and potentially arrange for those updates to occur. The ESRT is described as part of the UEFI specification, in version 2.5 which should be available from http://uefi.org/specifications in early 2015. If you're a member of the UEFI Forum, information about its addition to the standard is available as UEFI Mantis 1090. For some hardware platforms, additional restrictions may be found at http://msdn.microsoft.com/en-us/library/windows/hardware/jj128256.aspx , and additional documentation may be found at http://download.microsoft.com/download/5/F/5/5F5D16CD-2530-4289-8019-94C6A20BED3C/windows-uefi-firmware-update-platform.docx . Signed-off-by: Peter Jones Signed-off-by: Matt Fleming --- include/linux/efi.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index af5be0368dec..024c27e7c0fa 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -583,6 +583,9 @@ void efi_native_runtime_setup(void); #define EFI_FILE_INFO_ID \ EFI_GUID( 0x9576e92, 0x6d3f, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b ) +#define EFI_SYSTEM_RESOURCE_TABLE_GUID \ + EFI_GUID( 0xb122a263, 0x3661, 0x4f68, 0x99, 0x29, 0x78, 0xf8, 0xb0, 0xd6, 0x21, 0x80 ) + #define EFI_FILE_SYSTEM_GUID \ EFI_GUID( 0x964e5b22, 0x6459, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b ) @@ -823,6 +826,7 @@ extern struct efi { unsigned long fw_vendor; /* fw_vendor */ unsigned long runtime; /* runtime table */ unsigned long config_table; /* config tables */ + unsigned long esrt; /* ESRT table */ efi_get_time_t *get_time; efi_set_time_t *set_time; efi_get_wakeup_time_t *get_wakeup_time; @@ -875,6 +879,7 @@ static inline efi_status_t 
efi_query_variable_store(u32 attributes, unsigned lon #endif extern void __iomem *efi_lookup_mapped_addr(u64 phys_addr); extern int efi_config_init(efi_config_table_type_t *arch_tables); +extern void __init efi_esrt_init(void); extern int efi_config_parse_tables(void *config_tables, int count, int sz, efi_config_table_type_t *arch_tables); extern u64 efi_get_iobase (void); @@ -882,12 +887,15 @@ extern u32 efi_mem_type (unsigned long phys_addr); extern u64 efi_mem_attributes (unsigned long phys_addr); extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size); extern int __init efi_uart_console_only (void); +extern u64 efi_mem_desc_end(efi_memory_desc_t *md); +extern int efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md); extern void efi_initialize_iomem_resources(struct resource *code_resource, struct resource *data_resource, struct resource *bss_resource); extern void efi_get_time(struct timespec *now); extern void efi_reserve_boot_services(void); extern int efi_get_fdt_params(struct efi_fdt_params *params, int verbose); extern struct efi_memory_map memmap; +extern struct kobject *efi_kobj; extern int efi_reboot_quirk_mode; extern bool efi_poweroff_required(void); -- cgit v1.2.3 From d54385ce68cd18ab002b46f61246ad197cec92de Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 30 Apr 2015 14:53:54 -0700 Subject: etherdev: Process is_multicast_ether_addr at same size as other operations This change makes it so that we process the address in is_multicast_ether_addr at the same size as the other calls. This allows us to avoid duplicate reads when used with other calls such as is_zero_ether_addr or eth_addr_copy. In addition I have added a 64 bit version of the function so in eth_type_trans we can process the destination address as a 64 bit value throughout. Signed-off-by: Alexander Duyck Signed-off-by: David S. 
Miller --- include/linux/etherdevice.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 606563ef8a72..c4a10f991fe0 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -110,7 +110,29 @@ static inline bool is_zero_ether_addr(const u8 *addr) */ static inline bool is_multicast_ether_addr(const u8 *addr) { - return 0x01 & addr[0]; +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) + u32 a = *(const u32 *)addr; +#else + u16 a = *(const u16 *)addr; +#endif +#ifdef __BIG_ENDIAN + return 0x01 & (a >> ((sizeof(a) * 8) - 8)); +#else + return 0x01 & a; +#endif +} + +static inline bool is_multicast_ether_addr_64bits(const u8 addr[6+2]) +{ +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 +#ifdef __BIG_ENDIAN + return 0x01 & ((*(const u64 *)addr) >> 56); +#else + return 0x01 & (*(const u64 *)addr); +#endif +#else + return is_multicast_ether_addr(addr); +#endif +} /** -- cgit v1.2.3 From 50fb799289501c2eab9f43fc9af513027e1e994f Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 1 May 2015 11:30:12 -0700 Subject: net: Add skb_get_hash_perturb This calls flow_dissect and __skb_get_hash to procure a hash for a packet. Input includes a key to initialize jhash. This function does not set skb->hash. Signed-off-by: Tom Herbert Signed-off-by: David S.
Miller --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 66e374d62f64..acb83e249e3f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -927,6 +927,8 @@ static inline __u32 skb_get_hash(struct sk_buff *skb) return skb->hash; } +__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb); + static inline __u32 skb_get_hash_raw(const struct sk_buff *skb) { return skb->hash; -- cgit v1.2.3 From 9afd85c9e4552b276e2f4cfefd622bdeeffbbf26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Sat, 2 May 2015 14:01:07 +0200 Subject: net: Export IGMP/MLD message validation code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With this patch, the IGMP and MLD message validation functions are moved from the bridge code to IPv4/IPv6 multicast files. Some small refactoring was done to enhance readability and to iron out some differences in behaviour between the IGMP and MLD parsing code (e.g. the skb-cloning of MLD messages is now only done if necessary, just like the IGMP part always did). Finally, these IGMP and MLD message validation functions are exported so that not only the bridge can use it but batman-adv later, too. Signed-off-by: Linus Lüssing Signed-off-by: David S.
Miller --- include/linux/igmp.h | 1 + include/linux/skbuff.h | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 2c677afeea47..193ad488d3e2 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -130,5 +130,6 @@ extern void ip_mc_unmap(struct in_device *); extern void ip_mc_remap(struct in_device *); extern void ip_mc_dec_group(struct in_device *in_dev, __be32 addr); extern void ip_mc_inc_group(struct in_device *in_dev, __be32 addr); +int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed); #endif diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index acb83e249e3f..9c2f793573fa 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3419,6 +3419,9 @@ static inline void skb_checksum_none_assert(const struct sk_buff *skb) bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); int skb_checksum_setup(struct sk_buff *skb, bool recalculate); +struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, + unsigned int transport_len, + __sum16(*skb_chkf)(struct sk_buff *skb)); u32 skb_get_poff(const struct sk_buff *skb); u32 __skb_get_poff(const struct sk_buff *skb, void *data, -- cgit v1.2.3 From 6cd9e9f629f11b9412d4e9aa294c029dbb36b3cf Mon Sep 17 00:00:00 2001 From: Kapileshwar Singh Date: Wed, 18 Feb 2015 16:04:21 +0000 Subject: thermal: of: fix cooling device weights in device tree Currently you can specify the weight of the cooling device in the device tree but that information is not populated to the thermal_bind_params where the fair share governor expects it to be. The of thermal zone device doesn't have a thermal_bind_params structure and arguably it's better to pass the weight inside the thermal_instance as it is specific to the bind of a cooling device to a thermal zone parameter. 
Core thermal code is fixed to populate the weight in the instance from the thermal_bind_params, so platform code that was passing the weight inside the thermal_bind_params continue to work seamlessly. While we are at it, create a default value for the weight parameter for those thermal zones that currently don't define it and remove the hardcoded default in of-thermal. Cc: Zhang Rui Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Peter Feuerer Cc: Darren Hart Cc: Eduardo Valentin Cc: Kukjin Kim Cc: Durgadoss R Signed-off-by: Kapileshwar Singh Signed-off-by: Eduardo Valentin --- include/linux/thermal.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 5eac316490ea..00dacd4dfdce 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -40,6 +40,9 @@ /* No upper/lower limit requirement */ #define THERMAL_NO_LIMIT ((u32)~0) +/* Default weight of a bound cooling device */ +#define THERMAL_WEIGHT_DEFAULT 0 + /* Unit conversion macros */ #define KELVIN_TO_CELSIUS(t) (long)(((long)t-2732 >= 0) ? \ ((long)t-2732+5)/10 : ((long)t-2732-5)/10) @@ -323,7 +326,8 @@ void thermal_zone_device_unregister(struct thermal_zone_device *); int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int, struct thermal_cooling_device *, - unsigned long, unsigned long); + unsigned long, unsigned long, + unsigned int); int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int, struct thermal_cooling_device *); void thermal_zone_device_update(struct thermal_zone_device *); -- cgit v1.2.3 From bcdcbbc71125c37195f97314f453ca9a3a4eb758 Mon Sep 17 00:00:00 2001 From: Javi Merino Date: Wed, 18 Feb 2015 16:04:25 +0000 Subject: thermal: fair_share: generalize the weight concept The fair share governor has the concept of weights, which is the influence of each cooling device in a thermal zone. 
The current implementation forces the weights of all cooling devices in a thermal zone to add up to a 100. This complicates setups, as you need to know in advance how many cooling devices you are going to have. If you bind a new cooling device, you have to modify all the other cooling devices weights, which is error prone. Furthermore, you can't specify a "default" weight for platforms since that default value depends on the number of cooling devices in the platform. This patch generalizes the concept of weight by allowing any number to be a "weight". Weights are now relative to each other. Platforms that don't specify weights get the same default value for all their cooling devices, so all their cdevs are considered to be equally influential. It's important to note that previous users of the weights don't need to alter the code: percentages continue to work as they used to. This patch just removes the constraint of all the weights in a thermal zone having to add up to a 100. If they do, you get the same behavior as before. If they don't, fair share now works for that platform. Cc: Zhang Rui Cc: Eduardo Valentin Cc: Durgadoss R Acked-by: Durgadoss R Signed-off-by: Javi Merino Signed-off-by: Eduardo Valentin --- include/linux/thermal.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 00dacd4dfdce..bac0f52c7a1e 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -217,9 +217,12 @@ struct thermal_bind_params { /* * This is a measure of 'how effectively these devices can - * cool 'this' thermal zone. The shall be determined by platform - * characterization. This is on a 'percentage' scale. - * See Documentation/thermal/sysfs-api.txt for more information. + * cool 'this' thermal zone. It shall be determined by + * platform characterization. 
This value is relative to the + * rest of the weights so a cooling device whose weight is + * double that of another cooling device is twice as + * effective. See Documentation/thermal/sysfs-api.txt for more + * information. */ int weight; -- cgit v1.2.3 From e33df1d2f3a0141cd79e770f31999ba0dd7ebfa8 Mon Sep 17 00:00:00 2001 From: Javi Merino Date: Thu, 26 Feb 2015 19:00:27 +0000 Subject: thermal: let governors have private data for each thermal zone A governor may need to store its current state between calls to throttle(). That state depends on the thermal zone, so store it as private data in struct thermal_zone_device. The governors may have two new ops: bind_to_tz() and unbind_from_tz(). When provided, these functions let governors do some initialization and teardown when they are bound/unbound to a tz and possibly store that information in the governor_data field of the struct thermal_zone_device. Cc: Zhang Rui Cc: Eduardo Valentin Signed-off-by: Javi Merino Signed-off-by: Eduardo Valentin --- include/linux/thermal.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index bac0f52c7a1e..edf9d53c67e6 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -165,6 +165,7 @@ struct thermal_attr { * @ops: operations this &thermal_zone_device supports * @tzp: thermal zone parameters * @governor: pointer to the governor for this thermal zone + * @governor_data: private pointer for governor data * @thermal_instances: list of &struct thermal_instance of this thermal zone * @idr: &struct idr to generate unique id for this zone's cooling * devices @@ -191,6 +192,7 @@ struct thermal_zone_device { struct thermal_zone_device_ops *ops; const struct thermal_zone_params *tzp; struct thermal_governor *governor; + void *governor_data; struct list_head thermal_instances; struct idr idr; struct mutex lock; @@ -201,12 +203,19 @@ struct thermal_zone_device { /** * struct 
thermal_governor - structure that holds thermal governor information * @name: name of the governor + * @bind_to_tz: callback called when binding to a thermal zone. If it + * returns 0, the governor is bound to the thermal zone, + * otherwise it fails. + * @unbind_from_tz: callback called when a governor is unbound from a + * thermal zone. * @throttle: callback called for every trip point even if temperature is * below the trip point temperature * @governor_list: node in thermal_governor_list (in thermal_core.c) */ struct thermal_governor { char name[THERMAL_NAME_LENGTH]; + int (*bind_to_tz)(struct thermal_zone_device *tz); + void (*unbind_from_tz)(struct thermal_zone_device *tz); int (*throttle)(struct thermal_zone_device *tz, int trip); struct list_head governor_list; }; -- cgit v1.2.3 From 35b11d2e3a66279a477e36cefb2603806295b8ce Mon Sep 17 00:00:00 2001 From: Javi Merino Date: Thu, 26 Feb 2015 19:00:28 +0000 Subject: thermal: extend the cooling device API to include power information Add three optional callbacks to the cooling device interface to allow them to express power. In addition to the callbacks, add helpers to identify cooling devices that implement the power cooling device API. 
Cc: Zhang Rui Cc: Eduardo Valentin Signed-off-by: Javi Merino Signed-off-by: Eduardo Valentin --- include/linux/thermal.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index edf9d53c67e6..bf3c55f405c2 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -63,6 +63,7 @@ struct thermal_zone_device; struct thermal_cooling_device; +struct thermal_instance; enum thermal_device_mode { THERMAL_DEVICE_DISABLED = 0, @@ -116,6 +117,12 @@ struct thermal_cooling_device_ops { int (*get_max_state) (struct thermal_cooling_device *, unsigned long *); int (*get_cur_state) (struct thermal_cooling_device *, unsigned long *); int (*set_cur_state) (struct thermal_cooling_device *, unsigned long); + int (*get_requested_power)(struct thermal_cooling_device *, + struct thermal_zone_device *, u32 *); + int (*state2power)(struct thermal_cooling_device *, + struct thermal_zone_device *, unsigned long, u32 *); + int (*power2state)(struct thermal_cooling_device *, + struct thermal_zone_device *, u32, unsigned long *); }; struct thermal_cooling_device { @@ -331,6 +338,16 @@ void thermal_zone_of_sensor_unregister(struct device *dev, #endif #if IS_ENABLED(CONFIG_THERMAL) +static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev) +{ + return cdev->ops->get_requested_power && cdev->ops->state2power && + cdev->ops->power2state; +} + +int power_actor_get_max_power(struct thermal_cooling_device *, + struct thermal_zone_device *tz, u32 *max_power); +int power_actor_set_power(struct thermal_cooling_device *, + struct thermal_instance *, u32); struct thermal_zone_device *thermal_zone_device_register(const char *, int, int, void *, struct thermal_zone_device_ops *, const struct thermal_zone_params *, int, int); @@ -359,6 +376,14 @@ struct thermal_instance *get_thermal_instance(struct thermal_zone_device *, void thermal_cdev_update(struct thermal_cooling_device *); 
void thermal_notify_framework(struct thermal_zone_device *, int); #else +static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev) +{ return false; } +static inline int power_actor_get_max_power(struct thermal_cooling_device *cdev, + struct thermal_zone_device *tz, u32 *max_power) +{ return 0; } +static inline int power_actor_set_power(struct thermal_cooling_device *cdev, + struct thermal_instance *tz, u32 power) +{ return 0; } static inline struct thermal_zone_device *thermal_zone_device_register( const char *type, int trips, int mask, void *devdata, struct thermal_zone_device_ops *ops, -- cgit v1.2.3 From c36cf07176316fbe6a4bdbc23afcb0cbf7822bf2 Mon Sep 17 00:00:00 2001 From: Javi Merino Date: Thu, 26 Feb 2015 19:00:29 +0000 Subject: thermal: cpu_cooling: implement the power cooling device API Add a basic power model to the cpu cooling device to implement the power cooling device API. The power model uses the current frequency, current load and OPPs for the power calculations. The cpus must have registered their OPPs using the OPP library. Cc: Zhang Rui Cc: Eduardo Valentin Signed-off-by: Kapileshwar Singh Signed-off-by: Punit Agrawal Signed-off-by: Javi Merino Signed-off-by: Eduardo Valentin --- include/linux/cpu_cooling.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpu_cooling.h b/include/linux/cpu_cooling.h index bd955270d5aa..c156f5082758 100644 --- a/include/linux/cpu_cooling.h +++ b/include/linux/cpu_cooling.h @@ -28,6 +28,9 @@ #include #include +typedef int (*get_static_t)(cpumask_t *cpumask, int interval, + unsigned long voltage, u32 *power); + #ifdef CONFIG_CPU_THERMAL /** * cpufreq_cooling_register - function to create cpufreq cooling device. 
@@ -36,6 +39,10 @@ struct thermal_cooling_device * cpufreq_cooling_register(const struct cpumask *clip_cpus); +struct thermal_cooling_device * +cpufreq_power_cooling_register(const struct cpumask *clip_cpus, + u32 capacitance, get_static_t plat_static_func); + /** * of_cpufreq_cooling_register - create cpufreq cooling device based on DT. * @np: a valid struct device_node to the cooling device device tree node. @@ -45,6 +52,12 @@ cpufreq_cooling_register(const struct cpumask *clip_cpus); struct thermal_cooling_device * of_cpufreq_cooling_register(struct device_node *np, const struct cpumask *clip_cpus); + +struct thermal_cooling_device * +of_cpufreq_power_cooling_register(struct device_node *np, + const struct cpumask *clip_cpus, + u32 capacitance, + get_static_t plat_static_func); #else static inline struct thermal_cooling_device * of_cpufreq_cooling_register(struct device_node *np, @@ -52,6 +65,15 @@ of_cpufreq_cooling_register(struct device_node *np, { return ERR_PTR(-ENOSYS); } + +static inline struct thermal_cooling_device * +of_cpufreq_power_cooling_register(struct device_node *np, + const struct cpumask *clip_cpus, + u32 capacitance, + get_static_t plat_static_func) +{ + return NULL; +} #endif /** @@ -67,12 +89,29 @@ cpufreq_cooling_register(const struct cpumask *clip_cpus) { return ERR_PTR(-ENOSYS); } +static inline struct thermal_cooling_device * +cpufreq_power_cooling_register(const struct cpumask *clip_cpus, + u32 capacitance, get_static_t plat_static_func) +{ + return NULL; +} + static inline struct thermal_cooling_device * of_cpufreq_cooling_register(struct device_node *np, const struct cpumask *clip_cpus) { return ERR_PTR(-ENOSYS); } + +static inline struct thermal_cooling_device * +of_cpufreq_power_cooling_register(struct device_node *np, + const struct cpumask *clip_cpus, + u32 capacitance, + get_static_t plat_static_func) +{ + return NULL; +} + static inline void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev) { -- cgit v1.2.3 From 
6b775e870c56c59c3e16531ea2307b797395f9f7 Mon Sep 17 00:00:00 2001 From: Javi Merino Date: Mon, 2 Mar 2015 17:17:19 +0000 Subject: thermal: introduce the Power Allocator governor The power allocator governor is a thermal governor that controls system and device power allocation to control temperature. Conceptually, the implementation divides the sustainable power of a thermal zone among all the heat sources in that zone. This governor relies on "power actors", entities that represent heat sources. They can report current and maximum power consumption and can set a given maximum power consumption, usually via a cooling device. The governor uses a Proportional Integral Derivative (PID) controller driven by the temperature of the thermal zone. The output of the controller is a power budget that is then allocated to each power actor that can have bearing on the temperature we are trying to control. It decides how much power to give each cooling device based on the performance they are requesting. The PID controller ensures that the total power budget does not exceed the control temperature. 
Cc: Zhang Rui Cc: Eduardo Valentin Signed-off-by: Punit Agrawal Signed-off-by: Javi Merino Signed-off-by: Eduardo Valentin --- include/linux/thermal.h | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index bf3c55f405c2..6bbe11c97cea 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -59,6 +59,8 @@ #define DEFAULT_THERMAL_GOVERNOR "fair_share" #elif defined(CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE) #define DEFAULT_THERMAL_GOVERNOR "user_space" +#elif defined(CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR) +#define DEFAULT_THERMAL_GOVERNOR "power_allocator" #endif struct thermal_zone_device; @@ -154,8 +156,7 @@ struct thermal_attr { * @devdata: private pointer for device private data * @trips: number of trip points the thermal zone supports * @passive_delay: number of milliseconds to wait between polls when - * performing passive cooling. Currenty only used by the - * step-wise governor + * performing passive cooling. * @polling_delay: number of milliseconds to wait between polls when * checking whether trip points have been crossed (0 for * interrupt driven systems) @@ -165,7 +166,6 @@ struct thermal_attr { * @last_temperature: previous temperature read * @emul_temperature: emulated temperature when using CONFIG_THERMAL_EMULATION * @passive: 1 if you've crossed a passive trip point, 0 otherwise. - * Currenty only used by the step-wise governor. * @forced_passive: If > 0, temperature at which to switch on all ACPI * processor cooling devices. Currently only used by the * step-wise governor. 
@@ -197,7 +197,7 @@ struct thermal_zone_device { int passive; unsigned int forced_passive; struct thermal_zone_device_ops *ops; - const struct thermal_zone_params *tzp; + struct thermal_zone_params *tzp; struct thermal_governor *governor; void *governor_data; struct list_head thermal_instances; @@ -275,6 +275,33 @@ struct thermal_zone_params { int num_tbps; /* Number of tbp entries */ struct thermal_bind_params *tbp; + + /* + * Sustainable power (heat) that this thermal zone can dissipate in + * mW + */ + u32 sustainable_power; + + /* + * Proportional parameter of the PID controller when + * overshooting (i.e., when temperature is below the target) + */ + s32 k_po; + + /* + * Proportional parameter of the PID controller when + * undershooting + */ + s32 k_pu; + + /* Integral parameter of the PID controller */ + s32 k_i; + + /* Derivative parameter of the PID controller */ + s32 k_d; + + /* threshold below which the error is no longer accumulated */ + s32 integral_cutoff; }; struct thermal_genl_event { @@ -350,7 +377,7 @@ int power_actor_set_power(struct thermal_cooling_device *, struct thermal_instance *, u32); struct thermal_zone_device *thermal_zone_device_register(const char *, int, int, void *, struct thermal_zone_device_ops *, - const struct thermal_zone_params *, int, int); + struct thermal_zone_params *, int, int); void thermal_zone_device_unregister(struct thermal_zone_device *); int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int, -- cgit v1.2.3 From f31105347cc56c13d552b844ada04418769d875d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 28 Apr 2015 12:17:50 +0200 Subject: irqchip: irqc: Remove platform data support As of commit 914d7d148411997c ("ARM: shmobile: r8a73a4: Remove legacy code"), the Renesas R-Mobile/R-Car interrupt controller is used with DT only, and interrupt numbers are thus always assigned automatically. Drop the platform data declaration and all related support code. 
Signed-off-by: Geert Uytterhoeven Cc: Magnus Damm Cc: Jason Cooper Link: http://lkml.kernel.org/r/1430216270-31929-1-git-send-email-geert%2Brenesas@glider.be Signed-off-by: Thomas Gleixner --- include/linux/platform_data/irq-renesas-irqc.h | 27 -------------------------- 1 file changed, 27 deletions(-) delete mode 100644 include/linux/platform_data/irq-renesas-irqc.h (limited to 'include/linux') diff --git a/include/linux/platform_data/irq-renesas-irqc.h b/include/linux/platform_data/irq-renesas-irqc.h deleted file mode 100644 index 3ae17b3e00ed..000000000000 --- a/include/linux/platform_data/irq-renesas-irqc.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Renesas IRQC Driver - * - * Copyright (C) 2013 Magnus Damm - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef __IRQ_RENESAS_IRQC_H__ -#define __IRQ_RENESAS_IRQC_H__ - -struct renesas_irqc_config { - unsigned int irq_base; -}; - -#endif /* __IRQ_RENESAS_IRQC_H__ */ -- cgit v1.2.3 From 406c057c4e00744453d5b0731eb23629ec14dcdf Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 4 May 2015 21:54:20 -0400 Subject: libata: READ LOG DMA EXT support can be in either page 119 or 120 Support for the READ/WRITE LOG DMA EXT commands can be signaled either in page 119 or page 120. We should check both pages. Signed-off-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Tejun Heo --- include/linux/ata.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index b666b773e111..fed36418dd1c 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -704,9 +704,19 @@ static inline bool ata_id_wcache_enabled(const u16 *id) static inline bool ata_id_has_read_log_dma_ext(const u16 *id) { + /* Word 86 must have bit 15 set */ if (!(id[ATA_ID_CFS_ENABLE_2] & (1 << 15))) return false; - return id[ATA_ID_COMMAND_SET_3] & (1 << 3); + + /* READ LOG DMA EXT support can be signaled either from word 119 + * or from word 120. The format is the same for both words: Bit + * 15 must be cleared, bit 14 set and bit 3 set. + */ + if ((id[ATA_ID_COMMAND_SET_3] & 0xC008) == 0x4008 || + (id[ATA_ID_COMMAND_SET_4] & 0xC008) == 0x4008) + return true; + + return false; } static inline bool ata_id_has_sense_reporting(const u16 *id) -- cgit v1.2.3 From 5d3abf8ff67f49271a42c0f7fa4f20f9e046bf0e Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 4 May 2015 21:54:21 -0400 Subject: libata: Fall back to unqueued READ LOG EXT if the DMA variant fails Some devices advertise support for the READ/WRITE LOG DMA EXT commands but fail when we try to issue them. This can lead to queued TRIM being unintentionally disabled since the relevant feature flag is located in a general purpose log page. Fall back to unqueued READ LOG EXT if the DMA variant fails while reading a log page. Signed-off-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Tejun Heo --- include/linux/libata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 8dad4a307bb8..c3ef58014b33 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -424,6 +424,7 @@ enum { ATA_HORKAGE_NOLPM = (1 << 20), /* don't use LPM */ ATA_HORKAGE_WD_BROKEN_LPM = (1 << 21), /* some WDs have broken LPM */ ATA_HORKAGE_ZERO_AFTER_TRIM = (1 << 22),/* guarantees zero after trim */ + ATA_HORKAGE_NO_NCQ_LOG = (1 << 23), /* don't use NCQ for log read */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit v1.2.3 From c4cf5261f8bffd9de132b50660a69148e7575bd6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Apr 2015 16:15:18 -0600 Subject: bio: skip atomic inc/dec of ->bi_remaining for non-chains Struct bio has an atomic ref count for chained bio's, and we use this to know when to end IO on the bio. However, most bio's are not chained, so we don't need to always introduce this atomic operation as part of ending IO. Add a helper to elevate the bi_remaining count, and flag the bio as now actually needing the decrement at end_io time. Rename the field to __bi_remaining to catch any current users of this doing the incrementing manually. For high IOPS workloads, this reduces the overhead of bio_endio() substantially. Tested-by: Robert Elliott Acked-by: Kent Overstreet Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/bio.h | 11 +++++++++++ include/linux/blk_types.h | 3 ++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index da3a127c9958..8bfe9eee6d1a 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -644,6 +644,17 @@ static inline struct bio *bio_list_get(struct bio_list *bl) return bio; } +/* + * Increment chain count for the bio. 
Make sure the CHAIN flag update + * is visible before the raised count. + */ +static inline void bio_inc_remaining(struct bio *bio) +{ + bio->bi_flags |= (1 << BIO_CHAIN); + smp_mb__before_atomic(); + atomic_inc(&bio->__bi_remaining); +} + /* * bio_set is used to allow other portions of the IO system to * allocate their own private memory pools for bio and iovec structures. diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a1b25e35ea5f..8b07e0603887 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -65,7 +65,7 @@ struct bio { unsigned int bi_seg_front_size; unsigned int bi_seg_back_size; - atomic_t bi_remaining; + atomic_t __bi_remaining; bio_end_io_t *bi_end_io; @@ -122,6 +122,7 @@ struct bio { #define BIO_NULL_MAPPED 8 /* contains invalid user pages */ #define BIO_QUIET 9 /* Make BIO Quiet */ #define BIO_SNAP_STABLE 10 /* bio data must be snapshotted during write */ +#define BIO_CHAIN 11 /* chained bio, ->bi_remaining in effect */ /* * Flags starting here get preserved by bio_reset() - this includes -- cgit v1.2.3 From dac56212e8127dbc0bff7be35c508bc280213309 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Apr 2015 16:23:59 -0600 Subject: bio: skip atomic inc/dec of ->bi_cnt for most use cases Struct bio has a reference count that controls when it can be freed. The most common use case is allocating the bio, which then returns with a single reference to it, doing IO, and then dropping that single reference. We can remove this atomic_dec_and_test() in the completion path, if nobody else is holding a reference to the bio. If someone does call bio_get() on the bio, then we flag the bio as now having a valid count and that we must properly honor the reference count when it's being put. 
Tested-by: Robert Elliott Signed-off-by: Jens Axboe --- include/linux/bio.h | 16 +++++++++++++++- include/linux/blk_types.h | 3 ++- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 8bfe9eee6d1a..7486ea103f6e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -290,7 +290,21 @@ static inline unsigned bio_segments(struct bio *bio) * returns. and then bio would be freed memory when if (bio->bi_flags ...) * runs */ -#define bio_get(bio) atomic_inc(&(bio)->bi_cnt) +static inline void bio_get(struct bio *bio) +{ + bio->bi_flags |= (1 << BIO_REFFED); + smp_mb__before_atomic(); + atomic_inc(&bio->__bi_cnt); +} + +static inline void bio_cnt_set(struct bio *bio, unsigned int count) +{ + if (count != 1) { + bio->bi_flags |= (1 << BIO_REFFED); + smp_mb__before_atomic(); + } + atomic_set(&bio->__bi_cnt, count); +} enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8b07e0603887..93d2e7153816 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -92,7 +92,7 @@ struct bio { unsigned short bi_max_vecs; /* max bvl_vecs we can hold */ - atomic_t bi_cnt; /* pin count */ + atomic_t __bi_cnt; /* pin count */ struct bio_vec *bi_io_vec; /* the actual vec list */ @@ -123,6 +123,7 @@ struct bio { #define BIO_QUIET 9 /* Make BIO Quiet */ #define BIO_SNAP_STABLE 10 /* bio data must be snapshotted during write */ #define BIO_CHAIN 11 /* chained bio, ->bi_remaining in effect */ +#define BIO_REFFED 12 /* bio has elevated ->bi_cnt */ /* * Flags starting here get preserved by bio_reset() - this includes -- cgit v1.2.3 From 84be456f883c4685680fba8e5154b5f72e92957e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 1 May 2015 12:46:15 +0200 Subject: remove <asm/scatterlist.h> We don't have any arch specific scatterlist now that parisc switched over to the generic one. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +-- include/linux/dmapool.h | 2 +- include/linux/scatterlist.h | 39 ++++++++++++++++++++++++++++++++------- 3 files changed, 34 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7f9a516f24de..504af1e65ce1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -22,8 +22,7 @@ #include #include #include - -#include +#include struct module; struct scsi_ioctl_command; diff --git a/include/linux/dmapool.h b/include/linux/dmapool.h index 52456aa566a0..e1043f79122f 100644 --- a/include/linux/dmapool.h +++ b/include/linux/dmapool.h @@ -11,8 +11,8 @@ #ifndef LINUX_DMAPOOL_H #define LINUX_DMAPOOL_H +#include #include -#include struct device; diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index ed8f9e70df9b..eca1ec93775c 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -2,13 +2,39 @@ #define _LINUX_SCATTERLIST_H #include +#include #include #include - -#include -#include #include +struct scatterlist { +#ifdef CONFIG_DEBUG_SG + unsigned long sg_magic; +#endif + unsigned long page_link; + unsigned int offset; + unsigned int length; + dma_addr_t dma_address; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + unsigned int dma_length; +#endif +}; + +/* + * These macros should be used after a dma_map_sg call has been done + * to get bus addresses of each of the SG entries and their lengths. + * You should only work with the number of sg entries dma_map_sg + * returns, or alternatively stop on the first sg_dma_len(sg) which + * is 0. 
+ */ +#define sg_dma_address(sg) ((sg)->dma_address) + +#ifdef CONFIG_NEED_SG_DMA_LENGTH +#define sg_dma_len(sg) ((sg)->dma_length) +#else +#define sg_dma_len(sg) ((sg)->length) +#endif + struct sg_table { struct scatterlist *sgl; /* the list */ unsigned int nents; /* number of mapped entries */ @@ -18,10 +44,9 @@ struct sg_table { /* * Notes on SG table design. * - * Architectures must provide an unsigned long page_link field in the - * scatterlist struct. We use that to place the page pointer AND encode - * information about the sg table as well. The two lower bits are reserved - * for this information. + * We use the unsigned long page_link field in the scatterlist struct to place + * the page pointer AND encode information about the sg table as well. The two + * lower bits are reserved for this information. * * If bit 0 is set, then the page_link contains a pointer to the next sg * table list. Otherwise the next entry is at sg + 1. -- cgit v1.2.3 From 4f8c9510ba71bb54477841bebb90154ef140860f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:16 +0200 Subject: block: rename REQ_TYPE_SPECIAL to REQ_TYPE_DRV_PRIV Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7f9a516f24de..98c90272443b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -79,10 +79,10 @@ enum rq_cmd_type_bits { REQ_TYPE_PM_SUSPEND, /* suspend request */ REQ_TYPE_PM_RESUME, /* resume request */ REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ - REQ_TYPE_SPECIAL, /* driver defined type */ + REQ_TYPE_DRV_PRIV, /* driver defined type */ /* * for ATA/ATAPI devices. 
this really doesn't belong here, ide should - * use REQ_TYPE_SPECIAL and use rq->cmd[0] with the range of driver + * use REQ_TYPE_DRV_PRIV and use rq->cmd[0] with the range of driver * private REQ_LB opcodes to differentiate what type of request this is */ REQ_TYPE_ATA_TASKFILE, -- cgit v1.2.3 From b42171ef7d938a66fa52e66a3d911ed63770b5ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:17 +0200 Subject: block: move REQ_TYPE_ATA_TASKFILE and REQ_TYPE_ATA_PC to ide.h These values are only used by the IDE driver, so move them into it by allowing drivers to take cmd_type values after the first private one. Note that we have to turn cmd_type into a plain unsigned integer so that gcc doesn't complain about mismatching enum types. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 11 ++--------- include/linux/ide.h | 7 +++++++ 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 98c90272443b..9cb4d80a4987 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -79,14 +79,7 @@ enum rq_cmd_type_bits { REQ_TYPE_PM_SUSPEND, /* suspend request */ REQ_TYPE_PM_RESUME, /* resume request */ REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ - REQ_TYPE_DRV_PRIV, /* driver defined type */ - /* - * for ATA/ATAPI devices. 
this really doesn't belong here, ide should - * use REQ_TYPE_DRV_PRIV and use rq->cmd[0] with the range of driver - * private REQ_LB opcodes to differentiate what type of request this is - */ - REQ_TYPE_ATA_TASKFILE, - REQ_TYPE_ATA_PC, + REQ_TYPE_DRV_PRIV, /* driver defined types from here */ }; #define BLK_MAX_CDB 16 @@ -108,7 +101,7 @@ struct request { struct blk_mq_ctx *mq_ctx; u64 cmd_flags; - enum rq_cmd_type_bits cmd_type; + unsigned cmd_type; unsigned long atomic_flags; int cpu; diff --git a/include/linux/ide.h b/include/linux/ide.h index 93b5ca754b5b..62ac399144a6 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -39,6 +39,12 @@ struct device; +/* IDE-specific values for req->cmd_type */ +enum ata_cmd_type_bits { + REQ_TYPE_ATA_TASKFILE = REQ_TYPE_DRV_PRIV + 1, + REQ_TYPE_ATA_PC, +}; + /* Error codes returned in rq->errors to the higher part of the driver. */ enum { IDE_DRV_ERROR_GENERAL = 101, @@ -1551,4 +1557,5 @@ static inline void ide_set_drivedata(ide_drive_t *drive, void *data) #define ide_host_for_each_port(i, port, host) \ for ((i) = 0; ((port) = (host)->ports[i]) || (i) < MAX_HOST_PORTS; (i)++) + #endif /* _IDE_H */ -- cgit v1.2.3 From b0b93b48a30e809240ddd7449a6ad60a5ddf7b4d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:18 +0200 Subject: block: move REQ_TYPE_SENSE to the ide driver Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - include/linux/ide.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9cb4d80a4987..6076b9e18dcb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -75,7 +75,6 @@ struct request_list { enum rq_cmd_type_bits { REQ_TYPE_FS = 1, /* fs request */ REQ_TYPE_BLOCK_PC, /* scsi command */ - REQ_TYPE_SENSE, /* sense request */ REQ_TYPE_PM_SUSPEND, /* suspend request */ REQ_TYPE_PM_RESUME, /* resume request */ REQ_TYPE_PM_SHUTDOWN, 
/* shutdown request */ diff --git a/include/linux/ide.h b/include/linux/ide.h index 62ac399144a6..9856b7d455d9 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -43,6 +43,7 @@ struct device; enum ata_cmd_type_bits { REQ_TYPE_ATA_TASKFILE = REQ_TYPE_DRV_PRIV + 1, REQ_TYPE_ATA_PC, + REQ_TYPE_ATA_SENSE, /* sense request */ }; /* Error codes returned in rq->errors to the higher part of the driver. */ -- cgit v1.2.3 From ac7cdff00a33d48d27217560fa3b16d802e5f535 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:19 +0200 Subject: block: remove REQ_TYPE_PM_SHUTDOWN Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6076b9e18dcb..c2829ba5e738 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -77,7 +77,6 @@ enum rq_cmd_type_bits { REQ_TYPE_BLOCK_PC, /* scsi command */ REQ_TYPE_PM_SUSPEND, /* suspend request */ REQ_TYPE_PM_RESUME, /* resume request */ - REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ REQ_TYPE_DRV_PRIV, /* driver defined types from here */ }; -- cgit v1.2.3 From a7928c1578c550bd6f4dec62d65132e6db226c57 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:20 +0200 Subject: block: move PM request support to IDE This removes the request types and hacks from the block code and into the old IDE driver. There is a small amount of code duplication due to this, but it's not too bad.
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 21 +-------------------- include/linux/ide.h | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c2829ba5e738..2da818a48097 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -30,7 +30,6 @@ struct scsi_ioctl_command; struct request_queue; struct elevator_queue; -struct request_pm_state; struct blk_trace; struct request; struct sg_io_hdr; @@ -75,8 +74,6 @@ struct request_list { enum rq_cmd_type_bits { REQ_TYPE_FS = 1, /* fs request */ REQ_TYPE_BLOCK_PC, /* scsi command */ - REQ_TYPE_PM_SUSPEND, /* suspend request */ - REQ_TYPE_PM_RESUME, /* resume request */ REQ_TYPE_DRV_PRIV, /* driver defined types from here */ }; @@ -207,19 +204,6 @@ static inline unsigned short req_get_ioprio(struct request *req) return req->ioprio; } -/* - * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME - * requests. Some step values could eventually be made generic. - */ -struct request_pm_state -{ - /* PM state machine step value, currently driver specific */ - int pm_step; - /* requested PM state value (S1, S2, S3, S4, ...) 
*/ - u32 pm_state; - void* data; /* for driver use */ -}; - #include struct blk_queue_ctx; @@ -601,10 +585,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) (((rq)->cmd_flags & REQ_STARTED) && \ ((rq)->cmd_type == REQ_TYPE_FS)) -#define blk_pm_request(rq) \ - ((rq)->cmd_type == REQ_TYPE_PM_SUSPEND || \ - (rq)->cmd_type == REQ_TYPE_PM_RESUME) - #define blk_rq_cpu_valid(rq) ((rq)->cpu != -1) #define blk_bidi_rq(rq) ((rq)->next_rq != NULL) /* rq->queuelist of dequeued request must be list_empty() */ @@ -838,6 +818,7 @@ extern void blk_stop_queue(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); extern void __blk_stop_queue(struct request_queue *q); extern void __blk_run_queue(struct request_queue *q); +extern void __blk_run_queue_uncond(struct request_queue *q); extern void blk_run_queue(struct request_queue *); extern void blk_run_queue_async(struct request_queue *q); extern int blk_rq_map_user(struct request_queue *, struct request *, diff --git a/include/linux/ide.h b/include/linux/ide.h index 9856b7d455d9..a633898f36ac 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -44,8 +44,14 @@ enum ata_cmd_type_bits { REQ_TYPE_ATA_TASKFILE = REQ_TYPE_DRV_PRIV + 1, REQ_TYPE_ATA_PC, REQ_TYPE_ATA_SENSE, /* sense request */ + REQ_TYPE_ATA_PM_SUSPEND,/* suspend request */ + REQ_TYPE_ATA_PM_RESUME, /* resume request */ }; +#define ata_pm_request(rq) \ + ((rq)->cmd_type == REQ_TYPE_ATA_PM_SUSPEND || \ + (rq)->cmd_type == REQ_TYPE_ATA_PM_RESUME) + /* Error codes returned in rq->errors to the higher part of the driver. */ enum { IDE_DRV_ERROR_GENERAL = 101, @@ -1321,6 +1327,19 @@ struct ide_port_info { u8 udma_mask; }; +/* + * State information carried for REQ_TYPE_ATA_PM_SUSPEND and REQ_TYPE_ATA_PM_RESUME + * requests. + */ +struct ide_pm_state { + /* PM state machine step value, currently driver specific */ + int pm_step; + /* requested PM state value (S1, S2, S3, S4, ...) 
*/ + u32 pm_state; + void* data; /* for driver use */ +}; + + int ide_pci_init_one(struct pci_dev *, const struct ide_port_info *, void *); int ide_pci_init_two(struct pci_dev *, struct pci_dev *, const struct ide_port_info *, void *); -- cgit v1.2.3 From cd8ae85299d54155702a56811b2e035e63064d3d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 3 May 2015 21:34:46 -0700 Subject: tcp: provide SYN headers for passive connections This patch allows a server application to get the TCP SYN headers for its passive connections. This is useful if the server is doing fingerprinting of clients based on SYN packet contents. Two socket options are added: TCP_SAVE_SYN and TCP_SAVED_SYN. The first is used on a socket to enable saving the SYN headers for child connections. This can be set before or after the listen() call. The latter is used to retrieve the SYN headers for passive connections, if the parent listener has enabled TCP_SAVE_SYN. TCP_SAVED_SYN is read once, it frees the saved SYN headers. The data returned in TCP_SAVED_SYN are network (IPv4/IPv6) and TCP headers. Original patch was written by Tom Herbert, I changed it to not hold a full skb (and associated dst and conntracking reference). We have used such patch for about 3 years at Google. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 3b2911502a8c..e6fb5df22db1 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -199,6 +199,7 @@ struct tcp_sock { syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ + save_syn:1, /* Save headers of SYN packet */ is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. 
*/ @@ -326,6 +327,7 @@ struct tcp_sock { * socket. Used to retransmit SYNACKs etc. */ struct request_sock *fastopen_rsk; + u32 *saved_syn; }; enum tsq_flags { @@ -393,4 +395,10 @@ static inline int fastopen_init_queue(struct sock *sk, int backlog) return 0; } +static inline void tcp_saved_syn_free(struct tcp_sock *tp) +{ + kfree(tp->saved_syn); + tp->saved_syn = NULL; +} + #endif /* _LINUX_TCP_H */ -- cgit v1.2.3 From 2c7a88c252bf3381958cf716f31b6b2e0f2f3fa7 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 4 May 2015 14:33:48 -0700 Subject: etherdev: Fix sparse error, make test usable by other functions This change does two things. First it fixes a sparse error for the fact that the __be16 degrades to an integer. Since that is actually what I am kind of doing I am simply working around that by forcing both sides of the comparison to u16. Also I realized on some compilers I was generating another instruction for big endian systems such as PowerPC since it was masking the value before doing the comparison. So to resolve that I have simply pulled the mask out and wrapped it in an #ifndef __BIG_ENDIAN. Lastly I pulled this all out into its own function. I noticed there are similar checks in a number of other places so this function can be reused there to help reduce overhead in these paths as well. Signed-off-by: Alexander Duyck Signed-off-by: David S.
Miller --- include/linux/etherdevice.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index c4a10f991fe0..9012f8775208 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -190,6 +190,24 @@ static inline bool is_valid_ether_addr(const u8 *addr) return !is_multicast_ether_addr(addr) && !is_zero_ether_addr(addr); } +/** + * eth_proto_is_802_3 - Determine if a given Ethertype/length is a protocol + * @proto: Ethertype/length value to be tested + * + * Check that the value from the Ethertype/length field is a valid Ethertype. + * + * Return true if the valid is an 802.3 supported Ethertype. + */ +static inline bool eth_proto_is_802_3(__be16 proto) +{ +#ifndef __BIG_ENDIAN + /* if CPU is little endian mask off bits representing LSB */ + proto &= htons(0xFF00); +#endif + /* cast both to u16 and compare since LSB can be ignored */ + return (__force u16)proto >= (__force u16)htons(ETH_P_802_3_MIN); +} + /** * eth_random_addr - Generate software assigned random Ethernet address * @addr: Pointer to a six-byte array containing the Ethernet address -- cgit v1.2.3 From 9545b22da647cf6fbbac9c5a48c50fd72d892b11 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 4 May 2015 14:34:10 -0700 Subject: vlan: Use eth_proto_is_802_3 Replace "ntohs(proto) >= ETH_P_802_3_MIN" w/ eth_proto_is_802_3(proto). Signed-off-by: Alexander Duyck Signed-off-by: David S. 
Miller --- include/linux/if_vlan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 920e4457ce6e..b9ab677c0c0a 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -539,7 +539,7 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb, */ proto = vhdr->h_vlan_encapsulated_proto; - if (ntohs(proto) >= ETH_P_802_3_MIN) { + if (eth_proto_is_802_3(proto)) { skb->protocol = proto; return; } -- cgit v1.2.3 From d5622a9c13752be46e6fcde9d31391ce0bb0598b Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 2 Mar 2015 15:45:41 +0000 Subject: clkdev: use clk_hw internally clk_add_alias() calls clk_get() followed by clk_put() but in between those two calls it saves away the struct clk pointer to a clk_lookup structure. This leaves the 'clk' member of the clk_lookup pointing at freed memory on configurations where CONFIG_COMMON_CLK=y. This is a problem because clk_get_sys() will eventually try to dereference the freed pointer by calling __clk_get_hw() on it. Fix this by saving away the struct clk_hw pointer instead of the struct clk pointer so that when we try to create a per-user struct clk in clk_get_sys() we don't dereference a junk pointer. 
Signed-off-by: Russell King --- include/linux/clkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/clkdev.h b/include/linux/clkdev.h index 94bad77eeb4a..3003afad46c9 100644 --- a/include/linux/clkdev.h +++ b/include/linux/clkdev.h @@ -22,6 +22,7 @@ struct clk_lookup { const char *dev_id; const char *con_id; struct clk *clk; + struct clk_hw *clk_hw; }; #define CLKDEV_INIT(d, n, c) \ -- cgit v1.2.3 From d2d14a77886485310ec66e575f00ea5232ac7a14 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 14 Mar 2015 15:12:35 +0000 Subject: clk: update clk API documentation to clarify clk_round_rate() The idea is that rate = clk_round_rate(clk, r) is equivalent to: clk_set_rate(clk, r); rate = clk_get_rate(clk); except that clk_round_rate() does not change the hardware in any way. Signed-off-by: Russell King --- include/linux/clk.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk.h b/include/linux/clk.h index 68c16a6bedb3..cafb22df8d00 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -306,6 +306,20 @@ void devm_clk_put(struct device *dev, struct clk *clk); * @clk: clock source * @rate: desired clock rate in Hz * + * This answers the question "if I were to pass @rate to clk_set_rate(), + * what clock rate would I end up with?" without changing the hardware + * in any way. In other words: + * + * rate = clk_round_rate(clk, r); + * + * and: + * + * clk_set_rate(clk, r); + * rate = clk_get_rate(clk); + * + * are equivalent except the former does not modify the clock hardware + * in any way. + * * Returns rounded clock rate in Hz, or negative errno. 
*/ long clk_round_rate(struct clk *clk, unsigned long rate); -- cgit v1.2.3 From 2d34e507293102f29ee94d9a9c5b890696d42452 Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 9 Mar 2015 11:03:00 +0000 Subject: clkdev: get rid of redundant clk_add_alias() prototype in linux/clk.h clk_add_alias() is provided by clkdev, and is not part of the clk API. However, it is prototyped in two locations: linux/clkdev.h and linux/clk.h. This is a mess. Get rid of the redundant and unnecessary version in linux/clk.h. Acked-by: Tony Lindgren Tested-by: Robert Jarzmik Acked-by: Sekhar Nori Signed-off-by: Russell King --- include/linux/clk.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk.h b/include/linux/clk.h index cafb22df8d00..0df4a51e1a78 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -485,19 +485,6 @@ static inline void clk_disable_unprepare(struct clk *clk) clk_unprepare(clk); } -/** - * clk_add_alias - add a new clock alias - * @alias: name for clock alias - * @alias_dev_name: device name - * @id: platform specific clock name - * @dev: device - * - * Allows using generic clock names for drivers by adding a new alias. - * Assumes clkdev, see clkdev.h for more info. - */ -int clk_add_alias(const char *alias, const char *alias_dev_name, char *id, - struct device *dev); - struct device_node; struct of_phandle_args; -- cgit v1.2.3 From b3d8d7e89fab374d731dfb46fe048f09766ca9c8 Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 9 Mar 2015 10:43:04 +0000 Subject: clkdev: const-ify connection id to clk_add_alias() The connection id is only passed to clk_get() which is already const. Const-ify this argument too.
Signed-off-by: Russell King --- include/linux/clkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clkdev.h b/include/linux/clkdev.h index 3003afad46c9..cd93b215e3af 100644 --- a/include/linux/clkdev.h +++ b/include/linux/clkdev.h @@ -39,7 +39,7 @@ void clkdev_add(struct clk_lookup *cl); void clkdev_drop(struct clk_lookup *cl); void clkdev_add_table(struct clk_lookup *, size_t); -int clk_add_alias(const char *, const char *, char *, struct device *); +int clk_add_alias(const char *, const char *, const char *, struct device *); int clk_register_clkdev(struct clk *, const char *, const char *, ...); int clk_register_clkdevs(struct clk *, struct clk_lookup *, size_t); -- cgit v1.2.3 From 2568999835d7797afce3dcc3a3f368051ffcaf1f Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 2 Mar 2015 15:40:29 +0000 Subject: clkdev: add clkdev_create() helper Add a helper to allocate and add a clk_lookup structure. This can not only be used in several places in clkdev.c to simplify the code, but more importantly, can be used by callers of the clkdev code to simplify their clkdev creation and registration. 
Signed-off-by: Russell King --- include/linux/clkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clkdev.h b/include/linux/clkdev.h index cd93b215e3af..a240b18e86fa 100644 --- a/include/linux/clkdev.h +++ b/include/linux/clkdev.h @@ -38,6 +38,9 @@ struct clk_lookup *clkdev_alloc(struct clk *clk, const char *con_id, void clkdev_add(struct clk_lookup *cl); void clkdev_drop(struct clk_lookup *cl); +struct clk_lookup *clkdev_create(struct clk *clk, const char *con_id, + const char *dev_fmt, ...); + void clkdev_add_table(struct clk_lookup *, size_t); int clk_add_alias(const char *, const char *, const char *, struct device *); -- cgit v1.2.3 From efb0de55b6a2ec15fc424e660601f22ae2fa487a Mon Sep 17 00:00:00 2001 From: Shobhit Kumar Date: Tue, 5 May 2015 15:04:18 +0530 Subject: pwm: Add support to remove registered consumer lookup tables In case some drivers are unloading, they can remove lookup tables which they had registered during their load time to avoid redundant entries if loaded again. 
CC: Samuel Ortiz Cc: Linus Walleij Cc: Alexandre Courbot Cc: Thierry Reding Signed-off-by: Shobhit Kumar Signed-off-by: Thierry Reding --- include/linux/pwm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index e90628cac8fa..cfe2d8df5be0 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -290,10 +290,15 @@ struct pwm_lookup { #if IS_ENABLED(CONFIG_PWM) void pwm_add_table(struct pwm_lookup *table, size_t num); +void pwm_remove_table(struct pwm_lookup *table, size_t num); #else static inline void pwm_add_table(struct pwm_lookup *table, size_t num) { } + +static inline void pwm_remove_table(struct pwm_lookup *table, size_t num) +{ +} #endif #ifdef CONFIG_PWM_SYSFS -- cgit v1.2.3 From fa76a3db7093a527333c380df82a0f158d9b8299 Mon Sep 17 00:00:00 2001 From: Sonic Zhang Date: Thu, 9 Apr 2015 11:13:07 +0800 Subject: pinctrl: allow exlusive GPIO/mux pin allocation Disallow simultaneous use of the GPIO and peripheral mux functions by setting a flag "strict" in struct pinctrl_desc. The blackfin pinmux and gpio controller doesn't allow user to set up a pin for both GPIO and peripheral function. So, add flag strict in struct pinctrl_desc to check both gpio_owner and mux_owner before approving the pin request.
v2-changes: - if strict flag is set, check gpio_owner and mux_owner in if and else clause v3-changes: - add kerneldoc for this struct - augment Documentation/pinctrl.txt Signed-off-by: Sonic Zhang Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinctrl.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index 66e4697516de..fc6b0348c375 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -114,6 +114,8 @@ struct pinctrl_ops { * of the pins field above * @pctlops: pin control operation vtable, to support global concepts like * grouping of pins, this is optional. + * @strict: check both gpio_owner and mux_owner strictly before approving + the pin request * @pmxops: pinmux operations vtable, if you support pinmuxing in your driver * @confops: pin config operations vtable, if you support pin configuration in * your driver @@ -132,6 +134,7 @@ struct pinctrl_desc { const struct pinctrl_ops *pctlops; const struct pinmux_ops *pmxops; const struct pinconf_ops *confops; + bool strict; struct module *owner; #ifdef CONFIG_GENERIC_PINCONF unsigned int num_custom_params; -- cgit v1.2.3 From 8c4c2016345feefcd289ce2479eb70286d30825a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 6 May 2015 14:19:13 +0200 Subject: pinctrl: move strict option to pinmux_ops While the pinmux_ops are ideally just a vtable for pin mux calls, the "strict" setting belongs so intuitively with the pin multiplexing that we should move it here anyway. Putting it in the top pinctrl_desc makes no sense.
Cc: Sonic Zhang Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinctrl.h | 3 --- include/linux/pinctrl/pinmux.h | 4 ++++ 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index fc6b0348c375..66e4697516de 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -114,8 +114,6 @@ struct pinctrl_ops { * of the pins field above * @pctlops: pin control operation vtable, to support global concepts like * grouping of pins, this is optional. - * @strict: check both gpio_owner and mux_owner strictly before approving - the pin request * @pmxops: pinmux operations vtable, if you support pinmuxing in your driver * @confops: pin config operations vtable, if you support pin configuration in * your driver @@ -134,7 +132,6 @@ struct pinctrl_desc { const struct pinctrl_ops *pctlops; const struct pinmux_ops *pmxops; const struct pinconf_ops *confops; - bool strict; struct module *owner; #ifdef CONFIG_GENERIC_PINCONF unsigned int num_custom_params; diff --git a/include/linux/pinctrl/pinmux.h b/include/linux/pinctrl/pinmux.h index 511bda9ed4bf..d3740fa7073f 100644 --- a/include/linux/pinctrl/pinmux.h +++ b/include/linux/pinctrl/pinmux.h @@ -56,6 +56,9 @@ struct pinctrl_dev; * depending on whether the GPIO is configured as input or output, * a direction selector function may be implemented as a backing * to the GPIO controllers that need pin muxing. + * @strict: do not allow simultaneous use of the same pin for GPIO and another + * function. Check both gpio_owner and mux_owner strictly before approving + * the pin request. 
*/ struct pinmux_ops { int (*request) (struct pinctrl_dev *pctldev, unsigned offset); @@ -79,6 +82,7 @@ struct pinmux_ops { struct pinctrl_gpio_range *range, unsigned offset, bool input); + bool strict; }; #endif /* CONFIG_PINMUX */ -- cgit v1.2.3 From cac089f9026e9ddb3481daf08f0fc4e5949fa1af Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Thu, 23 Apr 2015 16:56:22 -0700 Subject: gpio: omap: Allow building as a loadable module We currently get all kinds of errors building the omap gpio driver as a module starting with: undefined reference to `omap2_gpio_resume_after_idle' undefined reference to `omap2_gpio_prepare_for_idle' ... Let's fix the issue by adding inline functions to the header. Note that we can now also remove the two unused functions for omap_set_gpio_debounce and omap_set_gpio_debounce_time. Then doing rmmod on the module produces further warnings because of missing exit related functions. Let's add those. And finally, we can make the Kconfig entry just a tristate option that's selected for omaps. 
Cc: Javier Martinez Canillas Cc: Kevin Hilman Cc: Nishanth Menon Signed-off-by: Tony Lindgren Reviewed-by: Grygorii Strashko Acked-by: Santosh Shilimkar Reviewed-by: Felipe Balbi Signed-off-by: Linus Walleij --- include/linux/platform_data/gpio-omap.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/gpio-omap.h b/include/linux/platform_data/gpio-omap.h index 5d50b25a73d7..cb2618147c34 100644 --- a/include/linux/platform_data/gpio-omap.h +++ b/include/linux/platform_data/gpio-omap.h @@ -208,9 +208,17 @@ struct omap_gpio_platform_data { int (*get_context_loss_count)(struct device *dev); }; +#if IS_BUILTIN(CONFIG_GPIO_OMAP) extern void omap2_gpio_prepare_for_idle(int off_mode); extern void omap2_gpio_resume_after_idle(void); -extern void omap_set_gpio_debounce(int gpio, int enable); -extern void omap_set_gpio_debounce_time(int gpio, int enable); +#else +static inline void omap2_gpio_prepare_for_idle(int off_mode) +{ +} + +static inline void omap2_gpio_resume_after_idle(void) +{ +} +#endif #endif -- cgit v1.2.3 From 66eb3bd857f5311f72c7c371f78ddc9c472befba Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 27 Apr 2015 18:04:05 +0200 Subject: pinctrl: use ERR_CAST instead of ERR_PTR/PTR_ERR Inspired by scripts/coccinelle/api/err_cast.cocci Signed-off-by: Fabian Frederick Signed-off-by: Linus Walleij --- include/linux/pinctrl/consumer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h index 18eccefea06e..d7e5d608faa7 100644 --- a/include/linux/pinctrl/consumer.h +++ b/include/linux/pinctrl/consumer.h @@ -142,7 +142,7 @@ static inline struct pinctrl * __must_check pinctrl_get_select( s = pinctrl_lookup_state(p, name); if (IS_ERR(s)) { pinctrl_put(p); - return ERR_PTR(PTR_ERR(s)); + return ERR_CAST(s); } ret = pinctrl_select_state(p, s); -- cgit v1.2.3 From 
1d6b98774cff82860a3f044610e956bcbff556c1 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 31 Mar 2015 15:55:57 +0200 Subject: tty: constify return type of tty_name All users of tty_name pass the result directly to a printf-like function. This means we can actually let tty_name return the literal "NULL tty" or tty->name directly, avoiding the strcpy and a lot of medium-sized stack buffers. In preparation for that, make the return type const char*. While at it, we can also constify the tty parameter. Signed-off-by: Rasmus Villemoes Reviewed-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index fe5623c9af71..4cbecfc7b3c9 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -421,7 +421,7 @@ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty) extern int tty_paranoia_check(struct tty_struct *tty, struct inode *inode, const char *routine); -extern char *tty_name(struct tty_struct *tty, char *buf); +extern const char *tty_name(const struct tty_struct *tty, char *buf); extern void tty_wait_until_sent(struct tty_struct *tty, long timeout); extern int tty_check_change(struct tty_struct *tty); extern void __stop_tty(struct tty_struct *tty); -- cgit v1.2.3 From 429b474990cb4e5e8cfe2352daf649d0599cccb6 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 31 Mar 2015 15:55:59 +0200 Subject: tty: remove buf parameter from tty_name() tty_name no longer uses the buf parameter, so remove it along with all the 64 byte stack buffers that used to be passed in. Mostly generated by the coccinelle script @depends on patch@ identifier buf; constant C; expression tty; @@ - char buf[C]; <+... - tty_name(tty, buf) + tty_name(tty) ...+> allmodconfig compiles, so I'm fairly confident the stack buffers weren't used for other purposes as well. 
Signed-off-by: Rasmus Villemoes Reviewed-by: Peter Hurley Acked-by: Jesper Nilsson Acked-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 4cbecfc7b3c9..9a72c9144d8a 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -421,7 +421,7 @@ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty) extern int tty_paranoia_check(struct tty_struct *tty, struct inode *inode, const char *routine); -extern const char *tty_name(const struct tty_struct *tty, char *buf); +extern const char *tty_name(const struct tty_struct *tty); extern void tty_wait_until_sent(struct tty_struct *tty, long timeout); extern int tty_check_change(struct tty_struct *tty); extern void __stop_tty(struct tty_struct *tty); -- cgit v1.2.3 From 6b3cddccf4eec0883feb065aea28dd9770bb17d0 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 11 Apr 2015 11:02:36 -0400 Subject: serial: core: Fix unused variable warnings from uart_console() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_SERIAL_CORE_CONSOLE=n, build warnings are generated by uart_console() macro expansion: drivers/tty/serial/of_serial.c: In function ‘of_serial_suspend_8250’: drivers/tty/serial/of_serial.c:262:20: warning: unused variable ‘port’ [-Wunused-variable] struct uart_port *port = &port8250->port; ^ drivers/tty/serial/of_serial.c: In function ‘of_serial_resume_8250’: drivers/tty/serial/of_serial.c:272:20: warning: unused variable ‘port’ [-Wunused-variable] struct uart_port *port = &port8250->port; Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 025dad9dcde4..297d4fa1cfe5 100644 --- 
a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -35,7 +35,7 @@ #define uart_console(port) \ ((port)->cons && (port)->cons->index == (port)->line) #else -#define uart_console(port) (0) +#define uart_console(port) ({ (void)port; 0; }) #endif struct uart_port; -- cgit v1.2.3 From 17799359e7b3fa6ef4f2bf926cd6821cf7903ecf Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Sat, 28 Feb 2015 02:13:11 -0800 Subject: mtd: nand_bbt: make nand_scan_bbt() static This implementation detail is no longer needed outside of nand_bbt.c. Signed-off-by: Brian Norris --- include/linux/mtd/nand.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 3d4ea7eb2b68..6c51876941f3 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -833,7 +833,6 @@ struct nand_manufacturers { extern struct nand_flash_dev nand_flash_ids[]; extern struct nand_manufacturers nand_manuf_ids[]; -extern int nand_scan_bbt(struct mtd_info *mtd, struct nand_bbt_descr *bd); extern int nand_default_bbt(struct mtd_info *mtd); extern int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); extern int nand_isreserved_bbt(struct mtd_info *mtd, loff_t offs); -- cgit v1.2.3 From 0097d12e504b3ce57b68810737ad6a5a64a98c68 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 30 Apr 2015 13:43:30 +0200 Subject: KVM: provide irq_unsafe kvm_guest_{enter|exit} Several kvm architectures disable interrupts before kvm_guest_enter. kvm_guest_enter then uses local_irq_save/restore to disable interrupts again or for the first time. Let's provide underscore versions of kvm_guest_{enter|exit} that assume being called locked. kvm_guest_enter now disables interrupts for the full function and thus we can remove the check for preemptible. This patch then adopts s390/kvm to use local_irq_disable/enable calls which are slightly cheaper than local_irq_save/restore and call these new functions.
Signed-off-by: Christian Borntraeger Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ad45054309a0..efc16df1fc5d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -762,16 +762,10 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm, } #endif -static inline void kvm_guest_enter(void) +/* must be called with irqs disabled */ +static inline void __kvm_guest_enter(void) { - unsigned long flags; - - BUG_ON(preemptible()); - - local_irq_save(flags); guest_enter(); - local_irq_restore(flags); - /* KVM does not hold any references to rcu protected data when it * switches CPU into a guest mode. In fact switching to a guest mode * is very similar to exiting to userspace from rcu point of view. In @@ -783,12 +777,27 @@ static inline void kvm_guest_enter(void) rcu_virt_note_context_switch(smp_processor_id()); } +/* must be called with irqs disabled */ +static inline void __kvm_guest_exit(void) +{ + guest_exit(); +} + +static inline void kvm_guest_enter(void) +{ + unsigned long flags; + + local_irq_save(flags); + __kvm_guest_enter(); + local_irq_restore(flags); +} + static inline void kvm_guest_exit(void) { unsigned long flags; local_irq_save(flags); - guest_exit(); + __kvm_guest_exit(); local_irq_restore(flags); } -- cgit v1.2.3 From 653f52c316a49c5ee2701bc13b15879f20790662 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 23 Apr 2015 11:52:37 -0400 Subject: kvm,x86: load guest FPU context more eagerly Currently KVM will clear the FPU bits in CR0.TS in the VMCS, and trap to re-load them every time the guest accesses the FPU after a switch back into the guest from the host. This patch copies the x86 task switch semantics for FPU loading, with the FPU loaded eagerly after first use if the system uses eager fpu mode, or if the guest uses the FPU frequently. 
In the latter case, after loading the FPU for 255 times, the fpu_counter will roll over, and we will revert to loading the FPU on demand, until it has been established that the guest is still actively using the FPU. This mirrors the x86 task switch policy, which seems to work. Signed-off-by: Rik van Riel Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index efc16df1fc5d..b7a08cd6f4a8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -230,6 +230,7 @@ struct kvm_vcpu { int fpu_active; int guest_fpu_loaded, guest_xcr0_loaded; + unsigned char fpu_counter; wait_queue_head_t wq; struct pid *pid; int sigset_active; -- cgit v1.2.3 From aed5ed47724f6a7453fa62e3c90f3cee93edbfe3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 6 May 2015 18:04:23 +0200 Subject: context_tracking: Protect against recursion Context tracking recursion can happen when an exception triggers in the middle of a call to a context tracking probe. This special case can be caused by vmalloc faults. If an access to a memory area allocated by vmalloc happens in the middle of context_tracking_enter(), we may run into an endless fault loop because the exception in turn calls context_tracking_enter() which faults on the same vmalloc'ed memory, triggering an exception again, etc... Some rare crashes have been reported so lets protect against this with a recursion counter. Reported-by: Dave Jones Signed-off-by: Frederic Weisbecker Reviewed-by: Rik van Riel Acked-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Chris Metcalf Cc: H. Peter Anvin Cc: Martin Schwidefsky Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Rafael J . 
Wysocki Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1430928266-24888-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/context_tracking_state.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 6b7b96a32b75..678ecdf90cf6 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -12,6 +12,7 @@ struct context_tracking { * may be further optimized using static keys. */ bool active; + int recursion; enum ctx_state { CONTEXT_KERNEL = 0, CONTEXT_USER, -- cgit v1.2.3 From fafe870f31212a72f3c2d74e7b90e4ef39e83ee1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 6 May 2015 18:04:24 +0200 Subject: context_tracking: Inherit TIF_NOHZ through forks instead of context switches TIF_NOHZ is used by context_tracking to force syscall slow-path on every task in order to track userspace roundtrips. As such, it must be set on all running tasks. It's currently explicitly inherited through context switches. There is no need to do it in this fast-path though. The flag could simply be set once for all on all tasks, whether they are running or not. Lets do this by setting the flag for the init task on early boot, and let it propagate through fork inheritance. While at it, mark context_tracking_cpu_set() as init code, we only need it at early boot time. Suggested-by: Oleg Nesterov Signed-off-by: Frederic Weisbecker Reviewed-by: Rik van Riel Cc: Borislav Petkov Cc: Chris Metcalf Cc: Dave Jones Cc: H. Peter Anvin Cc: Martin Schwidefsky Cc: Mike Galbraith Cc: Paul E . McKenney Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rafael J . 
Wysocki Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1430928266-24888-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/context_tracking.h | 10 ---------- include/linux/sched.h | 3 +++ 2 files changed, 3 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 2821838256b4..b96bd299966f 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -14,8 +14,6 @@ extern void context_tracking_enter(enum ctx_state state); extern void context_tracking_exit(enum ctx_state state); extern void context_tracking_user_enter(void); extern void context_tracking_user_exit(void); -extern void __context_tracking_task_switch(struct task_struct *prev, - struct task_struct *next); static inline void user_enter(void) { @@ -51,19 +49,11 @@ static inline void exception_exit(enum ctx_state prev_ctx) } } -static inline void context_tracking_task_switch(struct task_struct *prev, - struct task_struct *next) -{ - if (context_tracking_is_enabled()) - __context_tracking_task_switch(prev, next); -} #else static inline void user_enter(void) { } static inline void user_exit(void) { } static inline enum ctx_state exception_enter(void) { return 0; } static inline void exception_exit(enum ctx_state prev_ctx) { } -static inline void context_tracking_task_switch(struct task_struct *prev, - struct task_struct *next) { } #endif /* !CONFIG_CONTEXT_TRACKING */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a2e6122734..185a750e4ed4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2532,6 +2532,9 @@ static inline unsigned long wait_task_inactive(struct task_struct *p, } #endif +#define tasklist_empty() \ + list_empty(&init_task.tasks) + #define next_task(p) \ list_entry_rcu((p)->tasks.next, struct task_struct, tasks) -- cgit v1.2.3 From 83dedea8a07fb4bf91863764b15c1c4ec00330f9 Mon Sep 17 00:00:00 2001 From: Chris 
Metcalf Date: Wed, 6 May 2015 18:04:25 +0200 Subject: nohz: Add tick_nohz_full_add_cpus_to() API This API is useful to modify a cpumask indicating some special nohz-type functionality so that the nohz cores are automatically added to that set. Signed-off-by: Chris Metcalf Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Dave Jones Cc: H. Peter Anvin Cc: Martin Schwidefsky Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Rafael J. Wysocki Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1429024675-18938-1-git-send-email-cmetcalf@ezchip.com Link: http://lkml.kernel.org/r/1430928266-24888-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/tick.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index f8492da57ad3..4191b5623a28 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -134,6 +134,12 @@ static inline bool tick_nohz_full_cpu(int cpu) return cpumask_test_cpu(cpu, tick_nohz_full_mask); } +static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) +{ + if (tick_nohz_full_enabled()) + cpumask_or(mask, mask, tick_nohz_full_mask); +} + extern void __tick_nohz_full_check(void); extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_cpu(int cpu); @@ -142,6 +148,7 @@ extern void __tick_nohz_task_switch(struct task_struct *tsk); #else static inline bool tick_nohz_full_enabled(void) { return false; } static inline bool tick_nohz_full_cpu(int cpu) { return false; } +static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { } static inline void __tick_nohz_full_check(void) { } static inline void tick_nohz_full_kick_cpu(int cpu) { } static inline void tick_nohz_full_kick(void) { } -- cgit v1.2.3 From c6201cd8513db2db54b248a862672849ed9ccb82 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 7 May 2015 09:52:22 -0500 Subject: PCI/MSI: 
Remove unused pci_msi_off() pci_msi_off() is unused, so remove it. Removes the exported symbol pci_msi_off(). Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 353db8dc4c6e..50b7c7d0206f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -974,7 +974,6 @@ void pci_intx(struct pci_dev *dev, int enable); bool pci_intx_mask_supported(struct pci_dev *dev); bool pci_check_and_mask_intx(struct pci_dev *dev); bool pci_check_and_unmask_intx(struct pci_dev *dev); -void pci_msi_off(struct pci_dev *dev); int pci_set_dma_max_seg_size(struct pci_dev *dev, unsigned int size); int pci_set_dma_seg_boundary(struct pci_dev *dev, unsigned long mask); int pci_wait_for_pending(struct pci_dev *dev, int pos, u16 mask); -- cgit v1.2.3 From 385f83f85cd9428db82cae5e6f6f786be113b24c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 28 Apr 2015 12:21:40 +0200 Subject: dmaengine: Remove Renesas Audio DMAC peri peri platform data Commit 3cd44dcd35a6 ("dmaengine: remove Renesas Audio DMAC peri peri") forgot to remove the header file with the platform data definitions. Signed-off-by: Geert Uytterhoeven Acked-by: Simon Horman Signed-off-by: Vinod Koul --- include/linux/platform_data/dma-rcar-audmapp.h | 34 -------------------------- 1 file changed, 34 deletions(-) delete mode 100644 include/linux/platform_data/dma-rcar-audmapp.h (limited to 'include/linux') diff --git a/include/linux/platform_data/dma-rcar-audmapp.h b/include/linux/platform_data/dma-rcar-audmapp.h deleted file mode 100644 index 471fffebbeb4..000000000000 --- a/include/linux/platform_data/dma-rcar-audmapp.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * This is for Renesas R-Car Audio-DMAC-peri-peri. 
- * - * Copyright (C) 2014 Renesas Electronics Corporation - * Copyright (C) 2014 Kuninori Morimoto - * - * This file is based on the include/linux/sh_dma.h - * - * Header for the new SH dmaengine driver - * - * Copyright (C) 2010 Guennadi Liakhovetski - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#ifndef SH_AUDMAPP_H -#define SH_AUDMAPP_H - -#include - -struct audmapp_slave_config { - int slave_id; - dma_addr_t src; - dma_addr_t dst; - u32 chcr; -}; - -struct audmapp_pdata { - struct audmapp_slave_config *slave; - int slave_num; -}; - -#endif /* SH_AUDMAPP_H */ -- cgit v1.2.3 From 3289bdb429884c0279bf9ab72dff7b934f19dfc6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 14 Apr 2015 13:19:42 +0200 Subject: sched: Move the loadavg code to a more obvious location I could not find the loadavg code.. turns out it was hidden in a file called proc.c. It further got mingled up with the cruft per rq load indexes (which we really want to get rid of). Move the per rq load indexes into the fair.c load-balance code (that's the only thing that uses them) and rename proc.c to loadavg.c so we can find it again. Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Paul Gortmaker Cc: Thomas Gleixner [ Did minor cleanups to the code. 
] Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a2e6122734..85cf253bc366 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -173,7 +173,12 @@ extern unsigned long nr_iowait_cpu(int cpu); extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); extern void calc_global_load(unsigned long ticks); + +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void update_cpu_load_nohz(void); +#else +static inline void update_cpu_load_nohz(void) { } +#endif extern unsigned long get_parent_ip(unsigned long addr); -- cgit v1.2.3 From b76808e6808e34e7e78131d2b8cb0535622b8e9f Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 30 Apr 2015 21:19:57 -0700 Subject: signals, sched: Change all uses of JOBCTL_* from 'int' to 'long' c56fb6564dcd ("Fix a misaligned load inside ptrace_attach()") makes jobctl an "unsigned long". It makes sense to have the masks applied to it match that type. This is currently just a cosmetic change, but it will prevent the mask from being unexpectedly truncated if we ever end up with masks with more bits. One instance of "signr" is an int, but I left this alone because the mask ensures that it will never overflow. Signed-off-by: Palmer Dabbelt Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chris Metcalf Cc: Andrew Morton Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: bobby.prani@gmail.com Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: richard@nod.at Cc: vdavydov@parallels.com Link: http://lkml.kernel.org/r/1430453997-32459-4-git-send-email-palmer@dabbelt.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 85cf253bc366..4f066cb625ad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2082,22 +2082,22 @@ TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ -#define JOBCTL_STOP_DEQUEUED (1 << JOBCTL_STOP_DEQUEUED_BIT) -#define JOBCTL_STOP_PENDING (1 << JOBCTL_STOP_PENDING_BIT) -#define JOBCTL_STOP_CONSUME (1 << JOBCTL_STOP_CONSUME_BIT) -#define JOBCTL_TRAP_STOP (1 << JOBCTL_TRAP_STOP_BIT) -#define JOBCTL_TRAP_NOTIFY (1 << JOBCTL_TRAP_NOTIFY_BIT) -#define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT) -#define JOBCTL_LISTENING (1 << JOBCTL_LISTENING_BIT) +#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) +#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) +#define JOBCTL_STOP_CONSUME (1UL << JOBCTL_STOP_CONSUME_BIT) +#define JOBCTL_TRAP_STOP (1UL << JOBCTL_TRAP_STOP_BIT) +#define JOBCTL_TRAP_NOTIFY (1UL << JOBCTL_TRAP_NOTIFY_BIT) +#define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT) +#define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT) #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) extern bool task_set_jobctl_pending(struct task_struct *task, - unsigned int mask); + unsigned long mask); extern void task_clear_jobctl_trapping(struct task_struct *task); extern void task_clear_jobctl_pending(struct task_struct *task, - unsigned int mask); + unsigned long mask); static 
inline void rcu_copy_process(struct task_struct *p) { -- cgit v1.2.3 From 7e60598785f30cf3dc9e476cc0fc3feeb37a0c63 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 30 Apr 2015 21:19:56 -0700 Subject: sched/wait: Change wait_on_bit*() to take an unsigned long *, not a void * The implementations of wait_on_bit*() will only work with long-aligned memory on systems that don't support misaligned loads and stores. This patch changes the function prototypes to ensure that the compiler will enforce alignment. Running make defconfig make KFLAGS="-Werror" seems to indicate that, as of c56fb6564dcd ("Fix a misaligned load inside ptrace_attach()"), there are now no users of non-long-aligned calls to wait_on_bit*(). I additionally tried a few "make randconfig" attempts, none of which failed to compile for this reason. Signed-off-by: Palmer Dabbelt Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chris Metcalf Cc: Andrew Morton Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: bobby.prani@gmail.com Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: richard@nod.at Cc: vdavydov@parallels.com Link: http://lkml.kernel.org/r/1430453997-32459-3-git-send-email-palmer@dabbelt.com Signed-off-by: Ingo Molnar --- include/linux/wait.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 2db83349865b..d69ac4ecc88b 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -969,7 +969,7 @@ extern int bit_wait_io_timeout(struct wait_bit_key *); * on that signal. */ static inline int -wait_on_bit(void *word, int bit, unsigned mode) +wait_on_bit(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_bit(bit, word)) @@ -994,7 +994,7 @@ wait_on_bit(void *word, int bit, unsigned mode) * on that signal. 
*/ static inline int -wait_on_bit_io(void *word, int bit, unsigned mode) +wait_on_bit_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_bit(bit, word)) @@ -1020,7 +1020,8 @@ wait_on_bit_io(void *word, int bit, unsigned mode) * received a signal and the mode permitted wakeup on that signal. */ static inline int -wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout) +wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode, + unsigned long timeout) { might_sleep(); if (!test_bit(bit, word)) @@ -1047,7 +1048,8 @@ wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout) * on that signal. */ static inline int -wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) +wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action, + unsigned mode) { might_sleep(); if (!test_bit(bit, word)) @@ -1075,7 +1077,7 @@ wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode * the @mode allows that signal to wake the process. */ static inline int -wait_on_bit_lock(void *word, int bit, unsigned mode) +wait_on_bit_lock(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) @@ -1099,7 +1101,7 @@ wait_on_bit_lock(void *word, int bit, unsigned mode) * the @mode allows that signal to wake the process. */ static inline int -wait_on_bit_lock_io(void *word, int bit, unsigned mode) +wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) @@ -1125,7 +1127,8 @@ wait_on_bit_lock_io(void *word, int bit, unsigned mode) * the @mode allows that signal to wake the process. 
*/ static inline int -wait_on_bit_lock_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) +wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action, + unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) -- cgit v1.2.3 From e7cc4173115347bcdaa5de2824dd46ef2c58425f Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 30 Apr 2015 21:19:55 -0700 Subject: signals, ptrace, sched: Fix a misaligned load inside ptrace_attach() The misaligned load exception arises when running ptrace_attach() on the RISC-V (which hasn't been upstreamed yet). The problem is that wait_on_bit() takes a void* but then proceeds to call test_bit(), which takes a long*. This allows an int-aligned pointer to be passed to test_bit(), which promptly fails. This will manifest on any other asm-generic port where unaligned loads trap, where sizeof(long) > sizeof(int), and where task_struct.jobctl ends up not being long-aligned. This patch changes task_struct.jobctl to be a long, which ensures it has the correct alignment. Signed-off-by: Palmer Dabbelt Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chris Metcalf Cc: Andrew Morton Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: bobby.prani@gmail.com Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: richard@nod.at Cc: vdavydov@parallels.com Link: http://lkml.kernel.org/r/1430453997-32459-2-git-send-email-palmer@dabbelt.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4f066cb625ad..fb650a2f4a73 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1374,7 +1374,7 @@ struct task_struct { int exit_state; int exit_code, exit_signal; int pdeath_signal; /* The signal sent when the parent dies */ - unsigned int jobctl; /* JOBCTL_*, siglock protected */ + unsigned long jobctl; /* JOBCTL_*, siglock protected */ /* Used for emulating ABI behavior of previous Linux versions */ unsigned int personality; -- cgit v1.2.3 From 316c1608d15c736439d4065ed12f306db554b3da Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Apr 2015 13:00:20 -0700 Subject: sched, timer: Convert usages of ACCESS_ONCE() in the scheduler to READ_ONCE()/WRITE_ONCE() ACCESS_ONCE doesn't work reliably on non-scalar types. This patch removes the rest of the existing usages of ACCESS_ONCE() in the scheduler, and use the new READ_ONCE() and WRITE_ONCE() APIs as appropriate. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Acked-by: Rik van Riel Acked-by: Waiman Long Cc: Andrew Morton Cc: Aswin Chandramouleeswaran Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. 
McKenney Cc: Preeti U Murthy Cc: Scott J Norton Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1430251224-5764-2-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index fb650a2f4a73..d70910355b20 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3085,13 +3085,13 @@ static inline void mm_update_next_owner(struct mm_struct *mm) static inline unsigned long task_rlimit(const struct task_struct *tsk, unsigned int limit) { - return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur); + return READ_ONCE(tsk->signal->rlim[limit].rlim_cur); } static inline unsigned long task_rlimit_max(const struct task_struct *tsk, unsigned int limit) { - return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max); + return READ_ONCE(tsk->signal->rlim[limit].rlim_max); } static inline unsigned long rlimit(unsigned int limit) -- cgit v1.2.3 From 1018016c706f7ff9f56fde3a649789c47085a293 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Apr 2015 13:00:22 -0700 Subject: sched, timer: Replace spinlocks with atomics in thread_group_cputimer(), to improve scalability While running a database workload, we found a scalability issue with itimers. Much of the problem was caused by the thread_group_cputimer spinlock. Each time we account for group system/user time, we need to obtain a thread_group_cputimer's spinlock to update the timers. On larger systems (such as a 16 socket machine), this caused more than 30% of total time spent trying to obtain this kernel lock to update these group timer stats. This patch converts the timers to 64-bit atomic variables and use atomic add to update them without a lock. With this patch, the percent of total time spent updating thread group cputimer timers was reduced from 30% down to less than 1%. 
Note: On 32-bit systems using the generic 64-bit atomics, this causes sample_group_cputimer() to take locks 3 times instead of just 1 time. However, we tested this patch on a 32-bit ARM system using the generic atomics and did not find the overhead to be much of an issue. An explanation for why this isn't an issue is that 32-bit systems usually have small numbers of CPUs, and cacheline contention from extra spinlocks called periodically is not really apparent on smaller systems. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Andrew Morton Cc: Aswin Chandramouleeswaran Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Scott J Norton Cc: Steven Rostedt Cc: Waiman Long Link: http://lkml.kernel.org/r/1430251224-5764-4-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 7 ++++--- include/linux/sched.h | 10 +++------- 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 696d22312b31..7b9d8b59e7bf 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -50,9 +50,10 @@ extern struct fs_struct init_fs; .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ .rlim = INIT_RLIMITS, \ .cputimer = { \ - .cputime = INIT_CPUTIME, \ - .running = 0, \ - .lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \ + .utime = ATOMIC64_INIT(0), \ + .stime = ATOMIC64_INIT(0), \ + .sum_exec_runtime = ATOMIC64_INIT(0), \ + .running = 0 \ }, \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index d70910355b20..a45874c3fab6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -598,9 +598,10 @@ struct task_cputime { * 
used for thread group CPU timer calculations. */ struct thread_group_cputimer { - struct task_cputime cputime; + atomic64_t utime; + atomic64_t stime; + atomic64_t sum_exec_runtime; int running; - raw_spinlock_t lock; }; #include @@ -2967,11 +2968,6 @@ static __always_inline bool need_resched(void) void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); -static inline void thread_group_cputime_init(struct signal_struct *sig) -{ - raw_spin_lock_init(&sig->cputimer.lock); -} - /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. -- cgit v1.2.3 From 971e8a985482c76487edb5a49811e99b96e846e1 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Apr 2015 13:00:23 -0700 Subject: sched, timer: Provide an atomic 'struct task_cputime' data structure This patch adds an atomic variant of the 'struct task_cputime' data structure, which can be used to store and update task_cputime statistics without needing to do locking. Suggested-by: Ingo Molnar Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Andrew Morton Cc: Aswin Chandramouleeswaran Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Scott J Norton Cc: Steven Rostedt Cc: Waiman Long Link: http://lkml.kernel.org/r/1430251224-5764-5-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index a45874c3fab6..6eb78cd45da7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -572,6 +572,23 @@ struct task_cputime { .sum_exec_runtime = 0, \ } +/* + * This is the atomic variant of task_cputime, which can be used for + * storing and updating task_cputime statistics without locking. + */ +struct task_cputime_atomic { + atomic64_t utime; + atomic64_t stime; + atomic64_t sum_exec_runtime; +}; + +#define INIT_CPUTIME_ATOMIC \ + (struct task_cputime_atomic) { \ + .utime = ATOMIC64_INIT(0), \ + .stime = ATOMIC64_INIT(0), \ + .sum_exec_runtime = ATOMIC64_INIT(0), \ + } + #ifdef CONFIG_PREEMPT_COUNT #define PREEMPT_DISABLED (1 + PREEMPT_ENABLED) #else -- cgit v1.2.3 From 7110744516276e906f9197e2857d026eb2343393 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Apr 2015 13:00:24 -0700 Subject: sched, timer: Use the atomic task_cputime in thread_group_cputimer Recent optimizations were made to thread_group_cputimer to improve its scalability by keeping track of cputime stats without a lock. However, the values were open coded to the structure, causing them to be at a different abstraction level from the regular task_cputime structure. Furthermore, any subsequent similar optimizations would not be able to share the new code, since they are specific to thread_group_cputimer. This patch adds the new task_cputime_atomic data structure (introduced in the previous patch in the series) to thread_group_cputimer for keeping track of the cputime atomically, which also helps generalize the code. 
Suggested-by: Ingo Molnar Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Andrew Morton Cc: Aswin Chandramouleeswaran Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Scott J Norton Cc: Steven Rostedt Cc: Waiman Long Link: http://lkml.kernel.org/r/1430251224-5764-6-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 6 ++---- include/linux/sched.h | 4 +--- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 7b9d8b59e7bf..bb9b075f0eb0 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -50,10 +50,8 @@ extern struct fs_struct init_fs; .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ .rlim = INIT_RLIMITS, \ .cputimer = { \ - .utime = ATOMIC64_INIT(0), \ - .stime = ATOMIC64_INIT(0), \ - .sum_exec_runtime = ATOMIC64_INIT(0), \ - .running = 0 \ + .cputime_atomic = INIT_CPUTIME_ATOMIC, \ + .running = 0, \ }, \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 6eb78cd45da7..4adc536a3b03 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -615,9 +615,7 @@ struct task_cputime_atomic { * used for thread group CPU timer calculations. 
*/ struct thread_group_cputimer { - atomic64_t utime; - atomic64_t stime; - atomic64_t sum_exec_runtime; + struct task_cputime_atomic cputime_atomic; int running; }; -- cgit v1.2.3 From 7675104990ed255b9315a82ae827ff312a2a88a2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 1 May 2015 08:27:50 -0700 Subject: sched: Implement lockless wake-queues This is useful for locking primitives that can effect multiple wakeups per operation and want to avoid lock internal lock contention by delaying the wakeups until we've released the lock internal locks. Alternatively it can be used to avoid issuing multiple wakeups, and thus save a few cycles, in packet processing. Queue all target tasks and wakeup once you've processed all packets. That way you avoid waking the target task multiple times if there were multiple packets for the same task. Properties of a wake_q are: - Lockless, as queue head must reside on the stack. - Being a queue, maintains wakeup order passed by the callers. This can be important for otherwise, in scenarios where highly contended locks could affect any reliance on lock fairness. - A queued task cannot be added again until it is woken up. This patch adds the needed infrastructure into the scheduler code and uses the new wake_list to delay the futex wakeups until after we've released the hash bucket locks. Signed-off-by: Peter Zijlstra (Intel) [tweaks, adjustments, comments, etc.] Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Cc: Borislav Petkov Cc: Chris Mason Cc: Davidlohr Bueso Cc: George Spelvin Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Manfred Spraul Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1430494072-30283-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- include/linux/sched.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4adc536a3b03..254d88e80f65 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -920,6 +920,50 @@ enum cpu_idle_type { #define SCHED_CAPACITY_SHIFT 10 #define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) +/* + * Wake-queues are lists of tasks with a pending wakeup, whose + * callers have already marked the task as woken internally, + * and can thus carry on. A common use case is being able to + * do the wakeups once the corresponding user lock as been + * released. + * + * We hold reference to each task in the list across the wakeup, + * thus guaranteeing that the memory is still valid by the time + * the actual wakeups are performed in wake_up_q(). + * + * One per task suffices, because there's never a need for a task to be + * in two wake queues simultaneously; it is forbidden to abandon a task + * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is + * already in a wake queue, the wakeup will happen soon and the second + * waker can just skip it. + * + * The WAKE_Q macro declares and initializes the list head. + * wake_up_q() does NOT reinitialize the list; it's expected to be + * called near the end of a function, where the fact that the queue is + * not used again will be easy to see by inspection. + * + * Note that this can cause spurious wakeups. schedule() callers + * must ensure the call is done inside a loop, confirming that the + * wakeup condition has in fact occurred. 
+ */ +struct wake_q_node { + struct wake_q_node *next; +}; + +struct wake_q_head { + struct wake_q_node *first; + struct wake_q_node **lastp; +}; + +#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01) + +#define WAKE_Q(name) \ + struct wake_q_head name = { WAKE_Q_TAIL, &name.first } + +extern void wake_q_add(struct wake_q_head *head, + struct task_struct *task); +extern void wake_up_q(struct wake_q_head *head); + /* * sched-domains (multiprocessor balancing) declarations: */ @@ -1532,6 +1576,8 @@ struct task_struct { /* Protection of the PI data structures: */ raw_spinlock_t pi_lock; + struct wake_q_node wake_q; + #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task */ struct rb_root pi_waiters; -- cgit v1.2.3 From ff303e66c240ba6269e31817a386995440a18c99 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Apr 2015 20:05:30 +0200 Subject: perf: Fix software migrate events Stephane asked about PERF_COUNT_SW_CPU_MIGRATIONS and I realized it was borken: > The problem is that the task isn't actually scheduled while its being > migrated (obviously), and if its not scheduled, the counters aren't > scheduled either, so there's no observing of the fact. > > A further problem with migrations is that many migrations happen from > softirq context, which is nested inside the 'random' task context of > whoemever happens to run at that time, similarly for the wakeup > migrations triggered from (soft)irq context. All those end up being > accounted in the task that's currently running, eg. your 'ls'. The below cures this by marking a task as migrated and accounting it on the subsequent sched_in(). Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 24 ++++++++++++++++++++++++ include/linux/sched.h | 7 ++++--- 2 files changed, 28 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 61992cf2e977..e86f85abeda7 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -798,11 +798,33 @@ perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) extern struct static_key_deferred perf_sched_events; +static __always_inline bool +perf_sw_migrate_enabled(void) +{ + if (static_key_false(&perf_swevent_enabled[PERF_COUNT_SW_CPU_MIGRATIONS])) + return true; + return false; +} + +static inline void perf_event_task_migrate(struct task_struct *task) +{ + if (perf_sw_migrate_enabled()) + task->sched_migrated = 1; +} + static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { if (static_key_false(&perf_sched_events.key)) __perf_event_task_sched_in(prev, task); + + if (perf_sw_migrate_enabled() && task->sched_migrated) { + struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]); + + perf_fetch_caller_regs(regs); + ___perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, regs, 0); + task->sched_migrated = 0; + } } static inline void perf_event_task_sched_out(struct task_struct *prev, @@ -925,6 +947,8 @@ perf_aux_output_skip(struct perf_output_handle *handle, static inline void * perf_get_aux(struct perf_output_handle *handle) { return NULL; } static inline void +perf_event_task_migrate(struct task_struct *task) { } +static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { } static inline void diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a2e6122734..2c5e6c3db654 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1356,9 +1356,6 @@ struct task_struct { #endif struct mm_struct *mm, 
*active_mm; -#ifdef CONFIG_COMPAT_BRK - unsigned brk_randomized:1; -#endif /* per-thread vma caching */ u32 vmacache_seqnum; struct vm_area_struct *vmacache[VMACACHE_SIZE]; @@ -1381,10 +1378,14 @@ struct task_struct { /* Revert to default priority/policy when forking */ unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; + unsigned sched_migrated:1; #ifdef CONFIG_MEMCG_KMEM unsigned memcg_kmem_skip_account:1; #endif +#ifdef CONFIG_COMPAT_BRK + unsigned brk_randomized:1; +#endif unsigned long atomic_flags; /* Flags needing atomic access. */ -- cgit v1.2.3 From 59aabfc7e959f5f213e4e5cc7567ab4934da2adf Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 30 Apr 2015 17:12:16 -0400 Subject: locking/rwsem: Reduce spinlock contention in wakeup after up_read()/up_write() In up_write()/up_read(), rwsem_wake() will be called whenever it detects that some writers/readers are waiting. The rwsem_wake() function will take the wait_lock and call __rwsem_do_wake() to do the real wakeup. For a heavily contended rwsem, doing a spin_lock() on wait_lock will cause further contention on the heavily contended rwsem cacheline resulting in delay in the completion of the up_read/up_write operations. This patch makes the wait_lock taking and the call to __rwsem_do_wake() optional if at least one spinning writer is present. The spinning writer will be able to take the rwsem and call rwsem_wake() later when it calls up_write(). With the presence of a spinning writer, rwsem_wake() will now try to acquire the lock using trylock. If that fails, it will just quit. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Davidlohr Bueso Acked-by: Jason Low Cc: Andrew Morton Cc: Borislav Petkov Cc: Douglas Hatch Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1430428337-16802-2-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar --- include/linux/osq_lock.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h index 3a6490e81b28..703ea5c30a33 100644 --- a/include/linux/osq_lock.h +++ b/include/linux/osq_lock.h @@ -32,4 +32,9 @@ static inline void osq_lock_init(struct optimistic_spin_queue *lock) extern bool osq_lock(struct optimistic_spin_queue *lock); extern void osq_unlock(struct optimistic_spin_queue *lock); +static inline bool osq_is_locked(struct optimistic_spin_queue *lock) +{ + return atomic_read(&lock->tail) != OSQ_UNLOCKED_VAL; +} + #endif -- cgit v1.2.3 From 663fdcbee0a656cdaef934e7f50e6c2670373bc9 Mon Sep 17 00:00:00 2001 From: Preeti U Murthy Date: Thu, 30 Apr 2015 17:27:21 +0530 Subject: kernel: Replace reference to ASSIGN_ONCE() with WRITE_ONCE() in comment Looks like commit : 43239cbe79fc ("kernel: Change ASSIGN_ONCE(val, x) to WRITE_ONCE(x, val)") left behind a reference to ASSIGN_ONCE(). Update this to WRITE_ONCE(). Signed-off-by: Preeti U Murthy Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: borntraeger@de.ibm.com Cc: dave@stgolabs.net Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/20150430115721.22278.94082.stgit@preeti.in.ibm.com Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 867722591be2..a7c0941d10da 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -450,7 +450,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s * with an explicit memory barrier or atomic instruction that provides the * required ordering. 
* - * If possible use READ_ONCE/ASSIGN_ONCE instead. + * If possible use READ_ONCE()/WRITE_ONCE() instead. */ #define __ACCESS_ONCE(x) ({ \ __maybe_unused typeof(x) __var = (__force typeof(x)) 0; \ -- cgit v1.2.3 From 56f13c0d9524c5816f5dc9c91b9d766d6b1064ca Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 9 Apr 2015 12:35:47 +0300 Subject: dmaengine: of_dma: Support for DMA routers DMA routers are transparent devices used to mux DMA requests from peripherals to DMA controllers. They are used when the SoC integrates more devices with DMA requests than their controller can handle. DRA7x is one example of such SoC, where the sDMA can handle 128 DMA request lines, but in SoC level it has 205 DMA requests. The of_dma_router will be registered as of_dma_controller with special xlate function and additional parameters. The driver for the router is responsible to craft the dma_spec (in the of_dma_route_allocate callback) which can be used to request a DMA channel from the real DMA controller. This way the router can be transparent for the system while remaining generic enough to be used in different environments.
Signed-off-by: Peter Ujfalusi Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 17 +++++++++++++++++ include/linux/of_dma.h | 21 +++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index ad419757241f..abf63ceabef9 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -221,6 +221,16 @@ struct dma_chan_percpu { unsigned long bytes_transferred; }; +/** + * struct dma_router - DMA router structure + * @dev: pointer to the DMA router device + * @route_free: function to be called when the route can be disconnected + */ +struct dma_router { + struct device *dev; + void (*route_free)(struct device *dev, void *route_data); +}; + /** * struct dma_chan - devices supply DMA channels, clients use them * @device: ptr to the dma device who supplies this channel, always !%NULL @@ -232,6 +242,8 @@ struct dma_chan_percpu { * @local: per-cpu pointer to a struct dma_chan_percpu * @client_count: how many clients are using this channel * @table_count: number of appearances in the mem-to-mem allocation table + * @router: pointer to the DMA router structure + * @route_data: channel specific data for the router * @private: private data for certain client-channel associations */ struct dma_chan { @@ -247,6 +259,11 @@ struct dma_chan { struct dma_chan_percpu __percpu *local; int client_count; int table_count; + + /* DMA router */ + struct dma_router *router; + void *route_data; + void *private; }; diff --git a/include/linux/of_dma.h b/include/linux/of_dma.h index 56bc026c143f..98ba7525929e 100644 --- a/include/linux/of_dma.h +++ b/include/linux/of_dma.h @@ -23,6 +23,9 @@ struct of_dma { struct device_node *of_node; struct dma_chan *(*of_dma_xlate) (struct of_phandle_args *, struct of_dma *); + void *(*of_dma_route_allocate) + (struct of_phandle_args *, struct of_dma *); + struct dma_router *dma_router; void *of_dma_data; }; @@ -37,12 +40,20 @@ extern int 
of_dma_controller_register(struct device_node *np, (struct of_phandle_args *, struct of_dma *), void *data); extern void of_dma_controller_free(struct device_node *np); + +extern int of_dma_router_register(struct device_node *np, + void *(*of_dma_route_allocate) + (struct of_phandle_args *, struct of_dma *), + struct dma_router *dma_router); +#define of_dma_router_free of_dma_controller_free + extern struct dma_chan *of_dma_request_slave_channel(struct device_node *np, const char *name); extern struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_spec, struct of_dma *ofdma); extern struct dma_chan *of_dma_xlate_by_chan_id(struct of_phandle_args *dma_spec, struct of_dma *ofdma); + #else static inline int of_dma_controller_register(struct device_node *np, struct dma_chan *(*of_dma_xlate) @@ -56,6 +67,16 @@ static inline void of_dma_controller_free(struct device_node *np) { } +static inline int of_dma_router_register(struct device_node *np, + void *(*of_dma_route_allocate) + (struct of_phandle_args *, struct of_dma *), + struct dma_router *dma_router) +{ + return -ENODEV; +} + +#define of_dma_router_free of_dma_controller_free + static inline struct dma_chan *of_dma_request_slave_channel(struct device_node *np, const char *name) { -- cgit v1.2.3 From 4ae92bc77ac8e620f7c8d59b5882a4cb0d1c4ef1 Mon Sep 17 00:00:00 2001 From: Nicolas Schichan Date: Wed, 6 May 2015 16:12:27 +0200 Subject: net: filter: add a callback to allow classic post-verifier transformations This is in preparation for use by the seccomp code, the rationale is not to duplicate additional code within the seccomp layer, but instead, have it abstracted and hidden within the classic BPF API. As an interim step, this now also makes bpf_prepare_filter() visible (not as exported symbol though), so that seccomp can reuse that code path instead of reimplementing it. Joint work with Daniel Borkmann. 
Signed-off-by: Nicolas Schichan Signed-off-by: Daniel Borkmann Cc: Alexei Starovoitov Cc: Kees Cook Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index fa11b3a367be..91996247cb55 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -384,7 +384,13 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_detach_filter(struct sock *sk); +typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter, + unsigned int flen); + int bpf_check_classic(const struct sock_filter *filter, unsigned int flen); +struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, + bpf_aux_classic_check_t trans); + int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned int len); -- cgit v1.2.3 From d9e12f42e58da475379b9080708b94f2095904af Mon Sep 17 00:00:00 2001 From: Nicolas Schichan Date: Wed, 6 May 2015 16:12:28 +0200 Subject: seccomp: simplify seccomp_prepare_filter and reuse bpf_prepare_filter Remove the calls to bpf_check_classic(), bpf_convert_filter() and bpf_migrate_runtime() and let bpf_prepare_filter() take care of that instead. seccomp_check_filter() is passed to bpf_prepare_filter() so that it gets called from there, after bpf_check_classic(). We can now remove exposure of two internal classic BPF functions previously used by seccomp. The export of bpf_check_classic() symbol, previously known as sk_chk_filter(), was there since pre git times, and no in-tree module was using it, therefore remove it. Joint work with Daniel Borkmann. Signed-off-by: Nicolas Schichan Signed-off-by: Daniel Borkmann Cc: Alexei Starovoitov Cc: Kees Cook Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/filter.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 91996247cb55..0dcb44bcfc5f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -363,9 +363,6 @@ int sk_filter(struct sock *sk, struct sk_buff *skb); void bpf_prog_select_runtime(struct bpf_prog *fp); void bpf_prog_free(struct bpf_prog *fp); -int bpf_convert_filter(struct sock_filter *prog, int len, - struct bpf_insn *new_prog, int *new_len); - struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); @@ -387,7 +384,6 @@ int sk_detach_filter(struct sock *sk); typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter, unsigned int flen); -int bpf_check_classic(const struct sock_filter *filter, unsigned int flen); struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, bpf_aux_classic_check_t trans); -- cgit v1.2.3 From ac67eb2c5347bd9976308c0e0cf1d9e7ca690342 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 6 May 2015 16:12:30 +0200 Subject: seccomp, filter: add and use bpf_prog_create_from_user from seccomp Seccomp has always been a special candidate when it comes to preparation of its filters in seccomp_prepare_filter(). Due to the extra checks and filter rewrite it partially duplicates code and has BPF internals exposed. This patch adds a generic API inside the BPF code that seccomp can use and thus keep its filter preparation code minimal and better maintainable. The other side-effect is that now classic JITs can add seccomp support as well by only providing a BPF_LDX | BPF_W | BPF_ABS translation. Tested with seccomp and BPF test suites. Signed-off-by: Daniel Borkmann Cc: Nicolas Schichan Cc: Alexei Starovoitov Cc: Kees Cook Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- include/linux/filter.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 0dcb44bcfc5f..3c03a6085b82 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -374,19 +374,17 @@ static inline void bpf_prog_unlock_free(struct bpf_prog *fp) __bpf_prog_free(fp); } +typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter, + unsigned int flen); + int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); +int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, + bpf_aux_classic_check_t trans); void bpf_prog_destroy(struct bpf_prog *fp); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_detach_filter(struct sock *sk); - -typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter, - unsigned int flen); - -struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, - bpf_aux_classic_check_t trans); - int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned int len); -- cgit v1.2.3 From 59324cf35aba5336b611074028777838a963d03b Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:53 +0200 Subject: netlink: allow to listen "all" netns More accurately, listen all netns that have a nsid assigned into the netns where the netlink socket is opened. For this purpose, a netlink socket option is added: NETLINK_LISTEN_ALL_NSID. When this option is set on a netlink socket, this socket will receive netlink notifications from all netns that have a nsid assigned into the netns where the socket has been opened. The nsid is sent to userland via ancillary data. With this patch, a daemon needs only one socket to listen many netns. This is useful when the number of netns is high. Because 0 is a valid value for a nsid, the field nsid_is_set indicates if the field nsid is valid or not.
skb->cb is initialized to 0 on skb allocation, thus we are sure that we will never send a nsid 0 by error to the userland. Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/netlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 6835c1279df7..9120edb650a0 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -28,6 +28,8 @@ struct netlink_skb_parms { __u32 dst_group; __u32 flags; struct sock *sk; + bool nsid_is_set; + int nsid; }; #define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb)) -- cgit v1.2.3 From 920ce39f6c204d4ce4d8acebe7522f0dfa95f662 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Fri, 8 May 2015 14:31:50 -0700 Subject: sched, timer: Fix documentation for 'struct thread_group_cputimer' Fix the docbook build bug reported by Fengguang Wu. Reported-by: Fengguang Wu Signed-off-by: Jason Low Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hp.com Cc: aswin@hp.com Cc: bp@alien8.de Cc: dave@stgolabs.net Cc: fweisbec@gmail.com Cc: mgorman@suse.de Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Cc: rostedt@goodmis.org Cc: scott.norton@hp.com Cc: torvalds@linux-foundation.org Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/1431120710.5136.12.camel@j-VirtualBox Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 254d88e80f65..0eceeec5a01a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -606,10 +606,9 @@ struct task_cputime_atomic { /** * struct thread_group_cputimer - thread group interval timer counts - * @cputime: thread group interval timers. + * @cputime_atomic: atomic thread group interval timers. 
* @running: non-zero when there are timers running and * @cputime receives updates. - * @lock: lock for fields in this struct. * * This structure contains the version of task_cputime, above, that is * used for thread group CPU timer calculations. -- cgit v1.2.3 From 34ef33f7da6b00900d3a896d33522a035a930245 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 28 Apr 2015 14:04:07 +0200 Subject: usb: phy: Remove the phy-rcar-gen2-usb driver The phy-rcar-gen2-usb driver, which supports legacy platform data only, is no longer used since commit a483dcbfa21f919c ("ARM: shmobile: lager: Remove legacy board support"). This driver was superseded by the DT-only phy-rcar-gen2 driver, which was introduced in commit 1233f59f745b237d ("phy: Renesas R-Car Gen2 PHY driver"). Signed-off-by: Geert Uytterhoeven Signed-off-by: Greg Kroah-Hartman --- include/linux/platform_data/usb-rcar-gen2-phy.h | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 include/linux/platform_data/usb-rcar-gen2-phy.h (limited to 'include/linux') diff --git a/include/linux/platform_data/usb-rcar-gen2-phy.h b/include/linux/platform_data/usb-rcar-gen2-phy.h deleted file mode 100644 index dd3ba46c0d90..000000000000 --- a/include/linux/platform_data/usb-rcar-gen2-phy.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (C) 2013 Renesas Solutions Corp. - * Copyright (C) 2013 Cogent Embedded, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#ifndef __USB_RCAR_GEN2_PHY_H -#define __USB_RCAR_GEN2_PHY_H - -#include - -struct rcar_gen2_phy_platform_data { - /* USB channel 0 configuration */ - bool chan0_pci:1; /* true: PCI USB host 0, false: USBHS */ - /* USB channel 2 configuration */ - bool chan2_pci:1; /* true: PCI USB host 2, false: USBSS */ -}; - -#endif -- cgit v1.2.3 From 1c5841e832e2d7563c31de4946118e78baf573a3 Mon Sep 17 00:00:00 2001 From: Eddie Huang Date: Tue, 28 Apr 2015 21:40:32 +0800 Subject: tty: serial: 8250: export early_serial8250_setup function 8250-like uart driver may call early_serial8250_setup to reuse 8250_early.c character output function. Signed-off-by: Eddie Huang Tested-by: Sascha Hauer Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_8250.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 78097e7a330a..f0c68d88b6f4 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -137,6 +137,8 @@ extern int early_serial_setup(struct uart_port *port); extern unsigned int serial8250_early_in(struct uart_port *port, int offset); extern void serial8250_early_out(struct uart_port *port, int offset, int value); +extern int early_serial8250_setup(struct earlycon_device *device, + const char *options); extern void serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, struct ktermios *old); extern int serial8250_do_startup(struct uart_port *port); -- cgit v1.2.3 From c27ffc1080179c3f3b85e1e194fa61f1c9923b62 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 30 Apr 2015 18:21:25 +0200 Subject: serial: sh-sci: Move private definitions to private header file Move private register definitions and enums from the public header file to the driver private "sh-sci.h" header file. 
The common Serial Control Register definitions are left in the public header file, as they're needed to fill in plat_sci_port.scscr on legacy systems not using DT. Signed-off-by: Geert Uytterhoeven Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_sci.h | 67 +--------------------------------------------- 1 file changed, 1 insertion(+), 66 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h index 6c5e3bb282b0..395fceb8c060 100644 --- a/include/linux/serial_sci.h +++ b/include/linux/serial_sci.h @@ -10,13 +10,6 @@ #define SCIx_NOT_SUPPORTED (-1) -/* SCSMR (Serial Mode Register) */ -#define SCSMR_CHR (1 << 6) /* 7-bit Character Length */ -#define SCSMR_PE (1 << 5) /* Parity Enable */ -#define SCSMR_ODD (1 << 4) /* Odd Parity */ -#define SCSMR_STOP (1 << 3) /* Stop Bit Length */ -#define SCSMR_CKS 0x0003 /* Clock Select */ - /* Serial Control Register (@ = not supported by all parts) */ #define SCSCR_TIE (1 << 7) /* Transmit Interrupt Enable */ #define SCSCR_RIE (1 << 6) /* Receive Interrupt Enable */ @@ -26,43 +19,7 @@ #define SCSCR_TOIE (1 << 2) /* Timeout Interrupt Enable @ */ #define SCSCR_CKE1 (1 << 1) /* Clock Enable 1 */ #define SCSCR_CKE0 (1 << 0) /* Clock Enable 0 */ -/* SCIFA/SCIFB only */ -#define SCSCR_TDRQE (1 << 15) /* Tx Data Transfer Request Enable */ -#define SCSCR_RDRQE (1 << 14) /* Rx Data Transfer Request Enable */ - -/* SCxSR (Serial Status Register) on SCI */ -#define SCI_TDRE 0x80 /* Transmit Data Register Empty */ -#define SCI_RDRF 0x40 /* Receive Data Register Full */ -#define SCI_ORER 0x20 /* Overrun Error */ -#define SCI_FER 0x10 /* Framing Error */ -#define SCI_PER 0x08 /* Parity Error */ -#define SCI_TEND 0x04 /* Transmit End */ - -#define SCI_DEFAULT_ERROR_MASK (SCI_PER | SCI_FER) - -/* SCxSR (Serial Status Register) on SCIF, HSCIF */ -#define SCIF_ER 0x0080 /* Receive Error */ -#define SCIF_TEND 0x0040 /* Transmission End */ -#define SCIF_TDFE 0x0020 /* Transmit FIFO Data 
Empty */ -#define SCIF_BRK 0x0010 /* Break Detect */ -#define SCIF_FER 0x0008 /* Framing Error */ -#define SCIF_PER 0x0004 /* Parity Error */ -#define SCIF_RDF 0x0002 /* Receive FIFO Data Full */ -#define SCIF_DR 0x0001 /* Receive Data Ready */ - -#define SCIF_DEFAULT_ERROR_MASK (SCIF_PER | SCIF_FER | SCIF_ER | SCIF_BRK) - -/* SCFCR (FIFO Control Register) */ -#define SCFCR_LOOP (1 << 0) /* Loopback Test */ - -/* SCSPTR (Serial Port Register), optional */ -#define SCSPTR_RTSIO (1 << 7) /* Serial Port RTS Pin Input/Output */ -#define SCSPTR_CTSIO (1 << 5) /* Serial Port CTS Pin Input/Output */ -#define SCSPTR_SPB2IO (1 << 1) /* Serial Port Break Input/Output */ -#define SCSPTR_SPB2DT (1 << 0) /* Serial Port Break Data */ - -/* HSSRR HSCIF */ -#define HSCIF_SRE 0x8000 /* Sampling Rate Register Enable */ + enum { SCIx_PROBE_REGTYPE, @@ -82,28 +39,6 @@ enum { SCIx_NR_REGTYPES, }; -/* - * SCI register subset common for all port types. - * Not all registers will exist on all parts. - */ -enum { - SCSMR, /* Serial Mode Register */ - SCBRR, /* Bit Rate Register */ - SCSCR, /* Serial Control Register */ - SCxSR, /* Serial Status Register */ - SCFCR, /* FIFO Control Register */ - SCFDR, /* FIFO Data Count Register */ - SCxTDR, /* Transmit (FIFO) Data Register */ - SCxRDR, /* Receive (FIFO) Data Register */ - SCLSR, /* Line Status Register */ - SCTFDR, /* Transmit FIFO Data Count Register */ - SCRFDR, /* Receive FIFO Data Count Register */ - SCSPTR, /* Serial Port Register */ - HSSRR, /* Sampling Rate Register */ - - SCIx_NR_REGS, -}; - struct device; struct plat_sci_port_ops { -- cgit v1.2.3 From d94a0a3857987c76c37a8095977fe554799ab69d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 30 Apr 2015 18:21:29 +0200 Subject: serial: sh-sci: Standardize on using the BIT() macro to define register bits Signed-off-by: Geert Uytterhoeven Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_sci.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 
deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h index 395fceb8c060..7c536ac5be05 100644 --- a/include/linux/serial_sci.h +++ b/include/linux/serial_sci.h @@ -1,6 +1,7 @@ #ifndef __LINUX_SERIAL_SCI_H #define __LINUX_SERIAL_SCI_H +#include #include #include @@ -11,14 +12,14 @@ #define SCIx_NOT_SUPPORTED (-1) /* Serial Control Register (@ = not supported by all parts) */ -#define SCSCR_TIE (1 << 7) /* Transmit Interrupt Enable */ -#define SCSCR_RIE (1 << 6) /* Receive Interrupt Enable */ -#define SCSCR_TE (1 << 5) /* Transmit Enable */ -#define SCSCR_RE (1 << 4) /* Receive Enable */ -#define SCSCR_REIE (1 << 3) /* Receive Error Interrupt Enable @ */ -#define SCSCR_TOIE (1 << 2) /* Timeout Interrupt Enable @ */ -#define SCSCR_CKE1 (1 << 1) /* Clock Enable 1 */ -#define SCSCR_CKE0 (1 << 0) /* Clock Enable 0 */ +#define SCSCR_TIE BIT(7) /* Transmit Interrupt Enable */ +#define SCSCR_RIE BIT(6) /* Receive Interrupt Enable */ +#define SCSCR_TE BIT(5) /* Transmit Enable */ +#define SCSCR_RE BIT(4) /* Receive Enable */ +#define SCSCR_REIE BIT(3) /* Receive Error Interrupt Enable @ */ +#define SCSCR_TOIE BIT(2) /* Timeout Interrupt Enable @ */ +#define SCSCR_CKE1 BIT(1) /* Clock Enable 1 */ +#define SCSCR_CKE0 BIT(0) /* Clock Enable 0 */ enum { @@ -48,7 +49,7 @@ struct plat_sci_port_ops { /* * Port-specific capabilities */ -#define SCIx_HAVE_RTSCTS (1 << 0) +#define SCIx_HAVE_RTSCTS BIT(0) /* * Platform device specific platform_data struct -- cgit v1.2.3 From bd63364caa8df38bad2b25b11b2a1b849475cce5 Mon Sep 17 00:00:00 2001 From: Scot Doyle Date: Thu, 26 Mar 2015 13:54:39 +0000 Subject: vt: add cursor blink interval escape sequence Add an escape sequence to specify the current console's cursor blink interval. The interval is specified as a number of milliseconds until the next cursor display state toggle, from 50 to 65535. 
/proc/loadavg did not show a difference with a one msec interval, but the lower bound is set to 50 msecs since slower hardware wasn't tested. Store the interval in the vc_data structure for later access by fbcon, initializing the value to fbcon's current hardcoded value of 200 msecs. Signed-off-by: Scot Doyle Acked-by: Pavel Machek Signed-off-by: Greg Kroah-Hartman --- include/linux/console_struct.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index e859c98d1767..e329ee2667e1 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -104,6 +104,7 @@ struct vc_data { unsigned int vc_resize_user; /* resize request from user */ unsigned int vc_bell_pitch; /* Console bell pitch */ unsigned int vc_bell_duration; /* Console bell duration */ + unsigned short vc_cur_blink_ms; /* Cursor blink duration */ struct vc_data **vc_display_fg; /* [!] Ptr to var holding fg console for this display */ struct uni_pagedir *vc_uni_pagedir; struct uni_pagedir **vc_uni_pagedir_loc; /* [!] Location of uni_pagedir variable for this console */ -- cgit v1.2.3 From faaa44955dedc661f083636d816af90975a359ee Mon Sep 17 00:00:00 2001 From: Irina Tirdea Date: Wed, 29 Apr 2015 21:16:39 +0300 Subject: iio: core: Introduce IIO_CHAN_INFO_OVERSAMPLING_RATIO Some magnetometers can perform a number of repetitions in HW for each measurement to increase accuracy. One example is Bosch BMC150: http://ae-bst.resource.bosch.com/media/products/dokumente/bmc150/BST-BMC150-DS000-04.pdf. Introduce an interface to set the oversampling ratio for these devices. 
Signed-off-by: Irina Tirdea Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index b1e46ae89aa7..058441da4984 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -44,6 +44,7 @@ enum iio_chan_info_enum { IIO_CHAN_INFO_DEBOUNCE_COUNT, IIO_CHAN_INFO_DEBOUNCE_TIME, IIO_CHAN_INFO_CALIBEMISSIVITY, + IIO_CHAN_INFO_OVERSAMPLING_RATIO, }; enum iio_shared_by { -- cgit v1.2.3 From 61ba64fc0768879a300599b011c176203bdf27d9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 2 May 2015 09:54:06 -0400 Subject: libfs: simple_follow_link() let "fast" symlinks store the pointer to the body into ->i_link and use simple_follow_link for ->follow_link() Reviewed-by: Jan Kara Signed-off-by: Al Viro --- include/linux/fs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 35ec87e490b1..0ac758fcff00 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -656,6 +656,7 @@ struct inode { struct pipe_inode_info *i_pipe; struct block_device *i_bdev; struct cdev *i_cdev; + char *i_link; }; __u32 i_generation; @@ -2721,6 +2722,8 @@ void __inode_sub_bytes(struct inode *inode, loff_t bytes); void inode_sub_bytes(struct inode *inode, loff_t bytes); loff_t inode_get_bytes(struct inode *inode); void inode_set_bytes(struct inode *inode, loff_t bytes); +void *simple_follow_link(struct dentry *, struct nameidata *); +extern const struct inode_operations simple_symlink_inode_operations; extern int iterate_dir(struct file *, struct dir_context *); -- cgit v1.2.3 From 5723cb01f0295ace2b029b0737dd6525a2de337f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 2 May 2015 10:27:18 -0400 Subject: debugfs: switch to simple_follow_link() Reviewed-by: Jan Kara Signed-off-by: Al Viro --- include/linux/debugfs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff 
--git a/include/linux/debugfs.h b/include/linux/debugfs.h index cb25af461054..420311bcee38 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -45,7 +45,6 @@ extern struct dentry *arch_debugfs_dir; /* declared over in file.c */ extern const struct file_operations debugfs_file_operations; -extern const struct inode_operations debugfs_link_operations; struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, -- cgit v1.2.3 From 37882db0546c759ff75b561c188539ac96fd0bfe Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 23 Mar 2015 13:37:39 +1100 Subject: SECURITY: remove nameidata arg from inode_follow_link. No ->inode_follow_link() methods use the nameidata arg, and it is about to become private to namei.c. So remove from all inode_follow_link() functions. Signed-off-by: NeilBrown Signed-off-by: Al Viro --- include/linux/security.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 18264ea9e314..62a66202ecf1 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -43,7 +43,6 @@ struct file; struct vfsmount; struct path; struct qstr; -struct nameidata; struct iattr; struct fown_struct; struct file_operations; @@ -477,7 +476,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @inode_follow_link: * Check permission to follow a symbolic link when looking up a pathname. * @dentry contains the dentry structure for the link. - * @nd contains the nameidata structure for the parent directory. * Return 0 if permission is granted. * @inode_permission: * Check permission before accessing an inode. 
This hook is called by the @@ -1553,7 +1551,7 @@ struct security_operations { int (*inode_rename) (struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); int (*inode_readlink) (struct dentry *dentry); - int (*inode_follow_link) (struct dentry *dentry, struct nameidata *nd); + int (*inode_follow_link) (struct dentry *dentry); int (*inode_permission) (struct inode *inode, int mask); int (*inode_setattr) (struct dentry *dentry, struct iattr *attr); int (*inode_getattr) (const struct path *path); @@ -1839,7 +1837,7 @@ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); int security_inode_readlink(struct dentry *dentry); -int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd); +int security_inode_follow_link(struct dentry *dentry); int security_inode_permission(struct inode *inode, int mask); int security_inode_setattr(struct dentry *dentry, struct iattr *attr); int security_inode_getattr(const struct path *path); @@ -2241,8 +2239,7 @@ static inline int security_inode_readlink(struct dentry *dentry) return 0; } -static inline int security_inode_follow_link(struct dentry *dentry, - struct nameidata *nd) +static inline int security_inode_follow_link(struct dentry *dentry) { return 0; } -- cgit v1.2.3 From 680baacbca69d18a6d7315374ad83d05ac9c0977 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 2 May 2015 13:32:22 -0400 Subject: new ->follow_link() and ->put_link() calling conventions a) instead of storing the symlink body (via nd_set_link()) and returning an opaque pointer later passed to ->put_link(), ->follow_link() _stores_ that opaque pointer (into void * passed by address by caller) and returns the symlink body. Returning ERR_PTR() on error, NULL on jump (procfs magic symlinks) and pointer to symlink body for normal symlinks. Stored pointer is ignored in all cases except the last one. 
Storing NULL for opaque pointer (or not storing it at all) means no call of ->put_link(). b) the body used to be passed to ->put_link() implicitly (via nameidata). Now only the opaque pointer is. In the cases when we used the symlink body to free stuff, ->follow_link() now should store it as opaque pointer in addition to returning it. Signed-off-by: Al Viro --- include/linux/fs.h | 12 ++++++------ include/linux/namei.h | 2 -- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0ac758fcff00..9ab934113a28 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1608,12 +1608,12 @@ struct file_operations { struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); - void * (*follow_link) (struct dentry *, struct nameidata *); + const char * (*follow_link) (struct dentry *, void **, struct nameidata *); int (*permission) (struct inode *, int); struct posix_acl * (*get_acl)(struct inode *, int); int (*readlink) (struct dentry *, char __user *,int); - void (*put_link) (struct dentry *, struct nameidata *, void *); + void (*put_link) (struct dentry *, void *); int (*create) (struct inode *,struct dentry *, umode_t, bool); int (*link) (struct dentry *,struct inode *,struct dentry *); @@ -2705,13 +2705,13 @@ extern const struct file_operations generic_ro_fops; extern int readlink_copy(char __user *, int, const char *); extern int page_readlink(struct dentry *, char __user *, int); -extern void *page_follow_link_light(struct dentry *, struct nameidata *); -extern void page_put_link(struct dentry *, struct nameidata *, void *); +extern const char *page_follow_link_light(struct dentry *, void **, struct nameidata *); +extern void page_put_link(struct dentry *, void *); extern int __page_symlink(struct inode *inode, const char *symname, int len, int nofs); extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct 
inode_operations page_symlink_inode_operations; -extern void kfree_put_link(struct dentry *, struct nameidata *, void *); +extern void kfree_put_link(struct dentry *, void *); extern int generic_readlink(struct dentry *, char __user *, int); extern void generic_fillattr(struct inode *, struct kstat *); int vfs_getattr_nosec(struct path *path, struct kstat *stat); @@ -2722,7 +2722,7 @@ void __inode_sub_bytes(struct inode *inode, loff_t bytes); void inode_sub_bytes(struct inode *inode, loff_t bytes); loff_t inode_get_bytes(struct inode *inode); void inode_set_bytes(struct inode *inode, loff_t bytes); -void *simple_follow_link(struct dentry *, struct nameidata *); +const char *simple_follow_link(struct dentry *, void **, struct nameidata *); extern const struct inode_operations simple_symlink_inode_operations; extern int iterate_dir(struct file *, struct dir_context *); diff --git a/include/linux/namei.h b/include/linux/namei.h index c8990779f0c3..a5d5bed2c0e1 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -71,8 +71,6 @@ extern struct dentry *lock_rename(struct dentry *, struct dentry *); extern void unlock_rename(struct dentry *, struct dentry *); extern void nd_jump_link(struct nameidata *nd, struct path *path); -extern void nd_set_link(struct nameidata *nd, char *path); -extern char *nd_get_link(struct nameidata *nd); static inline void nd_terminate_link(void *name, size_t len, size_t maxlen) { -- cgit v1.2.3 From 894bc8c4662ba9daceafe943a5ba0dd407da5cd3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 2 May 2015 07:16:16 -0400 Subject: namei: remove restrictions on nesting depth The only restriction is that on the total amount of symlinks crossed; how they are nested does not matter Signed-off-by: Al Viro --- include/linux/namei.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/namei.h b/include/linux/namei.h index a5d5bed2c0e1..3a6cc9651712 100644 --- a/include/linux/namei.h +++ 
b/include/linux/namei.h @@ -11,6 +11,8 @@ struct nameidata; enum { MAX_NESTED_LINKS = 8 }; +#define MAXSYMLINKS 40 + /* * Type of the last component on LOOKUP_PARENT */ -- cgit v1.2.3 From 756daf263ea53a8bfc89db26cb92e963953253a1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 23 Mar 2015 13:37:38 +1100 Subject: VFS: replace {, total_}link_count in task_struct with pointer to nameidata task_struct currently contains two ad-hoc members for use by the VFS: link_count and total_link_count. These are only interesting to fs/namei.c, so exposing them explicitly is poor layering. Incidentally, link_count isn't used anymore, so it can just die. This patch replaces those with a single pointer to 'struct nameidata'. This structure represents the current filename lookup of which there can only be one per process, and is a natural place to store total_link_count. This will allow the current "nameidata" argument to all follow_link operations to be removed as current->nameidata can be used instead in the _very_ few instances that care about it at all. As there are occasional circumstances where pathname lookup can recurse, such as through kern_path_locked, we always save any old current->nameidata (if there is one) when setting a new value, and make sure any active link_counts are preserved. follow_mount and follow_automount now get a 'struct nameidata *' rather than 'int flags' so that they can directly access total_link_count, rather than going through 'current'.
Suggested-by: Al Viro Signed-off-by: NeilBrown Signed-off-by: Al Viro --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a2e6122734..f6c9b69d66f2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1461,7 +1461,7 @@ struct task_struct { it with task_lock()) - initialized normally by setup_new_exec */ /* file system info */ - int link_count, total_link_count; + struct nameidata *nameidata; #ifdef CONFIG_SYSVIPC /* ipc stuff */ struct sysv_sem sysvsem; -- cgit v1.2.3 From 6e77137b363b8d866ac29c5a0c95e953614fb2d8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 2 May 2015 13:37:52 -0400 Subject: don't pass nameidata to ->follow_link() its only use is getting passed to nd_jump_link(), which can obtain it from current->nameidata Signed-off-by: Al Viro --- include/linux/fs.h | 6 +++--- include/linux/namei.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ab934113a28..ed7c9f298759 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1608,7 +1608,7 @@ struct file_operations { struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); - const char * (*follow_link) (struct dentry *, void **, struct nameidata *); + const char * (*follow_link) (struct dentry *, void **); int (*permission) (struct inode *, int); struct posix_acl * (*get_acl)(struct inode *, int); @@ -2705,7 +2705,7 @@ extern const struct file_operations generic_ro_fops; extern int readlink_copy(char __user *, int, const char *); extern int page_readlink(struct dentry *, char __user *, int); -extern const char *page_follow_link_light(struct dentry *, void **, struct nameidata *); +extern const char *page_follow_link_light(struct dentry *, void **); extern void page_put_link(struct dentry *, void *); extern int __page_symlink(struct 
inode *inode, const char *symname, int len, int nofs); @@ -2722,7 +2722,7 @@ void __inode_sub_bytes(struct inode *inode, loff_t bytes); void inode_sub_bytes(struct inode *inode, loff_t bytes); loff_t inode_get_bytes(struct inode *inode); void inode_set_bytes(struct inode *inode, loff_t bytes); -const char *simple_follow_link(struct dentry *, void **, struct nameidata *); +const char *simple_follow_link(struct dentry *, void **); extern const struct inode_operations simple_symlink_inode_operations; extern int iterate_dir(struct file *, struct dir_context *); diff --git a/include/linux/namei.h b/include/linux/namei.h index 3a6cc9651712..d756304aa09b 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -72,7 +72,7 @@ extern int follow_up(struct path *); extern struct dentry *lock_rename(struct dentry *, struct dentry *); extern void unlock_rename(struct dentry *, struct dentry *); -extern void nd_jump_link(struct nameidata *nd, struct path *path); +extern void nd_jump_link(struct path *path); static inline void nd_terminate_link(void *name, size_t len, size_t maxlen) { -- cgit v1.2.3 From 2da572c959dd5815aef153cf62010b16a498a0d3 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Thu, 7 May 2015 13:49:14 -0400 Subject: lib: add software 842 compression/decompression Add 842-format software compression and decompression functions. Update the MAINTAINERS 842 section to include the new files. The 842 compression function can compress any input data into the 842 compression format. The 842 decompression function can decompress any standard-format 842 compressed data - specifically, either a compressed data buffer created by the 842 software compression function, or a compressed data buffer created by the 842 hardware compressor (located in PowerPC coprocessors). The 842 compressed data format is explained in the header comments. This is used in a later patch to provide a full software 842 compression and decompression crypto interface. 
Signed-off-by: Dan Streetman Signed-off-by: Herbert Xu --- include/linux/sw842.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 include/linux/sw842.h (limited to 'include/linux') diff --git a/include/linux/sw842.h b/include/linux/sw842.h new file mode 100644 index 000000000000..109ba041c2ae --- /dev/null +++ b/include/linux/sw842.h @@ -0,0 +1,12 @@ +#ifndef __SW842_H__ +#define __SW842_H__ + +#define SW842_MEM_COMPRESS (0xf000) + +int sw842_compress(const u8 *src, unsigned int srclen, + u8 *dst, unsigned int *destlen, void *wmem); + +int sw842_decompress(const u8 *src, unsigned int srclen, + u8 *dst, unsigned int *destlen); + +#endif -- cgit v1.2.3 From 7011a122383e36dab594406720fa1d089e0be8f9 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Thu, 7 May 2015 13:49:17 -0400 Subject: crypto: nx - add NX-842 platform frontend driver Add NX-842 frontend that allows using either the pSeries platform or PowerNV platform driver (to be added by later patch) for the NX-842 hardware. Update the MAINTAINERS file to include the new filenames. Update Kconfig files to clarify titles and descriptions, and correct dependencies. 
Signed-off-by: Dan Streetman Signed-off-by: Herbert Xu --- include/linux/nx842.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nx842.h b/include/linux/nx842.h index a4d324c6406a..d919c22b7fd6 100644 --- a/include/linux/nx842.h +++ b/include/linux/nx842.h @@ -1,11 +1,13 @@ #ifndef __NX842_H__ #define __NX842_H__ -int nx842_get_workmem_size(void); -int nx842_get_workmem_size_aligned(void); +#define __NX842_PSERIES_MEM_COMPRESS ((PAGE_SIZE * 2) + 10240) + +#define NX842_MEM_COMPRESS __NX842_PSERIES_MEM_COMPRESS + int nx842_compress(const unsigned char *in, unsigned int in_len, - unsigned char *out, unsigned int *out_len, void *wrkmem); + unsigned char *out, unsigned int *out_len, void *wrkmem); int nx842_decompress(const unsigned char *in, unsigned int in_len, - unsigned char *out, unsigned int *out_len, void *wrkmem); + unsigned char *out, unsigned int *out_len, void *wrkmem); #endif -- cgit v1.2.3 From 959e6659b6f74ec1fa4d391a3b88d63dc0189f36 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Thu, 7 May 2015 13:49:18 -0400 Subject: crypto: nx - add nx842 constraints Add "constraints" for the NX-842 driver. The constraints are used to indicate what the current NX-842 platform driver is capable of. The constraints tell the NX-842 user what alignment, min and max length, and length multiple each provided buffers should conform to. These are required because the 842 hardware requires buffers to meet specific constraints that vary based on platform - for example, the pSeries max length is much lower than the PowerNV max length. 
Signed-off-by: Dan Streetman Signed-off-by: Herbert Xu --- include/linux/nx842.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nx842.h b/include/linux/nx842.h index d919c22b7fd6..aa1a97e90dea 100644 --- a/include/linux/nx842.h +++ b/include/linux/nx842.h @@ -5,6 +5,15 @@ #define NX842_MEM_COMPRESS __NX842_PSERIES_MEM_COMPRESS +struct nx842_constraints { + int alignment; + int multiple; + int minimum; + int maximum; +}; + +int nx842_constraints(struct nx842_constraints *constraints); + int nx842_compress(const unsigned char *in, unsigned int in_len, unsigned char *out, unsigned int *out_len, void *wrkmem); int nx842_decompress(const unsigned char *in, unsigned int in_len, -- cgit v1.2.3 From 99182a42b7ef3d5e4180992ce01befd9e87526d2 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Thu, 7 May 2015 13:49:19 -0400 Subject: crypto: nx - add PowerNV platform NX-842 driver Add driver for NX-842 hardware on the PowerNV platform. This allows the use of the 842 compression hardware coprocessor on the PowerNV platform. 
Signed-off-by: Dan Streetman Signed-off-by: Herbert Xu --- include/linux/nx842.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nx842.h b/include/linux/nx842.h index aa1a97e90dea..4ddf68d9c0d4 100644 --- a/include/linux/nx842.h +++ b/include/linux/nx842.h @@ -1,9 +1,11 @@ #ifndef __NX842_H__ #define __NX842_H__ -#define __NX842_PSERIES_MEM_COMPRESS ((PAGE_SIZE * 2) + 10240) +#define __NX842_PSERIES_MEM_COMPRESS (10240) +#define __NX842_POWERNV_MEM_COMPRESS (1024) -#define NX842_MEM_COMPRESS __NX842_PSERIES_MEM_COMPRESS +#define NX842_MEM_COMPRESS (max_t(unsigned int, \ + __NX842_PSERIES_MEM_COMPRESS, __NX842_POWERNV_MEM_COMPRESS)) struct nx842_constraints { int alignment; -- cgit v1.2.3 From b19e7f51a55fe740c18038d1d6957aedfc078d07 Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Wed, 29 Apr 2015 18:34:59 +0300 Subject: gpio: gpio-generic: add flag to read out output value from reg_set The change introduces BGPIOF_READ_OUTPUT_REG_SET flag for gpio-generic GPIO chip implementation, which allows the correct configured value to be read from the reg_set register; the input value is still read from reg_dat.
Signed-off-by: Vladimir Zapolskiy Signed-off-by: Linus Walleij --- include/linux/basic_mmio_gpio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/basic_mmio_gpio.h b/include/linux/basic_mmio_gpio.h index 0e97856b2cff..14eea946e640 100644 --- a/include/linux/basic_mmio_gpio.h +++ b/include/linux/basic_mmio_gpio.h @@ -74,5 +74,6 @@ int bgpio_init(struct bgpio_chip *bgc, struct device *dev, #define BGPIOF_UNREADABLE_REG_SET BIT(1) /* reg_set is unreadable */ #define BGPIOF_UNREADABLE_REG_DIR BIT(2) /* reg_dir is unreadable */ #define BGPIOF_BIG_ENDIAN_BYTE_ORDER BIT(3) +#define BGPIOF_READ_OUTPUT_REG_SET BIT(4) /* reg_set stores output value */ #endif /* __BASIC_MMIO_GPIO_H */ -- cgit v1.2.3 From c884fbd452147e952ae160e750553d00ea4dc4c9 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Wed, 6 May 2015 13:29:06 +0300 Subject: gpio / ACPI: Add support for retrieving GpioInt resources from a device ACPI specification knows two types of GPIOs: GpioIo and GpioInt. The latter is used to describe that a given device interrupt line is connected to a specific GPIO pin. Typical ACPI _CRS entry for such device looks like below: Name (_CRS, ResourceTemplate () { I2cSerialBus (0x004A, ControllerInitiated, 0x00061A80, AddressingMode7Bit, "\\_SB.PCI0.I2C6", 0x00, ResourceConsumer) GpioIo (Exclusive, PullDefault, 0x0000, 0x0000, IoRestrictionOutputOnly, "\\_SB.GPO0", 0x00, ResourceConsumer) { 0x004B } GpioInt (Level, ActiveLow, Shared, PullDefault, 0x0000, "\\_SB.GPO0", 0x00, ResourceConsumer) { 0x004C } }) Currently drivers need to request a GPIO corresponding to the right GpioInt and then translate that to Linux IRQ number. This adds unnecessary lines of boiler-plate code. We can ease this a bit by introducing acpi_dev_gpio_irq_get() analogous to of_irq_get(). This function translates given GpioInt resource under the device in question to the suitable Linux IRQ number. Signed-off-by: Mika Westerberg Acked-by: Rafael J. 
Wysocki Signed-off-by: Linus Walleij --- include/linux/acpi.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index e4da5e35e29c..f57c440642cd 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -721,6 +721,8 @@ static inline void acpi_dev_remove_driver_gpios(struct acpi_device *adev) if (adev) adev->driver_gpios = NULL; } + +int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index); #else static inline int acpi_dev_add_driver_gpios(struct acpi_device *adev, const struct acpi_gpio_mapping *gpios) @@ -728,6 +730,11 @@ static inline int acpi_dev_add_driver_gpios(struct acpi_device *adev, return -ENXIO; } static inline void acpi_dev_remove_driver_gpios(struct acpi_device *adev) {} + +static inline int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index) +{ + return -ENXIO; +} #endif /* Device properties */ -- cgit v1.2.3 From bda0be7ad994812960e9f8f2d2757f72cb4a96cb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 23 Mar 2015 13:37:39 +1100 Subject: security: make inode_follow_link RCU-walk aware inode_follow_link now takes an inode and rcu flag as well as the dentry. inode is used in preference to d_backing_inode(dentry), particularly in RCU-walk mode. selinux_inode_follow_link() gets dentry_has_perm() and inode_has_perm() open-coded into it so that it can call avc_has_perm_flags() in way that is safe if LOOKUP_RCU is set. Calling avc_has_perm_flags() with rcu_read_lock() held means that when avc_has_perm_noaudit calls avc_compute_av(), the attempt to rcu_read_unlock() before calling security_compute_av() will not actually drop the RCU read-lock. However as security_compute_av() is completely in a read_lock()ed region, it should be safe with the RCU read-lock held. 
Signed-off-by: NeilBrown Signed-off-by: Al Viro --- include/linux/security.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 62a66202ecf1..52febde52479 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -476,6 +476,8 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @inode_follow_link: * Check permission to follow a symbolic link when looking up a pathname. * @dentry contains the dentry structure for the link. + * @inode contains the inode, which itself is not stable in RCU-walk + * @rcu indicates whether we are in RCU-walk mode. * Return 0 if permission is granted. * @inode_permission: * Check permission before accessing an inode. This hook is called by the @@ -1551,7 +1553,8 @@ struct security_operations { int (*inode_rename) (struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); int (*inode_readlink) (struct dentry *dentry); - int (*inode_follow_link) (struct dentry *dentry); + int (*inode_follow_link) (struct dentry *dentry, struct inode *inode, + bool rcu); int (*inode_permission) (struct inode *inode, int mask); int (*inode_setattr) (struct dentry *dentry, struct iattr *attr); int (*inode_getattr) (const struct path *path); @@ -1837,7 +1840,8 @@ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); int security_inode_readlink(struct dentry *dentry); -int security_inode_follow_link(struct dentry *dentry); +int security_inode_follow_link(struct dentry *dentry, struct inode *inode, + bool rcu); int security_inode_permission(struct inode *inode, int mask); int security_inode_setattr(struct dentry *dentry, struct iattr *attr); int security_inode_getattr(const struct path *path); @@ -2239,7 +2243,9 @@ static inline int security_inode_readlink(struct dentry 
*dentry) return 0; } -static inline int security_inode_follow_link(struct dentry *dentry) +static inline int security_inode_follow_link(struct dentry *dentry, + struct inode *inode, + bool rcu) { return 0; } -- cgit v1.2.3 From 5f2c4179e129bdc47870a81a65d0aff85aa18293 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 May 2015 11:14:26 -0400 Subject: switch ->put_link() from dentry to inode only one instance looks at that argument at all; that sole exception wants inode rather than dentry. Signed-off-by: Al Viro --- include/linux/fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ed7c9f298759..f21e3328f991 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1613,7 +1613,7 @@ struct inode_operations { struct posix_acl * (*get_acl)(struct inode *, int); int (*readlink) (struct dentry *, char __user *,int); - void (*put_link) (struct dentry *, void *); + void (*put_link) (struct inode *, void *); int (*create) (struct inode *,struct dentry *, umode_t, bool); int (*link) (struct dentry *,struct inode *,struct dentry *); @@ -2706,12 +2706,12 @@ extern const struct file_operations generic_ro_fops; extern int readlink_copy(char __user *, int, const char *); extern int page_readlink(struct dentry *, char __user *, int); extern const char *page_follow_link_light(struct dentry *, void **); -extern void page_put_link(struct dentry *, void *); +extern void page_put_link(struct inode *, void *); extern int __page_symlink(struct inode *inode, const char *symname, int len, int nofs); extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; -extern void kfree_put_link(struct dentry *, void *); +extern void kfree_put_link(struct inode *, void *); extern int generic_readlink(struct dentry *, char __user *, int); extern void generic_fillattr(struct inode *, struct kstat *); int 
vfs_getattr_nosec(struct path *path, struct kstat *stat); -- cgit v1.2.3 From ecc087ff14352aed52b8e775b4511e7f9cfc64ec Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 May 2015 11:19:14 -0400 Subject: new helper: free_page_put_link() similar to kfree_put_link() Signed-off-by: Al Viro --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index f21e3328f991..8f738512c874 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2712,6 +2712,7 @@ extern int __page_symlink(struct inode *inode, const char *symname, int len, extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; extern void kfree_put_link(struct inode *, void *); +extern void free_page_put_link(struct inode *, void *); extern int generic_readlink(struct dentry *, char __user *, int); extern void generic_fillattr(struct inode *, struct kstat *); int vfs_getattr_nosec(struct path *path, struct kstat *stat); -- cgit v1.2.3 From 140e807da12988e2a925fe029336e7bb67a8d4de Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 May 2015 21:07:08 -0500 Subject: tun: Utilize the normal socket network namespace refcounting. There is no need for tun to do the weird network namespace refcounting. The existing network namespace refcounting in tfile has almost exactly the same lifetime. So rewrite the code to use the struct sock network namespace refcounting and remove the unnecessary hand rolled network namespace refcounting and the unnecessary tfile->net. This change allows the tun code to directly call sock_put bypassing sock_release and making SOCK_EXTERNALLY_ALLOCATED unnecessary. Remove the now unnecessary tun_release so that if anything tries to use the sock_release code path the kernel will oops, and let us know about the bug. The macvtap code already uses its internal socket this way. Signed-off-by: "Eric W.
Biederman" Signed-off-by: David S. Miller --- include/linux/net.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 738ea48be889..8a5e81d2bdf7 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -38,7 +38,6 @@ struct net; #define SOCK_NOSPACE 2 #define SOCK_PASSCRED 3 #define SOCK_PASSSEC 4 -#define SOCK_EXTERNALLY_ALLOCATED 5 #ifndef ARCH_HAS_SOCKET_TYPES /** -- cgit v1.2.3 From eeb1bd5c40edb0e2fd925c8535e2fdebdbc5cef2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 May 2015 21:08:05 -0500 Subject: net: Add a struct net parameter to sock_create_kern This is long overdue, and is part of cleaning up how we allocate kernel sockets that don't reference count struct net. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- include/linux/net.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 8a5e81d2bdf7..04aa06852771 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -207,7 +207,7 @@ void sock_unregister(int family); int __sock_create(struct net *net, int family, int type, int proto, struct socket **res, int kern); int sock_create(int family, int type, int proto, struct socket **res); -int sock_create_kern(int family, int type, int proto, struct socket **res); +int sock_create_kern(struct net *net, int family, int type, int proto, struct socket **res); int sock_create_lite(int family, int type, int proto, struct socket **res); void sock_release(struct socket *sock); int sock_sendmsg(struct socket *sock, struct msghdr *msg); -- cgit v1.2.3 From 11aa9c28b4209242a9de0a661a7b3405adb568a0 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Fri, 8 May 2015 21:09:13 -0500 Subject: net: Pass kern from net_proto_family.create to sk_alloc In preparation for changing how struct net is refcounted on kernel sockets pass the knowledge that we are creating a kernel socket from sock_create_kern through to sk_alloc. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- include/linux/if_pppox.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index 66a7d7600f43..b49cf923becc 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -74,7 +74,7 @@ static inline struct sock *sk_pppox(struct pppox_sock *po) struct module; struct pppox_proto { - int (*create)(struct net *net, struct socket *sock); + int (*create)(struct net *net, struct socket *sock, int kern); int (*ioctl)(struct socket *sock, unsigned int cmd, unsigned long arg); struct module *owner; -- cgit v1.2.3 From d2788d34885d4ce5ba17a8996fd95d28942e574e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 9 May 2015 22:51:32 +0200 Subject: net: sched: further simplify handle_ing Ingress qdisc has no other purpose than calling into tc_classify() that executes attached classifier(s) and action(s). It has a 1:1 relationship to dev->ingress_queue. After having commit 087c1a601ad7 ("net: sched: run ingress qdisc without locks") removed the central ingress lock, one major contention point is gone. The extra indirection layers however, are not necessary for calling into ingress qdisc. pktgen calling locally into netif_receive_skb() with a dummy u32, single CPU result on a Supermicro X10SLM-F, Xeon E3-1240: before ~21,1 Mpps, after patch ~22,9 Mpps. We can redirect the private classifier list to the netdev directly, without changing any classifier API bits (!) and execute on that from handle_ing() side. 
The __QDISC_STATE_DEACTIVATE test can be removed, ingress qdisc doesn't have a queue and thus dev_deactivate_queue() is also not applicable, ingress_cl_list provides similar behaviour. In other words, ingress qdisc acts like TCQ_F_BUILTIN qdisc. One next possible step is the removal of the dev's ingress (dummy) netdev_queue, and to only have the list member in the netdevice itself. Note, the filter chain is RCU protected and individual filter elements are being kfree'd by sched subsystem after RCU grace period. RCU read lock is being held by __netif_receive_skb_core(). Joint work with Alexei Starovoitov. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1899c74a7127..c4e1caf6056f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1655,7 +1655,11 @@ struct net_device { rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; +#if CONFIG_NET_CLS_ACT + struct tcf_proto __rcu *ingress_cl_list; +#endif struct netdev_queue __rcu *ingress_queue; + unsigned char broadcast[MAX_ADDR_LEN]; #ifdef CONFIG_RFS_ACCEL struct cpu_rmap *rx_cpu_rmap; -- cgit v1.2.3 From 6be109b31ccdb9c98e7be12687171f6602527a5d Mon Sep 17 00:00:00 2001 From: Arun Ramamurthy Date: Wed, 22 Apr 2015 16:04:11 -0700 Subject: phy: core: Add devm_of_phy_get_by_index to phy-core Some generic drivers, such as ehci, may use multiple phys and for such drivers referencing phy(s) by name(s) does not make sense. Instead of inventing new naming schemes and using custom code to iterate through them, such drivers are better off using nameless phy bindings and using this newly introduced API to iterate through them.
Signed-off-by: Arun Ramamurthy Reviewed-by: Ray Jui Reviewed-by: Scott Branden [kishon@ti.com: fix compilation errors] Signed-off-by: Kishon Vijay Abraham I --- include/linux/phy/phy.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index a0197fa1b116..8cf05e341cff 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -133,6 +133,8 @@ struct phy *devm_phy_get(struct device *dev, const char *string); struct phy *devm_phy_optional_get(struct device *dev, const char *string); struct phy *devm_of_phy_get(struct device *dev, struct device_node *np, const char *con_id); +struct phy *devm_of_phy_get_by_index(struct device *dev, struct device_node *np, + int index); void phy_put(struct phy *phy); void devm_phy_put(struct device *dev, struct phy *phy); struct phy *of_phy_get(struct device_node *np, const char *con_id); @@ -261,6 +263,13 @@ static inline struct phy *devm_of_phy_get(struct device *dev, return ERR_PTR(-ENOSYS); } +static inline struct phy *devm_of_phy_get_by_index(struct device *dev, + struct device_node *np, + int index) +{ + return ERR_PTR(-ENOSYS); +} + static inline void phy_put(struct phy *phy) { } -- cgit v1.2.3 From 4cda01e86f68dacc758b2daf6e8809f2ce915b85 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 11 May 2015 19:28:49 +0200 Subject: net: sched: fix typo in net_device ifdef This should have been #ifdef not #if. Reported-by: Fengguang Wu Fixes: d2788d34885d ("net: sched: further simplify handle_ing") Signed-off-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c4e1caf6056f..a6d706b2a947 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1655,7 +1655,7 @@ struct net_device { rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; -#if CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_CLS_ACT struct tcf_proto __rcu *ingress_cl_list; #endif struct netdev_queue __rcu *ingress_queue; -- cgit v1.2.3 From 5844feeaa4154d1c46d3462c7a4653d22356d8b4 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 23 Jan 2015 00:22:27 -0800 Subject: mtd: nand: add common DT init code These are already-documented common bindings for NAND chips. Let's handle them in nand_base. If NAND controller drivers need to act on this data before bringing up the NAND chip (e.g., fill out ECC callback functions, change HW modes, etc.), then they can do so between calling nand_scan_ident() and nand_scan_tail(). Signed-off-by: Brian Norris --- include/linux/mtd/nand.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 6c51876941f3..f25e2bdd188c 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -26,6 +26,8 @@ struct mtd_info; struct nand_flash_dev; +struct device_node; + /* Scan and identify a NAND device */ extern int nand_scan(struct mtd_info *mtd, int max_chips); /* @@ -542,6 +544,7 @@ struct nand_buffers { * flash device * @IO_ADDR_W: [BOARDSPECIFIC] address to write the 8 I/O lines of the * flash device. 
+ * @dn: [BOARDSPECIFIC] device node describing this instance * @read_byte: [REPLACEABLE] read one byte from the chip * @read_word: [REPLACEABLE] read one word from the chip * @write_byte: [REPLACEABLE] write a single byte to the chip on the @@ -644,6 +647,8 @@ struct nand_chip { void __iomem *IO_ADDR_R; void __iomem *IO_ADDR_W; + struct device_node *dn; + uint8_t (*read_byte)(struct mtd_info *mtd); u16 (*read_word)(struct mtd_info *mtd); void (*write_byte)(struct mtd_info *mtd, uint8_t byte); -- cgit v1.2.3 From 9d0be7f4810257a9b0fc78fff641f14409f14ab3 Mon Sep 17 00:00:00 2001 From: Eduardo Valentin Date: Mon, 11 May 2015 19:34:23 -0700 Subject: thermal: support slope and offset coefficients It is common to have a linear extrapolation from the current sensor readings and the actual temperature value. This is especially the case when the sensor is in use to extrapolate hotspots. This patch adds slope and offset constants for single sensor linear extrapolation equation. Because the same sensor can be used in different locations, from board to board, these constants are added as part of thermal_zone_params. The constants are available through sysfs. It is up to the device driver to determine the usage of these values. Signed-off-by: Eduardo Valentin --- include/linux/thermal.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 6bbe11c97cea..037e9df2f610 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -302,6 +302,17 @@ struct thermal_zone_params { /* threshold below which the error is no longer accumulated */ s32 integral_cutoff; + + /* + * @slope: slope of a linear temperature adjustment curve. + * Used by thermal zone drivers. + */ + int slope; + /* + * @offset: offset of a linear temperature adjustment curve. + * Used by thermal zone drivers (default 0).
+ */ + int offset; }; struct thermal_genl_event { -- cgit v1.2.3 From 05ae797566a66d159cf1e2ee11bf3f6fae40c8eb Mon Sep 17 00:00:00 2001 From: Andrew Bresticker Date: Mon, 4 May 2015 10:36:35 -0700 Subject: mailbox: Make mbox_chan_ops const The mailbox controller's channel ops ought to be read-only. Update all the mailbox drivers to make their mbox_chan_ops const as well. Signed-off-by: Andrew Bresticker Cc: Ashwin Chaugule Cc: Ley Foon Tan Acked-by: Suman Anna Signed-off-by: Jassi Brar --- include/linux/mailbox_controller.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h index d4cf96f07cfc..68c42454439b 100644 --- a/include/linux/mailbox_controller.h +++ b/include/linux/mailbox_controller.h @@ -72,7 +72,7 @@ struct mbox_chan_ops { */ struct mbox_controller { struct device *dev; - struct mbox_chan_ops *ops; + const struct mbox_chan_ops *ops; struct mbox_chan *chans; int num_chans; bool txdone_irq; -- cgit v1.2.3 From 3c4ed7bdf5997d8020cbb8d4abbef2fcfb9f1284 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 2 May 2015 15:10:46 -0700 Subject: LSM: Split security.h The security.h header file serves two purposes, interfaces for users of the security modules and interfaces for security modules. 
Users of the security modules don't need to know about what's in the security_operations structure, so pull it out into its own header, lsm_hooks.h Signed-off-by: Casey Schaufler Acked-by: John Johansen Acked-by: Kees Cook Acked-by: Paul Moore Acked-by: Stephen Smalley Acked-by: Tetsuo Handa Signed-off-by: James Morris --- include/linux/lsm_hooks.h | 358 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/security.h | 305 --------------------------------------- 2 files changed, 358 insertions(+), 305 deletions(-) create mode 100644 include/linux/lsm_hooks.h (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h new file mode 100644 index 000000000000..c60f81b2d18c --- /dev/null +++ b/include/linux/lsm_hooks.h @@ -0,0 +1,358 @@ +/* + * Linux Security Module interfaces + * + * Copyright (C) 2001 WireX Communications, Inc + * Copyright (C) 2001 Greg Kroah-Hartman + * Copyright (C) 2001 Networks Associates Technology, Inc + * Copyright (C) 2001 James Morris + * Copyright (C) 2001 Silicon Graphics, Inc. (Trust Technology Group) + * Copyright (C) 2015 Intel Corporation. + * Copyright (C) 2015 Casey Schaufler + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Due to this file being licensed under the GPL there is controversy over + * whether this permits you to write a module that #includes this file + * without placing your module under the GPL. Please consult a lawyer for + * advice before doing this.
+ * + */ + +#ifndef __LINUX_LSM_HOOKS_H +#define __LINUX_LSM_HOOKS_H + +#include + +/* Maximum number of letters for an LSM name string */ +#define SECURITY_NAME_MAX 10 + +#ifdef CONFIG_SECURITY + +struct security_operations { + char name[SECURITY_NAME_MAX + 1]; + + int (*binder_set_context_mgr)(struct task_struct *mgr); + int (*binder_transaction)(struct task_struct *from, + struct task_struct *to); + int (*binder_transfer_binder)(struct task_struct *from, + struct task_struct *to); + int (*binder_transfer_file)(struct task_struct *from, + struct task_struct *to, + struct file *file); + + int (*ptrace_access_check)(struct task_struct *child, + unsigned int mode); + int (*ptrace_traceme)(struct task_struct *parent); + int (*capget)(struct task_struct *target, kernel_cap_t *effective, + kernel_cap_t *inheritable, kernel_cap_t *permitted); + int (*capset)(struct cred *new, const struct cred *old, + const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted); + int (*capable)(const struct cred *cred, struct user_namespace *ns, + int cap, int audit); + int (*quotactl)(int cmds, int type, int id, struct super_block *sb); + int (*quota_on)(struct dentry *dentry); + int (*syslog)(int type); + int (*settime)(const struct timespec *ts, const struct timezone *tz); + int (*vm_enough_memory)(struct mm_struct *mm, long pages); + + int (*bprm_set_creds)(struct linux_binprm *bprm); + int (*bprm_check_security)(struct linux_binprm *bprm); + int (*bprm_secureexec)(struct linux_binprm *bprm); + void (*bprm_committing_creds)(struct linux_binprm *bprm); + void (*bprm_committed_creds)(struct linux_binprm *bprm); + + int (*sb_alloc_security)(struct super_block *sb); + void (*sb_free_security)(struct super_block *sb); + int (*sb_copy_data)(char *orig, char *copy); + int (*sb_remount)(struct super_block *sb, void *data); + int (*sb_kern_mount)(struct super_block *sb, int flags, void *data); + int (*sb_show_options)(struct seq_file *m, struct 
super_block *sb); + int (*sb_statfs)(struct dentry *dentry); + int (*sb_mount)(const char *dev_name, struct path *path, + const char *type, unsigned long flags, void *data); + int (*sb_umount)(struct vfsmount *mnt, int flags); + int (*sb_pivotroot)(struct path *old_path, struct path *new_path); + int (*sb_set_mnt_opts)(struct super_block *sb, + struct security_mnt_opts *opts, + unsigned long kern_flags, + unsigned long *set_kern_flags); + int (*sb_clone_mnt_opts)(const struct super_block *oldsb, + struct super_block *newsb); + int (*sb_parse_opts_str)(char *options, struct security_mnt_opts *opts); + int (*dentry_init_security)(struct dentry *dentry, int mode, + struct qstr *name, void **ctx, + u32 *ctxlen); + + +#ifdef CONFIG_SECURITY_PATH + int (*path_unlink)(struct path *dir, struct dentry *dentry); + int (*path_mkdir)(struct path *dir, struct dentry *dentry, + umode_t mode); + int (*path_rmdir)(struct path *dir, struct dentry *dentry); + int (*path_mknod)(struct path *dir, struct dentry *dentry, + umode_t mode, unsigned int dev); + int (*path_truncate)(struct path *path); + int (*path_symlink)(struct path *dir, struct dentry *dentry, + const char *old_name); + int (*path_link)(struct dentry *old_dentry, struct path *new_dir, + struct dentry *new_dentry); + int (*path_rename)(struct path *old_dir, struct dentry *old_dentry, + struct path *new_dir, + struct dentry *new_dentry); + int (*path_chmod)(struct path *path, umode_t mode); + int (*path_chown)(struct path *path, kuid_t uid, kgid_t gid); + int (*path_chroot)(struct path *path); +#endif + + int (*inode_alloc_security)(struct inode *inode); + void (*inode_free_security)(struct inode *inode); + int (*inode_init_security)(struct inode *inode, struct inode *dir, + const struct qstr *qstr, + const char **name, void **value, + size_t *len); + int (*inode_create)(struct inode *dir, struct dentry *dentry, + umode_t mode); + int (*inode_link)(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry); 
+ int (*inode_unlink)(struct inode *dir, struct dentry *dentry); + int (*inode_symlink)(struct inode *dir, struct dentry *dentry, + const char *old_name); + int (*inode_mkdir)(struct inode *dir, struct dentry *dentry, + umode_t mode); + int (*inode_rmdir)(struct inode *dir, struct dentry *dentry); + int (*inode_mknod)(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t dev); + int (*inode_rename)(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry); + int (*inode_readlink)(struct dentry *dentry); + int (*inode_follow_link)(struct dentry *dentry, struct nameidata *nd); + int (*inode_permission)(struct inode *inode, int mask); + int (*inode_setattr)(struct dentry *dentry, struct iattr *attr); + int (*inode_getattr)(const struct path *path); + int (*inode_setxattr)(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); + void (*inode_post_setxattr)(struct dentry *dentry, const char *name, + const void *value, size_t size, + int flags); + int (*inode_getxattr)(struct dentry *dentry, const char *name); + int (*inode_listxattr)(struct dentry *dentry); + int (*inode_removexattr)(struct dentry *dentry, const char *name); + int (*inode_need_killpriv)(struct dentry *dentry); + int (*inode_killpriv)(struct dentry *dentry); + int (*inode_getsecurity)(const struct inode *inode, const char *name, + void **buffer, bool alloc); + int (*inode_setsecurity)(struct inode *inode, const char *name, + const void *value, size_t size, + int flags); + int (*inode_listsecurity)(struct inode *inode, char *buffer, + size_t buffer_size); + void (*inode_getsecid)(const struct inode *inode, u32 *secid); + + int (*file_permission)(struct file *file, int mask); + int (*file_alloc_security)(struct file *file); + void (*file_free_security)(struct file *file); + int (*file_ioctl)(struct file *file, unsigned int cmd, + unsigned long arg); + int (*mmap_addr)(unsigned long addr); + int (*mmap_file)(struct file 
*file, unsigned long reqprot, + unsigned long prot, unsigned long flags); + int (*file_mprotect)(struct vm_area_struct *vma, unsigned long reqprot, + unsigned long prot); + int (*file_lock)(struct file *file, unsigned int cmd); + int (*file_fcntl)(struct file *file, unsigned int cmd, + unsigned long arg); + void (*file_set_fowner)(struct file *file); + int (*file_send_sigiotask)(struct task_struct *tsk, + struct fown_struct *fown, int sig); + int (*file_receive)(struct file *file); + int (*file_open)(struct file *file, const struct cred *cred); + + int (*task_create)(unsigned long clone_flags); + void (*task_free)(struct task_struct *task); + int (*cred_alloc_blank)(struct cred *cred, gfp_t gfp); + void (*cred_free)(struct cred *cred); + int (*cred_prepare)(struct cred *new, const struct cred *old, + gfp_t gfp); + void (*cred_transfer)(struct cred *new, const struct cred *old); + int (*kernel_act_as)(struct cred *new, u32 secid); + int (*kernel_create_files_as)(struct cred *new, struct inode *inode); + int (*kernel_fw_from_file)(struct file *file, char *buf, size_t size); + int (*kernel_module_request)(char *kmod_name); + int (*kernel_module_from_file)(struct file *file); + int (*task_fix_setuid)(struct cred *new, const struct cred *old, + int flags); + int (*task_setpgid)(struct task_struct *p, pid_t pgid); + int (*task_getpgid)(struct task_struct *p); + int (*task_getsid)(struct task_struct *p); + void (*task_getsecid)(struct task_struct *p, u32 *secid); + int (*task_setnice)(struct task_struct *p, int nice); + int (*task_setioprio)(struct task_struct *p, int ioprio); + int (*task_getioprio)(struct task_struct *p); + int (*task_setrlimit)(struct task_struct *p, unsigned int resource, + struct rlimit *new_rlim); + int (*task_setscheduler)(struct task_struct *p); + int (*task_getscheduler)(struct task_struct *p); + int (*task_movememory)(struct task_struct *p); + int (*task_kill)(struct task_struct *p, struct siginfo *info, + int sig, u32 secid); + int 
(*task_wait)(struct task_struct *p); + int (*task_prctl)(int option, unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5); + void (*task_to_inode)(struct task_struct *p, struct inode *inode); + + int (*ipc_permission)(struct kern_ipc_perm *ipcp, short flag); + void (*ipc_getsecid)(struct kern_ipc_perm *ipcp, u32 *secid); + + int (*msg_msg_alloc_security)(struct msg_msg *msg); + void (*msg_msg_free_security)(struct msg_msg *msg); + + int (*msg_queue_alloc_security)(struct msg_queue *msq); + void (*msg_queue_free_security)(struct msg_queue *msq); + int (*msg_queue_associate)(struct msg_queue *msq, int msqflg); + int (*msg_queue_msgctl)(struct msg_queue *msq, int cmd); + int (*msg_queue_msgsnd)(struct msg_queue *msq, struct msg_msg *msg, + int msqflg); + int (*msg_queue_msgrcv)(struct msg_queue *msq, struct msg_msg *msg, + struct task_struct *target, long type, + int mode); + + int (*shm_alloc_security)(struct shmid_kernel *shp); + void (*shm_free_security)(struct shmid_kernel *shp); + int (*shm_associate)(struct shmid_kernel *shp, int shmflg); + int (*shm_shmctl)(struct shmid_kernel *shp, int cmd); + int (*shm_shmat)(struct shmid_kernel *shp, char __user *shmaddr, + int shmflg); + + int (*sem_alloc_security)(struct sem_array *sma); + void (*sem_free_security)(struct sem_array *sma); + int (*sem_associate)(struct sem_array *sma, int semflg); + int (*sem_semctl)(struct sem_array *sma, int cmd); + int (*sem_semop)(struct sem_array *sma, struct sembuf *sops, + unsigned nsops, int alter); + + int (*netlink_send)(struct sock *sk, struct sk_buff *skb); + + void (*d_instantiate)(struct dentry *dentry, struct inode *inode); + + int (*getprocattr)(struct task_struct *p, char *name, char **value); + int (*setprocattr)(struct task_struct *p, char *name, void *value, + size_t size); + int (*ismaclabel)(const char *name); + int (*secid_to_secctx)(u32 secid, char **secdata, u32 *seclen); + int (*secctx_to_secid)(const char *secdata, u32 seclen, u32 
*secid); + void (*release_secctx)(char *secdata, u32 seclen); + + int (*inode_notifysecctx)(struct inode *inode, void *ctx, u32 ctxlen); + int (*inode_setsecctx)(struct dentry *dentry, void *ctx, u32 ctxlen); + int (*inode_getsecctx)(struct inode *inode, void **ctx, u32 *ctxlen); + +#ifdef CONFIG_SECURITY_NETWORK + int (*unix_stream_connect)(struct sock *sock, struct sock *other, + struct sock *newsk); + int (*unix_may_send)(struct socket *sock, struct socket *other); + + int (*socket_create)(int family, int type, int protocol, int kern); + int (*socket_post_create)(struct socket *sock, int family, int type, + int protocol, int kern); + int (*socket_bind)(struct socket *sock, struct sockaddr *address, + int addrlen); + int (*socket_connect)(struct socket *sock, struct sockaddr *address, + int addrlen); + int (*socket_listen)(struct socket *sock, int backlog); + int (*socket_accept)(struct socket *sock, struct socket *newsock); + int (*socket_sendmsg)(struct socket *sock, struct msghdr *msg, + int size); + int (*socket_recvmsg)(struct socket *sock, struct msghdr *msg, + int size, int flags); + int (*socket_getsockname)(struct socket *sock); + int (*socket_getpeername)(struct socket *sock); + int (*socket_getsockopt)(struct socket *sock, int level, int optname); + int (*socket_setsockopt)(struct socket *sock, int level, int optname); + int (*socket_shutdown)(struct socket *sock, int how); + int (*socket_sock_rcv_skb)(struct sock *sk, struct sk_buff *skb); + int (*socket_getpeersec_stream)(struct socket *sock, + char __user *optval, + int __user *optlen, unsigned len); + int (*socket_getpeersec_dgram)(struct socket *sock, + struct sk_buff *skb, u32 *secid); + int (*sk_alloc_security)(struct sock *sk, int family, gfp_t priority); + void (*sk_free_security)(struct sock *sk); + void (*sk_clone_security)(const struct sock *sk, struct sock *newsk); + void (*sk_getsecid)(struct sock *sk, u32 *secid); + void (*sock_graft)(struct sock *sk, struct socket *parent); + int 
(*inet_conn_request)(struct sock *sk, struct sk_buff *skb, + struct request_sock *req); + void (*inet_csk_clone)(struct sock *newsk, + const struct request_sock *req); + void (*inet_conn_established)(struct sock *sk, struct sk_buff *skb); + int (*secmark_relabel_packet)(u32 secid); + void (*secmark_refcount_inc)(void); + void (*secmark_refcount_dec)(void); + void (*req_classify_flow)(const struct request_sock *req, + struct flowi *fl); + int (*tun_dev_alloc_security)(void **security); + void (*tun_dev_free_security)(void *security); + int (*tun_dev_create)(void); + int (*tun_dev_attach_queue)(void *security); + int (*tun_dev_attach)(struct sock *sk, void *security); + int (*tun_dev_open)(void *security); +#endif /* CONFIG_SECURITY_NETWORK */ + +#ifdef CONFIG_SECURITY_NETWORK_XFRM + int (*xfrm_policy_alloc_security)(struct xfrm_sec_ctx **ctxp, + struct xfrm_user_sec_ctx *sec_ctx, + gfp_t gfp); + int (*xfrm_policy_clone_security)(struct xfrm_sec_ctx *old_ctx, + struct xfrm_sec_ctx **new_ctx); + void (*xfrm_policy_free_security)(struct xfrm_sec_ctx *ctx); + int (*xfrm_policy_delete_security)(struct xfrm_sec_ctx *ctx); + int (*xfrm_state_alloc)(struct xfrm_state *x, + struct xfrm_user_sec_ctx *sec_ctx); + int (*xfrm_state_alloc_acquire)(struct xfrm_state *x, + struct xfrm_sec_ctx *polsec, + u32 secid); + void (*xfrm_state_free_security)(struct xfrm_state *x); + int (*xfrm_state_delete_security)(struct xfrm_state *x); + int (*xfrm_policy_lookup)(struct xfrm_sec_ctx *ctx, u32 fl_secid, + u8 dir); + int (*xfrm_state_pol_flow_match)(struct xfrm_state *x, + struct xfrm_policy *xp, + const struct flowi *fl); + int (*xfrm_decode_session)(struct sk_buff *skb, u32 *secid, int ckall); +#endif /* CONFIG_SECURITY_NETWORK_XFRM */ + + /* key management security hooks */ +#ifdef CONFIG_KEYS + int (*key_alloc)(struct key *key, const struct cred *cred, + unsigned long flags); + void (*key_free)(struct key *key); + int (*key_permission)(key_ref_t key_ref, const struct cred *cred, + 
unsigned perm); + int (*key_getsecurity)(struct key *key, char **_buffer); +#endif /* CONFIG_KEYS */ + +#ifdef CONFIG_AUDIT + int (*audit_rule_init)(u32 field, u32 op, char *rulestr, + void **lsmrule); + int (*audit_rule_known)(struct audit_krule *krule); + int (*audit_rule_match)(u32 secid, u32 field, u32 op, void *lsmrule, + struct audit_context *actx); + void (*audit_rule_free)(void *lsmrule); +#endif /* CONFIG_AUDIT */ +}; + +/* prototypes */ +extern int security_module_enable(struct security_operations *ops); +extern int register_security(struct security_operations *ops); +extern void __init security_fixup_ops(struct security_operations *ops); +extern void reset_security_ops(void); + +#endif /* CONFIG_SECURITY */ + +#endif /* ! __LINUX_LSM_HOOKS_H */ diff --git a/include/linux/security.h b/include/linux/security.h index 18264ea9e314..f3d42c636f27 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -116,8 +116,6 @@ struct seq_file; extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb); -void reset_security_ops(void); - #ifdef CONFIG_MMU extern unsigned long mmap_min_addr; extern unsigned long dac_mmap_min_addr; @@ -1457,312 +1455,9 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @ctxlen points to the place to put the length of @ctx. * This is the main security structure. 
*/ -struct security_operations { - char name[SECURITY_NAME_MAX + 1]; - - int (*binder_set_context_mgr) (struct task_struct *mgr); - int (*binder_transaction) (struct task_struct *from, - struct task_struct *to); - int (*binder_transfer_binder) (struct task_struct *from, - struct task_struct *to); - int (*binder_transfer_file) (struct task_struct *from, - struct task_struct *to, struct file *file); - - int (*ptrace_access_check) (struct task_struct *child, unsigned int mode); - int (*ptrace_traceme) (struct task_struct *parent); - int (*capget) (struct task_struct *target, - kernel_cap_t *effective, - kernel_cap_t *inheritable, kernel_cap_t *permitted); - int (*capset) (struct cred *new, - const struct cred *old, - const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted); - int (*capable) (const struct cred *cred, struct user_namespace *ns, - int cap, int audit); - int (*quotactl) (int cmds, int type, int id, struct super_block *sb); - int (*quota_on) (struct dentry *dentry); - int (*syslog) (int type); - int (*settime) (const struct timespec *ts, const struct timezone *tz); - int (*vm_enough_memory) (struct mm_struct *mm, long pages); - - int (*bprm_set_creds) (struct linux_binprm *bprm); - int (*bprm_check_security) (struct linux_binprm *bprm); - int (*bprm_secureexec) (struct linux_binprm *bprm); - void (*bprm_committing_creds) (struct linux_binprm *bprm); - void (*bprm_committed_creds) (struct linux_binprm *bprm); - - int (*sb_alloc_security) (struct super_block *sb); - void (*sb_free_security) (struct super_block *sb); - int (*sb_copy_data) (char *orig, char *copy); - int (*sb_remount) (struct super_block *sb, void *data); - int (*sb_kern_mount) (struct super_block *sb, int flags, void *data); - int (*sb_show_options) (struct seq_file *m, struct super_block *sb); - int (*sb_statfs) (struct dentry *dentry); - int (*sb_mount) (const char *dev_name, struct path *path, - const char *type, unsigned long flags, void *data); - 
int (*sb_umount) (struct vfsmount *mnt, int flags); - int (*sb_pivotroot) (struct path *old_path, - struct path *new_path); - int (*sb_set_mnt_opts) (struct super_block *sb, - struct security_mnt_opts *opts, - unsigned long kern_flags, - unsigned long *set_kern_flags); - int (*sb_clone_mnt_opts) (const struct super_block *oldsb, - struct super_block *newsb); - int (*sb_parse_opts_str) (char *options, struct security_mnt_opts *opts); - int (*dentry_init_security) (struct dentry *dentry, int mode, - struct qstr *name, void **ctx, - u32 *ctxlen); - - -#ifdef CONFIG_SECURITY_PATH - int (*path_unlink) (struct path *dir, struct dentry *dentry); - int (*path_mkdir) (struct path *dir, struct dentry *dentry, umode_t mode); - int (*path_rmdir) (struct path *dir, struct dentry *dentry); - int (*path_mknod) (struct path *dir, struct dentry *dentry, umode_t mode, - unsigned int dev); - int (*path_truncate) (struct path *path); - int (*path_symlink) (struct path *dir, struct dentry *dentry, - const char *old_name); - int (*path_link) (struct dentry *old_dentry, struct path *new_dir, - struct dentry *new_dentry); - int (*path_rename) (struct path *old_dir, struct dentry *old_dentry, - struct path *new_dir, struct dentry *new_dentry); - int (*path_chmod) (struct path *path, umode_t mode); - int (*path_chown) (struct path *path, kuid_t uid, kgid_t gid); - int (*path_chroot) (struct path *path); -#endif - - int (*inode_alloc_security) (struct inode *inode); - void (*inode_free_security) (struct inode *inode); - int (*inode_init_security) (struct inode *inode, struct inode *dir, - const struct qstr *qstr, const char **name, - void **value, size_t *len); - int (*inode_create) (struct inode *dir, - struct dentry *dentry, umode_t mode); - int (*inode_link) (struct dentry *old_dentry, - struct inode *dir, struct dentry *new_dentry); - int (*inode_unlink) (struct inode *dir, struct dentry *dentry); - int (*inode_symlink) (struct inode *dir, - struct dentry *dentry, const char *old_name); 
- int (*inode_mkdir) (struct inode *dir, struct dentry *dentry, umode_t mode); - int (*inode_rmdir) (struct inode *dir, struct dentry *dentry); - int (*inode_mknod) (struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t dev); - int (*inode_rename) (struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry); - int (*inode_readlink) (struct dentry *dentry); - int (*inode_follow_link) (struct dentry *dentry, struct nameidata *nd); - int (*inode_permission) (struct inode *inode, int mask); - int (*inode_setattr) (struct dentry *dentry, struct iattr *attr); - int (*inode_getattr) (const struct path *path); - int (*inode_setxattr) (struct dentry *dentry, const char *name, - const void *value, size_t size, int flags); - void (*inode_post_setxattr) (struct dentry *dentry, const char *name, - const void *value, size_t size, int flags); - int (*inode_getxattr) (struct dentry *dentry, const char *name); - int (*inode_listxattr) (struct dentry *dentry); - int (*inode_removexattr) (struct dentry *dentry, const char *name); - int (*inode_need_killpriv) (struct dentry *dentry); - int (*inode_killpriv) (struct dentry *dentry); - int (*inode_getsecurity) (const struct inode *inode, const char *name, void **buffer, bool alloc); - int (*inode_setsecurity) (struct inode *inode, const char *name, const void *value, size_t size, int flags); - int (*inode_listsecurity) (struct inode *inode, char *buffer, size_t buffer_size); - void (*inode_getsecid) (const struct inode *inode, u32 *secid); - - int (*file_permission) (struct file *file, int mask); - int (*file_alloc_security) (struct file *file); - void (*file_free_security) (struct file *file); - int (*file_ioctl) (struct file *file, unsigned int cmd, - unsigned long arg); - int (*mmap_addr) (unsigned long addr); - int (*mmap_file) (struct file *file, - unsigned long reqprot, unsigned long prot, - unsigned long flags); - int (*file_mprotect) (struct vm_area_struct *vma, - unsigned long 
reqprot, - unsigned long prot); - int (*file_lock) (struct file *file, unsigned int cmd); - int (*file_fcntl) (struct file *file, unsigned int cmd, - unsigned long arg); - void (*file_set_fowner) (struct file *file); - int (*file_send_sigiotask) (struct task_struct *tsk, - struct fown_struct *fown, int sig); - int (*file_receive) (struct file *file); - int (*file_open) (struct file *file, const struct cred *cred); - - int (*task_create) (unsigned long clone_flags); - void (*task_free) (struct task_struct *task); - int (*cred_alloc_blank) (struct cred *cred, gfp_t gfp); - void (*cred_free) (struct cred *cred); - int (*cred_prepare)(struct cred *new, const struct cred *old, - gfp_t gfp); - void (*cred_transfer)(struct cred *new, const struct cred *old); - int (*kernel_act_as)(struct cred *new, u32 secid); - int (*kernel_create_files_as)(struct cred *new, struct inode *inode); - int (*kernel_fw_from_file)(struct file *file, char *buf, size_t size); - int (*kernel_module_request)(char *kmod_name); - int (*kernel_module_from_file)(struct file *file); - int (*task_fix_setuid) (struct cred *new, const struct cred *old, - int flags); - int (*task_setpgid) (struct task_struct *p, pid_t pgid); - int (*task_getpgid) (struct task_struct *p); - int (*task_getsid) (struct task_struct *p); - void (*task_getsecid) (struct task_struct *p, u32 *secid); - int (*task_setnice) (struct task_struct *p, int nice); - int (*task_setioprio) (struct task_struct *p, int ioprio); - int (*task_getioprio) (struct task_struct *p); - int (*task_setrlimit) (struct task_struct *p, unsigned int resource, - struct rlimit *new_rlim); - int (*task_setscheduler) (struct task_struct *p); - int (*task_getscheduler) (struct task_struct *p); - int (*task_movememory) (struct task_struct *p); - int (*task_kill) (struct task_struct *p, - struct siginfo *info, int sig, u32 secid); - int (*task_wait) (struct task_struct *p); - int (*task_prctl) (int option, unsigned long arg2, - unsigned long arg3, unsigned long 
arg4, - unsigned long arg5); - void (*task_to_inode) (struct task_struct *p, struct inode *inode); - - int (*ipc_permission) (struct kern_ipc_perm *ipcp, short flag); - void (*ipc_getsecid) (struct kern_ipc_perm *ipcp, u32 *secid); - - int (*msg_msg_alloc_security) (struct msg_msg *msg); - void (*msg_msg_free_security) (struct msg_msg *msg); - - int (*msg_queue_alloc_security) (struct msg_queue *msq); - void (*msg_queue_free_security) (struct msg_queue *msq); - int (*msg_queue_associate) (struct msg_queue *msq, int msqflg); - int (*msg_queue_msgctl) (struct msg_queue *msq, int cmd); - int (*msg_queue_msgsnd) (struct msg_queue *msq, - struct msg_msg *msg, int msqflg); - int (*msg_queue_msgrcv) (struct msg_queue *msq, - struct msg_msg *msg, - struct task_struct *target, - long type, int mode); - - int (*shm_alloc_security) (struct shmid_kernel *shp); - void (*shm_free_security) (struct shmid_kernel *shp); - int (*shm_associate) (struct shmid_kernel *shp, int shmflg); - int (*shm_shmctl) (struct shmid_kernel *shp, int cmd); - int (*shm_shmat) (struct shmid_kernel *shp, - char __user *shmaddr, int shmflg); - - int (*sem_alloc_security) (struct sem_array *sma); - void (*sem_free_security) (struct sem_array *sma); - int (*sem_associate) (struct sem_array *sma, int semflg); - int (*sem_semctl) (struct sem_array *sma, int cmd); - int (*sem_semop) (struct sem_array *sma, - struct sembuf *sops, unsigned nsops, int alter); - - int (*netlink_send) (struct sock *sk, struct sk_buff *skb); - - void (*d_instantiate) (struct dentry *dentry, struct inode *inode); - - int (*getprocattr) (struct task_struct *p, char *name, char **value); - int (*setprocattr) (struct task_struct *p, char *name, void *value, size_t size); - int (*ismaclabel) (const char *name); - int (*secid_to_secctx) (u32 secid, char **secdata, u32 *seclen); - int (*secctx_to_secid) (const char *secdata, u32 seclen, u32 *secid); - void (*release_secctx) (char *secdata, u32 seclen); - - int (*inode_notifysecctx)(struct 
inode *inode, void *ctx, u32 ctxlen); - int (*inode_setsecctx)(struct dentry *dentry, void *ctx, u32 ctxlen); - int (*inode_getsecctx)(struct inode *inode, void **ctx, u32 *ctxlen); - -#ifdef CONFIG_SECURITY_NETWORK - int (*unix_stream_connect) (struct sock *sock, struct sock *other, struct sock *newsk); - int (*unix_may_send) (struct socket *sock, struct socket *other); - - int (*socket_create) (int family, int type, int protocol, int kern); - int (*socket_post_create) (struct socket *sock, int family, - int type, int protocol, int kern); - int (*socket_bind) (struct socket *sock, - struct sockaddr *address, int addrlen); - int (*socket_connect) (struct socket *sock, - struct sockaddr *address, int addrlen); - int (*socket_listen) (struct socket *sock, int backlog); - int (*socket_accept) (struct socket *sock, struct socket *newsock); - int (*socket_sendmsg) (struct socket *sock, - struct msghdr *msg, int size); - int (*socket_recvmsg) (struct socket *sock, - struct msghdr *msg, int size, int flags); - int (*socket_getsockname) (struct socket *sock); - int (*socket_getpeername) (struct socket *sock); - int (*socket_getsockopt) (struct socket *sock, int level, int optname); - int (*socket_setsockopt) (struct socket *sock, int level, int optname); - int (*socket_shutdown) (struct socket *sock, int how); - int (*socket_sock_rcv_skb) (struct sock *sk, struct sk_buff *skb); - int (*socket_getpeersec_stream) (struct socket *sock, char __user *optval, int __user *optlen, unsigned len); - int (*socket_getpeersec_dgram) (struct socket *sock, struct sk_buff *skb, u32 *secid); - int (*sk_alloc_security) (struct sock *sk, int family, gfp_t priority); - void (*sk_free_security) (struct sock *sk); - void (*sk_clone_security) (const struct sock *sk, struct sock *newsk); - void (*sk_getsecid) (struct sock *sk, u32 *secid); - void (*sock_graft) (struct sock *sk, struct socket *parent); - int (*inet_conn_request) (struct sock *sk, struct sk_buff *skb, - struct request_sock *req); - 
void (*inet_csk_clone) (struct sock *newsk, const struct request_sock *req); - void (*inet_conn_established) (struct sock *sk, struct sk_buff *skb); - int (*secmark_relabel_packet) (u32 secid); - void (*secmark_refcount_inc) (void); - void (*secmark_refcount_dec) (void); - void (*req_classify_flow) (const struct request_sock *req, struct flowi *fl); - int (*tun_dev_alloc_security) (void **security); - void (*tun_dev_free_security) (void *security); - int (*tun_dev_create) (void); - int (*tun_dev_attach_queue) (void *security); - int (*tun_dev_attach) (struct sock *sk, void *security); - int (*tun_dev_open) (void *security); -#endif /* CONFIG_SECURITY_NETWORK */ - -#ifdef CONFIG_SECURITY_NETWORK_XFRM - int (*xfrm_policy_alloc_security) (struct xfrm_sec_ctx **ctxp, - struct xfrm_user_sec_ctx *sec_ctx, gfp_t gfp); - int (*xfrm_policy_clone_security) (struct xfrm_sec_ctx *old_ctx, struct xfrm_sec_ctx **new_ctx); - void (*xfrm_policy_free_security) (struct xfrm_sec_ctx *ctx); - int (*xfrm_policy_delete_security) (struct xfrm_sec_ctx *ctx); - int (*xfrm_state_alloc) (struct xfrm_state *x, - struct xfrm_user_sec_ctx *sec_ctx); - int (*xfrm_state_alloc_acquire) (struct xfrm_state *x, - struct xfrm_sec_ctx *polsec, - u32 secid); - void (*xfrm_state_free_security) (struct xfrm_state *x); - int (*xfrm_state_delete_security) (struct xfrm_state *x); - int (*xfrm_policy_lookup) (struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir); - int (*xfrm_state_pol_flow_match) (struct xfrm_state *x, - struct xfrm_policy *xp, - const struct flowi *fl); - int (*xfrm_decode_session) (struct sk_buff *skb, u32 *secid, int ckall); -#endif /* CONFIG_SECURITY_NETWORK_XFRM */ - - /* key management security hooks */ -#ifdef CONFIG_KEYS - int (*key_alloc) (struct key *key, const struct cred *cred, unsigned long flags); - void (*key_free) (struct key *key); - int (*key_permission) (key_ref_t key_ref, - const struct cred *cred, - unsigned perm); - int (*key_getsecurity)(struct key *key, char **_buffer); 
-#endif /* CONFIG_KEYS */ - -#ifdef CONFIG_AUDIT - int (*audit_rule_init) (u32 field, u32 op, char *rulestr, void **lsmrule); - int (*audit_rule_known) (struct audit_krule *krule); - int (*audit_rule_match) (u32 secid, u32 field, u32 op, void *lsmrule, - struct audit_context *actx); - void (*audit_rule_free) (void *lsmrule); -#endif /* CONFIG_AUDIT */ -}; /* prototypes */ extern int security_init(void); -extern int security_module_enable(struct security_operations *ops); -extern int register_security(struct security_operations *ops); -extern void __init security_fixup_ops(struct security_operations *ops); - /* Security operations */ int security_binder_set_context_mgr(struct task_struct *mgr); -- cgit v1.2.3 From fe7bb272ee72b5cc377e02b556d0d718d12bbede Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 2 May 2015 15:10:53 -0700 Subject: LSM: Add the comment to lsm_hooks.h Add the large comment describing the content of the security_operations structure to lsm_hooks.h. This wasn't done in the previous (1/7) patch because it would have exceeded the mail list size limits. Signed-off-by: Casey Schaufler Acked-by: John Johansen Acked-by: Kees Cook Acked-by: Paul Moore Acked-by: Stephen Smalley Acked-by: Tetsuo Handa Signed-off-by: James Morris --- include/linux/lsm_hooks.h | 1279 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1279 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index c60f81b2d18c..b4c91de510c2 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -31,6 +31,1285 @@ #ifdef CONFIG_SECURITY +/** + * struct security_operations - main security structure + * + * Security module identifier. + * + * @name: + * A string that acts as a unique identifier for the LSM with max number + * of characters = SECURITY_NAME_MAX. + * + * Security hooks for program execution operations. 
+ * + * @bprm_set_creds: + * Save security information in the bprm->security field, typically based + * on information about the bprm->file, for later use by the apply_creds + * hook. This hook may also optionally check permissions (e.g. for + * transitions between security domains). + * This hook may be called multiple times during a single execve, e.g. for + * interpreters. The hook can tell whether it has already been called by + * checking to see if @bprm->security is non-NULL. If so, then the hook + * may decide either to retain the security information saved earlier or + * to replace it. + * @bprm contains the linux_binprm structure. + * Return 0 if the hook is successful and permission is granted. + * @bprm_check_security: + * This hook mediates the point when a search for a binary handler will + * begin. It allows a check of the @bprm->security value which is set in the + * preceding set_creds call. The primary difference from set_creds is + * that the argv list and envp list are reliably available in @bprm. This + * hook may be called multiple times during a single execve; and in each + * pass set_creds is called first. + * @bprm contains the linux_binprm structure. + * Return 0 if the hook is successful and permission is granted. + * @bprm_committing_creds: + * Prepare to install the new security attributes of a process being + * transformed by an execve operation, based on the old credentials + * pointed to by @current->cred and the information set in @bprm->cred by + * the bprm_set_creds hook. @bprm points to the linux_binprm structure. + * This hook is a good place to perform state changes on the process such + * as closing open file descriptors to which access will no longer be + * granted when the attributes are changed. This is called immediately + * before commit_creds(). + * @bprm_committed_creds: + * Tidy up after the installation of the new security attributes of a + * process being transformed by an execve operation.
The new credentials + * have, by this point, been set to @current->cred. @bprm points to the + * linux_binprm structure. This hook is a good place to perform state + * changes on the process such as clearing out non-inheritable signal + * state. This is called immediately after commit_creds(). + * @bprm_secureexec: + * Return a boolean value (0 or 1) indicating whether a "secure exec" + * is required. The flag is passed in the auxiliary table + * on the initial stack to the ELF interpreter to indicate whether libc + * should enable secure mode. + * @bprm contains the linux_binprm structure. + * + * Security hooks for filesystem operations. + * + * @sb_alloc_security: + * Allocate and attach a security structure to the sb->s_security field. + * The s_security field is initialized to NULL when the structure is + * allocated. + * @sb contains the super_block structure to be modified. + * Return 0 if operation was successful. + * @sb_free_security: + * Deallocate and clear the sb->s_security field. + * @sb contains the super_block structure to be modified. + * @sb_statfs: + * Check permission before obtaining filesystem statistics for the @mnt + * mountpoint. + * @dentry is a handle on the superblock for the filesystem. + * Return 0 if permission is granted. + * @sb_mount: + * Check permission before an object specified by @dev_name is mounted on + * the mount point named by @nd. For an ordinary mount, @dev_name + * identifies a device if the file system type requires a device. For a + * remount (@flags & MS_REMOUNT), @dev_name is irrelevant. For a + * loopback/bind mount (@flags & MS_BIND), @dev_name identifies the + * pathname of the object being mounted. + * @dev_name contains the name for object being mounted. + * @path contains the path for mount point object. + * @type contains the filesystem type. + * @flags contains the mount flags. + * @data contains the filesystem-specific data. + * Return 0 if permission is granted. 
+ * @sb_copy_data: + * Allow mount option data to be copied prior to parsing by the filesystem, + * so that the security module can extract security-specific mount + * options cleanly (a filesystem may modify the data e.g. with strsep()). + * This also allows the original mount data to be stripped of security- + * specific options to avoid having to make filesystems aware of them. + * @type the type of filesystem being mounted. + * @orig the original mount data copied from userspace. + * @copy copied data which will be passed to the security module. + * Returns 0 if the copy was successful. + * @sb_remount: + * Extracts security system specific mount options and verifies no changes + * are being made to those options. + * @sb superblock being remounted + * @data contains the filesystem-specific data. + * Return 0 if permission is granted. + * @sb_umount: + * Check permission before the @mnt file system is unmounted. + * @mnt contains the mounted file system. + * @flags contains the unmount flags, e.g. MNT_FORCE. + * Return 0 if permission is granted. + * @sb_pivotroot: + * Check permission before pivoting the root filesystem. + * @old_path contains the path for the new location of the + * current root (put_old). + * @new_path contains the path for the new root (new_root). + * Return 0 if permission is granted. 
+ * @sb_set_mnt_opts: + * Set the security relevant mount options used for a superblock + * @sb the superblock to set security mount options for + * @opts binary data structure containing all lsm mount data + * @sb_clone_mnt_opts: + * Copy all security options from a given superblock to another + * @oldsb old superblock which contains information to clone + * @newsb new superblock which needs to be filled in + * @sb_parse_opts_str: + * Parse a string of security data filling in the opts structure + * @options string containing all mount options known by the LSM + * @opts binary data structure usable by the LSM + * @dentry_init_security: + * Compute a context for a dentry as the inode is not yet available + * since NFSv4 has no label backed by an EA anyway. + * @dentry dentry to use in calculating the context. + * @mode mode used to determine resource type. + * @name name of the last path component used to create file + * @ctx pointer to place the pointer to the resulting context in. + * @ctxlen pointer to place the length of the resulting context. + * + * + * Security hooks for inode operations. + * + * @inode_alloc_security: + * Allocate and attach a security structure to @inode->i_security. The + * i_security field is initialized to NULL when the inode structure is + * allocated. + * @inode contains the inode structure. + * Return 0 if operation was successful. + * @inode_free_security: + * @inode contains the inode structure. + * Deallocate the inode security structure and set @inode->i_security to + * NULL. + * @inode_init_security: + * Obtain the security attribute name suffix and value to set on a newly + * created inode and set up the incore security field for the new inode. + * This hook is called by the fs code as part of the inode creation + * transaction and provides for atomic labeling of the inode, unlike + * the post_create/mkdir/... hooks called by the VFS.
The hook function + * is expected to allocate the name and value via kmalloc, with the caller + * being responsible for calling kfree after using them. + * If the security module does not use security attributes or does + * not wish to put a security attribute on this particular inode, + * then it should return -EOPNOTSUPP to skip this processing. + * @inode contains the inode structure of the newly created inode. + * @dir contains the inode structure of the parent directory. + * @qstr contains the last path component of the new object + * @name will be set to the allocated name suffix (e.g. selinux). + * @value will be set to the allocated attribute value. + * @len will be set to the length of the value. + * Returns 0 if @name and @value have been successfully set, + * -EOPNOTSUPP if no security attribute is needed, or + * -ENOMEM on memory allocation failure. + * @inode_create: + * Check permission to create a regular file. + * @dir contains inode structure of the parent of the new file. + * @dentry contains the dentry structure for the file to be created. + * @mode contains the file mode of the file to be created. + * Return 0 if permission is granted. + * @inode_link: + * Check permission before creating a new hard link to a file. + * @old_dentry contains the dentry structure for an existing + * link to the file. + * @dir contains the inode structure of the parent directory + * of the new link. + * @new_dentry contains the dentry structure for the new link. + * Return 0 if permission is granted. + * @path_link: + * Check permission before creating a new hard link to a file. + * @old_dentry contains the dentry structure for an existing link + * to the file. + * @new_dir contains the path structure of the parent directory of + * the new link. + * @new_dentry contains the dentry structure for the new link. + * Return 0 if permission is granted. + * @inode_unlink: + * Check the permission to remove a hard link to a file. 
+ * @dir contains the inode structure of parent directory of the file. + * @dentry contains the dentry structure for file to be unlinked. + * Return 0 if permission is granted. + * @path_unlink: + * Check the permission to remove a hard link to a file. + * @dir contains the path structure of parent directory of the file. + * @dentry contains the dentry structure for file to be unlinked. + * Return 0 if permission is granted. + * @inode_symlink: + * Check the permission to create a symbolic link to a file. + * @dir contains the inode structure of parent directory of + * the symbolic link. + * @dentry contains the dentry structure of the symbolic link. + * @old_name contains the pathname of file. + * Return 0 if permission is granted. + * @path_symlink: + * Check the permission to create a symbolic link to a file. + * @dir contains the path structure of parent directory of + * the symbolic link. + * @dentry contains the dentry structure of the symbolic link. + * @old_name contains the pathname of file. + * Return 0 if permission is granted. + * @inode_mkdir: + * Check permissions to create a new directory in the existing directory + * associated with inode structure @dir. + * @dir contains the inode structure of parent of the directory + * to be created. + * @dentry contains the dentry structure of new directory. + * @mode contains the mode of new directory. + * Return 0 if permission is granted. + * @path_mkdir: + * Check permissions to create a new directory in the existing directory + * associated with path structure @path. + * @dir contains the path structure of parent of the directory + * to be created. + * @dentry contains the dentry structure of new directory. + * @mode contains the mode of new directory. + * Return 0 if permission is granted. + * @inode_rmdir: + * Check the permission to remove a directory. + * @dir contains the inode structure of parent of the directory + * to be removed. + * @dentry contains the dentry structure of directory to be removed. 
+ * Return 0 if permission is granted. + * @path_rmdir: + * Check the permission to remove a directory. + * @dir contains the path structure of parent of the directory to be + * removed. + * @dentry contains the dentry structure of directory to be removed. + * Return 0 if permission is granted. + * @inode_mknod: + * Check permissions when creating a special file (or a socket or a fifo + * file created via the mknod system call). Note that if mknod operation + * is being done for a regular file, then the create hook will be called + * and not this hook. + * @dir contains the inode structure of parent of the new file. + * @dentry contains the dentry structure of the new file. + * @mode contains the mode of the new file. + * @dev contains the device number. + * Return 0 if permission is granted. + * @path_mknod: + * Check permissions when creating a file. Note that this hook is called + * even if mknod operation is being done for a regular file. + * @dir contains the path structure of parent of the new file. + * @dentry contains the dentry structure of the new file. + * @mode contains the mode of the new file. + * @dev contains the undecoded device number. Use new_decode_dev() to get + * the decoded device number. + * Return 0 if permission is granted. + * @inode_rename: + * Check for permission to rename a file or directory. + * @old_dir contains the inode structure for parent of the old link. + * @old_dentry contains the dentry structure of the old link. + * @new_dir contains the inode structure for parent of the new link. + * @new_dentry contains the dentry structure of the new link. + * Return 0 if permission is granted. + * @path_rename: + * Check for permission to rename a file or directory. + * @old_dir contains the path structure for parent of the old link. + * @old_dentry contains the dentry structure of the old link. + * @new_dir contains the path structure for parent of the new link. + * @new_dentry contains the dentry structure of the new link. 
+ * Return 0 if permission is granted. + * @path_chmod: + * Check for permission to change DAC's permission of a file or directory. + * @dentry contains the dentry structure. + * @mnt contains the vfsmnt structure. + * @mode contains DAC's mode. + * Return 0 if permission is granted. + * @path_chown: + * Check for permission to change owner/group of a file or directory. + * @path contains the path structure. + * @uid contains new owner's ID. + * @gid contains new group's ID. + * Return 0 if permission is granted. + * @path_chroot: + * Check for permission to change root directory. + * @path contains the path structure. + * Return 0 if permission is granted. + * @inode_readlink: + * Check the permission to read the symbolic link. + * @dentry contains the dentry structure for the file link. + * Return 0 if permission is granted. + * @inode_follow_link: + * Check permission to follow a symbolic link when looking up a pathname. + * @dentry contains the dentry structure for the link. + * @nd contains the nameidata structure for the parent directory. + * Return 0 if permission is granted. + * @inode_permission: + * Check permission before accessing an inode. This hook is called by the + * existing Linux permission function, so a security module can use it to + * provide additional checking for existing Linux permission checks. + * Notice that this hook is called when a file is opened (as well as many + * other operations), whereas the file_security_ops permission hook is + * called when the actual read/write operations are performed. + * @inode contains the inode structure to check. + * @mask contains the permission mask. + * Return 0 if permission is granted. + * @inode_setattr: + * Check permission before setting file attributes. Note that the kernel + * call to notify_change is performed from several locations, whenever + * file attributes change (such as when a file is truncated, chown/chmod + * operations, transferring disk quotas, etc). 
+ * @dentry contains the dentry structure for the file. + * @attr is the iattr structure containing the new file attributes. + * Return 0 if permission is granted. + * @path_truncate: + * Check permission before truncating a file. + * @path contains the path structure for the file. + * Return 0 if permission is granted. + * @inode_getattr: + * Check permission before obtaining file attributes. + * @mnt is the vfsmount where the dentry was looked up + * @dentry contains the dentry structure for the file. + * Return 0 if permission is granted. + * @inode_setxattr: + * Check permission before setting the extended attributes + * @value identified by @name for @dentry. + * Return 0 if permission is granted. + * @inode_post_setxattr: + * Update inode security field after successful setxattr operation. + * @value identified by @name for @dentry. + * @inode_getxattr: + * Check permission before obtaining the extended attributes + * identified by @name for @dentry. + * Return 0 if permission is granted. + * @inode_listxattr: + * Check permission before obtaining the list of extended attribute + * names for @dentry. + * Return 0 if permission is granted. + * @inode_removexattr: + * Check permission before removing the extended attribute + * identified by @name for @dentry. + * Return 0 if permission is granted. + * @inode_getsecurity: + * Retrieve a copy of the extended attribute representation of the + * security label associated with @name for @inode via @buffer. Note that + * @name is the remainder of the attribute name after the security prefix + * has been removed. @alloc is used to specify if the call should return a + * value via the buffer or just the value length. Return size of buffer on + * success. + * @inode_setsecurity: + * Set the security label associated with @name for @inode from the + * extended attribute value @value. @size indicates the size of the + * @value in bytes. @flags may be XATTR_CREATE, XATTR_REPLACE, or 0.
+ * Note that @name is the remainder of the attribute name after the + * security. prefix has been removed. + * Return 0 on success. + * @inode_listsecurity: + * Copy the extended attribute names for the security labels + * associated with @inode into @buffer. The maximum size of @buffer + * is specified by @buffer_size. @buffer may be NULL to request + * the size of the buffer required. + * Returns number of bytes used/required on success. + * @inode_need_killpriv: + * Called when an inode has been changed. + * @dentry is the dentry being changed. + * Return <0 on error to abort the inode change operation. + * Return 0 if inode_killpriv does not need to be called. + * Return >0 if inode_killpriv does need to be called. + * @inode_killpriv: + * The setuid bit is being removed. Remove similar security labels. + * Called with the dentry->d_inode->i_mutex held. + * @dentry is the dentry being changed. + * Return 0 on success. If error is returned, then the operation + * causing setuid bit removal is failed. + * @inode_getsecid: + * Get the secid associated with the node. + * @inode contains a pointer to the inode. + * @secid contains a pointer to the location where result will be saved. + * In case of failure, @secid will be set to zero. + * + * Security hooks for file operations + * + * @file_permission: + * Check file permissions before accessing an open file. This hook is + * called by various operations that read or write files. A security + * module can use this hook to perform additional checking on these + * operations, e.g. to revalidate permissions on use to support privilege + * bracketing or policy changes. Notice that this hook is used when the + * actual read/write operations are performed, whereas the + * inode_security_ops hook is called when a file is opened (as well as + * many other operations). 
+ * Caveat: Although this hook can be used to revalidate permissions for + * various system call operations that read or write files, it does not + * address the revalidation of permissions for memory-mapped files. + * Security modules must handle this separately if they need such + * revalidation. + * @file contains the file structure being accessed. + * @mask contains the requested permissions. + * Return 0 if permission is granted. + * @file_alloc_security: + * Allocate and attach a security structure to the file->f_security field. + * The security field is initialized to NULL when the structure is first + * created. + * @file contains the file structure to secure. + * Return 0 if the hook is successful and permission is granted. + * @file_free_security: + * Deallocate and free any security structures stored in file->f_security. + * @file contains the file structure being modified. + * @file_ioctl: + * @file contains the file structure. + * @cmd contains the operation to perform. + * @arg contains the operational arguments. + * Check permission for an ioctl operation on @file. Note that @arg + * sometimes represents a user space pointer; in other cases, it may be a + * simple integer value. When @arg represents a user space pointer, it + * should never be used by the security module. + * Return 0 if permission is granted. + * @mmap_addr : + * Check permissions for a mmap operation at @addr. + * @addr contains virtual address that will be used for the operation. + * Return 0 if permission is granted. + * @mmap_file : + * Check permissions for a mmap operation. The @file may be NULL, e.g. + * if mapping anonymous memory. + * @file contains the file structure for file to map (may be NULL). + * @reqprot contains the protection requested by the application. + * @prot contains the protection that will be applied by the kernel. + * @flags contains the operational flags. + * Return 0 if permission is granted. 
+ * @file_mprotect: + * Check permissions before changing memory access permissions. + * @vma contains the memory region to modify. + * @reqprot contains the protection requested by the application. + * @prot contains the protection that will be applied by the kernel. + * Return 0 if permission is granted. + * @file_lock: + * Check permission before performing file locking operations. + * Note: this hook mediates both flock and fcntl style locks. + * @file contains the file structure. + * @cmd contains the posix-translated lock operation to perform + * (e.g. F_RDLCK, F_WRLCK). + * Return 0 if permission is granted. + * @file_fcntl: + * Check permission before allowing the file operation specified by @cmd + * from being performed on the file @file. Note that @arg sometimes + * represents a user space pointer; in other cases, it may be a simple + * integer value. When @arg represents a user space pointer, it should + * never be used by the security module. + * @file contains the file structure. + * @cmd contains the operation to be performed. + * @arg contains the operational arguments. + * Return 0 if permission is granted. + * @file_set_fowner: + * Save owner security information (typically from current->security) in + * file->f_security for later use by the send_sigiotask hook. + * @file contains the file structure to update. + * Return 0 on success. + * @file_send_sigiotask: + * Check permission for the file owner @fown to send SIGIO or SIGURG to the + * process @tsk. Note that this hook is sometimes called from interrupt. + * Note that the fown_struct, @fown, is never outside the context of a + * struct file, so the file structure (and associated security information) + * can always be obtained: + * container_of(fown, struct file, f_owner) + * @tsk contains the structure of task receiving signal. + * @fown contains the file owner information. + * @sig is the signal that will be sent. When 0, kernel sends SIGIO. + * Return 0 if permission is granted. 
+ * @file_receive: + * This hook allows security modules to control the ability of a process + * to receive an open file descriptor via socket IPC. + * @file contains the file structure being received. + * Return 0 if permission is granted. + * @file_open: + * Save open-time permission checking state for later use upon + * file_permission, and recheck access if anything has changed + * since inode_permission. + * + * Security hooks for task operations. + * + * @task_create: + * Check permission before creating a child process. See the clone(2) + * manual page for definitions of the @clone_flags. + * @clone_flags contains the flags indicating what should be shared. + * Return 0 if permission is granted. + * @task_free: + * @task task being freed + * Handle release of task-related resources. (Note that this can be called + * from interrupt context.) + * @cred_alloc_blank: + * @cred points to the credentials. + * @gfp indicates the atomicity of any memory allocations. + * Only allocate sufficient memory and attach to @cred such that + * cred_transfer() will not get ENOMEM. + * @cred_free: + * @cred points to the credentials. + * Deallocate and clear the cred->security field in a set of credentials. + * @cred_prepare: + * @new points to the new credentials. + * @old points to the original credentials. + * @gfp indicates the atomicity of any memory allocations. + * Prepare a new set of credentials by copying the data from the old set. + * @cred_transfer: + * @new points to the new credentials. + * @old points to the original credentials. + * Transfer data from original creds to new creds + * @kernel_act_as: + * Set the credentials for a kernel service to act as (subjective context). + * @new points to the credentials to be modified. + * @secid specifies the security ID to be set + * The current task must be the one that nominated @secid. + * Return 0 if successful.
+ * @kernel_create_files_as: + * Set the file creation context in a set of credentials to be the same as + * the objective context of the specified inode. + * @new points to the credentials to be modified. + * @inode points to the inode to use as a reference. + * The current task must be the one that nominated @inode. + * Return 0 if successful. + * @kernel_fw_from_file: + * Load firmware from userspace (not called for built-in firmware). + * @file contains the file structure pointing to the file containing + * the firmware to load. This argument will be NULL if the firmware + * was loaded via the uevent-triggered blob-based interface exposed + * by CONFIG_FW_LOADER_USER_HELPER. + * @buf pointer to buffer containing firmware contents. + * @size length of the firmware contents. + * Return 0 if permission is granted. + * @kernel_module_request: + * Ability to trigger the kernel to automatically upcall to userspace for + * userspace to load a kernel module with the given name. + * @kmod_name name of the module requested by the kernel + * Return 0 if successful. + * @kernel_module_from_file: + * Load a kernel module from userspace. + * @file contains the file structure pointing to the file containing + * the kernel module to load. If the module is being loaded from a blob, + * this argument will be NULL. + * Return 0 if permission is granted. + * @task_fix_setuid: + * Update the module's state after setting one or more of the user + * identity attributes of the current process. The @flags parameter + * indicates which of the set*uid system calls invoked this hook. + * @new is the set of credentials that will be installed. Modifications + * should be made to this rather than to @current->cred. + * @old is the set of credentials that are being replaced. + * @flags contains one of the LSM_SETID_* values. + * Return 0 on success. + * @task_setpgid: + * Check permission before setting the process group identifier of the + * process @p to @pgid.
+ * @p contains the task_struct for process being modified. + * @pgid contains the new pgid. + * Return 0 if permission is granted. + * @task_getpgid: + * Check permission before getting the process group identifier of the + * process @p. + * @p contains the task_struct for the process. + * Return 0 if permission is granted. + * @task_getsid: + * Check permission before getting the session identifier of the process + * @p. + * @p contains the task_struct for the process. + * Return 0 if permission is granted. + * @task_getsecid: + * Retrieve the security identifier of the process @p. + * @p contains the task_struct for the process and place is into @secid. + * In case of failure, @secid will be set to zero. + * + * @task_setnice: + * Check permission before setting the nice value of @p to @nice. + * @p contains the task_struct of process. + * @nice contains the new nice value. + * Return 0 if permission is granted. + * @task_setioprio + * Check permission before setting the ioprio value of @p to @ioprio. + * @p contains the task_struct of process. + * @ioprio contains the new ioprio value + * Return 0 if permission is granted. + * @task_getioprio + * Check permission before getting the ioprio value of @p. + * @p contains the task_struct of process. + * Return 0 if permission is granted. + * @task_setrlimit: + * Check permission before setting the resource limits of the current + * process for @resource to @new_rlim. The old resource limit values can + * be examined by dereferencing (current->signal->rlim + resource). + * @resource contains the resource whose limit is being set. + * @new_rlim contains the new limits for @resource. + * Return 0 if permission is granted. + * @task_setscheduler: + * Check permission before setting scheduling policy and/or parameters of + * process @p based on @policy and @lp. + * @p contains the task_struct for process. + * @policy contains the scheduling policy. + * @lp contains the scheduling parameters. 
+ * Return 0 if permission is granted. + * @task_getscheduler: + * Check permission before obtaining scheduling information for process + * @p. + * @p contains the task_struct for process. + * Return 0 if permission is granted. + * @task_movememory + * Check permission before moving memory owned by process @p. + * @p contains the task_struct for process. + * Return 0 if permission is granted. + * @task_kill: + * Check permission before sending signal @sig to @p. @info can be NULL, + * the constant 1, or a pointer to a siginfo structure. If @info is 1 or + * SI_FROMKERNEL(info) is true, then the signal should be viewed as coming + * from the kernel and should typically be permitted. + * SIGIO signals are handled separately by the send_sigiotask hook in + * file_security_ops. + * @p contains the task_struct for process. + * @info contains the signal information. + * @sig contains the signal value. + * @secid contains the sid of the process where the signal originated + * Return 0 if permission is granted. + * @task_wait: + * Check permission before allowing a process to reap a child process @p + * and collect its status information. + * @p contains the task_struct for process. + * Return 0 if permission is granted. + * @task_prctl: + * Check permission before performing a process control operation on the + * current process. + * @option contains the operation. + * @arg2 contains an argument. + * @arg3 contains an argument. + * @arg4 contains an argument. + * @arg5 contains an argument. + * Return -ENOSYS if no-one wanted to handle this op, any other value to + * cause prctl() to return immediately with that value. + * @task_to_inode: + * Set the security attributes for an inode based on an associated task's + * security attributes, e.g. for /proc/pid inodes. + * @p contains the task_struct for the task. + * @inode contains the inode structure for the inode. + * + * Security hooks for Netlink messaging.
+ * + * @netlink_send: + * Save security information for a netlink message so that permission + * checking can be performed when the message is processed. The security + * information can be saved using the eff_cap field of the + * netlink_skb_parms structure. Also may be used to provide fine + * grained control over message transmission. + * @sk associated sock of task sending the message. + * @skb contains the sk_buff structure for the netlink message. + * Return 0 if the information was successfully saved and message + * is allowed to be transmitted. + * + * Security hooks for Unix domain networking. + * + * @unix_stream_connect: + * Check permissions before establishing a Unix domain stream connection + * between @sock and @other. + * @sock contains the sock structure. + * @other contains the peer sock structure. + * @newsk contains the new sock structure. + * Return 0 if permission is granted. + * @unix_may_send: + * Check permissions before connecting or sending datagrams from @sock to + * @other. + * @sock contains the socket structure. + * @other contains the peer socket structure. + * Return 0 if permission is granted. + * + * The @unix_stream_connect and @unix_may_send hooks were necessary because + * Linux provides an alternative to the conventional file name space for Unix + * domain sockets. Whereas binding and connecting to sockets in the file name + * space is mediated by the typical file permissions (and caught by the mknod + * and permission hooks in inode_security_ops), binding and connecting to + * sockets in the abstract name space is completely unmediated. Sufficient + * control of Unix domain sockets in the abstract name space isn't possible + * using only the socket layer hooks, since we need to know the actual target + * socket, which is not looked up until we are inside the af_unix code. + * + * Security hooks for socket operations. + * + * @socket_create: + * Check permissions prior to creating a new socket. 
+ * @family contains the requested protocol family. + * @type contains the requested communications type. + * @protocol contains the requested protocol. + * @kern set to 1 if a kernel socket. + * Return 0 if permission is granted. + * @socket_post_create: + * This hook allows a module to update or allocate a per-socket security + * structure. Note that the security field was not added directly to the + * socket structure, but rather, the socket security information is stored + * in the associated inode. Typically, the inode alloc_security hook will + * allocate and attach security information to + * sock->inode->i_security. This hook may be used to update the + * sock->inode->i_security field with additional information that wasn't + * available when the inode was allocated. + * @sock contains the newly created socket structure. + * @family contains the requested protocol family. + * @type contains the requested communications type. + * @protocol contains the requested protocol. + * @kern set to 1 if a kernel socket. + * @socket_bind: + * Check permission before socket protocol layer bind operation is + * performed and the socket @sock is bound to the address specified in the + * @address parameter. + * @sock contains the socket structure. + * @address contains the address to bind to. + * @addrlen contains the length of address. + * Return 0 if permission is granted. + * @socket_connect: + * Check permission before socket protocol layer connect operation + * attempts to connect socket @sock to a remote address, @address. + * @sock contains the socket structure. + * @address contains the address of remote endpoint. + * @addrlen contains the length of address. + * Return 0 if permission is granted. + * @socket_listen: + * Check permission before socket protocol layer listen operation. + * @sock contains the socket structure. + * @backlog contains the maximum length for the pending connection queue. + * Return 0 if permission is granted.
+ * @socket_accept: + * Check permission before accepting a new connection. Note that the new + * socket, @newsock, has been created and some information copied to it, + * but the accept operation has not actually been performed. + * @sock contains the listening socket structure. + * @newsock contains the newly created server socket for connection. + * Return 0 if permission is granted. + * @socket_sendmsg: + * Check permission before transmitting a message to another socket. + * @sock contains the socket structure. + * @msg contains the message to be transmitted. + * @size contains the size of message. + * Return 0 if permission is granted. + * @socket_recvmsg: + * Check permission before receiving a message from a socket. + * @sock contains the socket structure. + * @msg contains the message structure. + * @size contains the size of message structure. + * @flags contains the operational flags. + * Return 0 if permission is granted. + * @socket_getsockname: + * Check permission before the local address (name) of the socket object + * @sock is retrieved. + * @sock contains the socket structure. + * Return 0 if permission is granted. + * @socket_getpeername: + * Check permission before the remote address (name) of a socket object + * @sock is retrieved. + * @sock contains the socket structure. + * Return 0 if permission is granted. + * @socket_getsockopt: + * Check permissions before retrieving the options associated with socket + * @sock. + * @sock contains the socket structure. + * @level contains the protocol level to retrieve option from. + * @optname contains the name of option to retrieve. + * Return 0 if permission is granted. + * @socket_setsockopt: + * Check permissions before setting the options associated with socket + * @sock. + * @sock contains the socket structure. + * @level contains the protocol level to set options for. + * @optname contains the name of the option to set. + * Return 0 if permission is granted. 
+ * @socket_shutdown: + * Checks permission before all or part of a connection on the socket + * @sock is shut down. + * @sock contains the socket structure. + * @how contains the flag indicating how future sends and receives + * are handled. + * Return 0 if permission is granted. + * @socket_sock_rcv_skb: + * Check permissions on incoming network packets. This hook is distinct + * from Netfilter's IP input hooks since it is the first time that the + * incoming sk_buff @skb has been associated with a particular socket, @sk. + * Must not sleep inside this hook because some callers hold spinlocks. + * @sk contains the sock (not socket) associated with the incoming sk_buff. + * @skb contains the incoming network data. + * @socket_getpeersec_stream: + * This hook allows the security module to provide peer socket security + * state for unix or connected tcp sockets to userspace via getsockopt + * SO_GETPEERSEC. For tcp sockets this can be meaningful if the + * socket is associated with an ipsec SA. + * @sock is the local socket. + * @optval userspace memory where the security state is to be copied. + * @optlen userspace int where the module should copy the actual length + * of the security state. + * @len as input is the maximum length to copy to userspace provided + * by the caller. + * Return 0 if all is well, otherwise, typical getsockopt return + * values. + * @socket_getpeersec_dgram: + * This hook allows the security module to provide peer socket security + * state for udp sockets on a per-packet basis to userspace via + * getsockopt SO_GETPEERSEC. The application must first have indicated + * the IP_PASSSEC option via getsockopt. It can then retrieve the + * security state returned by this hook for a packet via the SCM_SECURITY + * ancillary message type. 
+ * @skb is the skbuff for the packet being queried + * @secdata is a pointer to a buffer in which to copy the security data + * @seclen is the maximum length for @secdata + * Return 0 on success, error on failure. + * @sk_alloc_security: + * Allocate and attach a security structure to the sk->sk_security field, + * which is used to copy security attributes between local stream sockets. + * @sk_free_security: + * Deallocate security structure. + * @sk_clone_security: + * Clone/copy security structure. + * @sk_getsecid: + * Retrieve the LSM-specific secid for the sock to enable caching + * of network authorizations. + * @sock_graft: + * Sets the socket's isec sid to the sock's sid. + * @inet_conn_request: + * Sets the openreq's sid to socket's sid with MLS portion taken + * from peer sid. + * @inet_csk_clone: + * Sets the new child socket's sid to the openreq sid. + * @inet_conn_established: + * Sets the connection's peersid to the secmark on skb. + * @secmark_relabel_packet: + * check if the process should be allowed to relabel packets to + * the given secid + * @security_secmark_refcount_inc + * tells the LSM to increment the number of secmark labeling rules loaded + * @security_secmark_refcount_dec + * tells the LSM to decrement the number of secmark labeling rules loaded + * @req_classify_flow: + * Sets the flow's sid to the openreq sid. + * @tun_dev_alloc_security: + * This hook allows a module to allocate a security structure for a TUN + * device. + * @security pointer to a security structure pointer. + * Returns a zero on success, negative values on failure. + * @tun_dev_free_security: + * This hook allows a module to free the security structure for a TUN + * device. + * @security pointer to the TUN device's security structure + * @tun_dev_create: + * Check permissions prior to creating a new TUN device. + * @tun_dev_attach_queue: + * Check permissions prior to attaching to a TUN device queue. + * @security pointer to the TUN device's security structure. 
+ * @tun_dev_attach: + * This hook can be used by the module to update any security state + * associated with the TUN device's sock structure. + * @sk contains the existing sock structure. + * @security pointer to the TUN device's security structure. + * @tun_dev_open: + * This hook can be used by the module to update any security state + * associated with the TUN device's security structure. + * @security pointer to the TUN device's security structure. + * + * Security hooks for XFRM operations. + * + * @xfrm_policy_alloc_security: + * @ctxp is a pointer to the xfrm_sec_ctx being added to Security Policy + * Database used by the XFRM system. + * @sec_ctx contains the security context information being provided by + * the user-level policy update program (e.g., setkey). + * Allocate a security structure to the xp->security field; the security + * field is initialized to NULL when the xfrm_policy is allocated. + * Return 0 if operation was successful (memory to allocate, legal context) + * @gfp is to specify the context for the allocation + * @xfrm_policy_clone_security: + * @old_ctx contains an existing xfrm_sec_ctx. + * @new_ctxp contains a new xfrm_sec_ctx being cloned from old. + * Allocate a security structure in new_ctxp that contains the + * information from the old_ctx structure. + * Return 0 if operation was successful (memory to allocate). + * @xfrm_policy_free_security: + * @ctx contains the xfrm_sec_ctx + * Deallocate xp->security. + * @xfrm_policy_delete_security: + * @ctx contains the xfrm_sec_ctx. + * Authorize deletion of xp->security. + * @xfrm_state_alloc: + * @x contains the xfrm_state being added to the Security Association + * Database by the XFRM system. + * @sec_ctx contains the security context information being provided by + * the user-level SA generation program (e.g., setkey or racoon). + * Allocate a security structure to the x->security field; the security + * field is initialized to NULL when the xfrm_state is allocated.
Set the + * context to correspond to sec_ctx. Return 0 if operation was successful + * (memory to allocate, legal context). + * @xfrm_state_alloc_acquire: + * @x contains the xfrm_state being added to the Security Association + * Database by the XFRM system. + * @polsec contains the policy's security context. + * @secid contains the secid from which to take the mls portion of the + * context. + * Allocate a security structure to the x->security field; the security + * field is initialized to NULL when the xfrm_state is allocated. Set the + * context to correspond to secid. Return 0 if operation was successful + * (memory to allocate, legal context). + * @xfrm_state_free_security: + * @x contains the xfrm_state. + * Deallocate x->security. + * @xfrm_state_delete_security: + * @x contains the xfrm_state. + * Authorize deletion of x->security. + * @xfrm_policy_lookup: + * @ctx contains the xfrm_sec_ctx for which the access control is being + * checked. + * @fl_secid contains the flow security label that is used to authorize + * access to the policy xp. + * @dir contains the direction of the flow (input or output). + * Check permission when a flow selects a xfrm_policy for processing + * XFRMs on a packet. The hook is called when selecting either a + * per-socket policy or a generic xfrm policy. + * Return 0 if permission is granted, -ESRCH otherwise, or -errno + * on other errors. + * @xfrm_state_pol_flow_match: + * @x contains the state to match. + * @xp contains the policy to check for a match. + * @fl contains the flow to check for a match. + * Return 1 if there is a match. + * @xfrm_decode_session: + * @skb points to skb to decode. + * @secid points to the flow key secid to set. + * @ckall says if all xfrms used should be checked for same secid. + * Return 0 if ckall is zero or all xfrms used have the same secid. + * + * Security hooks affecting all Key Management operations + * + * @key_alloc: + * Permit allocation of a key and assign security data. 
Note that key does + * not have a serial number assigned at this point. + * @key points to the key. + * @flags is the allocation flags + * Return 0 if permission is granted, -ve error otherwise. + * @key_free: + * Notification of destruction; free security data. + * @key points to the key. + * No return value. + * @key_permission: + * See whether a specific operational right is granted to a process on a + * key. + * @key_ref refers to the key (key pointer + possession attribute bit). + * @cred points to the credentials to provide the context against which to + * evaluate the security data on the key. + * @perm describes the combination of permissions required of this key. + * Return 0 if permission is granted, -ve error otherwise. + * @key_getsecurity: + * Get a textual representation of the security context attached to a key + * for the purposes of honouring KEYCTL_GETSECURITY. This function + * allocates the storage for the NUL-terminated string and the caller + * should free it. + * @key points to the key to be queried. + * @_buffer points to a pointer that should be set to point to the + * resulting string (if no label or an error occurs). + * Return the length of the string (including terminating NUL) or -ve if + * an error. + * May also return 0 (and a NULL buffer pointer) if there is no label. + * + * Security hooks affecting all System V IPC operations. + * + * @ipc_permission: + * Check permissions for access to IPC + * @ipcp contains the kernel IPC permission structure + * @flag contains the desired (requested) permission set + * Return 0 if permission is granted. + * @ipc_getsecid: + * Get the secid associated with the ipc object. + * @ipcp contains the kernel IPC permission structure. + * @secid contains a pointer to the location where result will be saved. + * In case of failure, @secid will be set to zero. 
+ * + * Security hooks for individual messages held in System V IPC message queues + * @msg_msg_alloc_security: + * Allocate and attach a security structure to the msg->security field. + * The security field is initialized to NULL when the structure is first + * created. + * @msg contains the message structure to be modified. + * Return 0 if operation was successful and permission is granted. + * @msg_msg_free_security: + * Deallocate the security structure for this message. + * @msg contains the message structure to be modified. + * + * Security hooks for System V IPC Message Queues + * + * @msg_queue_alloc_security: + * Allocate and attach a security structure to the + * msq->q_perm.security field. The security field is initialized to + * NULL when the structure is first created. + * @msq contains the message queue structure to be modified. + * Return 0 if operation was successful and permission is granted. + * @msg_queue_free_security: + * Deallocate security structure for this message queue. + * @msq contains the message queue structure to be modified. + * @msg_queue_associate: + * Check permission when a message queue is requested through the + * msgget system call. This hook is only called when returning the + * message queue identifier for an existing message queue, not when a + * new message queue is created. + * @msq contains the message queue to act upon. + * @msqflg contains the operation control flags. + * Return 0 if permission is granted. + * @msg_queue_msgctl: + * Check permission when a message control operation specified by @cmd + * is to be performed on the message queue @msq. + * The @msq may be NULL, e.g. for IPC_INFO or MSG_INFO. + * @msq contains the message queue to act upon. May be NULL. + * @cmd contains the operation to be performed. + * Return 0 if permission is granted. + * @msg_queue_msgsnd: + * Check permission before a message, @msg, is enqueued on the message + * queue, @msq. + * @msq contains the message queue to send message to. 
+ * @msg contains the message to be enqueued. + * @msqflg contains operational flags. + * Return 0 if permission is granted. + * @msg_queue_msgrcv: + * Check permission before a message, @msg, is removed from the message + * queue, @msq. The @target task structure contains a pointer to the + * process that will be receiving the message (not equal to the current + * process when inline receives are being performed). + * @msq contains the message queue to retrieve message from. + * @msg contains the message destination. + * @target contains the task structure for recipient process. + * @type contains the type of message requested. + * @mode contains the operational flags. + * Return 0 if permission is granted. + * + * Security hooks for System V Shared Memory Segments + * + * @shm_alloc_security: + * Allocate and attach a security structure to the shp->shm_perm.security + * field. The security field is initialized to NULL when the structure is + * first created. + * @shp contains the shared memory structure to be modified. + * Return 0 if operation was successful and permission is granted. + * @shm_free_security: + * Deallocate the security struct for this memory segment. + * @shp contains the shared memory structure to be modified. + * @shm_associate: + * Check permission when a shared memory region is requested through the + * shmget system call. This hook is only called when returning the shared + * memory region identifier for an existing region, not when a new shared + * memory region is created. + * @shp contains the shared memory structure to be modified. + * @shmflg contains the operation control flags. + * Return 0 if permission is granted. + * @shm_shmctl: + * Check permission when a shared memory control operation specified by + * @cmd is to be performed on the shared memory region @shp. + * The @shp may be NULL, e.g. for IPC_INFO or SHM_INFO. + * @shp contains shared memory structure to be modified. + * @cmd contains the operation to be performed. 
+ * Return 0 if permission is granted. + * @shm_shmat: + * Check permissions prior to allowing the shmat system call to attach the + * shared memory segment @shp to the data segment of the calling process. + * The attaching address is specified by @shmaddr. + * @shp contains the shared memory structure to be modified. + * @shmaddr contains the address to attach memory region to. + * @shmflg contains the operational flags. + * Return 0 if permission is granted. + * + * Security hooks for System V Semaphores + * + * @sem_alloc_security: + * Allocate and attach a security structure to the sma->sem_perm.security + * field. The security field is initialized to NULL when the structure is + * first created. + * @sma contains the semaphore structure + * Return 0 if operation was successful and permission is granted. + * @sem_free_security: + * deallocate security struct for this semaphore + * @sma contains the semaphore structure. + * @sem_associate: + * Check permission when a semaphore is requested through the semget + * system call. This hook is only called when returning the semaphore + * identifier for an existing semaphore, not when a new one must be + * created. + * @sma contains the semaphore structure. + * @semflg contains the operation control flags. + * Return 0 if permission is granted. + * @sem_semctl: + * Check permission when a semaphore operation specified by @cmd is to be + * performed on the semaphore @sma. The @sma may be NULL, e.g. for + * IPC_INFO or SEM_INFO. + * @sma contains the semaphore structure. May be NULL. + * @cmd contains the operation to be performed. + * Return 0 if permission is granted. + * @sem_semop + * Check permissions before performing operations on members of the + * semaphore set @sma. If the @alter flag is nonzero, the semaphore set + * may be modified. + * @sma contains the semaphore structure. + * @sops contains the operations to perform. + * @nsops contains the number of operations to perform. 
+ * @alter contains the flag indicating whether changes are to be made. + * Return 0 if permission is granted. + * + * @binder_set_context_mgr + * Check whether @mgr is allowed to be the binder context manager. + * @mgr contains the task_struct for the task being registered. + * Return 0 if permission is granted. + * @binder_transaction + * Check whether @from is allowed to invoke a binder transaction call + * to @to. + * @from contains the task_struct for the sending task. + * @to contains the task_struct for the receiving task. + * @binder_transfer_binder + * Check whether @from is allowed to transfer a binder reference to @to. + * @from contains the task_struct for the sending task. + * @to contains the task_struct for the receiving task. + * @binder_transfer_file + * Check whether @from is allowed to transfer @file to @to. + * @from contains the task_struct for the sending task. + * @file contains the struct file being transferred. + * @to contains the task_struct for the receiving task. + * + * @ptrace_access_check: + * Check permission before allowing the current process to trace the + * @child process. + * Security modules may also want to perform a process tracing check + * during an execve in the bprm_set_creds hook of + * binprm_security_ops if the process is being traced and its security + * attributes would be changed by the execve. + * @child contains the task_struct structure for the target process. + * @mode contains the PTRACE_MODE flags indicating the form of access. + * Return 0 if permission is granted. + * @ptrace_traceme: + * Check that the @parent process has sufficient permission to trace the + * current process before allowing the current process to present itself + * to the @parent process for tracing. + * @parent contains the task_struct structure for debugger process. + * Return 0 if permission is granted.
+ * @capget: + * Get the @effective, @inheritable, and @permitted capability sets for + * the @target process. The hook may also perform permission checking to + * determine if the current process is allowed to see the capability sets + * of the @target process. + * @target contains the task_struct structure for target process. + * @effective contains the effective capability set. + * @inheritable contains the inheritable capability set. + * @permitted contains the permitted capability set. + * Return 0 if the capability sets were successfully obtained. + * @capset: + * Set the @effective, @inheritable, and @permitted capability sets for + * the current process. + * @new contains the new credentials structure for target process. + * @old contains the current credentials structure for target process. + * @effective contains the effective capability set. + * @inheritable contains the inheritable capability set. + * @permitted contains the permitted capability set. + * Return 0 and update @new if permission is granted. + * @capable: + * Check whether the @tsk process has the @cap capability in the indicated + * credentials. + * @cred contains the credentials to use. + * @ns contains the user namespace we want the capability in + * @cap contains the capability. + * @audit: Whether to write an audit message or not + * Return 0 if the capability is granted for @tsk. + * @syslog: + * Check permission before accessing the kernel message ring or changing + * logging to the console. + * See the syslog(2) manual page for an explanation of the @type values. + * @type contains the type of action. + * @from_file indicates the context of action (if it came from /proc). + * Return 0 if permission is granted. + * @settime: + * Check permission to change the system time. + * struct timespec and timezone are defined in include/linux/time.h + * @ts contains new time + * @tz contains new timezone + * Return 0 if permission is granted.
+ * @vm_enough_memory: + * Check permissions for allocating a new virtual mapping. + * @mm contains the mm struct it is being added to. + * @pages contains the number of pages. + * Return 0 if permission is granted. + * + * @ismaclabel: + * Check if the extended attribute specified by @name + * represents a MAC label. Returns 1 if name is a MAC + * attribute otherwise returns 0. + * @name full extended attribute name to check against + * LSM as a MAC label. + * + * @secid_to_secctx: + * Convert secid to security context. If secdata is NULL the length of + * the result will be returned in seclen, but no secdata will be returned. + * This does mean that the length could change between calls to check the + * length and the next call which actually allocates and returns the + * secdata. + * @secid contains the security ID. + * @secdata contains the pointer that stores the converted security + * context. + * @seclen pointer which contains the length of the data + * @secctx_to_secid: + * Convert security context to secid. + * @secid contains the pointer to the generated security ID. + * @secdata contains the security context. + * + * @release_secctx: + * Release the security context. + * @secdata contains the security context. + * @seclen contains the length of the security context. + * + * Security hooks for Audit + * + * @audit_rule_init: + * Allocate and initialize an LSM audit rule structure. + * @field contains the required Audit action. + * Fields flags are defined in include/linux/audit.h + * @op contains the operator the rule uses. + * @rulestr contains the context where the rule will be applied to. + * @lsmrule contains a pointer to receive the result. + * Return 0 if @lsmrule has been successfully set, + * -EINVAL in case of an invalid rule. + * + * @audit_rule_known: + * Specifies whether given @rule contains any fields related to + * current LSM. + * @rule contains the audit rule of interest. + * Return 1 in case of relation found, 0 otherwise. 
+ * + * @audit_rule_match: + * Determine if given @secid matches a rule previously approved + * by @audit_rule_known. + * @secid contains the security id in question. + * @field contains the field which relates to current LSM. + * @op contains the operator that will be used for matching. + * @rule points to the audit rule that will be checked against. + * @actx points to the audit context associated with the check. + * Return 1 if secid matches the rule, 0 if it does not, -ERRNO on failure. + * + * @audit_rule_free: + * Deallocate the LSM audit rule structure previously allocated by + * audit_rule_init. + * @rule contains the allocated rule + * + * @inode_notifysecctx: + * Notify the security module of what the security context of an inode + * should be. Initializes the incore security context managed by the + * security module for this inode. Example usage: NFS client invokes + * this hook to initialize the security context in its incore inode to the + * value provided by the server for the file when the server returned the + * file's attributes to the client. + * + * Must be called with inode->i_mutex locked. + * + * @inode we wish to set the security context of. + * @ctx contains the string which we wish to set in the inode. + * @ctxlen contains the length of @ctx. + * + * @inode_setsecctx: + * Change the security context of an inode. Updates the + * incore security context managed by the security module and invokes the + * fs code as needed (via __vfs_setxattr_noperm) to update any backing + * xattrs that represent the context. Example usage: NFS server invokes + * this hook to change the security context in its incore inode and on the + * backing filesystem to a value provided by the client on a SETATTR + * operation. + * + * Must be called with inode->i_mutex locked. + * + * @dentry contains the inode we wish to set the security context of. + * @ctx contains the string which we wish to set in the inode. + * @ctxlen contains the length of @ctx. 
+ * + * @inode_getsecctx: + * On success, returns 0 and fills out @ctx and @ctxlen with the security + * context for the given @inode. + * + * @inode we wish to get the security context of. + * @ctx is a pointer in which to place the allocated security context. + * @ctxlen points to the place to put the length of @ctx. + * This is the main security structure. + */ + struct security_operations { char name[SECURITY_NAME_MAX + 1]; -- cgit v1.2.3 From 346033a28fb16b83dac2a74d8025ff8ee64a2c9b Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 2 May 2015 15:11:14 -0700 Subject: LSM: Remove a comment from security.h Remove the large comment describing the content of the security_operations structure from security.h. This wasn't done in the previous (2/7) patch because it would have exceeded the mail list size limits. Signed-off-by: Casey Schaufler Acked-by: John Johansen Acked-by: Kees Cook Acked-by: Paul Moore Acked-by: Stephen Smalley Acked-by: Tetsuo Handa Signed-off-by: James Morris --- include/linux/security.h | 1270 ---------------------------------------------- 1 file changed, 1270 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index f3d42c636f27..a2a100e7ac6e 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -186,1276 +186,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) opts->num_mnt_opts = 0; } -/** - * struct security_operations - main security structure - * - * Security module identifier. - * - * @name: - * A string that acts as a unique identifier for the LSM with max number - * of characters = SECURITY_NAME_MAX. - * - * Security hooks for program execution operations. - * - * @bprm_set_creds: - * Save security information in the bprm->security field, typically based - * on information about the bprm->file, for later use by the apply_creds - * hook. This hook may also optionally check permissions (e.g. 
for - * transitions between security domains). - * This hook may be called multiple times during a single execve, e.g. for - * interpreters. The hook can tell whether it has already been called by - * checking to see if @bprm->security is non-NULL. If so, then the hook - * may decide either to retain the security information saved earlier or - * to replace it. - * @bprm contains the linux_binprm structure. - * Return 0 if the hook is successful and permission is granted. - * @bprm_check_security: - * This hook mediates the point when a search for a binary handler will - * begin. It allows a check the @bprm->security value which is set in the - * preceding set_creds call. The primary difference from set_creds is - * that the argv list and envp list are reliably available in @bprm. This - * hook may be called multiple times during a single execve; and in each - * pass set_creds is called first. - * @bprm contains the linux_binprm structure. - * Return 0 if the hook is successful and permission is granted. - * @bprm_committing_creds: - * Prepare to install the new security attributes of a process being - * transformed by an execve operation, based on the old credentials - * pointed to by @current->cred and the information set in @bprm->cred by - * the bprm_set_creds hook. @bprm points to the linux_binprm structure. - * This hook is a good place to perform state changes on the process such - * as closing open file descriptors to which access will no longer be - * granted when the attributes are changed. This is called immediately - * before commit_creds(). - * @bprm_committed_creds: - * Tidy up after the installation of the new security attributes of a - * process being transformed by an execve operation. The new credentials - * have, by this point, been set to @current->cred. @bprm points to the - * linux_binprm structure. This hook is a good place to perform state - * changes on the process such as clearing out non-inheritable signal - * state. 
This is called immediately after commit_creds(). - * @bprm_secureexec: - * Return a boolean value (0 or 1) indicating whether a "secure exec" - * is required. The flag is passed in the auxiliary table - * on the initial stack to the ELF interpreter to indicate whether libc - * should enable secure mode. - * @bprm contains the linux_binprm structure. - * - * Security hooks for filesystem operations. - * - * @sb_alloc_security: - * Allocate and attach a security structure to the sb->s_security field. - * The s_security field is initialized to NULL when the structure is - * allocated. - * @sb contains the super_block structure to be modified. - * Return 0 if operation was successful. - * @sb_free_security: - * Deallocate and clear the sb->s_security field. - * @sb contains the super_block structure to be modified. - * @sb_statfs: - * Check permission before obtaining filesystem statistics for the @mnt - * mountpoint. - * @dentry is a handle on the superblock for the filesystem. - * Return 0 if permission is granted. - * @sb_mount: - * Check permission before an object specified by @dev_name is mounted on - * the mount point named by @nd. For an ordinary mount, @dev_name - * identifies a device if the file system type requires a device. For a - * remount (@flags & MS_REMOUNT), @dev_name is irrelevant. For a - * loopback/bind mount (@flags & MS_BIND), @dev_name identifies the - * pathname of the object being mounted. - * @dev_name contains the name for object being mounted. - * @path contains the path for mount point object. - * @type contains the filesystem type. - * @flags contains the mount flags. - * @data contains the filesystem-specific data. - * Return 0 if permission is granted. - * @sb_copy_data: - * Allow mount option data to be copied prior to parsing by the filesystem, - * so that the security module can extract security-specific mount - * options cleanly (a filesystem may modify the data e.g. with strsep()). 
- * This also allows the original mount data to be stripped of security- - * specific options to avoid having to make filesystems aware of them. - * @type the type of filesystem being mounted. - * @orig the original mount data copied from userspace. - * @copy copied data which will be passed to the security module. - * Returns 0 if the copy was successful. - * @sb_remount: - * Extracts security system specific mount options and verifies no changes - * are being made to those options. - * @sb superblock being remounted - * @data contains the filesystem-specific data. - * Return 0 if permission is granted. - * @sb_umount: - * Check permission before the @mnt file system is unmounted. - * @mnt contains the mounted file system. - * @flags contains the unmount flags, e.g. MNT_FORCE. - * Return 0 if permission is granted. - * @sb_pivotroot: - * Check permission before pivoting the root filesystem. - * @old_path contains the path for the new location of the current root (put_old). - * @new_path contains the path for the new root (new_root). - * Return 0 if permission is granted. - * @sb_set_mnt_opts: - * Set the security relevant mount options used for a superblock - * @sb the superblock to set security mount options for - * @opts binary data structure containing all lsm mount data - * @sb_clone_mnt_opts: - * Copy all security options from a given superblock to another - * @oldsb old superblock which contain information to clone - * @newsb new superblock which needs filled in - * @sb_parse_opts_str: - * Parse a string of security data filling in the opts structure - * @options string containing all mount options known by the LSM - * @opts binary data structure usable by the LSM - * @dentry_init_security: - * Compute a context for a dentry as the inode is not yet available - * since NFSv4 has no label backed by an EA anyway. - * @dentry dentry to use in calculating the context. - * @mode mode used to determine resource type. 
- * @name name of the last path component used to create file - * @ctx pointer to place the pointer to the resulting context in. - * @ctxlen point to place the length of the resulting context. - * - * - * Security hooks for inode operations. - * - * @inode_alloc_security: - * Allocate and attach a security structure to @inode->i_security. The - * i_security field is initialized to NULL when the inode structure is - * allocated. - * @inode contains the inode structure. - * Return 0 if operation was successful. - * @inode_free_security: - * @inode contains the inode structure. - * Deallocate the inode security structure and set @inode->i_security to - * NULL. - * @inode_init_security: - * Obtain the security attribute name suffix and value to set on a newly - * created inode and set up the incore security field for the new inode. - * This hook is called by the fs code as part of the inode creation - * transaction and provides for atomic labeling of the inode, unlike - * the post_create/mkdir/... hooks called by the VFS. The hook function - * is expected to allocate the name and value via kmalloc, with the caller - * being responsible for calling kfree after using them. - * If the security module does not use security attributes or does - * not wish to put a security attribute on this particular inode, - * then it should return -EOPNOTSUPP to skip this processing. - * @inode contains the inode structure of the newly created inode. - * @dir contains the inode structure of the parent directory. - * @qstr contains the last path component of the new object - * @name will be set to the allocated name suffix (e.g. selinux). - * @value will be set to the allocated attribute value. - * @len will be set to the length of the value. - * Returns 0 if @name and @value have been successfully set, - * -EOPNOTSUPP if no security attribute is needed, or - * -ENOMEM on memory allocation failure. - * @inode_create: - * Check permission to create a regular file. 
- * @dir contains inode structure of the parent of the new file. - * @dentry contains the dentry structure for the file to be created. - * @mode contains the file mode of the file to be created. - * Return 0 if permission is granted. - * @inode_link: - * Check permission before creating a new hard link to a file. - * @old_dentry contains the dentry structure for an existing link to the file. - * @dir contains the inode structure of the parent directory of the new link. - * @new_dentry contains the dentry structure for the new link. - * Return 0 if permission is granted. - * @path_link: - * Check permission before creating a new hard link to a file. - * @old_dentry contains the dentry structure for an existing link - * to the file. - * @new_dir contains the path structure of the parent directory of - * the new link. - * @new_dentry contains the dentry structure for the new link. - * Return 0 if permission is granted. - * @inode_unlink: - * Check the permission to remove a hard link to a file. - * @dir contains the inode structure of parent directory of the file. - * @dentry contains the dentry structure for file to be unlinked. - * Return 0 if permission is granted. - * @path_unlink: - * Check the permission to remove a hard link to a file. - * @dir contains the path structure of parent directory of the file. - * @dentry contains the dentry structure for file to be unlinked. - * Return 0 if permission is granted. - * @inode_symlink: - * Check the permission to create a symbolic link to a file. - * @dir contains the inode structure of parent directory of the symbolic link. - * @dentry contains the dentry structure of the symbolic link. - * @old_name contains the pathname of file. - * Return 0 if permission is granted. - * @path_symlink: - * Check the permission to create a symbolic link to a file. - * @dir contains the path structure of parent directory of - * the symbolic link. - * @dentry contains the dentry structure of the symbolic link. 
- * @old_name contains the pathname of file. - * Return 0 if permission is granted. - * @inode_mkdir: - * Check permissions to create a new directory in the existing directory - * associated with inode structure @dir. - * @dir contains the inode structure of parent of the directory to be created. - * @dentry contains the dentry structure of new directory. - * @mode contains the mode of new directory. - * Return 0 if permission is granted. - * @path_mkdir: - * Check permissions to create a new directory in the existing directory - * associated with path structure @path. - * @dir contains the path structure of parent of the directory - * to be created. - * @dentry contains the dentry structure of new directory. - * @mode contains the mode of new directory. - * Return 0 if permission is granted. - * @inode_rmdir: - * Check the permission to remove a directory. - * @dir contains the inode structure of parent of the directory to be removed. - * @dentry contains the dentry structure of directory to be removed. - * Return 0 if permission is granted. - * @path_rmdir: - * Check the permission to remove a directory. - * @dir contains the path structure of parent of the directory to be - * removed. - * @dentry contains the dentry structure of directory to be removed. - * Return 0 if permission is granted. - * @inode_mknod: - * Check permissions when creating a special file (or a socket or a fifo - * file created via the mknod system call). Note that if mknod operation - * is being done for a regular file, then the create hook will be called - * and not this hook. - * @dir contains the inode structure of parent of the new file. - * @dentry contains the dentry structure of the new file. - * @mode contains the mode of the new file. - * @dev contains the device number. - * Return 0 if permission is granted. - * @path_mknod: - * Check permissions when creating a file. Note that this hook is called - * even if mknod operation is being done for a regular file. 
- * @dir contains the path structure of parent of the new file. - * @dentry contains the dentry structure of the new file. - * @mode contains the mode of the new file. - * @dev contains the undecoded device number. Use new_decode_dev() to get - * the decoded device number. - * Return 0 if permission is granted. - * @inode_rename: - * Check for permission to rename a file or directory. - * @old_dir contains the inode structure for parent of the old link. - * @old_dentry contains the dentry structure of the old link. - * @new_dir contains the inode structure for parent of the new link. - * @new_dentry contains the dentry structure of the new link. - * Return 0 if permission is granted. - * @path_rename: - * Check for permission to rename a file or directory. - * @old_dir contains the path structure for parent of the old link. - * @old_dentry contains the dentry structure of the old link. - * @new_dir contains the path structure for parent of the new link. - * @new_dentry contains the dentry structure of the new link. - * Return 0 if permission is granted. - * @path_chmod: - * Check for permission to change DAC's permission of a file or directory. - * @dentry contains the dentry structure. - * @mnt contains the vfsmnt structure. - * @mode contains DAC's mode. - * Return 0 if permission is granted. - * @path_chown: - * Check for permission to change owner/group of a file or directory. - * @path contains the path structure. - * @uid contains new owner's ID. - * @gid contains new group's ID. - * Return 0 if permission is granted. - * @path_chroot: - * Check for permission to change root directory. - * @path contains the path structure. - * Return 0 if permission is granted. - * @inode_readlink: - * Check the permission to read the symbolic link. - * @dentry contains the dentry structure for the file link. - * Return 0 if permission is granted. - * @inode_follow_link: - * Check permission to follow a symbolic link when looking up a pathname. 
- * @dentry contains the dentry structure for the link. - * @nd contains the nameidata structure for the parent directory. - * Return 0 if permission is granted. - * @inode_permission: - * Check permission before accessing an inode. This hook is called by the - * existing Linux permission function, so a security module can use it to - * provide additional checking for existing Linux permission checks. - * Notice that this hook is called when a file is opened (as well as many - * other operations), whereas the file_security_ops permission hook is - * called when the actual read/write operations are performed. - * @inode contains the inode structure to check. - * @mask contains the permission mask. - * Return 0 if permission is granted. - * @inode_setattr: - * Check permission before setting file attributes. Note that the kernel - * call to notify_change is performed from several locations, whenever - * file attributes change (such as when a file is truncated, chown/chmod - * operations, transferring disk quotas, etc). - * @dentry contains the dentry structure for the file. - * @attr is the iattr structure containing the new file attributes. - * Return 0 if permission is granted. - * @path_truncate: - * Check permission before truncating a file. - * @path contains the path structure for the file. - * Return 0 if permission is granted. - * @inode_getattr: - * Check permission before obtaining file attributes. - * @mnt is the vfsmount where the dentry was looked up - * @dentry contains the dentry structure for the file. - * Return 0 if permission is granted. - * @inode_setxattr: - * Check permission before setting the extended attributes - * @value identified by @name for @dentry. - * Return 0 if permission is granted. - * @inode_post_setxattr: - * Update inode security field after successful setxattr operation. - * @value identified by @name for @dentry. 
- * @inode_getxattr: - * Check permission before obtaining the extended attributes - * identified by @name for @dentry. - * Return 0 if permission is granted. - * @inode_listxattr: - * Check permission before obtaining the list of extended attribute - * names for @dentry. - * Return 0 if permission is granted. - * @inode_removexattr: - * Check permission before removing the extended attribute - * identified by @name for @dentry. - * Return 0 if permission is granted. - * @inode_getsecurity: - * Retrieve a copy of the extended attribute representation of the - * security label associated with @name for @inode via @buffer. Note that - * @name is the remainder of the attribute name after the security prefix - * has been removed. @alloc is used to specify if the call should return a - * value via the buffer or just the value length. Return size of buffer on - * success. - * @inode_setsecurity: - * Set the security label associated with @name for @inode from the - * extended attribute value @value. @size indicates the size of the - * @value in bytes. @flags may be XATTR_CREATE, XATTR_REPLACE, or 0. - * Note that @name is the remainder of the attribute name after the - * security. prefix has been removed. - * Return 0 on success. - * @inode_listsecurity: - * Copy the extended attribute names for the security labels - * associated with @inode into @buffer. The maximum size of @buffer - * is specified by @buffer_size. @buffer may be NULL to request - * the size of the buffer required. - * Returns number of bytes used/required on success. - * @inode_need_killpriv: - * Called when an inode has been changed. - * @dentry is the dentry being changed. - * Return <0 on error to abort the inode change operation. - * Return 0 if inode_killpriv does not need to be called. - * Return >0 if inode_killpriv does need to be called. - * @inode_killpriv: - * The setuid bit is being removed. Remove similar security labels. - * Called with the dentry->d_inode->i_mutex held.
- * @dentry is the dentry being changed. - * Return 0 on success. If error is returned, then the operation - * causing setuid bit removal is failed. - * @inode_getsecid: - * Get the secid associated with the node. - * @inode contains a pointer to the inode. - * @secid contains a pointer to the location where result will be saved. - * In case of failure, @secid will be set to zero. - * - * Security hooks for file operations - * - * @file_permission: - * Check file permissions before accessing an open file. This hook is - * called by various operations that read or write files. A security - * module can use this hook to perform additional checking on these - * operations, e.g. to revalidate permissions on use to support privilege - * bracketing or policy changes. Notice that this hook is used when the - * actual read/write operations are performed, whereas the - * inode_security_ops hook is called when a file is opened (as well as - * many other operations). - * Caveat: Although this hook can be used to revalidate permissions for - * various system call operations that read or write files, it does not - * address the revalidation of permissions for memory-mapped files. - * Security modules must handle this separately if they need such - * revalidation. - * @file contains the file structure being accessed. - * @mask contains the requested permissions. - * Return 0 if permission is granted. - * @file_alloc_security: - * Allocate and attach a security structure to the file->f_security field. - * The security field is initialized to NULL when the structure is first - * created. - * @file contains the file structure to secure. - * Return 0 if the hook is successful and permission is granted. - * @file_free_security: - * Deallocate and free any security structures stored in file->f_security. - * @file contains the file structure being modified. - * @file_ioctl: - * @file contains the file structure. - * @cmd contains the operation to perform. 
- * @arg contains the operational arguments. - * Check permission for an ioctl operation on @file. Note that @arg - * sometimes represents a user space pointer; in other cases, it may be a - * simple integer value. When @arg represents a user space pointer, it - * should never be used by the security module. - * Return 0 if permission is granted. - * @mmap_addr : - * Check permissions for a mmap operation at @addr. - * @addr contains virtual address that will be used for the operation. - * Return 0 if permission is granted. - * @mmap_file : - * Check permissions for a mmap operation. The @file may be NULL, e.g. - * if mapping anonymous memory. - * @file contains the file structure for file to map (may be NULL). - * @reqprot contains the protection requested by the application. - * @prot contains the protection that will be applied by the kernel. - * @flags contains the operational flags. - * Return 0 if permission is granted. - * @file_mprotect: - * Check permissions before changing memory access permissions. - * @vma contains the memory region to modify. - * @reqprot contains the protection requested by the application. - * @prot contains the protection that will be applied by the kernel. - * Return 0 if permission is granted. - * @file_lock: - * Check permission before performing file locking operations. - * Note: this hook mediates both flock and fcntl style locks. - * @file contains the file structure. - * @cmd contains the posix-translated lock operation to perform - * (e.g. F_RDLCK, F_WRLCK). - * Return 0 if permission is granted. - * @file_fcntl: - * Check permission before allowing the file operation specified by @cmd - * from being performed on the file @file. Note that @arg sometimes - * represents a user space pointer; in other cases, it may be a simple - * integer value. When @arg represents a user space pointer, it should - * never be used by the security module. - * @file contains the file structure. - * @cmd contains the operation to be performed. 
- * @arg contains the operational arguments. - * Return 0 if permission is granted. - * @file_set_fowner: - * Save owner security information (typically from current->security) in - * file->f_security for later use by the send_sigiotask hook. - * @file contains the file structure to update. - * Return 0 on success. - * @file_send_sigiotask: - * Check permission for the file owner @fown to send SIGIO or SIGURG to the - * process @tsk. Note that this hook is sometimes called from interrupt. - * Note that the fown_struct, @fown, is never outside the context of a - * struct file, so the file structure (and associated security information) - * can always be obtained: - * container_of(fown, struct file, f_owner) - * @tsk contains the structure of task receiving signal. - * @fown contains the file owner information. - * @sig is the signal that will be sent. When 0, kernel sends SIGIO. - * Return 0 if permission is granted. - * @file_receive: - * This hook allows security modules to control the ability of a process - * to receive an open file descriptor via socket IPC. - * @file contains the file structure being received. - * Return 0 if permission is granted. - * @file_open - * Save open-time permission checking state for later use upon - * file_permission, and recheck access if anything has changed - * since inode_permission. - * - * Security hooks for task operations. - * - * @task_create: - * Check permission before creating a child process. See the clone(2) - * manual page for definitions of the @clone_flags. - * @clone_flags contains the flags indicating what should be shared. - * Return 0 if permission is granted. - * @task_free: - * @task task being freed - * Handle release of task-related resources. (Note that this can be called - * from interrupt context.) - * @cred_alloc_blank: - * @cred points to the credentials. - * @gfp indicates the atomicity of any memory allocations. 
- * Only allocate sufficient memory and attach to @cred such that - * cred_transfer() will not get ENOMEM. - * @cred_free: - * @cred points to the credentials. - * Deallocate and clear the cred->security field in a set of credentials. - * @cred_prepare: - * @new points to the new credentials. - * @old points to the original credentials. - * @gfp indicates the atomicity of any memory allocations. - * Prepare a new set of credentials by copying the data from the old set. - * @cred_transfer: - * @new points to the new credentials. - * @old points to the original credentials. - * Transfer data from original creds to new creds - * @kernel_act_as: - * Set the credentials for a kernel service to act as (subjective context). - * @new points to the credentials to be modified. - * @secid specifies the security ID to be set - * The current task must be the one that nominated @secid. - * Return 0 if successful. - * @kernel_create_files_as: - * Set the file creation context in a set of credentials to be the same as - * the objective context of the specified inode. - * @new points to the credentials to be modified. - * @inode points to the inode to use as a reference. - * The current task must be the one that nominated @inode. - * Return 0 if successful. - * @kernel_fw_from_file: - * Load firmware from userspace (not called for built-in firmware). - * @file contains the file structure pointing to the file containing - * the firmware to load. This argument will be NULL if the firmware - * was loaded via the uevent-triggered blob-based interface exposed - * by CONFIG_FW_LOADER_USER_HELPER. - * @buf pointer to buffer containing firmware contents. - * @size length of the firmware contents. - * Return 0 if permission is granted. - * @kernel_module_request: - * Ability to trigger the kernel to automatically upcall to userspace for - * userspace to load a kernel module with the given name. - * @kmod_name name of the module requested by the kernel - * Return 0 if successful. 
- * @kernel_module_from_file: - * Load a kernel module from userspace. - * @file contains the file structure pointing to the file containing - * the kernel module to load. If the module is being loaded from a blob, - * this argument will be NULL. - * Return 0 if permission is granted. - * @task_fix_setuid: - * Update the module's state after setting one or more of the user - * identity attributes of the current process. The @flags parameter - * indicates which of the set*uid system calls invoked this hook. If - * @new is the set of credentials that will be installed. Modifications - * should be made to this rather than to @current->cred. - * @old is the set of credentials that are being replaced - * @flags contains one of the LSM_SETID_* values. - * Return 0 on success. - * @task_setpgid: - * Check permission before setting the process group identifier of the - * process @p to @pgid. - * @p contains the task_struct for process being modified. - * @pgid contains the new pgid. - * Return 0 if permission is granted. - * @task_getpgid: - * Check permission before getting the process group identifier of the - * process @p. - * @p contains the task_struct for the process. - * Return 0 if permission is granted. - * @task_getsid: - * Check permission before getting the session identifier of the process - * @p. - * @p contains the task_struct for the process. - * Return 0 if permission is granted. - * @task_getsecid: - * Retrieve the security identifier of the process @p. - * @p contains the task_struct for the process and place it into @secid. - * In case of failure, @secid will be set to zero. - * - * @task_setnice: - * Check permission before setting the nice value of @p to @nice. - * @p contains the task_struct of process. - * @nice contains the new nice value. - * Return 0 if permission is granted. - * @task_setioprio - * Check permission before setting the ioprio value of @p to @ioprio. - * @p contains the task_struct of process.
- * @ioprio contains the new ioprio value - * Return 0 if permission is granted. - * @task_getioprio - * Check permission before getting the ioprio value of @p. - * @p contains the task_struct of process. - * Return 0 if permission is granted. - * @task_setrlimit: - * Check permission before setting the resource limits of the current - * process for @resource to @new_rlim. The old resource limit values can - * be examined by dereferencing (current->signal->rlim + resource). - * @resource contains the resource whose limit is being set. - * @new_rlim contains the new limits for @resource. - * Return 0 if permission is granted. - * @task_setscheduler: - * Check permission before setting scheduling policy and/or parameters of - * process @p based on @policy and @lp. - * @p contains the task_struct for process. - * @policy contains the scheduling policy. - * @lp contains the scheduling parameters. - * Return 0 if permission is granted. - * @task_getscheduler: - * Check permission before obtaining scheduling information for process - * @p. - * @p contains the task_struct for process. - * Return 0 if permission is granted. - * @task_movememory - * Check permission before moving memory owned by process @p. - * @p contains the task_struct for process. - * Return 0 if permission is granted. - * @task_kill: - * Check permission before sending signal @sig to @p. @info can be NULL, - * the constant 1, or a pointer to a siginfo structure. If @info is 1 or - * SI_FROMKERNEL(info) is true, then the signal should be viewed as coming - * from the kernel and should typically be permitted. - * SIGIO signals are handled separately by the send_sigiotask hook in - * file_security_ops. - * @p contains the task_struct for process. - * @info contains the signal information. - * @sig contains the signal value. - * @secid contains the sid of the process where the signal originated - * Return 0 if permission is granted. 
- * @task_wait: - * Check permission before allowing a process to reap a child process @p - * and collect its status information. - * @p contains the task_struct for process. - * Return 0 if permission is granted. - * @task_prctl: - * Check permission before performing a process control operation on the - * current process. - * @option contains the operation. - * @arg2 contains a argument. - * @arg3 contains a argument. - * @arg4 contains a argument. - * @arg5 contains a argument. - * Return -ENOSYS if no-one wanted to handle this op, any other value to - * cause prctl() to return immediately with that value. - * @task_to_inode: - * Set the security attributes for an inode based on an associated task's - * security attributes, e.g. for /proc/pid inodes. - * @p contains the task_struct for the task. - * @inode contains the inode structure for the inode. - * - * Security hooks for Netlink messaging. - * - * @netlink_send: - * Save security information for a netlink message so that permission - * checking can be performed when the message is processed. The security - * information can be saved using the eff_cap field of the - * netlink_skb_parms structure. Also may be used to provide fine - * grained control over message transmission. - * @sk associated sock of task sending the message. - * @skb contains the sk_buff structure for the netlink message. - * Return 0 if the information was successfully saved and message - * is allowed to be transmitted. - * - * Security hooks for Unix domain networking. - * - * @unix_stream_connect: - * Check permissions before establishing a Unix domain stream connection - * between @sock and @other. - * @sock contains the sock structure. - * @other contains the peer sock structure. - * @newsk contains the new sock structure. - * Return 0 if permission is granted. - * @unix_may_send: - * Check permissions before connecting or sending datagrams from @sock to - * @other. - * @sock contains the socket structure. 
- * @other contains the peer socket structure. - * Return 0 if permission is granted. - * - * The @unix_stream_connect and @unix_may_send hooks were necessary because - * Linux provides an alternative to the conventional file name space for Unix - * domain sockets. Whereas binding and connecting to sockets in the file name - * space is mediated by the typical file permissions (and caught by the mknod - * and permission hooks in inode_security_ops), binding and connecting to - * sockets in the abstract name space is completely unmediated. Sufficient - * control of Unix domain sockets in the abstract name space isn't possible - * using only the socket layer hooks, since we need to know the actual target - * socket, which is not looked up until we are inside the af_unix code. - * - * Security hooks for socket operations. - * - * @socket_create: - * Check permissions prior to creating a new socket. - * @family contains the requested protocol family. - * @type contains the requested communications type. - * @protocol contains the requested protocol. - * @kern set to 1 if a kernel socket. - * Return 0 if permission is granted. - * @socket_post_create: - * This hook allows a module to update or allocate a per-socket security - * structure. Note that the security field was not added directly to the - * socket structure, but rather, the socket security information is stored - * in the associated inode. Typically, the inode alloc_security hook will - * allocate and and attach security information to - * sock->inode->i_security. This hook may be used to update the - * sock->inode->i_security field with additional information that wasn't - * available when the inode was allocated. - * @sock contains the newly created socket structure. - * @family contains the requested protocol family. - * @type contains the requested communications type. - * @protocol contains the requested protocol. - * @kern set to 1 if a kernel socket. 
- * @socket_bind: - * Check permission before socket protocol layer bind operation is - * performed and the socket @sock is bound to the address specified in the - * @address parameter. - * @sock contains the socket structure. - * @address contains the address to bind to. - * @addrlen contains the length of address. - * Return 0 if permission is granted. - * @socket_connect: - * Check permission before socket protocol layer connect operation - * attempts to connect socket @sock to a remote address, @address. - * @sock contains the socket structure. - * @address contains the address of remote endpoint. - * @addrlen contains the length of address. - * Return 0 if permission is granted. - * @socket_listen: - * Check permission before socket protocol layer listen operation. - * @sock contains the socket structure. - * @backlog contains the maximum length for the pending connection queue. - * Return 0 if permission is granted. - * @socket_accept: - * Check permission before accepting a new connection. Note that the new - * socket, @newsock, has been created and some information copied to it, - * but the accept operation has not actually been performed. - * @sock contains the listening socket structure. - * @newsock contains the newly created server socket for connection. - * Return 0 if permission is granted. - * @socket_sendmsg: - * Check permission before transmitting a message to another socket. - * @sock contains the socket structure. - * @msg contains the message to be transmitted. - * @size contains the size of message. - * Return 0 if permission is granted. - * @socket_recvmsg: - * Check permission before receiving a message from a socket. - * @sock contains the socket structure. - * @msg contains the message structure. - * @size contains the size of message structure. - * @flags contains the operational flags. - * Return 0 if permission is granted. 
- * @socket_getsockname: - * Check permission before the local address (name) of the socket object - * @sock is retrieved. - * @sock contains the socket structure. - * Return 0 if permission is granted. - * @socket_getpeername: - * Check permission before the remote address (name) of a socket object - * @sock is retrieved. - * @sock contains the socket structure. - * Return 0 if permission is granted. - * @socket_getsockopt: - * Check permissions before retrieving the options associated with socket - * @sock. - * @sock contains the socket structure. - * @level contains the protocol level to retrieve option from. - * @optname contains the name of option to retrieve. - * Return 0 if permission is granted. - * @socket_setsockopt: - * Check permissions before setting the options associated with socket - * @sock. - * @sock contains the socket structure. - * @level contains the protocol level to set options for. - * @optname contains the name of the option to set. - * Return 0 if permission is granted. - * @socket_shutdown: - * Checks permission before all or part of a connection on the socket - * @sock is shut down. - * @sock contains the socket structure. - * @how contains the flag indicating how future sends and receives are handled. - * Return 0 if permission is granted. - * @socket_sock_rcv_skb: - * Check permissions on incoming network packets. This hook is distinct - * from Netfilter's IP input hooks since it is the first time that the - * incoming sk_buff @skb has been associated with a particular socket, @sk. - * Must not sleep inside this hook because some callers hold spinlocks. - * @sk contains the sock (not socket) associated with the incoming sk_buff. - * @skb contains the incoming network data. - * @socket_getpeersec_stream: - * This hook allows the security module to provide peer socket security - * state for unix or connected tcp sockets to userspace via getsockopt - * SO_GETPEERSEC. 
For tcp sockets this can be meaningful if the - * socket is associated with an ipsec SA. - * @sock is the local socket. - * @optval userspace memory where the security state is to be copied. - * @optlen userspace int where the module should copy the actual length - * of the security state. - * @len as input is the maximum length to copy to userspace provided - * by the caller. - * Return 0 if all is well, otherwise, typical getsockopt return - * values. - * @socket_getpeersec_dgram: - * This hook allows the security module to provide peer socket security - * state for udp sockets on a per-packet basis to userspace via - * getsockopt SO_GETPEERSEC. The application must first have indicated - * the IP_PASSSEC option via getsockopt. It can then retrieve the - * security state returned by this hook for a packet via the SCM_SECURITY - * ancillary message type. - * @skb is the skbuff for the packet being queried - * @secdata is a pointer to a buffer in which to copy the security data - * @seclen is the maximum length for @secdata - * Return 0 on success, error on failure. - * @sk_alloc_security: - * Allocate and attach a security structure to the sk->sk_security field, - * which is used to copy security attributes between local stream sockets. - * @sk_free_security: - * Deallocate security structure. - * @sk_clone_security: - * Clone/copy security structure. - * @sk_getsecid: - * Retrieve the LSM-specific secid for the sock to enable caching of network - * authorizations. - * @sock_graft: - * Sets the socket's isec sid to the sock's sid. - * @inet_conn_request: - * Sets the openreq's sid to socket's sid with MLS portion taken from peer sid. - * @inet_csk_clone: - * Sets the new child socket's sid to the openreq sid. - * @inet_conn_established: - * Sets the connection's peersid to the secmark on skb. 
- * @secmark_relabel_packet: - * check if the process should be allowed to relabel packets to the given secid - * @security_secmark_refcount_inc - * tells the LSM to increment the number of secmark labeling rules loaded - * @security_secmark_refcount_dec - * tells the LSM to decrement the number of secmark labeling rules loaded - * @req_classify_flow: - * Sets the flow's sid to the openreq sid. - * @tun_dev_alloc_security: - * This hook allows a module to allocate a security structure for a TUN - * device. - * @security pointer to a security structure pointer. - * Returns a zero on success, negative values on failure. - * @tun_dev_free_security: - * This hook allows a module to free the security structure for a TUN - * device. - * @security pointer to the TUN device's security structure - * @tun_dev_create: - * Check permissions prior to creating a new TUN device. - * @tun_dev_attach_queue: - * Check permissions prior to attaching to a TUN device queue. - * @security pointer to the TUN device's security structure. - * @tun_dev_attach: - * This hook can be used by the module to update any security state - * associated with the TUN device's sock structure. - * @sk contains the existing sock structure. - * @security pointer to the TUN device's security structure. - * @tun_dev_open: - * This hook can be used by the module to update any security state - * associated with the TUN device's security structure. - * @security pointer to the TUN devices's security structure. - * @skb_owned_by: - * This hook sets the packet's owning sock. - * @skb is the packet. - * @sk the sock which owns the packet. - * - * Security hooks for XFRM operations. - * - * @xfrm_policy_alloc_security: - * @ctxp is a pointer to the xfrm_sec_ctx being added to Security Policy - * Database used by the XFRM system. - * @sec_ctx contains the security context information being provided by - * the user-level policy update program (e.g., setkey). 
- * Allocate a security structure to the xp->security field; the security - * field is initialized to NULL when the xfrm_policy is allocated. - * Return 0 if operation was successful (memory to allocate, legal context) - * @gfp is to specify the context for the allocation - * @xfrm_policy_clone_security: - * @old_ctx contains an existing xfrm_sec_ctx. - * @new_ctxp contains a new xfrm_sec_ctx being cloned from old. - * Allocate a security structure in new_ctxp that contains the - * information from the old_ctx structure. - * Return 0 if operation was successful (memory to allocate). - * @xfrm_policy_free_security: - * @ctx contains the xfrm_sec_ctx - * Deallocate xp->security. - * @xfrm_policy_delete_security: - * @ctx contains the xfrm_sec_ctx. - * Authorize deletion of xp->security. - * @xfrm_state_alloc: - * @x contains the xfrm_state being added to the Security Association - * Database by the XFRM system. - * @sec_ctx contains the security context information being provided by - * the user-level SA generation program (e.g., setkey or racoon). - * Allocate a security structure to the x->security field; the security - * field is initialized to NULL when the xfrm_state is allocated. Set the - * context to correspond to sec_ctx. Return 0 if operation was successful - * (memory to allocate, legal context). - * @xfrm_state_alloc_acquire: - * @x contains the xfrm_state being added to the Security Association - * Database by the XFRM system. - * @polsec contains the policy's security context. - * @secid contains the secid from which to take the mls portion of the - * context. - * Allocate a security structure to the x->security field; the security - * field is initialized to NULL when the xfrm_state is allocated. Set the - * context to correspond to secid. Return 0 if operation was successful - * (memory to allocate, legal context). - * @xfrm_state_free_security: - * @x contains the xfrm_state. - * Deallocate x->security. 
- * @xfrm_state_delete_security: - * @x contains the xfrm_state. - * Authorize deletion of x->security. - * @xfrm_policy_lookup: - * @ctx contains the xfrm_sec_ctx for which the access control is being - * checked. - * @fl_secid contains the flow security label that is used to authorize - * access to the policy xp. - * @dir contains the direction of the flow (input or output). - * Check permission when a flow selects a xfrm_policy for processing - * XFRMs on a packet. The hook is called when selecting either a - * per-socket policy or a generic xfrm policy. - * Return 0 if permission is granted, -ESRCH otherwise, or -errno - * on other errors. - * @xfrm_state_pol_flow_match: - * @x contains the state to match. - * @xp contains the policy to check for a match. - * @fl contains the flow to check for a match. - * Return 1 if there is a match. - * @xfrm_decode_session: - * @skb points to skb to decode. - * @secid points to the flow key secid to set. - * @ckall says if all xfrms used should be checked for same secid. - * Return 0 if ckall is zero or all xfrms used have the same secid. - * - * Security hooks affecting all Key Management operations - * - * @key_alloc: - * Permit allocation of a key and assign security data. Note that key does - * not have a serial number assigned at this point. - * @key points to the key. - * @flags is the allocation flags - * Return 0 if permission is granted, -ve error otherwise. - * @key_free: - * Notification of destruction; free security data. - * @key points to the key. - * No return value. - * @key_permission: - * See whether a specific operational right is granted to a process on a - * key. - * @key_ref refers to the key (key pointer + possession attribute bit). - * @cred points to the credentials to provide the context against which to - * evaluate the security data on the key. - * @perm describes the combination of permissions required of this key. - * Return 0 if permission is granted, -ve error otherwise. 
- * @key_getsecurity: - * Get a textual representation of the security context attached to a key - * for the purposes of honouring KEYCTL_GETSECURITY. This function - * allocates the storage for the NUL-terminated string and the caller - * should free it. - * @key points to the key to be queried. - * @_buffer points to a pointer that should be set to point to the - * resulting string (if no label or an error occurs). - * Return the length of the string (including terminating NUL) or -ve if - * an error. - * May also return 0 (and a NULL buffer pointer) if there is no label. - * - * Security hooks affecting all System V IPC operations. - * - * @ipc_permission: - * Check permissions for access to IPC - * @ipcp contains the kernel IPC permission structure - * @flag contains the desired (requested) permission set - * Return 0 if permission is granted. - * @ipc_getsecid: - * Get the secid associated with the ipc object. - * @ipcp contains the kernel IPC permission structure. - * @secid contains a pointer to the location where result will be saved. - * In case of failure, @secid will be set to zero. - * - * Security hooks for individual messages held in System V IPC message queues - * @msg_msg_alloc_security: - * Allocate and attach a security structure to the msg->security field. - * The security field is initialized to NULL when the structure is first - * created. - * @msg contains the message structure to be modified. - * Return 0 if operation was successful and permission is granted. - * @msg_msg_free_security: - * Deallocate the security structure for this message. - * @msg contains the message structure to be modified. - * - * Security hooks for System V IPC Message Queues - * - * @msg_queue_alloc_security: - * Allocate and attach a security structure to the - * msq->q_perm.security field. The security field is initialized to - * NULL when the structure is first created. - * @msq contains the message queue structure to be modified. 
- * Return 0 if operation was successful and permission is granted. - * @msg_queue_free_security: - * Deallocate security structure for this message queue. - * @msq contains the message queue structure to be modified. - * @msg_queue_associate: - * Check permission when a message queue is requested through the - * msgget system call. This hook is only called when returning the - * message queue identifier for an existing message queue, not when a - * new message queue is created. - * @msq contains the message queue to act upon. - * @msqflg contains the operation control flags. - * Return 0 if permission is granted. - * @msg_queue_msgctl: - * Check permission when a message control operation specified by @cmd - * is to be performed on the message queue @msq. - * The @msq may be NULL, e.g. for IPC_INFO or MSG_INFO. - * @msq contains the message queue to act upon. May be NULL. - * @cmd contains the operation to be performed. - * Return 0 if permission is granted. - * @msg_queue_msgsnd: - * Check permission before a message, @msg, is enqueued on the message - * queue, @msq. - * @msq contains the message queue to send message to. - * @msg contains the message to be enqueued. - * @msqflg contains operational flags. - * Return 0 if permission is granted. - * @msg_queue_msgrcv: - * Check permission before a message, @msg, is removed from the message - * queue, @msq. The @target task structure contains a pointer to the - * process that will be receiving the message (not equal to the current - * process when inline receives are being performed). - * @msq contains the message queue to retrieve message from. - * @msg contains the message destination. - * @target contains the task structure for recipient process. - * @type contains the type of message requested. - * @mode contains the operational flags. - * Return 0 if permission is granted. 
- * - * Security hooks for System V Shared Memory Segments - * - * @shm_alloc_security: - * Allocate and attach a security structure to the shp->shm_perm.security - * field. The security field is initialized to NULL when the structure is - * first created. - * @shp contains the shared memory structure to be modified. - * Return 0 if operation was successful and permission is granted. - * @shm_free_security: - * Deallocate the security struct for this memory segment. - * @shp contains the shared memory structure to be modified. - * @shm_associate: - * Check permission when a shared memory region is requested through the - * shmget system call. This hook is only called when returning the shared - * memory region identifier for an existing region, not when a new shared - * memory region is created. - * @shp contains the shared memory structure to be modified. - * @shmflg contains the operation control flags. - * Return 0 if permission is granted. - * @shm_shmctl: - * Check permission when a shared memory control operation specified by - * @cmd is to be performed on the shared memory region @shp. - * The @shp may be NULL, e.g. for IPC_INFO or SHM_INFO. - * @shp contains shared memory structure to be modified. - * @cmd contains the operation to be performed. - * Return 0 if permission is granted. - * @shm_shmat: - * Check permissions prior to allowing the shmat system call to attach the - * shared memory segment @shp to the data segment of the calling process. - * The attaching address is specified by @shmaddr. - * @shp contains the shared memory structure to be modified. - * @shmaddr contains the address to attach memory region to. - * @shmflg contains the operational flags. - * Return 0 if permission is granted. - * - * Security hooks for System V Semaphores - * - * @sem_alloc_security: - * Allocate and attach a security structure to the sma->sem_perm.security - * field. The security field is initialized to NULL when the structure is - * first created. 
- * @sma contains the semaphore structure - * Return 0 if operation was successful and permission is granted. - * @sem_free_security: - * deallocate security struct for this semaphore - * @sma contains the semaphore structure. - * @sem_associate: - * Check permission when a semaphore is requested through the semget - * system call. This hook is only called when returning the semaphore - * identifier for an existing semaphore, not when a new one must be - * created. - * @sma contains the semaphore structure. - * @semflg contains the operation control flags. - * Return 0 if permission is granted. - * @sem_semctl: - * Check permission when a semaphore operation specified by @cmd is to be - * performed on the semaphore @sma. The @sma may be NULL, e.g. for - * IPC_INFO or SEM_INFO. - * @sma contains the semaphore structure. May be NULL. - * @cmd contains the operation to be performed. - * Return 0 if permission is granted. - * @sem_semop - * Check permissions before performing operations on members of the - * semaphore set @sma. If the @alter flag is nonzero, the semaphore set - * may be modified. - * @sma contains the semaphore structure. - * @sops contains the operations to perform. - * @nsops contains the number of operations to perform. - * @alter contains the flag indicating whether changes are to be made. - * Return 0 if permission is granted. - * - * @binder_set_context_mgr - * Check whether @mgr is allowed to be the binder context manager. - * @mgr contains the task_struct for the task being registered. - * Return 0 if permission is granted. - * @binder_transaction - * Check whether @from is allowed to invoke a binder transaction call - * to @to. - * @from contains the task_struct for the sending task. - * @to contains the task_struct for the receiving task. - * @binder_transfer_binder - * Check whether @from is allowed to transfer a binder reference to @to. - * @from contains the task_struct for the sending task. 
- * @to contains the task_struct for the receiving task. - * @binder_transfer_file - * Check whether @from is allowed to transfer @file to @to. - * @from contains the task_struct for the sending task. - * @file contains the struct file being transferred. - * @to contains the task_struct for the receiving task. - * - * @ptrace_access_check: - * Check permission before allowing the current process to trace the - * @child process. - * Security modules may also want to perform a process tracing check - * during an execve in the set_security or apply_creds hooks of - * tracing check during an execve in the bprm_set_creds hook of - * binprm_security_ops if the process is being traced and its security - * attributes would be changed by the execve. - * @child contains the task_struct structure for the target process. - * @mode contains the PTRACE_MODE flags indicating the form of access. - * Return 0 if permission is granted. - * @ptrace_traceme: - * Check that the @parent process has sufficient permission to trace the - * current process before allowing the current process to present itself - * to the @parent process for tracing. - * @parent contains the task_struct structure for debugger process. - * Return 0 if permission is granted. - * @capget: - * Get the @effective, @inheritable, and @permitted capability sets for - * the @target process. The hook may also perform permission checking to - * determine if the current process is allowed to see the capability sets - * of the @target process. - * @target contains the task_struct structure for target process. - * @effective contains the effective capability set. - * @inheritable contains the inheritable capability set. - * @permitted contains the permitted capability set. - * Return 0 if the capability sets were successfully obtained. - * @capset: - * Set the @effective, @inheritable, and @permitted capability sets for - * the current process. - * @new contains the new credentials structure for target process. 
- * @old contains the current credentials structure for target process. - * @effective contains the effective capability set. - * @inheritable contains the inheritable capability set. - * @permitted contains the permitted capability set. - * Return 0 and update @new if permission is granted. - * @capable: - * Check whether the @tsk process has the @cap capability in the indicated - * credentials. - * @cred contains the credentials to use. - * @ns contains the user namespace we want the capability in - * @cap contains the capability . - * @audit: Whether to write an audit message or not - * Return 0 if the capability is granted for @tsk. - * @syslog: - * Check permission before accessing the kernel message ring or changing - * logging to the console. - * See the syslog(2) manual page for an explanation of the @type values. - * @type contains the type of action. - * @from_file indicates the context of action (if it came from /proc). - * Return 0 if permission is granted. - * @settime: - * Check permission to change the system time. - * struct timespec and timezone are defined in include/linux/time.h - * @ts contains new time - * @tz contains new timezone - * Return 0 if permission is granted. - * @vm_enough_memory: - * Check permissions for allocating a new virtual mapping. - * @mm contains the mm struct it is being added to. - * @pages contains the number of pages. - * Return 0 if permission is granted. - * - * @ismaclabel: - * Check if the extended attribute specified by @name - * represents a MAC label. Returns 1 if name is a MAC - * attribute otherwise returns 0. - * @name full extended attribute name to check against - * LSM as a MAC label. - * - * @secid_to_secctx: - * Convert secid to security context. If secdata is NULL the length of - * the result will be returned in seclen, but no secdata will be returned. - * This does mean that the length could change between calls to check the - * length and the next call which actually allocates and returns the secdata. 
- * @secid contains the security ID. - * @secdata contains the pointer that stores the converted security context. - * @seclen pointer which contains the length of the data - * @secctx_to_secid: - * Convert security context to secid. - * @secid contains the pointer to the generated security ID. - * @secdata contains the security context. - * - * @release_secctx: - * Release the security context. - * @secdata contains the security context. - * @seclen contains the length of the security context. - * - * Security hooks for Audit - * - * @audit_rule_init: - * Allocate and initialize an LSM audit rule structure. - * @field contains the required Audit action. Fields flags are defined in include/linux/audit.h - * @op contains the operator the rule uses. - * @rulestr contains the context where the rule will be applied to. - * @lsmrule contains a pointer to receive the result. - * Return 0 if @lsmrule has been successfully set, - * -EINVAL in case of an invalid rule. - * - * @audit_rule_known: - * Specifies whether given @rule contains any fields related to current LSM. - * @rule contains the audit rule of interest. - * Return 1 in case of relation found, 0 otherwise. - * - * @audit_rule_match: - * Determine if given @secid matches a rule previously approved - * by @audit_rule_known. - * @secid contains the security id in question. - * @field contains the field which relates to current LSM. - * @op contains the operator that will be used for matching. - * @rule points to the audit rule that will be checked against. - * @actx points to the audit context associated with the check. - * Return 1 if secid matches the rule, 0 if it does not, -ERRNO on failure. - * - * @audit_rule_free: - * Deallocate the LSM audit rule structure previously allocated by - * audit_rule_init. - * @rule contains the allocated rule - * - * @inode_notifysecctx: - * Notify the security module of what the security context of an inode - * should be. 
Initializes the incore security context managed by the - * security module for this inode. Example usage: NFS client invokes - * this hook to initialize the security context in its incore inode to the - * value provided by the server for the file when the server returned the - * file's attributes to the client. - * - * Must be called with inode->i_mutex locked. - * - * @inode we wish to set the security context of. - * @ctx contains the string which we wish to set in the inode. - * @ctxlen contains the length of @ctx. - * - * @inode_setsecctx: - * Change the security context of an inode. Updates the - * incore security context managed by the security module and invokes the - * fs code as needed (via __vfs_setxattr_noperm) to update any backing - * xattrs that represent the context. Example usage: NFS server invokes - * this hook to change the security context in its incore inode and on the - * backing filesystem to a value provided by the client on a SETATTR - * operation. - * - * Must be called with inode->i_mutex locked. - * - * @dentry contains the inode we wish to set the security context of. - * @ctx contains the string which we wish to set in the inode. - * @ctxlen contains the length of @ctx. - * - * @inode_getsecctx: - * On success, returns 0 and fills out @ctx and @ctxlen with the security - * context for the given @inode. - * - * @inode we wish to get the security context of. - * @ctx is a pointer in which to place the allocated security context. - * @ctxlen points to the place to put the length of @ctx. - * This is the main security structure. - */ - /* prototypes */ extern int security_init(void); -- cgit v1.2.3 From e20b043a6902ecb61c2c84355c3bae5149f391db Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 2 May 2015 15:11:36 -0700 Subject: LSM: Add security module hook list heads Add a list header for each security hook. They aren't used until later in the patch series. 
They are grouped together in a structure so that there doesn't need to be an external address for each. Macro-ize the initialization of the security_operations for each security module in anticipation of changing out the security_operations structure. Signed-off-by: Casey Schaufler Acked-by: John Johansen Acked-by: Kees Cook Acked-by: Paul Moore Acked-by: Stephen Smalley Acked-by: Tetsuo Handa Signed-off-by: James Morris --- include/linux/lsm_hooks.h | 220 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 220 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index b4c91de510c2..27dd6fcacccc 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1626,6 +1626,226 @@ struct security_operations { #endif /* CONFIG_AUDIT */ }; +struct security_hook_heads { + struct list_head binder_set_context_mgr; + struct list_head binder_transaction; + struct list_head binder_transfer_binder; + struct list_head binder_transfer_file; + struct list_head ptrace_access_check; + struct list_head ptrace_traceme; + struct list_head capget; + struct list_head capset; + struct list_head capable; + struct list_head quotactl; + struct list_head quota_on; + struct list_head syslog; + struct list_head settime; + struct list_head vm_enough_memory; + struct list_head bprm_set_creds; + struct list_head bprm_check_security; + struct list_head bprm_secureexec; + struct list_head bprm_committing_creds; + struct list_head bprm_committed_creds; + struct list_head sb_alloc_security; + struct list_head sb_free_security; + struct list_head sb_copy_data; + struct list_head sb_remount; + struct list_head sb_kern_mount; + struct list_head sb_show_options; + struct list_head sb_statfs; + struct list_head sb_mount; + struct list_head sb_umount; + struct list_head sb_pivotroot; + struct list_head sb_set_mnt_opts; + struct list_head sb_clone_mnt_opts; + struct list_head sb_parse_opts_str; + struct list_head 
dentry_init_security; +#ifdef CONFIG_SECURITY_PATH + struct list_head path_unlink; + struct list_head path_mkdir; + struct list_head path_rmdir; + struct list_head path_mknod; + struct list_head path_truncate; + struct list_head path_symlink; + struct list_head path_link; + struct list_head path_rename; + struct list_head path_chmod; + struct list_head path_chown; + struct list_head path_chroot; +#endif + struct list_head inode_alloc_security; + struct list_head inode_free_security; + struct list_head inode_init_security; + struct list_head inode_create; + struct list_head inode_link; + struct list_head inode_unlink; + struct list_head inode_symlink; + struct list_head inode_mkdir; + struct list_head inode_rmdir; + struct list_head inode_mknod; + struct list_head inode_rename; + struct list_head inode_readlink; + struct list_head inode_follow_link; + struct list_head inode_permission; + struct list_head inode_setattr; + struct list_head inode_getattr; + struct list_head inode_setxattr; + struct list_head inode_post_setxattr; + struct list_head inode_getxattr; + struct list_head inode_listxattr; + struct list_head inode_removexattr; + struct list_head inode_need_killpriv; + struct list_head inode_killpriv; + struct list_head inode_getsecurity; + struct list_head inode_setsecurity; + struct list_head inode_listsecurity; + struct list_head inode_getsecid; + struct list_head file_permission; + struct list_head file_alloc_security; + struct list_head file_free_security; + struct list_head file_ioctl; + struct list_head mmap_addr; + struct list_head mmap_file; + struct list_head file_mprotect; + struct list_head file_lock; + struct list_head file_fcntl; + struct list_head file_set_fowner; + struct list_head file_send_sigiotask; + struct list_head file_receive; + struct list_head file_open; + struct list_head task_create; + struct list_head task_free; + struct list_head cred_alloc_blank; + struct list_head cred_free; + struct list_head cred_prepare; + struct list_head 
cred_transfer; + struct list_head kernel_act_as; + struct list_head kernel_create_files_as; + struct list_head kernel_fw_from_file; + struct list_head kernel_module_request; + struct list_head kernel_module_from_file; + struct list_head task_fix_setuid; + struct list_head task_setpgid; + struct list_head task_getpgid; + struct list_head task_getsid; + struct list_head task_getsecid; + struct list_head task_setnice; + struct list_head task_setioprio; + struct list_head task_getioprio; + struct list_head task_setrlimit; + struct list_head task_setscheduler; + struct list_head task_getscheduler; + struct list_head task_movememory; + struct list_head task_kill; + struct list_head task_wait; + struct list_head task_prctl; + struct list_head task_to_inode; + struct list_head ipc_permission; + struct list_head ipc_getsecid; + struct list_head msg_msg_alloc_security; + struct list_head msg_msg_free_security; + struct list_head msg_queue_alloc_security; + struct list_head msg_queue_free_security; + struct list_head msg_queue_associate; + struct list_head msg_queue_msgctl; + struct list_head msg_queue_msgsnd; + struct list_head msg_queue_msgrcv; + struct list_head shm_alloc_security; + struct list_head shm_free_security; + struct list_head shm_associate; + struct list_head shm_shmctl; + struct list_head shm_shmat; + struct list_head sem_alloc_security; + struct list_head sem_free_security; + struct list_head sem_associate; + struct list_head sem_semctl; + struct list_head sem_semop; + struct list_head netlink_send; + struct list_head d_instantiate; + struct list_head getprocattr; + struct list_head setprocattr; + struct list_head ismaclabel; + struct list_head secid_to_secctx; + struct list_head secctx_to_secid; + struct list_head release_secctx; + struct list_head inode_notifysecctx; + struct list_head inode_setsecctx; + struct list_head inode_getsecctx; +#ifdef CONFIG_SECURITY_NETWORK + struct list_head unix_stream_connect; + struct list_head unix_may_send; + struct 
list_head socket_create; + struct list_head socket_post_create; + struct list_head socket_bind; + struct list_head socket_connect; + struct list_head socket_listen; + struct list_head socket_accept; + struct list_head socket_sendmsg; + struct list_head socket_recvmsg; + struct list_head socket_getsockname; + struct list_head socket_getpeername; + struct list_head socket_getsockopt; + struct list_head socket_setsockopt; + struct list_head socket_shutdown; + struct list_head socket_sock_rcv_skb; + struct list_head socket_getpeersec_stream; + struct list_head socket_getpeersec_dgram; + struct list_head sk_alloc_security; + struct list_head sk_free_security; + struct list_head sk_clone_security; + struct list_head sk_getsecid; + struct list_head sock_graft; + struct list_head inet_conn_request; + struct list_head inet_csk_clone; + struct list_head inet_conn_established; + struct list_head secmark_relabel_packet; + struct list_head secmark_refcount_inc; + struct list_head secmark_refcount_dec; + struct list_head req_classify_flow; + struct list_head tun_dev_alloc_security; + struct list_head tun_dev_free_security; + struct list_head tun_dev_create; + struct list_head tun_dev_attach_queue; + struct list_head tun_dev_attach; + struct list_head tun_dev_open; + struct list_head skb_owned_by; +#endif /* CONFIG_SECURITY_NETWORK */ +#ifdef CONFIG_SECURITY_NETWORK_XFRM + struct list_head xfrm_policy_alloc_security; + struct list_head xfrm_policy_clone_security; + struct list_head xfrm_policy_free_security; + struct list_head xfrm_policy_delete_security; + struct list_head xfrm_state_alloc; + struct list_head xfrm_state_alloc_acquire; + struct list_head xfrm_state_free_security; + struct list_head xfrm_state_delete_security; + struct list_head xfrm_policy_lookup; + struct list_head xfrm_state_pol_flow_match; + struct list_head xfrm_decode_session; +#endif /* CONFIG_SECURITY_NETWORK_XFRM */ +#ifdef CONFIG_KEYS + struct list_head key_alloc; + struct list_head key_free; + struct 
list_head key_permission; + struct list_head key_getsecurity; +#endif /* CONFIG_KEYS */ +#ifdef CONFIG_AUDIT + struct list_head audit_rule_init; + struct list_head audit_rule_known; + struct list_head audit_rule_match; + struct list_head audit_rule_free; +#endif /* CONFIG_AUDIT */ +}; + +/* + * Initializing a security_hook_list structure takes + * up a lot of space in a source file. This macro takes + * care of the common case and reduces the amount of + * text involved. + * Casey says: Comment is true in the next patch. + */ +#define LSM_HOOK_INIT(HEAD, HOOK) .HEAD = HOOK + /* prototypes */ extern int security_module_enable(struct security_operations *ops); extern int register_security(struct security_operations *ops); -- cgit v1.2.3 From b1d9e6b0646d0e5ee5d9050bd236b6c65d66faef Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 2 May 2015 15:11:42 -0700 Subject: LSM: Switch to lists of hooks Instead of using a vector of security operations with explicit, special case stacking of the capability and yama hooks use lists of hooks with capability and yama hooks included as appropriate. The security_operations structure is no longer required. Instead, there is a union of the function pointers that allows all the hooks lists to use a common mechanism for list management while retaining typing. Each module supplies an array describing the hooks it provides instead of a sparsely populated security_operations structure. The description includes the element that gets put on the hook list, avoiding the issues surrounding individual element allocation. The method for registering security modules is changed to reflect the information available. The method for removing a module, currently only used by SELinux, has also changed. It should be generic now, however if there are potential race conditions based on ordering of hook removal that needs to be addressed by the calling module. The security hooks are called from the lists and the first failure is returned. 
Signed-off-by: Casey Schaufler Acked-by: John Johansen Acked-by: Kees Cook Acked-by: Paul Moore Acked-by: Stephen Smalley Acked-by: Tetsuo Handa Signed-off-by: James Morris --- include/linux/lsm_hooks.h | 77 ++++++++++++++++++++++++++++++++--------------- include/linux/security.h | 46 +++------------------------- 2 files changed, 57 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 27dd6fcacccc..f014f2596e22 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -25,21 +25,10 @@ #define __LINUX_LSM_HOOKS_H #include - -/* Maximum number of letters for an LSM name string */ -#define SECURITY_NAME_MAX 10 - -#ifdef CONFIG_SECURITY +#include +#include /** - * struct security_operations - main security structure - * - * Security module identifier. - * - * @name: - * A string that acts as a unique identifier for the LSM with max number - * of characters = SECURITY_NAME_MAX. - * * Security hooks for program execution operations. * * @bprm_set_creds: @@ -1310,9 +1299,7 @@ * This is the main security structure. */ -struct security_operations { - char name[SECURITY_NAME_MAX + 1]; - +union security_list_options { int (*binder_set_context_mgr)(struct task_struct *mgr); int (*binder_transaction)(struct task_struct *from, struct task_struct *to); @@ -1837,21 +1824,63 @@ struct security_hook_heads { #endif /* CONFIG_AUDIT */ }; +/* + * Security module hook list structure. + * For use with generic list macros for common operations. + */ +struct security_hook_list { + struct list_head list; + struct list_head *head; + union security_list_options hook; +}; + /* * Initializing a security_hook_list structure takes * up a lot of space in a source file. This macro takes * care of the common case and reduces the amount of * text involved. - * Casey says: Comment is true in the next patch. 
*/ -#define LSM_HOOK_INIT(HEAD, HOOK) .HEAD = HOOK +#define LSM_HOOK_INIT(HEAD, HOOK) \ + { .head = &security_hook_heads.HEAD, .hook = { .HEAD = HOOK } } + +extern struct security_hook_heads security_hook_heads; + +static inline void security_add_hooks(struct security_hook_list *hooks, + int count) +{ + int i; -/* prototypes */ -extern int security_module_enable(struct security_operations *ops); -extern int register_security(struct security_operations *ops); -extern void __init security_fixup_ops(struct security_operations *ops); -extern void reset_security_ops(void); + for (i = 0; i < count; i++) + list_add_tail_rcu(&hooks[i].list, hooks[i].head); +} -#endif /* CONFIG_SECURITY */ +#ifdef CONFIG_SECURITY_SELINUX_DISABLE +/* + * Assuring the safety of deleting a security module is up to + * the security module involved. This may entail ordering the + * module's hook list in a particular way, refusing to disable + * the module once a policy is loaded or any number of other + * actions better imagined than described. + * + * The name of the configuration option reflects the only module + * that currently uses the mechanism. Any developer who thinks + * disabling their module is a good idea needs to be at least as + * careful as the SELinux team. + */ +static inline void security_delete_hooks(struct security_hook_list *hooks, + int count) +{ + int i; + + for (i = 0; i < count; i++) + list_del_rcu(&hooks[i].list); +} +#endif /* CONFIG_SECURITY_SELINUX_DISABLE */ + +extern int __init security_module_enable(const char *module); +extern void __init capability_add_hooks(void); +#ifdef CONFIG_SECURITY_YAMA_STACKED +void __init yama_add_hooks(void); +#endif #endif /* ! 
__LINUX_LSM_HOOKS_H */ diff --git a/include/linux/security.h b/include/linux/security.h index a2a100e7ac6e..8c8175d41b4c 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -27,6 +27,7 @@ #include #include #include +#include struct linux_binprm; struct cred; @@ -54,9 +55,6 @@ struct xattr; struct xfrm_sec_ctx; struct mm_struct; -/* Maximum number of letters for an LSM name string */ -#define SECURITY_NAME_MAX 10 - /* If capable should audit the security request */ #define SECURITY_CAP_NOAUDIT 0 #define SECURITY_CAP_AUDIT 1 @@ -69,10 +67,7 @@ struct audit_krule; struct user_namespace; struct timezone; -/* - * These functions are in security/capability.c and are used - * as the default capabilities functions - */ +/* These functions are in security/commoncap.c */ extern int cap_capable(const struct cred *cred, struct user_namespace *ns, int cap, int audit); extern int cap_settime(const struct timespec *ts, const struct timezone *tz); @@ -114,8 +109,6 @@ struct xfrm_state; struct xfrm_user_sec_ctx; struct seq_file; -extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb); - #ifdef CONFIG_MMU extern unsigned long mmap_min_addr; extern unsigned long dac_mmap_min_addr; @@ -472,7 +465,7 @@ static inline int security_settime(const struct timespec *ts, static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) { - return cap_vm_enough_memory(mm, pages); + return __vm_enough_memory(mm, pages, cap_vm_enough_memory(mm, pages)); } static inline int security_bprm_set_creds(struct linux_binprm *bprm) @@ -1075,7 +1068,7 @@ static inline int security_setprocattr(struct task_struct *p, char *name, void * static inline int security_netlink_send(struct sock *sk, struct sk_buff *skb) { - return cap_netlink_send(sk, skb); + return 0; } static inline int security_ismaclabel(const char *name) @@ -1643,36 +1636,5 @@ static inline void free_secdata(void *secdata) { } #endif /* CONFIG_SECURITY */ -#ifdef CONFIG_SECURITY_YAMA -extern int 
yama_ptrace_access_check(struct task_struct *child, - unsigned int mode); -extern int yama_ptrace_traceme(struct task_struct *parent); -extern void yama_task_free(struct task_struct *task); -extern int yama_task_prctl(int option, unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5); -#else -static inline int yama_ptrace_access_check(struct task_struct *child, - unsigned int mode) -{ - return 0; -} - -static inline int yama_ptrace_traceme(struct task_struct *parent) -{ - return 0; -} - -static inline void yama_task_free(struct task_struct *task) -{ -} - -static inline int yama_task_prctl(int option, unsigned long arg2, - unsigned long arg3, unsigned long arg4, - unsigned long arg5) -{ - return -ENOSYS; -} -#endif /* CONFIG_SECURITY_YAMA */ - #endif /* ! __LINUX_SECURITY_H */ -- cgit v1.2.3 From 6a4b6b0a3b55f23f4cc9ad85a1539c581bcb0874 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 4 May 2015 17:10:31 +0200 Subject: gpio: sysfs: clean up chip class-device handling Clean gpio-chip class device registration and deregistration. The class device is registered when a gpio-chip is added (or from gpiolib_sysfs_init post-core init call), and deregistered when the chip is removed. Store the class device in struct gpio_chip directly rather than do a class-device lookup on deregistration. This also removes the need for the exported flag. 
Signed-off-by: Johan Hovold Reviewed-by: Alexandre Courbot Signed-off-by: Linus Walleij --- include/linux/gpio/driver.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index f1b36593ec9f..2c1e639f66bd 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -20,6 +20,7 @@ struct seq_file; * struct gpio_chip - abstract a GPIO controller * @label: for diagnostics * @dev: optional device providing the GPIOs + * @cdev: class device used by sysfs interface (may be NULL) * @owner: helps prevent removal of modules exporting active GPIOs * @list: links gpio_chips together for traversal * @request: optional hook for chip-specific activation, such as @@ -57,7 +58,6 @@ struct seq_file; * implies that if the chip supports IRQs, these IRQs need to be threaded * as the chip access may sleep when e.g. reading out the IRQ status * registers. - * @exported: flags if the gpiochip is exported for use from sysfs. Private. * @irq_not_threaded: flag must be set if @can_sleep is set but the * IRQs don't need to be threaded * @@ -74,6 +74,7 @@ struct seq_file; struct gpio_chip { const char *label; struct device *dev; + struct device *cdev; struct module *owner; struct list_head list; @@ -109,7 +110,6 @@ struct gpio_chip { const char *const *names; bool can_sleep; bool irq_not_threaded; - bool exported; #ifdef CONFIG_GPIOLIB_IRQCHIP /* -- cgit v1.2.3 From 166a85e44245d771bd7042f3ad72aa0e12bb53bd Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 4 May 2015 17:10:33 +0200 Subject: gpio: remove gpiod_sysfs_set_active_low Remove gpiod_sysfs_set_active_low (and gpio_sysfs_set_active_low) which allowed code to change the polarity of a gpio line even after it had been exported through sysfs. Drivers should not care, and generally do not know, about gpio-line polarity which is a hardware feature that needs to be described by firmware.
It is currently possible to define gpio-line polarity in device-tree and acpi firmware or using platform data. Userspace can also change the polarity through sysfs. Note that drivers using the legacy gpio interface could still use GPIOF_ACTIVE_LOW to change the polarity before exporting the gpio. There are no in-kernel users of this interface. Cc: Jonathan Corbet Cc: Harry Wei Cc: Arnd Bergmann Cc: linux-doc@vger.kernel.org Cc: linux-kernel@zh-kernel.org Cc: linux-arch@vger.kernel.org Signed-off-by: Johan Hovold Reviewed-by: Alexandre Courbot Signed-off-by: Linus Walleij --- include/linux/gpio.h | 7 ------- include/linux/gpio/consumer.h | 6 ------ 2 files changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio.h b/include/linux/gpio.h index ab81339a8590..d12b5d566e4b 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -196,13 +196,6 @@ static inline int gpio_export_link(struct device *dev, const char *name, return -EINVAL; } -static inline int gpio_sysfs_set_active_low(unsigned gpio, int value) -{ - /* GPIO can never have been requested */ - WARN_ON(1); - return -EINVAL; -} - static inline void gpio_unexport(unsigned gpio) { /* GPIO can never have been exported */ diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 3a7c9ffd5ab9..09a7fb0062a6 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -449,7 +449,6 @@ static inline int desc_to_gpio(const struct gpio_desc *desc) int gpiod_export(struct gpio_desc *desc, bool direction_may_change); int gpiod_export_link(struct device *dev, const char *name, struct gpio_desc *desc); -int gpiod_sysfs_set_active_low(struct gpio_desc *desc, int value); void gpiod_unexport(struct gpio_desc *desc); #else /* CONFIG_GPIOLIB && CONFIG_GPIO_SYSFS */ @@ -466,11 +465,6 @@ static inline int gpiod_export_link(struct device *dev, const char *name, return -ENOSYS; } -static inline int gpiod_sysfs_set_active_low(struct gpio_desc *desc, int 
value) -{ - return -ENOSYS; -} - static inline void gpiod_unexport(struct gpio_desc *desc) { } -- cgit v1.2.3 From 0e39250845c0f91acc64264709b25f7f9b85c2c3 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 6 May 2015 21:11:51 -0700 Subject: net: Store virtual address instead of page in netdev_alloc_cache This change makes it so that we store the virtual address of the page in the netdev_alloc_cache instead of the page pointer. The idea behind this is to avoid multiple calls to page_address since the virtual address is required for every access, but the page pointer is only needed at allocation or reset of the page. While I was at it I also reordered the netdev_alloc_cache structure a bit so that the size is always 16 bytes by dropping size in the case where PAGE_SIZE is greater than or equal to 32KB. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9c2f793573fa..8b9a2c35a9d7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2128,9 +2128,8 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) kfree_skb(skb); } -#define NETDEV_FRAG_PAGE_MAX_ORDER get_order(32768) -#define NETDEV_FRAG_PAGE_MAX_SIZE (PAGE_SIZE << NETDEV_FRAG_PAGE_MAX_ORDER) -#define NETDEV_PAGECNT_MAX_BIAS NETDEV_FRAG_PAGE_MAX_SIZE +#define NETDEV_FRAG_PAGE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) +#define NETDEV_FRAG_PAGE_MAX_ORDER get_order(NETDEV_FRAG_PAGE_MAX_SIZE) void *netdev_alloc_frag(unsigned int fragsz); -- cgit v1.2.3 From b63ae8ca096dfdbfeef6a209c30a93a966518853 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 6 May 2015 21:11:57 -0700 Subject: mm/net: Rename and move page fragment handling from net/ to mm/ This change moves the __alloc_page_frag functionality out of the networking stack and into the page allocation portion of mm. 
The idea is to help make this maintainable by placing it with other page allocation functions. Since we are moving it from skbuff.c to page_alloc.c I have also renamed the basic defines and structure from netdev_alloc_cache to page_frag_cache to reflect that this is now part of a different kernel subsystem. I have also added a simple __free_page_frag function which can handle freeing the frags based on the skb->head pointer. The model for this is based off of __free_pages since we don't actually need to deal with all of the cases that put_page handles. I incorporated the virt_to_head_page call and compound_order into the function as it actually allows for a significant size reduction by reducing code duplication. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/gfp.h | 5 +++++ include/linux/mm_types.h | 18 ++++++++++++++++++ include/linux/skbuff.h | 3 --- 3 files changed, 23 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 97a9373e61e8..70a7fee1efb3 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -366,6 +366,11 @@ extern void free_pages(unsigned long addr, unsigned int order); extern void free_hot_cold_page(struct page *page, bool cold); extern void free_hot_cold_page_list(struct list_head *list, bool cold); +struct page_frag_cache; +extern void *__alloc_page_frag(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask); +extern void __free_page_frag(void *addr); + extern void __free_kmem_pages(struct page *page, unsigned int order); extern void free_kmem_pages(unsigned long addr, unsigned int order); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8d37e26a1007..0038ac7466fd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -226,6 +226,24 @@ struct page_frag { #endif }; +#define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) +#define PAGE_FRAG_CACHE_MAX_ORDER
get_order(PAGE_FRAG_CACHE_MAX_SIZE) + +struct page_frag_cache { + void * va; +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + __u16 offset; + __u16 size; +#else + __u32 offset; +#endif + /* we maintain a pagecount bias, so that we dont dirty cache line + * containing page->_count every time we allocate a fragment. + */ + unsigned int pagecnt_bias; + bool pfmemalloc; +}; + typedef unsigned long __nocast vm_flags_t; /* diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8b9a2c35a9d7..0039fcc45b3b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2128,9 +2128,6 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) kfree_skb(skb); } -#define NETDEV_FRAG_PAGE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) -#define NETDEV_FRAG_PAGE_MAX_ORDER get_order(NETDEV_FRAG_PAGE_MAX_SIZE) - void *netdev_alloc_frag(unsigned int fragsz); struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, -- cgit v1.2.3 From 181edb2bfa22b50817684135ab6430ed2808abf0 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 6 May 2015 21:12:03 -0700 Subject: net: Add skb_free_frag to replace use of put_page in freeing skb->head This change adds a function called skb_free_frag which is meant to complement the function netdev_alloc_frag. The general idea is to enable a more lightweight version of page freeing since we don't actually need all the overhead of a put_page, and we don't quite fit the model of __free_pages. Signed-off-by: Alexander Duyck Signed-off-by: David S.
Miller --- include/linux/skbuff.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0039fcc45b3b..c0b574a414e7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2182,6 +2182,11 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev, return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC); } +static inline void skb_free_frag(void *addr) +{ + __free_page_frag(addr); +} + void *napi_alloc_frag(unsigned int fragsz); struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int length, gfp_t gfp_mask); -- cgit v1.2.3 From 755a27e7e4c817dd51ade41668b380f26026899c Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Sun, 3 May 2015 18:18:02 +0800 Subject: tracing: remove unused ftrace_output_event() prototype The prototype of ftrace_output_event was added by commit 1d6bae966e90 ("tracing: Move raw output code from macro to standalone function") but this function was not defined anywhere, and is still nowhere to be found. 
Link: http://lkml.kernel.org/r/1430648282-25792-1-git-send-email-nicolas.iooss_linux@m4x.org Signed-off-by: Nicolas Iooss Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index f9ecf63d47f1..65ce6de91307 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -219,9 +219,6 @@ struct ftrace_event_class { extern int ftrace_event_reg(struct ftrace_event_call *event, enum trace_reg type, void *data); -int ftrace_output_event(struct trace_iterator *iter, struct ftrace_event_call *event, - char *fmt, ...); - int ftrace_event_define_field(struct ftrace_event_call *call, char *type, int len, char *item, int offset, int field_size, int sign, int filter); -- cgit v1.2.3 From 020af89a41c41fd2c92d0da524968dfaba6269f0 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Mon, 27 Apr 2015 21:24:30 +0300 Subject: PM / sleep: Add macro to define common noirq system PM callbacks The same approach is used as for the existing SET_SYSTEM_SLEEP_PM_OPS, but for noirq callbacks. New SET_NOIRQ_SYSTEM_SLEEP_PM_OPS, defined for CONFIG_PM_SLEEP, will point ->suspend_noirq, ->freeze_noirq and ->poweroff_noirq to the same function. Vice versa happens for ->resume_noirq, ->thaw_noirq and ->restore_noirq. Signed-off-by: Grygorii Strashko Acked-by: Santosh Shilimkar Reviewed-by: Ulf Hansson Acked-by: Pavel Machek Reviewed-by: Kevin Hilman Signed-off-by: Rafael J. 
Wysocki --- include/linux/pm.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index 2d29c64f8fb1..4890743892ef 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -342,6 +342,18 @@ struct dev_pm_ops { #define SET_LATE_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) #endif +#ifdef CONFIG_PM_SLEEP +#define SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ + .suspend_noirq = suspend_fn, \ + .resume_noirq = resume_fn, \ + .freeze_noirq = suspend_fn, \ + .thaw_noirq = resume_fn, \ + .poweroff_noirq = suspend_fn, \ + .restore_noirq = resume_fn, +#else +#define SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) +#endif + #ifdef CONFIG_PM #define SET_RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ .runtime_suspend = suspend_fn, \ -- cgit v1.2.3 From 75f504004ab866c8f84749303b0f70953724e259 Mon Sep 17 00:00:00 2001 From: Rajendra Nayak Date: Thu, 23 Apr 2015 14:03:09 +0530 Subject: PM / clock_ops: Provide default runtime ops to users Most users of PM clocks do the exact same things in the runtime suspend/resume callbacks. Provide them USE_PM_CLK_RUNTIME_OPS so as to avoid/remove boilerplate code. Signed-off-by: Rajendra Nayak Reviewed-by: Kevin Hilman Acked-by: Santosh Shilimkar Acked-by: Geert Uytterhoeven Signed-off-by: Rafael J.
Wysocki --- include/linux/pm_clock.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_clock.h b/include/linux/pm_clock.h index 0b0039634410..25266c600021 100644 --- a/include/linux/pm_clock.h +++ b/include/linux/pm_clock.h @@ -20,6 +20,16 @@ struct pm_clk_notifier_block { struct clk; +#ifdef CONFIG_PM +extern int pm_clk_runtime_suspend(struct device *dev); +extern int pm_clk_runtime_resume(struct device *dev); +#define USE_PM_CLK_RUNTIME_OPS \ + .runtime_suspend = pm_clk_runtime_suspend, \ + .runtime_resume = pm_clk_runtime_resume, +#else +#define USE_PM_CLK_RUNTIME_OPS +#endif + #ifdef CONFIG_PM_CLK static inline bool pm_clk_no_clocks(struct device *dev) { -- cgit v1.2.3 From 9d47c0a2d958e06322c88245749278633d333cca Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sun, 10 May 2015 09:47:47 -0700 Subject: switchdev: s/swdev_/switchdev_/ Turned out that "switchdev" sticks. So just unify all related terms to use this prefix. Signed-off-by: Jiri Pirko Signed-off-by: Scott Feldman Acked-by: Roopa Prabhu Acked-by: Andy Gospodarek Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a6d706b2a947..2b39235b9f13 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1567,7 +1567,7 @@ struct net_device { const struct net_device_ops *netdev_ops; const struct ethtool_ops *ethtool_ops; #ifdef CONFIG_NET_SWITCHDEV - const struct swdev_ops *swdev_ops; + const struct switchdev_ops *switchdev_ops; #endif const struct header_ops *header_ops; -- cgit v1.2.3 From 7889cbee8357aaed85898d028829dfb4f75bae2c Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sun, 10 May 2015 09:48:07 -0700 Subject: switchdev: remove NETIF_F_HW_SWITCH_OFFLOAD feature flag Roopa said remove the feature flag for this series and she'll work on bringing it back if needed at a later date. Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 7d59dc6ab789..9672781c593d 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -66,7 +66,6 @@ enum { NETIF_F_HW_VLAN_STAG_FILTER_BIT,/* Receive filtering on VLAN STAGs */ NETIF_F_HW_L2FW_DOFFLOAD_BIT, /* Allow L2 Forwarding in Hardware */ NETIF_F_BUSY_POLL_BIT, /* Busy poll */ - NETIF_F_HW_SWITCH_OFFLOAD_BIT, /* HW switch offload */ /* * Add your fresh new feature above and remember to update @@ -125,7 +124,6 @@ enum { #define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX) #define NETIF_F_HW_L2FW_DOFFLOAD __NETIF_F(HW_L2FW_DOFFLOAD) #define NETIF_F_BUSY_POLL __NETIF_F(BUSY_POLL) -#define NETIF_F_HW_SWITCH_OFFLOAD __NETIF_F(HW_SWITCH_OFFLOAD) /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ @@ -161,8 +159,7 @@ enum { */ 
#define NETIF_F_ONE_FOR_ALL (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \ NETIF_F_SG | NETIF_F_HIGHDMA | \ - NETIF_F_FRAGLIST | NETIF_F_VLAN_CHALLENGED | \ - NETIF_F_HW_SWITCH_OFFLOAD) + NETIF_F_FRAGLIST | NETIF_F_VLAN_CHALLENGED) /* * If one device doesn't support one of these features, then disable it -- cgit v1.2.3 From 5d1d65f8bea6de3d9c2c60fdfdd2da02da5ea672 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 11 May 2015 17:48:12 +0800 Subject: crypto: aead - Convert top level interface to new style This patch converts the top-level aead interface to the new style. All user-level AEAD interface code have been moved into crypto/aead.h. The allocation/free functions have switched over to the new way of allocating tfms. This patch also removes the double indrection on setkey so the indirection now exists only at the alg level. Apart from these there are no user-visible changes. Signed-off-by: Herbert Xu --- include/linux/crypto.h | 442 +------------------------------------------------ 1 file changed, 1 insertion(+), 441 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index ee14140f8893..59ca4086ce6a 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -140,6 +140,7 @@ struct crypto_blkcipher; struct crypto_hash; struct crypto_tfm; struct crypto_type; +struct aead_request; struct aead_givcrypt_request; struct skcipher_givcrypt_request; @@ -174,32 +175,6 @@ struct ablkcipher_request { void *__ctx[] CRYPTO_MINALIGN_ATTR; }; -/** - * struct aead_request - AEAD request - * @base: Common attributes for async crypto requests - * @assoclen: Length in bytes of associated data for authentication - * @cryptlen: Length of data to be encrypted or decrypted - * @iv: Initialisation vector - * @assoc: Associated data - * @src: Source data - * @dst: Destination data - * @__ctx: Start of private context data - */ -struct aead_request { - struct crypto_async_request base; - - unsigned int assoclen; - unsigned 
int cryptlen; - - u8 *iv; - - struct scatterlist *assoc; - struct scatterlist *src; - struct scatterlist *dst; - - void *__ctx[] CRYPTO_MINALIGN_ATTR; -}; - struct blkcipher_desc { struct crypto_blkcipher *tfm; void *info; @@ -572,21 +547,6 @@ struct ablkcipher_tfm { unsigned int reqsize; }; -struct aead_tfm { - int (*setkey)(struct crypto_aead *tfm, const u8 *key, - unsigned int keylen); - int (*encrypt)(struct aead_request *req); - int (*decrypt)(struct aead_request *req); - int (*givencrypt)(struct aead_givcrypt_request *req); - int (*givdecrypt)(struct aead_givcrypt_request *req); - - struct crypto_aead *base; - - unsigned int ivsize; - unsigned int authsize; - unsigned int reqsize; -}; - struct blkcipher_tfm { void *iv; int (*setkey)(struct crypto_tfm *tfm, const u8 *key, @@ -626,7 +586,6 @@ struct compress_tfm { }; #define crt_ablkcipher crt_u.ablkcipher -#define crt_aead crt_u.aead #define crt_blkcipher crt_u.blkcipher #define crt_cipher crt_u.cipher #define crt_hash crt_u.hash @@ -638,7 +597,6 @@ struct crypto_tfm { union { struct ablkcipher_tfm ablkcipher; - struct aead_tfm aead; struct blkcipher_tfm blkcipher; struct cipher_tfm cipher; struct hash_tfm hash; @@ -656,10 +614,6 @@ struct crypto_ablkcipher { struct crypto_tfm base; }; -struct crypto_aead { - struct crypto_tfm base; -}; - struct crypto_blkcipher { struct crypto_tfm base; }; @@ -1151,400 +1105,6 @@ static inline void ablkcipher_request_set_crypt( req->info = iv; } -/** - * DOC: Authenticated Encryption With Associated Data (AEAD) Cipher API - * - * The AEAD cipher API is used with the ciphers of type CRYPTO_ALG_TYPE_AEAD - * (listed as type "aead" in /proc/crypto) - * - * The most prominent examples for this type of encryption is GCM and CCM. 
- * However, the kernel supports other types of AEAD ciphers which are defined - * with the following cipher string: - * - * authenc(keyed message digest, block cipher) - * - * For example: authenc(hmac(sha256), cbc(aes)) - * - * The example code provided for the asynchronous block cipher operation - * applies here as well. Naturally all *ablkcipher* symbols must be exchanged - * the *aead* pendants discussed in the following. In addtion, for the AEAD - * operation, the aead_request_set_assoc function must be used to set the - * pointer to the associated data memory location before performing the - * encryption or decryption operation. In case of an encryption, the associated - * data memory is filled during the encryption operation. For decryption, the - * associated data memory must contain data that is used to verify the integrity - * of the decrypted data. Another deviation from the asynchronous block cipher - * operation is that the caller should explicitly check for -EBADMSG of the - * crypto_aead_decrypt. That error indicates an authentication error, i.e. - * a breach in the integrity of the message. In essence, that -EBADMSG error - * code is the key bonus an AEAD cipher has over "standard" block chaining - * modes. - */ - -static inline struct crypto_aead *__crypto_aead_cast(struct crypto_tfm *tfm) -{ - return (struct crypto_aead *)tfm; -} - -/** - * crypto_alloc_aead() - allocate AEAD cipher handle - * @alg_name: is the cra_name / name or cra_driver_name / driver name of the - * AEAD cipher - * @type: specifies the type of the cipher - * @mask: specifies the mask for the cipher - * - * Allocate a cipher handle for an AEAD. The returned struct - * crypto_aead is the cipher handle that is required for any subsequent - * API invocation for that AEAD. - * - * Return: allocated cipher handle in case of success; IS_ERR() is true in case - * of an error, PTR_ERR() returns the error code. 
- */ -struct crypto_aead *crypto_alloc_aead(const char *alg_name, u32 type, u32 mask); - -static inline struct crypto_tfm *crypto_aead_tfm(struct crypto_aead *tfm) -{ - return &tfm->base; -} - -/** - * crypto_free_aead() - zeroize and free aead handle - * @tfm: cipher handle to be freed - */ -static inline void crypto_free_aead(struct crypto_aead *tfm) -{ - crypto_free_tfm(crypto_aead_tfm(tfm)); -} - -static inline struct aead_tfm *crypto_aead_crt(struct crypto_aead *tfm) -{ - return &crypto_aead_tfm(tfm)->crt_aead; -} - -/** - * crypto_aead_ivsize() - obtain IV size - * @tfm: cipher handle - * - * The size of the IV for the aead referenced by the cipher handle is - * returned. This IV size may be zero if the cipher does not need an IV. - * - * Return: IV size in bytes - */ -static inline unsigned int crypto_aead_ivsize(struct crypto_aead *tfm) -{ - return crypto_aead_crt(tfm)->ivsize; -} - -/** - * crypto_aead_authsize() - obtain maximum authentication data size - * @tfm: cipher handle - * - * The maximum size of the authentication data for the AEAD cipher referenced - * by the AEAD cipher handle is returned. The authentication data size may be - * zero if the cipher implements a hard-coded maximum. - * - * The authentication data may also be known as "tag value". - * - * Return: authentication data size / tag size in bytes - */ -static inline unsigned int crypto_aead_authsize(struct crypto_aead *tfm) -{ - return crypto_aead_crt(tfm)->authsize; -} - -/** - * crypto_aead_blocksize() - obtain block size of cipher - * @tfm: cipher handle - * - * The block size for the AEAD referenced with the cipher handle is returned. 
- * The caller may use that information to allocate appropriate memory for the - * data returned by the encryption or decryption operation - * - * Return: block size of cipher - */ -static inline unsigned int crypto_aead_blocksize(struct crypto_aead *tfm) -{ - return crypto_tfm_alg_blocksize(crypto_aead_tfm(tfm)); -} - -static inline unsigned int crypto_aead_alignmask(struct crypto_aead *tfm) -{ - return crypto_tfm_alg_alignmask(crypto_aead_tfm(tfm)); -} - -static inline u32 crypto_aead_get_flags(struct crypto_aead *tfm) -{ - return crypto_tfm_get_flags(crypto_aead_tfm(tfm)); -} - -static inline void crypto_aead_set_flags(struct crypto_aead *tfm, u32 flags) -{ - crypto_tfm_set_flags(crypto_aead_tfm(tfm), flags); -} - -static inline void crypto_aead_clear_flags(struct crypto_aead *tfm, u32 flags) -{ - crypto_tfm_clear_flags(crypto_aead_tfm(tfm), flags); -} - -/** - * crypto_aead_setkey() - set key for cipher - * @tfm: cipher handle - * @key: buffer holding the key - * @keylen: length of the key in bytes - * - * The caller provided key is set for the AEAD referenced by the cipher - * handle. - * - * Note, the key length determines the cipher type. Many block ciphers implement - * different cipher modes depending on the key size, such as AES-128 vs AES-192 - * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128 - * is performed. - * - * Return: 0 if the setting of the key was successful; < 0 if an error occurred - */ -static inline int crypto_aead_setkey(struct crypto_aead *tfm, const u8 *key, - unsigned int keylen) -{ - struct aead_tfm *crt = crypto_aead_crt(tfm); - - return crt->setkey(crt->base, key, keylen); -} - -/** - * crypto_aead_setauthsize() - set authentication data size - * @tfm: cipher handle - * @authsize: size of the authentication data / tag in bytes - * - * Set the authentication data size / tag size. AEAD requires an authentication - * tag (or MAC) in addition to the associated data. 
- * - * Return: 0 if the setting of the key was successful; < 0 if an error occurred - */ -int crypto_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize); - -static inline struct crypto_aead *crypto_aead_reqtfm(struct aead_request *req) -{ - return __crypto_aead_cast(req->base.tfm); -} - -/** - * crypto_aead_encrypt() - encrypt plaintext - * @req: reference to the aead_request handle that holds all information - * needed to perform the cipher operation - * - * Encrypt plaintext data using the aead_request handle. That data structure - * and how it is filled with data is discussed with the aead_request_* - * functions. - * - * IMPORTANT NOTE The encryption operation creates the authentication data / - * tag. That data is concatenated with the created ciphertext. - * The ciphertext memory size is therefore the given number of - * block cipher blocks + the size defined by the - * crypto_aead_setauthsize invocation. The caller must ensure - * that sufficient memory is available for the ciphertext and - * the authentication tag. - * - * Return: 0 if the cipher operation was successful; < 0 if an error occurred - */ -static inline int crypto_aead_encrypt(struct aead_request *req) -{ - return crypto_aead_crt(crypto_aead_reqtfm(req))->encrypt(req); -} - -/** - * crypto_aead_decrypt() - decrypt ciphertext - * @req: reference to the ablkcipher_request handle that holds all information - * needed to perform the cipher operation - * - * Decrypt ciphertext data using the aead_request handle. That data structure - * and how it is filled with data is discussed with the aead_request_* - * functions. - * - * IMPORTANT NOTE The caller must concatenate the ciphertext followed by the - * authentication data / tag. That authentication data / tag - * must have the size defined by the crypto_aead_setauthsize - * invocation. 
- * - * - * Return: 0 if the cipher operation was successful; -EBADMSG: The AEAD - * cipher operation performs the authentication of the data during the - * decryption operation. Therefore, the function returns this error if - * the authentication of the ciphertext was unsuccessful (i.e. the - * integrity of the ciphertext or the associated data was violated); - * < 0 if an error occurred. - */ -static inline int crypto_aead_decrypt(struct aead_request *req) -{ - if (req->cryptlen < crypto_aead_authsize(crypto_aead_reqtfm(req))) - return -EINVAL; - - return crypto_aead_crt(crypto_aead_reqtfm(req))->decrypt(req); -} - -/** - * DOC: Asynchronous AEAD Request Handle - * - * The aead_request data structure contains all pointers to data required for - * the AEAD cipher operation. This includes the cipher handle (which can be - * used by multiple aead_request instances), pointer to plaintext and - * ciphertext, asynchronous callback function, etc. It acts as a handle to the - * aead_request_* API calls in a similar way as AEAD handle to the - * crypto_aead_* API calls. - */ - -/** - * crypto_aead_reqsize() - obtain size of the request data structure - * @tfm: cipher handle - * - * Return: number of bytes - */ -static inline unsigned int crypto_aead_reqsize(struct crypto_aead *tfm) -{ - return crypto_aead_crt(tfm)->reqsize; -} - -/** - * aead_request_set_tfm() - update cipher handle reference in request - * @req: request handle to be modified - * @tfm: cipher handle that shall be added to the request handle - * - * Allow the caller to replace the existing aead handle in the request - * data structure with a different one. 
- */ -static inline void aead_request_set_tfm(struct aead_request *req, - struct crypto_aead *tfm) -{ - req->base.tfm = crypto_aead_tfm(crypto_aead_crt(tfm)->base); -} - -/** - * aead_request_alloc() - allocate request data structure - * @tfm: cipher handle to be registered with the request - * @gfp: memory allocation flag that is handed to kmalloc by the API call. - * - * Allocate the request data structure that must be used with the AEAD - * encrypt and decrypt API calls. During the allocation, the provided aead - * handle is registered in the request data structure. - * - * Return: allocated request handle in case of success; IS_ERR() is true in case - * of an error, PTR_ERR() returns the error code. - */ -static inline struct aead_request *aead_request_alloc(struct crypto_aead *tfm, - gfp_t gfp) -{ - struct aead_request *req; - - req = kmalloc(sizeof(*req) + crypto_aead_reqsize(tfm), gfp); - - if (likely(req)) - aead_request_set_tfm(req, tfm); - - return req; -} - -/** - * aead_request_free() - zeroize and free request data structure - * @req: request data structure cipher handle to be freed - */ -static inline void aead_request_free(struct aead_request *req) -{ - kzfree(req); -} - -/** - * aead_request_set_callback() - set asynchronous callback function - * @req: request handle - * @flags: specify zero or an ORing of the flags - * CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and - * increase the wait queue beyond the initial maximum size; - * CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep - * @compl: callback function pointer to be registered with the request handle - * @data: The data pointer refers to memory that is not used by the kernel - * crypto API, but provided to the callback function for it to use. Here, - * the caller can provide a reference to memory the callback function can - * operate on. 
As the callback function is invoked asynchronously to the - * related functionality, it may need to access data structures of the - * related functionality which can be referenced using this pointer. The - * callback function can access the memory via the "data" field in the - * crypto_async_request data structure provided to the callback function. - * - * Setting the callback function that is triggered once the cipher operation - * completes - * - * The callback function is registered with the aead_request handle and - * must comply with the following template - * - * void callback_function(struct crypto_async_request *req, int error) - */ -static inline void aead_request_set_callback(struct aead_request *req, - u32 flags, - crypto_completion_t compl, - void *data) -{ - req->base.complete = compl; - req->base.data = data; - req->base.flags = flags; -} - -/** - * aead_request_set_crypt - set data buffers - * @req: request handle - * @src: source scatter / gather list - * @dst: destination scatter / gather list - * @cryptlen: number of bytes to process from @src - * @iv: IV for the cipher operation which must comply with the IV size defined - * by crypto_aead_ivsize() - * - * Setting the source data and destination data scatter / gather lists. - * - * For encryption, the source is treated as the plaintext and the - * destination is the ciphertext. For a decryption operation, the use is - * reversed - the source is the ciphertext and the destination is the plaintext. - * - * IMPORTANT NOTE AEAD requires an authentication tag (MAC). For decryption, - * the caller must concatenate the ciphertext followed by the - * authentication tag and provide the entire data stream to the - * decryption operation (i.e. the data length used for the - * initialization of the scatterlist and the data length for the - * decryption operation is identical). For encryption, however, - * the authentication tag is created while encrypting the data. 
- * The destination buffer must hold sufficient space for the - * ciphertext and the authentication tag while the encryption - * invocation must only point to the plaintext data size. The - * following code snippet illustrates the memory usage - * buffer = kmalloc(ptbuflen + (enc ? authsize : 0)); - * sg_init_one(&sg, buffer, ptbuflen + (enc ? authsize : 0)); - * aead_request_set_crypt(req, &sg, &sg, ptbuflen, iv); - */ -static inline void aead_request_set_crypt(struct aead_request *req, - struct scatterlist *src, - struct scatterlist *dst, - unsigned int cryptlen, u8 *iv) -{ - req->src = src; - req->dst = dst; - req->cryptlen = cryptlen; - req->iv = iv; -} - -/** - * aead_request_set_assoc() - set the associated data scatter / gather list - * @req: request handle - * @assoc: associated data scatter / gather list - * @assoclen: number of bytes to process from @assoc - * - * For encryption, the memory is filled with the associated data. For - * decryption, the memory must point to the associated data. - */ -static inline void aead_request_set_assoc(struct aead_request *req, - struct scatterlist *assoc, - unsigned int assoclen) -{ - req->assoc = assoc; - req->assoclen = assoclen; -} - /** * DOC: Synchronous Block Cipher API * -- cgit v1.2.3 From a2029240e5836e73ebcc1a8ddb8c22d636f89c9a Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 11 May 2015 21:17:53 +0200 Subject: net: deinline netif_tx_stop_all_queues(), remove WARN_ON in netif_tx_stop_queue() These functions compile to 60 bytes of machine code each. With this .config: http://busybox.net/~vda/kernel_config there are 617 calls of netif_tx_stop_queue() and 49 calls of netif_tx_stop_all_queues() in vmlinux. To fix this, remove WARN_ON in netif_tx_stop_queue() as suggested by davem, and deinline netif_tx_stop_all_queues(). 
Change in code size is about 20k: text data bss dec hex filename 82426986 22255416 20627456 125309858 77813a2 vmlinux.before 82406248 22255416 20627456 125289120 777c2a0 vmlinux gcc-4.7.2 still creates deinlined version of netif_tx_stop_queue sometimes: $ nm --size-sort vmlinux | grep netif_tx_stop_queue | wc -l 190 ffffffff81b558a8 : ffffffff81b558a8: 55 push %rbp ffffffff81b558a9: 48 89 e5 mov %rsp,%rbp ffffffff81b558ac: f0 80 8f e0 01 00 00 lock orb $0x1,0x1e0(%rdi) ffffffff81b558b3: 01 ffffffff81b558b4: 5d pop %rbp ffffffff81b558b5: c3 retq This needs additional fixing. Signed-off-by: Denys Vlasenko CC: Alexei Starovoitov CC: Alexander Duyck CC: Joe Perches CC: David S. Miller CC: Jiri Pirko CC: linux-kernel@vger.kernel.org CC: netdev@vger.kernel.org CC: netfilter-devel@vger.kernel.org Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2b39235b9f13..fa57915f440c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2559,10 +2559,6 @@ static inline void netif_tx_wake_all_queues(struct net_device *dev) static inline void netif_tx_stop_queue(struct netdev_queue *dev_queue) { - if (WARN_ON(!dev_queue)) { - pr_info("netif_stop_queue() cannot be called before register_netdev()\n"); - return; - } set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state); } @@ -2578,15 +2574,7 @@ static inline void netif_stop_queue(struct net_device *dev) netif_tx_stop_queue(netdev_get_tx_queue(dev, 0)); } -static inline void netif_tx_stop_all_queues(struct net_device *dev) -{ - unsigned int i; - - for (i = 0; i < dev->num_tx_queues; i++) { - struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - netif_tx_stop_queue(txq); - } -} +void netif_tx_stop_all_queues(struct net_device *dev); static inline bool netif_tx_queue_stopped(const struct netdev_queue *dev_queue) { 
-- cgit v1.2.3 From cffc642d93f9324a06dfbd7da9af29652952a248 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Mon, 11 May 2015 22:22:44 -0700 Subject: test_bpf: add 173 new testcases for eBPF add an exhaustive set of eBPF tests bringing total to: test_bpf: Summary: 233 PASSED, 0 FAILED, [0/226 JIT'ed] Signed-off-by: Michael Holzheu Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/filter.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 3c03a6085b82..ce1d72d34382 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -207,6 +207,16 @@ struct bpf_prog_aux; .off = OFF, \ .imm = 0 }) +/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */ + +#define BPF_STX_XADD(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ #define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ -- cgit v1.2.3 From 9abdffe286c1532a54d5aee31571d3029be4026c Mon Sep 17 00:00:00 2001 From: Sumit Semwal Date: Tue, 5 May 2015 14:56:15 +0530 Subject: dma-buf: add ref counting for module as exporter Add reference counting on a kernel module that exports dma-buf and implements its operations. This prevents the module from being unloaded while DMABUF file is in use. The original patch [1] was submitted by Tomasz Stanislawski, but this is a simpler way to do it. v3: call module_put() as late as possible, per gregkh's comment. v2: move owner to struct dma_buf, and use DEFINE_DMA_BUF_EXPORT_INFO macro to simplify the change. 
Acked-by: Greg Kroah-Hartman Signed-off-by: Sumit Semwal [1]: https://lkml.org/lkml/2012/8/8/163 --- include/linux/dma-buf.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 2f0b431b73e0..f98bd7068d55 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -115,6 +115,8 @@ struct dma_buf_ops { * @attachments: list of dma_buf_attachment that denotes all devices attached. * @ops: dma_buf_ops associated with this buffer object. * @exp_name: name of the exporter; useful for debugging. + * @owner: pointer to exporter module; used for refcounting when exporter is a + * kernel module. * @list_node: node for dma_buf accounting and debugging. * @priv: exporter specific private data for this buffer object. * @resv: reservation object linked to this dma-buf @@ -129,6 +131,7 @@ struct dma_buf { unsigned vmapping_counter; void *vmap_ptr; const char *exp_name; + struct module *owner; struct list_head list_node; void *priv; struct reservation_object *resv; @@ -164,7 +167,8 @@ struct dma_buf_attachment { /** * struct dma_buf_export_info - holds information needed to export a dma_buf - * @exp_name: name of the exporting module - useful for debugging. + * @exp_name: name of the exporter - useful for debugging. 
+ * @owner: pointer to exporter module - used for refcounting kernel module * @ops: Attach allocator-defined dma buf ops to the new buffer * @size: Size of the buffer * @flags: mode flags for the file @@ -176,6 +180,7 @@ struct dma_buf_attachment { */ struct dma_buf_export_info { const char *exp_name; + struct module *owner; const struct dma_buf_ops *ops; size_t size; int flags; @@ -187,7 +192,8 @@ struct dma_buf_export_info { * helper macro for exporters; zeros and fills in most common values */ #define DEFINE_DMA_BUF_EXPORT_INFO(a) \ - struct dma_buf_export_info a = { .exp_name = KBUILD_MODNAME } + struct dma_buf_export_info a = { .exp_name = KBUILD_MODNAME, \ + .owner = THIS_MODULE } /** * get_dma_buf - convenience wrapper for get_file. -- cgit v1.2.3 From 25e4fe92a20bbffde87500615250f1d54bfb832f Mon Sep 17 00:00:00 2001 From: Dmitry Eremin-Solenikov Date: Tue, 12 May 2015 20:12:23 +0300 Subject: gpiolib: cleanup chained handler and data Clean up chained handler and handler data if they were set by gpiochip_set_chained_irqchip(). Signed-off-by: Dmitry Eremin-Solenikov Signed-off-by: Linus Walleij --- include/linux/gpio/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 2c1e639f66bd..96a678842cde 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -121,6 +121,7 @@ struct gpio_chip { unsigned int irq_base; irq_flow_handler_t irq_handler; unsigned int irq_default_type; + int irq_parent; #endif #if defined(CONFIG_OF_GPIO) -- cgit v1.2.3 From f05be589ff32e87821b86845625ed3d402d37dc7 Mon Sep 17 00:00:00 2001 From: Boris BREZILLON Date: Fri, 10 Apr 2015 12:09:01 +0800 Subject: mfd: axp20x: Add AXP22x PMIC support Add support for the AXP22x PMIC devices to the existing AXP20x driver. This includes the AXP221 and AXP223, which are identical except for the external data bus. Only AXP221 is added for now. 
AXP223 will be added after its Reduced Serial Bus (RSB) interface is supported. AXP22x defines a new set of registers, power supplies and regulators, but most of the API is similar to the AXP20x ones. A new irq chip definition is used, even though the available interrupts on AXP22x are a subset of those on AXP20x. This is done so the interrupt numbers match those on the datasheet. This patch only enables the interrupts, system power-off function, and PEK sub-device. The regulator driver must first support different variants before we enable it from the mfd driver. Signed-off-by: Boris BREZILLON [wens@csie.org: fix interrupts and move regulators to separate patch] Signed-off-by: Chen-Yu Tsai Signed-off-by: Lee Jones --- include/linux/mfd/axp20x.h | 86 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h index dfabd6db7ddf..95568eb798c3 100644 --- a/include/linux/mfd/axp20x.h +++ b/include/linux/mfd/axp20x.h @@ -14,6 +14,7 @@ enum { AXP202_ID = 0, AXP209_ID, + AXP221_ID, AXP288_ID, NR_AXP20X_VARIANTS, }; @@ -45,6 +46,28 @@ enum { #define AXP20X_V_LTF_DISCHRG 0x3c #define AXP20X_V_HTF_DISCHRG 0x3d +#define AXP22X_PWR_OUT_CTRL1 0x10 +#define AXP22X_PWR_OUT_CTRL2 0x12 +#define AXP22X_PWR_OUT_CTRL3 0x13 +#define AXP22X_DLDO1_V_OUT 0x15 +#define AXP22X_DLDO2_V_OUT 0x16 +#define AXP22X_DLDO3_V_OUT 0x17 +#define AXP22X_DLDO4_V_OUT 0x18 +#define AXP22X_ELDO1_V_OUT 0x19 +#define AXP22X_ELDO2_V_OUT 0x1a +#define AXP22X_ELDO3_V_OUT 0x1b +#define AXP22X_DC5LDO_V_OUT 0x1c +#define AXP22X_DCDC1_V_OUT 0x21 +#define AXP22X_DCDC2_V_OUT 0x22 +#define AXP22X_DCDC3_V_OUT 0x23 +#define AXP22X_DCDC4_V_OUT 0x24 +#define AXP22X_DCDC5_V_OUT 0x25 +#define AXP22X_DCDC23_V_RAMP_CTRL 0x27 +#define AXP22X_ALDO1_V_OUT 0x28 +#define AXP22X_ALDO2_V_OUT 0x29 +#define AXP22X_ALDO3_V_OUT 0x2a +#define AXP22X_CHRG_CTRL3 0x35 + /* Interrupt */ #define AXP20X_IRQ1_EN 0x40 #define 
AXP20X_IRQ2_EN 0x41 @@ -100,6 +123,9 @@ enum { #define AXP20X_VBUS_MON 0x8b #define AXP20X_OVER_TMP 0x8f +#define AXP22X_PWREN_CTRL1 0x8c +#define AXP22X_PWREN_CTRL2 0x8d + /* GPIO */ #define AXP20X_GPIO0_CTRL 0x90 #define AXP20X_LDO5_V_OUT 0x91 @@ -108,6 +134,11 @@ enum { #define AXP20X_GPIO20_SS 0x94 #define AXP20X_GPIO3_CTRL 0x95 +#define AXP22X_LDO_IO0_V_OUT 0x91 +#define AXP22X_LDO_IO1_V_OUT 0x93 +#define AXP22X_GPIO_STATE 0x94 +#define AXP22X_GPIO_PULL_DOWN 0x95 + /* Battery */ #define AXP20X_CHRG_CC_31_24 0xb0 #define AXP20X_CHRG_CC_23_16 0xb1 @@ -120,6 +151,9 @@ enum { #define AXP20X_CC_CTRL 0xb8 #define AXP20X_FG_RES 0xb9 +/* AXP22X specific registers */ +#define AXP22X_BATLOW_THRES1 0xe6 + /* AXP288 specific registers */ #define AXP288_PMIC_ADC_H 0x56 #define AXP288_PMIC_ADC_L 0x57 @@ -158,6 +192,30 @@ enum { AXP20X_REG_ID_MAX, }; +enum { + AXP22X_DCDC1 = 0, + AXP22X_DCDC2, + AXP22X_DCDC3, + AXP22X_DCDC4, + AXP22X_DCDC5, + AXP22X_DC1SW, + AXP22X_DC5LDO, + AXP22X_ALDO1, + AXP22X_ALDO2, + AXP22X_ALDO3, + AXP22X_ELDO1, + AXP22X_ELDO2, + AXP22X_ELDO3, + AXP22X_DLDO1, + AXP22X_DLDO2, + AXP22X_DLDO3, + AXP22X_DLDO4, + AXP22X_RTC_LDO, + AXP22X_LDO_IO0, + AXP22X_LDO_IO1, + AXP22X_REG_ID_MAX, +}; + /* IRQs */ enum { AXP20X_IRQ_ACIN_OVER_V = 1, @@ -199,6 +257,34 @@ enum { AXP20X_IRQ_GPIO0_INPUT, }; +enum axp22x_irqs { + AXP22X_IRQ_ACIN_OVER_V = 1, + AXP22X_IRQ_ACIN_PLUGIN, + AXP22X_IRQ_ACIN_REMOVAL, + AXP22X_IRQ_VBUS_OVER_V, + AXP22X_IRQ_VBUS_PLUGIN, + AXP22X_IRQ_VBUS_REMOVAL, + AXP22X_IRQ_VBUS_V_LOW, + AXP22X_IRQ_BATT_PLUGIN, + AXP22X_IRQ_BATT_REMOVAL, + AXP22X_IRQ_BATT_ENT_ACT_MODE, + AXP22X_IRQ_BATT_EXIT_ACT_MODE, + AXP22X_IRQ_CHARG, + AXP22X_IRQ_CHARG_DONE, + AXP22X_IRQ_BATT_TEMP_HIGH, + AXP22X_IRQ_BATT_TEMP_LOW, + AXP22X_IRQ_DIE_TEMP_HIGH, + AXP22X_IRQ_PEK_SHORT, + AXP22X_IRQ_PEK_LONG, + AXP22X_IRQ_LOW_PWR_LVL1, + AXP22X_IRQ_LOW_PWR_LVL2, + AXP22X_IRQ_TIMER, + AXP22X_IRQ_PEK_RIS_EDGE, + AXP22X_IRQ_PEK_FAL_EDGE, + AXP22X_IRQ_GPIO1_INPUT, + 
AXP22X_IRQ_GPIO0_INPUT, +}; + enum axp288_irqs { AXP288_IRQ_VBUS_FALL = 2, AXP288_IRQ_VBUS_RISE, -- cgit v1.2.3 From 275e2bc0f25d5eb99c99ebb7293fc3722533124b Mon Sep 17 00:00:00 2001 From: Sergey Popovich Date: Sat, 2 May 2015 19:28:17 +0200 Subject: netfilter: ipset: Fix ext_*() macros So pointers returned by these macros could be referenced with -> directly. Signed-off-by: Sergey Popovich Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 34b172301558..f88be7258e5f 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -122,13 +122,13 @@ struct ip_set_skbinfo { struct ip_set; #define ext_timeout(e, s) \ -(unsigned long *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_TIMEOUT]) +((unsigned long *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_TIMEOUT])) #define ext_counter(e, s) \ -(struct ip_set_counter *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COUNTER]) +((struct ip_set_counter *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COUNTER])) #define ext_comment(e, s) \ -(struct ip_set_comment *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COMMENT]) +((struct ip_set_comment *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COMMENT])) #define ext_skbinfo(e, s) \ -(struct ip_set_skbinfo *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_SKBINFO]) +((struct ip_set_skbinfo *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_SKBINFO])) typedef int (*ipset_adtfn)(struct ip_set *set, void *value, const struct ip_set_ext *ext, -- cgit v1.2.3 From 289fcff4bcdb1dcc0ce8788b7ea0f58a9e4a495f Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Wed, 13 May 2015 15:26:42 +0300 Subject: usb: add bus type for USB ULPI UTMI+ Low Pin Interface (ULPI) is a commonly used PHY interface for USB 2.0. 
The ULPI specification describes a standard set of registers which the vendors can extend for their specific needs. ULPI PHYs often provide functions such as charger detection and ADP sensing and probing. There are two major issues that the bus type is meant to tackle: Firstly, ULPI registers are accessed from the controller. The bus provides a convenient method for the controller drivers to share that access with the actual PHY drivers. Secondly, there are already platforms that assume ULPI PHYs are runtime detected, such as many Intel Baytrail based platforms. They do not provide any kind of hardware description for the ULPI PHYs like a separate ACPI device object that could be used to enumerate a device from. Signed-off-by: Heikki Krogerus Acked-by: David Cohen Signed-off-by: Felipe Balbi --- include/linux/mod_devicetable.h | 6 ++ include/linux/ulpi/driver.h | 60 ++++++++++++++++++ include/linux/ulpi/interface.h | 23 +++++++ include/linux/ulpi/regs.h | 130 ++++++++++++++++++++++++++++++++++++++ include/linux/usb/ulpi.h | 134 +--------------------------------------- 5 files changed, 221 insertions(+), 132 deletions(-) create mode 100644 include/linux/ulpi/driver.h create mode 100644 include/linux/ulpi/interface.h create mode 100644 include/linux/ulpi/regs.h (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 3bfd56778c29..7ab00d61d30a 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -629,4 +629,10 @@ struct mcb_device_id { kernel_ulong_t driver_data; }; +struct ulpi_device_id { + __u16 vendor; + __u16 product; + kernel_ulong_t driver_data; +}; + #endif /* LINUX_MOD_DEVICETABLE_H */ diff --git a/include/linux/ulpi/driver.h b/include/linux/ulpi/driver.h new file mode 100644 index 000000000000..388f6e08b9d4 --- /dev/null +++ b/include/linux/ulpi/driver.h @@ -0,0 +1,60 @@ +#ifndef __LINUX_ULPI_DRIVER_H +#define __LINUX_ULPI_DRIVER_H + +#include + +#include + +struct 
ulpi_ops; + +/** + * struct ulpi - describes ULPI PHY device + * @id: vendor and product ids for ULPI device + * @ops: I/O access + * @dev: device interface + */ +struct ulpi { + struct ulpi_device_id id; + struct ulpi_ops *ops; + struct device dev; +}; + +#define to_ulpi_dev(d) container_of(d, struct ulpi, dev) + +static inline void ulpi_set_drvdata(struct ulpi *ulpi, void *data) +{ + dev_set_drvdata(&ulpi->dev, data); +} + +static inline void *ulpi_get_drvdata(struct ulpi *ulpi) +{ + return dev_get_drvdata(&ulpi->dev); +} + +/** + * struct ulpi_driver - describes a ULPI PHY driver + * @id_table: array of device identifiers supported by this driver + * @probe: binds this driver to ULPI device + * @remove: unbinds this driver from ULPI device + * @driver: the name and owner members must be initialized by the drivers + */ +struct ulpi_driver { + const struct ulpi_device_id *id_table; + int (*probe)(struct ulpi *ulpi); + void (*remove)(struct ulpi *ulpi); + struct device_driver driver; +}; + +#define to_ulpi_driver(d) container_of(d, struct ulpi_driver, driver) + +int ulpi_register_driver(struct ulpi_driver *drv); +void ulpi_unregister_driver(struct ulpi_driver *drv); + +#define module_ulpi_driver(__ulpi_driver) \ + module_driver(__ulpi_driver, ulpi_register_driver, \ + ulpi_unregister_driver) + +int ulpi_read(struct ulpi *ulpi, u8 addr); +int ulpi_write(struct ulpi *ulpi, u8 addr, u8 val); + +#endif /* __LINUX_ULPI_DRIVER_H */ diff --git a/include/linux/ulpi/interface.h b/include/linux/ulpi/interface.h new file mode 100644 index 000000000000..4de8ab491038 --- /dev/null +++ b/include/linux/ulpi/interface.h @@ -0,0 +1,23 @@ +#ifndef __LINUX_ULPI_INTERFACE_H +#define __LINUX_ULPI_INTERFACE_H + +#include + +struct ulpi; + +/** + * struct ulpi_ops - ULPI register access + * @dev: the interface provider + * @read: read operation for ULPI register access + * @write: write operation for ULPI register access + */ +struct ulpi_ops { + struct device *dev; + int (*read)(struct 
ulpi_ops *ops, u8 addr); + int (*write)(struct ulpi_ops *ops, u8 addr, u8 val); +}; + +struct ulpi *ulpi_register_interface(struct device *, struct ulpi_ops *); +void ulpi_unregister_interface(struct ulpi *); + +#endif /* __LINUX_ULPI_INTERFACE_H */ diff --git a/include/linux/ulpi/regs.h b/include/linux/ulpi/regs.h new file mode 100644 index 000000000000..b5b8b8804560 --- /dev/null +++ b/include/linux/ulpi/regs.h @@ -0,0 +1,130 @@ +#ifndef __LINUX_ULPI_REGS_H +#define __LINUX_ULPI_REGS_H + +/* + * Macros for Set and Clear + * See ULPI 1.1 specification to find the registers with Set and Clear offsets + */ +#define ULPI_SET(a) (a + 1) +#define ULPI_CLR(a) (a + 2) + +/* + * Register Map + */ +#define ULPI_VENDOR_ID_LOW 0x00 +#define ULPI_VENDOR_ID_HIGH 0x01 +#define ULPI_PRODUCT_ID_LOW 0x02 +#define ULPI_PRODUCT_ID_HIGH 0x03 +#define ULPI_FUNC_CTRL 0x04 +#define ULPI_IFC_CTRL 0x07 +#define ULPI_OTG_CTRL 0x0a +#define ULPI_USB_INT_EN_RISE 0x0d +#define ULPI_USB_INT_EN_FALL 0x10 +#define ULPI_USB_INT_STS 0x13 +#define ULPI_USB_INT_LATCH 0x14 +#define ULPI_DEBUG 0x15 +#define ULPI_SCRATCH 0x16 +/* Optional Carkit Registers */ +#define ULPI_CARKIT_CTRL 0x19 +#define ULPI_CARKIT_INT_DELAY 0x1c +#define ULPI_CARKIT_INT_EN 0x1d +#define ULPI_CARKIT_INT_STS 0x20 +#define ULPI_CARKIT_INT_LATCH 0x21 +#define ULPI_CARKIT_PLS_CTRL 0x22 +/* Other Optional Registers */ +#define ULPI_TX_POS_WIDTH 0x25 +#define ULPI_TX_NEG_WIDTH 0x26 +#define ULPI_POLARITY_RECOVERY 0x27 +/* Access Extended Register Set */ +#define ULPI_ACCESS_EXTENDED 0x2f +/* Vendor Specific */ +#define ULPI_VENDOR_SPECIFIC 0x30 +/* Extended Registers */ +#define ULPI_EXT_VENDOR_SPECIFIC 0x80 + +/* + * Register Bits + */ + +/* Function Control */ +#define ULPI_FUNC_CTRL_XCVRSEL BIT(0) +#define ULPI_FUNC_CTRL_XCVRSEL_MASK 0x3 +#define ULPI_FUNC_CTRL_HIGH_SPEED 0x0 +#define ULPI_FUNC_CTRL_FULL_SPEED 0x1 +#define ULPI_FUNC_CTRL_LOW_SPEED 0x2 +#define ULPI_FUNC_CTRL_FS4LS 0x3 +#define ULPI_FUNC_CTRL_TERMSELECT BIT(2) 
+#define ULPI_FUNC_CTRL_OPMODE BIT(3) +#define ULPI_FUNC_CTRL_OPMODE_MASK (0x3 << 3) +#define ULPI_FUNC_CTRL_OPMODE_NORMAL (0x0 << 3) +#define ULPI_FUNC_CTRL_OPMODE_NONDRIVING (0x1 << 3) +#define ULPI_FUNC_CTRL_OPMODE_DISABLE_NRZI (0x2 << 3) +#define ULPI_FUNC_CTRL_OPMODE_NOSYNC_NOEOP (0x3 << 3) +#define ULPI_FUNC_CTRL_RESET BIT(5) +#define ULPI_FUNC_CTRL_SUSPENDM BIT(6) + +/* Interface Control */ +#define ULPI_IFC_CTRL_6_PIN_SERIAL_MODE BIT(0) +#define ULPI_IFC_CTRL_3_PIN_SERIAL_MODE BIT(1) +#define ULPI_IFC_CTRL_CARKITMODE BIT(2) +#define ULPI_IFC_CTRL_CLOCKSUSPENDM BIT(3) +#define ULPI_IFC_CTRL_AUTORESUME BIT(4) +#define ULPI_IFC_CTRL_EXTERNAL_VBUS BIT(5) +#define ULPI_IFC_CTRL_PASSTHRU BIT(6) +#define ULPI_IFC_CTRL_PROTECT_IFC_DISABLE BIT(7) + +/* OTG Control */ +#define ULPI_OTG_CTRL_ID_PULLUP BIT(0) +#define ULPI_OTG_CTRL_DP_PULLDOWN BIT(1) +#define ULPI_OTG_CTRL_DM_PULLDOWN BIT(2) +#define ULPI_OTG_CTRL_DISCHRGVBUS BIT(3) +#define ULPI_OTG_CTRL_CHRGVBUS BIT(4) +#define ULPI_OTG_CTRL_DRVVBUS BIT(5) +#define ULPI_OTG_CTRL_DRVVBUS_EXT BIT(6) +#define ULPI_OTG_CTRL_EXTVBUSIND BIT(7) + +/* USB Interrupt Enable Rising, + * USB Interrupt Enable Falling, + * USB Interrupt Status and + * USB Interrupt Latch + */ +#define ULPI_INT_HOST_DISCONNECT BIT(0) +#define ULPI_INT_VBUS_VALID BIT(1) +#define ULPI_INT_SESS_VALID BIT(2) +#define ULPI_INT_SESS_END BIT(3) +#define ULPI_INT_IDGRD BIT(4) + +/* Debug */ +#define ULPI_DEBUG_LINESTATE0 BIT(0) +#define ULPI_DEBUG_LINESTATE1 BIT(1) + +/* Carkit Control */ +#define ULPI_CARKIT_CTRL_CARKITPWR BIT(0) +#define ULPI_CARKIT_CTRL_IDGNDDRV BIT(1) +#define ULPI_CARKIT_CTRL_TXDEN BIT(2) +#define ULPI_CARKIT_CTRL_RXDEN BIT(3) +#define ULPI_CARKIT_CTRL_SPKLEFTEN BIT(4) +#define ULPI_CARKIT_CTRL_SPKRIGHTEN BIT(5) +#define ULPI_CARKIT_CTRL_MICEN BIT(6) + +/* Carkit Interrupt Enable */ +#define ULPI_CARKIT_INT_EN_IDFLOAT_RISE BIT(0) +#define ULPI_CARKIT_INT_EN_IDFLOAT_FALL BIT(1) +#define ULPI_CARKIT_INT_EN_CARINTDET BIT(2) +#define 
ULPI_CARKIT_INT_EN_DP_RISE BIT(3) +#define ULPI_CARKIT_INT_EN_DP_FALL BIT(4) + +/* Carkit Interrupt Status and + * Carkit Interrupt Latch + */ +#define ULPI_CARKIT_INT_IDFLOAT BIT(0) +#define ULPI_CARKIT_INT_CARINTDET BIT(1) +#define ULPI_CARKIT_INT_DP BIT(2) + +/* Carkit Pulse Control*/ +#define ULPI_CARKIT_PLS_CTRL_TXPLSEN BIT(0) +#define ULPI_CARKIT_PLS_CTRL_RXPLSEN BIT(1) +#define ULPI_CARKIT_PLS_CTRL_SPKRLEFT_BIASEN BIT(2) +#define ULPI_CARKIT_PLS_CTRL_SPKRRIGHT_BIASEN BIT(3) + +#endif /* __LINUX_ULPI_REGS_H */ diff --git a/include/linux/usb/ulpi.h b/include/linux/usb/ulpi.h index 5c295c26ad37..5f07407a367a 100644 --- a/include/linux/usb/ulpi.h +++ b/include/linux/usb/ulpi.h @@ -12,6 +12,8 @@ #define __LINUX_USB_ULPI_H #include +#include + /*-------------------------------------------------------------------------*/ /* @@ -49,138 +51,6 @@ /*-------------------------------------------------------------------------*/ -/* - * Macros for Set and Clear - * See ULPI 1.1 specification to find the registers with Set and Clear offsets - */ -#define ULPI_SET(a) (a + 1) -#define ULPI_CLR(a) (a + 2) - -/*-------------------------------------------------------------------------*/ - -/* - * Register Map - */ -#define ULPI_VENDOR_ID_LOW 0x00 -#define ULPI_VENDOR_ID_HIGH 0x01 -#define ULPI_PRODUCT_ID_LOW 0x02 -#define ULPI_PRODUCT_ID_HIGH 0x03 -#define ULPI_FUNC_CTRL 0x04 -#define ULPI_IFC_CTRL 0x07 -#define ULPI_OTG_CTRL 0x0a -#define ULPI_USB_INT_EN_RISE 0x0d -#define ULPI_USB_INT_EN_FALL 0x10 -#define ULPI_USB_INT_STS 0x13 -#define ULPI_USB_INT_LATCH 0x14 -#define ULPI_DEBUG 0x15 -#define ULPI_SCRATCH 0x16 -/* Optional Carkit Registers */ -#define ULPI_CARCIT_CTRL 0x19 -#define ULPI_CARCIT_INT_DELAY 0x1c -#define ULPI_CARCIT_INT_EN 0x1d -#define ULPI_CARCIT_INT_STS 0x20 -#define ULPI_CARCIT_INT_LATCH 0x21 -#define ULPI_CARCIT_PLS_CTRL 0x22 -/* Other Optional Registers */ -#define ULPI_TX_POS_WIDTH 0x25 -#define ULPI_TX_NEG_WIDTH 0x26 -#define ULPI_POLARITY_RECOVERY 0x27 
-/* Access Extended Register Set */ -#define ULPI_ACCESS_EXTENDED 0x2f -/* Vendor Specific */ -#define ULPI_VENDOR_SPECIFIC 0x30 -/* Extended Registers */ -#define ULPI_EXT_VENDOR_SPECIFIC 0x80 - -/*-------------------------------------------------------------------------*/ - -/* - * Register Bits - */ - -/* Function Control */ -#define ULPI_FUNC_CTRL_XCVRSEL (1 << 0) -#define ULPI_FUNC_CTRL_XCVRSEL_MASK (3 << 0) -#define ULPI_FUNC_CTRL_HIGH_SPEED (0 << 0) -#define ULPI_FUNC_CTRL_FULL_SPEED (1 << 0) -#define ULPI_FUNC_CTRL_LOW_SPEED (2 << 0) -#define ULPI_FUNC_CTRL_FS4LS (3 << 0) -#define ULPI_FUNC_CTRL_TERMSELECT (1 << 2) -#define ULPI_FUNC_CTRL_OPMODE (1 << 3) -#define ULPI_FUNC_CTRL_OPMODE_MASK (3 << 3) -#define ULPI_FUNC_CTRL_OPMODE_NORMAL (0 << 3) -#define ULPI_FUNC_CTRL_OPMODE_NONDRIVING (1 << 3) -#define ULPI_FUNC_CTRL_OPMODE_DISABLE_NRZI (2 << 3) -#define ULPI_FUNC_CTRL_OPMODE_NOSYNC_NOEOP (3 << 3) -#define ULPI_FUNC_CTRL_RESET (1 << 5) -#define ULPI_FUNC_CTRL_SUSPENDM (1 << 6) - -/* Interface Control */ -#define ULPI_IFC_CTRL_6_PIN_SERIAL_MODE (1 << 0) -#define ULPI_IFC_CTRL_3_PIN_SERIAL_MODE (1 << 1) -#define ULPI_IFC_CTRL_CARKITMODE (1 << 2) -#define ULPI_IFC_CTRL_CLOCKSUSPENDM (1 << 3) -#define ULPI_IFC_CTRL_AUTORESUME (1 << 4) -#define ULPI_IFC_CTRL_EXTERNAL_VBUS (1 << 5) -#define ULPI_IFC_CTRL_PASSTHRU (1 << 6) -#define ULPI_IFC_CTRL_PROTECT_IFC_DISABLE (1 << 7) - -/* OTG Control */ -#define ULPI_OTG_CTRL_ID_PULLUP (1 << 0) -#define ULPI_OTG_CTRL_DP_PULLDOWN (1 << 1) -#define ULPI_OTG_CTRL_DM_PULLDOWN (1 << 2) -#define ULPI_OTG_CTRL_DISCHRGVBUS (1 << 3) -#define ULPI_OTG_CTRL_CHRGVBUS (1 << 4) -#define ULPI_OTG_CTRL_DRVVBUS (1 << 5) -#define ULPI_OTG_CTRL_DRVVBUS_EXT (1 << 6) -#define ULPI_OTG_CTRL_EXTVBUSIND (1 << 7) - -/* USB Interrupt Enable Rising, - * USB Interrupt Enable Falling, - * USB Interrupt Status and - * USB Interrupt Latch - */ -#define ULPI_INT_HOST_DISCONNECT (1 << 0) -#define ULPI_INT_VBUS_VALID (1 << 1) -#define ULPI_INT_SESS_VALID 
(1 << 2) -#define ULPI_INT_SESS_END (1 << 3) -#define ULPI_INT_IDGRD (1 << 4) - -/* Debug */ -#define ULPI_DEBUG_LINESTATE0 (1 << 0) -#define ULPI_DEBUG_LINESTATE1 (1 << 1) - -/* Carkit Control */ -#define ULPI_CARKIT_CTRL_CARKITPWR (1 << 0) -#define ULPI_CARKIT_CTRL_IDGNDDRV (1 << 1) -#define ULPI_CARKIT_CTRL_TXDEN (1 << 2) -#define ULPI_CARKIT_CTRL_RXDEN (1 << 3) -#define ULPI_CARKIT_CTRL_SPKLEFTEN (1 << 4) -#define ULPI_CARKIT_CTRL_SPKRIGHTEN (1 << 5) -#define ULPI_CARKIT_CTRL_MICEN (1 << 6) - -/* Carkit Interrupt Enable */ -#define ULPI_CARKIT_INT_EN_IDFLOAT_RISE (1 << 0) -#define ULPI_CARKIT_INT_EN_IDFLOAT_FALL (1 << 1) -#define ULPI_CARKIT_INT_EN_CARINTDET (1 << 2) -#define ULPI_CARKIT_INT_EN_DP_RISE (1 << 3) -#define ULPI_CARKIT_INT_EN_DP_FALL (1 << 4) - -/* Carkit Interrupt Status and - * Carkit Interrupt Latch - */ -#define ULPI_CARKIT_INT_IDFLOAT (1 << 0) -#define ULPI_CARKIT_INT_CARINTDET (1 << 1) -#define ULPI_CARKIT_INT_DP (1 << 2) - -/* Carkit Pulse Control*/ -#define ULPI_CARKIT_PLS_CTRL_TXPLSEN (1 << 0) -#define ULPI_CARKIT_PLS_CTRL_RXPLSEN (1 << 1) -#define ULPI_CARKIT_PLS_CTRL_SPKRLEFT_BIASEN (1 << 2) -#define ULPI_CARKIT_PLS_CTRL_SPKRRIGHT_BIASEN (1 << 3) - -/*-------------------------------------------------------------------------*/ - #if IS_ENABLED(CONFIG_USB_ULPI) struct usb_phy *otg_ulpi_create(struct usb_phy_io_ops *ops, unsigned int flags); -- cgit v1.2.3 From f267caab44451baa70d25fa3191a68bb79ad1b08 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 13 May 2015 14:03:51 -0400 Subject: tracing: Remove unused prototype ftrace_event_define_field() ftrace_event_define_field() has a prototype defined but never used. Remove it. 
Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 65ce6de91307..f8465d65f3c7 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -219,10 +219,6 @@ struct ftrace_event_class { extern int ftrace_event_reg(struct ftrace_event_call *event, enum trace_reg type, void *data); -int ftrace_event_define_field(struct ftrace_event_call *call, - char *type, int len, char *item, int offset, - int field_size, int sign, int filter); - struct ftrace_event_buffer { struct ring_buffer *buffer; struct ring_buffer_event *event; @@ -238,10 +234,6 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer); -int ftrace_event_define_field(struct ftrace_event_call *call, - char *type, int len, char *item, int offset, - int field_size, int sign, int filter); - enum { TRACE_EVENT_FL_FILTERED_BIT, TRACE_EVENT_FL_CAP_ANY_BIT, -- cgit v1.2.3 From af658dca221207174fc0a7bcdcd4cff7c589fdd8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 29 Apr 2015 14:36:05 -0400 Subject: tracing: Rename ftrace_event.h to trace_events.h The term "ftrace" is really the infrastructure of the function hooks, and not the trace events. Rename ftrace_event.h to trace_events.h to represent the trace_event infrastructure and decouple the term ftrace from it. 
Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 616 ------------------------------------------- include/linux/trace_events.h | 616 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 616 insertions(+), 616 deletions(-) delete mode 100644 include/linux/ftrace_event.h create mode 100644 include/linux/trace_events.h (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h deleted file mode 100644 index f8465d65f3c7..000000000000 --- a/include/linux/ftrace_event.h +++ /dev/null @@ -1,616 +0,0 @@ - -#ifndef _LINUX_FTRACE_EVENT_H -#define _LINUX_FTRACE_EVENT_H - -#include -#include -#include -#include -#include -#include - -struct trace_array; -struct trace_buffer; -struct tracer; -struct dentry; -struct bpf_prog; - -struct trace_print_flags { - unsigned long mask; - const char *name; -}; - -struct trace_print_flags_u64 { - unsigned long long mask; - const char *name; -}; - -const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim, - unsigned long flags, - const struct trace_print_flags *flag_array); - -const char *ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, - const struct trace_print_flags *symbol_array); - -#if BITS_PER_LONG == 32 -const char *ftrace_print_symbols_seq_u64(struct trace_seq *p, - unsigned long long val, - const struct trace_print_flags_u64 - *symbol_array); -#endif - -const char *ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, - unsigned int bitmask_size); - -const char *ftrace_print_hex_seq(struct trace_seq *p, - const unsigned char *buf, int len); - -const char *ftrace_print_array_seq(struct trace_seq *p, - const void *buf, int count, - size_t el_size); - -struct trace_iterator; -struct trace_event; - -int ftrace_raw_output_prep(struct trace_iterator *iter, - struct trace_event *event); - -/* - * The trace entry - the most basic unit of tracing. 
This is what - * is printed in the end as a single line in the trace output, such as: - * - * bash-15816 [01] 235.197585: idle_cpu <- irq_enter - */ -struct trace_entry { - unsigned short type; - unsigned char flags; - unsigned char preempt_count; - int pid; -}; - -#define FTRACE_MAX_EVENT \ - ((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1) - -/* - * Trace iterator - used by printout routines who present trace - * results to users and which routines might sleep, etc: - */ -struct trace_iterator { - struct trace_array *tr; - struct tracer *trace; - struct trace_buffer *trace_buffer; - void *private; - int cpu_file; - struct mutex mutex; - struct ring_buffer_iter **buffer_iter; - unsigned long iter_flags; - - /* trace_seq for __print_flags() and __print_symbolic() etc. */ - struct trace_seq tmp_seq; - - cpumask_var_t started; - - /* it's true when current open file is snapshot */ - bool snapshot; - - /* The below is zeroed out in pipe_read */ - struct trace_seq seq; - struct trace_entry *ent; - unsigned long lost_events; - int leftover; - int ent_size; - int cpu; - u64 ts; - - loff_t pos; - long idx; - - /* All new field here will be zeroed out in pipe_read */ -}; - -enum trace_iter_flags { - TRACE_FILE_LAT_FMT = 1, - TRACE_FILE_ANNOTATE = 2, - TRACE_FILE_TIME_IN_NS = 4, -}; - - -typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, - int flags, struct trace_event *event); - -struct trace_event_functions { - trace_print_func trace; - trace_print_func raw; - trace_print_func hex; - trace_print_func binary; -}; - -struct trace_event { - struct hlist_node node; - struct list_head list; - int type; - struct trace_event_functions *funcs; -}; - -extern int register_ftrace_event(struct trace_event *event); -extern int unregister_ftrace_event(struct trace_event *event); - -/* Return values for print_line callback */ -enum print_line_t { - TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ - TRACE_TYPE_HANDLED = 1, - 
TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */ - TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ -}; - -/* - * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq - * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function - * simplifies those functions and keeps them in sync. - */ -static inline enum print_line_t trace_handle_return(struct trace_seq *s) -{ - return trace_seq_has_overflowed(s) ? - TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; -} - -void tracing_generic_entry_update(struct trace_entry *entry, - unsigned long flags, - int pc); -struct ftrace_event_file; - -struct ring_buffer_event * -trace_event_buffer_lock_reserve(struct ring_buffer **current_buffer, - struct ftrace_event_file *ftrace_file, - int type, unsigned long len, - unsigned long flags, int pc); -struct ring_buffer_event * -trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer, - int type, unsigned long len, - unsigned long flags, int pc); -void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc); -void trace_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc); -void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc, - struct pt_regs *regs); -void trace_current_buffer_discard_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event); - -void tracing_record_cmdline(struct task_struct *tsk); - -int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...); - -struct event_filter; - -enum trace_reg { - TRACE_REG_REGISTER, - TRACE_REG_UNREGISTER, -#ifdef CONFIG_PERF_EVENTS - TRACE_REG_PERF_REGISTER, - TRACE_REG_PERF_UNREGISTER, - TRACE_REG_PERF_OPEN, - TRACE_REG_PERF_CLOSE, - TRACE_REG_PERF_ADD, - TRACE_REG_PERF_DEL, -#endif -}; - -struct ftrace_event_call; - -struct 
ftrace_event_class { - const char *system; - void *probe; -#ifdef CONFIG_PERF_EVENTS - void *perf_probe; -#endif - int (*reg)(struct ftrace_event_call *event, - enum trace_reg type, void *data); - int (*define_fields)(struct ftrace_event_call *); - struct list_head *(*get_fields)(struct ftrace_event_call *); - struct list_head fields; - int (*raw_init)(struct ftrace_event_call *); -}; - -extern int ftrace_event_reg(struct ftrace_event_call *event, - enum trace_reg type, void *data); - -struct ftrace_event_buffer { - struct ring_buffer *buffer; - struct ring_buffer_event *event; - struct ftrace_event_file *ftrace_file; - void *entry; - unsigned long flags; - int pc; -}; - -void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, - struct ftrace_event_file *ftrace_file, - unsigned long len); - -void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer); - -enum { - TRACE_EVENT_FL_FILTERED_BIT, - TRACE_EVENT_FL_CAP_ANY_BIT, - TRACE_EVENT_FL_NO_SET_FILTER_BIT, - TRACE_EVENT_FL_IGNORE_ENABLE_BIT, - TRACE_EVENT_FL_WAS_ENABLED_BIT, - TRACE_EVENT_FL_USE_CALL_FILTER_BIT, - TRACE_EVENT_FL_TRACEPOINT_BIT, - TRACE_EVENT_FL_KPROBE_BIT, -}; - -/* - * Event flags: - * FILTERED - The event has a filter attached - * CAP_ANY - Any user can enable for perf - * NO_SET_FILTER - Set when filter has error and is to be ignored - * IGNORE_ENABLE - For ftrace internal events, do not enable with debugfs file - * WAS_ENABLED - Set and stays set when an event was ever enabled - * (used for module unloading, if a module event is enabled, - * it is best to clear the buffers that used it). 
- * USE_CALL_FILTER - For ftrace internal events, don't use file filter - * TRACEPOINT - Event is a tracepoint - * KPROBE - Event is a kprobe - */ -enum { - TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), - TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), - TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), - TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), - TRACE_EVENT_FL_WAS_ENABLED = (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT), - TRACE_EVENT_FL_USE_CALL_FILTER = (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT), - TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT), - TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), -}; - -struct ftrace_event_call { - struct list_head list; - struct ftrace_event_class *class; - union { - char *name; - /* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */ - struct tracepoint *tp; - }; - struct trace_event event; - char *print_fmt; - struct event_filter *filter; - void *mod; - void *data; - /* - * bit 0: filter_active - * bit 1: allow trace by non root (cap any) - * bit 2: failed to apply filter - * bit 3: ftrace internal event (do not enable) - * bit 4: Event was enabled by module - * bit 5: use call filter rather than file filter - * bit 6: Event is a tracepoint - */ - int flags; /* static flags of different events */ - -#ifdef CONFIG_PERF_EVENTS - int perf_refcount; - struct hlist_head __percpu *perf_events; - struct bpf_prog *prog; - - int (*perf_perm)(struct ftrace_event_call *, - struct perf_event *); -#endif -}; - -static inline const char * -ftrace_event_name(struct ftrace_event_call *call) -{ - if (call->flags & TRACE_EVENT_FL_TRACEPOINT) - return call->tp ? 
call->tp->name : NULL; - else - return call->name; -} - -struct trace_array; -struct ftrace_subsystem_dir; - -enum { - FTRACE_EVENT_FL_ENABLED_BIT, - FTRACE_EVENT_FL_RECORDED_CMD_BIT, - FTRACE_EVENT_FL_FILTERED_BIT, - FTRACE_EVENT_FL_NO_SET_FILTER_BIT, - FTRACE_EVENT_FL_SOFT_MODE_BIT, - FTRACE_EVENT_FL_SOFT_DISABLED_BIT, - FTRACE_EVENT_FL_TRIGGER_MODE_BIT, - FTRACE_EVENT_FL_TRIGGER_COND_BIT, -}; - -/* - * Ftrace event file flags: - * ENABLED - The event is enabled - * RECORDED_CMD - The comms should be recorded at sched_switch - * FILTERED - The event has a filter attached - * NO_SET_FILTER - Set when filter has error and is to be ignored - * SOFT_MODE - The event is enabled/disabled by SOFT_DISABLED - * SOFT_DISABLED - When set, do not trace the event (even though its - * tracepoint may be enabled) - * TRIGGER_MODE - When set, invoke the triggers associated with the event - * TRIGGER_COND - When set, one or more triggers has an associated filter - */ -enum { - FTRACE_EVENT_FL_ENABLED = (1 << FTRACE_EVENT_FL_ENABLED_BIT), - FTRACE_EVENT_FL_RECORDED_CMD = (1 << FTRACE_EVENT_FL_RECORDED_CMD_BIT), - FTRACE_EVENT_FL_FILTERED = (1 << FTRACE_EVENT_FL_FILTERED_BIT), - FTRACE_EVENT_FL_NO_SET_FILTER = (1 << FTRACE_EVENT_FL_NO_SET_FILTER_BIT), - FTRACE_EVENT_FL_SOFT_MODE = (1 << FTRACE_EVENT_FL_SOFT_MODE_BIT), - FTRACE_EVENT_FL_SOFT_DISABLED = (1 << FTRACE_EVENT_FL_SOFT_DISABLED_BIT), - FTRACE_EVENT_FL_TRIGGER_MODE = (1 << FTRACE_EVENT_FL_TRIGGER_MODE_BIT), - FTRACE_EVENT_FL_TRIGGER_COND = (1 << FTRACE_EVENT_FL_TRIGGER_COND_BIT), -}; - -struct ftrace_event_file { - struct list_head list; - struct ftrace_event_call *event_call; - struct event_filter *filter; - struct dentry *dir; - struct trace_array *tr; - struct ftrace_subsystem_dir *system; - struct list_head triggers; - - /* - * 32 bit flags: - * bit 0: enabled - * bit 1: enabled cmd record - * bit 2: enable/disable with the soft disable bit - * bit 3: soft disabled - * bit 4: trigger enabled - * - * Note: The bits must 
be set atomically to prevent races - * from other writers. Reads of flags do not need to be in - * sync as they occur in critical sections. But the way flags - * is currently used, these changes do not affect the code - * except that when a change is made, it may have a slight - * delay in propagating the changes to other CPUs due to - * caching and such. Which is mostly OK ;-) - */ - unsigned long flags; - atomic_t sm_ref; /* soft-mode reference counter */ - atomic_t tm_ref; /* trigger-mode reference counter */ -}; - -#define __TRACE_EVENT_FLAGS(name, value) \ - static int __init trace_init_flags_##name(void) \ - { \ - event_##name.flags |= value; \ - return 0; \ - } \ - early_initcall(trace_init_flags_##name); - -#define __TRACE_EVENT_PERF_PERM(name, expr...) \ - static int perf_perm_##name(struct ftrace_event_call *tp_event, \ - struct perf_event *p_event) \ - { \ - return ({ expr; }); \ - } \ - static int __init trace_init_perf_perm_##name(void) \ - { \ - event_##name.perf_perm = &perf_perm_##name; \ - return 0; \ - } \ - early_initcall(trace_init_perf_perm_##name); - -#define PERF_MAX_TRACE_SIZE 2048 - -#define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ - -enum event_trigger_type { - ETT_NONE = (0), - ETT_TRACE_ONOFF = (1 << 0), - ETT_SNAPSHOT = (1 << 1), - ETT_STACKTRACE = (1 << 2), - ETT_EVENT_ENABLE = (1 << 3), -}; - -extern int filter_match_preds(struct event_filter *filter, void *rec); - -extern int filter_check_discard(struct ftrace_event_file *file, void *rec, - struct ring_buffer *buffer, - struct ring_buffer_event *event); -extern int call_filter_check_discard(struct ftrace_event_call *call, void *rec, - struct ring_buffer *buffer, - struct ring_buffer_event *event); -extern enum event_trigger_type event_triggers_call(struct ftrace_event_file *file, - void *rec); -extern void event_triggers_post_call(struct ftrace_event_file *file, - enum event_trigger_type tt); - -/** - * ftrace_trigger_soft_disabled - do triggers and test if soft 
disabled - * @file: The file pointer of the event to test - * - * If any triggers without filters are attached to this event, they - * will be called here. If the event is soft disabled and has no - * triggers that require testing the fields, it will return true, - * otherwise false. - */ -static inline bool -ftrace_trigger_soft_disabled(struct ftrace_event_file *file) -{ - unsigned long eflags = file->flags; - - if (!(eflags & FTRACE_EVENT_FL_TRIGGER_COND)) { - if (eflags & FTRACE_EVENT_FL_TRIGGER_MODE) - event_triggers_call(file, NULL); - if (eflags & FTRACE_EVENT_FL_SOFT_DISABLED) - return true; - } - return false; -} - -/* - * Helper function for event_trigger_unlock_commit{_regs}(). - * If there are event triggers attached to this event that requires - * filtering against its fields, then they wil be called as the - * entry already holds the field information of the current event. - * - * It also checks if the event should be discarded or not. - * It is to be discarded if the event is soft disabled and the - * event was only recorded to process triggers, or if the event - * filter is active and this event did not match the filters. - * - * Returns true if the event is discarded, false otherwise. 
- */ -static inline bool -__event_trigger_test_discard(struct ftrace_event_file *file, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - void *entry, - enum event_trigger_type *tt) -{ - unsigned long eflags = file->flags; - - if (eflags & FTRACE_EVENT_FL_TRIGGER_COND) - *tt = event_triggers_call(file, entry); - - if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags)) - ring_buffer_discard_commit(buffer, event); - else if (!filter_check_discard(file, entry, buffer, event)) - return false; - - return true; -} - -/** - * event_trigger_unlock_commit - handle triggers and finish event commit - * @file: The file pointer assoctiated to the event - * @buffer: The ring buffer that the event is being written to - * @event: The event meta data in the ring buffer - * @entry: The event itself - * @irq_flags: The state of the interrupts at the start of the event - * @pc: The state of the preempt count at the start of the event. - * - * This is a helper function to handle triggers that require data - * from the event itself. It also tests the event against filters and - * if the event is soft disabled and should be discarded. 
- */ -static inline void -event_trigger_unlock_commit(struct ftrace_event_file *file, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - void *entry, unsigned long irq_flags, int pc) -{ - enum event_trigger_type tt = ETT_NONE; - - if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) - trace_buffer_unlock_commit(buffer, event, irq_flags, pc); - - if (tt) - event_triggers_post_call(file, tt); -} - -/** - * event_trigger_unlock_commit_regs - handle triggers and finish event commit - * @file: The file pointer assoctiated to the event - * @buffer: The ring buffer that the event is being written to - * @event: The event meta data in the ring buffer - * @entry: The event itself - * @irq_flags: The state of the interrupts at the start of the event - * @pc: The state of the preempt count at the start of the event. - * - * This is a helper function to handle triggers that require data - * from the event itself. It also tests the event against filters and - * if the event is soft disabled and should be discarded. - * - * Same as event_trigger_unlock_commit() but calls - * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit(). 
- */ -static inline void -event_trigger_unlock_commit_regs(struct ftrace_event_file *file, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - void *entry, unsigned long irq_flags, int pc, - struct pt_regs *regs) -{ - enum event_trigger_type tt = ETT_NONE; - - if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) - trace_buffer_unlock_commit_regs(buffer, event, - irq_flags, pc, regs); - - if (tt) - event_triggers_post_call(file, tt); -} - -#ifdef CONFIG_BPF_SYSCALL -unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx); -#else -static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) -{ - return 1; -} -#endif - -enum { - FILTER_OTHER = 0, - FILTER_STATIC_STRING, - FILTER_DYN_STRING, - FILTER_PTR_STRING, - FILTER_TRACE_FN, -}; - -extern int trace_event_raw_init(struct ftrace_event_call *call); -extern int trace_define_field(struct ftrace_event_call *call, const char *type, - const char *name, int offset, int size, - int is_signed, int filter_type); -extern int trace_add_event_call(struct ftrace_event_call *call); -extern int trace_remove_event_call(struct ftrace_event_call *call); - -#define is_signed_type(type) (((type)(-1)) < (type)1) - -int trace_set_clr_event(const char *system, const char *event, int set); - -/* - * The double __builtin_constant_p is because gcc will give us an error - * if we try to allocate the static variable to fmt if it is not a - * constant. Even with the outer if statement optimizing out. - */ -#define event_trace_printk(ip, fmt, args...) \ -do { \ - __trace_printk_check_format(fmt, ##args); \ - tracing_record_cmdline(current); \ - if (__builtin_constant_p(fmt)) { \ - static const char *trace_printk_fmt \ - __attribute__((section("__trace_printk_fmt"))) = \ - __builtin_constant_p(fmt) ? 
fmt : NULL; \ - \ - __trace_bprintk(ip, trace_printk_fmt, ##args); \ - } else \ - __trace_printk(ip, fmt, ##args); \ -} while (0) - -#ifdef CONFIG_PERF_EVENTS -struct perf_event; - -DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); - -extern int perf_trace_init(struct perf_event *event); -extern void perf_trace_destroy(struct perf_event *event); -extern int perf_trace_add(struct perf_event *event, int flags); -extern void perf_trace_del(struct perf_event *event, int flags); -extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, - char *filter_str); -extern void ftrace_profile_free_filter(struct perf_event *event); -extern void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs **regs, int *rctxp); - -static inline void -perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr, - u64 count, struct pt_regs *regs, void *head, - struct task_struct *task) -{ - perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task); -} -#endif - -#endif /* _LINUX_FTRACE_EVENT_H */ diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h new file mode 100644 index 000000000000..f8465d65f3c7 --- /dev/null +++ b/include/linux/trace_events.h @@ -0,0 +1,616 @@ + +#ifndef _LINUX_FTRACE_EVENT_H +#define _LINUX_FTRACE_EVENT_H + +#include +#include +#include +#include +#include +#include + +struct trace_array; +struct trace_buffer; +struct tracer; +struct dentry; +struct bpf_prog; + +struct trace_print_flags { + unsigned long mask; + const char *name; +}; + +struct trace_print_flags_u64 { + unsigned long long mask; + const char *name; +}; + +const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim, + unsigned long flags, + const struct trace_print_flags *flag_array); + +const char *ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, + const struct trace_print_flags *symbol_array); + +#if BITS_PER_LONG == 32 +const char *ftrace_print_symbols_seq_u64(struct trace_seq *p, + unsigned long 
long val, + const struct trace_print_flags_u64 + *symbol_array); +#endif + +const char *ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, + unsigned int bitmask_size); + +const char *ftrace_print_hex_seq(struct trace_seq *p, + const unsigned char *buf, int len); + +const char *ftrace_print_array_seq(struct trace_seq *p, + const void *buf, int count, + size_t el_size); + +struct trace_iterator; +struct trace_event; + +int ftrace_raw_output_prep(struct trace_iterator *iter, + struct trace_event *event); + +/* + * The trace entry - the most basic unit of tracing. This is what + * is printed in the end as a single line in the trace output, such as: + * + * bash-15816 [01] 235.197585: idle_cpu <- irq_enter + */ +struct trace_entry { + unsigned short type; + unsigned char flags; + unsigned char preempt_count; + int pid; +}; + +#define FTRACE_MAX_EVENT \ + ((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1) + +/* + * Trace iterator - used by printout routines who present trace + * results to users and which routines might sleep, etc: + */ +struct trace_iterator { + struct trace_array *tr; + struct tracer *trace; + struct trace_buffer *trace_buffer; + void *private; + int cpu_file; + struct mutex mutex; + struct ring_buffer_iter **buffer_iter; + unsigned long iter_flags; + + /* trace_seq for __print_flags() and __print_symbolic() etc. 
*/ + struct trace_seq tmp_seq; + + cpumask_var_t started; + + /* it's true when current open file is snapshot */ + bool snapshot; + + /* The below is zeroed out in pipe_read */ + struct trace_seq seq; + struct trace_entry *ent; + unsigned long lost_events; + int leftover; + int ent_size; + int cpu; + u64 ts; + + loff_t pos; + long idx; + + /* All new field here will be zeroed out in pipe_read */ +}; + +enum trace_iter_flags { + TRACE_FILE_LAT_FMT = 1, + TRACE_FILE_ANNOTATE = 2, + TRACE_FILE_TIME_IN_NS = 4, +}; + + +typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, + int flags, struct trace_event *event); + +struct trace_event_functions { + trace_print_func trace; + trace_print_func raw; + trace_print_func hex; + trace_print_func binary; +}; + +struct trace_event { + struct hlist_node node; + struct list_head list; + int type; + struct trace_event_functions *funcs; +}; + +extern int register_ftrace_event(struct trace_event *event); +extern int unregister_ftrace_event(struct trace_event *event); + +/* Return values for print_line callback */ +enum print_line_t { + TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ + TRACE_TYPE_HANDLED = 1, + TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */ + TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ +}; + +/* + * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq + * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function + * simplifies those functions and keeps them in sync. + */ +static inline enum print_line_t trace_handle_return(struct trace_seq *s) +{ + return trace_seq_has_overflowed(s) ? 
+ TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; +} + +void tracing_generic_entry_update(struct trace_entry *entry, + unsigned long flags, + int pc); +struct ftrace_event_file; + +struct ring_buffer_event * +trace_event_buffer_lock_reserve(struct ring_buffer **current_buffer, + struct ftrace_event_file *ftrace_file, + int type, unsigned long len, + unsigned long flags, int pc); +struct ring_buffer_event * +trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer, + int type, unsigned long len, + unsigned long flags, int pc); +void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc); +void trace_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc); +void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc, + struct pt_regs *regs); +void trace_current_buffer_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event); + +void tracing_record_cmdline(struct task_struct *tsk); + +int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...); + +struct event_filter; + +enum trace_reg { + TRACE_REG_REGISTER, + TRACE_REG_UNREGISTER, +#ifdef CONFIG_PERF_EVENTS + TRACE_REG_PERF_REGISTER, + TRACE_REG_PERF_UNREGISTER, + TRACE_REG_PERF_OPEN, + TRACE_REG_PERF_CLOSE, + TRACE_REG_PERF_ADD, + TRACE_REG_PERF_DEL, +#endif +}; + +struct ftrace_event_call; + +struct ftrace_event_class { + const char *system; + void *probe; +#ifdef CONFIG_PERF_EVENTS + void *perf_probe; +#endif + int (*reg)(struct ftrace_event_call *event, + enum trace_reg type, void *data); + int (*define_fields)(struct ftrace_event_call *); + struct list_head *(*get_fields)(struct ftrace_event_call *); + struct list_head fields; + int (*raw_init)(struct ftrace_event_call *); +}; + +extern int ftrace_event_reg(struct ftrace_event_call *event, + enum trace_reg 
type, void *data); + +struct ftrace_event_buffer { + struct ring_buffer *buffer; + struct ring_buffer_event *event; + struct ftrace_event_file *ftrace_file; + void *entry; + unsigned long flags; + int pc; +}; + +void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, + struct ftrace_event_file *ftrace_file, + unsigned long len); + +void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer); + +enum { + TRACE_EVENT_FL_FILTERED_BIT, + TRACE_EVENT_FL_CAP_ANY_BIT, + TRACE_EVENT_FL_NO_SET_FILTER_BIT, + TRACE_EVENT_FL_IGNORE_ENABLE_BIT, + TRACE_EVENT_FL_WAS_ENABLED_BIT, + TRACE_EVENT_FL_USE_CALL_FILTER_BIT, + TRACE_EVENT_FL_TRACEPOINT_BIT, + TRACE_EVENT_FL_KPROBE_BIT, +}; + +/* + * Event flags: + * FILTERED - The event has a filter attached + * CAP_ANY - Any user can enable for perf + * NO_SET_FILTER - Set when filter has error and is to be ignored + * IGNORE_ENABLE - For ftrace internal events, do not enable with debugfs file + * WAS_ENABLED - Set and stays set when an event was ever enabled + * (used for module unloading, if a module event is enabled, + * it is best to clear the buffers that used it). 
+ * USE_CALL_FILTER - For ftrace internal events, don't use file filter + * TRACEPOINT - Event is a tracepoint + * KPROBE - Event is a kprobe + */ +enum { + TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), + TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), + TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), + TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), + TRACE_EVENT_FL_WAS_ENABLED = (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT), + TRACE_EVENT_FL_USE_CALL_FILTER = (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT), + TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT), + TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), +}; + +struct ftrace_event_call { + struct list_head list; + struct ftrace_event_class *class; + union { + char *name; + /* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */ + struct tracepoint *tp; + }; + struct trace_event event; + char *print_fmt; + struct event_filter *filter; + void *mod; + void *data; + /* + * bit 0: filter_active + * bit 1: allow trace by non root (cap any) + * bit 2: failed to apply filter + * bit 3: ftrace internal event (do not enable) + * bit 4: Event was enabled by module + * bit 5: use call filter rather than file filter + * bit 6: Event is a tracepoint + */ + int flags; /* static flags of different events */ + +#ifdef CONFIG_PERF_EVENTS + int perf_refcount; + struct hlist_head __percpu *perf_events; + struct bpf_prog *prog; + + int (*perf_perm)(struct ftrace_event_call *, + struct perf_event *); +#endif +}; + +static inline const char * +ftrace_event_name(struct ftrace_event_call *call) +{ + if (call->flags & TRACE_EVENT_FL_TRACEPOINT) + return call->tp ? 
call->tp->name : NULL; + else + return call->name; +} + +struct trace_array; +struct ftrace_subsystem_dir; + +enum { + FTRACE_EVENT_FL_ENABLED_BIT, + FTRACE_EVENT_FL_RECORDED_CMD_BIT, + FTRACE_EVENT_FL_FILTERED_BIT, + FTRACE_EVENT_FL_NO_SET_FILTER_BIT, + FTRACE_EVENT_FL_SOFT_MODE_BIT, + FTRACE_EVENT_FL_SOFT_DISABLED_BIT, + FTRACE_EVENT_FL_TRIGGER_MODE_BIT, + FTRACE_EVENT_FL_TRIGGER_COND_BIT, +}; + +/* + * Ftrace event file flags: + * ENABLED - The event is enabled + * RECORDED_CMD - The comms should be recorded at sched_switch + * FILTERED - The event has a filter attached + * NO_SET_FILTER - Set when filter has error and is to be ignored + * SOFT_MODE - The event is enabled/disabled by SOFT_DISABLED + * SOFT_DISABLED - When set, do not trace the event (even though its + * tracepoint may be enabled) + * TRIGGER_MODE - When set, invoke the triggers associated with the event + * TRIGGER_COND - When set, one or more triggers has an associated filter + */ +enum { + FTRACE_EVENT_FL_ENABLED = (1 << FTRACE_EVENT_FL_ENABLED_BIT), + FTRACE_EVENT_FL_RECORDED_CMD = (1 << FTRACE_EVENT_FL_RECORDED_CMD_BIT), + FTRACE_EVENT_FL_FILTERED = (1 << FTRACE_EVENT_FL_FILTERED_BIT), + FTRACE_EVENT_FL_NO_SET_FILTER = (1 << FTRACE_EVENT_FL_NO_SET_FILTER_BIT), + FTRACE_EVENT_FL_SOFT_MODE = (1 << FTRACE_EVENT_FL_SOFT_MODE_BIT), + FTRACE_EVENT_FL_SOFT_DISABLED = (1 << FTRACE_EVENT_FL_SOFT_DISABLED_BIT), + FTRACE_EVENT_FL_TRIGGER_MODE = (1 << FTRACE_EVENT_FL_TRIGGER_MODE_BIT), + FTRACE_EVENT_FL_TRIGGER_COND = (1 << FTRACE_EVENT_FL_TRIGGER_COND_BIT), +}; + +struct ftrace_event_file { + struct list_head list; + struct ftrace_event_call *event_call; + struct event_filter *filter; + struct dentry *dir; + struct trace_array *tr; + struct ftrace_subsystem_dir *system; + struct list_head triggers; + + /* + * 32 bit flags: + * bit 0: enabled + * bit 1: enabled cmd record + * bit 2: enable/disable with the soft disable bit + * bit 3: soft disabled + * bit 4: trigger enabled + * + * Note: The bits must 
be set atomically to prevent races + * from other writers. Reads of flags do not need to be in + * sync as they occur in critical sections. But the way flags + * is currently used, these changes do not affect the code + * except that when a change is made, it may have a slight + * delay in propagating the changes to other CPUs due to + * caching and such. Which is mostly OK ;-) + */ + unsigned long flags; + atomic_t sm_ref; /* soft-mode reference counter */ + atomic_t tm_ref; /* trigger-mode reference counter */ +}; + +#define __TRACE_EVENT_FLAGS(name, value) \ + static int __init trace_init_flags_##name(void) \ + { \ + event_##name.flags |= value; \ + return 0; \ + } \ + early_initcall(trace_init_flags_##name); + +#define __TRACE_EVENT_PERF_PERM(name, expr...) \ + static int perf_perm_##name(struct ftrace_event_call *tp_event, \ + struct perf_event *p_event) \ + { \ + return ({ expr; }); \ + } \ + static int __init trace_init_perf_perm_##name(void) \ + { \ + event_##name.perf_perm = &perf_perm_##name; \ + return 0; \ + } \ + early_initcall(trace_init_perf_perm_##name); + +#define PERF_MAX_TRACE_SIZE 2048 + +#define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ + +enum event_trigger_type { + ETT_NONE = (0), + ETT_TRACE_ONOFF = (1 << 0), + ETT_SNAPSHOT = (1 << 1), + ETT_STACKTRACE = (1 << 2), + ETT_EVENT_ENABLE = (1 << 3), +}; + +extern int filter_match_preds(struct event_filter *filter, void *rec); + +extern int filter_check_discard(struct ftrace_event_file *file, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event); +extern int call_filter_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event); +extern enum event_trigger_type event_triggers_call(struct ftrace_event_file *file, + void *rec); +extern void event_triggers_post_call(struct ftrace_event_file *file, + enum event_trigger_type tt); + +/** + * ftrace_trigger_soft_disabled - do triggers and test if soft 
disabled + * @file: The file pointer of the event to test + * + * If any triggers without filters are attached to this event, they + * will be called here. If the event is soft disabled and has no + * triggers that require testing the fields, it will return true, + * otherwise false. + */ +static inline bool +ftrace_trigger_soft_disabled(struct ftrace_event_file *file) +{ + unsigned long eflags = file->flags; + + if (!(eflags & FTRACE_EVENT_FL_TRIGGER_COND)) { + if (eflags & FTRACE_EVENT_FL_TRIGGER_MODE) + event_triggers_call(file, NULL); + if (eflags & FTRACE_EVENT_FL_SOFT_DISABLED) + return true; + } + return false; +} + +/* + * Helper function for event_trigger_unlock_commit{_regs}(). + * If there are event triggers attached to this event that requires + * filtering against its fields, then they wil be called as the + * entry already holds the field information of the current event. + * + * It also checks if the event should be discarded or not. + * It is to be discarded if the event is soft disabled and the + * event was only recorded to process triggers, or if the event + * filter is active and this event did not match the filters. + * + * Returns true if the event is discarded, false otherwise. 
+ */ +static inline bool +__event_trigger_test_discard(struct ftrace_event_file *file, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + void *entry, + enum event_trigger_type *tt) +{ + unsigned long eflags = file->flags; + + if (eflags & FTRACE_EVENT_FL_TRIGGER_COND) + *tt = event_triggers_call(file, entry); + + if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags)) + ring_buffer_discard_commit(buffer, event); + else if (!filter_check_discard(file, entry, buffer, event)) + return false; + + return true; +} + +/** + * event_trigger_unlock_commit - handle triggers and finish event commit + * @file: The file pointer assoctiated to the event + * @buffer: The ring buffer that the event is being written to + * @event: The event meta data in the ring buffer + * @entry: The event itself + * @irq_flags: The state of the interrupts at the start of the event + * @pc: The state of the preempt count at the start of the event. + * + * This is a helper function to handle triggers that require data + * from the event itself. It also tests the event against filters and + * if the event is soft disabled and should be discarded. 
+ */ +static inline void +event_trigger_unlock_commit(struct ftrace_event_file *file, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + void *entry, unsigned long irq_flags, int pc) +{ + enum event_trigger_type tt = ETT_NONE; + + if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) + trace_buffer_unlock_commit(buffer, event, irq_flags, pc); + + if (tt) + event_triggers_post_call(file, tt); +} + +/** + * event_trigger_unlock_commit_regs - handle triggers and finish event commit + * @file: The file pointer assoctiated to the event + * @buffer: The ring buffer that the event is being written to + * @event: The event meta data in the ring buffer + * @entry: The event itself + * @irq_flags: The state of the interrupts at the start of the event + * @pc: The state of the preempt count at the start of the event. + * + * This is a helper function to handle triggers that require data + * from the event itself. It also tests the event against filters and + * if the event is soft disabled and should be discarded. + * + * Same as event_trigger_unlock_commit() but calls + * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit(). 
+ */ +static inline void +event_trigger_unlock_commit_regs(struct ftrace_event_file *file, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + void *entry, unsigned long irq_flags, int pc, + struct pt_regs *regs) +{ + enum event_trigger_type tt = ETT_NONE; + + if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) + trace_buffer_unlock_commit_regs(buffer, event, + irq_flags, pc, regs); + + if (tt) + event_triggers_post_call(file, tt); +} + +#ifdef CONFIG_BPF_SYSCALL +unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx); +#else +static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +{ + return 1; +} +#endif + +enum { + FILTER_OTHER = 0, + FILTER_STATIC_STRING, + FILTER_DYN_STRING, + FILTER_PTR_STRING, + FILTER_TRACE_FN, +}; + +extern int trace_event_raw_init(struct ftrace_event_call *call); +extern int trace_define_field(struct ftrace_event_call *call, const char *type, + const char *name, int offset, int size, + int is_signed, int filter_type); +extern int trace_add_event_call(struct ftrace_event_call *call); +extern int trace_remove_event_call(struct ftrace_event_call *call); + +#define is_signed_type(type) (((type)(-1)) < (type)1) + +int trace_set_clr_event(const char *system, const char *event, int set); + +/* + * The double __builtin_constant_p is because gcc will give us an error + * if we try to allocate the static variable to fmt if it is not a + * constant. Even with the outer if statement optimizing out. + */ +#define event_trace_printk(ip, fmt, args...) \ +do { \ + __trace_printk_check_format(fmt, ##args); \ + tracing_record_cmdline(current); \ + if (__builtin_constant_p(fmt)) { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))) = \ + __builtin_constant_p(fmt) ? 
fmt : NULL; \ + \ + __trace_bprintk(ip, trace_printk_fmt, ##args); \ + } else \ + __trace_printk(ip, fmt, ##args); \ +} while (0) + +#ifdef CONFIG_PERF_EVENTS +struct perf_event; + +DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); + +extern int perf_trace_init(struct perf_event *event); +extern void perf_trace_destroy(struct perf_event *event); +extern int perf_trace_add(struct perf_event *event, int flags); +extern void perf_trace_del(struct perf_event *event, int flags); +extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, + char *filter_str); +extern void ftrace_profile_free_filter(struct perf_event *event); +extern void *perf_trace_buf_prepare(int size, unsigned short type, + struct pt_regs **regs, int *rctxp); + +static inline void +perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr, + u64 count, struct pt_regs *regs, void *head, + struct task_struct *task) +{ + perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task); +} +#endif + +#endif /* _LINUX_FTRACE_EVENT_H */ -- cgit v1.2.3 From 645df987f7c1740bb1ba783ab907001720a20cf7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 4 May 2015 18:12:44 -0400 Subject: tracing: Rename ftrace_print_*() functions to trace_print_*() The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. The functions ftrace_print_*() are not part of the function infrastructure, and the names can be confusing. Rename them to be trace_print_*(). 
Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index f8465d65f3c7..29627cbafdea 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -1,6 +1,6 @@ -#ifndef _LINUX_FTRACE_EVENT_H -#define _LINUX_FTRACE_EVENT_H +#ifndef _LINUX_TRACE_EVENT_H +#define _LINUX_TRACE_EVENT_H #include #include @@ -25,27 +25,27 @@ struct trace_print_flags_u64 { const char *name; }; -const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim, - unsigned long flags, - const struct trace_print_flags *flag_array); +const char *trace_print_flags_seq(struct trace_seq *p, const char *delim, + unsigned long flags, + const struct trace_print_flags *flag_array); -const char *ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, - const struct trace_print_flags *symbol_array); +const char *trace_print_symbols_seq(struct trace_seq *p, unsigned long val, + const struct trace_print_flags *symbol_array); #if BITS_PER_LONG == 32 -const char *ftrace_print_symbols_seq_u64(struct trace_seq *p, - unsigned long long val, - const struct trace_print_flags_u64 +const char *trace_print_symbols_seq_u64(struct trace_seq *p, + unsigned long long val, + const struct trace_print_flags_u64 *symbol_array); #endif -const char *ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, - unsigned int bitmask_size); +const char *trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, + unsigned int bitmask_size); -const char *ftrace_print_hex_seq(struct trace_seq *p, - const unsigned char *buf, int len); +const char *trace_print_hex_seq(struct trace_seq *p, + const unsigned char *buf, int len); -const char *ftrace_print_array_seq(struct trace_seq *p, +const char *trace_print_array_seq(struct trace_seq *p, const void *buf, int count, size_t el_size); -- cgit 
v1.2.3 From 9023c930902fbbcf0cebf6110828700f792989a4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 5 May 2015 09:39:12 -0400 Subject: tracing: Rename (un)register_ftrace_event() to (un)register_trace_event() The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. The functions (un)register_ftrace_event() are really about trace_events, and the name should be register_trace_event() instead. Also renamed ftrace_event_reg() to trace_event_reg() for the same reason. Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 29627cbafdea..99924c07a042 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -132,8 +132,8 @@ struct trace_event { struct trace_event_functions *funcs; }; -extern int register_ftrace_event(struct trace_event *event); -extern int unregister_ftrace_event(struct trace_event *event); +extern int register_trace_event(struct trace_event *event); +extern int unregister_trace_event(struct trace_event *event); /* Return values for print_line callback */ enum print_line_t { @@ -216,7 +216,7 @@ struct ftrace_event_class { int (*raw_init)(struct ftrace_event_call *); }; -extern int ftrace_event_reg(struct ftrace_event_call *event, +extern int trace_event_reg(struct ftrace_event_call *event, enum trace_reg type, void *data); struct ftrace_event_buffer { -- cgit v1.2.3 From 7f1d2f8210195c8c309d424a77dbf06a6d2186f4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 5 May 2015 10:09:53 -0400 Subject: tracing: Rename ftrace_event_file to trace_event_file The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. The structure ftrace_event_file is really about trace events and not "ftrace". Rename it to trace_event_file. 
Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 99924c07a042..ae19233c7dd8 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -157,11 +157,11 @@ static inline enum print_line_t trace_handle_return(struct trace_seq *s) void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, int pc); -struct ftrace_event_file; +struct trace_event_file; struct ring_buffer_event * trace_event_buffer_lock_reserve(struct ring_buffer **current_buffer, - struct ftrace_event_file *ftrace_file, + struct trace_event_file *trace_file, int type, unsigned long len, unsigned long flags, int pc); struct ring_buffer_event * @@ -222,14 +222,14 @@ extern int trace_event_reg(struct ftrace_event_call *event, struct ftrace_event_buffer { struct ring_buffer *buffer; struct ring_buffer_event *event; - struct ftrace_event_file *ftrace_file; + struct trace_event_file *trace_file; void *entry; unsigned long flags; int pc; }; void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, - struct ftrace_event_file *ftrace_file, + struct trace_event_file *trace_file, unsigned long len); void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer); @@ -349,7 +349,7 @@ enum { FTRACE_EVENT_FL_TRIGGER_COND = (1 << FTRACE_EVENT_FL_TRIGGER_COND_BIT), }; -struct ftrace_event_file { +struct trace_event_file { struct list_head list; struct ftrace_event_call *event_call; struct event_filter *filter; @@ -414,15 +414,15 @@ enum event_trigger_type { extern int filter_match_preds(struct event_filter *filter, void *rec); -extern int filter_check_discard(struct ftrace_event_file *file, void *rec, +extern int filter_check_discard(struct trace_event_file *file, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event); extern int 
call_filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event); -extern enum event_trigger_type event_triggers_call(struct ftrace_event_file *file, +extern enum event_trigger_type event_triggers_call(struct trace_event_file *file, void *rec); -extern void event_triggers_post_call(struct ftrace_event_file *file, +extern void event_triggers_post_call(struct trace_event_file *file, enum event_trigger_type tt); /** @@ -435,7 +435,7 @@ extern void event_triggers_post_call(struct ftrace_event_file *file, * otherwise false. */ static inline bool -ftrace_trigger_soft_disabled(struct ftrace_event_file *file) +ftrace_trigger_soft_disabled(struct trace_event_file *file) { unsigned long eflags = file->flags; @@ -462,7 +462,7 @@ ftrace_trigger_soft_disabled(struct ftrace_event_file *file) * Returns true if the event is discarded, false otherwise. */ static inline bool -__event_trigger_test_discard(struct ftrace_event_file *file, +__event_trigger_test_discard(struct trace_event_file *file, struct ring_buffer *buffer, struct ring_buffer_event *event, void *entry, @@ -495,7 +495,7 @@ __event_trigger_test_discard(struct ftrace_event_file *file, * if the event is soft disabled and should be discarded. */ static inline void -event_trigger_unlock_commit(struct ftrace_event_file *file, +event_trigger_unlock_commit(struct trace_event_file *file, struct ring_buffer *buffer, struct ring_buffer_event *event, void *entry, unsigned long irq_flags, int pc) @@ -526,7 +526,7 @@ event_trigger_unlock_commit(struct ftrace_event_file *file, * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit(). 
*/ static inline void -event_trigger_unlock_commit_regs(struct ftrace_event_file *file, +event_trigger_unlock_commit_regs(struct trace_event_file *file, struct ring_buffer *buffer, struct ring_buffer_event *event, void *entry, unsigned long irq_flags, int pc, -- cgit v1.2.3 From 2425bcb9240f8c97d793cb31c8e8d8d0a843fa29 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 5 May 2015 11:45:27 -0400 Subject: tracing: Rename ftrace_event_{call,class} to trace_event_{call,class} The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. The structures ftrace_event_call and ftrace_event_class have nothing to do with the function hooks, and are really trace_event structures. Rename ftrace_event_* to trace_event_*. Signed-off-by: Steven Rostedt --- include/linux/module.h | 2 +- include/linux/perf_event.h | 2 +- include/linux/syscalls.h | 12 ++++++------ include/linux/trace_events.h | 38 +++++++++++++++++++------------------- 4 files changed, 27 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index c883b86ea964..3e0d492682bb 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -336,7 +336,7 @@ struct module { const char **trace_bprintk_fmt_start; #endif #ifdef CONFIG_EVENT_TRACING - struct ftrace_event_call **trace_events; + struct trace_event_call **trace_events; unsigned int num_trace_events; struct trace_enum_map **trace_enums; unsigned int num_trace_enums; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 61992cf2e977..d089d6d58ae0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -483,7 +483,7 @@ struct perf_event { void *overflow_handler_context; #ifdef CONFIG_EVENT_TRACING - struct ftrace_event_call *tp_event; + struct trace_event_call *tp_event; struct event_filter *filter; #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops ftrace_ops; diff --git a/include/linux/syscalls.h 
b/include/linux/syscalls.h index 76d1e38aabe1..d8b06abb264f 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -111,14 +111,14 @@ union bpf_attr; #define __SC_STR_ADECL(t, a) #a #define __SC_STR_TDECL(t, a) #t -extern struct ftrace_event_class event_class_syscall_enter; -extern struct ftrace_event_class event_class_syscall_exit; +extern struct trace_event_class event_class_syscall_enter; +extern struct trace_event_class event_class_syscall_exit; extern struct trace_event_functions enter_syscall_print_funcs; extern struct trace_event_functions exit_syscall_print_funcs; #define SYSCALL_TRACE_ENTER_EVENT(sname) \ static struct syscall_metadata __syscall_meta_##sname; \ - static struct ftrace_event_call __used \ + static struct trace_event_call __used \ event_enter_##sname = { \ .class = &event_class_syscall_enter, \ { \ @@ -128,13 +128,13 @@ extern struct trace_event_functions exit_syscall_print_funcs; .data = (void *)&__syscall_meta_##sname,\ .flags = TRACE_EVENT_FL_CAP_ANY, \ }; \ - static struct ftrace_event_call __used \ + static struct trace_event_call __used \ __attribute__((section("_ftrace_events"))) \ *__event_enter_##sname = &event_enter_##sname; #define SYSCALL_TRACE_EXIT_EVENT(sname) \ static struct syscall_metadata __syscall_meta_##sname; \ - static struct ftrace_event_call __used \ + static struct trace_event_call __used \ event_exit_##sname = { \ .class = &event_class_syscall_exit, \ { \ @@ -144,7 +144,7 @@ extern struct trace_event_functions exit_syscall_print_funcs; .data = (void *)&__syscall_meta_##sname,\ .flags = TRACE_EVENT_FL_CAP_ANY, \ }; \ - static struct ftrace_event_call __used \ + static struct trace_event_call __used \ __attribute__((section("_ftrace_events"))) \ *__event_exit_##sname = &event_exit_##sname; diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index ae19233c7dd8..d10ab04a17b2 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -200,23 +200,23 @@ enum 
trace_reg { #endif }; -struct ftrace_event_call; +struct trace_event_call; -struct ftrace_event_class { +struct trace_event_class { const char *system; void *probe; #ifdef CONFIG_PERF_EVENTS void *perf_probe; #endif - int (*reg)(struct ftrace_event_call *event, + int (*reg)(struct trace_event_call *event, enum trace_reg type, void *data); - int (*define_fields)(struct ftrace_event_call *); - struct list_head *(*get_fields)(struct ftrace_event_call *); + int (*define_fields)(struct trace_event_call *); + struct list_head *(*get_fields)(struct trace_event_call *); struct list_head fields; - int (*raw_init)(struct ftrace_event_call *); + int (*raw_init)(struct trace_event_call *); }; -extern int trace_event_reg(struct ftrace_event_call *event, +extern int trace_event_reg(struct trace_event_call *event, enum trace_reg type, void *data); struct ftrace_event_buffer { @@ -269,9 +269,9 @@ enum { TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), }; -struct ftrace_event_call { +struct trace_event_call { struct list_head list; - struct ftrace_event_class *class; + struct trace_event_class *class; union { char *name; /* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */ @@ -298,13 +298,13 @@ struct ftrace_event_call { struct hlist_head __percpu *perf_events; struct bpf_prog *prog; - int (*perf_perm)(struct ftrace_event_call *, + int (*perf_perm)(struct trace_event_call *, struct perf_event *); #endif }; static inline const char * -ftrace_event_name(struct ftrace_event_call *call) +ftrace_event_name(struct trace_event_call *call) { if (call->flags & TRACE_EVENT_FL_TRACEPOINT) return call->tp ? 
call->tp->name : NULL; @@ -351,7 +351,7 @@ enum { struct trace_event_file { struct list_head list; - struct ftrace_event_call *event_call; + struct trace_event_call *event_call; struct event_filter *filter; struct dentry *dir; struct trace_array *tr; @@ -388,7 +388,7 @@ struct trace_event_file { early_initcall(trace_init_flags_##name); #define __TRACE_EVENT_PERF_PERM(name, expr...) \ - static int perf_perm_##name(struct ftrace_event_call *tp_event, \ + static int perf_perm_##name(struct trace_event_call *tp_event, \ struct perf_event *p_event) \ { \ return ({ expr; }); \ @@ -417,7 +417,7 @@ extern int filter_match_preds(struct event_filter *filter, void *rec); extern int filter_check_discard(struct trace_event_file *file, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event); -extern int call_filter_check_discard(struct ftrace_event_call *call, void *rec, +extern int call_filter_check_discard(struct trace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event); extern enum event_trigger_type event_triggers_call(struct trace_event_file *file, @@ -559,12 +559,12 @@ enum { FILTER_TRACE_FN, }; -extern int trace_event_raw_init(struct ftrace_event_call *call); -extern int trace_define_field(struct ftrace_event_call *call, const char *type, +extern int trace_event_raw_init(struct trace_event_call *call); +extern int trace_define_field(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, int filter_type); -extern int trace_add_event_call(struct ftrace_event_call *call); -extern int trace_remove_event_call(struct ftrace_event_call *call); +extern int trace_add_event_call(struct trace_event_call *call); +extern int trace_remove_event_call(struct trace_event_call *call); #define is_signed_type(type) (((type)(-1)) < (type)1) @@ -613,4 +613,4 @@ perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr, } #endif -#endif /* _LINUX_FTRACE_EVENT_H */ +#endif /* 
_LINUX_TRACE_EVENT_H */ -- cgit v1.2.3 From 3f795dcfc7364cd811c3f6f03d115fcefbbdc1ca Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 5 May 2015 13:18:46 -0400 Subject: tracing: Rename ftrace_event_buffer to trace_event_buffer. The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. The ftrace_event_buffer functions and data structures are for trace_events and not for function hooks. Rename them to trace_event_buffer*. Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index d10ab04a17b2..a1fa8ebaf684 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -219,7 +219,7 @@ struct trace_event_class { extern int trace_event_reg(struct trace_event_call *event, enum trace_reg type, void *data); -struct ftrace_event_buffer { +struct trace_event_buffer { struct ring_buffer *buffer; struct ring_buffer_event *event; struct trace_event_file *trace_file; @@ -228,11 +228,11 @@ struct ftrace_event_buffer { int pc; }; -void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, +void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, struct trace_event_file *trace_file, unsigned long len); -void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer); +void trace_event_buffer_commit(struct trace_event_buffer *fbuffer); enum { TRACE_EVENT_FL_FILTERED_BIT, -- cgit v1.2.3 From 892c505aac2bdded3c8ec2ec27abc6d74fd210f5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 5 May 2015 14:18:11 -0400 Subject: tracing: Rename ftrace_output functions to trace_output The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. The ftrace_output_*() and ftrace_raw_output_*() functions represent the trace_event code. 
Rename them to just trace_output or trace_raw_output. Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index a1fa8ebaf684..12ca46322a94 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -52,8 +52,8 @@ const char *trace_print_array_seq(struct trace_seq *p, struct trace_iterator; struct trace_event; -int ftrace_raw_output_prep(struct trace_iterator *iter, - struct trace_event *event); +int trace_raw_output_prep(struct trace_iterator *iter, + struct trace_event *event); /* * The trace entry - the most basic unit of tracing. This is what @@ -183,7 +183,7 @@ void trace_current_buffer_discard_commit(struct ring_buffer *buffer, void tracing_record_cmdline(struct task_struct *tsk); -int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...); +int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...); struct event_filter; -- cgit v1.2.3 From 609a74045238c303bbe9396775eacf5bac1f51cc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 13 May 2015 13:44:36 -0400 Subject: tracing: Rename FTRACE_MAX_EVENT to TRACE_EVENT_TYPE_MAX The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. Rename the max trace_event type size to something more descriptive and appropriate. 
Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 12ca46322a94..6f28464be418 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -68,7 +68,7 @@ struct trace_entry { int pid; }; -#define FTRACE_MAX_EVENT \ +#define TRACE_EVENT_TYPE_MAX \ ((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1) /* -- cgit v1.2.3 From 687fcc4aee4567df14e31e82d6993418b826f408 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 13 May 2015 14:20:14 -0400 Subject: tracing: Rename ftrace_event_name() to trace_event_name() The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. ftrace_event_name() returns the name of an event tracepoint, has nothing to do with function tracing. Rename it to trace_event_name(). Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 6f28464be418..15617798849c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -304,7 +304,7 @@ struct trace_event_call { }; static inline const char * -ftrace_event_name(struct trace_event_call *call) +trace_event_name(struct trace_event_call *call) { if (call->flags & TRACE_EVENT_FL_TRACEPOINT) return call->tp ? call->tp->name : NULL; -- cgit v1.2.3 From 7967b3e0c40ff72fb2cf44d3b50e2cb388ef6c67 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 13 May 2015 14:59:40 -0400 Subject: tracing: Rename struct ftrace_subsystem_dir to trace_subsystem_dir The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. The structure ftrace_subsystem_dir holds the information about trace event subsystems. 
It should not be named ftrace, rename it to trace_subsystem_dir. Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 15617798849c..d4ad58ec684a 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -313,7 +313,7 @@ trace_event_name(struct trace_event_call *call) } struct trace_array; -struct ftrace_subsystem_dir; +struct trace_subsystem_dir; enum { FTRACE_EVENT_FL_ENABLED_BIT, @@ -355,7 +355,7 @@ struct trace_event_file { struct event_filter *filter; struct dentry *dir; struct trace_array *tr; - struct ftrace_subsystem_dir *system; + struct trace_subsystem_dir *system; struct list_head triggers; /* -- cgit v1.2.3 From 1bd758eb1cab2fa5b71a23f9e5d3c8076f4ed650 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:07 +0200 Subject: net: change name of flow_dissector header to match the .c file name add couple of empty lines on the way. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c0b574a414e7..e35c2b13d434 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -34,7 +34,7 @@ #include #include #include -#include +#include /* A. Checksumming of received packets by device. * -- cgit v1.2.3 From 10b89ee43e849544eddfe34e535341fc077464ec Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:09 +0200 Subject: net: move *skb_get_poff declarations into correct header Since these functions are defined in flow_dissector.c, move header declarations from skbuff.h into flow_dissector.h Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e35c2b13d434..17607ab9e7a2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3424,10 +3424,6 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, unsigned int transport_len, __sum16(*skb_chkf)(struct sk_buff *skb)); -u32 skb_get_poff(const struct sk_buff *skb); -u32 __skb_get_poff(const struct sk_buff *skb, void *data, - const struct flow_keys *keys, int hlen); - /** * skb_head_is_locked - Determine if the skb->head is locked down * @skb: skb to check -- cgit v1.2.3 From 9c684b5083bc191c4b7b189c73d75587e167a474 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:11 +0200 Subject: net: move __skb_get_hash function declaration to flow_dissector.h Since the definition of the function is in flow_dissector.c, it makes sense to have the declaration in flow_dissector.h Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 17607ab9e7a2..ae2d1b7769d8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -918,7 +918,6 @@ skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type) skb->hash = hash; } -void __skb_get_hash(struct sk_buff *skb); static inline __u32 skb_get_hash(struct sk_buff *skb) { if (!skb->l4_hash && !skb->sw_hash) -- cgit v1.2.3 From 5605c76240aadc823e3d46ac9afde2f26fbcf019 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:12 +0200 Subject: net: move __skb_tx_hash to dev.c __skb_tx_hash function has no relation to flow_dissect so just move it to dev.c Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 3 +++ include/linux/skbuff.h | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cd0951c1893d..d3ed01c18247 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2832,6 +2832,9 @@ static inline int netif_set_xps_queue(struct net_device *dev, } #endif +u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, + unsigned int num_tx_queues); + /* * Returns a Tx hash for the given packet when dev->real_num_tx_queues is used * as a distribution range limit for the returned value. diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ae2d1b7769d8..b01c7fba7c17 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3299,9 +3299,6 @@ static inline bool skb_rx_queue_recorded(const struct sk_buff *skb) return skb->queue_mapping != 0; } -u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, - unsigned int num_tx_queues); - static inline struct sec_path *skb_sec_path(struct sk_buff *skb) { #ifdef CONFIG_XFRM -- cgit v1.2.3 From 06635a35d13d42b95422bba6633f175245cc644e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:16 +0200 Subject: flow_dissect: use programmable dissector in skb_flow_dissect and friends Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b01c7fba7c17..f83aa6568cbf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1935,8 +1935,8 @@ static inline void skb_probe_transport_header(struct sk_buff *skb, if (skb_transport_header_was_set(skb)) return; - else if (skb_flow_dissect(skb, &keys)) - skb_set_transport_header(skb, keys.thoff); + else if (skb_flow_dissect_flow_keys(skb, &keys)) + skb_set_transport_header(skb, keys.basic.thoff); else skb_set_transport_header(skb, offset_hint); } -- cgit v1.2.3 From 5d6ad960a71f0b36d95d74ef93285733b9f62f59 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 13 May 2015 15:12:33 -0400 Subject: tracing: Rename FTRACE_EVENT_FL_* flags to EVENT_FILE_FL_* The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. The FTRACE_EVENT_FL_* flags are flags to do with the trace_event files in the tracefs directory. They are not related to function tracing. Rename them to a more descriptive name. 
Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 50 ++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index d4ad58ec684a..a46c138b2eea 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -250,11 +250,11 @@ enum { * FILTERED - The event has a filter attached * CAP_ANY - Any user can enable for perf * NO_SET_FILTER - Set when filter has error and is to be ignored - * IGNORE_ENABLE - For ftrace internal events, do not enable with debugfs file + * IGNORE_ENABLE - For trace internal events, do not enable with debugfs file * WAS_ENABLED - Set and stays set when an event was ever enabled * (used for module unloading, if a module event is enabled, * it is best to clear the buffers that used it). - * USE_CALL_FILTER - For ftrace internal events, don't use file filter + * USE_CALL_FILTER - For trace internal events, don't use file filter * TRACEPOINT - Event is a tracepoint * KPROBE - Event is a kprobe */ @@ -286,7 +286,7 @@ struct trace_event_call { * bit 0: filter_active * bit 1: allow trace by non root (cap any) * bit 2: failed to apply filter - * bit 3: ftrace internal event (do not enable) + * bit 3: trace internal event (do not enable) * bit 4: Event was enabled by module * bit 5: use call filter rather than file filter * bit 6: Event is a tracepoint @@ -316,18 +316,18 @@ struct trace_array; struct trace_subsystem_dir; enum { - FTRACE_EVENT_FL_ENABLED_BIT, - FTRACE_EVENT_FL_RECORDED_CMD_BIT, - FTRACE_EVENT_FL_FILTERED_BIT, - FTRACE_EVENT_FL_NO_SET_FILTER_BIT, - FTRACE_EVENT_FL_SOFT_MODE_BIT, - FTRACE_EVENT_FL_SOFT_DISABLED_BIT, - FTRACE_EVENT_FL_TRIGGER_MODE_BIT, - FTRACE_EVENT_FL_TRIGGER_COND_BIT, + EVENT_FILE_FL_ENABLED_BIT, + EVENT_FILE_FL_RECORDED_CMD_BIT, + EVENT_FILE_FL_FILTERED_BIT, + EVENT_FILE_FL_NO_SET_FILTER_BIT, + EVENT_FILE_FL_SOFT_MODE_BIT, + 
EVENT_FILE_FL_SOFT_DISABLED_BIT, + EVENT_FILE_FL_TRIGGER_MODE_BIT, + EVENT_FILE_FL_TRIGGER_COND_BIT, }; /* - * Ftrace event file flags: + * Event file flags: * ENABLED - The event is enabled * RECORDED_CMD - The comms should be recorded at sched_switch * FILTERED - The event has a filter attached @@ -339,14 +339,14 @@ enum { * TRIGGER_COND - When set, one or more triggers has an associated filter */ enum { - FTRACE_EVENT_FL_ENABLED = (1 << FTRACE_EVENT_FL_ENABLED_BIT), - FTRACE_EVENT_FL_RECORDED_CMD = (1 << FTRACE_EVENT_FL_RECORDED_CMD_BIT), - FTRACE_EVENT_FL_FILTERED = (1 << FTRACE_EVENT_FL_FILTERED_BIT), - FTRACE_EVENT_FL_NO_SET_FILTER = (1 << FTRACE_EVENT_FL_NO_SET_FILTER_BIT), - FTRACE_EVENT_FL_SOFT_MODE = (1 << FTRACE_EVENT_FL_SOFT_MODE_BIT), - FTRACE_EVENT_FL_SOFT_DISABLED = (1 << FTRACE_EVENT_FL_SOFT_DISABLED_BIT), - FTRACE_EVENT_FL_TRIGGER_MODE = (1 << FTRACE_EVENT_FL_TRIGGER_MODE_BIT), - FTRACE_EVENT_FL_TRIGGER_COND = (1 << FTRACE_EVENT_FL_TRIGGER_COND_BIT), + EVENT_FILE_FL_ENABLED = (1 << EVENT_FILE_FL_ENABLED_BIT), + EVENT_FILE_FL_RECORDED_CMD = (1 << EVENT_FILE_FL_RECORDED_CMD_BIT), + EVENT_FILE_FL_FILTERED = (1 << EVENT_FILE_FL_FILTERED_BIT), + EVENT_FILE_FL_NO_SET_FILTER = (1 << EVENT_FILE_FL_NO_SET_FILTER_BIT), + EVENT_FILE_FL_SOFT_MODE = (1 << EVENT_FILE_FL_SOFT_MODE_BIT), + EVENT_FILE_FL_SOFT_DISABLED = (1 << EVENT_FILE_FL_SOFT_DISABLED_BIT), + EVENT_FILE_FL_TRIGGER_MODE = (1 << EVENT_FILE_FL_TRIGGER_MODE_BIT), + EVENT_FILE_FL_TRIGGER_COND = (1 << EVENT_FILE_FL_TRIGGER_COND_BIT), }; struct trace_event_file { @@ -439,10 +439,10 @@ ftrace_trigger_soft_disabled(struct trace_event_file *file) { unsigned long eflags = file->flags; - if (!(eflags & FTRACE_EVENT_FL_TRIGGER_COND)) { - if (eflags & FTRACE_EVENT_FL_TRIGGER_MODE) + if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) { + if (eflags & EVENT_FILE_FL_TRIGGER_MODE) event_triggers_call(file, NULL); - if (eflags & FTRACE_EVENT_FL_SOFT_DISABLED) + if (eflags & EVENT_FILE_FL_SOFT_DISABLED) return true; } 
return false; @@ -470,10 +470,10 @@ __event_trigger_test_discard(struct trace_event_file *file, { unsigned long eflags = file->flags; - if (eflags & FTRACE_EVENT_FL_TRIGGER_COND) + if (eflags & EVENT_FILE_FL_TRIGGER_COND) *tt = event_triggers_call(file, entry); - if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags)) + if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags)) ring_buffer_discard_commit(buffer, event); else if (!filter_check_discard(file, entry, buffer, event)) return false; -- cgit v1.2.3 From 09a5059aa1a2cbf8c8993e61b013cc83a0dd5833 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 13 May 2015 15:21:25 -0400 Subject: tracing: Rename ftrace_trigger_soft_disabled() to trace_trigger_soft_disabled() The name "ftrace" really refers to the function hook infrastructure. It is not about the trace_events. The ftrace_trigger_soft_disabled() tests if a trace_event is soft disabled (called but not traced), and returns true if it is. It has nothing to do with function tracing and should be renamed. Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index a46c138b2eea..1063c850dbab 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -426,7 +426,7 @@ extern void event_triggers_post_call(struct trace_event_file *file, enum event_trigger_type tt); /** - * ftrace_trigger_soft_disabled - do triggers and test if soft disabled + * trace_trigger_soft_disabled - do triggers and test if soft disabled * @file: The file pointer of the event to test * * If any triggers without filters are attached to this event, they @@ -435,7 +435,7 @@ extern void event_triggers_post_call(struct trace_event_file *file, * otherwise false. 
*/ static inline bool -ftrace_trigger_soft_disabled(struct trace_event_file *file) +trace_trigger_soft_disabled(struct trace_event_file *file) { unsigned long eflags = file->flags; -- cgit v1.2.3 From f0b5e8a42f37a880b8467e59dc814f4f21581d3d Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Tue, 12 May 2015 20:28:07 +0200 Subject: net: kill useless net_*_ingress_queue() definitions when NET_CLS_ACT is unset This fixes 4577139b2dabf589 ("net: use jump label patching for ingress qdisc in __netif_receive_skb_core"). The only client of this is sch_ingress and it depends on NET_CLS_ACT. So there is no way these definitions can be of any help. Cc: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 7b8e260c4a27..bd29ab4b0941 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -82,14 +82,6 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev); #ifdef CONFIG_NET_CLS_ACT void net_inc_ingress_queue(void); void net_dec_ingress_queue(void); -#else -static inline void net_inc_ingress_queue(void) -{ -} - -static inline void net_dec_ingress_queue(void) -{ -} #endif extern void rtnetlink_init(void); -- cgit v1.2.3 From 25956b6612601cf36022392ffa83f6bf97939bcd Mon Sep 17 00:00:00 2001 From: Hanjun Guo Date: Mon, 11 May 2015 12:17:13 +0800 Subject: ACPI / processor: Introduce invalid_logical_cpuid() In ACPI processor drivers, we use direct comparisons of cpu logical id with -1 which are error prone in case logical cpuid is accidentally assigned an error code and prevents us from returning an error-encoding cpuid directly in some cases. So introduce invalid_logical_cpuid() to identify cpu with invalid logical cpu num, then it will be used to replace the direct comparisons with -1.
Signed-off-by: Hanjun Guo Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index e4da5e35e29c..913b49f9a6e6 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -158,6 +158,11 @@ typedef u32 phys_cpuid_t; #define PHYS_CPUID_INVALID (phys_cpuid_t)(-1) #endif +static inline bool invalid_logical_cpuid(u32 cpuid) +{ + return (int)cpuid < 0; +} + #ifdef CONFIG_ACPI_HOTPLUG_CPU /* Arch dependent functions for cpu hotplug support */ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu); -- cgit v1.2.3 From ddcc18f5bdd1aafd457032ec693fd9d0af764d61 Mon Sep 17 00:00:00 2001 From: Hanjun Guo Date: Wed, 13 May 2015 16:19:30 +0800 Subject: ACPI / processor: Introduce invalid_phys_cpuid() Introduce invalid_phys_cpuid() to identify cpu with invalid physical ID, then use it as a replacement for the direct comparisons with PHYS_CPUID_INVALID. Signed-off-by: Hanjun Guo Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 913b49f9a6e6..90e4ed1eb191 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -163,6 +163,11 @@ static inline bool invalid_logical_cpuid(u32 cpuid) return (int)cpuid < 0; } +static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id) +{ + return phys_id == PHYS_CPUID_INVALID; +} + #ifdef CONFIG_ACPI_HOTPLUG_CPU /* Arch dependent functions for cpu hotplug support */ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu); -- cgit v1.2.3 From 87d5c18ce1f41a2410d4fb26d5d68c55867450f5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Wed, 13 May 2015 18:19:34 +0200 Subject: netfilter: cleanup struct nf_hook_ops indentation Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- include/linux/netfilter.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 63560d0a8dfe..83be4a3cec98 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -79,16 +79,16 @@ typedef unsigned int nf_hookfn(const struct nf_hook_ops *ops, const struct nf_hook_state *state); struct nf_hook_ops { - struct list_head list; + struct list_head list; /* User fills in from here down. */ - nf_hookfn *hook; - struct module *owner; - void *priv; - u_int8_t pf; - unsigned int hooknum; + nf_hookfn *hook; + struct module *owner; + void *priv; + u_int8_t pf; + unsigned int hooknum; /* Hooks are ordered in ascending priority. */ - int priority; + int priority; }; struct nf_sockopt_ops { -- cgit v1.2.3 From f7191483461ce2ae579b6f7227fa7ce49e006656 Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Wed, 13 May 2015 18:19:35 +0200 Subject: netfilter: add hook list to nf_hook_state Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- include/linux/netfilter.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 83be4a3cec98..388ed1952242 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -54,10 +54,12 @@ struct nf_hook_state { struct net_device *in; struct net_device *out; struct sock *sk; + struct list_head *hook_list; int (*okfn)(struct sock *, struct sk_buff *); }; static inline void nf_hook_state_init(struct nf_hook_state *p, + struct list_head *hook_list, unsigned int hook, int thresh, u_int8_t pf, struct net_device *indev, @@ -71,6 +73,7 @@ static inline void nf_hook_state_init(struct nf_hook_state *p, p->in = indev; p->out = outdev; p->sk = sk; + p->hook_list = hook_list; p->okfn = okfn; } @@ -166,8 +169,8 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, if (nf_hooks_active(pf, hook)) { struct nf_hook_state state; - nf_hook_state_init(&state, hook, thresh, pf, - indev, outdev, sk, okfn); + nf_hook_state_init(&state, &nf_hooks[pf][hook], hook, thresh, + pf, indev, outdev, sk, okfn); return nf_hook_slow(skb, &state); } return 1; -- cgit v1.2.3 From b8d0aad0c77f488d1d51a02d871a5cbc2d8032b9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Wed, 13 May 2015 18:19:36 +0200 Subject: netfilter: add nf_hook_list_active() In preparation to have netfilter ingress per-device hook list. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- include/linux/netfilter.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 388ed1952242..49d00638d1fa 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -134,26 +134,33 @@ extern struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; #ifdef HAVE_JUMP_LABEL extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; -static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook) +static inline bool nf_hook_list_active(struct list_head *nf_hook_list, + u_int8_t pf, unsigned int hook) { if (__builtin_constant_p(pf) && __builtin_constant_p(hook)) return static_key_false(&nf_hooks_needed[pf][hook]); - return !list_empty(&nf_hooks[pf][hook]); + return !list_empty(nf_hook_list); } #else -static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook) +static inline bool nf_hook_list_active(struct list_head *nf_hook_list, + u_int8_t pf, unsigned int hook) { - return !list_empty(&nf_hooks[pf][hook]); + return !list_empty(nf_hook_list); } #endif +static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook) +{ + return nf_hook_list_active(&nf_hooks[pf][hook], pf, hook); +} + int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state); /** * nf_hook_thresh - call a netfilter hook - * + * * Returns 1 if the hook has allowed the packet to pass. The function * okfn must be invoked by the caller in this case. Any other return * value indicates the packet has been consumed by the hook. -- cgit v1.2.3 From 1cf51900f8545b358b5deaacfda348d990f671db Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Wed, 13 May 2015 18:19:37 +0200 Subject: net: add CONFIG_NET_INGRESS to enable ingress filtering This new config switch enables the ingress filtering infrastructure that is controlled through the ingress_needed static key. 
This prepares the introduction of the Netfilter ingress hook that resides under this unique static key. Note that CONFIG_SCH_INGRESS automatically selects this, that should be no problem since this also depends on CONFIG_NET_CLS_ACT. Signed-off-by: Pablo Neira Ayuso Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index bd29ab4b0941..a2324fb45cf4 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -79,7 +79,7 @@ static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) struct netdev_queue *dev_ingress_queue_create(struct net_device *dev); -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_INGRESS void net_inc_ingress_queue(void); void net_dec_ingress_queue(void); #endif -- cgit v1.2.3 From e687ad60af09010936bbd0b2a3b5d90a8ee8353c Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Wed, 13 May 2015 18:19:38 +0200 Subject: netfilter: add netfilter ingress hook after handle_ing() under unique static key This patch adds the Netfilter ingress hook just after the existing tc ingress hook, that seems to be the consensus solution for this. Note that the Netfilter hook resides under the global static key that enables ingress filtering. Nonetheless, Netfilter still also has its own static key for minimal impact on the existing handle_ing(). 
* Without this patch: Result: OK: 6216490(c6216338+d152) usec, 100000000 (60byte,0frags) 16086246pps 7721Mb/sec (7721398080bps) errors: 100000000 42.46% kpktgend_0 [kernel.kallsyms] [k] __netif_receive_skb_core 25.92% kpktgend_0 [kernel.kallsyms] [k] kfree_skb 7.81% kpktgend_0 [pktgen] [k] pktgen_thread_worker 5.62% kpktgend_0 [kernel.kallsyms] [k] ip_rcv 2.70% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_internal 2.34% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_sk 1.44% kpktgend_0 [kernel.kallsyms] [k] __build_skb * With this patch: Result: OK: 6214833(c6214731+d101) usec, 100000000 (60byte,0frags) 16090536pps 7723Mb/sec (7723457280bps) errors: 100000000 41.23% kpktgend_0 [kernel.kallsyms] [k] __netif_receive_skb_core 26.57% kpktgend_0 [kernel.kallsyms] [k] kfree_skb 7.72% kpktgend_0 [pktgen] [k] pktgen_thread_worker 5.55% kpktgend_0 [kernel.kallsyms] [k] ip_rcv 2.78% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_internal 2.06% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_sk 1.43% kpktgend_0 [kernel.kallsyms] [k] __build_skb * Without this patch + tc ingress: tc filter add dev eth4 parent ffff: protocol ip prio 1 \ u32 match ip dst 4.3.2.1/32 Result: OK: 9269001(c9268821+d179) usec, 100000000 (60byte,0frags) 10788648pps 5178Mb/sec (5178551040bps) errors: 100000000 40.99% kpktgend_0 [kernel.kallsyms] [k] __netif_receive_skb_core 17.50% kpktgend_0 [kernel.kallsyms] [k] kfree_skb 11.77% kpktgend_0 [cls_u32] [k] u32_classify 5.62% kpktgend_0 [kernel.kallsyms] [k] tc_classify_compat 5.18% kpktgend_0 [pktgen] [k] pktgen_thread_worker 3.23% kpktgend_0 [kernel.kallsyms] [k] tc_classify 2.97% kpktgend_0 [kernel.kallsyms] [k] ip_rcv 1.83% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_internal 1.50% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_sk 0.99% kpktgend_0 [kernel.kallsyms] [k] __build_skb * With this patch + tc ingress: tc filter add dev eth4 parent ffff: protocol ip prio 1 \ u32 match ip dst 4.3.2.1/32 Result: OK: 
9308218(c9308091+d126) usec, 100000000 (60byte,0frags) 10743194pps 5156Mb/sec (5156733120bps) errors: 100000000 42.01% kpktgend_0 [kernel.kallsyms] [k] __netif_receive_skb_core 17.78% kpktgend_0 [kernel.kallsyms] [k] kfree_skb 11.70% kpktgend_0 [cls_u32] [k] u32_classify 5.46% kpktgend_0 [kernel.kallsyms] [k] tc_classify_compat 5.16% kpktgend_0 [pktgen] [k] pktgen_thread_worker 2.98% kpktgend_0 [kernel.kallsyms] [k] ip_rcv 2.84% kpktgend_0 [kernel.kallsyms] [k] tc_classify 1.96% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_internal 1.57% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_sk Note that the results are very similar before and after. I can see gcc gets the code under the ingress static key out of the hot path. Then, on that cold branch, it generates the code to accommodate the netfilter ingress static key. My explanation for this is that this reduces the pressure on the instruction cache for non-users as the new code is out of the hot path, and it comes with minimal impact for tc ingress users. Using gcc version 4.8.4 on: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 8 [...] L1d cache: 16K L1i cache: 64K L2 cache: 2048K L3 cache: 8192K Signed-off-by: Pablo Neira Ayuso Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 3 +++ include/linux/netfilter.h | 1 + include/linux/netfilter_ingress.h | 41 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+) create mode 100644 include/linux/netfilter_ingress.h (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d3ed01c18247..51f8d2f5dc3f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1656,6 +1656,9 @@ struct net_device { struct tcf_proto __rcu *ingress_cl_list; #endif struct netdev_queue __rcu *ingress_queue; +#ifdef CONFIG_NETFILTER_INGRESS + struct list_head nf_hooks_ingress; +#endif unsigned char broadcast[MAX_ADDR_LEN]; #ifdef CONFIG_RFS_ACCEL diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 49d00638d1fa..f5ff5d156da8 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -86,6 +86,7 @@ struct nf_hook_ops { /* User fills in from here down. */ nf_hookfn *hook; + struct net_device *dev; struct module *owner; void *priv; u_int8_t pf; diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h new file mode 100644 index 000000000000..cb0727fe2b3d --- /dev/null +++ b/include/linux/netfilter_ingress.h @@ -0,0 +1,41 @@ +#ifndef _NETFILTER_INGRESS_H_ +#define _NETFILTER_INGRESS_H_ + +#include +#include + +#ifdef CONFIG_NETFILTER_INGRESS +static inline int nf_hook_ingress_active(struct sk_buff *skb) +{ + return nf_hook_list_active(&skb->dev->nf_hooks_ingress, + NFPROTO_NETDEV, NF_NETDEV_INGRESS); +} + +static inline int nf_hook_ingress(struct sk_buff *skb) +{ + struct nf_hook_state state; + + nf_hook_state_init(&state, &skb->dev->nf_hooks_ingress, + NF_NETDEV_INGRESS, INT_MIN, NFPROTO_NETDEV, NULL, + skb->dev, NULL, NULL); + return nf_hook_slow(skb, &state); +} + +static inline void nf_hook_ingress_init(struct net_device *dev) +{ + INIT_LIST_HEAD(&dev->nf_hooks_ingress); +} +#else /* CONFIG_NETFILTER_INGRESS */ +static inline int 
nf_hook_ingress_active(struct sk_buff *skb) +{ + return 0; +} + +static inline int nf_hook_ingress(struct sk_buff *skb) +{ + return 0; +} + +static inline void nf_hook_ingress_init(struct net_device *dev) {} +#endif /* CONFIG_NETFILTER_INGRESS */ +#endif /* _NETFILTER_INGRESS_H_ */ -- cgit v1.2.3 From af6c235d1a5c112964c3029eb0ed4b52c7aa33bf Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 13 May 2015 13:03:21 +0200 Subject: gpio: discourage passing base to gpio_chip Passing a fixed base in struct gpio_chip is done for legacy systems that cannot handle dynamic allocation. Discourage this behaviour in the kerneldoc. Acked-by: Alexandre Courbot Signed-off-by: Linus Walleij --- include/linux/gpio/driver.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 96a678842cde..cc7ec129b329 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -42,8 +42,12 @@ struct seq_file; * @dbg_show: optional routine to show contents in debugfs; default code * will be used when this is omitted, but custom code can show extra * state (such as pullup/pulldown configuration). - * @base: identifies the first GPIO number handled by this chip; or, if - * negative during registration, requests dynamic ID allocation. + * @base: identifies the first GPIO number handled by this chip; + * or, if negative during registration, requests dynamic ID allocation. + * DEPRECATION: providing anything non-negative and nailing the base + * base offset of GPIO chips is deprecated. Please pass -1 as base to + * let gpiolib select the chip base in all possible cases. We want to + * get rid of the static GPIO number space in the long run. * @ngpio: the number of GPIOs handled by this controller; the last GPIO * handled is (base + ngpio - 1). * @desc: array of ngpio descriptors. Private. 
-- cgit v1.2.3 From 7fb48c5bc3100f7674a8e26f42c1518196500728 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 3 May 2015 22:05:28 +0200 Subject: netfilter: bridge: neigh_head and physoutdev can't be used at same time The neigh_header is only needed when we detect DNAT after prerouting and neigh cache didn't have a mac address for us. The output port has not been chosen yet so we can re-use the storage area, bringing struct size down to 32 bytes on x86_64. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/skbuff.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c0b574a414e7..3d932e64125a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -170,12 +170,14 @@ struct nf_bridge_info { BRNF_PROTO_UNCHANGED, BRNF_PROTO_8021Q, BRNF_PROTO_PPPOE - } orig_proto; + } orig_proto:8; bool pkt_otherhost; unsigned int mask; struct net_device *physindev; - struct net_device *physoutdev; - char neigh_header[8]; + union { + struct net_device *physoutdev; + char neigh_header[8]; + }; }; #endif -- cgit v1.2.3 From a3b1c1eb50f9b3e0c73c37157d0c61b2e90ae580 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 6 May 2015 16:28:57 +0200 Subject: netfilter: ipset: deinline ip_set_put_extensions() On x86 allyesconfig build: The function compiles to 489 bytes of machine code. It has 25 callsites. text data bss dec hex filename 82441375 22255384 20627456 125324215 7784bb7 vmlinux.before 82434909 22255384 20627456 125317749 7783275 vmlinux Signed-off-by: Denys Vlasenko CC: Jozsef Kadlecsik CC: Eric W. Biederman CC: David S. 
Miller CC: Jan Engelhardt CC: Jiri Pirko CC: linux-kernel@vger.kernel.org CC: netdev@vger.kernel.org CC: netfilter-devel@vger.kernel.org Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index f88be7258e5f..ffdfdc24952a 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -533,29 +533,9 @@ bitmap_bytes(u32 a, u32 b) #include #include -static inline int +int ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, - const void *e, bool active) -{ - if (SET_WITH_TIMEOUT(set)) { - unsigned long *timeout = ext_timeout(e, set); - - if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(active ? ip_set_timeout_get(timeout) - : *timeout))) - return -EMSGSIZE; - } - if (SET_WITH_COUNTER(set) && - ip_set_put_counter(skb, ext_counter(e, set))) - return -EMSGSIZE; - if (SET_WITH_COMMENT(set) && - ip_set_put_comment(skb, ext_comment(e, set))) - return -EMSGSIZE; - if (SET_WITH_SKBINFO(set) && - ip_set_put_skbinfo(skb, ext_skbinfo(e, set))) - return -EMSGSIZE; - return 0; -} + const void *e, bool active); #define IP_SET_INIT_KEXT(skb, opt, set) \ { .bytes = (skb)->len, .packets = 1, \ -- cgit v1.2.3 From 922f2dd1b65a888e34c472979460dc23211750a2 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 12 May 2015 10:33:24 -0700 Subject: net: phy: Add phy_ignore_ta_mask to account for broken turn-around Some PHY devices/switches will not release the turn-around line as they should do at the end of a MDIO transaction. To help with such situations, allow MDIO bus drivers to be made aware of such restrictions. Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- include/linux/phy.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 685809835b5c..701c7a3946e0 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -181,6 +181,9 @@ struct mii_bus { /* PHY addresses to be ignored when probing */ u32 phy_mask; + /* PHY addresses to ignore the TA/read failure */ + u32 phy_ignore_ta_mask; + /* * Pointer to an array of interrupts, each PHY's * interrupt at the index matching its address -- cgit v1.2.3 From 8c8a457a60050d5922676f81913d87e4af6fd97b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 14 May 2015 14:31:01 +0300 Subject: sched: Remove redundant #ifdef Two adjacent members in task_struct were guarded by the same #define, so we can merge the two blocks. Signed-off-by: Nikolay Borisov Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431603061-29408-1-git-send-email-kernel@kyup.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0eceeec5a01a..5f8defa155cf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1398,8 +1398,6 @@ struct task_struct { int rcu_read_lock_nesting; union rcu_special rcu_read_unlock_special; struct list_head rcu_node_entry; -#endif /* #ifdef CONFIG_PREEMPT_RCU */ -#ifdef CONFIG_PREEMPT_RCU struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TASKS_RCU -- cgit v1.2.3 From faad38492814112e3e7ce94d90123bbe301fff33 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Sun, 10 May 2015 01:18:03 +0200 Subject: sched / idle: Call idle_set_state() from cpuidle_enter_state() Introduce a wrapper function around idle_set_state() called sched_idle_set_state() that will pass this_rq() to it as the first argument and make cpuidle_enter_state() call the new function before and after entering the target state. At the same time, remove direct invocations of idle_set_state() from call_cpuidle(). This will allow the invocation of default_idle_call() to be moved from call_cpuidle() to cpuidle_enter_state() safely and call_cpuidle() to be simplified a bit as a result. Signed-off-by: Rafael J. Wysocki Reviewed-by: Preeti U Murthy Tested-by: Preeti U Murthy Tested-by: Sudeep Holla Acked-by: Kevin Hilman --- include/linux/cpuidle.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 9c5e89254796..301eaaab40e3 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -200,6 +200,9 @@ static inline struct cpuidle_driver *cpuidle_get_cpu_driver( struct cpuidle_device *dev) {return NULL; } #endif +/* kernel/sched/idle.c */ +extern void sched_idle_set_state(struct cpuidle_state *idle_state); + #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, atomic_t *a); #else -- cgit v1.2.3 From 827a5aefc542b8fb17c00de06118e5cd0e3800f2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 10 May 2015 01:18:46 +0200 Subject: sched / idle: Call default_idle_call() from cpuidle_enter_state() The check of the cpuidle_enter() return value against -EBUSY made in call_cpuidle() will not be necessary any more if cpuidle_enter_state() calls default_idle_call() directly when it is about to return -EBUSY, so make that happen and eliminate the check. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Preeti U Murthy Tested-by: Preeti U Murthy Tested-by: Sudeep Holla Acked-by: Kevin Hilman --- include/linux/cpuidle.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 301eaaab40e3..c7a63643658e 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -202,6 +202,7 @@ static inline struct cpuidle_driver *cpuidle_get_cpu_driver( /* kernel/sched/idle.c */ extern void sched_idle_set_state(struct cpuidle_state *idle_state); +extern void default_idle_call(void); #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, atomic_t *a); -- cgit v1.2.3 From 4573237b01221881702fbe6655f3ae5135be1c18 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 12 May 2015 12:22:34 +0530 Subject: cpufreq: Manage governor usage history with 'policy->last_governor' History of which governor was used last is common to all CPUs within a policy and maintaining it per-cpu isn't the best approach for sure. Apart from wasting memory, this also increases the complexity of managing this data structure as it has to be updated for all CPUs. To make that somewhat simpler, let's store this information in a new field 'last_governor' in struct cpufreq_policy and update it on removal of last cpu of a policy. As a side-effect it also solves an old problem. Consider a system with two clusters 0 & 1. And there is one policy per cluster. Cluster 0: CPU0 and 1. Cluster 1: CPU2 and 3. - CPU2 is first brought online, and governor is set to performance (default as cpufreq_cpu_governor wasn't set). - Governor is changed to ondemand. - CPU2 is taken offline and cpufreq_cpu_governor is updated for CPU2. - CPU3 is brought online. - Because cpufreq_cpu_governor wasn't set for CPU3, the default governor performance is picked for CPU3. This patch fixes the bug as we now have a single variable to update for policy. 
Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 2ee4888c1f47..48e37c07eb84 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -80,6 +80,7 @@ struct cpufreq_policy { struct cpufreq_governor *governor; /* see below */ void *governor_data; bool governor_enabled; /* governor start/stop flag */ + char last_governor[CPUFREQ_NAME_LEN]; /* last governor used */ struct work_struct update; /* if update_policy() needs to be * called, but you're in IRQ context */ -- cgit v1.2.3 From a4afd37b26f4b9f640310a89b7f8d176ae3460b1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 13 May 2015 13:12:43 +0200 Subject: test_bpf: add tests related to BPF_MAXINSNS Couple of torture test cases related to the bug fixed in 0b59d8806a31 ("ARM: net: delegate filter to kernel interpreter when imm_offset() return value can't fit into 12bits."). I've added a helper to allocate and fill the insn space. 
Output on x86_64 from my laptop: test_bpf: #233 BPF_MAXINSNS: Maximum possible literals jited:0 7 PASS test_bpf: #234 BPF_MAXINSNS: Single literal jited:0 8 PASS test_bpf: #235 BPF_MAXINSNS: Run/add until end jited:0 11553 PASS test_bpf: #236 BPF_MAXINSNS: Too many instructions PASS test_bpf: #237 BPF_MAXINSNS: Very long jump jited:0 9 PASS test_bpf: #238 BPF_MAXINSNS: Ctx heavy transformations jited:0 20329 20398 PASS test_bpf: #239 BPF_MAXINSNS: Call heavy transformations jited:0 32178 32475 PASS test_bpf: #240 BPF_MAXINSNS: Jump heavy test jited:0 10518 PASS test_bpf: #233 BPF_MAXINSNS: Maximum possible literals jited:1 4 PASS test_bpf: #234 BPF_MAXINSNS: Single literal jited:1 4 PASS test_bpf: #235 BPF_MAXINSNS: Run/add until end jited:1 1625 PASS test_bpf: #236 BPF_MAXINSNS: Too many instructions PASS test_bpf: #237 BPF_MAXINSNS: Very long jump jited:1 8 PASS test_bpf: #238 BPF_MAXINSNS: Ctx heavy transformations jited:1 3301 3174 PASS test_bpf: #239 BPF_MAXINSNS: Call heavy transformations jited:1 24107 23491 PASS test_bpf: #240 BPF_MAXINSNS: Jump heavy test jited:1 8651 PASS Signed-off-by: Daniel Borkmann Cc: Alexei Starovoitov Cc: Nicolas Schichan Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/filter.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index ce1d72d34382..200be4a74a33 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -277,6 +277,14 @@ struct bpf_prog_aux; .off = 0, \ .imm = 0 }) +/* Internal classic blocks for direct assignment */ + +#define __BPF_STMT(CODE, K) \ + ((struct sock_filter) BPF_STMT(CODE, K)) + +#define __BPF_JUMP(CODE, K, JT, JF) \ + ((struct sock_filter) BPF_JUMP(CODE, K, JT, JF)) + #define bytes_to_bpf_size(bytes) \ ({ \ int bpf_size = -EINVAL; \ -- cgit v1.2.3 From ef7f3a5c7149ad2dbd1d8a71d0aa88a02d1dbcb8 Mon Sep 17 00:00:00 2001 From: Bert Vermeulen Date: Wed, 13 May 2015 13:35:39 +0200 Subject: mdio-gpio: Propagate mii_bus.phy_ignore_ta_mask This also changes mii_bus.phy_mask to u32 for consistency. Signed-off-by: Bert Vermeulen Signed-off-by: David S. Miller --- include/linux/mdio-gpio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mdio-gpio.h b/include/linux/mdio-gpio.h index 66c30a763b10..11f00cdabe3d 100644 --- a/include/linux/mdio-gpio.h +++ b/include/linux/mdio-gpio.h @@ -23,7 +23,8 @@ struct mdio_gpio_platform_data { bool mdio_active_low; bool mdo_active_low; - unsigned int phy_mask; + u32 phy_mask; + u32 phy_ignore_ta_mask; int irqs[PHY_MAX_ADDR]; /* reset callback */ int (*reset)(struct mii_bus *bus); -- cgit v1.2.3 From 8fa9dd24667f2d6997ec21341019657342859d31 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 23 Mar 2015 13:37:40 +1100 Subject: VFS/namei: make the use of touch_atime() in get_link() RCU-safe. touch_atime is not RCU-safe, and so cannot be called on an RCU walk. However, in situations where RCU-walk makes a difference, the symlink will likely be accessed much more often than it is useful to update the atime. 
So split out the test of "Does the atime actually need to be updated" into atime_needs_update(), and have get_link() unlazy if it finds that it will need to do that update. Signed-off-by: NeilBrown Signed-off-by: Al Viro --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 8f738512c874..1426c435d455 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1880,6 +1880,7 @@ enum file_time_flags { S_VERSION = 8, }; +extern bool atime_needs_update(const struct path *, struct inode *); extern void touch_atime(const struct path *); static inline void file_accessed(struct file *file) { -- cgit v1.2.3 From 89076bc31950eee576ecc06460c23466e2d50939 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 12 May 2015 08:29:38 -0400 Subject: get rid of assorted nameidata-related debris pointless forward declarations, stale comments Signed-off-by: Al Viro --- include/linux/fs.h | 1 - include/linux/namei.h | 1 - include/linux/sched.h | 1 + 3 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1426c435d455..b577e801b4af 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -38,7 +38,6 @@ struct backing_dev_info; struct export_operations; struct hd_geometry; struct iovec; -struct nameidata; struct kiocb; struct kobject; struct pipe_inode_info; diff --git a/include/linux/namei.h b/include/linux/namei.h index d756304aa09b..1208e489f83e 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -7,7 +7,6 @@ #include struct vfsmount; -struct nameidata; enum { MAX_NESTED_LINKS = 8 }; diff --git a/include/linux/sched.h b/include/linux/sched.h index f6c9b69d66f2..a1158c954f0f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -132,6 +132,7 @@ struct fs_struct; struct perf_event_context; struct blk_plug; struct filename; +struct nameidata; #define VMACACHE_BITS 2 #define VMACACHE_SIZE (1U 
<< VMACACHE_BITS) -- cgit v1.2.3 From b853a16176cf3e02c57e215743015614152c2428 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 13 May 2015 09:12:02 -0400 Subject: turn user_{path_at,path,lpath,path_dir}() into static inlines Signed-off-by: Al Viro --- include/linux/namei.h | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/namei.h b/include/linux/namei.h index 1208e489f83e..d8c6334cd150 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -1,12 +1,10 @@ #ifndef _LINUX_NAMEI_H #define _LINUX_NAMEI_H -#include -#include -#include +#include #include - -struct vfsmount; +#include +#include enum { MAX_NESTED_LINKS = 8 }; @@ -46,13 +44,29 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_ROOT 0x2000 #define LOOKUP_EMPTY 0x4000 -extern int user_path_at(int, const char __user *, unsigned, struct path *); extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty); -#define user_path(name, path) user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW, path) -#define user_lpath(name, path) user_path_at(AT_FDCWD, name, 0, path) -#define user_path_dir(name, path) \ - user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, path) +static inline int user_path_at(int dfd, const char __user *name, unsigned flags, + struct path *path) +{ + return user_path_at_empty(dfd, name, flags, path, NULL); +} + +static inline int user_path(const char __user *name, struct path *path) +{ + return user_path_at_empty(AT_FDCWD, name, LOOKUP_FOLLOW, path, NULL); +} + +static inline int user_lpath(const char __user *name, struct path *path) +{ + return user_path_at_empty(AT_FDCWD, name, 0, path, NULL); +} + +static inline int user_path_dir(const char __user *name, struct path *path) +{ + return user_path_at_empty(AT_FDCWD, name, + LOOKUP_FOLLOW | LOOKUP_DIRECTORY, path, NULL); +} extern int kern_path(const char *, unsigned, struct path 
*); -- cgit v1.2.3 From 55917a21d0cc012bb6073bb05bb768fd51d8e237 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 14 May 2015 14:57:23 +0200 Subject: netfilter: x_tables: add context to know if extension runs from nft_compat Currently, we have four xtables extensions that cannot be used from the xt over nft compat layer. The problem is that they need real access to the full blown xt_entry to validate that the rule comes with the right dependencies. This check was introduced to overcome the lack of sufficient userspace dependency validation in iptables. To resolve this problem, this patch introduces a new field to the xt_tgchk_param structure that tell us if the extension is run from nft_compat context. The three affected extensions are: 1) CLUSTERIP, this target has been superseded by xt_cluster. So just bail out by returning -EINVAL. 2) TCPMSS. Relax the checking when used from nft_compat. If used with the wrong configuration, it will corrupt !syn packets by adding TCP MSS option. 3) ebt_stp. Relax the check to make sure it uses the reserved destination MAC address for STP. 
Signed-off-by: Pablo Neira Ayuso Tested-by: Arturo Borrero Gonzalez --- include/linux/netfilter/x_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index a3e215bb0241..09f38206c18f 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -62,6 +62,7 @@ struct xt_mtchk_param { void *matchinfo; unsigned int hook_mask; u_int8_t family; + bool nft_compat; }; /** @@ -92,6 +93,7 @@ struct xt_tgchk_param { void *targinfo; unsigned int hook_mask; u_int8_t family; + bool nft_compat; }; /* Target destructor parameters */ -- cgit v1.2.3 From 3f7f642b9bc46453e1435e8b67f1c4f7949be7ff Mon Sep 17 00:00:00 2001 From: Martin Fuzzey Date: Wed, 13 May 2015 12:26:42 +0200 Subject: iio: core: add high pass filter attributes Add a high pass filter attribute for measurements (like the existing low pass) Also add both high and low pass attributes for events. Signed-off-by: Martin Fuzzey Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 1 + include/linux/iio/types.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 058441da4984..f79148261d16 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -32,6 +32,7 @@ enum iio_chan_info_enum { IIO_CHAN_INFO_QUADRATURE_CORRECTION_RAW, IIO_CHAN_INFO_AVERAGE_RAW, IIO_CHAN_INFO_LOW_PASS_FILTER_3DB_FREQUENCY, + IIO_CHAN_INFO_HIGH_PASS_FILTER_3DB_FREQUENCY, IIO_CHAN_INFO_SAMP_FREQ, IIO_CHAN_INFO_FREQUENCY, IIO_CHAN_INFO_PHASE, diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index 942b6de68e2f..32b579525004 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -17,6 +17,8 @@ enum iio_event_info { IIO_EV_INFO_VALUE, IIO_EV_INFO_HYSTERESIS, IIO_EV_INFO_PERIOD, + IIO_EV_INFO_HIGH_PASS_FILTER_3DB, + IIO_EV_INFO_LOW_PASS_FILTER_3DB, }; #define IIO_VAL_INT 1 -- 
cgit v1.2.3 From c91d46065209bfe6124396fc409765ced0601b59 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 05:48:07 -0700 Subject: net: fix two sparse errors First one in __skb_checksum_validate_complete() fixes the following (and other callers) make C=2 CF=-D__CHECK_ENDIAN__ net/ipv4/tcp_ipv4.o CHECK net/ipv4/tcp_ipv4.c include/linux/skbuff.h:3052:24: warning: incorrect type in return expression (different base types) include/linux/skbuff.h:3052:24: expected restricted __sum16 include/linux/skbuff.h:3052:24: got int Second is fixing gso_make_checksum() : CHECK net/ipv4/gre_offload.c include/linux/skbuff.h:3360:14: warning: incorrect type in assignment (different base types) include/linux/skbuff.h:3360:14: expected unsigned short [unsigned] [usertype] csum include/linux/skbuff.h:3360:14: got restricted __sum16 include/linux/skbuff.h:3365:16: warning: incorrect type in return expression (different base types) include/linux/skbuff.h:3365:16: expected restricted __sum16 include/linux/skbuff.h:3365:16: got unsigned short [unsigned] [usertype] csum Fixes: 5a21232983aa7 ("net: Support for csum_bad in skbuff") Fixes: 7e2b10c1e52ca ("net: Support for multiple checksums with gso") Signed-off-by: Eric Dumazet CC: Tom Herbert Acked-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f83aa6568cbf..b57eebfb67e0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3051,7 +3051,7 @@ static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb, } } else if (skb->csum_bad) { /* ip_summed == CHECKSUM_NONE in this case */ - return 1; + return (__force __sum16)1; } skb->csum = psum; @@ -3353,15 +3353,14 @@ static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra) static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res) { int plen = SKB_GSO_CB(skb)->csum_start - skb_headroom(skb) - - skb_transport_offset(skb); - __u16 csum; + skb_transport_offset(skb); + __wsum partial; - csum = csum_fold(csum_partial(skb_transport_header(skb), - plen, skb->csum)); + partial = csum_partial(skb_transport_header(skb), plen, skb->csum); skb->csum = res; SKB_GSO_CB(skb)->csum_start -= plen; - return csum; + return csum_fold(partial); } static inline bool skb_is_gso(const struct sk_buff *skb) -- cgit v1.2.3 From e1031dc1f7ba5c8724ba211062134076df292791 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Thu, 7 May 2015 17:38:07 +0200 Subject: dmaengine: Support different source and destination stride In interleaved mode, we can expect to have different source and destination strides. Add support for such case to dmaengine. Signed-off-by: Maxime Ripard Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index ad419757241f..5d63acb09813 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -122,10 +122,18 @@ enum dma_transfer_direction { * chunk and before first src/dst address for next chunk. * Ignored for dst(assumed 0), if dst_inc is true and dst_sgl is false. 
* Ignored for src(assumed 0), if src_inc is true and src_sgl is false. + * @dst_icg: Number of bytes to jump after last dst address of this + * chunk and before the first dst address for next chunk. + * Ignored if dst_inc is true and dst_sgl is false. + * @src_icg: Number of bytes to jump after last src address of this + * chunk and before the first src address for next chunk. + * Ignored if src_inc is true and src_sgl is false. */ struct data_chunk { size_t size; size_t icg; + size_t dst_icg; + size_t src_icg; }; /** -- cgit v1.2.3 From 4cfafd3082afc707653aeb82e9f8e7b596fbbfd6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 May 2015 12:23:11 +0200 Subject: sched,perf: Fix periodic timers In the below two commits (see Fixes) we have periodic timers that can stop themselves when they're no longer required, but need to be (re)-started when their idle condition changes. A further complication is that we want the timer handler to always do the forward such that it will always correctly deal with the overruns, and we do not want to race such that the handler has already decided to stop, but the (external) restart sees the timer still active and we end up with a 'lost' timer. The problem with the current code is that the re-start can come before the callback does the forward, at which point the forward from the callback will WARN about forwarding an enqueued timer. Now, conceptually it's easy to detect if you're before or after the fwd by comparing the expiration time against the current time. Of course, that's expensive (and racy) because we don't have the current time. Alternatively one could cache this state inside the timer, but then everybody pays the overhead of maintaining this extra state, and that is undesired. The only other option that I could see is the external timer_active variable, which I tried to kill before. I would love a nicer interface for this seemingly simple 'problem' but alas. 
Fixes: 272325c4821f ("perf: Fix mux_interval hrtimer wreckage") Fixes: 77a4d1a1b9a1 ("sched: Cleanup bandwidth timers") Cc: pjt@google.com Cc: tglx@linutronix.de Cc: klamm@yandex-team.ru Cc: mingo@kernel.org Cc: bsegall@google.com Cc: hpa@zytor.com Cc: Sasha Levin Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20150514102311.GX21418@twins.programming.kicks-ass.net --- include/linux/perf_event.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 61992cf2e977..cf3342a8ad80 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -566,8 +566,12 @@ struct perf_cpu_context { struct perf_event_context *task_ctx; int active_oncpu; int exclusive; + + raw_spinlock_t hrtimer_lock; struct hrtimer hrtimer; ktime_t hrtimer_interval; + unsigned int hrtimer_active; + struct pmu *unique_pmu; struct perf_cgroup *cgrp; }; -- cgit v1.2.3 From 5f22f5c668204f3af7557018b2ad6cf2074defac Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Sat, 16 May 2015 11:44:13 +0200 Subject: irqdomain: Add non-hierarchy helper irq_domain_set_info This adds the helper irq_domain_set_info() in a non-domain hierarchy variant. This allows to use the helper for generic chip since not all chips using generic chip support domain hierarchy. 
Signed-off-by: Stefan Agner Cc: marc.zyngier@arm.com Cc: linux@arm.linux.org.uk Cc: u.kleine-koenig@pengutronix.de Cc: olof@lixom.net Cc: arnd@arndb.de Cc: daniel.lezcano@linaro.org Cc: mark.rutland@arm.com Cc: pawel.moll@arm.com Cc: robh+dt@kernel.org Cc: ijc+devicetree@hellion.org.uk Cc: galak@codeaurora.org Cc: mcoquelin.stm32@gmail.com Cc: linux-arm-kernel@lists.infradead.org Cc: shawn.guo@linaro.org Cc: kernel@pengutronix.de Cc: jason@lakedaemon.net Link: http://lkml.kernel.org/r/1431769465-26867-2-git-send-email-stefan@agner.ch Signed-off-by: Thomas Gleixner --- include/linux/irqdomain.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 676d7306a360..744ac0ec98eb 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -258,6 +258,10 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr, /* V2 interfaces to support hierarchy IRQ domains. 
*/ extern struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, unsigned int virq); +extern void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq, struct irq_chip *chip, + void *chip_data, irq_flow_handler_t handler, + void *handler_data, const char *handler_name); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY extern struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent, unsigned int flags, unsigned int size, @@ -281,10 +285,6 @@ extern int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, irq_hw_number_t hwirq, struct irq_chip *chip, void *chip_data); -extern void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq, struct irq_chip *chip, - void *chip_data, irq_flow_handler_t handler, - void *handler_data, const char *handler_name); extern void irq_domain_reset_irq_data(struct irq_data *irq_data); extern void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq, -- cgit v1.2.3 From 3cfeffc265791bc953527458e0a44ea77c459340 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Sat, 16 May 2015 11:44:14 +0200 Subject: genirq: Add irq_chip_(enable/disable)_parent Add helper irq_chip_enable_parent and irq_chip_disable_parent. The helper implement the default behavior in case irq_enable or irq_disable is not implemented for the parent interrupt chip, which is calling the irq_mask or irq_unmask respectively. 
Signed-off-by: Stefan Agner Cc: marc.zyngier@arm.com Cc: linux@arm.linux.org.uk Cc: u.kleine-koenig@pengutronix.de Cc: olof@lixom.net Cc: arnd@arndb.de Cc: daniel.lezcano@linaro.org Cc: mark.rutland@arm.com Cc: pawel.moll@arm.com Cc: robh+dt@kernel.org Cc: ijc+devicetree@hellion.org.uk Cc: galak@codeaurora.org Cc: mcoquelin.stm32@gmail.com Cc: linux-arm-kernel@lists.infradead.org Cc: shawn.guo@linaro.org Cc: kernel@pengutronix.de Cc: jason@lakedaemon.net Link: http://lkml.kernel.org/r/1431769465-26867-3-git-send-email-stefan@agner.ch Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 62c6901cab55..2633061364b1 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -458,6 +458,8 @@ extern void handle_nested_irq(unsigned int irq); extern int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +extern void irq_chip_enable_parent(struct irq_data *data); +extern void irq_chip_disable_parent(struct irq_data *data); extern void irq_chip_ack_parent(struct irq_data *data); extern int irq_chip_retrigger_hierarchy(struct irq_data *data); extern void irq_chip_mask_parent(struct irq_data *data); -- cgit v1.2.3 From b4a04ab7a37b490cad48e69abfe14288cacb669c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 May 2015 15:38:40 -0400 Subject: cgroup: separate out include/linux/cgroup-defs.h From 2d728f74bfc071df06773e2fd7577dd5dab6425d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 May 2015 15:37:01 -0400 This patch separates out cgroup-defs.h from cgroup.h which has grown a lot of dependencies. cgroup-defs.h currently only contains constant and type definitions and can be used to break circular include dependency. While moving, definitions are reordered so that cgroup-defs.h has consistent logical structure. This patch is pure reorganization. 
Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 464 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/cgroup.h | 455 +------------------------------------------ 2 files changed, 466 insertions(+), 453 deletions(-) create mode 100644 include/linux/cgroup-defs.h (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h new file mode 100644 index 000000000000..55f3120fb952 --- /dev/null +++ b/include/linux/cgroup-defs.h @@ -0,0 +1,464 @@ +/* + * linux/cgroup-defs.h - basic definitions for cgroup + * + * This file provides basic type and interface. Include this file directly + * only if necessary to avoid cyclic dependencies. + */ +#ifndef _LINUX_CGROUP_DEFS_H +#define _LINUX_CGROUP_DEFS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_CGROUPS + +struct cgroup; +struct cgroup_root; +struct cgroup_subsys; +struct cgroup_taskset; +struct kernfs_node; +struct kernfs_ops; +struct kernfs_open_file; + +#define MAX_CGROUP_TYPE_NAMELEN 32 +#define MAX_CGROUP_ROOT_NAMELEN 64 +#define MAX_CFTYPE_NAME 64 + +/* define the enumeration of all cgroup subsystems */ +#define SUBSYS(_x) _x ## _cgrp_id, +enum cgroup_subsys_id { +#include + CGROUP_SUBSYS_COUNT, +}; +#undef SUBSYS + +/* bits in struct cgroup_subsys_state flags field */ +enum { + CSS_NO_REF = (1 << 0), /* no reference counting for this css */ + CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ + CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ +}; + +/* bits in struct cgroup flags field */ +enum { + /* Control Group requires release notifications to userspace */ + CGRP_NOTIFY_ON_RELEASE, + /* + * Clone the parent's configuration when creating a new child + * cpuset cgroup. For historical reasons, this option can be + * specified at mount time and thus is implemented here. 
+ */ + CGRP_CPUSET_CLONE_CHILDREN, +}; + +/* cgroup_root->flags */ +enum { + CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */ + CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ + CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ +}; + +/* cftype->flags */ +enum { + CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ + CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ + CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ + + /* internal flags, do not use outside cgroup core proper */ + __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ + __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ +}; + +/* + * Per-subsystem/per-cgroup state maintained by the system. This is the + * fundamental structural building block that controllers deal with. + * + * Fields marked with "PI:" are public and immutable and may be accessed + * directly without synchronization. + */ +struct cgroup_subsys_state { + /* PI: the cgroup that this css is attached to */ + struct cgroup *cgroup; + + /* PI: the cgroup subsystem that this css is attached to */ + struct cgroup_subsys *ss; + + /* reference count - access via css_[try]get() and css_put() */ + struct percpu_ref refcnt; + + /* PI: the parent css */ + struct cgroup_subsys_state *parent; + + /* siblings list anchored at the parent's ->children */ + struct list_head sibling; + struct list_head children; + + /* + * PI: Subsys-unique ID. 0 is unused and root is always 1. The + * matching css can be looked up using css_from_id(). + */ + int id; + + unsigned int flags; + + /* + * Monotonically increasing unique serial number which defines a + * uniform order among all csses. It's guaranteed that all + * ->children lists are in the ascending order of ->serial_nr and + * used to allow interrupting and resuming iterations. 
+ */ + u64 serial_nr; + + /* percpu_ref killing and RCU release */ + struct rcu_head rcu_head; + struct work_struct destroy_work; +}; + +/* + * A css_set is a structure holding pointers to a set of + * cgroup_subsys_state objects. This saves space in the task struct + * object and speeds up fork()/exit(), since a single inc/dec and a + * list_add()/del() can bump the reference count on the entire cgroup + * set for a task. + */ +struct css_set { + /* Reference count */ + atomic_t refcount; + + /* + * List running through all cgroup groups in the same hash + * slot. Protected by css_set_lock + */ + struct hlist_node hlist; + + /* + * Lists running through all tasks using this cgroup group. + * mg_tasks lists tasks which belong to this cset but are in the + * process of being migrated out or in. Protected by + * css_set_rwsem, but, during migration, once tasks are moved to + * mg_tasks, it can be read safely while holding cgroup_mutex. + */ + struct list_head tasks; + struct list_head mg_tasks; + + /* + * List of cgrp_cset_links pointing at cgroups referenced from this + * css_set. Protected by css_set_lock. + */ + struct list_head cgrp_links; + + /* the default cgroup associated with this css_set */ + struct cgroup *dfl_cgrp; + + /* + * Set of subsystem states, one for each subsystem. This array is + * immutable after creation apart from the init_css_set during + * subsystem registration (at boot time). + */ + struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; + + /* + * List of csets participating in the on-going migration either as + * source or destination. Protected by cgroup_mutex. + */ + struct list_head mg_preload_node; + struct list_head mg_node; + + /* + * If this cset is acting as the source of migration the following + * two fields are set. mg_src_cgrp is the source cgroup of the + * on-going migration and mg_dst_cset is the destination cset the + * target tasks on this cset should be migrated to. Protected by + * cgroup_mutex. 
+ */ + struct cgroup *mg_src_cgrp; + struct css_set *mg_dst_cset; + + /* + * On the default hierarhcy, ->subsys[ssid] may point to a css + * attached to an ancestor instead of the cgroup this css_set is + * associated with. The following node is anchored at + * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to + * iterate through all css's attached to a given cgroup. + */ + struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; + + /* For RCU-protected deletion */ + struct rcu_head rcu_head; +}; + +struct cgroup { + /* self css with NULL ->ss, points back to this cgroup */ + struct cgroup_subsys_state self; + + unsigned long flags; /* "unsigned long" so bitops work */ + + /* + * idr allocated in-hierarchy ID. + * + * ID 0 is not used, the ID of the root cgroup is always 1, and a + * new cgroup will be assigned with a smallest available ID. + * + * Allocating/Removing ID must be protected by cgroup_mutex. + */ + int id; + + /* + * If this cgroup contains any tasks, it contributes one to + * populated_cnt. All children with non-zero popuplated_cnt of + * their own contribute one. The count is zero iff there's no task + * in this cgroup or its subtree. + */ + int populated_cnt; + + struct kernfs_node *kn; /* cgroup kernfs entry */ + struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ + + /* + * The bitmask of subsystems enabled on the child cgroups. + * ->subtree_control is the one configured through + * "cgroup.subtree_control" while ->child_subsys_mask is the + * effective one which may have more subsystems enabled. + * Controller knobs are made available iff it's enabled in + * ->subtree_control. + */ + unsigned int subtree_control; + unsigned int child_subsys_mask; + + /* Private pointers for each registered subsystem */ + struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; + + struct cgroup_root *root; + + /* + * List of cgrp_cset_links pointing at css_sets with tasks in this + * cgroup. Protected by css_set_lock. 
+ */ + struct list_head cset_links; + + /* + * On the default hierarchy, a css_set for a cgroup with some + * susbsys disabled will point to css's which are associated with + * the closest ancestor which has the subsys enabled. The + * following lists all css_sets which point to this cgroup's css + * for the given subsystem. + */ + struct list_head e_csets[CGROUP_SUBSYS_COUNT]; + + /* + * list of pidlists, up to two for each namespace (one for procs, one + * for tasks); created on demand. + */ + struct list_head pidlists; + struct mutex pidlist_mutex; + + /* used to wait for offlining of csses */ + wait_queue_head_t offline_waitq; + + /* used to schedule release agent */ + struct work_struct release_agent_work; +}; + +/* + * A cgroup_root represents the root of a cgroup hierarchy, and may be + * associated with a kernfs_root to form an active hierarchy. This is + * internal to cgroup core. Don't access directly from controllers. + */ +struct cgroup_root { + struct kernfs_root *kf_root; + + /* The bitmask of subsystems attached to this hierarchy */ + unsigned int subsys_mask; + + /* Unique id for this hierarchy. */ + int hierarchy_id; + + /* The root cgroup. Root is destroyed on its release. */ + struct cgroup cgrp; + + /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ + atomic_t nr_cgrps; + + /* A list running through the active hierarchies */ + struct list_head root_list; + + /* Hierarchy-specific flags */ + unsigned int flags; + + /* IDs for cgroups in this hierarchy */ + struct idr cgroup_idr; + + /* The path to use for release notifications. 
*/ + char release_agent_path[PATH_MAX]; + + /* The name for this hierarchy - may be empty */ + char name[MAX_CGROUP_ROOT_NAMELEN]; +}; + +/* + * struct cftype: handler definitions for cgroup control files + * + * When reading/writing to a file: + * - the cgroup to use is file->f_path.dentry->d_parent->d_fsdata + * - the 'cftype' of the file is file->f_path.dentry->d_fsdata + */ +struct cftype { + /* + * By convention, the name should begin with the name of the + * subsystem, followed by a period. Zero length string indicates + * end of cftype array. + */ + char name[MAX_CFTYPE_NAME]; + int private; + /* + * If not 0, file mode is set to this value, otherwise it will + * be figured out automatically + */ + umode_t mode; + + /* + * The maximum length of string, excluding trailing nul, that can + * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed. + */ + size_t max_write_len; + + /* CFTYPE_* flags */ + unsigned int flags; + + /* + * Fields used for internal bookkeeping. Initialized automatically + * during registration. + */ + struct cgroup_subsys *ss; /* NULL for cgroup core files */ + struct list_head node; /* anchored at ss->cfts */ + struct kernfs_ops *kf_ops; + + /* + * read_u64() is a shortcut for the common case of returning a + * single integer. Use it in place of read() + */ + u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); + /* + * read_s64() is a signed version of read_u64() + */ + s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); + + /* generic seq_file read interface */ + int (*seq_show)(struct seq_file *sf, void *v); + + /* optional ops, implement all or none */ + void *(*seq_start)(struct seq_file *sf, loff_t *ppos); + void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); + void (*seq_stop)(struct seq_file *sf, void *v); + + /* + * write_u64() is a shortcut for the common case of accepting + * a single integer (as parsed by simple_strtoull) from + * userspace. 
Use in place of write(); return 0 or error. + */ + int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val); + /* + * write_s64() is a signed version of write_u64() + */ + int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, + s64 val); + + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. + * Maximum write size is determined by ->max_write_len. Use + * of_css/cft() to access the associated css and cft. + */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lock_class_key lockdep_key; +#endif +}; + +/* + * Control Group subsystem type. + * See Documentation/cgroups/cgroups.txt for details + */ +struct cgroup_subsys { + struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); + int (*css_online)(struct cgroup_subsys_state *css); + void (*css_offline)(struct cgroup_subsys_state *css); + void (*css_released)(struct cgroup_subsys_state *css); + void (*css_free)(struct cgroup_subsys_state *css); + void (*css_reset)(struct cgroup_subsys_state *css); + void (*css_e_css_changed)(struct cgroup_subsys_state *css); + + int (*can_attach)(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset); + void (*cancel_attach)(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset); + void (*attach)(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset); + void (*fork)(struct task_struct *task); + void (*exit)(struct cgroup_subsys_state *css, + struct cgroup_subsys_state *old_css, + struct task_struct *task); + void (*bind)(struct cgroup_subsys_state *root_css); + + int disabled; + int early_init; + + /* + * If %false, this subsystem is properly hierarchical - + * configuration, resource accounting and restriction on a parent + * cgroup cover those of its children. 
If %true, hierarchy support + * is broken in some ways - some subsystems ignore hierarchy + * completely while others are only implemented half-way. + * + * It's now disallowed to create nested cgroups if the subsystem is + * broken and cgroup core will emit a warning message on such + * cases. Eventually, all subsystems will be made properly + * hierarchical and this will go away. + */ + bool broken_hierarchy; + bool warned_broken_hierarchy; + + /* the following two fields are initialized automtically during boot */ + int id; + const char *name; + + /* link to parent, protected by cgroup_lock() */ + struct cgroup_root *root; + + /* idr for css->id */ + struct idr css_idr; + + /* + * List of cftypes. Each entry is the first entry of an array + * terminated by zero length name. + */ + struct list_head cfts; + + /* + * Base cftypes which are automatically registered. The two can + * point to the same array. + */ + struct cftype *dfl_cftypes; /* for the default hierarchy */ + struct cftype *legacy_cftypes; /* for the legacy hierarchies */ + + /* + * A subsystem may depend on other subsystems. When such subsystem + * is enabled on a cgroup, the depended-upon subsystems are enabled + * together if available. Subsystems enabled due to dependency are + * not visible to userland until explicitly enabled. The following + * specifies the mask of subsystems that this one depends on. 
+ */ + unsigned int depends_on; +}; + +#endif /* CONFIG_CGROUPS */ +#endif /* _LINUX_CGROUP_DEFS_H */ diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b9cb94c3102a..96a2ecd5aa69 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -11,23 +11,16 @@ #include #include #include -#include #include #include #include -#include -#include #include -#include #include #include -#include -#ifdef CONFIG_CGROUPS +#include -struct cgroup_root; -struct cgroup_subsys; -struct cgroup; +#ifdef CONFIG_CGROUPS extern int cgroup_init_early(void); extern int cgroup_init(void); @@ -40,66 +33,6 @@ extern int cgroupstats_build(struct cgroupstats *stats, extern int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); -/* define the enumeration of all cgroup subsystems */ -#define SUBSYS(_x) _x ## _cgrp_id, -enum cgroup_subsys_id { -#include - CGROUP_SUBSYS_COUNT, -}; -#undef SUBSYS - -/* - * Per-subsystem/per-cgroup state maintained by the system. This is the - * fundamental structural building block that controllers deal with. - * - * Fields marked with "PI:" are public and immutable and may be accessed - * directly without synchronization. - */ -struct cgroup_subsys_state { - /* PI: the cgroup that this css is attached to */ - struct cgroup *cgroup; - - /* PI: the cgroup subsystem that this css is attached to */ - struct cgroup_subsys *ss; - - /* reference count - access via css_[try]get() and css_put() */ - struct percpu_ref refcnt; - - /* PI: the parent css */ - struct cgroup_subsys_state *parent; - - /* siblings list anchored at the parent's ->children */ - struct list_head sibling; - struct list_head children; - - /* - * PI: Subsys-unique ID. 0 is unused and root is always 1. The - * matching css can be looked up using css_from_id(). - */ - int id; - - unsigned int flags; - - /* - * Monotonically increasing unique serial number which defines a - * uniform order among all csses. 
It's guaranteed that all - * ->children lists are in the ascending order of ->serial_nr and - * used to allow interrupting and resuming iterations. - */ - u64 serial_nr; - - /* percpu_ref killing and RCU release */ - struct rcu_head rcu_head; - struct work_struct destroy_work; -}; - -/* bits in struct cgroup_subsys_state flags field */ -enum { - CSS_NO_REF = (1 << 0), /* no reference counting for this css */ - CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ - CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ -}; - /** * css_get - obtain a reference on the specified css * @css: target css @@ -185,307 +118,6 @@ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n) percpu_ref_put_many(&css->refcnt, n); } -/* bits in struct cgroup flags field */ -enum { - /* Control Group requires release notifications to userspace */ - CGRP_NOTIFY_ON_RELEASE, - /* - * Clone the parent's configuration when creating a new child - * cpuset cgroup. For historical reasons, this option can be - * specified at mount time and thus is implemented here. - */ - CGRP_CPUSET_CLONE_CHILDREN, -}; - -struct cgroup { - /* self css with NULL ->ss, points back to this cgroup */ - struct cgroup_subsys_state self; - - unsigned long flags; /* "unsigned long" so bitops work */ - - /* - * idr allocated in-hierarchy ID. - * - * ID 0 is not used, the ID of the root cgroup is always 1, and a - * new cgroup will be assigned with a smallest available ID. - * - * Allocating/Removing ID must be protected by cgroup_mutex. - */ - int id; - - /* - * If this cgroup contains any tasks, it contributes one to - * populated_cnt. All children with non-zero popuplated_cnt of - * their own contribute one. The count is zero iff there's no task - * in this cgroup or its subtree. 
- */ - int populated_cnt; - - struct kernfs_node *kn; /* cgroup kernfs entry */ - struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ - - /* - * The bitmask of subsystems enabled on the child cgroups. - * ->subtree_control is the one configured through - * "cgroup.subtree_control" while ->child_subsys_mask is the - * effective one which may have more subsystems enabled. - * Controller knobs are made available iff it's enabled in - * ->subtree_control. - */ - unsigned int subtree_control; - unsigned int child_subsys_mask; - - /* Private pointers for each registered subsystem */ - struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; - - struct cgroup_root *root; - - /* - * List of cgrp_cset_links pointing at css_sets with tasks in this - * cgroup. Protected by css_set_lock. - */ - struct list_head cset_links; - - /* - * On the default hierarchy, a css_set for a cgroup with some - * susbsys disabled will point to css's which are associated with - * the closest ancestor which has the subsys enabled. The - * following lists all css_sets which point to this cgroup's css - * for the given subsystem. - */ - struct list_head e_csets[CGROUP_SUBSYS_COUNT]; - - /* - * list of pidlists, up to two for each namespace (one for procs, one - * for tasks); created on demand. - */ - struct list_head pidlists; - struct mutex pidlist_mutex; - - /* used to wait for offlining of csses */ - wait_queue_head_t offline_waitq; - - /* used to schedule release agent */ - struct work_struct release_agent_work; -}; - -#define MAX_CGROUP_ROOT_NAMELEN 64 - -/* cgroup_root->flags */ -enum { - CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */ - CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ - CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ -}; - -/* - * A cgroup_root represents the root of a cgroup hierarchy, and may be - * associated with a kernfs_root to form an active hierarchy. 
This is - * internal to cgroup core. Don't access directly from controllers. - */ -struct cgroup_root { - struct kernfs_root *kf_root; - - /* The bitmask of subsystems attached to this hierarchy */ - unsigned int subsys_mask; - - /* Unique id for this hierarchy. */ - int hierarchy_id; - - /* The root cgroup. Root is destroyed on its release. */ - struct cgroup cgrp; - - /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ - atomic_t nr_cgrps; - - /* A list running through the active hierarchies */ - struct list_head root_list; - - /* Hierarchy-specific flags */ - unsigned int flags; - - /* IDs for cgroups in this hierarchy */ - struct idr cgroup_idr; - - /* The path to use for release notifications. */ - char release_agent_path[PATH_MAX]; - - /* The name for this hierarchy - may be empty */ - char name[MAX_CGROUP_ROOT_NAMELEN]; -}; - -/* - * A css_set is a structure holding pointers to a set of - * cgroup_subsys_state objects. This saves space in the task struct - * object and speeds up fork()/exit(), since a single inc/dec and a - * list_add()/del() can bump the reference count on the entire cgroup - * set for a task. - */ - -struct css_set { - - /* Reference count */ - atomic_t refcount; - - /* - * List running through all cgroup groups in the same hash - * slot. Protected by css_set_lock - */ - struct hlist_node hlist; - - /* - * Lists running through all tasks using this cgroup group. - * mg_tasks lists tasks which belong to this cset but are in the - * process of being migrated out or in. Protected by - * css_set_rwsem, but, during migration, once tasks are moved to - * mg_tasks, it can be read safely while holding cgroup_mutex. - */ - struct list_head tasks; - struct list_head mg_tasks; - - /* - * List of cgrp_cset_links pointing at cgroups referenced from this - * css_set. Protected by css_set_lock. 
- */ - struct list_head cgrp_links; - - /* the default cgroup associated with this css_set */ - struct cgroup *dfl_cgrp; - - /* - * Set of subsystem states, one for each subsystem. This array is - * immutable after creation apart from the init_css_set during - * subsystem registration (at boot time). - */ - struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; - - /* - * List of csets participating in the on-going migration either as - * source or destination. Protected by cgroup_mutex. - */ - struct list_head mg_preload_node; - struct list_head mg_node; - - /* - * If this cset is acting as the source of migration the following - * two fields are set. mg_src_cgrp is the source cgroup of the - * on-going migration and mg_dst_cset is the destination cset the - * target tasks on this cset should be migrated to. Protected by - * cgroup_mutex. - */ - struct cgroup *mg_src_cgrp; - struct css_set *mg_dst_cset; - - /* - * On the default hierarhcy, ->subsys[ssid] may point to a css - * attached to an ancestor instead of the cgroup this css_set is - * associated with. The following node is anchored at - * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to - * iterate through all css's attached to a given cgroup. 
- */ - struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; - - /* For RCU-protected deletion */ - struct rcu_head rcu_head; -}; - -/* - * struct cftype: handler definitions for cgroup control files - * - * When reading/writing to a file: - * - the cgroup to use is file->f_path.dentry->d_parent->d_fsdata - * - the 'cftype' of the file is file->f_path.dentry->d_fsdata - */ - -/* cftype->flags */ -enum { - CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ - CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ - CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ - - /* internal flags, do not use outside cgroup core proper */ - __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ - __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ -}; - -#define MAX_CFTYPE_NAME 64 - -struct cftype { - /* - * By convention, the name should begin with the name of the - * subsystem, followed by a period. Zero length string indicates - * end of cftype array. - */ - char name[MAX_CFTYPE_NAME]; - int private; - /* - * If not 0, file mode is set to this value, otherwise it will - * be figured out automatically - */ - umode_t mode; - - /* - * The maximum length of string, excluding trailing nul, that can - * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed. - */ - size_t max_write_len; - - /* CFTYPE_* flags */ - unsigned int flags; - - /* - * Fields used for internal bookkeeping. Initialized automatically - * during registration. - */ - struct cgroup_subsys *ss; /* NULL for cgroup core files */ - struct list_head node; /* anchored at ss->cfts */ - struct kernfs_ops *kf_ops; - - /* - * read_u64() is a shortcut for the common case of returning a - * single integer. 
Use it in place of read() - */ - u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); - /* - * read_s64() is a signed version of read_u64() - */ - s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); - - /* generic seq_file read interface */ - int (*seq_show)(struct seq_file *sf, void *v); - - /* optional ops, implement all or none */ - void *(*seq_start)(struct seq_file *sf, loff_t *ppos); - void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); - void (*seq_stop)(struct seq_file *sf, void *v); - - /* - * write_u64() is a shortcut for the common case of accepting - * a single integer (as parsed by simple_strtoull) from - * userspace. Use in place of write(); return 0 or error. - */ - int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val); - /* - * write_s64() is a signed version of write_u64() - */ - int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, - s64 val); - - /* - * write() is the generic write callback which maps directly to - * kernfs write operation and overrides all other operations. - * Maximum write size is determined by ->max_write_len. Use - * of_css/cft() to access the associated css and cft. - */ - ssize_t (*write)(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lock_class_key lockdep_key; -#endif -}; - extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; @@ -612,11 +244,6 @@ int cgroup_rm_cftypes(struct cftype *cfts); bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); -/* - * Control Group taskset, used to pass around set of tasks to cgroup_subsys - * methods. 
- */ -struct cgroup_taskset; struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); @@ -629,84 +256,6 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); for ((task) = cgroup_taskset_first((tset)); (task); \ (task) = cgroup_taskset_next((tset))) -/* - * Control Group subsystem type. - * See Documentation/cgroups/cgroups.txt for details - */ - -struct cgroup_subsys { - struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); - int (*css_online)(struct cgroup_subsys_state *css); - void (*css_offline)(struct cgroup_subsys_state *css); - void (*css_released)(struct cgroup_subsys_state *css); - void (*css_free)(struct cgroup_subsys_state *css); - void (*css_reset)(struct cgroup_subsys_state *css); - void (*css_e_css_changed)(struct cgroup_subsys_state *css); - - int (*can_attach)(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset); - void (*cancel_attach)(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset); - void (*attach)(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset); - void (*fork)(struct task_struct *task); - void (*exit)(struct cgroup_subsys_state *css, - struct cgroup_subsys_state *old_css, - struct task_struct *task); - void (*bind)(struct cgroup_subsys_state *root_css); - - int disabled; - int early_init; - - /* - * If %false, this subsystem is properly hierarchical - - * configuration, resource accounting and restriction on a parent - * cgroup cover those of its children. If %true, hierarchy support - * is broken in some ways - some subsystems ignore hierarchy - * completely while others are only implemented half-way. - * - * It's now disallowed to create nested cgroups if the subsystem is - * broken and cgroup core will emit a warning message on such - * cases. Eventually, all subsystems will be made properly - * hierarchical and this will go away. 
- */ - bool broken_hierarchy; - bool warned_broken_hierarchy; - - /* the following two fields are initialized automtically during boot */ - int id; -#define MAX_CGROUP_TYPE_NAMELEN 32 - const char *name; - - /* link to parent, protected by cgroup_lock() */ - struct cgroup_root *root; - - /* idr for css->id */ - struct idr css_idr; - - /* - * List of cftypes. Each entry is the first entry of an array - * terminated by zero length name. - */ - struct list_head cfts; - - /* - * Base cftypes which are automatically registered. The two can - * point to the same array. - */ - struct cftype *dfl_cftypes; /* for the default hierarchy */ - struct cftype *legacy_cftypes; /* for the legacy hierarchies */ - - /* - * A subsystem may depend on other subsystems. When such subsystem - * is enabled on a cgroup, the depended-upon subsystems are enabled - * together if available. Subsystems enabled due to dependency are - * not visible to userland until explicitly enabled. The following - * specifies the mask of subsystems that this one depends on. - */ - unsigned int depends_on; -}; - #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; #include #undef SUBSYS -- cgit v1.2.3 From c326aa2bb2209e10df4a381801bb34ca0f923038 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 May 2015 16:24:16 -0400 Subject: cgroup: reorganize include/linux/cgroup.h From c4d440938b5e2015c70594fe6666a099c844f929 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 May 2015 16:21:40 -0400 Over time, cgroup.h grew organically and doesn't have much logical structure at this point. Separation of cgroup-defs.h in the previous patch gives us a good chance for reorganizing cgroup.h as changes to the header are likely to cause conflicts anyway. This patch reorganizes cgroup.h so that it has consistent logical grouping. This is pure reorganization. v2: Relocating #ifdef CONFIG_CGROUPS caused build failure when cgroup is disabled. Dropped. 
Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 552 ++++++++++++++++++++++++------------------------- 1 file changed, 271 insertions(+), 281 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 96a2ecd5aa69..82319fb31cfe 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -22,16 +22,189 @@ #ifdef CONFIG_CGROUPS -extern int cgroup_init_early(void); -extern int cgroup_init(void); -extern void cgroup_fork(struct task_struct *p); -extern void cgroup_post_fork(struct task_struct *p); -extern void cgroup_exit(struct task_struct *p); -extern int cgroupstats_build(struct cgroupstats *stats, - struct dentry *dentry); +/* a css_task_iter should be treated as an opaque object */ +struct css_task_iter { + struct cgroup_subsys *ss; + + struct list_head *cset_pos; + struct list_head *cset_head; + + struct list_head *task_pos; + struct list_head *tasks_head; + struct list_head *mg_tasks_head; +}; + +extern struct cgroup_root cgrp_dfl_root; +extern struct css_set init_css_set; + +#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; +#include +#undef SUBSYS + +bool css_has_online_children(struct cgroup_subsys_state *css); +struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); +struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, + struct cgroup_subsys *ss); +struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss); + +bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); +int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); + +int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); +int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); +int cgroup_rm_cftypes(struct cftype *cfts); + +char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); 
+int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); +int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk); + +void cgroup_fork(struct task_struct *p); +void cgroup_post_fork(struct task_struct *p); +void cgroup_exit(struct task_struct *p); + +int cgroup_init_early(void); +int cgroup_init(void); + +/* + * Iteration helpers and macros. + */ + +struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *parent); +struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *css); +struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos); +struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *css); + +struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); +struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); + +void css_task_iter_start(struct cgroup_subsys_state *css, + struct css_task_iter *it); +struct task_struct *css_task_iter_next(struct css_task_iter *it); +void css_task_iter_end(struct css_task_iter *it); + +/** + * css_for_each_child - iterate through children of a css + * @pos: the css * to use as the loop cursor + * @parent: css whose children to walk + * + * Walk @parent's children. Must be called under rcu_read_lock(). + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. + * + * It is allowed to temporarily drop RCU read lock during iteration. 
The + * caller is responsible for ensuring that @pos remains accessible until + * the start of the next iteration by, for example, bumping the css refcnt. + */ +#define css_for_each_child(pos, parent) \ + for ((pos) = css_next_child(NULL, (parent)); (pos); \ + (pos) = css_next_child((pos), (parent))) + +/** + * css_for_each_descendant_pre - pre-order walk of a css's descendants + * @pos: the css * to use as the loop cursor + * @root: css whose descendants to walk + * + * Walk @root's descendants. @root is included in the iteration and the + * first node to be visited. Must be called under rcu_read_lock(). + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. + * + * For example, the following guarantees that a descendant can't escape + * state updates of its ancestors. + * + * my_online(@css) + * { + * Lock @css's parent and @css; + * Inherit state from the parent; + * Unlock both. + * } + * + * my_update_state(@css) + * { + * css_for_each_descendant_pre(@pos, @css) { + * Lock @pos; + * if (@pos == @css) + * Update @css's state; + * else + * Verify @pos is alive and inherit state from its parent; + * Unlock @pos; + * } + * } + * + * As long as the inheriting step, including checking the parent state, is + * enclosed inside @pos locking, double-locking the parent isn't necessary + * while inheriting. The state update to the parent is guaranteed to be + * visible by walking order and, as long as inheriting operations to the + * same @pos are atomic to each other, multiple updates racing each other + * still result in the correct state. 
It's guaranateed that at least one + * inheritance happens for any css after the latest update to its parent. + * + * If checking parent's state requires locking the parent, each inheriting + * iteration should lock and unlock both @pos->parent and @pos. + * + * Alternatively, a subsystem may choose to use a single global lock to + * synchronize ->css_online() and ->css_offline() against tree-walking + * operations. + * + * It is allowed to temporarily drop RCU read lock during iteration. The + * caller is responsible for ensuring that @pos remains accessible until + * the start of the next iteration by, for example, bumping the css refcnt. + */ +#define css_for_each_descendant_pre(pos, css) \ + for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ + (pos) = css_next_descendant_pre((pos), (css))) + +/** + * css_for_each_descendant_post - post-order walk of a css's descendants + * @pos: the css * to use as the loop cursor + * @css: css whose descendants to walk + * + * Similar to css_for_each_descendant_pre() but performs post-order + * traversal instead. @root is included in the iteration and the last + * node to be visited. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. + * + * Note that the walk visibility guarantee example described in pre-order + * walk doesn't apply the same to post-order walks. 
+ */ +#define css_for_each_descendant_post(pos, css) \ + for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ + (pos) = css_next_descendant_post((pos), (css))) + +/** + * cgroup_taskset_for_each - iterate cgroup_taskset + * @task: the loop cursor + * @tset: taskset to iterate + */ +#define cgroup_taskset_for_each(task, tset) \ + for ((task) = cgroup_taskset_first((tset)); (task); \ + (task) = cgroup_taskset_next((tset))) -extern int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *tsk); +/* + * Inline functions. + */ /** * css_get - obtain a reference on the specified css @@ -118,8 +291,87 @@ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n) percpu_ref_put_many(&css->refcnt, n); } -extern struct cgroup_root cgrp_dfl_root; -extern struct css_set init_css_set; +/** + * task_css_set_check - obtain a task's css_set with extra access conditions + * @task: the task to obtain css_set for + * @__c: extra condition expression to be passed to rcu_dereference_check() + * + * A task's css_set is RCU protected, initialized and exited while holding + * task_lock(), and can only be modified while holding both cgroup_mutex + * and task_lock() while the task is alive. This macro verifies that the + * caller is inside proper critical section and returns @task's css_set. + * + * The caller can also specify additional allowed conditions via @__c, such + * as locks used during the cgroup_subsys::attach() methods. 
+ */ +#ifdef CONFIG_PROVE_RCU +extern struct mutex cgroup_mutex; +extern struct rw_semaphore css_set_rwsem; +#define task_css_set_check(task, __c) \ + rcu_dereference_check((task)->cgroups, \ + lockdep_is_held(&cgroup_mutex) || \ + lockdep_is_held(&css_set_rwsem) || \ + ((task)->flags & PF_EXITING) || (__c)) +#else +#define task_css_set_check(task, __c) \ + rcu_dereference((task)->cgroups) +#endif + +/** + * task_css_check - obtain css for (task, subsys) w/ extra access conds + * @task: the target task + * @subsys_id: the target subsystem ID + * @__c: extra condition expression to be passed to rcu_dereference_check() + * + * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The + * synchronization rules are the same as task_css_set_check(). + */ +#define task_css_check(task, subsys_id, __c) \ + task_css_set_check((task), (__c))->subsys[(subsys_id)] + +/** + * task_css_set - obtain a task's css_set + * @task: the task to obtain css_set for + * + * See task_css_set_check(). + */ +static inline struct css_set *task_css_set(struct task_struct *task) +{ + return task_css_set_check(task, false); +} + +/** + * task_css - obtain css for (task, subsys) + * @task: the target task + * @subsys_id: the target subsystem ID + * + * See task_css_check(). + */ +static inline struct cgroup_subsys_state *task_css(struct task_struct *task, + int subsys_id) +{ + return task_css_check(task, subsys_id, false); +} + +/** + * task_css_is_root - test whether a task belongs to the root css + * @task: the target task + * @subsys_id: the target subsystem ID + * + * Test whether @task belongs to the root css on the specified subsystem. + * May be invoked in any context. 
+ */ +static inline bool task_css_is_root(struct task_struct *task, int subsys_id) +{ + return task_css_check(task, subsys_id, true) == + init_css_set.subsys[subsys_id]; +} + +static inline struct cgroup *task_cgroup(struct task_struct *task, + int subsys_id) +{ + return task_css(task, subsys_id)->cgroup; +} /** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy @@ -236,284 +488,22 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); } -char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); - -int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); -int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); -int cgroup_rm_cftypes(struct cftype *cfts); - -bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); - -struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); -struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); - -/** - * cgroup_taskset_for_each - iterate cgroup_taskset - * @task: the loop cursor - * @tset: taskset to iterate - */ -#define cgroup_taskset_for_each(task, tset) \ - for ((task) = cgroup_taskset_first((tset)); (task); \ - (task) = cgroup_taskset_next((tset))) - -#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; -#include -#undef SUBSYS - -/** - * task_css_set_check - obtain a task's css_set with extra access conditions - * @task: the task to obtain css_set for - * @__c: extra condition expression to be passed to rcu_dereference_check() - * - * A task's css_set is RCU protected, initialized and exited while holding - * task_lock(), and can only be modified while holding both cgroup_mutex - * and task_lock() while the task is alive. This macro verifies that the - * caller is inside proper critical section and returns @task's css_set. 
- * - * The caller can also specify additional allowed conditions via @__c, such - * as locks used during the cgroup_subsys::attach() methods. - */ -#ifdef CONFIG_PROVE_RCU -extern struct mutex cgroup_mutex; -extern struct rw_semaphore css_set_rwsem; -#define task_css_set_check(task, __c) \ - rcu_dereference_check((task)->cgroups, \ - lockdep_is_held(&cgroup_mutex) || \ - lockdep_is_held(&css_set_rwsem) || \ - ((task)->flags & PF_EXITING) || (__c)) -#else -#define task_css_set_check(task, __c) \ - rcu_dereference((task)->cgroups) -#endif - -/** - * task_css_check - obtain css for (task, subsys) w/ extra access conds - * @task: the target task - * @subsys_id: the target subsystem ID - * @__c: extra condition expression to be passed to rcu_dereference_check() - * - * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The - * synchronization rules are the same as task_css_set_check(). - */ -#define task_css_check(task, subsys_id, __c) \ - task_css_set_check((task), (__c))->subsys[(subsys_id)] - -/** - * task_css_set - obtain a task's css_set - * @task: the task to obtain css_set for - * - * See task_css_set_check(). - */ -static inline struct css_set *task_css_set(struct task_struct *task) -{ - return task_css_set_check(task, false); -} - -/** - * task_css - obtain css for (task, subsys) - * @task: the target task - * @subsys_id: the target subsystem ID - * - * See task_css_check(). - */ -static inline struct cgroup_subsys_state *task_css(struct task_struct *task, - int subsys_id) -{ - return task_css_check(task, subsys_id, false); -} - -/** - * task_css_is_root - test whether a task belongs to the root css - * @task: the target task - * @subsys_id: the target subsystem ID - * - * Test whether @task belongs to the root css on the specified subsystem. - * May be invoked in any context. 
- */ -static inline bool task_css_is_root(struct task_struct *task, int subsys_id) -{ - return task_css_check(task, subsys_id, true) == - init_css_set.subsys[subsys_id]; -} - -static inline struct cgroup *task_cgroup(struct task_struct *task, - int subsys_id) -{ - return task_css(task, subsys_id)->cgroup; -} - -struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, - struct cgroup_subsys_state *parent); - -struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); - -/** - * css_for_each_child - iterate through children of a css - * @pos: the css * to use as the loop cursor - * @parent: css whose children to walk - * - * Walk @parent's children. Must be called under rcu_read_lock(). - * - * If a subsystem synchronizes ->css_online() and the start of iteration, a - * css which finished ->css_online() is guaranteed to be visible in the - * future iterations and will stay visible until the last reference is put. - * A css which hasn't finished ->css_online() or already finished - * ->css_offline() may show up during traversal. It's each subsystem's - * responsibility to synchronize against on/offlining. - * - * It is allowed to temporarily drop RCU read lock during iteration. The - * caller is responsible for ensuring that @pos remains accessible until - * the start of the next iteration by, for example, bumping the css refcnt. - */ -#define css_for_each_child(pos, parent) \ - for ((pos) = css_next_child(NULL, (parent)); (pos); \ - (pos) = css_next_child((pos), (parent))) - -struct cgroup_subsys_state * -css_next_descendant_pre(struct cgroup_subsys_state *pos, - struct cgroup_subsys_state *css); - -struct cgroup_subsys_state * -css_rightmost_descendant(struct cgroup_subsys_state *pos); - -/** - * css_for_each_descendant_pre - pre-order walk of a css's descendants - * @pos: the css * to use as the loop cursor - * @root: css whose descendants to walk - * - * Walk @root's descendants. 
@root is included in the iteration and the - * first node to be visited. Must be called under rcu_read_lock(). - * - * If a subsystem synchronizes ->css_online() and the start of iteration, a - * css which finished ->css_online() is guaranteed to be visible in the - * future iterations and will stay visible until the last reference is put. - * A css which hasn't finished ->css_online() or already finished - * ->css_offline() may show up during traversal. It's each subsystem's - * responsibility to synchronize against on/offlining. - * - * For example, the following guarantees that a descendant can't escape - * state updates of its ancestors. - * - * my_online(@css) - * { - * Lock @css's parent and @css; - * Inherit state from the parent; - * Unlock both. - * } - * - * my_update_state(@css) - * { - * css_for_each_descendant_pre(@pos, @css) { - * Lock @pos; - * if (@pos == @css) - * Update @css's state; - * else - * Verify @pos is alive and inherit state from its parent; - * Unlock @pos; - * } - * } - * - * As long as the inheriting step, including checking the parent state, is - * enclosed inside @pos locking, double-locking the parent isn't necessary - * while inheriting. The state update to the parent is guaranteed to be - * visible by walking order and, as long as inheriting operations to the - * same @pos are atomic to each other, multiple updates racing each other - * still result in the correct state. It's guaranateed that at least one - * inheritance happens for any css after the latest update to its parent. - * - * If checking parent's state requires locking the parent, each inheriting - * iteration should lock and unlock both @pos->parent and @pos. - * - * Alternatively, a subsystem may choose to use a single global lock to - * synchronize ->css_online() and ->css_offline() against tree-walking - * operations. - * - * It is allowed to temporarily drop RCU read lock during iteration. 
The - * caller is responsible for ensuring that @pos remains accessible until - * the start of the next iteration by, for example, bumping the css refcnt. - */ -#define css_for_each_descendant_pre(pos, css) \ - for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ - (pos) = css_next_descendant_pre((pos), (css))) - -struct cgroup_subsys_state * -css_next_descendant_post(struct cgroup_subsys_state *pos, - struct cgroup_subsys_state *css); - -/** - * css_for_each_descendant_post - post-order walk of a css's descendants - * @pos: the css * to use as the loop cursor - * @css: css whose descendants to walk - * - * Similar to css_for_each_descendant_pre() but performs post-order - * traversal instead. @root is included in the iteration and the last - * node to be visited. - * - * If a subsystem synchronizes ->css_online() and the start of iteration, a - * css which finished ->css_online() is guaranteed to be visible in the - * future iterations and will stay visible until the last reference is put. - * A css which hasn't finished ->css_online() or already finished - * ->css_offline() may show up during traversal. It's each subsystem's - * responsibility to synchronize against on/offlining. - * - * Note that the walk visibility guarantee example described in pre-order - * walk doesn't apply the same to post-order walks. 
- */ -#define css_for_each_descendant_post(pos, css) \ - for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ - (pos) = css_next_descendant_post((pos), (css))) - -bool css_has_online_children(struct cgroup_subsys_state *css); - -/* A css_task_iter should be treated as an opaque object */ -struct css_task_iter { - struct cgroup_subsys *ss; - - struct list_head *cset_pos; - struct list_head *cset_head; - - struct list_head *task_pos; - struct list_head *tasks_head; - struct list_head *mg_tasks_head; -}; - -void css_task_iter_start(struct cgroup_subsys_state *css, - struct css_task_iter *it); -struct task_struct *css_task_iter_next(struct css_task_iter *it); -void css_task_iter_end(struct css_task_iter *it); - -int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); -int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); - -struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, - struct cgroup_subsys *ss); -struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss); - #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; -static inline int cgroup_init_early(void) { return 0; } -static inline int cgroup_init(void) { return 0; } +static inline void css_put(struct cgroup_subsys_state *css) {} +static inline int cgroup_attach_task_all(struct task_struct *from, + struct task_struct *t) { return 0; } +static inline int cgroupstats_build(struct cgroupstats *stats, + struct dentry *dentry) { return -EINVAL; } + static inline void cgroup_fork(struct task_struct *p) {} static inline void cgroup_post_fork(struct task_struct *p) {} static inline void cgroup_exit(struct task_struct *p) {} -static inline int cgroupstats_build(struct cgroupstats *stats, - struct dentry *dentry) -{ - return -EINVAL; -} - -static inline void css_put(struct cgroup_subsys_state *css) {} - -/* No cgroups - nothing to do */ -static inline int cgroup_attach_task_all(struct task_struct *from, - struct 
task_struct *t) -{ - return 0; -} +static inline int cgroup_init_early(void) { return 0; } +static inline int cgroup_init(void) { return 0; } #endif /* !CONFIG_CGROUPS */ -- cgit v1.2.3 From 87e9b9f1d86c2ee9a10c2a4186a72d0af4cc963e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 16 May 2015 01:38:15 +0200 Subject: PM / sleep: Make suspend-to-idle-specific code depend on CONFIG_SUSPEND Since idle_should_freeze() is defined to always return 'false' for CONFIG_SUSPEND unset, all of the code depending on it in cpuidle_idle_call() is not necessary in that case. Make that code depend on CONFIG_SUSPEND too to avoid building it when it is not going to be used. Signed-off-by: Rafael J. Wysocki Acked-by: Thomas Gleixner --- include/linux/cpuidle.h | 16 ++++++++++------ include/linux/tick.h | 12 ++++++++---- 2 files changed, 18 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 9c5e89254796..13ee266ca98c 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -151,10 +151,6 @@ extern void cpuidle_resume(void); extern int cpuidle_enable_device(struct cpuidle_device *dev); extern void cpuidle_disable_device(struct cpuidle_device *dev); extern int cpuidle_play_dead(void); -extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, - struct cpuidle_device *dev); -extern int cpuidle_enter_freeze(struct cpuidle_driver *drv, - struct cpuidle_device *dev); extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev); #else @@ -190,14 +186,22 @@ static inline int cpuidle_enable_device(struct cpuidle_device *dev) {return -ENODEV; } static inline void cpuidle_disable_device(struct cpuidle_device *dev) { } static inline int cpuidle_play_dead(void) {return -ENODEV; } +static inline struct cpuidle_driver *cpuidle_get_cpu_driver( + struct cpuidle_device *dev) {return NULL; } +#endif + +#if defined(CONFIG_CPU_IDLE) && defined(CONFIG_SUSPEND) +extern int 
cpuidle_find_deepest_state(struct cpuidle_driver *drv, + struct cpuidle_device *dev); +extern int cpuidle_enter_freeze(struct cpuidle_driver *drv, + struct cpuidle_device *dev); +#else static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return -ENODEV; } static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return -ENODEV; } -static inline struct cpuidle_driver *cpuidle_get_cpu_driver( - struct cpuidle_device *dev) {return NULL; } #endif #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED diff --git a/include/linux/tick.h b/include/linux/tick.h index f8492da57ad3..ec6e8bc992bf 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -13,8 +13,6 @@ #ifdef CONFIG_GENERIC_CLOCKEVENTS extern void __init tick_init(void); -extern void tick_freeze(void); -extern void tick_unfreeze(void); /* Should be core only, but ARM BL switcher requires it */ extern void tick_suspend_local(void); /* Should be core only, but XEN resume magic and ARM BL switcher require it */ @@ -23,14 +21,20 @@ extern void tick_handover_do_timer(void); extern void tick_cleanup_dead_cpu(int cpu); #else /* CONFIG_GENERIC_CLOCKEVENTS */ static inline void tick_init(void) { } -static inline void tick_freeze(void) { } -static inline void tick_unfreeze(void) { } static inline void tick_suspend_local(void) { } static inline void tick_resume_local(void) { } static inline void tick_handover_do_timer(void) { } static inline void tick_cleanup_dead_cpu(int cpu) { } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ +#if defined(CONFIG_GENERIC_CLOCKEVENTS) && defined(CONFIG_SUSPEND) +extern void tick_freeze(void); +extern void tick_unfreeze(void); +#else +static inline void tick_freeze(void) { } +static inline void tick_unfreeze(void) { } +#endif + #ifdef CONFIG_TICK_ONESHOT extern void tick_irq_enter(void); # ifndef arch_needs_cpu -- cgit v1.2.3 From ab3f02fc237211f0583c1e7ba3bf504747be9b8d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra 
Date: Tue, 12 May 2015 10:52:27 +0200 Subject: locking/arch: Add WRITE_ONCE() to set_mb() Since we assume set_mb() to result in a single store followed by a full memory barrier, employ WRITE_ONCE(). Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index a7c0941d10da..03e227ba481c 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -250,7 +250,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s ({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; }) #define WRITE_ONCE(x, val) \ - ({ typeof(x) __val = (val); __write_once_size(&(x), &__val, sizeof(__val)); __val; }) + ({ union { typeof(x) __val; char __c[1]; } __u = { .__val = (val) }; __write_once_size(&(x), __u.__c, sizeof(x)); __u.__val; }) #endif /* __KERNEL__ */ -- cgit v1.2.3 From b92b8b35a2e38bde319fd1d68ec84628c1f1b0fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 May 2015 10:51:55 +0200 Subject: locking/arch: Rename set_mb() to smp_store_mb() Since set_mb() is really about an smp_mb() -- not a IO/DMA barrier like mb() rename it to match the recent smp_load_acquire() and smp_store_release(). 
Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a2e6122734..18f197223ebd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -252,7 +252,7 @@ extern char ___assert_task_state[1 - 2*!!( #define set_task_state(tsk, state_value) \ do { \ (tsk)->task_state_change = _THIS_IP_; \ - set_mb((tsk)->state, (state_value)); \ + smp_store_mb((tsk)->state, (state_value)); \ } while (0) /* @@ -274,7 +274,7 @@ extern char ___assert_task_state[1 - 2*!!( #define set_current_state(state_value) \ do { \ current->task_state_change = _THIS_IP_; \ - set_mb(current->state, (state_value)); \ + smp_store_mb(current->state, (state_value)); \ } while (0) #else @@ -282,7 +282,7 @@ extern char ___assert_task_state[1 - 2*!!( #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) #define set_task_state(tsk, state_value) \ - set_mb((tsk)->state, (state_value)) + smp_store_mb((tsk)->state, (state_value)) /* * set_current_state() includes a barrier so that the write of current->state @@ -298,7 +298,7 @@ extern char ___assert_task_state[1 - 2*!!( #define __set_current_state(state_value) \ do { current->state = (state_value); } while (0) #define set_current_state(state_value) \ - set_mb(current->state, (state_value)) + smp_store_mb(current->state, (state_value)) #endif -- cgit v1.2.3 From 92cf211874e954027b8e91cc9a15485a50b58d6b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 12 May 2015 16:41:46 +0200 Subject: sched/preempt: Merge preempt_mask.h into preempt.h preempt_mask.h defines all the preempt_count semantics and related symbols: preempt, softirq, hardirq, nmi, preempt active, need resched, etc... 
preempt.h defines the accessors and mutators of preempt_count. But there is a messy dependency game around those two header files: * preempt_mask.h includes preempt.h in order to access preempt_count() * preempt_mask.h defines all preempt_count semantics and symbols except PREEMPT_NEED_RESCHED that is needed by asm/preempt.h Thus we need to define it from preempt.h, right before including asm/preempt.h, instead of defining it in preempt_mask.h with the other preempt_count symbols. Therefore the preempt_count semantics happen to be spread out. * We plan to introduce preempt_active_[enter,exit]() to consolidate preempt_schedule*() code. But we'll need to access both preempt_count mutators (preempt_count_add()) and preempt_count symbols (PREEMPT_ACTIVE, PREEMPT_OFFSET). The usual place to define preempt operations is in preempt.h but then we'll need symbols in preempt_mask.h which already includes preempt.h. So we end up with a circular resource dependency. Let's merge preempt_mask.h into preempt.h to solve these dependency issues. This way we gather semantic symbols and operation definitions of preempt_count in a single file. This is a dumb copy-paste merge. Further merge re-arrangements are performed in a subsequent patch to ease review.
Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431441711-29753-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/bottom_half.h | 1 - include/linux/hardirq.h | 2 +- include/linux/preempt.h | 111 ++++++++++++++++++++++++++++++++++++++++ include/linux/preempt_mask.h | 117 ------------------------------------------- include/linux/sched.h | 2 +- 5 files changed, 113 insertions(+), 120 deletions(-) delete mode 100644 include/linux/preempt_mask.h (limited to 'include/linux') diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h index 86c12c93e3cf..8fdcb783197d 100644 --- a/include/linux/bottom_half.h +++ b/include/linux/bottom_half.h @@ -2,7 +2,6 @@ #define _LINUX_BH_H #include -#include #ifdef CONFIG_TRACE_IRQFLAGS extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index f4af03404b97..dfd59d6bc6f0 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -1,7 +1,7 @@ #ifndef LINUX_HARDIRQ_H #define LINUX_HARDIRQ_H -#include +#include #include #include #include diff --git a/include/linux/preempt.h b/include/linux/preempt.h index de83b4eb1642..8cc0338a5e9a 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -17,6 +17,117 @@ #include +/* + * We put the hardirq and softirq counter into the preemption + * counter. The bitmask has the following meaning: + * + * - bits 0-7 are the preemption count (max preemption depth: 256) + * - bits 8-15 are the softirq count (max # of softirqs: 256) + * + * The hardirq count could in theory be the same as the number of + * interrupts in the system, but we run all interrupt handlers with + * interrupts disabled, so we cannot have nesting interrupts. 
Though + * there are a few palaeontologic drivers which reenable interrupts in + * the handler, so we need more than one bit here. + * + * PREEMPT_MASK: 0x000000ff + * SOFTIRQ_MASK: 0x0000ff00 + * HARDIRQ_MASK: 0x000f0000 + * NMI_MASK: 0x00100000 + * PREEMPT_ACTIVE: 0x00200000 + */ +#define PREEMPT_BITS 8 +#define SOFTIRQ_BITS 8 +#define HARDIRQ_BITS 4 +#define NMI_BITS 1 + +#define PREEMPT_SHIFT 0 +#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) +#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) +#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) + +#define __IRQ_MASK(x) ((1UL << (x))-1) + +#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) +#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) +#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) +#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) + +#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) +#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) +#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) +#define NMI_OFFSET (1UL << NMI_SHIFT) + +#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) + +#define PREEMPT_ACTIVE_BITS 1 +#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) +#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT) + +#define hardirq_count() (preempt_count() & HARDIRQ_MASK) +#define softirq_count() (preempt_count() & SOFTIRQ_MASK) +#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ + | NMI_MASK)) + +/* + * Are we doing bottom half or hardware interrupt processing? + * Are we in a softirq context? Interrupt context? + * in_softirq - Are we currently processing softirq or have bh disabled? + * in_serving_softirq - Are we currently processing softirq? + */ +#define in_irq() (hardirq_count()) +#define in_softirq() (softirq_count()) +#define in_interrupt() (irq_count()) +#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) + +/* + * Are we in NMI context? 
+ */ +#define in_nmi() (preempt_count() & NMI_MASK) + +#if defined(CONFIG_PREEMPT_COUNT) +# define PREEMPT_CHECK_OFFSET 1 +#else +# define PREEMPT_CHECK_OFFSET 0 +#endif + +/* + * The preempt_count offset needed for things like: + * + * spin_lock_bh() + * + * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and + * softirqs, such that unlock sequences of: + * + * spin_unlock(); + * local_bh_enable(); + * + * Work as expected. + */ +#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET) + +/* + * Are we running in atomic context? WARNING: this macro cannot + * always detect atomic context; in particular, it cannot know about + * held spinlocks in non-preemptible kernels. Thus it should not be + * used in the general case to determine whether sleeping is possible. + * Do not use in_atomic() in driver code. + */ +#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) + +/* + * Check whether we were atomic before we did preempt_disable(): + * (used by the scheduler, *after* releasing the kernel lock) + */ +#define in_atomic_preempt_off() \ + ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) + +#ifdef CONFIG_PREEMPT_COUNT +# define preemptible() (preempt_count() == 0 && !irqs_disabled()) +#else +# define preemptible() 0 +#endif + #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void preempt_count_add(int val); extern void preempt_count_sub(int val); diff --git a/include/linux/preempt_mask.h b/include/linux/preempt_mask.h deleted file mode 100644 index dbeec4d4a3be..000000000000 --- a/include/linux/preempt_mask.h +++ /dev/null @@ -1,117 +0,0 @@ -#ifndef LINUX_PREEMPT_MASK_H -#define LINUX_PREEMPT_MASK_H - -#include - -/* - * We put the hardirq and softirq counter into the preemption - * counter. 
The bitmask has the following meaning: - * - * - bits 0-7 are the preemption count (max preemption depth: 256) - * - bits 8-15 are the softirq count (max # of softirqs: 256) - * - * The hardirq count could in theory be the same as the number of - * interrupts in the system, but we run all interrupt handlers with - * interrupts disabled, so we cannot have nesting interrupts. Though - * there are a few palaeontologic drivers which reenable interrupts in - * the handler, so we need more than one bit here. - * - * PREEMPT_MASK: 0x000000ff - * SOFTIRQ_MASK: 0x0000ff00 - * HARDIRQ_MASK: 0x000f0000 - * NMI_MASK: 0x00100000 - * PREEMPT_ACTIVE: 0x00200000 - */ -#define PREEMPT_BITS 8 -#define SOFTIRQ_BITS 8 -#define HARDIRQ_BITS 4 -#define NMI_BITS 1 - -#define PREEMPT_SHIFT 0 -#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) -#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) -#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) - -#define __IRQ_MASK(x) ((1UL << (x))-1) - -#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) -#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) -#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) -#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) - -#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) -#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) -#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) -#define NMI_OFFSET (1UL << NMI_SHIFT) - -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) - -#define PREEMPT_ACTIVE_BITS 1 -#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) -#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT) - -#define hardirq_count() (preempt_count() & HARDIRQ_MASK) -#define softirq_count() (preempt_count() & SOFTIRQ_MASK) -#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ - | NMI_MASK)) - -/* - * Are we doing bottom half or hardware interrupt processing? - * Are we in a softirq context? Interrupt context? 
- * in_softirq - Are we currently processing softirq or have bh disabled? - * in_serving_softirq - Are we currently processing softirq? - */ -#define in_irq() (hardirq_count()) -#define in_softirq() (softirq_count()) -#define in_interrupt() (irq_count()) -#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) - -/* - * Are we in NMI context? - */ -#define in_nmi() (preempt_count() & NMI_MASK) - -#if defined(CONFIG_PREEMPT_COUNT) -# define PREEMPT_CHECK_OFFSET 1 -#else -# define PREEMPT_CHECK_OFFSET 0 -#endif - -/* - * The preempt_count offset needed for things like: - * - * spin_lock_bh() - * - * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and - * softirqs, such that unlock sequences of: - * - * spin_unlock(); - * local_bh_enable(); - * - * Work as expected. - */ -#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET) - -/* - * Are we running in atomic context? WARNING: this macro cannot - * always detect atomic context; in particular, it cannot know about - * held spinlocks in non-preemptible kernels. Thus it should not be - * used in the general case to determine whether sleeping is possible. - * Do not use in_atomic() in driver code. 
- */ -#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) - -/* - * Check whether we were atomic before we did preempt_disable(): - * (used by the scheduler, *after* releasing the kernel lock) - */ -#define in_atomic_preempt_off() \ - ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) - -#ifdef CONFIG_PREEMPT_COUNT -# define preemptible() (preempt_count() == 0 && !irqs_disabled()) -#else -# define preemptible() 0 -#endif - -#endif /* LINUX_PREEMPT_MASK_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 5f8defa155cf..c53a1784d7a9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -25,7 +25,7 @@ struct sched_param { #include #include #include -#include +#include #include #include -- cgit v1.2.3 From 2e10e71ce88e3eaccfd09a045ae6ecebe657ba09 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 12 May 2015 16:41:47 +0200 Subject: sched/preempt: Rearrange a few symbols after headers merge Adjust a few comments, and further integrate a few definitions after the dumb headers copy. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431441711-29753-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 8cc0338a5e9a..37974cd4f092 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -9,14 +9,6 @@ #include #include -/* - * We use the MSB mostly because its available; see for - * the other bits -- can't include that header due to inclusion hell. - */ -#define PREEMPT_NEED_RESCHED 0x80000000 - -#include - /* * We put the hardirq and softirq counter into the preemption * counter. 
The bitmask has the following meaning: @@ -30,11 +22,12 @@ * there are a few palaeontologic drivers which reenable interrupts in * the handler, so we need more than one bit here. * - * PREEMPT_MASK: 0x000000ff - * SOFTIRQ_MASK: 0x0000ff00 - * HARDIRQ_MASK: 0x000f0000 - * NMI_MASK: 0x00100000 - * PREEMPT_ACTIVE: 0x00200000 + * PREEMPT_MASK: 0x000000ff + * SOFTIRQ_MASK: 0x0000ff00 + * HARDIRQ_MASK: 0x000f0000 + * NMI_MASK: 0x00100000 + * PREEMPT_ACTIVE: 0x00200000 + * PREEMPT_NEED_RESCHED: 0x80000000 */ #define PREEMPT_BITS 8 #define SOFTIRQ_BITS 8 @@ -64,6 +57,12 @@ #define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) #define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT) +/* We use the MSB mostly because its available */ +#define PREEMPT_NEED_RESCHED 0x80000000 + +/* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */ +#include + #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #define softirq_count() (preempt_count() & SOFTIRQ_MASK) #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ @@ -122,12 +121,6 @@ #define in_atomic_preempt_off() \ ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) -#ifdef CONFIG_PREEMPT_COUNT -# define preemptible() (preempt_count() == 0 && !irqs_disabled()) -#else -# define preemptible() 0 -#endif - #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void preempt_count_add(int val); extern void preempt_count_sub(int val); @@ -160,6 +153,8 @@ do { \ #define preempt_enable_no_resched() sched_preempt_enable_no_resched() +#define preemptible() (preempt_count() == 0 && !irqs_disabled()) + #ifdef CONFIG_PREEMPT #define preempt_enable() \ do { \ @@ -232,6 +227,7 @@ do { \ #define preempt_disable_notrace() barrier() #define preempt_enable_no_resched_notrace() barrier() #define preempt_enable_notrace() barrier() +#define preemptible() 0 #endif /* CONFIG_PREEMPT_COUNT */ -- cgit v1.2.3 From 90b62b5129d5cb50f62f40e684de7a1961e57197 Mon Sep 17 
00:00:00 2001 From: Frederic Weisbecker Date: Tue, 12 May 2015 16:41:48 +0200 Subject: sched/preempt: Rename PREEMPT_CHECK_OFFSET to PREEMPT_DISABLE_OFFSET "CHECK" suggests it's only used as a comparison mask. But now it's used further as a config-conditional preempt disabler offset. Lets disambiguate this name. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431441711-29753-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 37974cd4f092..4689ef210a13 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -85,9 +85,9 @@ #define in_nmi() (preempt_count() & NMI_MASK) #if defined(CONFIG_PREEMPT_COUNT) -# define PREEMPT_CHECK_OFFSET 1 +# define PREEMPT_DISABLE_OFFSET 1 #else -# define PREEMPT_CHECK_OFFSET 0 +# define PREEMPT_DISABLE_OFFSET 0 #endif /* @@ -103,7 +103,7 @@ * * Work as expected. */ -#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET) +#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_DISABLE_OFFSET) /* * Are we running in atomic context? 
WARNING: this macro cannot @@ -119,7 +119,7 @@ * (used by the scheduler, *after* releasing the kernel lock) */ #define in_atomic_preempt_off() \ - ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) + ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET) #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void preempt_count_add(int val); -- cgit v1.2.3 From b30f0e3ffedfa52b1d67a302ae5860c49998e5e2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 12 May 2015 16:41:49 +0200 Subject: sched/preempt: Optimize preemption operations on __schedule() callers __schedule() disables preemption and some of its callers (the preempt_schedule*() family) also set PREEMPT_ACTIVE. So we have two preempt_count() modifications that could be performed at once. Lets remove the preemption disablement from __schedule() and pull this responsibility to its callers in order to optimize preempt_count() operations in a single place. Suggested-by: Linus Torvalds Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431441711-29753-5-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 4689ef210a13..45da394f2779 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -137,6 +137,18 @@ extern void preempt_count_sub(int val); #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) +#define preempt_active_enter() \ +do { \ + preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \ + barrier(); \ +} while (0) + +#define preempt_active_exit() \ +do { \ + barrier(); \ + preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \ +} while (0) + #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ -- cgit v1.2.3 
From e017cf21ae82e0b36f026b22083a8ae67926f465 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 12 May 2015 16:41:50 +0200 Subject: sched/preempt: Fix out of date comment Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431441711-29753-6-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 45da394f2779..4057696c641c 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -116,7 +116,7 @@ /* * Check whether we were atomic before we did preempt_disable(): - * (used by the scheduler, *after* releasing the kernel lock) + * (used by the scheduler) */ #define in_atomic_preempt_off() \ ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET) -- cgit v1.2.3 From 3e51f3c4004c9b01f66da03214a3e206f5ed627b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 12 May 2015 16:41:51 +0200 Subject: sched/preempt: Remove PREEMPT_ACTIVE unmasking off in_atomic() Now that PREEMPT_ACTIVE implies PREEMPT_DISABLE_OFFSET, ignoring PREEMPT_ACTIVE from in_atomic() check isn't useful anymore. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431441711-29753-7-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 4057696c641c..a1a00e14c14f 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -112,7 +112,7 @@ * used in the general case to determine whether sleeping is possible. * Do not use in_atomic() in driver code. 
*/ -#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) +#define in_atomic() (preempt_count() != 0) /* * Check whether we were atomic before we did preempt_disable(): -- cgit v1.2.3 From 8bcbde5480f9777f8b74d71493722c663e22c21b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 May 2015 17:52:06 +0200 Subject: sched/preempt, mm/fault: Count pagefault_disable() levels in pagefault_disabled Until now, pagefault_disable()/pagefault_enable() used the preempt count to track whether in an environment with pagefaults disabled (can be queried via in_atomic()). This patch introduces a separate counter in task_struct to count the level of pagefault_disable() calls. We'll keep manipulating the preempt count to retain compatibility to existing pagefault handlers. It is now possible to verify whether in a pagefault_disable() environment by calling pagefault_disabled(). In contrast to in_atomic() it will not be influenced by preempt_enable()/preempt_disable(). This patch is based on a patch from Ingo Molnar. 
Reviewed-and-tested-by: Thomas Gleixner Signed-off-by: David Hildenbrand Signed-off-by: Peter Zijlstra (Intel) Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bigeasy@linutronix.de Cc: borntraeger@de.ibm.com Cc: daniel.vetter@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: hocko@suse.cz Cc: hughd@google.com Cc: mst@redhat.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: schwidefsky@de.ibm.com Cc: yang.shi@windriver.com Link: http://lkml.kernel.org/r/1431359540-32227-2-git-send-email-dahi@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + include/linux/uaccess.h | 36 +++++++++++++++++++++++++++++------- 2 files changed, 30 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index c53a1784d7a9..dd07ac03f82a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1788,6 +1788,7 @@ struct task_struct { #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; #endif + int pagefault_disabled; }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index ecd3319dac33..23290cc93a24 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -2,20 +2,36 @@ #define __LINUX_UACCESS_H__ #include +#include #include +static __always_inline void pagefault_disabled_inc(void) +{ + current->pagefault_disabled++; +} + +static __always_inline void pagefault_disabled_dec(void) +{ + current->pagefault_disabled--; + WARN_ON(current->pagefault_disabled < 0); +} + /* - * These routines enable/disable the pagefault handler in that - * it will not take any locks and go straight to the fixup table. + * These routines enable/disable the pagefault handler. If disabled, it will + * not take any locks and go straight to the fixup table. 
+ * + * We increase the preempt and the pagefault count, to be able to distinguish + * whether we run in simple atomic context or in a real pagefault_disable() + * context. + * + * For now, after pagefault_disabled() has been called, we run in atomic + * context. User access methods will not sleep. * - * They have great resemblance to the preempt_disable/enable calls - * and in fact they are identical; this is because currently there is - * no other way to make the pagefault handlers do this. So we do - * disable preemption but we don't necessarily care about that. */ static inline void pagefault_disable(void) { preempt_count_inc(); + pagefault_disabled_inc(); /* * make sure to have issued the store before a pagefault * can hit. @@ -25,18 +41,24 @@ static inline void pagefault_disable(void) static inline void pagefault_enable(void) { -#ifndef CONFIG_PREEMPT /* * make sure to issue those last loads/stores before enabling * the pagefault handler again. */ barrier(); + pagefault_disabled_dec(); +#ifndef CONFIG_PREEMPT preempt_count_dec(); #else preempt_enable(); #endif } +/* + * Is the pagefault handler disabled? If so, user access methods will not sleep. + */ +#define pagefault_disabled() (current->pagefault_disabled != 0) + #ifndef ARCH_HAS_NOCACHE_UACCESS static inline unsigned long __copy_from_user_inatomic_nocache(void *to, -- cgit v1.2.3 From 9ec23531fd48031d1b6ca5366f5f967d17a8bc28 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 May 2015 17:52:07 +0200 Subject: sched/preempt, mm/fault: Trigger might_sleep() in might_fault() with disabled pagefaults Commit 662bbcb2747c ("mm, sched: Allow uaccess in atomic with pagefault_disable()") removed might_sleep() checks for all user access code (that uses might_fault()). The reason was to disable wrong "sleep in atomic" warnings in the following scenario: pagefault_disable() rc = copy_to_user(...) 
pagefault_enable() Which is valid, as pagefault_disable() increments the preempt counter and therefore disables the pagefault handler. copy_to_user() will not sleep and return an error code if a page is not available. However, as all might_sleep() checks are removed, CONFIG_DEBUG_ATOMIC_SLEEP would no longer detect the following scenario: spin_lock(&lock); rc = copy_to_user(...) spin_unlock(&lock) If the kernel is compiled with preemption turned on, preempt_disable() will make in_atomic() detect disabled preemption. The fault handler would correctly never sleep on user access. However, with preemption turned off, preempt_disable() is usually a NOP (with !CONFIG_PREEMPT_COUNT), therefore in_atomic() will not be able to detect disabled preemption nor disabled pagefaults. The fault handler could sleep. We really want to enable CONFIG_DEBUG_ATOMIC_SLEEP checks for user access functions again, otherwise we can end up with horrible deadlocks. Root of all evil is that pagefault_disable() acts almost as preempt_disable(), depending on preemption being turned on/off. As we now have pagefault_disabled(), we can use it to distinguish whether user access functions might sleep. Convert might_fault() into a macro that calls __might_fault(), to allow proper file + line messages in case of a might_sleep() warning. 
Reviewed-and-tested-by: Thomas Gleixner Signed-off-by: David Hildenbrand Signed-off-by: Peter Zijlstra (Intel) Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bigeasy@linutronix.de Cc: borntraeger@de.ibm.com Cc: daniel.vetter@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: hocko@suse.cz Cc: hughd@google.com Cc: mst@redhat.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: schwidefsky@de.ibm.com Cc: yang.shi@windriver.com Link: http://lkml.kernel.org/r/1431359540-32227-3-git-send-email-dahi@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3a5b48e52a9e..060dd7b61c6d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -244,7 +244,8 @@ static inline u32 reciprocal_scale(u32 val, u32 ep_ro) #if defined(CONFIG_MMU) && \ (defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)) -void might_fault(void); +#define might_fault() __might_fault(__FILE__, __LINE__) +void __might_fault(const char *file, int line); #else static inline void might_fault(void) { } #endif -- cgit v1.2.3 From 2cb7c9cb426660b5ed58b643d9e7dd5d50ba901f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 May 2015 17:52:09 +0200 Subject: sched/preempt, mm/kmap: Explicitly disable/enable preemption in kmap_atomic_* The existing code relies on pagefault_disable() implicitly disabling preemption, so that no schedule will happen between kmap_atomic() and kunmap_atomic(). Let's make this explicit, to prepare for pagefault_disable() not touching preemption anymore. 
Reviewed-and-tested-by: Thomas Gleixner Signed-off-by: David Hildenbrand Signed-off-by: Peter Zijlstra (Intel) Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bigeasy@linutronix.de Cc: borntraeger@de.ibm.com Cc: daniel.vetter@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: hocko@suse.cz Cc: hughd@google.com Cc: mst@redhat.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: schwidefsky@de.ibm.com Cc: yang.shi@windriver.com Link: http://lkml.kernel.org/r/1431359540-32227-5-git-send-email-dahi@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/highmem.h | 2 ++ include/linux/io-mapping.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 9286a46b7d69..6aefcd0031a6 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -65,6 +65,7 @@ static inline void kunmap(struct page *page) static inline void *kmap_atomic(struct page *page) { + preempt_disable(); pagefault_disable(); return page_address(page); } @@ -73,6 +74,7 @@ static inline void *kmap_atomic(struct page *page) static inline void __kunmap_atomic(void *addr) { pagefault_enable(); + preempt_enable(); } #define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 657fab4efab3..c27dde7215b5 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -141,6 +141,7 @@ static inline void __iomem * io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset) { + preempt_disable(); pagefault_disable(); return ((char __force __iomem *) mapping) + offset; } @@ -149,6 +150,7 @@ static inline void io_mapping_unmap_atomic(void __iomem *vaddr) { pagefault_enable(); + preempt_enable(); } /* Non-atomic map/unmap */ -- cgit v1.2.3 From 
70ffdb9393a7264a069265edded729078dcf0425 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 May 2015 17:52:11 +0200 Subject: mm/fault, arch: Use pagefault_disable() to check for disabled pagefaults in the handler Introduce faulthandler_disabled() and use it to check for irq context and disabled pagefaults (via pagefault_disable()) in the pagefault handlers. Please note that we keep the in_atomic() checks in place - to detect whether in irq context (in which case preemption is always properly disabled). In contrast, preempt_disable() should never be used to disable pagefaults. With !CONFIG_PREEMPT_COUNT, preempt_disable() doesn't modify the preempt counter, and therefore the result of in_atomic() differs. We validate that condition by using might_fault() checks when calling might_sleep(). Therefore, add a comment to faulthandler_disabled(), describing why this is needed. faulthandler_disabled() and pagefault_disable() are defined in linux/uaccess.h, so let's properly add that include to all relevant files. This patch is based on a patch from Thomas Gleixner. 
Reviewed-and-tested-by: Thomas Gleixner Signed-off-by: David Hildenbrand Signed-off-by: Peter Zijlstra (Intel) Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bigeasy@linutronix.de Cc: borntraeger@de.ibm.com Cc: daniel.vetter@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: hocko@suse.cz Cc: hughd@google.com Cc: mst@redhat.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: schwidefsky@de.ibm.com Cc: yang.shi@windriver.com Link: http://lkml.kernel.org/r/1431359540-32227-7-git-send-email-dahi@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/uaccess.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 23290cc93a24..90786d2d74e5 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -59,6 +59,18 @@ static inline void pagefault_enable(void) */ #define pagefault_disabled() (current->pagefault_disabled != 0) +/* + * The pagefault handler is in general disabled by pagefault_disable() or + * when in irq context (via in_atomic()). + * + * This function should only be used by the fault handlers. Other users should + * stick to pagefault_disabled(). + * Please NEVER use preempt_disable() to disable the fault handler. With + * !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled. + * in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT. 
+ */ +#define faulthandler_disabled() (pagefault_disabled() || in_atomic()) + #ifndef ARCH_HAS_NOCACHE_UACCESS static inline unsigned long __copy_from_user_inatomic_nocache(void *to, -- cgit v1.2.3 From 8222dbe21e79338de92d5e1956cd1e3994cc9f93 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 May 2015 17:52:20 +0200 Subject: sched/preempt, mm/fault: Decouple preemption from the page fault logic As the fault handlers now all rely on the pagefault_disabled() checks and implicit preempt_disable() calls by pagefault_disable() have been made explicit, we can completely rely on the pagefault_disableD counter. So let's no longer touch the preempt count when disabling/enabling pagefaults. After a call to pagefault_disable(), pagefault_disabled() will return true, but in_atomic() won't. Reviewed-and-tested-by: Thomas Gleixner Signed-off-by: David Hildenbrand Signed-off-by: Peter Zijlstra (Intel) Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bigeasy@linutronix.de Cc: borntraeger@de.ibm.com Cc: daniel.vetter@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: hocko@suse.cz Cc: hughd@google.com Cc: mst@redhat.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: schwidefsky@de.ibm.com Cc: yang.shi@windriver.com Link: http://lkml.kernel.org/r/1431359540-32227-16-git-send-email-dahi@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/uaccess.h | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 90786d2d74e5..ae572c138607 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -1,7 +1,6 @@ #ifndef __LINUX_UACCESS_H__ #define __LINUX_UACCESS_H__ -#include #include #include @@ -20,17 +19,11 @@ static __always_inline void pagefault_disabled_dec(void) * These routines enable/disable the 
pagefault handler. If disabled, it will * not take any locks and go straight to the fixup table. * - * We increase the preempt and the pagefault count, to be able to distinguish - * whether we run in simple atomic context or in a real pagefault_disable() - * context. - * - * For now, after pagefault_disabled() has been called, we run in atomic - * context. User access methods will not sleep. - * + * User access methods will not sleep when called from a pagefault_disabled() + * environment. */ static inline void pagefault_disable(void) { - preempt_count_inc(); pagefault_disabled_inc(); /* * make sure to have issued the store before a pagefault @@ -47,11 +40,6 @@ static inline void pagefault_enable(void) */ barrier(); pagefault_disabled_dec(); -#ifndef CONFIG_PREEMPT - preempt_count_dec(); -#else - preempt_enable(); -#endif } /* -- cgit v1.2.3 From 80ed87c8a9ca0cad7ca66cf3bbdfb17559a66dcf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 May 2015 14:23:45 +0200 Subject: sched/wait: Introduce TASK_NOLOAD and TASK_IDLE Currently people use TASK_INTERRUPTIBLE to idle kthreads and wait for 'work' because TASK_UNINTERRUPTIBLE contributes to the loadavg. Having all idle kthreads contribute to the loadavg is somewhat silly. Now mostly this works OK, because kthreads have all their signals masked. However there's a few sites where this is causing problems and TASK_UNINTERRUPTIBLE should be used, except for that loadavg issue. This patch adds TASK_NOLOAD which, when combined with TASK_UNINTERRUPTIBLE avoids the loadavg accounting. As most of imagined usage sites are loops where a thread wants to idle, waiting for work, a helper TASK_IDLE is introduced. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Julian Anastasov Cc: Linus Torvalds Cc: NeilBrown Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/sched.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index dd07ac03f82a..7de815c6fa78 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -218,9 +218,10 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); #define TASK_WAKEKILL 128 #define TASK_WAKING 256 #define TASK_PARKED 512 -#define TASK_STATE_MAX 1024 +#define TASK_NOLOAD 1024 +#define TASK_STATE_MAX 2048 -#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWP" +#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN" extern char ___assert_task_state[1 - 2*!!( sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; @@ -230,6 +231,8 @@ extern char ___assert_task_state[1 - 2*!!( #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) #define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) +#define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) + /* Convenience macros for the sake of wake_up */ #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) #define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED) @@ -245,7 +248,8 @@ extern char ___assert_task_state[1 - 2*!!( ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) #define task_contributes_to_load(task) \ ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ - (task->flags & PF_FROZEN) == 0) + (task->flags & PF_FROZEN) == 0 && \ + (task->state & TASK_NOLOAD) == 0) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP -- cgit v1.2.3 From f03123783d4e43cd59df58e23e963136e04f8280 Mon Sep 17 00:00:00 2001 From: Ramakrishna Pallala Date: Thu, 30 Apr 2015 20:44:45 +0530 Subject: extcon: axp288: Add axp288 extcon driver support This patch adds the extcon support for AXP288 PMIC which has the BC1.2 charger detection capability. 
Additionally it also adds the USB mux switching support b/w SOC and PMIC based on GPIO control. Signed-off-by: Ramakrishna Pallala Acked-by: Lee Jones [cw00.choi: Modify the log message to keep the consistent log message pattern] Signed-off-by: Chanwoo Choi --- include/linux/mfd/axp20x.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h index dfabd6db7ddf..4ed8071d062e 100644 --- a/include/linux/mfd/axp20x.h +++ b/include/linux/mfd/axp20x.h @@ -275,4 +275,9 @@ struct axp20x_fg_pdata { int thermistor_curve[MAX_THERM_CURVE_SIZE][2]; }; +struct axp288_extcon_pdata { + /* GPIO pin control to switch D+/D- lines b/w PMIC and SOC */ + struct gpio_desc *gpio_mux_cntl; +}; + #endif /* __LINUX_MFD_AXP20X_H */ -- cgit v1.2.3 From 707d7550875a9bd245ce5f1077d2d2cb44eab218 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Wed, 15 Apr 2015 13:57:51 +0900 Subject: extcon: Add extcon_get_edev_name() API to get the extcon device name This patch adds the extcon_get_edev_name() API to get the name of extcon device because all information included in the structure extcon_dev should be accessed by extcon core API instead of directly accessing the data. Signed-off-by: Chanwoo Choi --- include/linux/extcon.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/extcon.h b/include/linux/extcon.h index 36f49c405dfb..e2cf6254c86a 100644 --- a/include/linux/extcon.h +++ b/include/linux/extcon.h @@ -259,6 +259,10 @@ extern int extcon_unregister_notifier(struct extcon_dev *edev, * This function use phandle of devicetree to get extcon device directly. 
 */ extern struct extcon_dev *extcon_get_edev_by_phandle(struct device *dev, int index); + +/* Following API to get information of extcon device */ +extern const char *extcon_get_edev_name(struct extcon_dev *edev); + #else /* CONFIG_EXTCON */ static inline int extcon_dev_register(struct extcon_dev *edev) { -- cgit v1.2.3 From b9ec23c08a0274d31ee626f14b359563ea0cae46 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Fri, 24 Apr 2015 14:48:52 +0900 Subject: extcon: Fix the checkpatch warning and minor coding style issue This patch cleans up the extcon core driver by fixing the checkpatch warning and minor coding style issue. Signed-off-by: Chanwoo Choi --- include/linux/extcon.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/extcon.h b/include/linux/extcon.h index e2cf6254c86a..799474d9dc48 100644 --- a/include/linux/extcon.h +++ b/include/linux/extcon.h @@ -97,8 +97,8 @@ struct extcon_cable; * @state: Attach/detach state of this extcon. Do not provide at * register-time. * @nh: Notifier for the state change events from this extcon - * @entry: To support list of extcon devices so that users can search - * for extcon devices based on the extcon name. + * @entry: To support list of extcon devices so that users can + * search for extcon devices based on the extcon name. * @lock: * @max_supported: Internal value to store the number of cables. * @extcon_dev_type: Device_type struct to provide attribute_groups @@ -258,7 +258,8 @@ extern int extcon_unregister_notifier(struct extcon_dev *edev, * Following API get the extcon device from devicetree. * This function use phandle of devicetree to get extcon device directly. 
 */ -extern struct extcon_dev *extcon_get_edev_by_phandle(struct device *dev, int index); +extern struct extcon_dev *extcon_get_edev_by_phandle(struct device *dev, + int index); /* Following API to get information of extcon device */ extern const char *extcon_get_edev_name(struct extcon_dev *edev); -- cgit v1.2.3 From 9e86b2ad4c11fd52ee8133abce7a29e0b32d29a7 Mon Sep 17 00:00:00 2001 From: Inha Song Date: Mon, 4 May 2015 13:42:13 +0900 Subject: extcon: arizona: Add support for select accessory detect mode when headphone detection This patch adds support for selecting the accessory detect mode as HPDETL or HPDETR. Arizona provides a headphone detection circuit on the HPDETL and HPDETR pins to measure the impedance of an external load connected to the headphone. Depending on board design, headphone detect pins can change to HPDETR or HPDETL. Signed-off-by: Inha Song Acked-by: Lee Jones Acked-by: Charles Keepax Signed-off-by: Chanwoo Choi --- include/linux/mfd/arizona/pdata.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h index 1789cb0f4f17..aa5c48b06755 100644 --- a/include/linux/mfd/arizona/pdata.h +++ b/include/linux/mfd/arizona/pdata.h @@ -121,6 +121,9 @@ struct arizona_pdata { /** GPIO used for mic isolation with HPDET */ int hpdet_id_gpio; + /** Channel to use for headphone detection */ + unsigned int hpdet_channel; + /** Extra debounce timeout used during initial mic detection (ms) */ int micd_detect_debounce; -- cgit v1.2.3 From ca42aaf0c8616cde6161ea4391dff364efeee46a Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Mon, 18 May 2015 14:19:13 +0200 Subject: time: Refactor msecs_to_jiffies Refactor the msecs_to_jiffies conditional code part in time.c and jiffies.h putting it into conditional functions rather than #ifdefs to improve readability. 
[ tglx: Verified that there is no binary code change ] Signed-off-by: Nicholas Mc Guire Cc: Masahiro Yamada Cc: Sam Ravnborg Cc: Joe Perches Cc: John Stultz Cc: Andrew Hunter Cc: Paul Turner Cc: Michal Marek Link: http://lkml.kernel.org/r/1431951554-5563-2-git-send-email-hofrat@osadl.org Signed-off-by: Thomas Gleixner --- include/linux/jiffies.h | 64 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index c367cbdf73ab..9527ddbb0f1b 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -7,6 +7,7 @@ #include #include #include /* for HZ */ +#include /* * The following defines establish the engineering parameters of the PLL @@ -288,7 +289,68 @@ static inline u64 jiffies_to_nsecs(const unsigned long j) return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC; } -extern unsigned long msecs_to_jiffies(const unsigned int m); +extern unsigned long __msecs_to_jiffies(const unsigned int m); +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) +/* + * HZ is equal to or smaller than 1000, and 1000 is a nice round + * multiple of HZ, divide with the factor between them, but round + * upwards: + */ +static inline unsigned long _msecs_to_jiffies(const unsigned int m) +{ + return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); +} +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) +/* + * HZ is larger than 1000, and HZ is a nice round multiple of 1000 - + * simply multiply with the factor between them. + * + * But first make sure the multiplication result cannot overflow: + */ +static inline unsigned long _msecs_to_jiffies(const unsigned int m) +{ + if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + return m * (HZ / MSEC_PER_SEC); +} +#else +/* + * Generic case - multiply, round and divide. 
But first check that if + * we are doing a net multiplication, that we wouldn't overflow: + */ +static inline unsigned long _msecs_to_jiffies(const unsigned int m) +{ + if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) + >> MSEC_TO_HZ_SHR32; +} +#endif +/** + * msecs_to_jiffies: - convert milliseconds to jiffies + * @m: time in milliseconds + * + * conversion is done as follows: + * + * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) + * + * - 'too large' values [that would result in larger than + * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. + * + * - all other values are converted to jiffies by either multiplying + * the input value by a factor or dividing it with a factor and + * handling any 32-bit overflows. + * for the details see __msecs_to_jiffies() + * + * the HZ range specific helpers _msecs_to_jiffies() are called from + * __msecs_to_jiffies(). + */ +static inline unsigned long msecs_to_jiffies(const unsigned int m) +{ + return __msecs_to_jiffies(m); +} + extern unsigned long usecs_to_jiffies(const unsigned int u); extern unsigned long timespec_to_jiffies(const struct timespec *value); extern void jiffies_to_timespec(const unsigned long jiffies, -- cgit v1.2.3 From daa67b4b70568a07fef3cffacb2055891bf42ddb Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Mon, 18 May 2015 14:19:14 +0200 Subject: time: Allow gcc to fold constants when possible To allow constant folding in msecs_to_jiffies() conditionally calls the HZ dependent _msecs_to_jiffies() helpers or, when gcc can not figure out constant folding, __msecs_to_jiffies which is the renamed original msecs_to_jiffies() function. 
Signed-off-by: Nicholas Mc Guire Cc: Masahiro Yamada Cc: Sam Ravnborg Cc: Joe Perches Cc: John Stultz Cc: Andrew Hunter Cc: Paul Turner Cc: Michal Marek Link: http://lkml.kernel.org/r/1431951554-5563-3-git-send-email-hofrat@osadl.org Signed-off-by: Thomas Gleixner --- include/linux/jiffies.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 9527ddbb0f1b..5e75af6cf1bc 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -343,12 +343,24 @@ static inline unsigned long _msecs_to_jiffies(const unsigned int m) * handling any 32-bit overflows. * for the details see __msecs_to_jiffies() * - * the HZ range specific helpers _msecs_to_jiffies() are called from - * __msecs_to_jiffies(). + * msecs_to_jiffies() checks for the passed in value being a constant + * via __builtin_constant_p() allowing gcc to eliminate most of the + * code, __msecs_to_jiffies() is called if the value passed does not + * allow constant folding and the actual conversion must be done at + * runtime. + * the HZ range specific helpers _msecs_to_jiffies() are called both + * directly here and from __msecs_to_jiffies() in the case where + * constant folding is not possible. 
*/ static inline unsigned long msecs_to_jiffies(const unsigned int m) { - return __msecs_to_jiffies(m); + if (__builtin_constant_p(m)) { + if ((int)m < 0) + return MAX_JIFFY_OFFSET; + return _msecs_to_jiffies(m); + } else { + return __msecs_to_jiffies(m); + } } extern unsigned long usecs_to_jiffies(const unsigned int u); -- cgit v1.2.3 From 0a4377de305684c883bf90ad21e3cbdeead70f5c Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 19 May 2015 17:07:14 +0800 Subject: genirq: Introduce irq_set_vcpu_affinity() to target an interrupt to a VCPU With Posted-Interrupts support in Intel CPU and IOMMU, an external interrupt from assigned-devices could be directly delivered to a virtual CPU in a virtual machine. Instead of hacking KVM and Intel IOMMU drivers, we propose a platform independent interface to target an interrupt to a specific virtual CPU in a virtual machine, or set virtual CPU affinity for an interrupt. By adopting this new interface and the hierarchy irqdomain, we could easily support posted-interrupts on Intel platforms, and also provide flexible enough interfaces for other platforms to support similar features. 
Here is the usage scenario for this interface: Guest update MSI/MSI-X interrupt configuration -->QEMU and KVM handle this -->KVM call this interface (passing posted interrupts descriptor and guest vector) -->irq core will transfer the control to IOMMU -->IOMMU will do the real work of updating IRTE (IRTE has new format for VT-d Posted-Interrupts) Signed-off-by: Jiang Liu Signed-off-by: Feng Wu Link: http://lkml.kernel.org/r/1432026437-16560-2-git-send-email-feng.wu@intel.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 62c6901cab55..48cb7d1aa58f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -327,6 +327,7 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * @irq_write_msi_msg: optional to write message content for MSI * @irq_get_irqchip_state: return the internal state of an interrupt * @irq_set_irqchip_state: set the internal state of a interrupt + * @irq_set_vcpu_affinity: optional to target a vCPU in a virtual machine * @flags: chip specific flags */ struct irq_chip { @@ -369,6 +370,8 @@ struct irq_chip { int (*irq_get_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool *state); int (*irq_set_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool state); + int (*irq_set_vcpu_affinity)(struct irq_data *data, void *vcpu_info); + unsigned long flags; }; @@ -422,6 +425,7 @@ extern void irq_cpu_online(void); extern void irq_cpu_offline(void); extern int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *cpumask, bool force); +extern int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info); #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) void irq_move_irq(struct irq_data *data); @@ -467,6 +471,8 @@ extern int irq_chip_set_affinity_parent(struct irq_data *data, const struct cpumask *dest, bool force); extern int 
irq_chip_set_wake_parent(struct irq_data *data, unsigned int on); +extern int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, + void *vcpu_info); #endif /* Handling of unhandled and spurious interrupts: */ -- cgit v1.2.3 From 8fff52fd50934580c5108afed12043a774edf728 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 3 Apr 2015 09:04:04 +0530 Subject: clockevents: Introduce CLOCK_EVT_STATE_ONESHOT_STOPPED state When no timers/hrtimers are pending, the expiry time is set to a special value: 'KTIME_MAX'. This normally happens with NO_HZ_{IDLE|FULL} in both LOWRES/HIGHRES modes. When 'expiry == KTIME_MAX', we either cancel the 'tick-sched' hrtimer (NOHZ_MODE_HIGHRES) or skip reprogramming clockevent device (NOHZ_MODE_LOWRES). But, the clockevent device is already reprogrammed from the tick-handler for next tick. As the clock event device is programmed in ONESHOT mode it will at least fire one more time (unnecessarily). Timers on few implementations (like arm_arch_timer, etc.) only support PERIODIC mode and their drivers emulate ONESHOT over that. Which means that on these platforms we will get spurious interrupts periodically (at last programmed interval rate, normally tick rate). In order to avoid spurious interrupts, the clockevent device should be stopped or its interrupts should be masked. A simple (yet hacky) solution to get this fixed could be: update hrtimer_force_reprogram() to always reprogram clockevent device and update clockevent drivers to STOP generating events (or delay it to max time) when 'expires' is set to KTIME_MAX. But the drawback here is that every clockevent driver has to be hacked for this particular case and its very easy for new ones to miss this. However, Thomas suggested to add an optional state ONESHOT_STOPPED to solve this problem: lkml.org/lkml/2014/5/9/508. This patch adds support for ONESHOT_STOPPED state in clockevents core. 
It will only be available to drivers that implement the state-specific callbacks instead of the legacy ->set_mode() callback. Signed-off-by: Viresh Kumar Reviewed-by: Preeti U. Murthy Cc: linaro-kernel@lists.linaro.org Cc: Frederic Weisbecker Cc: Kevin Hilman Cc: Daniel Lezcano Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/b8b383a03ac07b13312c16850b5106b82e4245b5.1428031396.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 96c280b2c263..271fa4c8eb29 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -37,12 +37,15 @@ enum clock_event_mode { * reached from DETACHED or SHUTDOWN. * ONESHOT: Device is programmed to generate event only once. Can be reached * from DETACHED or SHUTDOWN. + * ONESHOT_STOPPED: Device was programmed in ONESHOT mode and is temporarily + * stopped. */ enum clock_event_state { CLOCK_EVT_STATE_DETACHED, CLOCK_EVT_STATE_SHUTDOWN, CLOCK_EVT_STATE_PERIODIC, CLOCK_EVT_STATE_ONESHOT, + CLOCK_EVT_STATE_ONESHOT_STOPPED, }; /* @@ -90,6 +93,7 @@ enum clock_event_state { * @set_mode: legacy set mode function, only for modes <= CLOCK_EVT_MODE_RESUME. * @set_state_periodic: switch state to periodic, if !set_mode * @set_state_oneshot: switch state to oneshot, if !set_mode + * @set_state_oneshot_stopped: switch state to oneshot_stopped, if !set_mode * @set_state_shutdown: switch state to shutdown, if !set_mode * @tick_resume: resume clkevt device, if !set_mode * @broadcast: function to broadcast events @@ -121,11 +125,12 @@ struct clock_event_device { * State transition callback(s): Only one of the two groups should be * defined: * - set_mode(), only for modes <= CLOCK_EVT_MODE_RESUME. - * - set_state_{shutdown|periodic|oneshot}(), tick_resume(). + * - set_state_{shutdown|periodic|oneshot|oneshot_stopped}(), tick_resume(). 
*/ void (*set_mode)(enum clock_event_mode mode, struct clock_event_device *); int (*set_state_periodic)(struct clock_event_device *); int (*set_state_oneshot)(struct clock_event_device *); + int (*set_state_oneshot_stopped)(struct clock_event_device *); int (*set_state_shutdown)(struct clock_event_device *); int (*tick_resume)(struct clock_event_device *); -- cgit v1.2.3 From 4ecd4fef3a074c8bb43c391a57742c422469ebbd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 May 2015 09:38:13 +0200 Subject: block: use an atomic_t for mq_freeze_depth lockdep gets unhappy about the not disabling irqs when using the queue_lock around it. Instead of trying to fix that up just switch to an atomic_t and get rid of the lock. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2da818a48097..bc917956a6d0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -444,7 +444,7 @@ struct request_queue { struct mutex sysfs_lock; int bypass_depth; - int mq_freeze_depth; + atomic_t mq_freeze_depth; #if defined(CONFIG_BLK_DEV_BSG) bsg_job_fn *bsg_job_fn; -- cgit v1.2.3 From b25de9d6da49b1a8760a89672283128aa8c78345 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2015 21:41:01 +0200 Subject: block: remove BIO_EOPNOTSUPP Since the big barrier rewrite/removal in 2007 we never fail FLUSH or FUA requests, which means we can remove the magic BIO_EOPNOTSUPP flag to help propagating those to the buffer_head layer. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jeff Moyer Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 93d2e7153816..daf95915d104 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -118,7 +118,6 @@ struct bio { #define BIO_CLONED 4 /* doesn't own data */ #define BIO_BOUNCED 5 /* bio is a bounce bio */ #define BIO_USER_MAPPED 6 /* contains user pages */ -#define BIO_EOPNOTSUPP 7 /* not supported */ #define BIO_NULL_MAPPED 8 /* contains invalid user pages */ #define BIO_QUIET 9 /* Make BIO Quiet */ #define BIO_SNAP_STABLE 10 /* bio data must be snapshotted during write */ -- cgit v1.2.3 From 97ca223c3b37ed12a5b67a5dc6247e5a4799d337 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2015 21:41:02 +0200 Subject: block: remove unused BIO_RW_BLOCK and BIO_EOF flags Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index daf95915d104..09c7a2cd48ef 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -112,8 +112,6 @@ struct bio { * bio flags */ #define BIO_UPTODATE 0 /* ok after I/O completion */ -#define BIO_RW_BLOCK 1 /* RW_AHEAD set, and read/write would block */ -#define BIO_EOF 2 /* out-out-bounds error */ #define BIO_SEG_VALID 3 /* bi_phys_segments valid */ #define BIO_CLONED 4 /* doesn't own data */ #define BIO_BOUNCED 5 /* bio is a bounce bio */ -- cgit v1.2.3 From 4e3d9cb0134fea035e6eb1707e5e7d8aaffa186d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 May 2015 17:14:51 +0200 Subject: jiffies: Remove the extra indentation level Somehow I missed to clean that up when applying the patches. Fix it up now. 
Reported-by: Joe Perches Signed-off-by: Thomas Gleixner Cc: Nicholas Mc Guire --- include/linux/jiffies.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 5e75af6cf1bc..3bde5eb8568b 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -298,7 +298,7 @@ extern unsigned long __msecs_to_jiffies(const unsigned int m); */ static inline unsigned long _msecs_to_jiffies(const unsigned int m) { - return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); + return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); } #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) /* @@ -309,9 +309,9 @@ static inline unsigned long _msecs_to_jiffies(const unsigned int m) */ static inline unsigned long _msecs_to_jiffies(const unsigned int m) { - if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; - return m * (HZ / MSEC_PER_SEC); + if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + return m * (HZ / MSEC_PER_SEC); } #else /* @@ -320,11 +320,10 @@ static inline unsigned long _msecs_to_jiffies(const unsigned int m) */ static inline unsigned long _msecs_to_jiffies(const unsigned int m) { - if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; + if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; - return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) - >> MSEC_TO_HZ_SHR32; + return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) >> MSEC_TO_HZ_SHR32; } #endif /** -- cgit v1.2.3 From b2dbe0a60f1bcf4db5c701f1577b3135c3159eb5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 May 2015 09:18:28 -0600 Subject: block: collapse bio bit space Various previous patches removed bits and left holes, collapse them all. Leave the reset start bit where it is, we don't need to change that. 
Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 09c7a2cd48ef..3f4ded0b1a34 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -112,15 +112,15 @@ struct bio { * bio flags */ #define BIO_UPTODATE 0 /* ok after I/O completion */ -#define BIO_SEG_VALID 3 /* bi_phys_segments valid */ -#define BIO_CLONED 4 /* doesn't own data */ -#define BIO_BOUNCED 5 /* bio is a bounce bio */ -#define BIO_USER_MAPPED 6 /* contains user pages */ -#define BIO_NULL_MAPPED 8 /* contains invalid user pages */ -#define BIO_QUIET 9 /* Make BIO Quiet */ -#define BIO_SNAP_STABLE 10 /* bio data must be snapshotted during write */ -#define BIO_CHAIN 11 /* chained bio, ->bi_remaining in effect */ -#define BIO_REFFED 12 /* bio has elevated ->bi_cnt */ +#define BIO_SEG_VALID 1 /* bi_phys_segments valid */ +#define BIO_CLONED 2 /* doesn't own data */ +#define BIO_BOUNCED 3 /* bio is a bounce bio */ +#define BIO_USER_MAPPED 4 /* contains user pages */ +#define BIO_NULL_MAPPED 5 /* contains invalid user pages */ +#define BIO_QUIET 6 /* Make BIO Quiet */ +#define BIO_SNAP_STABLE 7 /* bio data must be snapshotted during write */ +#define BIO_CHAIN 8 /* chained bio, ->bi_remaining in effect */ +#define BIO_REFFED 9 /* bio has elevated ->bi_cnt */ /* * Flags starting here get preserved by bio_reset() - this includes -- cgit v1.2.3 From 343df3c79c62b644ce6ff5dff96c9e0be1ecb242 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2015 09:23:23 +0200 Subject: suspend: simplify block I/O handling Stop abusing struct page functionality and the swap end_io handler, and instead add a modified version of the blk-lib.c bio_batch helpers. Also move the block I/O code into swap.c as they are directly tied into each other. 
Signed-off-by: Christoph Hellwig Tested-by: Pavel Machek Tested-by: Ming Lin Acked-by: Pavel Machek Acked-by: Rafael J. Wysocki Signed-off-by: Jens Axboe --- include/linux/swap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index cee108cbe2d5..38874729dc5f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -377,7 +377,6 @@ extern void end_swap_bio_write(struct bio *bio, int err); extern int __swap_writepage(struct page *page, struct writeback_control *wbc, void (*end_write_func)(struct bio *, int)); extern int swap_set_page_dirty(struct page *page); -extern void end_swap_bio_read(struct bio *bio, int err); int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block); -- cgit v1.2.3 From 3520469d65f26a1cd2f610f5d5de976f78db74fe Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 2 Apr 2015 11:20:48 +0200 Subject: KVM: export __gfn_to_pfn_memslot, drop gfn_to_pfn_async gfn_to_pfn_async is used in just one place, and because of x86-specific treatment that place will need to look at the memory slot. Hence inline it into try_async_pf and export __gfn_to_pfn_memslot. The patch also switches the subsequent call to gfn_to_pfn_prot to use __gfn_to_pfn_memslot. This is a small optimization. Finally, remove the now-unused async argument of __gfn_to_pfn. 
Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b7a08cd6f4a8..87fd74a04005 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -539,13 +539,13 @@ void kvm_release_page_dirty(struct page *page); void kvm_set_page_accessed(struct page *page); pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); -pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, - bool write_fault, bool *writable); pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable); pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn); pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn); +pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, + bool *async, bool write_fault, bool *writable); void kvm_release_pfn_clean(pfn_t pfn); void kvm_set_pfn_dirty(pfn_t pfn); -- cgit v1.2.3 From cad706df7e4a00a595f2662f32c0fc174aa4e61f Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Tue, 19 May 2015 12:01:18 +0200 Subject: livepatch: make kobject in klp_object statically allocated Make kobj variable (of type struct kobject) statically allocated in klp_object structure. It will allow us to move in the func-object-patch hierarchy through kobject links. The only reason to have it dynamic was to not have empty release callback in the code. However we have empty callbacks for function and patch in the code now, so it is no longer valid and the advantage of static allocation is clear. 
Signed-off-by: Miroslav Benes Signed-off-by: Jiri Slaby Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- include/linux/livepatch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index ee6dbb39a809..fe45f2f02c8d 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -99,7 +99,7 @@ struct klp_object { struct klp_func *funcs; /* internal */ - struct kobject *kobj; + struct kobject kobj; struct module *mod; enum klp_state state; }; -- cgit v1.2.3 From 8cdd043ab32c2ff28d2a77c514a768a9edce244c Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 19 May 2015 12:01:19 +0200 Subject: livepatch: introduce patch/func-walking helpers klp_for_each_object and klp_for_each_func are now used all over the code. One need not think what is the proper condition to check in the for loop now. Signed-off-by: Jiri Slaby Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- include/linux/livepatch.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index fe45f2f02c8d..31db7a05dd36 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -123,6 +123,12 @@ struct klp_patch { enum klp_state state; }; +#define klp_for_each_object(patch, obj) \ + for (obj = patch->objs; obj->funcs; obj++) + +#define klp_for_each_func(obj, func) \ + for (func = obj->funcs; func->old_name; func++) + int klp_register_patch(struct klp_patch *); int klp_unregister_patch(struct klp_patch *); int klp_enable_patch(struct klp_patch *); -- cgit v1.2.3 From 4990d4fe327b9d9a7a3be7103a82699406fdde69 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 18 May 2015 15:40:29 -0700 Subject: PM / Wakeirq: Add automated device wake IRQ handling Turns out we can automate the handling for the device_may_wakeup() quite a bit by using the kernel wakeup source list as suggested by Rafael J. Wysocki . 
And as some hardware has separate dedicated wake-up interrupt in addition to the IO interrupt, we can automate the handling by adding a generic threaded interrupt handler that just calls the device PM runtime to wake up the device. This allows dropping code from device drivers as we currently are doing it in multiple ways, and often wrong. For most drivers, we should be able to drop the following boilerplate code from runtime_suspend and runtime_resume functions: ... device_init_wakeup(dev, true); ... if (device_may_wakeup(dev)) enable_irq_wake(irq); ... if (device_may_wakeup(dev)) disable_irq_wake(irq); ... device_init_wakeup(dev, false); ... We can replace it with just the following init and exit time code: ... device_init_wakeup(dev, true); dev_pm_set_wake_irq(dev, irq); ... dev_pm_clear_wake_irq(dev); device_init_wakeup(dev, false); ... And for hardware with dedicated wake-up interrupts: ... device_init_wakeup(dev, true); dev_pm_set_dedicated_wake_irq(dev, irq); ... dev_pm_clear_wake_irq(dev); device_init_wakeup(dev, false); ... Signed-off-by: Tony Lindgren Signed-off-by: Rafael J. 
Wysocki --- include/linux/pm.h | 2 ++ include/linux/pm_wakeirq.h | 52 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pm_wakeup.h | 9 ++++++++ 3 files changed, 63 insertions(+) create mode 100644 include/linux/pm_wakeirq.h (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index 2d29c64f8fb1..1c4ed0cb7907 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -529,6 +529,7 @@ enum rpm_request { }; struct wakeup_source; +struct wake_irq; struct pm_domain_data; struct pm_subsys_data { @@ -568,6 +569,7 @@ struct dev_pm_info { unsigned long timer_expires; struct work_struct work; wait_queue_head_t wait_queue; + struct wake_irq *wakeirq; atomic_t usage_count; atomic_t child_count; unsigned int disable_depth:3; diff --git a/include/linux/pm_wakeirq.h b/include/linux/pm_wakeirq.h new file mode 100644 index 000000000000..4046fa1b7d25 --- /dev/null +++ b/include/linux/pm_wakeirq.h @@ -0,0 +1,52 @@ +/* + * pm_wakeirq.h - Device wakeirq helper functions + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed "as is" WITHOUT ANY WARRANTY of any + * kind, whether express or implied; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef _LINUX_PM_WAKEIRQ_H +#define _LINUX_PM_WAKEIRQ_H + +#ifdef CONFIG_PM + +extern int dev_pm_set_wake_irq(struct device *dev, int irq); +extern int dev_pm_set_dedicated_wake_irq(struct device *dev, + int irq); +extern void dev_pm_clear_wake_irq(struct device *dev); +extern void dev_pm_enable_wake_irq(struct device *dev); +extern void dev_pm_disable_wake_irq(struct device *dev); + +#else /* !CONFIG_PM */ + +static inline int dev_pm_set_wake_irq(struct device *dev, int irq) +{ + return 0; +} + +static inline int dev_pm_set_dedicated__wake_irq(struct device *dev, + int irq) +{ + return 0; +} + +static inline void dev_pm_clear_wake_irq(struct device *dev) +{ +} + +static inline void dev_pm_enable_wake_irq(struct device *dev) +{ +} + +static inline void dev_pm_disable_wake_irq(struct device *dev) +{ +} + +#endif /* CONFIG_PM */ +#endif /* _LINUX_PM_WAKEIRQ_H */ diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index a0f70808d7f4..a3447932df1f 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -28,9 +28,17 @@ #include +struct wake_irq; + /** * struct wakeup_source - Representation of wakeup sources * + * @name: Name of the wakeup source + * @entry: Wakeup source list entry + * @lock: Wakeup source lock + * @wakeirq: Optional device specific wakeirq + * @timer: Wakeup timer list + * @timer_expires: Wakeup timer expiration * @total_time: Total time this wakeup source has been active. * @max_time: Maximum time this wakeup source has been continuously active. * @last_time: Monotonic clock when the wakeup source's was touched last time. @@ -47,6 +55,7 @@ struct wakeup_source { const char *name; struct list_head entry; spinlock_t lock; + struct wake_irq *wakeirq; struct timer_list timer; unsigned long timer_expires; ktime_t total_time; -- cgit v1.2.3 From ecc8617053e0a97272ef2eee138809f30080e84b Mon Sep 17 00:00:00 2001 From: "Luis R. 
Rodriguez" Date: Mon, 30 Mar 2015 16:20:03 -0700 Subject: module: add extra argument for parse_params() callback This adds an extra argument onto parse_params() to be used as a way to make the unused callback a bit more useful and generic by allowing the caller to pass on a data structure of its choice. An example use case is to allow us to easily make module parameters for every module which we will do next. @ parse @ identifier name, args, params, num, level_min, level_max; identifier unknown, param, val, doing; type s16; @@ extern char *parse_args(const char *name, char *args, const struct kernel_param *params, unsigned num, s16 level_min, s16 level_max, + void *arg, int (*unknown)(char *param, char *val, const char *doing + , void *arg )); @ parse_mod @ identifier name, args, params, num, level_min, level_max; identifier unknown, param, val, doing; type s16; @@ char *parse_args(const char *name, char *args, const struct kernel_param *params, unsigned num, s16 level_min, s16 level_max, + void *arg, int (*unknown)(char *param, char *val, const char *doing + , void *arg )) { ... } @ parse_args_found @ expression R, E1, E2, E3, E4, E5, E6; identifier func; @@ ( R = parse_args(E1, E2, E3, E4, E5, E6, + NULL, func); | R = parse_args(E1, E2, E3, E4, E5, E6, + NULL, &func); | R = parse_args(E1, E2, E3, E4, E5, E6, + NULL, NULL); | parse_args(E1, E2, E3, E4, E5, E6, + NULL, func); | parse_args(E1, E2, E3, E4, E5, E6, + NULL, &func); | parse_args(E1, E2, E3, E4, E5, E6, + NULL, NULL); ) @ parse_args_unused depends on parse_args_found @ identifier parse_args_found.func; @@ int func(char *param, char *val, const char *unused + , void *arg ) { ... 
} @ mod_unused depends on parse_args_found @ identifier parse_args_found.func; expression A1, A2, A3; @@ - func(A1, A2, A3); + func(A1, A2, A3, NULL); Generated-by: Coccinelle SmPL Cc: cocci@systeme.lip6.fr Cc: Tejun Heo Cc: Arjan van de Ven Cc: Greg Kroah-Hartman Cc: Rusty Russell Cc: Christoph Hellwig Cc: Felipe Contreras Cc: Ewan Milne Cc: Jean Delvare Cc: Hannes Reinecke Cc: Jani Nikula Cc: linux-kernel@vger.kernel.org Reviewed-by: Tejun Heo Acked-by: Rusty Russell Signed-off-by: Luis R. Rodriguez Signed-off-by: Greg Kroah-Hartman --- include/linux/moduleparam.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 1c9effa25e26..13923709d30d 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -357,8 +357,9 @@ extern char *parse_args(const char *name, unsigned num, s16 level_min, s16 level_max, + void *arg, int (*unknown)(char *param, char *val, - const char *doing)); + const char *doing, void *arg)); /* Called by module remove. */ #ifdef CONFIG_SYSFS -- cgit v1.2.3 From 765230b5f084863183aa8adb3405ab3f32c0b16e Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 30 Mar 2015 16:20:04 -0700 Subject: driver-core: add asynchronous probing support for drivers Some devices take a long time when initializing, and not all drivers are suited to initialize their devices when they are open. For example, input drivers need to interrogate their devices in order to publish device's capabilities before userspace will open them. When such drivers are compiled into kernel they may stall entire kernel initialization. This change allows drivers request for their probe functions to be called asynchronously during driver and device registration (manual binding is still synchronous). Because async_schedule is used to perform asynchronous calls module loading will still wait for the probing to complete. 
Note that the end goal is to make the probing asynchronous by default, so annotating drivers with PROBE_PREFER_ASYNCHRONOUS is a temporary measure that allows us to speed up boot process while we validating and fixing the rest of the drivers and preparing userspace. This change is based on earlier patch by "Luis R. Rodriguez" Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 6558af90c8fe..7857b46c548b 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -195,6 +195,31 @@ extern int bus_unregister_notifier(struct bus_type *bus, extern struct kset *bus_get_kset(struct bus_type *bus); extern struct klist *bus_get_device_klist(struct bus_type *bus); +/** + * enum probe_type - device driver probe type to try + * Device drivers may opt in for special handling of their + * respective probe routines. This tells the core what to + * expect and prefer. + * + * @PROBE_SYNCHRONOUS: Default. Drivers expect their probe routines + * to run synchronously with driver and device registration + * (with the exception of -EPROBE_DEFER handling - re-probing + * always ends up being done asynchronously). + * @PROBE_PREFER_ASYNCHRONOUS: Drivers for "slow" devices which + * probing order is not essential for booting the system may + * opt into executing their probes asynchronously. + * + * Note that the end goal is to switch the kernel to use asynchronous + * probing by default, so annotating drivers with + * %PROBE_PREFER_ASYNCHRONOUS is a temporary measure that allows us + * to speed up boot process while we are validating the rest of the + * drivers. + */ +enum probe_type { + PROBE_SYNCHRONOUS, + PROBE_PREFER_ASYNCHRONOUS, +}; + /** * struct device_driver - The basic device driver structure * @name: Name of the device driver. 
@@ -202,6 +227,7 @@ extern struct klist *bus_get_device_klist(struct bus_type *bus); * @owner: The module owner. * @mod_name: Used for built-in modules. * @suppress_bind_attrs: Disables bind/unbind via sysfs. + * @probe_type: Type of the probe (synchronous or asynchronous) to use. * @of_match_table: The open firmware table. * @acpi_match_table: The ACPI match table. * @probe: Called to query the existence of a specific device, @@ -235,6 +261,7 @@ struct device_driver { const char *mod_name; /* used for built-in modules */ bool suppress_bind_attrs; /* disables bind/unbind via sysfs */ + enum probe_type probe_type; const struct of_device_id *of_match_table; const struct acpi_device_id *acpi_match_table; @@ -975,6 +1002,7 @@ extern int __must_check device_bind_driver(struct device *dev); extern void device_release_driver(struct device *dev); extern int __must_check device_attach(struct device *dev); extern int __must_check driver_attach(struct device_driver *drv); +extern void device_initial_probe(struct device *dev); extern int __must_check device_reprobe(struct device *dev); /* -- cgit v1.2.3 From f2411da746985e60d4d087f3a43e271c61785927 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Mon, 30 Mar 2015 16:20:05 -0700 Subject: driver-core: add driver module asynchronous probe support Some init systems may wish to express the desire to have device drivers run their probe() code asynchronously. This implements support for this and allows userspace to request async probe as a preference through a generic shared device driver module parameter, async_probe. Implementation for async probe is supported through a module parameter given that since synchronous probe has been prevalent for years some userspace might exist which relies on the fact that the device driver will probe synchronously and the assumption that devices it provides will be immediately available after this. Signed-off-by: Luis R. 
Rodriguez Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 8 +++++--- include/linux/module.h | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 7857b46c548b..77b7cd9e5467 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -201,10 +201,12 @@ extern struct klist *bus_get_device_klist(struct bus_type *bus); * respective probe routines. This tells the core what to * expect and prefer. * - * @PROBE_SYNCHRONOUS: Default. Drivers expect their probe routines + * @PROBE_DEFAULT_STRATEGY: Drivers expect their probe routines * to run synchronously with driver and device registration * (with the exception of -EPROBE_DEFER handling - re-probing - * always ends up being done asynchronously). + * always ends up being done asynchronously) unless user + * explicitly requested asynchronous probing via module + * parameter. * @PROBE_PREFER_ASYNCHRONOUS: Drivers for "slow" devices which * probing order is not essential for booting the system may * opt into executing their probes asynchronously. @@ -216,7 +218,7 @@ extern struct klist *bus_get_device_klist(struct bus_type *bus); * drivers. */ enum probe_type { - PROBE_SYNCHRONOUS, + PROBE_DEFAULT_STRATEGY, PROBE_PREFER_ASYNCHRONOUS, }; diff --git a/include/linux/module.h b/include/linux/module.h index c883b86ea964..f46a47d3c0dc 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -257,6 +257,8 @@ struct module { bool sig_ok; #endif + bool async_probe_requested; + /* symbols that will be GPL-only in the near future. */ const struct kernel_symbol *gpl_future_syms; const unsigned long *gpl_future_crcs; -- cgit v1.2.3 From d173a137c5bd95ee29d02705e5fa8890ef149718 Mon Sep 17 00:00:00 2001 From: "Luis R. 
Rodriguez" Date: Mon, 30 Mar 2015 16:20:06 -0700 Subject: driver-core: enable drivers to opt-out of async probe There are drivers that can not be probed asynchronously. One such group is platform drivers registered with platform_driver_probe(), which expects the driver's probe routine to be discarded after the driver has been registered and initial binding attempt executed. Also platform_driver_probe() returns an error when no devices were bound to the driver, allowing failing to load such driver module altogether. Other drivers do not work well with asynchronous probing because of driver bug or not optimal driver organization. To allow using such drivers even when user requests asynchronous probing as default boot strategy, let's allow them to opt out. Signed-off-by: Luis R. Rodriguez Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 77b7cd9e5467..00ac57c26615 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -201,15 +201,15 @@ extern struct klist *bus_get_device_klist(struct bus_type *bus); * respective probe routines. This tells the core what to * expect and prefer. * - * @PROBE_DEFAULT_STRATEGY: Drivers expect their probe routines - * to run synchronously with driver and device registration - * (with the exception of -EPROBE_DEFER handling - re-probing - * always ends up being done asynchronously) unless user - * explicitly requested asynchronous probing via module - * parameter. + * @PROBE_DEFAULT_STRATEGY: Used by drivers that work equally well + * whether probed synchronously or asynchronously. * @PROBE_PREFER_ASYNCHRONOUS: Drivers for "slow" devices which * probing order is not essential for booting the system may * opt into executing their probes asynchronously. 
+ * @PROBE_FORCE_SYNCHRONOUS: Use this to annotate drivers that need + * their probe routines to run synchronously with driver and + * device registration (with the exception of -EPROBE_DEFER + * handling - re-probing always ends up being done asynchronously). * * Note that the end goal is to switch the kernel to use asynchronous * probing by default, so annotating drivers with @@ -220,6 +220,7 @@ extern struct klist *bus_get_device_klist(struct bus_type *bus); enum probe_type { PROBE_DEFAULT_STRATEGY, PROBE_PREFER_ASYNCHRONOUS, + PROBE_FORCE_SYNCHRONOUS, }; /** -- cgit v1.2.3 From ec0ccc16a09fc32f7142ef3ddf1c2276fbbb35d0 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 30 Mar 2015 16:20:09 -0700 Subject: module: add core_param_unsafe Similarly to module_param_unsafe(), add the helper to be used by core code wishing to expose unsafe debugging or testing parameters that taint the kernel when set. Acked-by: Rusty Russell Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- include/linux/moduleparam.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 13923709d30d..6480dcaca275 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -310,6 +310,15 @@ static inline void __kernel_param_unlock(void) #define core_param(name, var, type, perm) \ param_check_##type(name, &(var)); \ __module_param_call("", name, &param_ops_##type, &var, perm, -1, 0) + +/** + * core_param_unsafe - same as core_param but taints kernel + */ +#define core_param_unsafe(name, var, type, perm) \ + param_check_##type(name, &(var)); \ + __module_param_call("", name, &param_ops_##type, &var, perm, \ + -1, KERNEL_PARAM_FL_UNSAFE) + #endif /* !MODULE */ /** -- cgit v1.2.3 From 985eba3aba77a997f65109e1418837b1d8b8512a Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 3 Dec 2014 16:46:14 +0100 Subject: mfd: syscon: Add Atmel MC (Memory Controller) registers 
definition The at91rm9200 SoC embeds a Memory Controller block which is used to configure several aspects of the platform: - AHB/APB Bus behavior - SDRAM Controller - EBI (External Bus Interface) and SMC (Static Memory Controller) config Those registers might be accessed by different drivers, hence we need to define it as a syscon device. Signed-off-by: Boris Brezillon Signed-off-by: Alexandre Belloni Acked-by: Lee Jones Acked-by: Nicolas Ferre --- include/linux/mfd/syscon/atmel-mc.h | 144 ++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 include/linux/mfd/syscon/atmel-mc.h (limited to 'include/linux') diff --git a/include/linux/mfd/syscon/atmel-mc.h b/include/linux/mfd/syscon/atmel-mc.h new file mode 100644 index 000000000000..afd9b8f1e363 --- /dev/null +++ b/include/linux/mfd/syscon/atmel-mc.h @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2005 Ivan Kokshaysky + * Copyright (C) SAN People + * + * Memory Controllers (MC, EBI, SMC, SDRAMC, BFC) - System peripherals + * registers. + * Based on AT91RM9200 datasheet revision E. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#ifndef _LINUX_MFD_SYSCON_ATMEL_MC_H_ +#define _LINUX_MFD_SYSCON_ATMEL_MC_H_ + +/* Memory Controller */ +#define AT91_MC_RCR 0x00 +#define AT91_MC_RCB BIT(0) + +#define AT91_MC_ASR 0x04 +#define AT91_MC_UNADD BIT(0) +#define AT91_MC_MISADD BIT(1) +#define AT91_MC_ABTSZ GENMASK(9, 8) +#define AT91_MC_ABTSZ_BYTE (0 << 8) +#define AT91_MC_ABTSZ_HALFWORD (1 << 8) +#define AT91_MC_ABTSZ_WORD (2 << 8) +#define AT91_MC_ABTTYP GENMASK(11, 10) +#define AT91_MC_ABTTYP_DATAREAD (0 << 10) +#define AT91_MC_ABTTYP_DATAWRITE (1 << 10) +#define AT91_MC_ABTTYP_FETCH (2 << 10) +#define AT91_MC_MST(n) BIT(16 + (n)) +#define AT91_MC_SVMST(n) BIT(24 + (n)) + +#define AT91_MC_AASR 0x08 + +#define AT91_MC_MPR 0x0c +#define AT91_MPR_MSTP(n) GENMASK(2 + ((x) * 4), ((x) * 4)) + +/* External Bus Interface (EBI) registers */ +#define AT91_MC_EBI_CSA 0x60 +#define AT91_MC_EBI_CS(n) BIT(x) +#define AT91_MC_EBI_NUM_CS 8 + +#define AT91_MC_EBI_CFGR 0x64 +#define AT91_MC_EBI_DBPUC BIT(0) + +/* Static Memory Controller (SMC) registers */ +#define AT91_MC_SMC_CSR(n) (0x70 + ((n) * 4)) +#define AT91_MC_SMC_NWS GENMASK(6, 0) +#define AT91_MC_SMC_NWS_(x) ((x) << 0) +#define AT91_MC_SMC_WSEN BIT(7) +#define AT91_MC_SMC_TDF GENMASK(11, 8) +#define AT91_MC_SMC_TDF_(x) ((x) << 8) +#define AT91_MC_SMC_TDF_MAX 0xf +#define AT91_MC_SMC_BAT BIT(12) +#define AT91_MC_SMC_DBW GENMASK(14, 13) +#define AT91_MC_SMC_DBW_16 (1 << 13) +#define AT91_MC_SMC_DBW_8 (2 << 13) +#define AT91_MC_SMC_DPR BIT(15) +#define AT91_MC_SMC_ACSS GENMASK(17, 16) +#define AT91_MC_SMC_ACSS_(x) ((x) << 16) +#define AT91_MC_SMC_ACSS_MAX 3 +#define AT91_MC_SMC_RWSETUP GENMASK(26, 24) +#define AT91_MC_SMC_RWSETUP_(x) ((x) << 24) +#define AT91_MC_SMC_RWHOLD GENMASK(30, 28) +#define AT91_MC_SMC_RWHOLD_(x) ((x) << 28) +#define AT91_MC_SMC_RWHOLDSETUP_MAX 7 + +/* SDRAM Controller registers */ +#define AT91_MC_SDRAMC_MR 0x90 +#define AT91_MC_SDRAMC_MODE GENMASK(3, 0) +#define AT91_MC_SDRAMC_MODE_NORMAL (0 << 0) +#define 
AT91_MC_SDRAMC_MODE_NOP (1 << 0) +#define AT91_MC_SDRAMC_MODE_PRECHARGE (2 << 0) +#define AT91_MC_SDRAMC_MODE_LMR (3 << 0) +#define AT91_MC_SDRAMC_MODE_REFRESH (4 << 0) +#define AT91_MC_SDRAMC_DBW_16 BIT(4) + +#define AT91_MC_SDRAMC_TR 0x94 +#define AT91_MC_SDRAMC_COUNT GENMASK(11, 0) + +#define AT91_MC_SDRAMC_CR 0x98 +#define AT91_MC_SDRAMC_NC GENMASK(1, 0) +#define AT91_MC_SDRAMC_NC_8 (0 << 0) +#define AT91_MC_SDRAMC_NC_9 (1 << 0) +#define AT91_MC_SDRAMC_NC_10 (2 << 0) +#define AT91_MC_SDRAMC_NC_11 (3 << 0) +#define AT91_MC_SDRAMC_NR GENMASK(3, 2) +#define AT91_MC_SDRAMC_NR_11 (0 << 2) +#define AT91_MC_SDRAMC_NR_12 (1 << 2) +#define AT91_MC_SDRAMC_NR_13 (2 << 2) +#define AT91_MC_SDRAMC_NB BIT(4) +#define AT91_MC_SDRAMC_NB_2 (0 << 4) +#define AT91_MC_SDRAMC_NB_4 (1 << 4) +#define AT91_MC_SDRAMC_CAS GENMASK(6, 5) +#define AT91_MC_SDRAMC_CAS_2 (2 << 5) +#define AT91_MC_SDRAMC_TWR GENMASK(10, 7) +#define AT91_MC_SDRAMC_TRC GENMASK(14, 11) +#define AT91_MC_SDRAMC_TRP GENMASK(18, 15) +#define AT91_MC_SDRAMC_TRCD GENMASK(22, 19) +#define AT91_MC_SDRAMC_TRAS GENMASK(26, 23) +#define AT91_MC_SDRAMC_TXSR GENMASK(30, 27) + +#define AT91_MC_SDRAMC_SRR 0x9c +#define AT91_MC_SDRAMC_SRCB BIT(0) + +#define AT91_MC_SDRAMC_LPR 0xa0 +#define AT91_MC_SDRAMC_LPCB BIT(0) + +#define AT91_MC_SDRAMC_IER 0xa4 +#define AT91_MC_SDRAMC_IDR 0xa8 +#define AT91_MC_SDRAMC_IMR 0xac +#define AT91_MC_SDRAMC_ISR 0xb0 +#define AT91_MC_SDRAMC_RES BIT(0) + +/* Burst Flash Controller register */ +#define AT91_MC_BFC_MR 0xc0 +#define AT91_MC_BFC_BFCOM GENMASK(1, 0) +#define AT91_MC_BFC_BFCOM_DISABLED (0 << 0) +#define AT91_MC_BFC_BFCOM_ASYNC (1 << 0) +#define AT91_MC_BFC_BFCOM_BURST (2 << 0) +#define AT91_MC_BFC_BFCC GENMASK(3, 2) +#define AT91_MC_BFC_BFCC_MCK (1 << 2) +#define AT91_MC_BFC_BFCC_DIV2 (2 << 2) +#define AT91_MC_BFC_BFCC_DIV4 (3 << 2) +#define AT91_MC_BFC_AVL GENMASK(7, 4) +#define AT91_MC_BFC_PAGES GENMASK(10, 8) +#define AT91_MC_BFC_PAGES_NO_PAGE (0 << 8) +#define AT91_MC_BFC_PAGES_16 (1 
<< 8) +#define AT91_MC_BFC_PAGES_32 (2 << 8) +#define AT91_MC_BFC_PAGES_64 (3 << 8) +#define AT91_MC_BFC_PAGES_128 (4 << 8) +#define AT91_MC_BFC_PAGES_256 (5 << 8) +#define AT91_MC_BFC_PAGES_512 (6 << 8) +#define AT91_MC_BFC_PAGES_1024 (7 << 8) +#define AT91_MC_BFC_OEL GENMASK(13, 12) +#define AT91_MC_BFC_BAAEN BIT(16) +#define AT91_MC_BFC_BFOEH BIT(17) +#define AT91_MC_BFC_MUXEN BIT(18) +#define AT91_MC_BFC_RDYEN BIT(19) + +#endif /* _LINUX_MFD_SYSCON_ATMEL_MC_H_ */ -- cgit v1.2.3 From be32417796c2b8a83fe4cbece83bea96ab9e378f Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Wed, 6 May 2015 12:26:22 +0800 Subject: block: export blkdev_reread_part() and __blkdev_reread_part() This patch exports blkdev_reread_part() for block drivers, also introduce __blkdev_reread_part(). For some drivers, such as loop, reread of partitions can be run from the release path, and bd_mutex may already be held prior to calling ioctl_by_bdev(bdev, BLKRRPART, 0), so introduce __blkdev_reread_part for use in such cases. 
CC: Christoph Hellwig CC: Jens Axboe CC: Tejun Heo CC: Alexander Viro CC: Markus Pargmann CC: Stefan Weinhuber CC: Stefan Haberland CC: Sebastian Ott CC: Fabian Frederick CC: Ming Lei CC: David Herrmann CC: Andrew Morton CC: Peter Zijlstra CC: nbd-general@lists.sourceforge.net CC: linux-s390@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Jarod Wilson Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/fs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 35ec87e490b1..1ef63900243c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2279,6 +2279,9 @@ extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder); extern void blkdev_put(struct block_device *bdev, fmode_t mode); +extern int __blkdev_reread_part(struct block_device *bdev); +extern int blkdev_reread_part(struct block_device *bdev); + #ifdef CONFIG_SYSFS extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); extern void bd_unlink_disk_holder(struct block_device *bdev, -- cgit v1.2.3 From 6e844b035360294636271f4f0fd4a5a685e03c06 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 17 Apr 2015 10:45:55 -0700 Subject: ARM: BCM63xx: Add Broadcom BCM63xx PMB controller helpers This patch adds both common register definitions and helper functions used to issue read/write commands to the Broadcom BCM63xx PMB controller which is used to power on and release from reset internal on-chip peripherals such as the integrated Ethernet switch, AHCI, USB, as well as the secondary CPU core. This is going to be utilized by the BCM63138 SMP code, as well as by the BCM63138 reset controller later. 
Signed-off-by: Florian Fainelli --- include/linux/reset/bcm63xx_pmb.h | 88 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 include/linux/reset/bcm63xx_pmb.h (limited to 'include/linux') diff --git a/include/linux/reset/bcm63xx_pmb.h b/include/linux/reset/bcm63xx_pmb.h new file mode 100644 index 000000000000..bb4af7b5eb36 --- /dev/null +++ b/include/linux/reset/bcm63xx_pmb.h @@ -0,0 +1,88 @@ +/* + * Broadcom BCM63xx Processor Monitor Bus shared routines (SMP and reset) + * + * Copyright (C) 2015, Broadcom Corporation + * Author: Florian Fainelli + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation version 2. + * + * This program is distributed "as is" WITHOUT ANY WARRANTY of any + * kind, whether express or implied; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#ifndef __BCM63XX_PMB_H +#define __BCM63XX_PMB_H + +#include +#include +#include +#include + +/* PMB Master controller register */ +#define PMB_CTRL 0x00 +#define PMC_PMBM_START (1 << 31) +#define PMC_PMBM_TIMEOUT (1 << 30) +#define PMC_PMBM_SLAVE_ERR (1 << 29) +#define PMC_PMBM_BUSY (1 << 28) +#define PMC_PMBM_READ (0 << 20) +#define PMC_PMBM_WRITE (1 << 20) +#define PMB_WR_DATA 0x04 +#define PMB_TIMEOUT 0x08 +#define PMB_RD_DATA 0x0C + +#define PMB_BUS_ID_SHIFT 8 + +/* Perform the low-level PMB master operation, shared between reads and + * writes. 
+ */ +static inline int __bpcm_do_op(void __iomem *master, unsigned int addr, + u32 off, u32 op) +{ + unsigned int timeout = 1000; + u32 cmd; + + cmd = (PMC_PMBM_START | op | (addr & 0xff) << 12 | off); + writel(cmd, master + PMB_CTRL); + do { + cmd = readl(master + PMB_CTRL); + if (!(cmd & PMC_PMBM_START)) + return 0; + + if (cmd & PMC_PMBM_SLAVE_ERR) + return -EIO; + + if (cmd & PMC_PMBM_TIMEOUT) + return -ETIMEDOUT; + + udelay(1); + } while (timeout-- > 0); + + return -ETIMEDOUT; +} + +static inline int bpcm_rd(void __iomem *master, unsigned int addr, + u32 off, u32 *val) +{ + int ret = 0; + + ret = __bpcm_do_op(master, addr, off >> 2, PMC_PMBM_READ); + *val = readl(master + PMB_RD_DATA); + + return ret; +} + +static inline int bpcm_wr(void __iomem *master, unsigned int addr, + u32 off, u32 val) +{ + int ret = 0; + + writel(val, master + PMB_WR_DATA); + ret = __bpcm_do_op(master, addr, off >> 2, PMC_PMBM_WRITE); + + return ret; +} + +#endif /* __BCM63XX_PMB_H */ -- cgit v1.2.3 From 7f1a57fdd6cb6e7be2ed31878a34655df38e1861 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 19 May 2015 16:13:02 +0900 Subject: power_supply: Fix possible NULL pointer dereference on early uevent Don't call the power_supply_changed() from power_supply_register() when parent is still probing because it may lead to accessing parent too early. In bq27x00_battery this caused NULL pointer exception because uevent of power_supply_changed called back the get_property() method provided by the driver. The get_property() method accessed pointer which should be returned by power_supply_register(). Starting from bq27x00_battery_probe(): di->bat = power_supply_register() power_supply_changed() kobject_uevent() power_supply_uevent() power_supply_show_property() power_supply_get_property() bq27x00_battery_get_property() dereference of di->bat which is NULL here The dereference of di->bat (value returned by power_supply_register()) is the currently visible problem. 
However calling back the methods provided by driver before ending the probe may lead to accessing other driver-related data which is not yet initialized. The call to power_supply_changed() is postponed till probing ends - mutex of parent device is released. Reported-by: H. Nikolaus Schaller Signed-off-by: Krzysztof Kozlowski Fixes: 297d716f6260 ("power_supply: Change ownership from driver to core") Tested-By: Dr. H. Nikolaus Schaller Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 75a1dd8dc56e..a80f1fd01ddb 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -237,6 +237,7 @@ struct power_supply { /* private */ struct device dev; struct work_struct changed_work; + struct delayed_work deferred_register_work; spinlock_t changed_lock; bool changed; atomic_t use_cnt; -- cgit v1.2.3 From 04fd61ab36ec065e194ab5e74ae34a5240d992bb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 19 May 2015 16:59:03 -0700 Subject: bpf: allow bpf programs to tail-call other bpf programs introduce bpf_tail_call(ctx, &jmp_table, index) helper function which can be used from BPF programs like: int bpf_prog(struct pt_regs *ctx) { ... bpf_tail_call(ctx, &jmp_table, index); ... } that is roughly equivalent to: int bpf_prog(struct pt_regs *ctx) { ... if (jmp_table[index]) return (*jmp_table[index])(ctx); ... } The important detail that it's not a normal call, but a tail call. The kernel stack is precious, so this helper reuses the current stack frame and jumps into another BPF program without adding extra call frame. It's trivially done in interpreter and a bit trickier in JITs. In case of x64 JIT the bigger part of generated assembler prologue is common for all programs, so it is simply skipped while jumping. 
Other JITs can do similar prologue-skipping optimization or do stack unwind before jumping into the next program. bpf_tail_call() arguments: ctx - context pointer jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table index - index in the jump table Since all BPF programs are identified by file descriptor, user space needs to populate the jmp_table with FDs of other BPF programs. If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere and program execution continues as normal. New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can populate this jmp_table array with FDs of other bpf programs. Programs can share the same jmp_table array or use multiple jmp_tables. The chain of tail calls can form unpredictable dynamic loops therefore tail_call_cnt is used to limit the number of calls and currently is set to 32. Use cases: Acked-by: Daniel Borkmann ========== - simplify complex programs by splitting them into a sequence of small programs - dispatch routine For tracing and future seccomp the program may be triggered on all system calls, but processing of syscall arguments will be different. It's more efficient to implement them as: int syscall_entry(struct seccomp_data *ctx) { bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */); ... default: process unknown syscall ... } int sys_write_event(struct seccomp_data *ctx) {...} int sys_read_event(struct seccomp_data *ctx) {...} syscall_jmp_table[__NR_write] = sys_write_event; syscall_jmp_table[__NR_read] = sys_read_event; For networking the program may call into different parsers depending on packet format, like: int packet_parser(struct __sk_buff *skb) { ... parse L2, L3 here ... __u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol)); bpf_tail_call(skb, &ipproto_jmp_table, ipproto); ... default: process unknown protocol ... 
} int parse_tcp(struct __sk_buff *skb) {...} int parse_udp(struct __sk_buff *skb) {...} ipproto_jmp_table[IPPROTO_TCP] = parse_tcp; ipproto_jmp_table[IPPROTO_UDP] = parse_udp; - for TC use case, bpf_tail_call() allows to implement reclassify-like logic - bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table are atomic, so user space can build chains of BPF programs on the fly Implementation details: ======================= - high performance of bpf_tail_call() is the goal. It could have been implemented without JIT changes as a wrapper on top of BPF_PROG_RUN() macro, but with two downsides: . all programs would have to pay performance penalty for this feature and tail call itself would be slower, since mandatory stack unwind, return, stack allocate would be done for every tailcall. . tailcall would be limited to programs running preempt_disabled, since generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would need to be either global per_cpu variable accessed by helper and by wrapper or global variable protected by locks. In this implementation x64 JIT bypasses stack unwind and jumps into the callee program after prologue. - bpf_prog_array_compatible() ensures that prog_type of callee and caller are the same and JITed/non-JITed flag is the same, since calling JITed program from non-JITed is invalid, since stack frames are different. Similarly calling kprobe type program from socket type program is invalid. - jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map' abstraction, its user space API and all of verifier logic. It's in the existing arraymap.c file, since several functions are shared with regular array map. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 22 ++++++++++++++++++++++ include/linux/filter.h | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d5cda067115a..8821b9a8689e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -126,6 +126,27 @@ struct bpf_prog_aux { struct work_struct work; }; +struct bpf_array { + struct bpf_map map; + u32 elem_size; + /* 'ownership' of prog_array is claimed by the first program that + * is going to use this map or by the first program which FD is stored + * in the map to make sure that all callers and callees have the same + * prog_type and JITed flag + */ + enum bpf_prog_type owner_prog_type; + bool owner_jited; + union { + char value[0] __aligned(8); + struct bpf_prog *prog[0] __aligned(8); + }; +}; +#define MAX_TAIL_CALL_CNT 32 + +u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); +void bpf_prog_array_map_clear(struct bpf_map *map); +bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); + #ifdef CONFIG_BPF_SYSCALL void bpf_register_prog_type(struct bpf_prog_type_list *tl); void bpf_register_map_type(struct bpf_map_type_list *tl); @@ -160,5 +181,6 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; +extern const struct bpf_func_proto bpf_tail_call_proto; #endif /* _LINUX_BPF_H */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 200be4a74a33..17724f6ea983 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -378,7 +378,7 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) int sk_filter(struct sock *sk, struct sk_buff *skb); -void bpf_prog_select_runtime(struct bpf_prog *fp); +int bpf_prog_select_runtime(struct bpf_prog *fp); void bpf_prog_free(struct bpf_prog *fp); struct bpf_prog *bpf_prog_alloc(unsigned int size, 
gfp_t gfp_extra_flags); -- cgit v1.2.3 From 37b1ef31a568fc02e53587620226e5f3c66454c8 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 20 May 2015 14:41:19 +0800 Subject: workqueue: move flush_scheduled_work() to workqueue.h flush_scheduled_work() is just a simple call to flush_work(). Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 4618dd672d1b..738b30b39b68 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -435,7 +435,6 @@ extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); -extern void flush_scheduled_work(void); extern int schedule_on_each_cpu(work_func_t func); @@ -531,6 +530,35 @@ static inline bool schedule_work(struct work_struct *work) return queue_work(system_wq, work); } +/** + * flush_scheduled_work - ensure that any scheduled work has run to completion. + * + * Forces execution of the kernel-global workqueue and blocks until its + * completion. + * + * Think twice before calling this function! It's very easy to get into + * trouble if you don't take great care. Either of the following situations + * will lead to deadlock: + * + * One of the work items currently on the workqueue needs to acquire + * a lock held by your code or its caller. + * + * Your code is running in the context of a work routine. + * + * They will be detected by lockdep when they occur, but the first might not + * occur very often. It depends on what work items are on the workqueue and + * what locks they need, which you have no control over. + * + * In most situations flushing the entire workqueue is overkill; you merely + * need to know that a particular work item isn't queued and isn't running. 
+ * In such cases you should use cancel_delayed_work_sync() or + * cancel_work_sync() instead. + */ +static inline void flush_scheduled_work(void) +{ + flush_workqueue(system_wq); +} + /** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use -- cgit v1.2.3 From 2efd055c53c06b7e89c167c98069bab9afce7e59 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 20 May 2015 16:35:41 -0700 Subject: tcp: add tcpi_segs_in and tcpi_segs_out to tcp_info This patch tracks the total number of inbound and outbound segments on a TCP socket. One may use this number to have an idea on connection quality when compared against the retransmissions. RFC4898 named these : tcpEStatsPerfSegsIn and tcpEStatsPerfSegsOut These are a 32bit field each and can be fetched both from TCP_INFO getsockopt() if one has a handle on a TCP socket, or from inet_diag netlink facility (iproute2/ss patch will follow) Note that tp->segs_out was placed near tp->snd_nxt for good data locality and minimal performance impact, while tp->segs_in was placed near tp->bytes_received for the same reason. Join work with Eric Dumazet. Note that received SYN are accounted on the listener, but sent SYNACK are not accounted. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index e6fb5df22db1..f0212026c77f 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -149,11 +149,16 @@ struct tcp_sock { * sum(delta(rcv_nxt)), or how many bytes * were acked. */ + u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn + * total number of segments in. 
+ */ u32 rcv_nxt; /* What we want to receive next */ u32 copied_seq; /* Head of yet unread data */ u32 rcv_wup; /* rcv_nxt on last window update sent */ u32 snd_nxt; /* Next sequence we send */ - + u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut + * The total number of segments sent. + */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked * sum(delta(snd_una)), or how many bytes * were acked. -- cgit v1.2.3 From 2d0f230fe0649758394466cb69b553c0e8184df9 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 21 May 2015 15:11:02 +0800 Subject: crypto: aead - Rename aead_alg to old_aead_alg This patch is the first step in the introduction of a new AEAD alg type. Unlike normal conversions this patch only renames the existing aead_alg structure because there are external references to it. Those references will be removed after this patch. Signed-off-by: Herbert Xu --- include/linux/crypto.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 59ca4086ce6a..7d290a91c6f9 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -268,7 +268,7 @@ struct ablkcipher_alg { }; /** - * struct aead_alg - AEAD cipher definition + * struct old_aead_alg - AEAD cipher definition * @maxauthsize: Set the maximum authentication tag size supported by the * transformation. A transformation may support smaller tag sizes. * As the authentication tag is a message digest to ensure the @@ -293,7 +293,7 @@ struct ablkcipher_alg { * All fields except @givencrypt , @givdecrypt , @geniv and @ivsize are * mandatory and must be filled. 
*/ -struct aead_alg { +struct old_aead_alg { int (*setkey)(struct crypto_aead *tfm, const u8 *key, unsigned int keylen); int (*setauthsize)(struct crypto_aead *tfm, unsigned int authsize); @@ -501,7 +501,7 @@ struct crypto_alg { union { struct ablkcipher_alg ablkcipher; - struct aead_alg aead; + struct old_aead_alg aead; struct blkcipher_alg blkcipher; struct cipher_alg cipher; struct compress_alg compress; -- cgit v1.2.3 From 2a9de9c0f08d61fbe3764a21d22d0b72df97d6ae Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Fri, 24 Apr 2015 19:16:05 +0900 Subject: extcon: Use the unique id for external connector instead of string This patch uses the unique id to identify the type of external connector instead of string name. The string name have the many potential issues. So, this patch defines the 'extcon' enumeration which includes all supported external connector on EXTCON subsystem. If new external connector is necessary, the unique id of new connector have to be added in 'extcon' enumeration. There are current supported external connector in 'enum extcon' as following: enum extcon { EXTCON_NONE = 0x0, /* USB external connector */ EXTCON_USB = 0x1, EXTCON_USB_HOST = 0x2, /* Charger external connector */ EXTCON_TA = 0x10, EXTCON_FAST_CHARGER = 0x11, EXTCON_SLOW_CHARGER = 0x12, EXTCON_CHARGE_DOWNSTREAM = 0x13, /* Audio and video external connector */ EXTCON_LINE_IN = 0x20, EXTCON_LINE_OUT = 0x21, EXTCON_MICROPHONE = 0x22, EXTCON_HEADPHONE = 0x23, EXTCON_HDMI = 0x30, EXTCON_MHL = 0x31, EXTCON_DVI = 0x32, EXTCON_VGA = 0x33, EXTCON_SPDIF_IN = 0x34, EXTCON_SPDIF_OUT = 0x35, EXTCON_VIDEO_IN = 0x36, EXTCON_VIDEO_OUT = 0x37, /* Miscellaneous external connector */ EXTCON_DOCK = 0x50, EXTCON_JIG = 0x51, EXTCON_MECHANICAL = 0x52, EXTCON_END, }; For example in extcon-arizona.c: To use unique id removes the potential issue about handling the inconsistent name of external connector with string. 
- Previously, use the string to register the type of arizona jack connector static const char *arizona_cable[] = { "Mechanical", "Microphone", "Headphone", "Line-out", }; - Newly, use the unique id to register the type of arizona jack connector static const enum extcon arizona_cable[] = { EXTCON_MECHANICAL, EXTCON_MICROPHONE, EXTCON_HEADPHONE, EXTCON_LINE_OUT, EXTCON_NONE, }; And this patch modifies the prototype of extcon_{get|set}_cable_state_() which uses the 'enum extcon id' instead of 'cable_index'. Because although one more extcon drivers support USB cable, each extcon driver might have a different 'cable_index' for USB cable. All extcon drivers can use the unique id number for same external connector with modified extcon_{get|set}_cable_state_(). - Previously, use 'cable_index' on these functions: extcon_get_cable_state_(struct extcon_dev*, int cable_index) extcon_set_cable_state_(struct extcon_dev*, int cable_index, bool state) - Newly, use 'enum extcon id' on these functions: extcon_get_cable_state_(struct extcon_dev*, enum extcon id) extcon_set_cable_state_(struct extcon_dev*, enum extcon id, bool state) Cc: Arnd Bergmann Cc: Felipe Balbi Signed-off-by: Chanwoo Choi Acked-by: Roger Quadros Acked-by: Charles Keepax Acked-by: Ramakrishna Pallala Reviewed-by: Krzysztof Kozlowski [arnd: Report the build break about drivers/usb/phy/phy-tahvo.c after using the unique id for external connector instead of string] Reported-by: Arnd Bergmann [dan.carpenter: Report the build warning of extcon_{set|get}_cable_state_()] Reported-by: Dan Carpenter --- include/linux/extcon.h | 111 ++++++++++++++------------------- include/linux/extcon/extcon-adc-jack.h | 5 +- 2 files changed, 50 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/include/linux/extcon.h b/include/linux/extcon.h index 799474d9dc48..85c882f0029e 100644 --- a/include/linux/extcon.h +++ b/include/linux/extcon.h @@ -1,6 +1,9 @@ /* * External connector (extcon) class driver * + * Copyright
(C) 2015 Samsung Electronics + * Author: Chanwoo Choi + * * Copyright (C) 2012 Samsung Electronics * Author: Donggeun Kim * Author: MyungJoo Ham @@ -27,50 +30,41 @@ #include #include -#define SUPPORTED_CABLE_MAX 32 -#define CABLE_NAME_MAX 30 - -/* - * The standard cable name is to help support general notifier - * and notifiee device drivers to share the common names. - * Please use standard cable names unless your notifier device has - * a very unique and abnormal cable or - * the cable type is supposed to be used with only one unique - * pair of notifier/notifiee devices. - * - * Please add any other "standard" cables used with extcon dev. - * - * You may add a dot and number to specify version or specification - * of the specific cable if it is required. (e.g., "Fast-charger.18" - * and "Fast-charger.10" for 1.8A and 1.0A chargers) - * However, the notifiee and notifier should be able to handle such - * string and if the notifiee can negotiate the protocol or identify, - * you don't need such convention. This convention is helpful when - * notifier can distinguish but notifiee cannot. 
- */ -enum extcon_cable_name { - EXTCON_USB = 0, - EXTCON_USB_HOST, - EXTCON_TA, /* Travel Adaptor */ - EXTCON_FAST_CHARGER, - EXTCON_SLOW_CHARGER, - EXTCON_CHARGE_DOWNSTREAM, /* Charging an external device */ - EXTCON_HDMI, - EXTCON_MHL, - EXTCON_DVI, - EXTCON_VGA, - EXTCON_DOCK, - EXTCON_LINE_IN, - EXTCON_LINE_OUT, - EXTCON_MIC_IN, - EXTCON_HEADPHONE_OUT, - EXTCON_SPDIF_IN, - EXTCON_SPDIF_OUT, - EXTCON_VIDEO_IN, - EXTCON_VIDEO_OUT, - EXTCON_MECHANICAL, +enum extcon { + EXTCON_NONE = 0x0, + + /* USB external connector */ + EXTCON_USB = 0x1, + EXTCON_USB_HOST = 0x2, + + /* Charger external connector */ + EXTCON_TA = 0x10, + EXTCON_FAST_CHARGER = 0x11, + EXTCON_SLOW_CHARGER = 0x12, + EXTCON_CHARGE_DOWNSTREAM = 0x13, + + /* Audio/Video external connector */ + EXTCON_LINE_IN = 0x20, + EXTCON_LINE_OUT = 0x21, + EXTCON_MICROPHONE = 0x22, + EXTCON_HEADPHONE = 0x23, + + EXTCON_HDMI = 0x30, + EXTCON_MHL = 0x31, + EXTCON_DVI = 0x32, + EXTCON_VGA = 0x33, + EXTCON_SPDIF_IN = 0x34, + EXTCON_SPDIF_OUT = 0x35, + EXTCON_VIDEO_IN = 0x36, + EXTCON_VIDEO_OUT = 0x37, + + /* Etc external connector */ + EXTCON_DOCK = 0x50, + EXTCON_JIG = 0x51, + EXTCON_MECHANICAL = 0x52, + + EXTCON_END, }; -extern const char extcon_cable_name[][CABLE_NAME_MAX + 1]; struct extcon_cable; @@ -78,7 +72,7 @@ struct extcon_cable; * struct extcon_dev - An extcon device represents one external connector. * @name: The name of this extcon device. Parent device name is * used if NULL. - * @supported_cable: Array of supported cable names ending with NULL. + * @supported_cable: Array of supported cable names ending with EXTCON_NONE. * If supported_cable is NULL, cable name related APIs * are disabled. 
* @mutually_exclusive: Array of mutually exclusive set of cables that cannot @@ -113,7 +107,7 @@ struct extcon_cable; struct extcon_dev { /* Optional user initializing data */ const char *name; - const char **supported_cable; + const enum extcon *supported_cable; const u32 *mutually_exclusive; /* Optional callbacks to override class functions */ @@ -194,10 +188,10 @@ extern struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name); /* * Following APIs control the memory of extcon device. */ -extern struct extcon_dev *extcon_dev_allocate(const char **cables); +extern struct extcon_dev *extcon_dev_allocate(const enum extcon *cable); extern void extcon_dev_free(struct extcon_dev *edev); extern struct extcon_dev *devm_extcon_dev_allocate(struct device *dev, - const char **cables); + const enum extcon *cable); extern void devm_extcon_dev_free(struct device *dev, struct extcon_dev *edev); /* @@ -216,13 +210,10 @@ extern int extcon_update_state(struct extcon_dev *edev, u32 mask, u32 state); /* * get/set_cable_state access each bit of the 32b encoded state value. - * They are used to access the status of each cable based on the cable_name - * or cable_index, which is retrieved by extcon_find_cable_index + * They are used to access the status of each cable based on the cable_name. 
*/ -extern int extcon_find_cable_index(struct extcon_dev *sdev, - const char *cable_name); -extern int extcon_get_cable_state_(struct extcon_dev *edev, int cable_index); -extern int extcon_set_cable_state_(struct extcon_dev *edev, int cable_index, +extern int extcon_get_cable_state_(struct extcon_dev *edev, enum extcon id); +extern int extcon_set_cable_state_(struct extcon_dev *edev, enum extcon id, bool cable_state); extern int extcon_get_cable_state(struct extcon_dev *edev, @@ -281,7 +272,7 @@ static inline int devm_extcon_dev_register(struct device *dev, static inline void devm_extcon_dev_unregister(struct device *dev, struct extcon_dev *edev) { } -static inline struct extcon_dev *extcon_dev_allocate(const char **cables) +static inline struct extcon_dev *extcon_dev_allocate(const enum extcon *cable) { return ERR_PTR(-ENOSYS); } @@ -289,7 +280,7 @@ static inline struct extcon_dev *extcon_dev_allocate(const char **cables) static inline void extcon_dev_free(struct extcon_dev *edev) { } static inline struct extcon_dev *devm_extcon_dev_allocate(struct device *dev, - const char **cables) + const enum extcon *cable) { return ERR_PTR(-ENOSYS); } @@ -312,20 +303,14 @@ static inline int extcon_update_state(struct extcon_dev *edev, u32 mask, return 0; } -static inline int extcon_find_cable_index(struct extcon_dev *edev, - const char *cable_name) -{ - return 0; -} - static inline int extcon_get_cable_state_(struct extcon_dev *edev, - int cable_index) + enum extcon id) { return 0; } static inline int extcon_set_cable_state_(struct extcon_dev *edev, - int cable_index, bool cable_state) + enum extcon id, bool cable_state) { return 0; } diff --git a/include/linux/extcon/extcon-adc-jack.h b/include/linux/extcon/extcon-adc-jack.h index 9ca958c4e94c..53c60806bcfb 100644 --- a/include/linux/extcon/extcon-adc-jack.h +++ b/include/linux/extcon/extcon-adc-jack.h @@ -44,7 +44,7 @@ struct adc_jack_cond { * @consumer_channel: Unique name to identify the channel on the consumer * side. 
This typically describes the channels used within * the consumer. E.g. 'battery_voltage' - * @cable_names: array of cable names ending with null. + * @cable_names: array of extcon id for supported cables. * @adc_contitions: array of struct adc_jack_cond conditions ending * with .state = 0 entry. This describes how to decode * adc values into extcon state. @@ -58,8 +58,7 @@ struct adc_jack_pdata { const char *name; const char *consumer_channel; - /* The last entry should be NULL */ - const char **cable_names; + const enum extcon *cable_names; /* The last entry's state should be 0 */ struct adc_jack_cond *adc_conditions; -- cgit v1.2.3 From 046050f6e623e442e9c71c525462ebd395dae526 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Tue, 19 May 2015 20:01:12 +0900 Subject: extcon: Update the prototype of extcon_register_notifier() with enum extcon Previously, extcon consumer driver used the extcon_register_interest() to register the notifier chain and then to receive the notifier event when external connector's state is changed. When registering the notifier chain for specific external connector with extcon_register_interest(), it used the the string name of external connector directly. There are potential problem because of unclear, non-standard and inconsequent cable name. Namely, it is not appropriate method to identify each external connector. So, this patch modify the prototype of extcon_register_notifier() by using the 'enum extcon' which are the unique id for each external connector instead of unclear string method. - Previously, the extcon consumer driver used the extcon_register_interest() with 'cable_name' to point out the specific external connector. Also. it used the un-needed structure (struct extcon_specific_cable_nb). 
: int extcon_register_interest(struct extcon_specific_cable_nb *obj, const char *extcon_name, const char *cable_name, struct notifier_block *nb) - Newly, the updated extcon_register_notifier() would definitely support the same feature to detect the changed state of external connector without any specific structure (struct extcon_specific_cable_nb). : int extcon_register_notifier(struct extcon_dev *edev, enum extcon id, struct notifier_block *nb) This patch supports both extcon_register_interest() and the new extcon_register_notifier(). But the extcon_{register|unregister}_interest() will be deprecated because extcon core would support the notifier event for extcon consumer driver with only updated extcon_register_notifier() and 'extcon_specific_cable_nb' will be removed if there are no extcon consumer driver with legacy extcon_{register|unregister}_interest(). Signed-off-by: Chanwoo Choi Reviewed-by: Krzysztof Kozlowski --- include/linux/extcon.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/extcon.h b/include/linux/extcon.h index 85c882f0029e..be9652b3a154 100644 --- a/include/linux/extcon.h +++ b/include/linux/extcon.h @@ -116,7 +116,7 @@ struct extcon_dev { /* Internal data. Please do not set. */ struct device dev; - struct raw_notifier_head nh; + struct raw_notifier_head *nh; struct list_head entry; int max_supported; spinlock_t lock; /* could be called by irq handler */ @@ -155,8 +155,6 @@ struct extcon_cable { /** * struct extcon_specific_cable_nb - An internal data for * extcon_register_interest(). - * @internal_nb: A notifier block bridging extcon notifier - * and cable notifier. * @user_nb: user provided notifier block for events from * a specific cable. * @cable_index: the target cable. @@ -164,7 +162,6 @@ struct extcon_cable { * @previous_value: the saved previous event value.
*/ struct extcon_specific_cable_nb { - struct notifier_block internal_nb; struct notifier_block *user_nb; int cable_index; struct extcon_dev *edev; @@ -240,10 +237,10 @@ extern int extcon_unregister_interest(struct extcon_specific_cable_nb *nb); * we do not recommend to use this for normal 'notifiee' device drivers who * want to be notified by a specific external port of the notifier. */ -extern int extcon_register_notifier(struct extcon_dev *edev, +extern int extcon_register_notifier(struct extcon_dev *edev, enum extcon id, + struct notifier_block *nb); +extern int extcon_unregister_notifier(struct extcon_dev *edev, enum extcon id, struct notifier_block *nb); -extern int extcon_unregister_notifier(struct extcon_dev *edev, - struct notifier_block *nb); /* * Following API get the extcon device from devicetree. @@ -333,13 +330,15 @@ static inline struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name) } static inline int extcon_register_notifier(struct extcon_dev *edev, - struct notifier_block *nb) + enum extcon id, + struct notifier_block *nb) { return 0; } static inline int extcon_unregister_notifier(struct extcon_dev *edev, - struct notifier_block *nb) + enum extcon id, + struct notifier_block *nb) { return 0; } -- cgit v1.2.3 From 668abc729fcb9d034eccadf63166d2c76cd645d1 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Thu, 21 May 2015 17:42:43 +0100 Subject: regmap: Introduce regmap_get_max_register This patch introduces regmap_get_max_register() function which would be used by the infrastructures like nvmem framework built on top of regmap. 
Signed-off-by: Srinivas Kandagatla Signed-off-by: Mark Brown --- include/linux/regmap.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 116655d92269..2d87deda79cd 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -433,6 +433,7 @@ int regmap_update_bits_check_async(struct regmap *map, unsigned int reg, unsigned int mask, unsigned int val, bool *change); int regmap_get_val_bytes(struct regmap *map); +int regmap_get_max_register(struct regmap *map); int regmap_async_complete(struct regmap *map); bool regmap_can_raw_write(struct regmap *map); @@ -676,6 +677,12 @@ static inline int regmap_get_val_bytes(struct regmap *map) return -EINVAL; } +static inline int regmap_get_max_register(struct regmap *map) +{ + WARN_ONCE(1, "regmap API is disabled"); + return -EINVAL; +} + static inline int regcache_sync(struct regmap *map) { WARN_ONCE(1, "regmap API is disabled"); -- cgit v1.2.3 From a2f776cbb8271d7149784207da0b0c51e8b1847c Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Thu, 21 May 2015 17:42:54 +0100 Subject: regmap: Introduce regmap_get_reg_stride This patch introduces regmap_get_reg_stride() function which would be used by the infrastructures like nvmem framework built on top of regmap. Mostly this function would be used for sanity checks on inputs within such infrastructure. 
Signed-off-by: Srinivas Kandagatla Signed-off-by: Mark Brown --- include/linux/regmap.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 2d87deda79cd..59c55ea0f0b5 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -434,6 +434,7 @@ int regmap_update_bits_check_async(struct regmap *map, unsigned int reg, bool *change); int regmap_get_val_bytes(struct regmap *map); int regmap_get_max_register(struct regmap *map); +int regmap_get_reg_stride(struct regmap *map); int regmap_async_complete(struct regmap *map); bool regmap_can_raw_write(struct regmap *map); @@ -683,6 +684,12 @@ static inline int regmap_get_max_register(struct regmap *map) return -EINVAL; } +static inline int regmap_get_reg_stride(struct regmap *map) +{ + WARN_ONCE(1, "regmap API is disabled"); + return -EINVAL; +} + static inline int regcache_sync(struct regmap *map) { WARN_ONCE(1, "regmap API is disabled"); -- cgit v1.2.3 From 69eb0980ab4ced06f7c2b4774575337ce32912fb Mon Sep 17 00:00:00 2001 From: Laxman Dewangan Date: Thu, 23 Apr 2015 16:10:24 +0530 Subject: regulator: max8973: add mechanism to enable/disable through GPIO MAX8973 supports the voltage output enable/disable through its EN pin. This EN pin can be connected through GPIO from host processor. Add support to provide GPIO number from platform/DT and if it is valid GPIO then enable external control default. Signed-off-by: Laxman Dewangan Signed-off-by: Mark Brown --- include/linux/regulator/max8973-regulator.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/max8973-regulator.h b/include/linux/regulator/max8973-regulator.h index f8acc052e353..f6a8a16a0d4d 100644 --- a/include/linux/regulator/max8973-regulator.h +++ b/include/linux/regulator/max8973-regulator.h @@ -58,6 +58,9 @@ * control signal from EN input pin. 
If it is false then * voltage output will be enabled/disabled through EN bit of * device register. + * @enable_gpio: Enable GPIO. If EN pin is controlled through GPIO from host + * then GPIO number can be provided. If no GPIO controlled then + * it should be -1. * @dvs_gpio: GPIO for dvs. It should be -1 if this is tied with fixed logic. * @dvs_def_state: Default state of dvs. 1 if it is high else 0. */ @@ -65,6 +68,7 @@ struct max8973_regulator_platform_data { struct regulator_init_data *reg_init_data; unsigned long control_flags; bool enable_ext_control; + int enable_gpio; int dvs_gpio; unsigned dvs_def_state:1; }; -- cgit v1.2.3 From f705f837c58ebe1ea69dfffff4dcc234e2fbc8dd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 22 May 2015 11:12:38 +0200 Subject: nvme: consolidate synchronous command submission helpers Note that we keep the unused timeout argument, but allow callers to pass 0 instead of a timeout if they want the default. This will allow adding a timeout to the pass through path later on. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 8dbd05e70f09..61488b2ae291 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -158,11 +158,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, unsigned long addr, unsigned length); void nvme_unmap_user_pages(struct nvme_dev *dev, int write, struct nvme_iod *iod); -int nvme_submit_io_cmd(struct nvme_dev *, struct nvme_ns *, - struct nvme_command *, u32 *); -int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns); -int nvme_submit_admin_cmd(struct nvme_dev *, struct nvme_command *, - u32 *result); +int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd); int nvme_identify(struct nvme_dev *, unsigned nsid, unsigned cns, dma_addr_t dma_addr); int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, -- cgit v1.2.3 From e75ec752d725b7b612c0b2db1bca50a9e53c0879 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 22 May 2015 11:12:39 +0200 Subject: nvme: store a struct device pointer in struct nvme_dev Most users want the generic device, so store that in struct nvme_dev instead of the pci_dev. This also happens to be a nice step towards making some code reusable for non-PCI transports. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 61488b2ae291..de0e49a716b8 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -74,7 +74,7 @@ struct nvme_dev { struct blk_mq_tag_set tagset; struct blk_mq_tag_set admin_tagset; u32 __iomem *dbs; - struct pci_dev *pci_dev; + struct device *dev; struct dma_pool *prp_page_pool; struct dma_pool *prp_small_pool; int instance; -- cgit v1.2.3 From d29ec8241c10eacf59c23b3828a88dbae06e7e3f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 22 May 2015 11:12:46 +0200 Subject: nvme: submit internal commands through the block layer Use block layer queues with an internal cmd_type to submit internally generated NVMe commands. This both simplifies the code a lot and allows for a better structure. For example now the LightNVM code can construct commands without knowing the details of the underlying I/O descriptors. Or a future NVMe over network target could inject commands, as well as could the SCSI translation and ioctl code be reused for such a beast.
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index de0e49a716b8..986bf8ad8e93 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -146,21 +146,15 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) return (sector >> (ns->lba_shift - 9)); } -/** - * nvme_free_iod - frees an nvme_iod - * @dev: The device that the I/O was submitted to - * @iod: The memory to free - */ -void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod); - -int nvme_setup_prps(struct nvme_dev *, struct nvme_iod *, int, gfp_t); -struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, - unsigned long addr, unsigned length); -void nvme_unmap_user_pages(struct nvme_dev *dev, int write, - struct nvme_iod *iod); -int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd); -int nvme_identify(struct nvme_dev *, unsigned nsid, unsigned cns, - dma_addr_t dma_addr); +int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buf, unsigned bufflen); +int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buffer, void __user *ubuffer, unsigned bufflen, + u32 *result, unsigned timeout); +int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id); +int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid, + struct nvme_id_ns **id); +int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log); int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, dma_addr_t dma_addr, u32 *result); int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, -- cgit v1.2.3 From 326e1dbb57368087a36607aaebe9795b8d5453e5 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 22 May 2015 09:14:03 -0400 Subject: block: remove management of bi_remaining 
when restoring original bi_end_io Commit c4cf5261 ("bio: skip atomic inc/dec of ->bi_remaining for non-chains") regressed all existing callers that followed this pattern: 1) saving a bio's original bi_end_io 2) wiring up an intermediate bi_end_io 3) restoring the original bi_end_io from intermediate bi_end_io 4) calling bio_endio() to execute the restored original bi_end_io The regression was due to BIO_CHAIN only ever getting set if bio_inc_remaining() is called. For the above pattern it isn't set until step 3 above (step 2 would've needed to establish BIO_CHAIN). As such the first bio_endio(), in step 2 above, never decremented __bi_remaining before calling the intermediate bi_end_io -- leaving __bi_remaining with the value 1 instead of 0. When bio_inc_remaining() occurred during step 3 it brought it to a value of 2. When the second bio_endio() was called, in step 4 above, it should've called the original bi_end_io but it didn't because there was an extra reference that wasn't dropped (due to atomic operations being optimized away since BIO_CHAIN wasn't set upfront). Fix this issue by removing the __bi_remaining management complexity for all callers that use the above pattern -- bio_chain() is the only interface that _needs_ to be concerned with __bi_remaining. For the above pattern callers just expect the bi_end_io they set to get called! Remove bio_endio_nodec() and also remove all bio_inc_remaining() calls that aren't associated with the bio_chain() interface. Also, the bio_inc_remaining() interface has been moved local to bio.c. 
Fixes: c4cf5261 ("bio: skip atomic inc/dec of ->bi_remaining for non-chains") Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- include/linux/bio.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 7486ea103f6e..f0291cf64cc5 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -427,7 +427,6 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask) } extern void bio_endio(struct bio *, int); -extern void bio_endio_nodec(struct bio *, int); struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); @@ -658,17 +657,6 @@ static inline struct bio *bio_list_get(struct bio_list *bl) return bio; } -/* - * Increment chain count for the bio. Make sure the CHAIN flag update - * is visible before the raised count. - */ -static inline void bio_inc_remaining(struct bio *bio) -{ - bio->bi_flags |= (1 << BIO_CHAIN); - smp_mb__before_atomic(); - atomic_inc(&bio->__bi_remaining); -} - /* * bio_set is used to allow other portions of the IO system to * allocate their own private memory pools for bio and iovec structures. -- cgit v1.2.3 From 5f1b670d0bef508a5554d92525f5f6d00d640b38 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 22 May 2015 09:14:04 -0400 Subject: block, dm: don't copy bios for request clones Currently dm-multipath has to clone the bios for every request sent to the lower devices, which wastes cpu cycles and ties down memory. This patch instead adds a new REQ_CLONE flag that instructs req_bio_endio to not complete bios attached to a request, which we set on clone requests similar to bios in a flush sequence. With this change I/O errors on a path failure only get propagated to dm-multipath, which can then either resubmit the I/O or complete the bios on the original request. 
I've done some basic testing of this on a Linux target with ALUA support, and it survives path failures during I/O nicely. Signed-off-by: Christoph Hellwig Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 ++ include/linux/blkdev.h | 6 +----- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3f4ded0b1a34..45a6be89957c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -192,6 +192,7 @@ enum rq_flag_bits { __REQ_HASHED, /* on IO scheduler merge hash */ __REQ_MQ_INFLIGHT, /* track inflight for MQ */ __REQ_NO_TIMEOUT, /* requests may never expire */ + __REQ_CLONE, /* cloned bios */ __REQ_NR_BITS, /* stops here */ }; @@ -246,5 +247,6 @@ enum rq_flag_bits { #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) #define REQ_NO_TIMEOUT (1ULL << __REQ_NO_TIMEOUT) +#define REQ_CLONE (1ULL << __REQ_CLONE) #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bc917956a6d0..9ded80da2c16 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -775,11 +775,7 @@ extern void blk_add_request_payload(struct request *rq, struct page *page, unsigned int len); extern int blk_rq_check_limits(struct request_queue *q, struct request *rq); extern int blk_lld_busy(struct request_queue *q); -extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, - struct bio_set *bs, gfp_t gfp_mask, - int (*bio_ctr)(struct bio *, struct bio *, void *), - void *data); -extern void blk_rq_unprep_clone(struct request *rq); +extern void blk_rq_prep_clone(struct request *rq, struct request *rq_src); extern int blk_insert_cloned_request(struct request_queue *q, struct request *rq); extern void blk_delay_queue(struct request_queue *, unsigned long); -- cgit v1.2.3 From d0751b98dfa391f862e02dc36a233a54615e3f1d Mon Sep 17 00:00:00 2001 From: 
Yijing Wang Date: Thu, 21 May 2015 15:05:02 +0800 Subject: PCI: Add dev->has_secondary_link to track downstream PCIe links A PCIe Port is an interface to a Link. A Root Port is a PCI-PCI bridge in a Root Complex and has a Link on its secondary (downstream) side. For other Ports, the Link may be on either the upstream (closer to the Root Complex) or downstream side of the Port. The usual topology has a Root Port connected to an Upstream Port. We previously assumed this was the only possible topology, and that a Downstream Port's Link was always on its downstream side, like this: +---------------------+ +------+ | Downstream | | Root | | Upstream Port +--Link-- | Port +--Link--+ Port | +------+ | Downstream | | Port +--Link-- +---------------------+ But systems do exist (see URL below) where the Root Port is connected to a Downstream Port. In this case, a Downstream Port's Link may be on either the upstream or downstream side: +---------------------+ +------+ | Upstream | | Root | | Downstream Port +--Link-- | Port +--Link--+ Port | +------+ | Downstream | | Port +--Link-- +---------------------+ We can't use the Port type to determine which side the Link is on, so add a bit in struct pci_dev to keep track. A Root Port's Link is always on the Port's secondary side. A component (Endpoint or Port) on the other end of the Link obviously has the Link on its upstream side. If that component is a Port, it is part of a Switch or a Bridge. A Bridge has a PCI or PCI-X bus on its secondary side, not a Link. The internal bus of a Switch connects the Port to another Port whose Link is on the downstream side. 
[bhelgaas: changelog, comment, cache "type", use if/else] Link: http://lkml.kernel.org/r/54EB81B2.4050904@pobox.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=94361 Suggested-by: Bjorn Helgaas Signed-off-by: Yijing Wang Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 353db8dc4c6e..1989f6dc9618 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -355,6 +355,7 @@ struct pci_dev { unsigned int broken_intx_masking:1; unsigned int io_window_1k:1; /* Intel P2P bridge 1K I/O windows */ unsigned int irq_managed:1; + unsigned int has_secondary_link:1; pci_dev_flags_t dev_flags; atomic_t enable_cnt; /* pci_enable_device has been called */ -- cgit v1.2.3 From 6374f9124efea5fae9cba263108583c39e22f86b Mon Sep 17 00:00:00 2001 From: Harald Geyer Date: Tue, 7 Apr 2015 11:12:35 +0000 Subject: timekeeping: Provide new API to get the current time resolution This patch series introduces a new function u32 ktime_get_resolution_ns(void) which allows to clean up some driver code. In particular the IIO subsystem has a function to provide timestamps for events but no means to get their resolution. So currently the dht11 driver tries to guess the resolution in a rather messy and convoluted way. We can do much better with the new code. This API is not designed to be exposed to user space. This has been tested on i386, sunxi and mxs. 
Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Harald Geyer [jstultz: Tweaked to make it build after upstream changes] Signed-off-by: John Stultz --- include/linux/timekeeping.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 99176af216af..9af5c1214682 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -163,6 +163,7 @@ extern ktime_t ktime_get(void); extern ktime_t ktime_get_with_offset(enum tk_offsets offs); extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs); extern ktime_t ktime_get_raw(void); +extern u32 ktime_get_resolution_ns(void); /** * ktime_get_real - get the real (wall-) time in ktime_t format -- cgit v1.2.3 From 57d05a93ada77c4f8a6112cbc867a2948dce7991 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 13 May 2015 16:04:47 -0700 Subject: time: Rework debugging variables so they aren't global Ingo suggested that the timekeeping debugging variables recently added should not be global, and should be tied to the timekeeper's read_base. Thus this patch implements that suggestion. This version is different from the earlier versions as it keeps the variables in the timekeeper structure rather than in the tkr. Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Signed-off-by: John Stultz --- include/linux/timekeeper_internal.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 6f8276ae579c..e1f5a1136554 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -61,6 +61,9 @@ struct tk_read_base { * shifted nano seconds. * @ntp_error_shift: Shift conversion between clock shifted nano seconds and * ntp shifted nano seconds. 
+ * @last_warning: Warning ratelimiter (DEBUG_TIMEKEEPING) + * @underflow_seen: Underflow warning flag (DEBUG_TIMEKEEPING) + * @overflow_seen: Overflow warning flag (DEBUG_TIMEKEEPING) * * Note: For timespec(64) based interfaces wall_to_monotonic is what * we need to add to xtime (or xtime corrected for sub jiffie times) @@ -106,6 +109,18 @@ struct timekeeper { s64 ntp_error; u32 ntp_error_shift; u32 ntp_err_mult; +#ifdef CONFIG_DEBUG_TIMEKEEPING + long last_warning; + /* + * These simple flag variables are managed + * without locks, which is racy, but they are + * ok since we don't really care about being + * super precise about how many events were + * seen, just that a problem was observed. + */ + int underflow_seen; + int overflow_seen; +#endif }; #ifdef CONFIG_GENERIC_TIME_VSYSCALL -- cgit v1.2.3 From 30f3b3f9836c7c7e1f42ca855bbe8127fff4b99a Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Thu, 9 Apr 2015 09:04:40 +0800 Subject: time: Include math64.h in time64.h On 32-bit systems, timespec64_add_ns() calls __iter_div_u64_rem() which needs math64.h, and we want to include time64.h in some cases. Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Xunlei Pang Signed-off-by: John Stultz --- include/linux/time64.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/time64.h b/include/linux/time64.h index a3831478d9cf..12d4e82b0276 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -2,6 +2,7 @@ #define _LINUX_TIME64_H #include <uapi/linux/time.h> +#include <linux/math64.h> typedef __s64 time64_t; -- cgit v1.2.3 From e83d0a4106d81dd08b70318f078f3bad6acdc110 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Thu, 9 Apr 2015 09:04:42 +0800 Subject: time: Remove read_boot_clock() Now that we have a read_boot_clock64() function available on every architecture, and converted all the users to it, it's time to remove the (now unused) read_boot_clock() completely from the kernel. 
Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Xunlei Pang [jstultz: Minor commit message tweak suggested by Ingo] Signed-off-by: John Stultz --- include/linux/timekeeping.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 9af5c1214682..3aa72e648650 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -267,7 +267,6 @@ extern int persistent_clock_is_local; extern void read_persistent_clock(struct timespec *ts); extern void read_persistent_clock64(struct timespec64 *ts); -extern void read_boot_clock(struct timespec *ts); extern void read_boot_clock64(struct timespec64 *ts); extern int update_persistent_clock(struct timespec now); extern int update_persistent_clock64(struct timespec64 now); -- cgit v1.2.3 From 25d238b2260973bfca0b82181824340c7aeae45a Mon Sep 17 00:00:00 2001 From: Rajeev Kumar Date: Fri, 22 May 2015 09:58:30 -0700 Subject: Input: update email-id of Rajeev Kumar rajeev-dlh.kumar@st.com email-id doesn't exist anymore as I have left the company. Signed-off-by: Rajeev Kumar Signed-off-by: Dmitry Torokhov --- include/linux/platform_data/keyboard-spear.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/keyboard-spear.h b/include/linux/platform_data/keyboard-spear.h index 9248e3a7e333..5e3ff653900c 100644 --- a/include/linux/platform_data/keyboard-spear.h +++ b/include/linux/platform_data/keyboard-spear.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2010 ST Microelectronics - * Rajeev Kumar + * Rajeev Kumar * * This file is licensed under the terms of the GNU General Public * License version 2. 
This program is licensed "as is" without any -- cgit v1.2.3 From 9d16f207112f77711600fb0770182a06e056e5de Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Mon, 18 May 2015 10:43:31 +0530 Subject: cpufreq: Track cpu managing sysfs kobjects separately In order to prepare for the next few commits, that will stop migrating sysfs files on cpu hotplug, this patch starts managing sysfs-cpu separately. The behavior is still the same as we are still migrating sysfs files on hotplug, later commits would change that. Signed-off-by: Saravana Kannan Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 48e37c07eb84..29ad97c34fd5 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -65,7 +65,9 @@ struct cpufreq_policy { unsigned int shared_type; /* ACPI: ANY or ALL affected CPUs should set cpufreq */ - unsigned int cpu; /* cpu nr of CPU managing this policy */ + unsigned int cpu; /* cpu managing this policy, must be online */ + unsigned int kobj_cpu; /* cpu managing sysfs files, can be offline */ + struct clk *clk; struct cpufreq_cpuinfo cpuinfo;/* see above */ -- cgit v1.2.3 From 0824965140fff1bf640a987dc790d1594a8e0699 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 13 Apr 2015 16:23:36 +0200 Subject: PCI: Propagate the "ignore hotplug" setting to parent Refine the mechanism introduced by commit f244d8b623da ("ACPIPHP / radeon / nouveau: Fix VGA switcheroo problem related to hotplug") to propagate the ignore_hotplug setting of the device to its parent bridge in case hotplug notifications related to the graphics adapter switching are given for the bridge rather than for the device itself (they need to be ignored in both cases). 
Link: https://bugzilla.kernel.org/show_bug.cgi?id=61891 Link: https://bugs.freedesktop.org/show_bug.cgi?id=88927 Fixes: b440bde74f04 ("PCI: Add pci_ignore_hotplug() to ignore hotplug events for a device") Reported-and-tested-by: tiagdtd-lava Signed-off-by: Rafael J. Wysocki Signed-off-by: Bjorn Helgaas CC: stable@vger.kernel.org # v3.17+ --- include/linux/pci.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 353db8dc4c6e..ef45ffe9ca88 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1006,6 +1006,7 @@ int __must_check pci_assign_resource(struct pci_dev *dev, int i); int __must_check pci_reassign_resource(struct pci_dev *dev, int i, resource_size_t add_size, resource_size_t align); int pci_select_bars(struct pci_dev *dev, unsigned long flags); bool pci_device_is_present(struct pci_dev *pdev); +void pci_ignore_hotplug(struct pci_dev *dev); /* ROM control related routines */ int pci_enable_rom(struct pci_dev *pdev); @@ -1043,11 +1044,6 @@ bool pci_dev_run_wake(struct pci_dev *dev); bool pci_check_pme_status(struct pci_dev *dev); void pci_pme_wakeup_bus(struct pci_bus *bus); -static inline void pci_ignore_hotplug(struct pci_dev *dev) -{ - dev->ignore_hotplug = 1; -} - static inline int pci_enable_wake(struct pci_dev *dev, pci_power_t state, bool enable) { -- cgit v1.2.3 From edd4ab0559316a1efe0881a4e2ccaeb4fec73142 Mon Sep 17 00:00:00 2001 From: Ramakrishna Pallala Date: Sun, 24 May 2015 09:11:58 +0530 Subject: power: max17042_battery: add HEALTH and TEMP_* properties support This patch adds the support for following battery properties to max17042 fuel gauge driver. 
POWER_SUPPLY_PROP_TEMP_ALERT_MIN POWER_SUPPLY_PROP_TEMP_ALERT_MAX POWER_SUPPLY_PROP_TEMP_MIN POWER_SUPPLY_PROP_TEMP_MAX POWER_SUPPLY_PROP_HEALTH Signed-off-by: Ramakrishna Pallala Reviewed-by: Krzysztof Kozlowski Signed-off-by: Sebastian Reichel --- include/linux/power/max17042_battery.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/power/max17042_battery.h b/include/linux/power/max17042_battery.h index cf112b4075c8..522757ac9cd4 100644 --- a/include/linux/power/max17042_battery.h +++ b/include/linux/power/max17042_battery.h @@ -215,6 +215,10 @@ struct max17042_platform_data { * the datasheet although it can be changed by board designers. */ unsigned int r_sns; + int vmin; /* in millivolts */ + int vmax; /* in millivolts */ + int temp_min; /* in tenths of degree Celsius */ + int temp_max; /* in tenths of degree Celsius */ }; #endif /* __MAX17042_BATTERY_H_ */ -- cgit v1.2.3 From 843735b788a4e49c453f4aefdae80e6dfbe9ee85 Mon Sep 17 00:00:00 2001 From: Ramakrishna Pallala Date: Mon, 4 May 2015 22:16:07 +0530 Subject: power: axp288_charger: axp288 charger driver This patch adds new power supply charger driver support for X-Power AXP288 PMIC integrated charger. This driver interfaces with the axp20x mfd driver as a cell and listens to extcon cable events for setting up charging. 
Signed-off-by: Ramakrishna Pallala Acked-by: Lee Jones Signed-off-by: Sebastian Reichel --- include/linux/mfd/axp20x.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h index dfabd6db7ddf..f9030df5acd1 100644 --- a/include/linux/mfd/axp20x.h +++ b/include/linux/mfd/axp20x.h @@ -275,4 +275,11 @@ struct axp20x_fg_pdata { int thermistor_curve[MAX_THERM_CURVE_SIZE][2]; }; +struct axp20x_chrg_pdata { + int max_cc; + int max_cv; + int def_cc; + int def_cv; +}; + #endif /* __LINUX_MFD_AXP20X_H */ -- cgit v1.2.3 From c93b76b34b4d8dbe8e3443eb27e49ac60034342b Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Thu, 7 May 2015 15:54:02 +0300 Subject: mei: bus: report also uuid in module alias In order to automate modules matching add device uuid which is reported in client enumeration, keep also the name that is needed in for nfc distinguishing radio vendor Report mei:name:uuid Cc: linux-api@vger.kernel.org Cc: Samuel Ortiz Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- include/linux/mod_devicetable.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 3bfd56778c29..2d2b2b571d61 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -599,9 +599,22 @@ struct ipack_device_id { #define MEI_CL_MODULE_PREFIX "mei:" #define MEI_CL_NAME_SIZE 32 +#define MEI_CL_UUID_FMT "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x" +#define MEI_CL_UUID_ARGS(_u) \ + _u[0], _u[1], _u[2], _u[3], _u[4], _u[5], _u[6], _u[7], \ + _u[8], _u[9], _u[10], _u[11], _u[12], _u[13], _u[14], _u[15] +/** + * struct mei_cl_device_id - MEI client device identifier + * @name: helper name + * @uuid: client uuid + * @driver_info: information used by the driver. 
+ * + * identifies mei client device by uuid and name + */ struct mei_cl_device_id { char name[MEI_CL_NAME_SIZE]; + __u8 uuid[16]; kernel_ulong_t driver_info; }; -- cgit v1.2.3 From dbac993f6a6df24d5edc362667e524ba43543472 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Thu, 7 May 2015 15:54:07 +0300 Subject: mei: export mei client device struct to external use Cc: Samuel Ortiz Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- include/linux/mei_cl_bus.h | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h index 0819d36a3a74..a16b1f9c1aca 100644 --- a/include/linux/mei_cl_bus.h +++ b/include/linux/mei_cl_bus.h @@ -7,6 +7,42 @@ struct mei_cl_device; +typedef void (*mei_cl_event_cb_t)(struct mei_cl_device *device, + u32 events, void *context); + +/** + * struct mei_cl_device - MEI device handle + * An mei_cl_device pointer is returned from mei_add_device() + * and links MEI bus clients to their actual ME host client pointer. + * Drivers for MEI devices will get an mei_cl_device pointer + * when being probed and shall use it for doing ME bus I/O. + * + * @dev: linux driver model device pointer + * @me_cl: me client + * @cl: mei client + * @name: device name + * @event_work: async work to execute event callback + * @event_cb: Drivers register this callback to get asynchronous ME + * events (e.g. Rx buffer pending) notifications. + * @event_context: event callback run context + * @events: Events bitmask sent to the driver. 
+ * @priv_data: client private data + */ +struct mei_cl_device { + struct device dev; + + struct mei_me_client *me_cl; + struct mei_cl *cl; + char name[MEI_CL_NAME_SIZE]; + + struct work_struct event_work; + mei_cl_event_cb_t event_cb; + void *event_context; + unsigned long events; + + void *priv_data; +}; + struct mei_cl_driver { struct device_driver driver; const char *name; @@ -28,8 +64,6 @@ void mei_cl_driver_unregister(struct mei_cl_driver *driver); ssize_t mei_cl_send(struct mei_cl_device *device, u8 *buf, size_t length); ssize_t mei_cl_recv(struct mei_cl_device *device, u8 *buf, size_t length); -typedef void (*mei_cl_event_cb_t)(struct mei_cl_device *device, - u32 events, void *context); int mei_cl_register_event_cb(struct mei_cl_device *device, mei_cl_event_cb_t read_cb, void *context); -- cgit v1.2.3 From 7df20f2d893db42eaa1ea1e30a2573c971ec9238 Mon Sep 17 00:00:00 2001 From: Sudeep Dutt Date: Wed, 29 Apr 2015 05:32:28 -0700 Subject: misc: mic: SCIF header file and IOCTL interface This patch introduces the SCIF documentation in the header file and describes the IOCTL interface for user mode. mic_overview.txt is updated with documentation on SCIF and a new document describing SCIF in more details is available in scif_overview.txt. Reviewed-by: Nikhil Rao Reviewed-by: Ashutosh Dixit Signed-off-by: Sudeep Dutt Signed-off-by: Greg Kroah-Hartman --- include/linux/scif.h | 993 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 993 insertions(+) create mode 100644 include/linux/scif.h (limited to 'include/linux') diff --git a/include/linux/scif.h b/include/linux/scif.h new file mode 100644 index 000000000000..44f4f3898bbe --- /dev/null +++ b/include/linux/scif.h @@ -0,0 +1,993 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Intel SCIF driver. + * + */ +#ifndef __SCIF_H__ +#define __SCIF_H__ + +#include <linux/types.h> +#include <linux/poll.h> +#include <linux/scif_ioctl.h> + +#define SCIF_ACCEPT_SYNC 1 +#define SCIF_SEND_BLOCK 1 +#define SCIF_RECV_BLOCK 1 + +enum { + SCIF_PROT_READ = (1 << 0), + SCIF_PROT_WRITE = (1 << 1) +}; + +enum { + SCIF_MAP_FIXED = 0x10, + SCIF_MAP_KERNEL = 0x20, +}; + +enum { + SCIF_FENCE_INIT_SELF = (1 << 0), + SCIF_FENCE_INIT_PEER = (1 << 1), + SCIF_SIGNAL_LOCAL = (1 << 4), + SCIF_SIGNAL_REMOTE = (1 << 5) +}; + +enum { + SCIF_RMA_USECPU = (1 << 0), + SCIF_RMA_USECACHE = (1 << 1), + SCIF_RMA_SYNC = (1 << 2), + SCIF_RMA_ORDERED = (1 << 3) +}; + +/* End of SCIF Admin Reserved Ports */ +#define SCIF_ADMIN_PORT_END 1024 + +/* End of SCIF Reserved Ports */ +#define SCIF_PORT_RSVD 1088 + +typedef struct scif_endpt *scif_epd_t; + +#define SCIF_OPEN_FAILED ((scif_epd_t)-1) +#define SCIF_REGISTER_FAILED ((off_t)-1) +#define SCIF_MMAP_FAILED ((void *)-1) + +/** + * scif_open() - Create an endpoint + * + * Return: + * Upon successful completion, scif_open() returns an endpoint descriptor to + * be used in subsequent SCIF function calls to refer to that endpoint; + * otherwise in user mode SCIF_OPEN_FAILED (that is ((scif_epd_t)-1)) is + * returned and errno is set to indicate the error; in kernel mode a NULL + * scif_epd_t is returned. 
+ * + * Errors: + * ENOMEM - Insufficient kernel memory was available + */ +scif_epd_t scif_open(void); + +/** + * scif_bind() - Bind an endpoint to a port + * @epd: endpoint descriptor + * @pn: port number + * + * scif_bind() binds endpoint epd to port pn, where pn is a port number on the + * local node. If pn is zero, a port number greater than or equal to + * SCIF_PORT_RSVD is assigned and returned. Each endpoint may be bound to + * exactly one local port. Ports less than 1024 when requested can only be bound + * by system (or root) processes or by processes executed by privileged users. + * + * Return: + * Upon successful completion, scif_bind() returns the port number to which epd + * is bound; otherwise in user mode -1 is returned and errno is set to + * indicate the error; in kernel mode the negative of one of the following + * errors is returned. + * + * Errors: + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * EINVAL - the endpoint or the port is already bound + * EISCONN - The endpoint is already connected + * ENOSPC - No port number available for assignment + * EACCES - The port requested is protected and the user is not the superuser + */ +int scif_bind(scif_epd_t epd, u16 pn); + +/** + * scif_listen() - Listen for connections on an endpoint + * @epd: endpoint descriptor + * @backlog: maximum pending connection requests + * + * scif_listen() marks the endpoint epd as a listening endpoint - that is, as + * an endpoint that will be used to accept incoming connection requests. Once + * so marked, the endpoint is said to be in the listening state and may not be + * used as the endpoint of a connection. + * + * The endpoint, epd, must have been bound to a port. + * + * The backlog argument defines the maximum length to which the queue of + * pending connections for epd may grow. If a connection request arrives when + * the queue is full, the client may receive an error with an indication that + * the connection was refused. 
+ * + * Return: + * Upon successful completion, scif_listen() returns 0; otherwise in user mode + * -1 is returned and errno is set to indicate the error; in kernel mode the + * negative of one of the following errors is returned. + * + * Errors: + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * EINVAL - the endpoint is not bound to a port + * EISCONN - The endpoint is already connected or listening + */ +int scif_listen(scif_epd_t epd, int backlog); + +/** + * scif_connect() - Initiate a connection on a port + * @epd: endpoint descriptor + * @dst: global id of port to which to connect + * + * The scif_connect() function requests the connection of endpoint epd to remote + * port dst. If the connection is successful, a peer endpoint, bound to dst, is + * created on node dst.node. On successful return, the connection is complete. + * + * If the endpoint epd has not already been bound to a port, scif_connect() + * will bind it to an unused local port. + * + * A connection is terminated when an endpoint of the connection is closed, + * either explicitly by scif_close(), or when a process that owns one of the + * endpoints of the connection is terminated. + * + * In user space, scif_connect() supports an asynchronous connection mode + * if the application has set the O_NONBLOCK flag on the endpoint via the + * fcntl() system call. Setting this flag will result in the calling process + * not to wait during scif_connect(). + * + * Return: + * Upon successful completion, scif_connect() returns the port ID to which the + * endpoint, epd, is bound; otherwise in user mode -1 is returned and errno is + * set to indicate the error; in kernel mode the negative of one of the + * following errors is returned. 
+ * + * Errors: + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNREFUSED - The destination was not listening for connections or refused + * the connection request + * EINVAL - dst.port is not a valid port ID + * EISCONN - The endpoint is already connected + * ENOMEM - No buffer space is available + * ENODEV - The destination node does not exist, or the node is lost or existed, + * but is not currently in the network since it may have crashed + * ENOSPC - No port number available for assignment + * EOPNOTSUPP - The endpoint is listening and cannot be connected + */ +int scif_connect(scif_epd_t epd, struct scif_port_id *dst); + +/** + * scif_accept() - Accept a connection on an endpoint + * @epd: endpoint descriptor + * @peer: global id of port to which connected + * @newepd: new connected endpoint descriptor + * @flags: flags + * + * The scif_accept() call extracts the first connection request from the queue + * of pending connections for the port on which epd is listening. scif_accept() + * creates a new endpoint, bound to the same port as epd, and allocates a new + * SCIF endpoint descriptor, returned in newepd, for the endpoint. The new + * endpoint is connected to the endpoint through which the connection was + * requested. epd is unaffected by this call, and remains in the listening + * state. + * + * On successful return, peer holds the global port identifier (node id and + * local port number) of the port which requested the connection. + * + * A connection is terminated when an endpoint of the connection is closed, + * either explicitly by scif_close(), or when a process that owns one of the + * endpoints of the connection is terminated. + * + * The number of connections that can (subsequently) be accepted on epd is only + * limited by system resources (memory). + * + * The flags argument is formed by OR'ing together zero or more of the + * following values. + * SCIF_ACCEPT_SYNC - block until a connection request is presented. 
If + * SCIF_ACCEPT_SYNC is not in flags, and no pending + * connections are present on the queue, scif_accept() + * fails with an EAGAIN error + * + * In user mode, the select() and poll() functions can be used to determine + * when there is a connection request. In kernel mode, the scif_poll() + * function may be used for this purpose. A readable event will be delivered + * when a connection is requested. + * + * Return: + * Upon successful completion, scif_accept() returns 0; otherwise in user mode + * -1 is returned and errno is set to indicate the error; in kernel mode the + * negative of one of the following errors is returned. + * + * Errors: + * EAGAIN - SCIF_ACCEPT_SYNC is not set and no connections are present to be + * accepted or SCIF_ACCEPT_SYNC is not set and remote node failed to complete + * its connection request + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * EINTR - Interrupted function + * EINVAL - epd is not a listening endpoint, or flags is invalid, or peer is + * NULL, or newepd is NULL + * ENODEV - The requesting node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOMEM - Not enough space + * ENOENT - Secondary part of epd registration failed + */ +int scif_accept(scif_epd_t epd, struct scif_port_id *peer, scif_epd_t + *newepd, int flags); + +/** + * scif_close() - Close an endpoint + * @epd: endpoint descriptor + * + * scif_close() closes an endpoint and performs necessary teardown of + * facilities associated with that endpoint. + * + * If epd is a listening endpoint then it will no longer accept connection + * requests on the port to which it is bound. Any pending connection requests + * are rejected. + * + * If epd is a connected endpoint, then its peer endpoint is also closed. RMAs + * which are in-process through epd or its peer endpoint will complete before + * scif_close() returns. 
Registered windows of the local and peer endpoints are + * released as if scif_unregister() was called against each window. + * + * Closing a SCIF endpoint does not affect local registered memory mapped by + * a SCIF endpoint on a remote node. The local memory remains mapped by the peer + * SCIF endpoint until explicitly removed by the peer calling munmap(..). + * + * If the peer endpoint's receive queue is not empty at the time that epd is + * closed, then the peer endpoint can be passed as the endpoint parameter to + * scif_recv() until the receive queue is empty. + * + * epd is freed and may no longer be accessed. + * + * Return: + * Upon successful completion, scif_close() returns 0; otherwise in user mode + * -1 is returned and errno is set to indicate the error; in kernel mode the + * negative of one of the following errors is returned. + * + * Errors: + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + */ +int scif_close(scif_epd_t epd); + +/** + * scif_send() - Send a message + * @epd: endpoint descriptor + * @msg: message buffer address + * @len: message length + * @flags: blocking mode flags + * + * scif_send() sends data to the peer of endpoint epd. Up to len bytes of data + * are copied from memory starting at address msg. On successful execution the + * return value of scif_send() is the number of bytes that were sent, and is + * zero if no bytes were sent because len was zero. scif_send() may be called + * only when the endpoint is in a connected state. + * + * If a scif_send() call is non-blocking, then it sends only those bytes which + * can be sent without waiting, up to a maximum of len bytes. + * + * If a scif_send() call is blocking, then it normally returns after sending + * all len bytes. If a blocking call is interrupted or the connection is + * reset, the call is considered successful if some bytes were sent or len is + * zero, otherwise the call is considered unsuccessful.
+ * + * In user mode, the select() and poll() functions can be used to determine + * when the send queue is not full. In kernel mode, the scif_poll() function + * may be used for this purpose. + * + * It is recommended that scif_send()/scif_recv() only be used for short + * control-type message communication between SCIF endpoints. The SCIF RMA + * APIs are expected to provide better performance for transfer sizes of + * 1024 bytes or longer for the current MIC hardware and software + * implementation. + * + * scif_send() will block until the entire message is sent if SCIF_SEND_BLOCK + * is passed as the flags argument. + * + * Return: + * Upon successful completion, scif_send() returns the number of bytes sent; + * otherwise in user mode -1 is returned and errno is set to indicate the + * error; in kernel mode the negative of one of the following errors is + * returned. + * + * Errors: + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * EFAULT - An invalid address was specified for a parameter + * EINVAL - flags is invalid, or len is negative + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOMEM - Not enough space + * ENOTCONN - The endpoint is not connected + */ +int scif_send(scif_epd_t epd, void *msg, int len, int flags); + +/** + * scif_recv() - Receive a message + * @epd: endpoint descriptor + * @msg: message buffer address + * @len: message buffer length + * @flags: blocking mode flags + * + * scif_recv() receives data from the peer of endpoint epd. Up to len bytes of + * data are copied to memory starting at address msg. On successful execution + * the return value of scif_recv() is the number of bytes that were received, + * and is zero if no bytes were received because len was zero. scif_recv() may + * be called only when the endpoint is in a connected state. 
+ * + * If a scif_recv() call is non-blocking, then it receives only those bytes + * which can be received without waiting, up to a maximum of len bytes. + * + * If a scif_recv() call is blocking, then it normally returns after receiving + * all len bytes. If the blocking call was interrupted due to a disconnection, + * subsequent calls to scif_recv() will copy all bytes received up to the point + * of disconnection. + * + * In user mode, the select() and poll() functions can be used to determine + * when data is available to be received. In kernel mode, the scif_poll() + * function may be used for this purpose. + * + * It is recommended that scif_send()/scif_recv() only be used for short + * control-type message communication between SCIF endpoints. The SCIF RMA + * APIs are expected to provide better performance for transfer sizes of + * 1024 bytes or longer for the current MIC hardware and software + * implementation. + * + * scif_recv() will block until the entire message is received if + * SCIF_RECV_BLOCK is passed as the flags argument. + * + * Return: + * Upon successful completion, scif_recv() returns the number of bytes + * received; otherwise in user mode -1 is returned and errno is set to + * indicate the error; in kernel mode the negative of one of the following + * errors is returned. + * + * Errors: + * EAGAIN - The destination node is returning from a low power state + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * EFAULT - An invalid address was specified for a parameter + * EINVAL - flags is invalid, or len is negative + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOMEM - Not enough space + * ENOTCONN - The endpoint is not connected + */ +int scif_recv(scif_epd_t epd, void *msg, int len, int flags); + +/** + * scif_register() - Mark a memory region for remote access.
+ * @epd: endpoint descriptor + * @addr: starting virtual address + * @len: length of range + * @offset: offset of window + * @prot_flags: read/write protection flags + * @map_flags: mapping flags + * + * The scif_register() function opens a window, a range of whole pages of the + * registered address space of the endpoint epd, starting at offset po and + * continuing for len bytes. The value of po, further described below, is a + * function of the parameters offset and len, and the value of map_flags. Each + * page of the window represents the physical memory page which backs the + * corresponding page of the range of virtual address pages starting at addr + * and continuing for len bytes. addr and len are constrained to be multiples + * of the page size. A successful scif_register() call returns po. + * + * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset + * exactly, and offset is constrained to be a multiple of the page size. The + * mapping established by scif_register() will not replace any existing + * registration; an error is returned if any page within the range [offset, + * offset + len - 1] intersects an existing window. + * + * When SCIF_MAP_FIXED is not set, the implementation uses offset in an + * implementation-defined manner to arrive at po. The po value so chosen will + * be an area of the registered address space that the implementation deems + * suitable for a mapping of len bytes. An offset value of 0 is interpreted as + * granting the implementation complete freedom in selecting po, subject to + * constraints described below. A non-zero value of offset is taken to be a + * suggestion of an offset near which the mapping should be placed. When the + * implementation selects a value for po, it does not replace any extant + * window. In all cases, po will be a multiple of the page size. 
+ * + * The physical pages which are so represented by a window are available for + * access in calls to mmap(), scif_readfrom(), scif_writeto(), + * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the + * physical pages represented by the window will not be reused by the memory + * subsystem for any other purpose. Note that the same physical page may be + * represented by multiple windows. + * + * Subsequent operations which change the memory pages to which virtual + * addresses are mapped (such as mmap(), munmap()) have no effect on + * existing window. + * + * If the process will fork(), it is recommended that the registered + * virtual address range be marked with MADV_DONTFORK. Doing so will prevent + * problems due to copy-on-write semantics. + * + * The prot_flags argument is formed by OR'ing together one or more of the + * following values. + * SCIF_PROT_READ - allow read operations from the window + * SCIF_PROT_WRITE - allow write operations to the window + * + * The map_flags argument can be set to SCIF_MAP_FIXED which interprets a + * fixed offset. + * + * Return: + * Upon successful completion, scif_register() returns the offset at which the + * mapping was placed (po); otherwise in user mode SCIF_REGISTER_FAILED (that + * is (off_t *)-1) is returned and errno is set to indicate the error; in + * kernel mode the negative of one of the following errors is returned. 
+ * + * Errors: + * EADDRINUSE - SCIF_MAP_FIXED is set in map_flags, and pages in the range + * [offset, offset + len - 1] are already registered + * EAGAIN - The mapping could not be performed due to lack of resources + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * EFAULT - Addresses in the range [addr, addr + len - 1] are invalid + * EINVAL - map_flags is invalid, or prot_flags is invalid, or SCIF_MAP_FIXED is + * set in map_flags, and offset is not a multiple of the page size, or addr is not + * a multiple of the page size, or len is not a multiple of the page size, + * or is 0, or offset is negative + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOMEM - Not enough space + * ENOTCONN - The endpoint is not connected + */ +off_t scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset, + int prot_flags, int map_flags); + +/** + * scif_unregister() - Release a memory region previously marked for remote access. + * @epd: endpoint descriptor + * @offset: start of range to unregister + * @len: length of range to unregister + * + * The scif_unregister() function closes those previously registered windows + * which are entirely within the range [offset, offset + len - 1]. It is an + * error to specify a range which intersects only a subrange of a window. + * + * On a successful return, pages within the window may no longer be specified + * in calls to mmap(), scif_readfrom(), scif_writeto(), scif_vreadfrom(), + * scif_vwriteto(), scif_get_pages(), and scif_fence_signal(). The window, + * however, continues to exist until all previous references against it are + * removed. A window is referenced if there is a mapping to it created by + * mmap(), or if scif_get_pages() was called against the window + * (and the pages have not been returned via scif_put_pages()).
A window is + * also referenced while an RMA, in which some range of the window is a source + * or destination, is in progress. Finally a window is referenced while some + * offset in that window was specified to scif_fence_signal(), and the RMAs + * marked by that call to scif_fence_signal() have not completed. While a + * window is in this state, its registered address space pages are not + * available for use in a new registered window. + * + * When all such references to the window have been removed, its references to + * all the physical pages which it represents are removed. Similarly, the + * registered address space pages of the window become available for + * registration in a new window. + * + * Return: + * Upon successful completion, scif_unregister() returns 0; otherwise in user + * mode -1 is returned and errno is set to indicate the error; in kernel mode + * the negative of one of the following errors is returned. In the event of an + * error, no windows are unregistered. + * + * Errors: + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * EINVAL - the range [offset, offset + len - 1] intersects a subrange of a + * window, or offset is negative + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOTCONN - The endpoint is not connected + * ENXIO - Offsets in the range [offset, offset + len - 1] are invalid for the + * registered address space of epd + */ +int scif_unregister(scif_epd_t epd, off_t offset, size_t len); + +/** + * scif_readfrom() - Copy from a remote address space + * @epd: endpoint descriptor + * @loffset: offset in local registered address space to + * which to copy + * @len: length of range to copy + * @roffset: offset in remote registered address space + * from which to copy + * @rma_flags: transfer mode flags + * + * scif_readfrom() copies len bytes from the remote registered address space of + * the peer of 
endpoint epd, starting at the offset roffset to the local + * registered address space of epd, starting at the offset loffset. + * + * Each of the specified ranges [loffset, loffset + len - 1] and [roffset, + * roffset + len - 1] must be within some registered window or windows of the + * local and remote nodes. A range may intersect multiple registered windows, + * but only if those windows are contiguous in the registered address space. + * + * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using + * programmed read/writes. Otherwise the data is copied using DMA. If rma_- + * flags includes SCIF_RMA_SYNC, then scif_readfrom() will return after the + * transfer is complete. Otherwise, the transfer may be performed asynchron- + * ously. The order in which any two asynchronous RMA operations complete + * is non-deterministic. The synchronization functions, scif_fence_mark()/ + * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to + * the completion of asynchronous RMA operations on the same endpoint. + * + * The DMA transfer of individual bytes is not guaranteed to complete in + * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last + * cacheline or partial cacheline of the source range will become visible on + * the destination node after all other transferred data in the source + * range has become visible on the destination node. + * + * The optimal DMA performance will likely be realized if both + * loffset and roffset are cacheline aligned (are a multiple of 64). Lower + * performance will likely be realized if loffset and roffset are not + * cacheline aligned but are separated by some multiple of 64. The lowest level + * of performance is likely if loffset and roffset are not separated by a + * multiple of 64. + * + * The rma_flags argument is formed by ORing together zero or more of the + * following values. + * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA + * engine. 
+ * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the + * transfer has completed. Passing this flag results in the + * current implementation busy waiting and consuming CPU cycles + * while the DMA transfer is in progress for best performance by + * avoiding the interrupt latency. + * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of + * the source range becomes visible on the destination node + * after all other transferred data in the source range has + * become visible on the destination + * + * Return: + * Upon successful completion, scif_readfrom() returns 0; otherwise in user + * mode -1 is returned and errno is set to indicate the error; in kernel mode + * the negative of one of the following errors is returned. + * + * Errors: + * EACCES - Attempt to write to a read-only range + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * EINVAL - rma_flags is invalid + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOTCONN - The endpoint is not connected + * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered + * address space of epd, or the range [roffset, roffset + len - 1] is invalid + * for the registered address space of the peer of epd, or loffset or roffset + * is negative + */ +int scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, off_t + roffset, int rma_flags); + +/** + * scif_writeto() - Copy to a remote address space + * @epd: endpoint descriptor + * @loffset: offset in local registered address space + * from which to copy + * @len: length of range to copy + * @roffset: offset in remote registered address space to + * which to copy + * @rma_flags: transfer mode flags + * + * scif_writeto() copies len bytes from the local registered address space of + * epd, starting at the offset loffset to the remote registered address space + * of the peer of endpoint
epd, starting at the offset roffset. + * + * Each of the specified ranges [loffset, loffset + len - 1] and [roffset, + * roffset + len - 1] must be within some registered window or windows of the + * local and remote nodes. A range may intersect multiple registered windows, + * but only if those windows are contiguous in the registered address space. + * + * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using + * programmed read/writes. Otherwise the data is copied using DMA. If rma_- + * flags includes SCIF_RMA_SYNC, then scif_writeto() will return after the + * transfer is complete. Otherwise, the transfer may be performed asynchron- + * ously. The order in which any two asynchronous RMA operations complete + * is non-deterministic. The synchronization functions, scif_fence_mark()/ + * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to + * the completion of asynchronous RMA operations on the same endpoint. + * + * The DMA transfer of individual bytes is not guaranteed to complete in + * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last + * cacheline or partial cacheline of the source range will become visible on + * the destination node after all other transferred data in the source + * range has become visible on the destination node. + * + * The optimal DMA performance will likely be realized if both + * loffset and roffset are cacheline aligned (are a multiple of 64). Lower + * performance will likely be realized if loffset and roffset are not cacheline + * aligned but are separated by some multiple of 64. The lowest level of + * performance is likely if loffset and roffset are not separated by a multiple + * of 64. + * + * The rma_flags argument is formed by ORing together zero or more of the + * following values. + * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA + * engine. 
+ * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the + * transfer has completed. Passing this flag results in the + * current implementation busy waiting and consuming CPU cycles + * while the DMA transfer is in progress for best performance by + * avoiding the interrupt latency. + * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of + * the source range becomes visible on the destination node + * after all other transferred data in the source range has + * become visible on the destination + * + * Return: + * Upon successful completion, scif_writeto() returns 0; otherwise in user + * mode -1 is returned and errno is set to indicate the error; in kernel mode + * the negative of one of the following errors is returned. + * + * Errors: + * EACCES - Attempt to write to a read-only range + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * EINVAL - rma_flags is invalid + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOTCONN - The endpoint is not connected + * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered + * address space of epd, or the range [roffset, roffset + len - 1] is invalid + * for the registered address space of the peer of epd, or loffset or roffset + * is negative + */ +int scif_writeto(scif_epd_t epd, off_t loffset, size_t len, off_t + roffset, int rma_flags); + +/** + * scif_vreadfrom() - Copy from a remote address space + * @epd: endpoint descriptor + * @addr: address to which to copy + * @len: length of range to copy + * @roffset: offset in remote registered address space + * from which to copy + * @rma_flags: transfer mode flags + * + * scif_vreadfrom() copies len bytes from the remote registered address + * space of the peer of endpoint epd, starting at the offset roffset, to local + * memory, starting at addr.
+ * + * The specified range [roffset, roffset + len - 1] must be within some + * registered window or windows of the remote nodes. The range may + * intersect multiple registered windows, but only if those windows are + * contiguous in the registered address space. + * + * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using + * programmed read/writes. Otherwise the data is copied using DMA. If rma_- + * flags includes SCIF_RMA_SYNC, then scif_vreadfrom() will return after the + * transfer is complete. Otherwise, the transfer may be performed asynchron- + * ously. The order in which any two asynchronous RMA operations complete + * is non-deterministic. The synchronization functions, scif_fence_mark()/ + * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to + * the completion of asynchronous RMA operations on the same endpoint. + * + * The DMA transfer of individual bytes is not guaranteed to complete in + * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last + * cacheline or partial cacheline of the source range will become visible on + * the destination node after all other transferred data in the source + * range has become visible on the destination node. + * + * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back + * the specified local memory range may remain in a pinned state even after + * the specified transfer completes. This may reduce overhead if some or all of + * the same virtual address range is referenced in a subsequent call of + * scif_vreadfrom() or scif_vwriteto(). + * + * The optimal DMA performance will likely be realized if both + * addr and roffset are cacheline aligned (are a multiple of 64). Lower + * performance will likely be realized if addr and roffset are not + * cacheline aligned but are separated by some multiple of 64. The lowest level + * of performance is likely if addr and roffset are not separated by a + * multiple of 64.
+ * + * The rma_flags argument is formed by ORing together zero or more of the + * following values. + * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA + * engine. + * SCIF_RMA_USECACHE - enable registration caching + * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the + * transfer has completed. Passing this flag results in the + * current implementation busy waiting and consuming CPU cycles + * while the DMA transfer is in progress for best performance by + * avoiding the interrupt latency. + * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of + * the source range becomes visible on the destination node + * after all other transferred data in the source range has + * become visible on the destination + * + * Return: + * Upon successful completion, scif_vreadfrom() returns 0; otherwise in user + * mode -1 is returned and errno is set to indicate the error; in kernel mode + * the negative of one of the following errors is returned. 
+ * + * Errors: + * EACCES - Attempt to write to a read-only range + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * EFAULT - Addresses in the range [addr, addr + len - 1] are invalid + * EINVAL - rma_flags is invalid + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOTCONN - The endpoint is not connected + * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the + * registered address space of the peer of epd + */ +int scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, off_t roffset, + int rma_flags); + +/** + * scif_vwriteto() - Copy to a remote address space + * @epd: endpoint descriptor + * @addr: address from which to copy + * @len: length of range to copy + * @roffset: offset in remote registered address space to + * which to copy + * @rma_flags: transfer mode flags + * + * scif_vwriteto() copies len bytes from the local memory, starting at addr, to + * the remote registered address space of the peer of endpoint epd, starting at + * the offset roffset. + * + * The specified range [roffset, roffset + len - 1] must be within some + * registered window or windows of the remote nodes. The range may intersect + * multiple registered windows, but only if those windows are contiguous in the + * registered address space. + * + * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using + * programmed read/writes. Otherwise the data is copied using DMA. If rma_- + * flags includes SCIF_RMA_SYNC, then scif_vwriteto() will return after the + * transfer is complete. Otherwise, the transfer may be performed asynchron- + * ously. The order in which any two asynchronous RMA operations complete + * is non-deterministic. The synchronization functions, scif_fence_mark()/ + * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to + * the completion of asynchronous RMA operations on the same endpoint.
+ * + * The DMA transfer of individual bytes is not guaranteed to complete in + * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last + * cacheline or partial cacheline of the source range will become visible on + * the destination node after all other transferred data in the source + * range has become visible on the destination node. + * + * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back + * the specified local memory range may remain in a pinned state even after + * the specified transfer completes. This may reduce overhead if some or all of + * the same virtual address range is referenced in a subsequent call of + * scif_vreadfrom() or scif_vwriteto(). + * + * The optimal DMA performance will likely be realized if both + * addr and roffset are cacheline aligned (are a multiple of 64). Lower + * performance will likely be realized if addr and roffset are not + * cacheline aligned but are separated by some multiple of 64. The lowest level + * of performance is likely if addr and roffset are not separated by a + * multiple of 64. + * + * The rma_flags argument is formed by ORing together zero or more of the + * following values. + * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA + * engine. + * SCIF_RMA_USECACHE - allow registration caching + * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the + * transfer has completed. Passing this flag results in the + * current implementation busy waiting and consuming CPU cycles + * while the DMA transfer is in progress for best performance by + * avoiding the interrupt latency.
+ * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of + * the source range becomes visible on the destination node + * after all other transferred data in the source range has + * become visible on the destination + * + * Return: + * Upon successful completion, scif_vwriteto() returns 0; otherwise in user + * mode -1 is returned and errno is set to indicate the error; in kernel mode + * the negative of one of the following errors is returned. + * + * Errors: + * EACCES - Attempt to write to a read-only range + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * EFAULT - Addresses in the range [addr, addr + len - 1] are invalid + * EINVAL - rma_flags is invalid + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOTCONN - The endpoint is not connected + * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the + * registered address space of the peer of epd + */ +int scif_vwriteto(scif_epd_t epd, void *addr, size_t len, off_t roffset, + int rma_flags); + +/** + * scif_fence_mark() - Mark previously issued RMAs + * @epd: endpoint descriptor + * @flags: control flags + * @mark: marked value returned as output. + * + * scif_fence_mark() returns after marking the current set of all uncompleted + * RMAs initiated through the endpoint epd or the current set of all + * uncompleted RMAs initiated through the peer of endpoint epd. The RMAs are + * marked with a value returned at mark. The application may subsequently call + * scif_fence_wait(), passing the value returned at mark, to await completion + * of all RMAs so marked. + * + * The flags argument has exactly one of the following values.
+ * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint + * epd are marked + * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer + * of endpoint epd are marked + * + * Return: + * Upon successful completion, scif_fence_mark() returns 0; otherwise in user + * mode -1 is returned and errno is set to indicate the error; in kernel mode + * the negative of one of the following errors is returned. + * + * Errors: + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * EINVAL - flags is invalid + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOTCONN - The endpoint is not connected + * ENOMEM - Insufficient kernel memory was available + */ +int scif_fence_mark(scif_epd_t epd, int flags, int *mark); + +/** + * scif_fence_wait() - Wait for completion of marked RMAs + * @epd: endpoint descriptor + * @mark: mark request + * + * scif_fence_wait() returns after all RMAs marked with mark have completed. + * The value passed in mark must have been obtained in a previous call to + * scif_fence_mark(). + * + * Return: + * Upon successful completion, scif_fence_wait() returns 0; otherwise in user + * mode -1 is returned and errno is set to indicate the error; in kernel mode + * the negative of one of the following errors is returned. 
+ * + * Errors: + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOTCONN - The endpoint is not connected + * ENOMEM - Insufficient kernel memory was available + */ +int scif_fence_wait(scif_epd_t epd, int mark); + +/** + * scif_fence_signal() - Request a memory update on completion of RMAs + * @epd: endpoint descriptor + * @loff: local offset + * @lval: local value to write to loffset + * @roff: remote offset + * @rval: remote value to write to roffset + * @flags: flags + * + * scif_fence_signal() returns after marking the current set of all uncompleted + * RMAs initiated through the endpoint epd or marking the current set of all + * uncompleted RMAs initiated through the peer of endpoint epd. + * + * If flags includes SCIF_SIGNAL_LOCAL, then on completion of the RMAs in the + * marked set, lval is written to memory at the address corresponding to offset + * loff in the local registered address space of epd. loff must be within a + * registered window. If flags includes SCIF_SIGNAL_REMOTE, then on completion + * of the RMAs in the marked set, rval is written to memory at the address + * corresponding to offset roff in the remote registered address space of epd. + * roff must be within a remote registered window of the peer of epd. Note + * that any specified offset must be DWORD (4 byte / 32 bit) aligned. + * + * The flags argument is formed by OR'ing together the following. + * Exactly one of the following values. + * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint + * epd are marked + * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer + * of endpoint epd are marked + * One or more of the following values. 
+ * SCIF_SIGNAL_LOCAL - On completion of the marked set of RMAs, write lval to + * memory at the address corresponding to offset loff in the local + * registered address space of epd. + * SCIF_SIGNAL_REMOTE - On completion of the marked set of RMAs, write rval to + * memory at the address corresponding to offset roff in the remote + * registered address space of epd. + * + * Return: + * Upon successful completion, scif_fence_signal() returns 0; otherwise in + * user mode -1 is returned and errno is set to indicate the error; in kernel + * mode the negative of one of the following errors is returned. + * + * Errors: + * EBADF, ENOTTY - epd is not a valid endpoint descriptor + * ECONNRESET - Connection reset by peer + * EINVAL - flags is invalid, or loff or roff are not DWORD aligned + * ENODEV - The remote node is lost or existed, but is not currently in the + * network since it may have crashed + * ENOTCONN - The endpoint is not connected + * ENXIO - loff is invalid for the registered address of epd, or roff is invalid + * for the registered address space, of the peer of epd + */ +int scif_fence_signal(scif_epd_t epd, off_t loff, u64 lval, off_t roff, + u64 rval, int flags); + +/** + * scif_get_node_ids() - Return information about online nodes + * @nodes: array in which to return online node IDs + * @len: number of entries in the nodes array + * @self: address to place the node ID of the local node + * + * scif_get_node_ids() fills in the nodes array with up to len node IDs of the + * nodes in the SCIF network. If there is not enough space in nodes, as + * indicated by the len parameter, only len node IDs are returned in nodes. The + * return value of scif_get_node_ids() is the total number of nodes currently in + * the SCIF network. By checking the return value against the len parameter, + * the user may determine if enough space for nodes was allocated. + * + * The node ID of the local node is returned at self. 
+ * + * Return: + * Upon successful completion, scif_get_node_ids() returns the actual number of + * online nodes in the SCIF network including 'self'; otherwise in user mode + * -1 is returned and errno is set to indicate the error; in kernel mode no + * errors are returned. + * + * Errors: + * EFAULT - Bad address + */ +int scif_get_node_ids(u16 *nodes, int len, u16 *self); + +#endif /* __SCIF_H__ */ -- cgit v1.2.3 From 3647a83d9dcf00b8e17777ec8aa1e48f1ed4fe06 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Sat, 11 Apr 2015 18:07:39 -0700 Subject: Drivers: hv: util: move kvp/vss function declarations to hyperv_vmbus.h These declarations are internal to hv_util module and hv_fcopy_* declarations already reside there. Signed-off-by: Vitaly Kuznetsov Tested-by: Alex Ng Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- include/linux/hyperv.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 902c37aef67e..1744148a39f9 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1236,13 +1236,6 @@ extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *, struct icmsg_negotiate *, u8 *, int, int); -int hv_kvp_init(struct hv_util_service *); -void hv_kvp_deinit(void); -void hv_kvp_onchannelcallback(void *); - -int hv_vss_init(struct hv_util_service *); -void hv_vss_deinit(void); -void hv_vss_onchannelcallback(void *); void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid); extern struct resource hyperv_mmio; -- cgit v1.2.3 From db9ba2088f6507fee370904f02db1eb9b49bd088 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Wed, 22 Apr 2015 21:31:31 -0700 Subject: drivers: hv: vmbus: Get rid of some unused definitions Get rid of some unused definitions. Signed-off-by: K. Y. 
Srinivasan Signed-off-by: Greg Kroah-Hartman --- include/linux/hyperv.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 1744148a39f9..e29ccddc6300 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -389,10 +389,6 @@ enum vmbus_channel_message_type { CHANNELMSG_INITIATE_CONTACT = 14, CHANNELMSG_VERSION_RESPONSE = 15, CHANNELMSG_UNLOAD = 16, -#ifdef VMBUS_FEATURE_PARENT_OR_PEER_MEMORY_MAPPED_INTO_A_CHILD - CHANNELMSG_VIEWRANGE_ADD = 17, - CHANNELMSG_VIEWRANGE_REMOVE = 18, -#endif CHANNELMSG_COUNT }; @@ -549,21 +545,6 @@ struct vmbus_channel_gpadl_torndown { u32 gpadl; } __packed; -#ifdef VMBUS_FEATURE_PARENT_OR_PEER_MEMORY_MAPPED_INTO_A_CHILD -struct vmbus_channel_view_range_add { - struct vmbus_channel_message_header header; - PHYSICAL_ADDRESS viewrange_base; - u64 viewrange_length; - u32 child_relid; -} __packed; - -struct vmbus_channel_view_range_remove { - struct vmbus_channel_message_header header; - PHYSICAL_ADDRESS viewrange_base; - u32 child_relid; -} __packed; -#endif - struct vmbus_channel_relid_released { struct vmbus_channel_message_header header; u32 child_relid; -- cgit v1.2.3 From 2db84eff127e3f4b3635edc589cd6a56db8755a3 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Wed, 22 Apr 2015 21:31:32 -0700 Subject: Drivers: hv: vmbus: Implement the protocol for tearing down vmbus state Implement the protocol for tearing down the monitor state established with the host. Signed-off-by: K. Y. 
Srinivasan Tested-by: Vitaly Kuznetsov Signed-off-by: Greg Kroah-Hartman --- include/linux/hyperv.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index e29ccddc6300..ea934864293d 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -389,6 +389,7 @@ enum vmbus_channel_message_type { CHANNELMSG_INITIATE_CONTACT = 14, CHANNELMSG_VERSION_RESPONSE = 15, CHANNELMSG_UNLOAD = 16, + CHANNELMSG_UNLOAD_RESPONSE = 17, CHANNELMSG_COUNT }; -- cgit v1.2.3 From fea844a2b0edd6540d5cde2cd54a8a3c86e9c53f Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov <vkuznets@redhat.com> Date: Wed, 6 May 2015 17:47:43 -0700 Subject: Drivers: hv: vmbus: briefly comment num_sc and next_oc next_oc and num_sc fields of struct vmbus_channel deserve a description. Move them closer to sc_list as these fields are related to it. Signed-off-by: Vitaly Kuznetsov Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- include/linux/hyperv.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index ea934864293d..3932a993ff5a 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -726,6 +726,15 @@ struct vmbus_channel { * All Sub-channels of a primary channel are linked here. */ struct list_head sc_list; + /* + * Current number of sub-channels. + */ + int num_sc; + /* + * Number of a sub-channel (position within sc_list) which is supposed + * to be used as the next outgoing channel. + */ + int next_oc; /* * The primary channel this sub-channel belongs to. * This will be NULL for the primary channel. @@ -740,9 +749,6 @@ struct vmbus_channel { * link up channels based on their CPU affinity. 
*/ struct list_head percpu_list; - - int num_sc; - int next_oc; }; static inline void set_channel_read_state(struct vmbus_channel *c, bool state) -- cgit v1.2.3 From 80c6e1465948c2e91214f01764f427d31ebedb26 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 21 May 2015 15:49:37 -0700 Subject: driver-core: fix build for !CONFIG_MODULES Commit f2411da74698 ("driver-core: add driver module asynchronous probe support") broke build in case modules are disabled, because in this case "struct module" is not defined and we can't dereference it. Let's define module_requested_async_probing() helper and stub it out if modules are disabled. Reported-by: kbuild test robot Reported-by: Stephen Rothwell Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- include/linux/module.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index f46a47d3c0dc..57f5c0a804c0 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -510,6 +510,11 @@ int unregister_module_notifier(struct notifier_block *nb); extern void print_modules(void); +static inline bool module_requested_async_probing(struct module *module) +{ + return module && module->async_probe_requested; +} + #else /* !CONFIG_MODULES... */ /* Given an address, look for it in the exception tables. 
*/ @@ -620,6 +625,12 @@ static inline int unregister_module_notifier(struct notifier_block *nb) static inline void print_modules(void) { } + +static inline bool module_requested_async_probing(struct module *module) +{ + return false; +} + #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS -- cgit v1.2.3 From 2539b258ec028351af954c169ea1b0ff72023a9f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 8 May 2015 14:45:34 +0100 Subject: drivers/base: cacheinfo: fix annoying typo when DT nodes are absent s/hierarcy/hierarchy/ Maybe the typo will annoy people enough so that they add the missing nodes to their device-tree files, but I still think this is better off fixed. Signed-off-by: Will Deacon Acked-by: Sudeep Holla Signed-off-by: Greg Kroah-Hartman --- include/linux/cacheinfo.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 3daf5ed392c9..2189935075b4 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -19,7 +19,7 @@ enum cache_type { /** * struct cacheinfo - represent a cache leaf node * @type: type of the cache - data, inst or unified - * @level: represents the hierarcy in the multi-level cache + * @level: represents the hierarchy in the multi-level cache * @coherency_line_size: size of each cache line usually representing * the minimum amount of data that gets transferred from memory * @number_of_sets: total number of sets, a set is a collection of cache -- cgit v1.2.3 From f4445f8b204de44a8baa4326b0e56537be867427 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 14 May 2015 15:28:24 +0100 Subject: drivers: of/base: move of_init to driver_init Commit 5590f3196b29 ("drivers/core/of: Add symlink to device-tree from devices with an OF node") adds the symlink `of_node` for each device pointing to its device tree node while creating/initialising it. 
However the devicetree sysfs is created and setup in of_init which is executed at core_initcall level. For all the devices created before of_init, the following error is thrown: "Error -2(-ENOENT) creating of_node link" Like many other components in driver model, initialize the sysfs support for OF/devicetree from driver_init so that it's ready before any devices are created. Fixes: 5590f3196b29 ("drivers/core/of: Add symlink to device-tree from devices with an OF node") Suggested-by: Rob Herring Cc: Grant Likely Cc: Pawel Moll Cc: Benjamin Herrenschmidt Signed-off-by: Sudeep Holla Tested-by: Robert Schwebel Acked-by: Rob Herring Signed-off-by: Greg Kroah-Hartman --- include/linux/of.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index ddeaae6d2083..b871ff9d81d7 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -121,6 +121,8 @@ extern struct device_node *of_stdout; extern raw_spinlock_t devtree_lock; #ifdef CONFIG_OF +void of_core_init(void); + static inline bool is_of_node(struct fwnode_handle *fwnode) { return fwnode && fwnode->type == FWNODE_OF; @@ -376,6 +378,10 @@ bool of_console_check(struct device_node *dn, char *name, int index); #else /* CONFIG_OF */ +static inline void of_core_init(void) +{ +} + static inline bool is_of_node(struct fwnode_handle *fwnode) { return false; -- cgit v1.2.3 From be12a1fe298e8be04d5215364f94654dff81b0bc Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Thu, 21 May 2015 16:59:58 +0200 Subject: net: skbuff: add skb_append_pagefrags and use it Signed-off-by: Hannes Frederic Sowa Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b617095adb88..f708936cdd23 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -861,6 +861,9 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, int len, int odd, struct sk_buff *skb), void *from, int length); +int skb_append_pagefrags(struct sk_buff *skb, struct page *page, + int offset, size_t size); + struct skb_seq_state { __u32 lower_offset; __u32 upper_offset; -- cgit v1.2.3 From a60e3cc7c92973a31fad0fd04dc5cf4355d3d1ef Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Thu, 21 May 2015 17:00:00 +0200 Subject: net: make skb_splice_bits more configureable Prepare skb_splice_bits to be able to deal with AF_UNIX sockets. AF_UNIX sockets don't use lock_sock/release_sock and thus we have to use a callback to make the locking and unlocking configureable. Signed-off-by: Hannes Frederic Sowa Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f708936cdd23..6b41c15efa27 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -35,6 +35,7 @@ #include #include #include +#include /* A. Checksumming of received packets by device. 
* @@ -2699,9 +2700,15 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len); int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len); __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len, __wsum csum); -int skb_splice_bits(struct sk_buff *skb, unsigned int offset, +ssize_t skb_socket_splice(struct sock *sk, + struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd); +int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int len, - unsigned int flags); + unsigned int flags, + ssize_t (*splice_cb)(struct sock *, + struct pipe_inode_info *, + struct splice_pipe_desc *)); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); unsigned int skb_zerocopy_headlen(const struct sk_buff *from); int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, -- cgit v1.2.3 From a57e16cf03339c20b09642f46f60190069ff70c7 Mon Sep 17 00:00:00 2001 From: Robert Jarzmik Date: Mon, 25 May 2015 23:29:20 +0200 Subject: dmaengine: pxa: add pxa dmaengine driver This is a new driver for pxa SoCs, which is also compatible with the former mmp_pdma. The rationale behind a new driver (as opposed to incremental patching) was : - the new driver relies on virt-dma, which obsoletes all the internal structures of mmp_pdma (sw_desc, hw_desc, ...), and by consequence all the functions - mmp_pdma allocates dma coherent descriptors containing not only hardware descriptors but linked list information The new driver only puts the dma hardware descriptors (ie. 4 u32) into the dma pool allocated memory. This changes completely the way descriptors are handled - the architecture behind the interrupt/tasklet management was rewritten to be more conforming to virt-dma - the buffers alignment is handled differently The former driver assumed that the DMA channel stopped between each descriptor. The new one chains descriptors to let the channel running. 
This is a necessary guarantee for real-time high bandwidth usecases such as video capture on "old" architectures such as pxa. - hot chaining / cold chaining / no chaining Whenever possible, submitting a descriptor "hot chains" it to a running channel. There is still no guarantee that the descriptor will be issued, as the channel might be stopped just before the descriptor is submitted. Yet this allows submitting several video buffers, and resubmitting a buffer while another is being handled. As before, dma_async_issue_pending() is the only guarantee to have all the buffers issued. When an alignment issue is detected (ie. one address in a descriptor is not a multiple of 8), if the already running channel is in "aligned mode", the channel will be stopped and restarted in "misaligned mode" to finish the issued list. - descriptors reusing A submitted, issued and completed descriptor can be reused, ie resubmitted if it was prepared with the proper flag (DMA_PREP_ACK). Only a channel resources release will in this case release that buffer. This allows a rolling ring of buffers to be reused, where there are several thousands of hardware descriptors used (video buffer for example). Additionally, a set of more casual features is introduced : - debugging traces - lockless way to know if a descriptor is terminated or not The driver was tested on zylonite board (pxa3xx) and mioa701 (pxa27x), with dmatest, pxa_camera and pxamci. 
Signed-off-by: Robert Jarzmik Signed-off-by: Vinod Koul --- include/linux/dma/pxa-dma.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 include/linux/dma/pxa-dma.h (limited to 'include/linux') diff --git a/include/linux/dma/pxa-dma.h b/include/linux/dma/pxa-dma.h new file mode 100644 index 000000000000..3edc99294bf6 --- /dev/null +++ b/include/linux/dma/pxa-dma.h @@ -0,0 +1,27 @@ +#ifndef _PXA_DMA_H_ +#define _PXA_DMA_H_ + +enum pxad_chan_prio { + PXAD_PRIO_HIGHEST = 0, + PXAD_PRIO_NORMAL, + PXAD_PRIO_LOW, + PXAD_PRIO_LOWEST, +}; + +struct pxad_param { + unsigned int drcmr; + enum pxad_chan_prio prio; +}; + +struct dma_chan; + +#ifdef CONFIG_PXA_DMA +bool pxad_filter_fn(struct dma_chan *chan, void *param); +#else +static inline bool pxad_filter_fn(struct dma_chan *chan, void *param) +{ + return false; +} +#endif + +#endif /* _PXA_DMA_H_ */ -- cgit v1.2.3 From 09170a49422bd786be3eac5cec1955257c5a34b7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 18 May 2015 13:59:39 +0200 Subject: KVM: const-ify uses of struct kvm_userspace_memory_region Architecture-specific helpers are not supposed to muck with struct kvm_userspace_memory_region contents. Add const to enforce this. In order to eliminate the only write in __kvm_set_memory_region, the cleaning of deleted slots is pulled up from update_memslots to __kvm_set_memory_region. 
Reviewed-by: Takuya Yoshikawa Reviewed-by: Radim Krcmar Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 87fd74a04005..fbced7015ebd 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -501,9 +501,9 @@ enum kvm_mr_change { }; int kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem); + const struct kvm_userspace_memory_region *mem); int __kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem); + const struct kvm_userspace_memory_region *mem); void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont); int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, @@ -511,10 +511,10 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, void kvm_arch_memslots_updated(struct kvm *kvm); int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem, + const struct kvm_userspace_memory_region *mem, enum kvm_mr_change change); void kvm_arch_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, + const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, enum kvm_mr_change change); bool kvm_largepages_enabled(void); -- cgit v1.2.3 From 15f46015ee17681b542432df21747f5c51857156 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 17 May 2015 21:26:08 +0200 Subject: KVM: add memslots argument to kvm_arch_memslots_updated Prepare for the case of multiple address spaces. 
Reviewed-by: Radim Krcmar Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- include/linux/kvm_types.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index fbced7015ebd..8815f1dffb77 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -508,7 +508,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont); int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned long npages); -void kvm_arch_memslots_updated(struct kvm *kvm); +void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots); int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, const struct kvm_userspace_memory_region *mem, diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 931da7e917cf..1b47a185c2f0 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -28,6 +28,7 @@ struct kvm_run; struct kvm_userspace_memory_region; struct kvm_vcpu; struct kvm_vcpu_init; +struct kvm_memslots; enum kvm_mr_change; -- cgit v1.2.3 From cacce073bfd2b4091fbc58e906e7a9a21f538ff6 Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Thu, 14 May 2015 23:05:49 +0200 Subject: bcma: add module_bcma_driver() This makes it possible to save some lines of code in drivers with an simple bcma driver registration. 
Signed-off-by: Hauke Mehrtens Signed-off-by: Kalle Valo --- include/linux/bcma/bcma.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index e34f906647d3..2ff4a9961e1d 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -305,6 +305,15 @@ int __bcma_driver_register(struct bcma_driver *drv, struct module *owner); extern void bcma_driver_unregister(struct bcma_driver *drv); +/* module_bcma_driver() - Helper macro for drivers that don't do + * anything special in module init/exit. This eliminates a lot of + * boilerplate. Each module may only use this macro once, and + * calling it replaces module_init() and module_exit() + */ +#define module_bcma_driver(__bcma_driver) \ + module_driver(__bcma_driver, bcma_driver_register, \ + bcma_driver_unregister) + /* Set a fallback SPROM. * See kdoc at the function definition for complete documentation. */ extern int bcma_arch_register_fallback_sprom( -- cgit v1.2.3 From 069d4a7b583274e3fd8712c92a035626e0ebf7be Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 3 Mar 2015 11:58:14 +0100 Subject: netfilter: ebtables: fix comment grammar s/stongly inspired on/strongly inspired by/ Signed-off-by: Geert Uytterhoeven Cc: David S. Miller Signed-off-by: Jiri Kosina --- include/linux/netfilter_bridge/ebtables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index 34e7a2b7f867..9ac6f263956b 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -6,7 +6,7 @@ * * ebtables.c,v 2.0, April, 2002 * - * This code is stongly inspired on the iptables code which is + * This code is strongly inspired by the iptables code which is * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. 
Neuling */ #ifndef __LINUX_BRIDGE_EFF_H -- cgit v1.2.3 From 7667928601d2981b20011e357904bcb96c365427 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Fri, 8 May 2015 00:02:27 +0900 Subject: rapidio: Fix kerneldoc and comment This patch fixes spelling typos found in DocBook/rapidio.xml. Since this file is generated from comments in the source files, I had to fix them there instead of in the xml file. Signed-off-by: Masanari Iida Signed-off-by: Jiri Kosina --- include/linux/rio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rio.h b/include/linux/rio.h index 6bda06f21930..cde976e86b48 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -298,7 +298,7 @@ struct rio_id_table { * struct rio_net - RIO network info * @node: Node in global list of RIO networks * @devices: List of devices in this network - * @switches: List of switches in this netowrk + * @switches: List of switches in this network * @mports: List of master ports accessing this network * @hport: Default port for accessing this network * @id: RIO network ID -- cgit v1.2.3 From 94268fcd9aea736d24cbdade16e7f7c9419c489e Mon Sep 17 00:00:00 2001 From: Antonio Ospite Date: Tue, 28 Apr 2015 13:11:28 +0200 Subject: lib: crc-itu-t.[ch] fix 0x0x prefix in integer constants Signed-off-by: Antonio Ospite Acked-by: Greg Kroah-Hartman Signed-off-by: Jiri Kosina --- include/linux/crc-itu-t.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/crc-itu-t.h b/include/linux/crc-itu-t.h index 84920f3cc83e..a9953c762eee 100644 --- a/include/linux/crc-itu-t.h +++ b/include/linux/crc-itu-t.h @@ -3,7 +3,7 @@ * * Implements the standard CRC ITU-T V.41: * Width 16 - * Poly 0x0x1021 (x^16 + x^12 + x^15 + 1) + * Poly 0x1021 (x^16 + x^12 + x^15 + 1) * Init 0 * * This source code is licensed under the GNU General Public License, -- cgit v1.2.3 From e0213bc5467ca5fe44ab04527f0e47998f30c046 Mon Sep 17 00:00:00 2001 From: 
Yoshihiro Shimoda Date: Mon, 18 May 2015 20:04:14 +0900 Subject: usb: renesas_usbhs: Change USBHS_TYPE_R8A779x to USBHS_TYPE_RCAR_GEN2 Since the HSUSB controllers of R-Car Gen2 are the same specification (they have 16 pipes and usb-dmac), this patch changes USBHS_TYPE_R8A7790 and USBHS_TYPE_R8A7791 to USBHS_TYPE_RCAR_GEN2. Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- include/linux/usb/renesas_usbhs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/renesas_usbhs.h b/include/linux/usb/renesas_usbhs.h index f06529c14141..3dd5a781da99 100644 --- a/include/linux/usb/renesas_usbhs.h +++ b/include/linux/usb/renesas_usbhs.h @@ -169,8 +169,7 @@ struct renesas_usbhs_driver_param { #define USBHS_USB_DMAC_XFER_SIZE 32 /* hardcode the xfer size */ }; -#define USBHS_TYPE_R8A7790 1 -#define USBHS_TYPE_R8A7791 2 +#define USBHS_TYPE_RCAR_GEN2 1 /* * option: -- cgit v1.2.3 From a09e23f53e2c14a65a3b14a00060fea163081e1f Mon Sep 17 00:00:00 2001 From: Mian Yousaf Kaukab Date: Sat, 16 May 2015 22:33:35 +0200 Subject: usb: gadget: net2280: check interrupts for all endpoints USB3380 in enhanced mode has 4 IN and 4 OUT endpoints. Check interrupts for all of them. 
Tested-by: Ricardo Ribalda Delgado Signed-off-by: Mian Yousaf Kaukab Signed-off-by: Felipe Balbi --- include/linux/usb/net2280.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/net2280.h b/include/linux/usb/net2280.h index 148b8fa5b1a2..725120224472 100644 --- a/include/linux/usb/net2280.h +++ b/include/linux/usb/net2280.h @@ -168,6 +168,9 @@ struct net2280_regs { #define ENDPOINT_B_INTERRUPT 2 #define ENDPOINT_A_INTERRUPT 1 #define ENDPOINT_0_INTERRUPT 0 +#define USB3380_IRQSTAT0_EP_INTR_MASK_IN (0xF << 17) +#define USB3380_IRQSTAT0_EP_INTR_MASK_OUT (0xF << 1) + u32 irqstat1; #define POWER_STATE_CHANGE_INTERRUPT 27 #define PCI_ARBITER_TIMEOUT_INTERRUPT 26 -- cgit v1.2.3 From c65c4f052bc3b67989bf54914798513685c54988 Mon Sep 17 00:00:00 2001 From: Mian Yousaf Kaukab Date: Sat, 16 May 2015 22:33:36 +0200 Subject: usb: gadget: net2280: fix use of GPEP in both directions USB3380 enhanced mode allows GPEP to be used in both IN and OUT directions. However, IN and OUT endpoints must use same USB endpoint address (bEndpointAddress). 
Fix this by setting the ep_cfg.ep_number during initialization and keep it in net2280_enable() Tested-by: Ricardo Ribalda Delgado Signed-off-by: Mian Yousaf Kaukab Signed-off-by: Felipe Balbi --- include/linux/usb/usb338x.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/usb338x.h b/include/linux/usb/usb338x.h index f92eb635b9d3..11525d8d89a7 100644 --- a/include/linux/usb/usb338x.h +++ b/include/linux/usb/usb338x.h @@ -43,6 +43,10 @@ #define IN_ENDPOINT_TYPE 12 #define OUT_ENDPOINT_ENABLE 10 #define OUT_ENDPOINT_TYPE 8 +#define USB3380_EP_CFG_MASK_IN ((0x3 << IN_ENDPOINT_TYPE) | \ + BIT(IN_ENDPOINT_ENABLE)) +#define USB3380_EP_CFG_MASK_OUT ((0x3 << OUT_ENDPOINT_TYPE) | \ + BIT(OUT_ENDPOINT_ENABLE)) struct usb338x_usb_ext_regs { u32 usbclass; -- cgit v1.2.3 From e842b84c8e7221c45c8dbd7de09185c6149e1cf9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 23 Mar 2015 09:52:48 +1100 Subject: usb: phy: Add interface to get phy give of device_node. Split the "get phy from device_node" functionality out of "get phy by phandle" so it can be used directly. This is useful when a battery-charger is intimately associated with a particular phy but handled by a separate driver. The charger can find the device_node based on sibling relationships without the need for a redundant declaration in the devicetree description. As a peripheral that gets a phy will often want to register a notifier block, and de-register it later, that functionality is included so the de-registration is automatic. 
Acked-by: Pavel Machek Signed-off-by: NeilBrown Signed-off-by: Felipe Balbi --- include/linux/usb/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h index bc91b5d380fd..8ed1e29ef329 100644 --- a/include/linux/usb/phy.h +++ b/include/linux/usb/phy.h @@ -205,6 +205,8 @@ extern struct usb_phy *usb_get_phy_dev(struct device *dev, u8 index); extern struct usb_phy *devm_usb_get_phy_dev(struct device *dev, u8 index); extern struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, const char *phandle, u8 index); +extern struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, + struct device_node *node, struct notifier_block *nb); extern void usb_put_phy(struct usb_phy *); extern void devm_usb_put_phy(struct device *dev, struct usb_phy *x); extern int usb_bind_phy(const char *dev_name, u8 index, -- cgit v1.2.3 From e4b88e19897f1039fd83f1630517becafc0dd163 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 22 May 2015 13:44:33 -0700 Subject: Input: stmpe-ts - enforce device tree only mode The STMPE MFD is only used with device tree configured systems (and STMPE MFD core depends on OF), so force the configuration to come from device tree only. Tested-by: Heiner Kallweit Reviewed-by: Marek Vasut Acked-by: Lee Jones Signed-off-by: Dmitry Torokhov --- include/linux/mfd/stmpe.h | 44 -------------------------------------------- 1 file changed, 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/stmpe.h b/include/linux/mfd/stmpe.h index c9d869027300..cb83883918a7 100644 --- a/include/linux/mfd/stmpe.h +++ b/include/linux/mfd/stmpe.h @@ -117,47 +117,6 @@ extern int stmpe_disable(struct stmpe *stmpe, unsigned int blocks); #define STMPE_GPIO_NOREQ_811_TOUCH (0xf0) -/** - * struct stmpe_ts_platform_data - stmpe811 touch screen controller platform - * data - * @sample_time: ADC converstion time in number of clock. 
- * (0 -> 36 clocks, 1 -> 44 clocks, 2 -> 56 clocks, 3 -> 64 clocks, - * 4 -> 80 clocks, 5 -> 96 clocks, 6 -> 144 clocks), - * recommended is 4. - * @mod_12b: ADC Bit mode (0 -> 10bit ADC, 1 -> 12bit ADC) - * @ref_sel: ADC reference source - * (0 -> internal reference, 1 -> external reference) - * @adc_freq: ADC Clock speed - * (0 -> 1.625 MHz, 1 -> 3.25 MHz, 2 || 3 -> 6.5 MHz) - * @ave_ctrl: Sample average control - * (0 -> 1 sample, 1 -> 2 samples, 2 -> 4 samples, 3 -> 8 samples) - * @touch_det_delay: Touch detect interrupt delay - * (0 -> 10 us, 1 -> 50 us, 2 -> 100 us, 3 -> 500 us, - * 4-> 1 ms, 5 -> 5 ms, 6 -> 10 ms, 7 -> 50 ms) - * recommended is 3 - * @settling: Panel driver settling time - * (0 -> 10 us, 1 -> 100 us, 2 -> 500 us, 3 -> 1 ms, - * 4 -> 5 ms, 5 -> 10 ms, 6 for 50 ms, 7 -> 100 ms) - * recommended is 2 - * @fraction_z: Length of the fractional part in z - * (fraction_z ([0..7]) = Count of the fractional part) - * recommended is 7 - * @i_drive: current limit value of the touchscreen drivers - * (0 -> 20 mA typical 35 mA max, 1 -> 50 mA typical 80 mA max) - * - * */ -struct stmpe_ts_platform_data { - u8 sample_time; - u8 mod_12b; - u8 ref_sel; - u8 adc_freq; - u8 ave_ctrl; - u8 touch_det_delay; - u8 settling; - u8 fraction_z; - u8 i_drive; -}; - /** * struct stmpe_platform_data - STMPE platform data * @id: device id to distinguish between multiple STMPEs on the same board @@ -168,7 +127,6 @@ struct stmpe_ts_platform_data { * @irq_over_gpio: true if gpio is used to get irq * @irq_gpio: gpio number over which irq will be requested (significant only if * irq_over_gpio is true) - * @ts: touchscreen-specific platform data */ struct stmpe_platform_data { int id; @@ -178,8 +136,6 @@ struct stmpe_platform_data { bool irq_over_gpio; int irq_gpio; int autosleep_timeout; - - struct stmpe_ts_platform_data *ts; }; #endif -- cgit v1.2.3 From 7d7efec368d537226142cbe559f45797f18672f9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 May 2015 16:35:16 -0400 
Subject: sched, cgroup: reorganize threadgroup locking threadgroup_change_begin/end() are used to mark the beginning and end of threadgroup modifying operations to allow code paths which require a threadgroup to stay stable across blocking operations to synchronize against those sections using threadgroup_lock/unlock(). It's currently implemented as a general mechanism in sched.h using per-signal_struct rwsem; however, this never grew non-cgroup use cases and becomes a noop if !CONFIG_CGROUPS. It turns out that cgroups is going to be better served with a different synchronization scheme and it is a bit silly to keep cgroups specific details as a general mechanism. What's general here is identifying the places where threadgroups are modified. This patch restructures threadgroup locking so that threadgroup_change_begin/end() become a place where subsystems which need to synchronize against threadgroup changes can hook into. cgroup_threadgroup_change_begin/end() which operate on the per-signal_struct rwsem are created and threadgroup_lock/unlock() are moved to cgroup.c and made static. This is pure reorganization which doesn't cause any functional changes.
Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Peter Zijlstra --- include/linux/cgroup-defs.h | 10 +++++++++ include/linux/sched.h | 53 +++++++++++++++------------------------------ 2 files changed, 27 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 55f3120fb952..1b8c93806dbd 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #ifdef CONFIG_CGROUPS @@ -460,5 +461,14 @@ struct cgroup_subsys { unsigned int depends_on; }; +void cgroup_threadgroup_change_begin(struct task_struct *tsk); +void cgroup_threadgroup_change_end(struct task_struct *tsk); + +#else /* CONFIG_CGROUPS */ + +static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {} +static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} + #endif /* CONFIG_CGROUPS */ + #endif /* _LINUX_CGROUP_DEFS_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 8222ae40ecb0..5ee290003470 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -58,6 +58,7 @@ struct sched_param { #include #include #include +#include #include @@ -2648,53 +2649,33 @@ static inline void unlock_task_sighand(struct task_struct *tsk, spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); } -#ifdef CONFIG_CGROUPS -static inline void threadgroup_change_begin(struct task_struct *tsk) -{ - down_read(&tsk->signal->group_rwsem); -} -static inline void threadgroup_change_end(struct task_struct *tsk) -{ - up_read(&tsk->signal->group_rwsem); -} - /** - * threadgroup_lock - lock threadgroup - * @tsk: member task of the threadgroup to lock - * - * Lock the threadgroup @tsk belongs to. No new task is allowed to enter - * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or - * change ->group_leader/pid. This is useful for cases where the threadgroup - * needs to stay stable across blockable operations. 
+ * threadgroup_change_begin - mark the beginning of changes to a threadgroup + * @tsk: task causing the changes * - * fork and exit paths explicitly call threadgroup_change_{begin|end}() for - * synchronization. While held, no new task will be added to threadgroup - * and no existing live task will have its PF_EXITING set. - * - * de_thread() does threadgroup_change_{begin|end}() when a non-leader - * sub-thread becomes a new leader. + * All operations which modify a threadgroup - a new thread joining the + * group, death of a member thread (the assertion of PF_EXITING) and + * exec(2) dethreading the process and replacing the leader - are wrapped + * by threadgroup_change_{begin|end}(). This is to provide a place which + * subsystems needing threadgroup stability can hook into for + * synchronization. */ -static inline void threadgroup_lock(struct task_struct *tsk) +static inline void threadgroup_change_begin(struct task_struct *tsk) { - down_write(&tsk->signal->group_rwsem); + might_sleep(); + cgroup_threadgroup_change_begin(tsk); } /** - * threadgroup_unlock - unlock threadgroup - * @tsk: member task of the threadgroup to unlock + * threadgroup_change_end - mark the end of changes to a threadgroup + * @tsk: task causing the changes * - * Reverse threadgroup_lock(). + * See threadgroup_change_begin(). 
*/ -static inline void threadgroup_unlock(struct task_struct *tsk) +static inline void threadgroup_change_end(struct task_struct *tsk) { - up_write(&tsk->signal->group_rwsem); + cgroup_threadgroup_change_end(tsk); } -#else -static inline void threadgroup_change_begin(struct task_struct *tsk) {} -static inline void threadgroup_change_end(struct task_struct *tsk) {} -static inline void threadgroup_lock(struct task_struct *tsk) {} -static inline void threadgroup_unlock(struct task_struct *tsk) {} -#endif #ifndef __HAVE_THREAD_FUNCTIONS -- cgit v1.2.3 From d59cfc09c32a2ae31f1c3bc2983a0cd79afb3f14 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 May 2015 16:35:17 -0400 Subject: sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem The cgroup side of threadgroup locking uses signal_struct->group_rwsem to synchronize against threadgroup changes. This per-process rwsem adds small overhead to thread creation, exit and exec paths, forces cgroup code paths to do lock-verify-unlock-retry dance in a couple places and makes it impossible to atomically perform operations across multiple processes. This patch replaces signal_struct->group_rwsem with a global percpu_rwsem cgroup_threadgroup_rwsem which is cheaper on the reader side and contained in cgroups proper. This patch converts one-to-one. This does make writer side heavier and lower the granularity; however, cgroup process migration is a fairly cold path, we do want to optimize thread operations over it and cgroup migration operations don't take enough time for the lower granularity to matter. 
Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Peter Zijlstra --- include/linux/cgroup-defs.h | 27 +++++++++++++++++++++++++-- include/linux/init_task.h | 8 -------- include/linux/sched.h | 12 ------------ 3 files changed, 25 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1b8c93806dbd..7d83d7f73420 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -461,8 +461,31 @@ struct cgroup_subsys { unsigned int depends_on; }; -void cgroup_threadgroup_change_begin(struct task_struct *tsk); -void cgroup_threadgroup_change_end(struct task_struct *tsk); +extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; + +/** + * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups + * @tsk: target task + * + * Called from threadgroup_change_begin() and allows cgroup operations to + * synchronize against threadgroup changes using a percpu_rw_semaphore. + */ +static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) +{ + percpu_down_read(&cgroup_threadgroup_rwsem); +} + +/** + * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups + * @tsk: target task + * + * Called from threadgroup_change_end(). Counterpart of + * cgroup_threadcgroup_change_begin(). 
+ */ +static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) +{ + percpu_up_read(&cgroup_threadgroup_rwsem); +} #else /* CONFIG_CGROUPS */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 696d22312b31..0cc0bbf20022 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -25,13 +25,6 @@ extern struct files_struct init_files; extern struct fs_struct init_fs; -#ifdef CONFIG_CGROUPS -#define INIT_GROUP_RWSEM(sig) \ - .group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem), -#else -#define INIT_GROUP_RWSEM(sig) -#endif - #ifdef CONFIG_CPUSETS #define INIT_CPUSET_SEQ(tsk) \ .mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq), @@ -56,7 +49,6 @@ extern struct fs_struct init_fs; }, \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ - INIT_GROUP_RWSEM(sig) \ } extern struct nsproxy init_nsproxy; diff --git a/include/linux/sched.h b/include/linux/sched.h index 5ee290003470..add524a910bd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -743,18 +743,6 @@ struct signal_struct { unsigned audit_tty_log_passwd; struct tty_audit_buf *tty_audit_buf; #endif -#ifdef CONFIG_CGROUPS - /* - * group_rwsem prevents new tasks from entering the threadgroup and - * member tasks from exiting,a more specifically, setting of - * PF_EXITING. fork and exit paths are protected with this rwsem - * using threadgroup_change_begin/end(). Users which require - * threadgroup to remain stable should use threadgroup_[un]lock() - * which also takes care of exec path. Currently, cgroup is the - * only user. 
- */ - struct rw_semaphore group_rwsem; -#endif oom_flags_t oom_flags; short oom_score_adj; /* OOM kill score adjustment */ -- cgit v1.2.3 From e463d88c36d42211aa72ed76d32fb8bf37820ef1 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 26 May 2015 12:19:58 -0700 Subject: net: phy: Add phy_interface_is_rgmii helper RGMII interfaces come in 4 different flavors that the PHY library needs to care about: regular RGMII (no delays), RGMII with either RX or TX delay, and both. In order to avoid errors of checking only for one type of RGMII interface and miss the 3 others, introduce a convenience function which tests for all values. Suggested-by: David S. Miller Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 701c7a3946e0..a26c3f84b8dd 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -677,6 +677,17 @@ static inline bool phy_is_internal(struct phy_device *phydev) return phydev->is_internal; } +/** + * phy_interface_is_rgmii - Convenience function for testing if a PHY interface + * is RGMII (all variants) + * @phydev: the phy_device struct + */ +static inline bool phy_interface_is_rgmii(struct phy_device *phydev) +{ + return phydev->interface >= PHY_INTERFACE_MODE_RGMII && + phydev->interface <= PHY_INTERFACE_MODE_RGMII_TXID; +} + /** * phy_write_mmd - Convenience function for writing a register * on an MMD on a given PHY. -- cgit v1.2.3 From b3df4ec4424f27e55d754cfe586195fecca1c4e4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 May 2015 00:00:51 +0000 Subject: perf/x86/intel/cqm: Use proper data types 'int' is really not a proper data type for an MSR. Use u32 to make it clear that we are dealing with a 32-bit unsigned hardware value. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Matt Fleming Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Vikas Shivappa Cc: Will Auld Link: http://lkml.kernel.org/r/20150518235149.919350144@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 248f7829ce41..06580028cee6 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -120,7 +120,7 @@ struct hw_perf_event { }; struct { /* intel_cqm */ int cqm_state; - int cqm_rmid; + u32 cqm_rmid; struct list_head cqm_events_entry; struct list_head cqm_groups_entry; struct list_head cqm_group_entry; -- cgit v1.2.3 From 16b369a91d0dd80be214b7f7801fbc51875454cc Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Mon, 25 May 2015 15:08:47 +0200 Subject: random: Blocking API for accessing nonblocking_pool The added API calls provide a synchronous function call get_blocking_random_bytes where the caller is blocked until the nonblocking_pool is initialized. 
CC: Andreas Steffen CC: Theodore Ts'o CC: Sandy Harris Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- include/linux/random.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index b05856e16b75..796267d56901 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -14,6 +14,7 @@ extern void add_input_randomness(unsigned int type, unsigned int code, extern void add_interrupt_randomness(int irq, int irq_flags); extern void get_random_bytes(void *buf, int nbytes); +extern void get_blocking_random_bytes(void *buf, int nbytes); extern void get_random_bytes_arch(void *buf, int nbytes); void generate_random_uuid(unsigned char uuid_out[16]); extern int random_int_secret_init(void); -- cgit v1.2.3 From 7d010fdf299929f9583ce5e17da629dcd83c36ef Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 26 May 2015 10:28:13 +0200 Subject: x86/mm/mtrr: Avoid #ifdeffery with phys_wc_to_mtrr_index() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is only one user but since we're going to bury MTRR next out of access to drivers, expose this last piece of API to drivers in a general fashion only needing io.h for access to helpers. Signed-off-by: Luis R. Rodriguez Signed-off-by: Borislav Petkov Cc: Abhilash Kesavan Cc: Andrew Morton Cc: Andy Lutomirski Cc: Antonino Daplas Cc: Borislav Petkov Cc: Brian Gerst Cc: Catalin Marinas Cc: Cristian Stoica Cc: Daniel Vetter Cc: Dave Airlie Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: Greg Kroah-Hartman Cc: H. 
Peter Anvin Cc: Jean-Christophe Plagniol-Villard Cc: Juergen Gross Cc: Linus Torvalds Cc: Matthias Brugger Cc: Mel Gorman Cc: Peter Zijlstra Cc: Suresh Siddha Cc: Thierry Reding Cc: Thomas Gleixner Cc: Tomi Valkeinen Cc: Toshi Kani Cc: Ville Syrjälä Cc: Vlastimil Babka Cc: Will Deacon Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1429722736-4473-1-git-send-email-mcgrof@do-not-panic.com Link: http://lkml.kernel.org/r/1432628901-18044-11-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- include/linux/io.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io.h b/include/linux/io.h index 986f2bffea1e..04cce4da3685 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -111,6 +111,13 @@ static inline void arch_phys_wc_del(int handle) } #define arch_phys_wc_add arch_phys_wc_add +#ifndef arch_phys_wc_index +static inline int arch_phys_wc_index(int handle) +{ + return -1; +} +#define arch_phys_wc_index arch_phys_wc_index +#endif #endif #endif /* _LINUX_IO_H */ -- cgit v1.2.3 From 06931e62246844c73fba24d7aeb4a5dc897a2739 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 May 2015 15:11:28 +0200 Subject: sched/topology: Rename topology_thread_cpumask() to topology_sibling_cpumask() Rename topology_thread_cpumask() to topology_sibling_cpumask() for more consistency with scheduler code. Signed-off-by: Bartosz Golaszewski Reviewed-by: Thomas Gleixner Acked-by: Russell King Acked-by: Catalin Marinas Cc: Benoit Cousson Cc: Fenghua Yu Cc: Guenter Roeck Cc: Jean Delvare Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Oleg Drokin Cc: Peter Zijlstra Cc: Rafael J. 
Wysocki Cc: Russell King Cc: Viresh Kumar Link: http://lkml.kernel.org/r/1432645896-12588-2-git-send-email-bgolaszewski@baylibre.com Signed-off-by: Ingo Molnar --- include/linux/topology.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/topology.h b/include/linux/topology.h index 909b6e43b694..73ddad1e0fa3 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -191,8 +191,8 @@ static inline int cpu_to_mem(int cpu) #ifndef topology_core_id #define topology_core_id(cpu) ((void)(cpu), 0) #endif -#ifndef topology_thread_cpumask -#define topology_thread_cpumask(cpu) cpumask_of(cpu) +#ifndef topology_sibling_cpumask +#define topology_sibling_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_core_cpumask #define topology_core_cpumask(cpu) cpumask_of(cpu) @@ -201,7 +201,7 @@ static inline int cpu_to_mem(int cpu) #ifdef CONFIG_SCHED_SMT static inline const struct cpumask *cpu_smt_mask(int cpu) { - return topology_thread_cpumask(cpu); + return topology_sibling_cpumask(cpu); } #endif -- cgit v1.2.3 From 66eb579e66ecfea55e2007be0594869ea9e453d4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 13 May 2015 17:12:23 +0100 Subject: perf: allow for PMU-specific event filtering In certain circumstances it may not be possible to schedule particular events due to constraints other than a lack of hardware counters (e.g. on big.LITTLE systems where CPUs support different events). The core perf event code does not distinguish these cases and pessimistically assumes that any failure to schedule an event means that it is not worth attempting to schedule later events, even if some hardware counters are still unused. When an event a pmu cannot schedule exists in a flexible group list it can unnecessarily prevent event groups following it in the list from being scheduled (until it is rotated to the end of the list). 
This means some events are scheduled for only a portion of the time they could be, and for short running programs no events may be scheduled if the list is initially sorted in an unfortunate order. This patch adds a new (optional) filter_match function pointer to struct pmu which a pmu driver can use to tell perf core when an event matches pmu-specific scheduling requirements. This plugs into the existing event_filter_match logic, and makes it possible to avoid the scheduling problem described above. When no filter is provided by the PMU, the existing behaviour is retained. Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Acked-by: Will Deacon Acked-by: Peter Zijlstra Signed-off-by: Mark Rutland Signed-off-by: Will Deacon --- include/linux/perf_event.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 61992cf2e977..67c719cc91aa 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -304,6 +304,11 @@ struct pmu { * Free pmu-private AUX data structures */ void (*free_aux) (void *aux); /* optional */ + + /* + * Filter events for PMU-specific reasons. + */ + int (*filter_match) (struct perf_event *event); /* optional */ }; /** -- cgit v1.2.3 From 24fe86a617c550fb9bdc6c8bd7cf647d3955f8ba Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 29 Mar 2015 12:50:46 +0200 Subject: phy: sun4i-usb: Add a sunxi specific function for setting squelch-detect The sunxi otg phy has a bug where it wrongly detects a high speed squelch when reset on the root port gets de-asserted with a lo-speed device. The workaround for this is to disable squelch detect before de-asserting reset, and re-enabling it after the reset de-assert is done. Add a sunxi specific phy function to allow the sunxi-musb glue to do this. 
Acked-by: Kishon Vijay Abraham I Signed-off-by: Hans de Goede Signed-off-by: Felipe Balbi --- include/linux/phy/phy-sun4i-usb.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 include/linux/phy/phy-sun4i-usb.h (limited to 'include/linux') diff --git a/include/linux/phy/phy-sun4i-usb.h b/include/linux/phy/phy-sun4i-usb.h new file mode 100644 index 000000000000..50aed92ea89c --- /dev/null +++ b/include/linux/phy/phy-sun4i-usb.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2015 Hans de Goede + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef PHY_SUN4I_USB_H_ +#define PHY_SUN4I_USB_H_ + +#include "phy.h" + +/** + * sun4i_usb_phy_set_squelch_detect() - Enable/disable squelch detect + * @phy: reference to a sun4i usb phy + * @enabled: wether to enable or disable squelch detect + */ +void sun4i_usb_phy_set_squelch_detect(struct phy *phy, bool enabled); + +#endif -- cgit v1.2.3 From e5c4708b2b6fec0300db60ee3cf4b4ae96430a12 Mon Sep 17 00:00:00 2001 From: Sunil Goutham Date: Tue, 26 May 2015 19:20:14 -0700 Subject: pci: Add Cavium PCI vendor id This vendor id will be used by network (vNIC), USB (xHCI), SATA (AHCI), GPIO, I2C, MMC and maybe other drivers for ThunderX SoC. Acked-by: Bjorn Helgaas Signed-off-by: Sunil Goutham Signed-off-by: Aleksey Makarov Signed-off-by: David S. 
Miller --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 2f7b9a40f627..2972c7f3aa1d 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2329,6 +2329,8 @@ #define PCI_DEVICE_ID_ALTIMA_AC9100 0x03ea #define PCI_DEVICE_ID_ALTIMA_AC1003 0x03eb +#define PCI_VENDOR_ID_CAVIUM 0x177d + #define PCI_VENDOR_ID_BELKIN 0x1799 #define PCI_DEVICE_ID_BELKIN_F5D7010V7 0x701f -- cgit v1.2.3 From 4612c715a6ea6b3af2aee0163c0721375b2548d7 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 18 May 2015 12:58:40 +0200 Subject: mtd: cfi: deinline large functions With this .config: http://busybox.net/~vda/kernel_config, after uninlining these functions have sizes and callsite counts as follows: cfi_udelay(): 74 bytes, 26 callsites cfi_send_gen_cmd(): 153 bytes, 95 callsites cfi_build_cmd(): 274 bytes, 123 callsites cfi_build_cmd_addr(): 49 bytes, 15 callsites cfi_merge_status(): 230 bytes, 3 callsites Reduction in code size is about 50,000: text data bss dec hex filename 85842882 22294584 20627456 128764922 7accbfa vmlinux.before 85789648 22294616 20627456 128711720 7abfc28 vmlinux Signed-off-by: Denys Vlasenko CC: Dan Carpenter CC: Jingoo Han CC: Brian Norris CC: Aaron Sierra CC: Artem Bityutskiy CC: David Woodhouse CC: linux-mtd@lists.infradead.org CC: linux-kernel@vger.kernel.org Signed-off-by: Brian Norris --- include/linux/mtd/cfi.h | 188 +++--------------------------------------------- 1 file changed, 8 insertions(+), 180 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h index 299d7d31fe53..9b57a9b1b081 100644 --- a/include/linux/mtd/cfi.h +++ b/include/linux/mtd/cfi.h @@ -296,183 +296,19 @@ struct cfi_private { struct flchip chips[0]; /* per-chip data structure for each chip */ }; -/* - * Returns the command address according to the given geometry. 
- */ -static inline uint32_t cfi_build_cmd_addr(uint32_t cmd_ofs, - struct map_info *map, struct cfi_private *cfi) -{ - unsigned bankwidth = map_bankwidth(map); - unsigned interleave = cfi_interleave(cfi); - unsigned type = cfi->device_type; - uint32_t addr; - - addr = (cmd_ofs * type) * interleave; - - /* Modify the unlock address if we are in compatibility mode. - * For 16bit devices on 8 bit busses - * and 32bit devices on 16 bit busses - * set the low bit of the alternating bit sequence of the address. - */ - if (((type * interleave) > bankwidth) && ((cmd_ofs & 0xff) == 0xaa)) - addr |= (type >> 1)*interleave; - - return addr; -} - -/* - * Transforms the CFI command for the given geometry (bus width & interleave). - * It looks too long to be inline, but in the common case it should almost all - * get optimised away. - */ -static inline map_word cfi_build_cmd(u_long cmd, struct map_info *map, struct cfi_private *cfi) -{ - map_word val = { {0} }; - int wordwidth, words_per_bus, chip_mode, chips_per_word; - unsigned long onecmd; - int i; - - /* We do it this way to give the compiler a fighting chance - of optimising away all the crap for 'bankwidth' larger than - an unsigned long, in the common case where that support is - disabled */ - if (map_bankwidth_is_large(map)) { - wordwidth = sizeof(unsigned long); - words_per_bus = (map_bankwidth(map)) / wordwidth; // i.e. normally 1 - } else { - wordwidth = map_bankwidth(map); - words_per_bus = 1; - } - - chip_mode = map_bankwidth(map) / cfi_interleave(cfi); - chips_per_word = wordwidth * cfi_interleave(cfi) / map_bankwidth(map); - - /* First, determine what the bit-pattern should be for a single - device, according to chip mode and endianness... 
*/ - switch (chip_mode) { - default: BUG(); - case 1: - onecmd = cmd; - break; - case 2: - onecmd = cpu_to_cfi16(map, cmd); - break; - case 4: - onecmd = cpu_to_cfi32(map, cmd); - break; - } - - /* Now replicate it across the size of an unsigned long, or - just to the bus width as appropriate */ - switch (chips_per_word) { - default: BUG(); -#if BITS_PER_LONG >= 64 - case 8: - onecmd |= (onecmd << (chip_mode * 32)); -#endif - case 4: - onecmd |= (onecmd << (chip_mode * 16)); - case 2: - onecmd |= (onecmd << (chip_mode * 8)); - case 1: - ; - } +uint32_t cfi_build_cmd_addr(uint32_t cmd_ofs, + struct map_info *map, struct cfi_private *cfi); - /* And finally, for the multi-word case, replicate it - in all words in the structure */ - for (i=0; i < words_per_bus; i++) { - val.x[i] = onecmd; - } - - return val; -} +map_word cfi_build_cmd(u_long cmd, struct map_info *map, struct cfi_private *cfi); #define CMD(x) cfi_build_cmd((x), map, cfi) - -static inline unsigned long cfi_merge_status(map_word val, struct map_info *map, - struct cfi_private *cfi) -{ - int wordwidth, words_per_bus, chip_mode, chips_per_word; - unsigned long onestat, res = 0; - int i; - - /* We do it this way to give the compiler a fighting chance - of optimising away all the crap for 'bankwidth' larger than - an unsigned long, in the common case where that support is - disabled */ - if (map_bankwidth_is_large(map)) { - wordwidth = sizeof(unsigned long); - words_per_bus = (map_bankwidth(map)) / wordwidth; // i.e. 
normally 1 - } else { - wordwidth = map_bankwidth(map); - words_per_bus = 1; - } - - chip_mode = map_bankwidth(map) / cfi_interleave(cfi); - chips_per_word = wordwidth * cfi_interleave(cfi) / map_bankwidth(map); - - onestat = val.x[0]; - /* Or all status words together */ - for (i=1; i < words_per_bus; i++) { - onestat |= val.x[i]; - } - - res = onestat; - switch(chips_per_word) { - default: BUG(); -#if BITS_PER_LONG >= 64 - case 8: - res |= (onestat >> (chip_mode * 32)); -#endif - case 4: - res |= (onestat >> (chip_mode * 16)); - case 2: - res |= (onestat >> (chip_mode * 8)); - case 1: - ; - } - - /* Last, determine what the bit-pattern should be for a single - device, according to chip mode and endianness... */ - switch (chip_mode) { - case 1: - break; - case 2: - res = cfi16_to_cpu(map, res); - break; - case 4: - res = cfi32_to_cpu(map, res); - break; - default: BUG(); - } - return res; -} - +unsigned long cfi_merge_status(map_word val, struct map_info *map, + struct cfi_private *cfi); #define MERGESTATUS(x) cfi_merge_status((x), map, cfi) - -/* - * Sends a CFI command to a bank of flash for the given geometry. - * - * Returns the offset in flash where the command was written. - * If prev_val is non-null, it will be set to the value at the command address, - * before the command was written. 
- */ -static inline uint32_t cfi_send_gen_cmd(u_char cmd, uint32_t cmd_addr, uint32_t base, +uint32_t cfi_send_gen_cmd(u_char cmd, uint32_t cmd_addr, uint32_t base, struct map_info *map, struct cfi_private *cfi, - int type, map_word *prev_val) -{ - map_word val; - uint32_t addr = base + cfi_build_cmd_addr(cmd_addr, map, cfi); - val = cfi_build_cmd(cmd, map, cfi); - - if (prev_val) - *prev_val = map_read(map, addr); - - map_write(map, val, addr); - - return addr - base; -} + int type, map_word *prev_val); static inline uint8_t cfi_read_query(struct map_info *map, uint32_t addr) { @@ -506,15 +342,7 @@ static inline uint16_t cfi_read_query16(struct map_info *map, uint32_t addr) } } -static inline void cfi_udelay(int us) -{ - if (us >= 1000) { - msleep((us+999)/1000); - } else { - udelay(us); - cond_resched(); - } -} +void cfi_udelay(int us); int __xipram cfi_qry_present(struct map_info *map, __u32 base, struct cfi_private *cfi); -- cgit v1.2.3 From 7d0ae8086b828311250c6afdf800b568ac9bd693 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Mar 2015 14:57:58 -0800 Subject: rcu: Convert ACCESS_ONCE() to READ_ONCE() and WRITE_ONCE() This commit moves from the old ACCESS_ONCE() API to the new READ_ONCE() and WRITE_ONCE() APIs. Signed-off-by: Paul E. McKenney [ paulmck: Updated to include kernel/torture.c as suggested by Jason Low. 
] --- include/linux/rculist.h | 6 +++--- include/linux/rcupdate.h | 16 ++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index a18b16f1dc0e..665397247e82 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -29,8 +29,8 @@ */ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) { - ACCESS_ONCE(list->next) = list; - ACCESS_ONCE(list->prev) = list; + WRITE_ONCE(list->next, list); + WRITE_ONCE(list->prev, list); } /* @@ -288,7 +288,7 @@ static inline void list_splice_init_rcu(struct list_head *list, #define list_first_or_null_rcu(ptr, type, member) \ ({ \ struct list_head *__ptr = (ptr); \ - struct list_head *__next = ACCESS_ONCE(__ptr->next); \ + struct list_head *__next = READ_ONCE(__ptr->next); \ likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \ }) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 573a5afd5ed8..87bb0eee665b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -364,8 +364,8 @@ extern struct srcu_struct tasks_rcu_exit_srcu; #define rcu_note_voluntary_context_switch(t) \ do { \ rcu_all_qs(); \ - if (ACCESS_ONCE((t)->rcu_tasks_holdout)) \ - ACCESS_ONCE((t)->rcu_tasks_holdout) = false; \ + if (READ_ONCE((t)->rcu_tasks_holdout)) \ + WRITE_ONCE((t)->rcu_tasks_holdout, false); \ } while (0) #else /* #ifdef CONFIG_TASKS_RCU */ #define TASKS_RCU(x) do { } while (0) @@ -609,7 +609,7 @@ static inline void rcu_preempt_sleep_check(void) #define __rcu_access_pointer(p, space) \ ({ \ - typeof(*p) *_________p1 = (typeof(*p) *__force)ACCESS_ONCE(p); \ + typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \ rcu_dereference_sparse(p, space); \ ((typeof(*p) __force __kernel *)(_________p1)); \ }) @@ -630,7 +630,7 @@ static inline void rcu_preempt_sleep_check(void) #define __rcu_access_index(p, space) \ ({ \ - typeof(p) _________p1 = ACCESS_ONCE(p); \ + typeof(p) 
_________p1 = READ_ONCE(p); \ rcu_dereference_sparse(p, space); \ (_________p1); \ }) @@ -659,7 +659,7 @@ static inline void rcu_preempt_sleep_check(void) */ #define lockless_dereference(p) \ ({ \ - typeof(p) _________p1 = ACCESS_ONCE(p); \ + typeof(p) _________p1 = READ_ONCE(p); \ smp_read_barrier_depends(); /* Dependency order vs. p above. */ \ (_________p1); \ }) @@ -702,7 +702,7 @@ static inline void rcu_preempt_sleep_check(void) * @p: The pointer to read * * Return the value of the specified RCU-protected pointer, but omit the - * smp_read_barrier_depends() and keep the ACCESS_ONCE(). This is useful + * smp_read_barrier_depends() and keep the READ_ONCE(). This is useful * when the value of this pointer is accessed, but the pointer is not * dereferenced, for example, when testing an RCU-protected pointer against * NULL. Although rcu_access_pointer() may also be used in cases where @@ -791,7 +791,7 @@ static inline void rcu_preempt_sleep_check(void) * @p: The index to read * * Return the value of the specified RCU-protected index, but omit the - * smp_read_barrier_depends() and keep the ACCESS_ONCE(). This is useful + * smp_read_barrier_depends() and keep the READ_ONCE(). This is useful * when the value of this index is accessed, but the index is not * dereferenced, for example, when testing an RCU-protected index against * -1. Although rcu_access_index() may also be used in cases where @@ -827,7 +827,7 @@ static inline void rcu_preempt_sleep_check(void) * @c: The conditions under which the dereference will take place * * Return the value of the specified RCU-protected pointer, but omit - * both the smp_read_barrier_depends() and the ACCESS_ONCE(). This + * both the smp_read_barrier_depends() and the READ_ONCE(). This * is useful in cases where update-side locks prevent the value of the * pointer from changing. 
Please note that this primitive does -not- * prevent the compiler from repeating this reference or combining it -- cgit v1.2.3 From 1ebee8017d84ec8a0ba893cf7b8be3f70ead088b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 19 Apr 2015 18:21:47 -0700 Subject: rcu: Eliminate array-index-based RCU primitives Now that rcu_access_index() and rcu_dereference_index_check() are no longer used, the commit removes them from the RCU API. This means that RCU's data dependencies now involve only pointers, give or take the occasional cast to and then back from an integer type to do pointer arithmetic. This in turn eliminates the need for a number of operations on values carrying RCU data dependencies. Signed-off-by: Paul E. McKenney Cc: linux-edac@vger.kernel.org Cc: Tony Luck Acked-by: Borislav Petkov --- include/linux/rcupdate.h | 50 ------------------------------------------------ 1 file changed, 50 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 87bb0eee665b..b97842ff71d2 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -628,21 +628,6 @@ static inline void rcu_preempt_sleep_check(void) ((typeof(*p) __force __kernel *)(p)); \ }) -#define __rcu_access_index(p, space) \ -({ \ - typeof(p) _________p1 = READ_ONCE(p); \ - rcu_dereference_sparse(p, space); \ - (_________p1); \ -}) -#define __rcu_dereference_index_check(p, c) \ -({ \ - /* Dependency order vs. p above. */ \ - typeof(p) _________p1 = lockless_dereference(p); \ - rcu_lockdep_assert(c, \ - "suspicious rcu_dereference_index_check() usage"); \ - (_________p1); \ -}) - /** * RCU_INITIALIZER() - statically initialize an RCU-protected global variable * @v: The value to statically initialize with. 
@@ -786,41 +771,6 @@ static inline void rcu_preempt_sleep_check(void) */ #define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu) -/** - * rcu_access_index() - fetch RCU index with no dereferencing - * @p: The index to read - * - * Return the value of the specified RCU-protected index, but omit the - * smp_read_barrier_depends() and keep the READ_ONCE(). This is useful - * when the value of this index is accessed, but the index is not - * dereferenced, for example, when testing an RCU-protected index against - * -1. Although rcu_access_index() may also be used in cases where - * update-side locks prevent the value of the index from changing, you - * should instead use rcu_dereference_index_protected() for this use case. - */ -#define rcu_access_index(p) __rcu_access_index((p), __rcu) - -/** - * rcu_dereference_index_check() - rcu_dereference for indices with debug checking - * @p: The pointer to read, prior to dereferencing - * @c: The conditions under which the dereference will take place - * - * Similar to rcu_dereference_check(), but omits the sparse checking. - * This allows rcu_dereference_index_check() to be used on integers, - * which can then be used as array indices. Attempting to use - * rcu_dereference_check() on an integer will give compiler warnings - * because the sparse address-space mechanism relies on dereferencing - * the RCU-protected pointer. Dereferencing integers is not something - * that even gcc will put up with. - * - * Note that this function does not implicitly check for RCU read-side - * critical sections. If this function gains lots of uses, it might - * make sense to provide versions for each flavor of RCU, but it does - * not make sense as of early 2010. 
- */ -#define rcu_dereference_index_check(p, c) \ - __rcu_dereference_index_check((p), (c)) - /** * rcu_dereference_protected() - fetch RCU pointer when updates prevented * @p: The pointer to read, prior to dereferencing -- cgit v1.2.3 From d956028e99b30726b0bce0ca684b40b1ad67b514 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 31 Mar 2015 09:39:41 +0100 Subject: documentation: memory-barriers: Fix smp_mb__before_spinlock() semantics Our current documentation claims that, when followed by an ACQUIRE, smp_mb__before_spinlock() orders prior loads against subsequent loads and stores, which isn't the intent. This commit therefore fixes the documentation to state that this sequence orders only prior stores against subsequent loads and stores. In addition, the original intent of smp_mb__before_spinlock() was to only order prior loads against subsequent stores, however, people have started using it as if it ordered prior loads against subsequent loads and stores. This commit therefore also updates smp_mb__before_spinlock()'s header comment to reflect this new reality. Cc: Oleg Nesterov Cc: "Paul E. McKenney" Cc: Peter Zijlstra Signed-off-by: Will Deacon Signed-off-by: Paul E. McKenney --- include/linux/spinlock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 3e18379dfa6f..0063b24b4f36 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -120,7 +120,7 @@ do { \ /* * Despite its name it doesn't necessarily has to be a full barrier. * It should only guarantee that a STORE before the critical section - * can not be reordered with a LOAD inside this section. + * can not be reordered with LOADs and STOREs inside this section. * spin_lock() is the one-way barrier, this LOAD can not escape out * of the region. 
So the default implementation simply ensures that * a STORE can not move into the critical section, smp_wmb() should -- cgit v1.2.3 From 3382adbc1bb8c80ea512243acf6059564287620b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Mar 2015 15:41:24 -0800 Subject: rcu: Eliminate a few CONFIG_RCU_NOCB_CPU_ALL #ifdefs This commit converts several CONFIG_RCU_NOCB_CPU_ALL #ifdefs to instead use IS_ENABLED(). This change should help avoid hiding code from compiler diagnostics. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 4 ++-- include/linux/rcutree.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 87bb0eee665b..5ec20bc4af76 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1153,13 +1153,13 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define kfree_rcu(ptr, rcu_head) \ __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) -#if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) +#ifdef CONFIG_TINY_RCU static inline int rcu_needs_cpu(unsigned long *delta_jiffies) { *delta_jiffies = ULONG_MAX; return 0; } -#endif /* #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) */ +#endif /* #ifdef CONFIG_TINY_RCU */ #if defined(CONFIG_RCU_NOCB_CPU_ALL) static inline bool rcu_is_nocb_cpu(int cpu) { return true; } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index d2e583a6aaca..0bd400b02430 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -31,9 +31,7 @@ #define __LINUX_RCUTREE_H void rcu_note_context_switch(void); -#ifndef CONFIG_RCU_NOCB_CPU_ALL int rcu_needs_cpu(unsigned long *delta_jiffies); -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ void rcu_cpu_stall_reset(void); /* -- cgit v1.2.3 From 5af4692a75daf08dddc93dbb4cd2a1b3d3b617af Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sat, 25 Apr 2015 12:48:29 -0700 Subject: smp: Make control dependencies work on Alpha, improve documentation The current formulation of control dependencies fails on DEC Alpha, which does not respect dependencies of any kind unless an explicit memory barrier is provided. This means that the current formulation of control dependencies fails on Alpha. This commit therefore creates a READ_ONCE_CTRL() that has the same overhead on non-Alpha systems, but causes Alpha to produce the needed ordering. This commit also applies READ_ONCE_CTRL() to the one known use of control dependencies. Use of READ_ONCE_CTRL() also has the beneficial effect of adding a bit of self-documentation to control dependencies. Signed-off-by: Paul E. McKenney Acked-by: Peter Zijlstra (Intel) --- include/linux/compiler.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 867722591be2..5d66777914db 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -252,6 +252,22 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s #define WRITE_ONCE(x, val) \ ({ typeof(x) __val = (val); __write_once_size(&(x), &__val, sizeof(__val)); __val; }) +/** + * READ_ONCE_CTRL - Read a value heading a control dependency + * @x: The value to be read, heading the control dependency + * + * Control dependencies are tricky. See Documentation/memory-barriers.txt + * for important information on how to use them. Note that in many cases, + * use of smp_load_acquire() will be much simpler. Control dependencies + * should be avoided except on the hottest of hotpaths. + */ +#define READ_ONCE_CTRL(x) \ +({ \ + typeof(x) __val = READ_ONCE(x); \ + smp_read_barrier_depends(); /* Enforce control dependency.
*/ \ + __val; \ +}) + #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ -- cgit v1.2.3 From f517700cce37ffcb36e7afae0294fd11c72ed134 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Thu, 26 Mar 2015 13:27:08 +0800 Subject: rculist: Fix another sparse warning This fixes the following sparse warnings: make C=1 CF=-D__CHECK_ENDIAN__ net/tipc/name_table.o net/tipc/name_table.c:977:17: error: incompatible types in comparison expression (different address spaces) net/tipc/name_table.c:977:17: error: incompatible types in comparison expression (different address spaces) To silence these sparse complaints, an RCU annotation should be added to "next" pointer of hlist_node structure through hlist_next_rcu() macro when iterating over a hlist with hlist_for_each_entry_from_rcu(). Signed-off-by: Ying Xue Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 665397247e82..17c6b1f84a77 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -549,8 +549,8 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, */ #define hlist_for_each_entry_from_rcu(pos, member) \ for (; pos; \ - pos = hlist_entry_safe(rcu_dereference((pos)->member.next),\ - typeof(*(pos)), member)) + pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ + &(pos)->member)), typeof(*(pos)), member)) #endif /* __KERNEL__ */ #endif -- cgit v1.2.3 From 51952bc633064311410b041fad38da1614f4539e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 Apr 2015 11:15:30 -0700 Subject: rcu: Further shrink Tiny RCU by making empty functions static inlines The Tiny RCU counterparts to rcu_idle_enter(), rcu_idle_exit(), rcu_irq_enter(), and rcu_irq_exit() are empty functions, but each has EXPORT_SYMBOL_GPL(), which needlessly consumes extra memory, especially in kernels built with module support.
This commit therefore moves these functions to static inlines in rcutiny.h, removing the need for exports. This won't affect the size of the tiniest kernels, which are likely built without module support, but might help semi-tiny kernels that might include module support. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 4 ---- include/linux/rcutiny.h | 16 ++++++++++++++++ include/linux/rcutree.h | 5 +++++ 3 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 87bb0eee665b..1b3d7bcb3a6c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -292,10 +292,6 @@ void rcu_sched_qs(void); void rcu_bh_qs(void); void rcu_check_callbacks(int user); struct notifier_block; -void rcu_idle_enter(void); -void rcu_idle_exit(void); -void rcu_irq_enter(void); -void rcu_irq_exit(void); int rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 937edaeb150d..3df6c1ec4e25 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -159,6 +159,22 @@ static inline void rcu_cpu_stall_reset(void) { } +static inline void rcu_idle_enter(void) +{ +} + +static inline void rcu_idle_exit(void) +{ +} + +static inline void rcu_irq_enter(void) +{ +} + +static inline void rcu_irq_exit(void) +{ +} + static inline void exit_rcu(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index d2e583a6aaca..f22d83f49e56 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -93,6 +93,11 @@ void rcu_force_quiescent_state(void); void rcu_bh_force_quiescent_state(void); void rcu_sched_force_quiescent_state(void); +void rcu_idle_enter(void); +void rcu_idle_exit(void); +void rcu_irq_enter(void); +void rcu_irq_exit(void); + void exit_rcu(void); void rcu_scheduler_starting(void); -- cgit v1.2.3 From 
ad5fb870c486d932a1749d7853dd70f436a7e03f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 3 Apr 2015 12:05:28 -0400 Subject: e820, efi: add ACPI 6.0 persistent memory types ACPI 6.0 formalizes e820-type-7 and efi-type-14 as persistent memory. Mark it "reserved" and allow it to be claimed by a persistent memory device driver. This definition is in addition to the Linux kernel's existing type-12 definition that was recently added in support of shipping platforms with NVDIMM support that predate ACPI 6.0 (which now classifies type-12 as OEM reserved). Note, /proc/iomem can be consulted for differentiating legacy "Persistent Memory (legacy)" E820_PRAM vs standard "Persistent Memory" E820_PMEM. Cc: Boaz Harrosh Cc: Ingo Molnar Cc: Christoph Hellwig Cc: Andrew Morton Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jens Axboe Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Thomas Gleixner Acked-by: Jeff Moyer Acked-by: Andy Lutomirski Reviewed-by: Ross Zwisler Acked-by: Christoph Hellwig Tested-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/efi.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index af5be0368dec..825b6e3d69cb 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -85,7 +85,8 @@ typedef struct { #define EFI_MEMORY_MAPPED_IO 11 #define EFI_MEMORY_MAPPED_IO_PORT_SPACE 12 #define EFI_PAL_CODE 13 -#define EFI_MAX_MEMORY_TYPE 14 +#define EFI_PERSISTENT_MEMORY 14 +#define EFI_MAX_MEMORY_TYPE 15 /* Attribute values: */ #define EFI_MEMORY_UC ((u64)0x0000000000000001ULL) /* uncached */ -- cgit v1.2.3 From f36f3f2846b5578d62910ee0b6dbef59fdd1cfa4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 18 May 2015 13:20:23 +0200 Subject: KVM: add "new" argument to kvm_arch_commit_memory_region This lets the function access the new memory slot without going through kvm_memslots and id_to_memslot. 
It will simplify the code when more than one address space will be supported. Unfortunately, the "const"ness of the new argument must be casted away in two places. Fixing KVM to accept const struct kvm_memory_slot pointers would require modifications in pretty much all architectures, and is left for later. Reviewed-by: Radim Krcmar Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8815f1dffb77..9bd3bc16be87 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -516,6 +516,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, enum kvm_mr_change change); bool kvm_largepages_enabled(void); void kvm_disable_largepages(void); -- cgit v1.2.3 From d9ef13c2b3983de8dd1373ef670799dbb6498122 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 19 May 2015 16:01:50 +0200 Subject: KVM: pass kvm_memory_slot to gfn_to_page_many_atomic The memory slot is already available from gfn_to_memslot_dirty_bitmap. Isn't it a shame to look it up again? Plus, it makes gfn_to_page_many_atomic agnostic of multiple VCPU address spaces. 
Reviewed-by: Radim Krcmar Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9bd3bc16be87..a8bcbc9c6078 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -526,8 +526,8 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm); void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); -int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, - int nr_pages); +int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, + struct page **pages, int nr_pages); struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); -- cgit v1.2.3 From bfa1ce5f38938cc9e6c7f2d1011f88eba2b9e2b2 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Thu, 28 May 2015 11:40:54 +0200 Subject: bus: mvebu-mbus: add mv_mbus_dram_info_nooverlap() This commit introduces a variant of the mv_mbus_dram_info() function called mv_mbus_dram_info_nooverlap(). Both functions are used by Marvell drivers supporting devices doing DMA, and provide them a description the DRAM ranges that they need to configure their DRAM windows. The ranges provided by the mv_mbus_dram_info() function may overlap with the I/O windows if there is a lot (>= 4 GB) of RAM installed. This is not a problem for most of the DMA masters, except for the upcoming new CESA crypto driver because it does DMA to the SRAM, which is mapped through an I/O window. For this unit, we need to have DRAM ranges that do not overlap with the I/O windows. A first implementation done in commit 1737cac69369 ("bus: mvebu-mbus: make sure SDRAM CS for DMA don't overlap the MBus bridge window"), changed the information returned by mv_mbus_dram_info() to match this requirement. 
However, it broke the requirement of the other DMA masters that the DRAM ranges should have power of two sizes. To solve this situation, this commit introduces a new mv_mbus_dram_info_nooverlap() function, which returns the same information as mv_mbus_dram_info(), but guaranteed to not overlap with the I/O windows. In the end, it gives us two variants of the mv_mbus_dram_info*() functions: - The normal one, mv_mbus_dram_info(), which has been around for many years. This function returns the raw DRAM ranges, which are guaranteed to use power of two sizes, but will overlap with I/O windows. This function will therefore be used by all DMA masters (SATA, XOR, Ethernet, etc.) except the CESA crypto driver. - The new 'nooverlap' variant, mv_mbus_dram_info_nooverlap(). This function returns DRAM ranges after they have been "tweaked" to make sure they don't overlap with I/O windows. By doing this tweaking, we remove the power of two size guarantee. This variant will be used by the new CESA crypto driver.
Signed-off-by: Thomas Petazzoni Signed-off-by: Gregory CLEMENT --- include/linux/mbus.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mbus.h b/include/linux/mbus.h index 611b69fa8594..1f7bc630d225 100644 --- a/include/linux/mbus.h +++ b/include/linux/mbus.h @@ -54,11 +54,16 @@ struct mbus_dram_target_info */ #ifdef CONFIG_PLAT_ORION extern const struct mbus_dram_target_info *mv_mbus_dram_info(void); +extern const struct mbus_dram_target_info *mv_mbus_dram_info_nooverlap(void); #else static inline const struct mbus_dram_target_info *mv_mbus_dram_info(void) { return NULL; } +static inline const struct mbus_dram_target_info *mv_mbus_dram_info_nooverlap(void) +{ + return NULL; +} #endif int mvebu_mbus_save_cpu_target(u32 *store_addr); -- cgit v1.2.3 From 85e6f09785bd06f0510bdb00c40772c7f8da3c43 Mon Sep 17 00:00:00 2001 From: Dmitry Eremin-Solenikov Date: Tue, 19 May 2015 16:16:14 +0100 Subject: ARM: 8367/1: sa1100: prepare for moving irq driver to drivers/irqchip Prepare for moving sa1100 irq driver to irqchip infrastructure - split sa1100_init_irq into helper code and irq parts. Signed-off-by: Dmitry Eremin-Solenikov Acked-by: Thomas Gleixner Signed-off-by: Russell King --- include/linux/irqchip/irq-sa11x0.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 include/linux/irqchip/irq-sa11x0.h (limited to 'include/linux') diff --git a/include/linux/irqchip/irq-sa11x0.h b/include/linux/irqchip/irq-sa11x0.h new file mode 100644 index 000000000000..15db6829c1e4 --- /dev/null +++ b/include/linux/irqchip/irq-sa11x0.h @@ -0,0 +1,17 @@ +/* + * Generic IRQ handling for the SA11x0. + * + * Copyright (C) 2015 Dmitry Eremin-Solenikov + * Copyright (C) 1999-2001 Nicolas Pitre + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef __INCLUDE_LINUX_IRQCHIP_IRQ_SA11x0_H +#define __INCLUDE_LINUX_IRQCHIP_IRQ_SA11x0_H + +void __init sa11x0_init_irq_nodt(int irq_start, resource_size_t io_start); + +#endif -- cgit v1.2.3 From 307c858bc245da5229b30186546590e1d472fc8f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 28 May 2015 16:08:02 +0200 Subject: usb: phy: add static inline wrapper for devm_usb_get_phy_by_node The newly introduced devm_usb_get_phy_by_node function only has an extern declaration, but no alternative for the case that CONFIG_USB_PHY is disabled, which leads to a build error when it is used anyway: drivers/power/twl4030_charger.c: In function 'twl4030_bci_probe': drivers/power/twl4030_charger.c:648:23: error: implicit declaration of function 'devm_usb_get_phy_by_node' [-Werror=implicit-function-declaration] bci->transceiver = devm_usb_get_phy_by_node( This adds the wrapper in the same way that we have one for all other usb-phy API functions. Signed-off-by: Arnd Bergmann Fixes: e842b84c8e7 ("usb: phy: Add interface to get phy give of device_node.") Signed-off-by: Felipe Balbi --- include/linux/usb/phy.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h index 8ed1e29ef329..e39f251cf861 100644 --- a/include/linux/usb/phy.h +++ b/include/linux/usb/phy.h @@ -240,6 +240,12 @@ static inline struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, return ERR_PTR(-ENXIO); } +static inline struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, + struct device_node *node, struct notifier_block *nb) +{ + return ERR_PTR(-ENXIO); +} + static inline void usb_put_phy(struct usb_phy *x) { } -- cgit v1.2.3 From 9626b6993b2e6faf047d2d96958e8474edc9c7a5 Mon Sep 17 00:00:00 2001 From: jilai wang Date: Fri, 10 Apr 2015 16:15:59 -0400 Subject: firmware: qcom: scm: Add HDCP Support HDCP driver needs to check if secure environment supports HDCP. 
If it's supported, then it requires to program some registers through SCM. Add qcom_scm_hdcp_available and qcom_scm_hdcp_req to support these requirements. Signed-off-by: Jilai Wang Signed-off-by: Kumar Gala --- include/linux/qcom_scm.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/qcom_scm.h b/include/linux/qcom_scm.h index d7a974d5f57c..6e7d5ec65838 100644 --- a/include/linux/qcom_scm.h +++ b/include/linux/qcom_scm.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2010-2014, The Linux Foundation. All rights reserved. +/* Copyright (c) 2010-2015, The Linux Foundation. All rights reserved. * Copyright (C) 2015 Linaro Ltd. * * This program is free software; you can redistribute it and/or modify @@ -16,6 +16,17 @@ extern int qcom_scm_set_cold_boot_addr(void *entry, const cpumask_t *cpus); extern int qcom_scm_set_warm_boot_addr(void *entry, const cpumask_t *cpus); +#define QCOM_SCM_HDCP_MAX_REQ_CNT 5 + +struct qcom_scm_hdcp_req { + u32 addr; + u32 val; +}; + +extern bool qcom_scm_hdcp_available(void); +extern int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt, + u32 *resp); + #define QCOM_SCM_CPU_PWR_DOWN_L2_ON 0x0 #define QCOM_SCM_CPU_PWR_DOWN_L2_OFF 0x1 -- cgit v1.2.3 From 3c6296f716ebef704b76070d90567ab4faa8462c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 28 May 2015 13:21:34 -0400 Subject: ring-buffer: Remove useless unused tracing_off_permanent() The tracing_off_permanent() call is a way to disable all ring_buffers. Nothing uses it and nothing should use it, as tracing_off() and friends are better, as they disable the ring buffers related to tracing. The tracing_off_permanent() even disabled non tracing ring buffers. This is a bit drastic, and was added to handle NMIs doing outputs that could corrupt the ring buffer when only tracing used them. It is now obsolete and adds a little overhead, it should be removed. 
Signed-off-by: Steven Rostedt --- include/linux/kernel.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3a5b48e52a9e..d948718a83d7 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -532,12 +532,6 @@ bool mac_pton(const char *s, u8 *mac); * * Most likely, you want to use tracing_on/tracing_off. */ -#ifdef CONFIG_RING_BUFFER -/* trace_off_permanent stops recording with no way to bring it back */ -void tracing_off_permanent(void); -#else -static inline void tracing_off_permanent(void) { } -#endif enum ftrace_dump_mode { DUMP_NONE, -- cgit v1.2.3 From 0040b933187b11f78e83dd162a31d64a46be4e37 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 20 Apr 2015 11:52:23 -0700 Subject: f2fs: add missing version info in superblock The mkfs.f2fs remains kernel version in superblock, but f2fs module has not added that so far. Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 591f8c3ef410..8d345c24bcf7 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -50,6 +50,8 @@ #define MAX_ACTIVE_NODE_LOGS 8 #define MAX_ACTIVE_DATA_LOGS 8 +#define VERSION_LEN 256 + /* * For superblock */ @@ -86,6 +88,9 @@ struct f2fs_super_block { __le32 extension_count; /* # of extensions below */ __u8 extension_list[F2FS_MAX_EXTENSION][8]; /* extension array */ __le32 cp_payload; + __u8 version[VERSION_LEN]; /* the kernel version */ + __u8 init_version[VERSION_LEN]; /* the initial kernel version */ + __u8 reserved[892]; /* valid reserved region */ } __packed; /* -- cgit v1.2.3 From 76f105a2dbcd47509bac6ba8d94cb3759a3e6e9d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 13 Apr 2015 15:10:36 -0700 Subject: f2fs: add feature facility in superblock This patch introduces a feature in superblock, which will indicate any new features for 
f2fs. Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 8d345c24bcf7..d44e97f2b98e 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -90,7 +90,8 @@ struct f2fs_super_block { __le32 cp_payload; __u8 version[VERSION_LEN]; /* the kernel version */ __u8 init_version[VERSION_LEN]; /* the initial kernel version */ - __u8 reserved[892]; /* valid reserved region */ + __le32 feature; /* defined features */ + __u8 reserved[888]; /* valid reserved region */ } __packed; /* -- cgit v1.2.3 From cde4de1205770514005663d70a9a7d81cb555085 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 20 Apr 2015 13:57:51 -0700 Subject: f2fs crypto: declare some definitions for f2fs encryption feature These definitions will be used by inode and superblock for encryption. Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index d44e97f2b98e..920408a21ffd 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -91,7 +91,9 @@ struct f2fs_super_block { __u8 version[VERSION_LEN]; /* the kernel version */ __u8 init_version[VERSION_LEN]; /* the initial kernel version */ __le32 feature; /* defined features */ - __u8 reserved[888]; /* valid reserved region */ + __u8 encryption_level; /* versioning level for encryption */ + __u8 encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ + __u8 reserved[871]; /* valid reserved region */ } __packed; /* -- cgit v1.2.3 From f8df88081183bd4d3c461c617c3519445eb85642 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Wed, 27 May 2015 23:06:30 +0900 Subject: extcon: Remove optional print_name() function pointer of extcon_dev This patch removes the optional print_name() function pointer included in 'struct extcon_dev'
because the extcon must maintain the consistent name of extcon device on sysfs instead of inconsistent name. After merged patch[1], extcon can maintain the consistent name of extcon device without any hard-coded device name. [1] https://lkml.org/lkml/2015/4/27/258 Signed-off-by: Chanwoo Choi --- include/linux/extcon.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/extcon.h b/include/linux/extcon.h index be9652b3a154..a7b224b20ecc 100644 --- a/include/linux/extcon.h +++ b/include/linux/extcon.h @@ -83,8 +83,6 @@ struct extcon_cable; * be attached simulataneously. {0x7, 0} is equivalent to * {0x3, 0x6, 0x5, 0}. If it is {0xFFFFFFFF, 0}, there * can be no simultaneous connections. - * @print_name: An optional callback to override the method to print the - * name of the extcon device. * @print_state: An optional callback to override the method to print the * status of the extcon device. * @dev: Device of this extcon. @@ -111,7 +109,6 @@ struct extcon_dev { const u32 *mutually_exclusive; /* Optional callbacks to override class functions */ - ssize_t (*print_name)(struct extcon_dev *edev, char *buf); ssize_t (*print_state)(struct extcon_dev *edev, char *buf); /* Internal data. Please do not set. */ -- cgit v1.2.3 From c80ef9e0c021ff86771fdd72583c75d8f7b6a720 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 29 May 2015 10:52:59 +0200 Subject: cgroup: add seq_file forward declaration for struct cftype Recent header file changes for cgroup caused lots of warnings about a missing struct seq_file form declaration for every inclusion of include/linux/cgroup-defs.h. 
As some files are built with -Werror, this leads to build failure like: from /git/arm-soc/drivers/gpu/drm/tilcdc/tilcdc_crtc.c:18: /git/arm-soc/include/linux/cgroup-defs.h:354:25: error: 'struct seq_file' declared inside parameter list [-Werror] cc1: all warnings being treated as errors make[6]: *** [drivers/gpu/drm/tilcdc/tilcdc_crtc.o] Error 1 This patch adds the declaration, which resolves both the warnings and the drm failure. tj: Moved it where other type declarations are. Signed-off-by: Arnd Bergmann Fixes: b4a04ab7a37b ("cgroup: separate out include/linux/cgroup-defs.h") Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 7d83d7f73420..26d1cea7929f 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -26,6 +26,7 @@ struct cgroup_taskset; struct kernfs_node; struct kernfs_ops; struct kernfs_open_file; +struct seq_file; #define MAX_CGROUP_TYPE_NAMELEN 32 #define MAX_CGROUP_ROOT_NAMELEN 64 -- cgit v1.2.3 From e548ca4ee4595f65b262661d166310ad8a149bec Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 May 2015 13:11:32 -0600 Subject: block: don't honor chunk sizes for data-less IO We don't need to honor chunk sizes for IO that doesn't carry any data. 
Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9ded80da2c16..ccaa9aecd593 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -903,7 +903,7 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq) if (unlikely(rq->cmd_type == REQ_TYPE_BLOCK_PC)) return q->limits.max_hw_sectors; - if (!q->limits.chunk_sectors) + if (!q->limits.chunk_sectors || (rq->cmd_flags & REQ_DISCARD)) return blk_queue_get_max_sectors(q, rq->cmd_flags); return min(blk_max_size_offset(q, blk_rq_pos(rq)), -- cgit v1.2.3 From 19bdb6e4ec071bc49a9871b41e6a59a1657ed365 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 26 May 2015 15:11:44 -0600 Subject: PCI: Move pci_ari_enabled() to global header pci_ari_enabled() is useful outside of drivers/pci, particularly for deriving INTx routing via ACPI _PRT, so move it to the global header. Also convert to bool return. Signed-off-by: Alex Williamson Signed-off-by: Bjorn Helgaas Reviewed-by: Don Dutile Acked-by: Rafael J. Wysocki --- include/linux/pci.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 353db8dc4c6e..2925561a8f1e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1905,4 +1905,15 @@ static inline bool pci_is_dev_assigned(struct pci_dev *pdev) { return (pdev->dev_flags & PCI_DEV_FLAGS_ASSIGNED) == PCI_DEV_FLAGS_ASSIGNED; } + +/** + * pci_ari_enabled - query ARI forwarding status + * @bus: the PCI bus + * + * Returns true if ARI forwarding is enabled. 
+ */ +static inline bool pci_ari_enabled(struct pci_bus *bus) +{ + return bus->self && bus->self->ari_enabled; +} #endif /* LINUX_PCI_H */ -- cgit v1.2.3 From 3a9ad0b4fdcd57f775d3615004c8c64c021a9e7d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 27 May 2015 17:23:51 -0700 Subject: PCI: Add pci_bus_addr_t David Ahern reported that d63e2e1f3df9 ("sparc/PCI: Clip bridge windows to fit in upstream windows") fails to boot on sparc/T5-8: pci 0000:06:00.0: reg 0x184: can't handle BAR above 4GB (bus address 0x110204000) The problem is that sparc64 assumed that dma_addr_t only needed to hold DMA addresses, i.e., bus addresses returned via the DMA API (dma_map_single(), etc.), while the PCI core assumed dma_addr_t could hold *any* bus address, including raw BAR values. On sparc64, all DMA addresses fit in 32 bits, so dma_addr_t is a 32-bit type. However, BAR values can be 64 bits wide, so they don't fit in a dma_addr_t. d63e2e1f3df9 added new checking that tripped over this mismatch. Add pci_bus_addr_t, which is wide enough to hold any PCI bus address, including both raw BAR values and DMA addresses. This will be 64 bits on 64-bit platforms and on platforms with a 64-bit dma_addr_t. Then dma_addr_t only needs to be wide enough to hold addresses from the DMA API. [bhelgaas: changelog, bugzilla, Kconfig to ensure pci_bus_addr_t is at least as wide as dma_addr_t, documentation] Fixes: d63e2e1f3df9 ("sparc/PCI: Clip bridge windows to fit in upstream windows") Fixes: 23b13bc76f35 ("PCI: Fail safely if we can't handle BARs larger than 4GB") Link: http://lkml.kernel.org/r/CAE9FiQU1gJY1LYrxs+ma5LCTEEe4xmtjRG0aXJ9K_Tsu+m9Wuw@mail.gmail.com Link: http://lkml.kernel.org/r/1427857069-6789-1-git-send-email-yinghai@kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=96231 Reported-by: David Ahern Tested-by: David Ahern Signed-off-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Acked-by: David S. 
Miller CC: stable@vger.kernel.org # v3.19+ --- include/linux/pci.h | 12 +++++++++--- include/linux/types.h | 12 ++++++++++-- 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 353db8dc4c6e..956f74bad37a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -577,9 +577,15 @@ int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn, int raw_pci_write(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 val); +#ifdef CONFIG_PCI_BUS_ADDR_T_64BIT +typedef u64 pci_bus_addr_t; +#else +typedef u32 pci_bus_addr_t; +#endif + struct pci_bus_region { - dma_addr_t start; - dma_addr_t end; + pci_bus_addr_t start; + pci_bus_addr_t end; }; struct pci_dynids { @@ -1128,7 +1134,7 @@ int __must_check pci_bus_alloc_resource(struct pci_bus *bus, int pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr); -static inline dma_addr_t pci_bus_address(struct pci_dev *pdev, int bar) +static inline pci_bus_addr_t pci_bus_address(struct pci_dev *pdev, int bar) { struct pci_bus_region region; diff --git a/include/linux/types.h b/include/linux/types.h index 59698be03490..8715287c3b1f 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -139,12 +139,20 @@ typedef unsigned long blkcnt_t; */ #define pgoff_t unsigned long -/* A dma_addr_t can hold any valid DMA or bus address for the platform */ +/* + * A dma_addr_t can hold any valid DMA address, i.e., any address returned + * by the DMA API. + * + * If the DMA API only uses 32-bit addresses, dma_addr_t need only be 32 + * bits wide. Bus addresses, e.g., PCI BARs, may be wider than 32 bits, + * but drivers do memory-mapped I/O to ioremapped kernel virtual addresses, + * so they don't care about the size of the actual bus addresses. 
+ */ #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT typedef u64 dma_addr_t; #else typedef u32 dma_addr_t; -#endif /* dma_addr_t */ +#endif typedef unsigned __bitwise__ gfp_t; typedef unsigned __bitwise__ fmode_t; -- cgit v1.2.3 From db874c7e10557f8f1af9a6fb1ec6589ae06f349c Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Fri, 29 May 2015 09:54:02 -0700 Subject: PM / wakeirq: Fix typo in prototype for dev_pm_set_dedicated_wake_irq Looks like I only build-tested the dev_pm_set_wake_irq and not the dev_pm_set_dedicated_wake_irq case on x86. Turns out there's a typo for the dev_pm_set_dedicated_wake_irq prototype that causes a build error if CONFIG_COMPILE_TEST and CONFIG_MMC_OMAP_HS are selected. Reported-by: Jim Davis Signed-off-by: Tony Lindgren Reviewed-by: Felipe Balbi Signed-off-by: Rafael J. Wysocki --- include/linux/pm_wakeirq.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_wakeirq.h b/include/linux/pm_wakeirq.h index 4046fa1b7d25..cd5b62db9084 100644 --- a/include/linux/pm_wakeirq.h +++ b/include/linux/pm_wakeirq.h @@ -30,8 +30,7 @@ static inline int dev_pm_set_wake_irq(struct device *dev, int irq) return 0; } -static inline int dev_pm_set_dedicated__wake_irq(struct device *dev, - int irq) +static inline int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq) { return 0; } -- cgit v1.2.3 From 5790cf3c00c2f92aacba348e13f8a9a8f5dd96bd Mon Sep 17 00:00:00 2001 From: Mathieu Olivari Date: Wed, 27 May 2015 11:02:47 -0700 Subject: stmmac: add phy-handle support to the platform layer On stmmac driver, PHY specification in device-tree was done using the non-standard property "snps,phy-addr". Specifying a PHY on a different MDIO bus than the one within the stmmac controller doesn't seem to be possible when device-tree is used. This change adds support for the phy-handle property, as specified in Documentation/devicetree/bindings/net/ethernet.txt. Signed-off-by: Mathieu Olivari Signed-off-by: David S. 
Miller --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 7f484a239f53..c735f5c91eea 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -99,6 +99,7 @@ struct plat_stmmacenet_data { int phy_addr; int interface; struct stmmac_mdio_bus_data *mdio_bus_data; + struct device_node *phy_node; struct stmmac_dma_cfg *dma_cfg; int clk_csr; int has_gmac; -- cgit v1.2.3 From f4fb874cf076f9eafdd15c0a88cd0f0397b95e43 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 27 May 2015 21:07:26 -0400 Subject: if_vlan: fix vlaue -> value typo Fixes "vlaue" for "value" in include/linux/if_vlan.h. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index b9ab677c0c0a..a40d29846ac2 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -416,7 +416,7 @@ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb, /** * __vlan_get_tag - get the VLAN ID that is part of the payload * @skb: skbuff to query - * @vlan_tci: buffer to store vlaue + * @vlan_tci: buffer to store value * * Returns error if the skb is not of VLAN type */ @@ -435,7 +435,7 @@ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) /** * __vlan_hwaccel_get_tag - get the VLAN ID that is in @skb->cb[] * @skb: skbuff to query - * @vlan_tci: buffer to store vlaue + * @vlan_tci: buffer to store value * * Returns error if @skb->vlan_tci is not set correctly */ @@ -456,7 +456,7 @@ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb, /** * vlan_get_tag - get the VLAN ID from the skb * @skb: skbuff to query - * @vlan_tci: buffer to store vlaue + * @vlan_tci: buffer to store value * * Returns error if the skb is not VLAN tagged */ -- cgit v1.2.3 From 
64ffaa2159b752e6c263dc57eaaaed7367d37493 Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Thu, 28 May 2015 22:28:38 +0300 Subject: net/mlx5_core,mlx5_ib: Do not use vmap() on coherent memory As David Daney pointed out in the mlx4_core driver [1], mlx5_core is also misusing the DMA-API. This patch is removing the code that vmap() memory allocated by dma_alloc_coherent(). After this patch, users of these drivers might fail allocating resources on memory-fragmented systems. This will be fixed later on. [1] - https://patchwork.ozlabs.org/patch/458531/ CC: David Daney Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- include/linux/mlx5/driver.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 9a90e7523dc2..c4cf25ffcc16 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -334,8 +334,6 @@ struct mlx5_buf_list { struct mlx5_buf { struct mlx5_buf_list direct; - struct mlx5_buf_list *page_list; - int nbufs; int npages; int size; u8 page_shift; @@ -586,11 +584,7 @@ struct mlx5_pas { static inline void *mlx5_buf_offset(struct mlx5_buf *buf, int offset) { - if (likely(BITS_PER_LONG == 64 || buf->nbufs == 1)) return buf->direct.buf + offset; - else - return buf->page_list[offset >> PAGE_SHIFT].buf + - (offset & (PAGE_SIZE - 1)); } extern struct workqueue_struct *mlx5_core_wq; @@ -669,8 +663,7 @@ void mlx5_health_cleanup(void); void __init mlx5_health_init(void); void mlx5_start_health_poll(struct mlx5_core_dev *dev); void mlx5_stop_health_poll(struct mlx5_core_dev *dev); -int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, int max_direct, - struct mlx5_buf *buf); +int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf); void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf); struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev, gfp_t flags, int npages); -- cgit v1.2.3 
From db058a186f98b057c19c42f7b10d9a96fd3b5d59 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 28 May 2015 22:28:39 +0300 Subject: net/mlx5_core: Set irq affinity hints Preparation for upcoming ethernet driver. - Move msix array from eq_table struct to priv since it's not related to eq_table - Introduce irq_info struct to hold all irq information - Move name from mlx5_eq to irq_info struct since it is an irq property. - Set IRQ affinity hints Signed-off-by: Achiad Shochat Signed-off-by: Rana Shahout Signed-off-by: Saeed Mahameed Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- include/linux/mlx5/driver.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index c4cf25ffcc16..9e8979502826 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -85,7 +85,7 @@ enum { }; enum { - MLX5_MAX_EQ_NAME = 32 + MLX5_MAX_IRQ_NAME = 32 }; enum { @@ -349,7 +349,6 @@ struct mlx5_eq { u8 eqn; int nent; u64 mask; - char name[MLX5_MAX_EQ_NAME]; struct list_head list; int index; struct mlx5_rsc_debug *dbg; @@ -412,7 +411,6 @@ struct mlx5_eq_table { struct mlx5_eq pages_eq; struct mlx5_eq async_eq; struct mlx5_eq cmd_eq; - struct msix_entry *msix_arr; int num_comp_vectors; /* protect EQs list */ @@ -465,9 +463,16 @@ struct mlx5_mr_table { struct radix_tree_root tree; }; +struct mlx5_irq_info { + cpumask_var_t mask; + char name[MLX5_MAX_IRQ_NAME]; +}; + struct mlx5_priv { char name[MLX5_MAX_NAME_LEN]; struct mlx5_eq_table eq_table; + struct msix_entry *msix_arr; + struct mlx5_irq_info *irq_info; struct mlx5_uuar_info uuari; MLX5_DECLARE_DOORBELL_LOCK(cq_uar_lock); -- cgit v1.2.3 From e281682bf29438848daac11627216bceb1507b71 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 28 May 2015 22:28:40 +0300 Subject: net/mlx5_core: HW data structs/types definitions cleanup mlx5_ifc.h was heavily modified here since it is now generated by a 
script from the device specification (PRM rev 0.25). This specification is backward compatible to existing hardware. Some structures/fields were added here in order to enable the Ethernet functionality of the driver. Signed-off-by: Saeed Mahameed Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 113 +- include/linux/mlx5/driver.h | 4 +- include/linux/mlx5/mlx5_ifc.h | 6608 ++++++++++++++++++++++++++++++++++++++++- include/linux/mlx5/qp.h | 25 + 4 files changed, 6650 insertions(+), 100 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index abf65c790421..feebed7b392b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -35,6 +35,7 @@ #include #include +#include #if defined(__LITTLE_ENDIAN) #define MLX5_SET_HOST_ENDIANNESS 0 @@ -70,6 +71,14 @@ << __mlx5_dw_bit_off(typ, fld))); \ } while (0) +#define MLX5_SET_TO_ONES(typ, p, fld) do { \ + BUILD_BUG_ON(__mlx5_st_sz_bits(typ) % 32); \ + *((__be32 *)(p) + __mlx5_dw_off(typ, fld)) = \ + cpu_to_be32((be32_to_cpu(*((__be32 *)(p) + __mlx5_dw_off(typ, fld))) & \ + (~__mlx5_dw_mask(typ, fld))) | ((__mlx5_mask(typ, fld)) \ + << __mlx5_dw_bit_off(typ, fld))); \ +} while (0) + #define MLX5_GET(typ, p, fld) ((be32_to_cpu(*((__be32 *)(p) +\ __mlx5_dw_off(typ, fld))) >> __mlx5_dw_bit_off(typ, fld)) & \ __mlx5_mask(typ, fld)) @@ -264,6 +273,7 @@ enum { MLX5_OPCODE_RDMA_WRITE_IMM = 0x09, MLX5_OPCODE_SEND = 0x0a, MLX5_OPCODE_SEND_IMM = 0x0b, + MLX5_OPCODE_LSO = 0x0e, MLX5_OPCODE_RDMA_READ = 0x10, MLX5_OPCODE_ATOMIC_CS = 0x11, MLX5_OPCODE_ATOMIC_FA = 0x12, @@ -541,6 +551,10 @@ struct mlx5_cmd_prot_block { u8 sig; }; +enum { + MLX5_CQE_SYND_FLUSHED_IN_ERROR = 5, +}; + struct mlx5_err_cqe { u8 rsvd0[32]; __be32 srqn; @@ -554,13 +568,22 @@ struct mlx5_err_cqe { }; struct mlx5_cqe64 { - u8 rsvd0[17]; + u8 rsvd0[4]; + u8 lro_tcppsh_abort_dupack; + u8 lro_min_ttl; + __be16 lro_tcp_win; + __be32 lro_ack_seq_num; + 
__be32 rss_hash_result; + u8 rss_hash_type; u8 ml_path; - u8 rsvd20[4]; + u8 rsvd20[2]; + __be16 check_sum; __be16 slid; __be32 flags_rqpn; - u8 rsvd28[4]; - __be32 srqn; + u8 hds_ip_ext; + u8 l4_hdr_type_etc; + __be16 vlan_info; + __be32 srqn; /* [31:24]: lro_num_seg, [23:0]: srqn */ __be32 imm_inval_pkey; u8 rsvd40[4]; __be32 byte_cnt; @@ -571,6 +594,40 @@ struct mlx5_cqe64 { u8 op_own; }; +static inline int get_cqe_lro_tcppsh(struct mlx5_cqe64 *cqe) +{ + return (cqe->lro_tcppsh_abort_dupack >> 6) & 1; +} + +static inline u8 get_cqe_l4_hdr_type(struct mlx5_cqe64 *cqe) +{ + return (cqe->l4_hdr_type_etc >> 4) & 0x7; +} + +static inline int cqe_has_vlan(struct mlx5_cqe64 *cqe) +{ + return !!(cqe->l4_hdr_type_etc & 0x1); +} + +enum { + CQE_L4_HDR_TYPE_NONE = 0x0, + CQE_L4_HDR_TYPE_TCP_NO_ACK = 0x1, + CQE_L4_HDR_TYPE_UDP = 0x2, + CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA = 0x3, + CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA = 0x4, +}; + +enum { + CQE_RSS_HTYPE_IP = 0x3 << 6, + CQE_RSS_HTYPE_L4 = 0x3 << 2, +}; + +enum { + CQE_L2_OK = 1 << 0, + CQE_L3_OK = 1 << 1, + CQE_L4_OK = 1 << 2, +}; + struct mlx5_sig_err_cqe { u8 rsvd0[16]; __be32 expected_trans_sig; @@ -996,4 +1053,52 @@ struct mlx5_destroy_psv_out { u8 rsvd[8]; }; +#define MLX5_CMD_OP_MAX 0x920 + +enum { + VPORT_STATE_DOWN = 0x0, + VPORT_STATE_UP = 0x1, +}; + +enum { + MLX5_L3_PROT_TYPE_IPV4 = 0, + MLX5_L3_PROT_TYPE_IPV6 = 1, +}; + +enum { + MLX5_L4_PROT_TYPE_TCP = 0, + MLX5_L4_PROT_TYPE_UDP = 1, +}; + +enum { + MLX5_HASH_FIELD_SEL_SRC_IP = 1 << 0, + MLX5_HASH_FIELD_SEL_DST_IP = 1 << 1, + MLX5_HASH_FIELD_SEL_L4_SPORT = 1 << 2, + MLX5_HASH_FIELD_SEL_L4_DPORT = 1 << 3, + MLX5_HASH_FIELD_SEL_IPSEC_SPI = 1 << 4, +}; + +enum { + MLX5_MATCH_OUTER_HEADERS = 1 << 0, + MLX5_MATCH_MISC_PARAMETERS = 1 << 1, + MLX5_MATCH_INNER_HEADERS = 1 << 2, + +}; + +enum { + MLX5_FLOW_TABLE_TYPE_NIC_RCV = 0, + MLX5_FLOW_TABLE_TYPE_ESWITCH = 4, +}; + +enum { + MLX5_FLOW_CONTEXT_DEST_TYPE_VPORT = 0, + MLX5_FLOW_CONTEXT_DEST_TYPE_FLOW_TABLE = 1, + 
MLX5_FLOW_CONTEXT_DEST_TYPE_TIR = 2, +}; + +enum { + MLX5_RQC_RQ_TYPE_MEMORY_RQ_INLINE = 0x0, + MLX5_RQC_RQ_TYPE_MEMORY_RQ_RPM = 0x1, +}; + #endif /* MLX5_DEVICE_H */ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 9e8979502826..3fd4fdc1ba16 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -44,7 +44,6 @@ #include #include -#include enum { MLX5_BOARD_ID_LEN = 64, @@ -278,7 +277,6 @@ struct mlx5_general_caps { u8 log_max_mkey; u8 log_max_pd; u8 log_max_srq; - u8 log_max_strq; u8 log_max_mrw_sz; u8 log_max_bsf_list_size; u8 log_max_klm_list_size; @@ -664,6 +662,8 @@ int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn); int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn); int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari); int mlx5_free_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari); +int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar); +void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar); void mlx5_health_cleanup(void); void __init mlx5_health_init(void); void mlx5_start_health_poll(struct mlx5_core_dev *dev); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index cb3ad17edd1f..b27e9f6e090a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -28,11 +28,44 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- */ - +*/ #ifndef MLX5_IFC_H #define MLX5_IFC_H +enum { + MLX5_EVENT_TYPE_CODING_COMPLETION_EVENTS = 0x0, + MLX5_EVENT_TYPE_CODING_PATH_MIGRATED_SUCCEEDED = 0x1, + MLX5_EVENT_TYPE_CODING_COMMUNICATION_ESTABLISHED = 0x2, + MLX5_EVENT_TYPE_CODING_SEND_QUEUE_DRAINED = 0x3, + MLX5_EVENT_TYPE_CODING_LAST_WQE_REACHED = 0x13, + MLX5_EVENT_TYPE_CODING_SRQ_LIMIT = 0x14, + MLX5_EVENT_TYPE_CODING_DCT_ALL_CONNECTIONS_CLOSED = 0x1c, + MLX5_EVENT_TYPE_CODING_DCT_ACCESS_KEY_VIOLATION = 0x1d, + MLX5_EVENT_TYPE_CODING_CQ_ERROR = 0x4, + MLX5_EVENT_TYPE_CODING_LOCAL_WQ_CATASTROPHIC_ERROR = 0x5, + MLX5_EVENT_TYPE_CODING_PATH_MIGRATION_FAILED = 0x7, + MLX5_EVENT_TYPE_CODING_PAGE_FAULT_EVENT = 0xc, + MLX5_EVENT_TYPE_CODING_INVALID_REQUEST_LOCAL_WQ_ERROR = 0x10, + MLX5_EVENT_TYPE_CODING_LOCAL_ACCESS_VIOLATION_WQ_ERROR = 0x11, + MLX5_EVENT_TYPE_CODING_LOCAL_SRQ_CATASTROPHIC_ERROR = 0x12, + MLX5_EVENT_TYPE_CODING_INTERNAL_ERROR = 0x8, + MLX5_EVENT_TYPE_CODING_PORT_STATE_CHANGE = 0x9, + MLX5_EVENT_TYPE_CODING_GPIO_EVENT = 0x15, + MLX5_EVENT_TYPE_CODING_REMOTE_CONFIGURATION_PROTOCOL_EVENT = 0x19, + MLX5_EVENT_TYPE_CODING_DOORBELL_BLUEFLAME_CONGESTION_EVENT = 0x1a, + MLX5_EVENT_TYPE_CODING_STALL_VL_EVENT = 0x1b, + MLX5_EVENT_TYPE_CODING_DROPPED_PACKET_LOGGED_EVENT = 0x1f, + MLX5_EVENT_TYPE_CODING_COMMAND_INTERFACE_COMPLETION = 0xa, + MLX5_EVENT_TYPE_CODING_PAGE_REQUEST = 0xb +}; + +enum { + MLX5_MODIFY_TIR_BITMASK_LRO = 0x0, + MLX5_MODIFY_TIR_BITMASK_INDIRECT_TABLE = 0x1, + MLX5_MODIFY_TIR_BITMASK_HASH = 0x2, + MLX5_MODIFY_TIR_BITMASK_TUNNELED_OFFLOAD_EN = 0x3 +}; + enum { MLX5_CMD_OP_QUERY_HCA_CAP = 0x100, MLX5_CMD_OP_QUERY_ADAPTER = 0x101, @@ -43,6 +76,8 @@ enum { MLX5_CMD_OP_QUERY_PAGES = 0x107, MLX5_CMD_OP_MANAGE_PAGES = 0x108, MLX5_CMD_OP_SET_HCA_CAP = 0x109, + MLX5_CMD_OP_QUERY_ISSI = 0x10a, + MLX5_CMD_OP_SET_ISSI = 0x10b, MLX5_CMD_OP_CREATE_MKEY = 0x200, MLX5_CMD_OP_QUERY_MKEY = 0x201, MLX5_CMD_OP_DESTROY_MKEY = 0x202, @@ -66,6 +101,7 @@ enum { MLX5_CMD_OP_2ERR_QP = 0x507, 
MLX5_CMD_OP_2RST_QP = 0x50a, MLX5_CMD_OP_QUERY_QP = 0x50b, + MLX5_CMD_OP_SQD_RTS_QP = 0x50c, MLX5_CMD_OP_INIT2INIT_QP = 0x50e, MLX5_CMD_OP_CREATE_PSV = 0x600, MLX5_CMD_OP_DESTROY_PSV = 0x601, @@ -73,7 +109,10 @@ enum { MLX5_CMD_OP_DESTROY_SRQ = 0x701, MLX5_CMD_OP_QUERY_SRQ = 0x702, MLX5_CMD_OP_ARM_RQ = 0x703, - MLX5_CMD_OP_RESIZE_SRQ = 0x704, + MLX5_CMD_OP_CREATE_XRC_SRQ = 0x705, + MLX5_CMD_OP_DESTROY_XRC_SRQ = 0x706, + MLX5_CMD_OP_QUERY_XRC_SRQ = 0x707, + MLX5_CMD_OP_ARM_XRC_SRQ = 0x708, MLX5_CMD_OP_CREATE_DCT = 0x710, MLX5_CMD_OP_DESTROY_DCT = 0x711, MLX5_CMD_OP_DRAIN_DCT = 0x712, @@ -85,8 +124,12 @@ enum { MLX5_CMD_OP_MODIFY_ESW_VPORT_CONTEXT = 0x753, MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT = 0x754, MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT = 0x755, - MLX5_CMD_OP_QUERY_RCOE_ADDRESS = 0x760, + MLX5_CMD_OP_QUERY_ROCE_ADDRESS = 0x760, MLX5_CMD_OP_SET_ROCE_ADDRESS = 0x761, + MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT = 0x762, + MLX5_CMD_OP_MODIFY_HCA_VPORT_CONTEXT = 0x763, + MLX5_CMD_OP_QUERY_HCA_VPORT_GID = 0x764, + MLX5_CMD_OP_QUERY_HCA_VPORT_PKEY = 0x765, MLX5_CMD_OP_QUERY_VPORT_COUNTER = 0x770, MLX5_CMD_OP_ALLOC_Q_COUNTER = 0x771, MLX5_CMD_OP_DEALLOC_Q_COUNTER = 0x772, @@ -98,7 +141,7 @@ enum { MLX5_CMD_OP_CONFIG_INT_MODERATION = 0x804, MLX5_CMD_OP_ACCESS_REG = 0x805, MLX5_CMD_OP_ATTACH_TO_MCG = 0x806, - MLX5_CMD_OP_DETACH_FROM_MCG = 0x807, + MLX5_CMD_OP_DETTACH_FROM_MCG = 0x807, MLX5_CMD_OP_GET_DROPPED_PACKET_LOG = 0x80a, MLX5_CMD_OP_MAD_IFC = 0x50d, MLX5_CMD_OP_QUERY_MAD_DEMUX = 0x80b, @@ -106,23 +149,22 @@ enum { MLX5_CMD_OP_NOP = 0x80d, MLX5_CMD_OP_ALLOC_XRCD = 0x80e, MLX5_CMD_OP_DEALLOC_XRCD = 0x80f, - MLX5_CMD_OP_SET_BURST_SIZE = 0x812, - MLX5_CMD_OP_QUERY_BURST_SZIE = 0x813, - MLX5_CMD_OP_ACTIVATE_TRACER = 0x814, - MLX5_CMD_OP_DEACTIVATE_TRACER = 0x815, - MLX5_CMD_OP_CREATE_SNIFFER_RULE = 0x820, - MLX5_CMD_OP_DESTROY_SNIFFER_RULE = 0x821, - MLX5_CMD_OP_QUERY_CONG_PARAMS = 0x822, - MLX5_CMD_OP_MODIFY_CONG_PARAMS = 0x823, - MLX5_CMD_OP_QUERY_CONG_STATISTICS = 0x824, + 
MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN = 0x816, + MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN = 0x817, + MLX5_CMD_OP_QUERY_CONG_STATUS = 0x822, + MLX5_CMD_OP_MODIFY_CONG_STATUS = 0x823, + MLX5_CMD_OP_QUERY_CONG_PARAMS = 0x824, + MLX5_CMD_OP_MODIFY_CONG_PARAMS = 0x825, + MLX5_CMD_OP_QUERY_CONG_STATISTICS = 0x826, + MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT = 0x827, + MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT = 0x828, + MLX5_CMD_OP_SET_L2_TABLE_ENTRY = 0x829, + MLX5_CMD_OP_QUERY_L2_TABLE_ENTRY = 0x82a, + MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY = 0x82b, MLX5_CMD_OP_CREATE_TIR = 0x900, MLX5_CMD_OP_MODIFY_TIR = 0x901, MLX5_CMD_OP_DESTROY_TIR = 0x902, MLX5_CMD_OP_QUERY_TIR = 0x903, - MLX5_CMD_OP_CREATE_TIS = 0x912, - MLX5_CMD_OP_MODIFY_TIS = 0x913, - MLX5_CMD_OP_DESTROY_TIS = 0x914, - MLX5_CMD_OP_QUERY_TIS = 0x915, MLX5_CMD_OP_CREATE_SQ = 0x904, MLX5_CMD_OP_MODIFY_SQ = 0x905, MLX5_CMD_OP_DESTROY_SQ = 0x906, @@ -135,9 +177,430 @@ enum { MLX5_CMD_OP_MODIFY_RMP = 0x90d, MLX5_CMD_OP_DESTROY_RMP = 0x90e, MLX5_CMD_OP_QUERY_RMP = 0x90f, - MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY = 0x910, - MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY = 0x911, - MLX5_CMD_OP_MAX = 0x911 + MLX5_CMD_OP_CREATE_TIS = 0x912, + MLX5_CMD_OP_MODIFY_TIS = 0x913, + MLX5_CMD_OP_DESTROY_TIS = 0x914, + MLX5_CMD_OP_QUERY_TIS = 0x915, + MLX5_CMD_OP_CREATE_RQT = 0x916, + MLX5_CMD_OP_MODIFY_RQT = 0x917, + MLX5_CMD_OP_DESTROY_RQT = 0x918, + MLX5_CMD_OP_QUERY_RQT = 0x919, + MLX5_CMD_OP_CREATE_FLOW_TABLE = 0x930, + MLX5_CMD_OP_DESTROY_FLOW_TABLE = 0x931, + MLX5_CMD_OP_QUERY_FLOW_TABLE = 0x932, + MLX5_CMD_OP_CREATE_FLOW_GROUP = 0x933, + MLX5_CMD_OP_DESTROY_FLOW_GROUP = 0x934, + MLX5_CMD_OP_QUERY_FLOW_GROUP = 0x935, + MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY = 0x936, + MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY = 0x937, + MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY = 0x938 +}; + +struct mlx5_ifc_flow_table_fields_supported_bits { + u8 outer_dmac[0x1]; + u8 outer_smac[0x1]; + u8 outer_ether_type[0x1]; + u8 reserved_0[0x1]; + u8 outer_first_prio[0x1]; + u8 outer_first_cfi[0x1]; + u8 
outer_first_vid[0x1]; + u8 reserved_1[0x1]; + u8 outer_second_prio[0x1]; + u8 outer_second_cfi[0x1]; + u8 outer_second_vid[0x1]; + u8 reserved_2[0x1]; + u8 outer_sip[0x1]; + u8 outer_dip[0x1]; + u8 outer_frag[0x1]; + u8 outer_ip_protocol[0x1]; + u8 outer_ip_ecn[0x1]; + u8 outer_ip_dscp[0x1]; + u8 outer_udp_sport[0x1]; + u8 outer_udp_dport[0x1]; + u8 outer_tcp_sport[0x1]; + u8 outer_tcp_dport[0x1]; + u8 outer_tcp_flags[0x1]; + u8 outer_gre_protocol[0x1]; + u8 outer_gre_key[0x1]; + u8 outer_vxlan_vni[0x1]; + u8 reserved_3[0x5]; + u8 source_eswitch_port[0x1]; + + u8 inner_dmac[0x1]; + u8 inner_smac[0x1]; + u8 inner_ether_type[0x1]; + u8 reserved_4[0x1]; + u8 inner_first_prio[0x1]; + u8 inner_first_cfi[0x1]; + u8 inner_first_vid[0x1]; + u8 reserved_5[0x1]; + u8 inner_second_prio[0x1]; + u8 inner_second_cfi[0x1]; + u8 inner_second_vid[0x1]; + u8 reserved_6[0x1]; + u8 inner_sip[0x1]; + u8 inner_dip[0x1]; + u8 inner_frag[0x1]; + u8 inner_ip_protocol[0x1]; + u8 inner_ip_ecn[0x1]; + u8 inner_ip_dscp[0x1]; + u8 inner_udp_sport[0x1]; + u8 inner_udp_dport[0x1]; + u8 inner_tcp_sport[0x1]; + u8 inner_tcp_dport[0x1]; + u8 inner_tcp_flags[0x1]; + u8 reserved_7[0x9]; + + u8 reserved_8[0x40]; +}; + +struct mlx5_ifc_flow_table_prop_layout_bits { + u8 ft_support[0x1]; + u8 reserved_0[0x1f]; + + u8 reserved_1[0x2]; + u8 log_max_ft_size[0x6]; + u8 reserved_2[0x10]; + u8 max_ft_level[0x8]; + + u8 reserved_3[0x20]; + + u8 reserved_4[0x18]; + u8 log_max_ft_num[0x8]; + + u8 reserved_5[0x18]; + u8 log_max_destination[0x8]; + + u8 reserved_6[0x18]; + u8 log_max_flow[0x8]; + + u8 reserved_7[0x40]; + + struct mlx5_ifc_flow_table_fields_supported_bits ft_field_support; + + struct mlx5_ifc_flow_table_fields_supported_bits ft_field_bitmask_support; +}; + +struct mlx5_ifc_odp_per_transport_service_cap_bits { + u8 send[0x1]; + u8 receive[0x1]; + u8 write[0x1]; + u8 read[0x1]; + u8 reserved_0[0x1]; + u8 srq_receive[0x1]; + u8 reserved_1[0x1a]; +}; + +struct mlx5_ifc_fte_match_set_lyr_2_4_bits { + u8 
smac_47_16[0x20]; + + u8 smac_15_0[0x10]; + u8 ethertype[0x10]; + + u8 dmac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 first_prio[0x3]; + u8 first_cfi[0x1]; + u8 first_vid[0xc]; + + u8 ip_protocol[0x8]; + u8 ip_dscp[0x6]; + u8 ip_ecn[0x2]; + u8 vlan_tag[0x1]; + u8 reserved_0[0x1]; + u8 frag[0x1]; + u8 reserved_1[0x4]; + u8 tcp_flags[0x9]; + + u8 tcp_sport[0x10]; + u8 tcp_dport[0x10]; + + u8 reserved_2[0x20]; + + u8 udp_sport[0x10]; + u8 udp_dport[0x10]; + + u8 src_ip[4][0x20]; + + u8 dst_ip[4][0x20]; +}; + +struct mlx5_ifc_fte_match_set_misc_bits { + u8 reserved_0[0x20]; + + u8 reserved_1[0x10]; + u8 source_port[0x10]; + + u8 outer_second_prio[0x3]; + u8 outer_second_cfi[0x1]; + u8 outer_second_vid[0xc]; + u8 inner_second_prio[0x3]; + u8 inner_second_cfi[0x1]; + u8 inner_second_vid[0xc]; + + u8 outer_second_vlan_tag[0x1]; + u8 inner_second_vlan_tag[0x1]; + u8 reserved_2[0xe]; + u8 gre_protocol[0x10]; + + u8 gre_key_h[0x18]; + u8 gre_key_l[0x8]; + + u8 vxlan_vni[0x18]; + u8 reserved_3[0x8]; + + u8 reserved_4[0x20]; + + u8 reserved_5[0xc]; + u8 outer_ipv6_flow_label[0x14]; + + u8 reserved_6[0xc]; + u8 inner_ipv6_flow_label[0x14]; + + u8 reserved_7[0xe0]; +}; + +struct mlx5_ifc_cmd_pas_bits { + u8 pa_h[0x20]; + + u8 pa_l[0x14]; + u8 reserved_0[0xc]; +}; + +struct mlx5_ifc_uint64_bits { + u8 hi[0x20]; + + u8 lo[0x20]; +}; + +enum { + MLX5_ADS_STAT_RATE_NO_LIMIT = 0x0, + MLX5_ADS_STAT_RATE_2_5GBPS = 0x7, + MLX5_ADS_STAT_RATE_10GBPS = 0x8, + MLX5_ADS_STAT_RATE_30GBPS = 0x9, + MLX5_ADS_STAT_RATE_5GBPS = 0xa, + MLX5_ADS_STAT_RATE_20GBPS = 0xb, + MLX5_ADS_STAT_RATE_40GBPS = 0xc, + MLX5_ADS_STAT_RATE_60GBPS = 0xd, + MLX5_ADS_STAT_RATE_80GBPS = 0xe, + MLX5_ADS_STAT_RATE_120GBPS = 0xf, +}; + +struct mlx5_ifc_ads_bits { + u8 fl[0x1]; + u8 free_ar[0x1]; + u8 reserved_0[0xe]; + u8 pkey_index[0x10]; + + u8 reserved_1[0x8]; + u8 grh[0x1]; + u8 mlid[0x7]; + u8 rlid[0x10]; + + u8 ack_timeout[0x5]; + u8 reserved_2[0x3]; + u8 src_addr_index[0x8]; + u8 reserved_3[0x4]; + u8 
stat_rate[0x4]; + u8 hop_limit[0x8]; + + u8 reserved_4[0x4]; + u8 tclass[0x8]; + u8 flow_label[0x14]; + + u8 rgid_rip[16][0x8]; + + u8 reserved_5[0x4]; + u8 f_dscp[0x1]; + u8 f_ecn[0x1]; + u8 reserved_6[0x1]; + u8 f_eth_prio[0x1]; + u8 ecn[0x2]; + u8 dscp[0x6]; + u8 udp_sport[0x10]; + + u8 dei_cfi[0x1]; + u8 eth_prio[0x3]; + u8 sl[0x4]; + u8 port[0x8]; + u8 rmac_47_32[0x10]; + + u8 rmac_31_0[0x20]; +}; + +struct mlx5_ifc_flow_table_nic_cap_bits { + u8 reserved_0[0x200]; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive; + + u8 reserved_1[0x200]; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive_sniffer; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit; + + u8 reserved_2[0x200]; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit_sniffer; + + u8 reserved_3[0x7200]; +}; + +struct mlx5_ifc_per_protocol_networking_offload_caps_bits { + u8 csum_cap[0x1]; + u8 vlan_cap[0x1]; + u8 lro_cap[0x1]; + u8 lro_psh_flag[0x1]; + u8 lro_time_stamp[0x1]; + u8 reserved_0[0x6]; + u8 max_lso_cap[0x5]; + u8 reserved_1[0x4]; + u8 rss_ind_tbl_cap[0x4]; + u8 reserved_2[0x3]; + u8 tunnel_lso_const_out_ip_id[0x1]; + u8 reserved_3[0x2]; + u8 tunnel_statless_gre[0x1]; + u8 tunnel_stateless_vxlan[0x1]; + + u8 reserved_4[0x20]; + + u8 reserved_5[0x10]; + u8 lro_min_mss_size[0x10]; + + u8 reserved_6[0x120]; + + u8 lro_timer_supported_periods[4][0x20]; + + u8 reserved_7[0x600]; +}; + +struct mlx5_ifc_roce_cap_bits { + u8 roce_apm[0x1]; + u8 reserved_0[0x1f]; + + u8 reserved_1[0x60]; + + u8 reserved_2[0xc]; + u8 l3_type[0x4]; + u8 reserved_3[0x8]; + u8 roce_version[0x8]; + + u8 reserved_4[0x10]; + u8 r_roce_dest_udp_port[0x10]; + + u8 r_roce_max_src_udp_port[0x10]; + u8 r_roce_min_src_udp_port[0x10]; + + u8 reserved_5[0x10]; + u8 roce_address_table_size[0x10]; + + u8 reserved_6[0x700]; +}; + +enum { + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_1_BYTE = 0x0, + 
MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_2_BYTES = 0x2, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_4_BYTES = 0x4, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_8_BYTES = 0x8, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_16_BYTES = 0x10, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_32_BYTES = 0x20, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_64_BYTES = 0x40, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_128_BYTES = 0x80, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_256_BYTES = 0x100, +}; + +enum { + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_1_BYTE = 0x1, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_2_BYTES = 0x2, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_4_BYTES = 0x4, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_8_BYTES = 0x8, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_16_BYTES = 0x10, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_32_BYTES = 0x20, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_64_BYTES = 0x40, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_128_BYTES = 0x80, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_256_BYTES = 0x100, +}; + +struct mlx5_ifc_atomic_caps_bits { + u8 reserved_0[0x40]; + + u8 atomic_req_endianness[0x1]; + u8 reserved_1[0x1f]; + + u8 reserved_2[0x20]; + + u8 reserved_3[0x10]; + u8 atomic_operations[0x10]; + + u8 reserved_4[0x10]; + u8 atomic_size_qp[0x10]; + + u8 reserved_5[0x10]; + u8 atomic_size_dc[0x10]; + + u8 reserved_6[0x720]; +}; + +struct mlx5_ifc_odp_cap_bits { + u8 reserved_0[0x40]; + + u8 sig[0x1]; + u8 reserved_1[0x1f]; + + u8 reserved_2[0x20]; + + struct mlx5_ifc_odp_per_transport_service_cap_bits rc_odp_caps; + + struct mlx5_ifc_odp_per_transport_service_cap_bits uc_odp_caps; + + struct mlx5_ifc_odp_per_transport_service_cap_bits ud_odp_caps; + + u8 reserved_3[0x720]; +}; + +enum { + MLX5_WQ_TYPE_LINKED_LIST = 0x0, + MLX5_WQ_TYPE_CYCLIC = 0x1, + MLX5_WQ_TYPE_STRQ = 0x2, +}; + +enum { + MLX5_WQ_END_PAD_MODE_NONE = 0x0, + MLX5_WQ_END_PAD_MODE_ALIGN = 0x1, +}; + +enum { + MLX5_CMD_HCA_CAP_GID_TABLE_SIZE_8_GID_ENTRIES = 0x0, + MLX5_CMD_HCA_CAP_GID_TABLE_SIZE_16_GID_ENTRIES = 0x1, + MLX5_CMD_HCA_CAP_GID_TABLE_SIZE_32_GID_ENTRIES = 0x2, + MLX5_CMD_HCA_CAP_GID_TABLE_SIZE_64_GID_ENTRIES = 0x3, + 
MLX5_CMD_HCA_CAP_GID_TABLE_SIZE_128_GID_ENTRIES = 0x4, +}; + +enum { + MLX5_CMD_HCA_CAP_PKEY_TABLE_SIZE_128_ENTRIES = 0x0, + MLX5_CMD_HCA_CAP_PKEY_TABLE_SIZE_256_ENTRIES = 0x1, + MLX5_CMD_HCA_CAP_PKEY_TABLE_SIZE_512_ENTRIES = 0x2, + MLX5_CMD_HCA_CAP_PKEY_TABLE_SIZE_1K_ENTRIES = 0x3, + MLX5_CMD_HCA_CAP_PKEY_TABLE_SIZE_2K_ENTRIES = 0x4, + MLX5_CMD_HCA_CAP_PKEY_TABLE_SIZE_4K_ENTRIES = 0x5, +}; + +enum { + MLX5_CMD_HCA_CAP_PORT_TYPE_IB = 0x0, + MLX5_CMD_HCA_CAP_PORT_TYPE_ETHERNET = 0x1, +}; + +enum { + MLX5_CMD_HCA_CAP_CMDIF_CHECKSUM_DISABLED = 0x0, + MLX5_CMD_HCA_CAP_CMDIF_CHECKSUM_INITIAL_STATE = 0x1, + MLX5_CMD_HCA_CAP_CMDIF_CHECKSUM_ENABLED = 0x3, +}; + +enum { + MLX5_CAP_PORT_TYPE_IB = 0x0, + MLX5_CAP_PORT_TYPE_ETH = 0x1, }; struct mlx5_ifc_cmd_hca_cap_bits { @@ -148,9 +611,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_1[0xb]; u8 log_max_qp[0x5]; - u8 log_max_strq_sz[0x8]; - u8 reserved_2[0x3]; - u8 log_max_srqs[0x5]; + u8 reserved_2[0xb]; + u8 log_max_srq[0x5]; u8 reserved_3[0x10]; u8 reserved_4[0x8]; @@ -185,165 +647,6123 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 pad_cap[0x1]; u8 cc_query_allowed[0x1]; u8 cc_modify_allowed[0x1]; - u8 reserved_15[0x1d]; + u8 reserved_15[0xd]; + u8 gid_table_size[0x10]; - u8 reserved_16[0x6]; + u8 out_of_seq_cnt[0x1]; + u8 vport_counters[0x1]; + u8 reserved_16[0x4]; u8 max_qp_cnt[0xa]; u8 pkey_table_size[0x10]; - u8 eswitch_owner[0x1]; - u8 reserved_17[0xa]; + u8 vport_group_manager[0x1]; + u8 vhca_group_manager[0x1]; + u8 ib_virt[0x1]; + u8 eth_virt[0x1]; + u8 reserved_17[0x1]; + u8 ets[0x1]; + u8 nic_flow_table[0x1]; + u8 reserved_18[0x4]; u8 local_ca_ack_delay[0x5]; - u8 reserved_18[0x8]; + u8 reserved_19[0x6]; + u8 port_type[0x2]; u8 num_ports[0x8]; - u8 reserved_19[0x3]; + u8 reserved_20[0x3]; u8 log_max_msg[0x5]; - u8 reserved_20[0x18]; + u8 reserved_21[0x18]; u8 stat_rate_support[0x10]; - u8 reserved_21[0x10]; + u8 reserved_22[0xc]; + u8 cqe_version[0x4]; - u8 reserved_22[0x10]; + u8 compact_address_vector[0x1]; + u8 
reserved_23[0xe]; + u8 drain_sigerr[0x1]; u8 cmdif_checksum[0x2]; u8 sigerr_cqe[0x1]; - u8 reserved_23[0x1]; + u8 reserved_24[0x1]; u8 wq_signature[0x1]; u8 sctr_data_cqe[0x1]; - u8 reserved_24[0x1]; + u8 reserved_25[0x1]; u8 sho[0x1]; u8 tph[0x1]; u8 rf[0x1]; - u8 dc[0x1]; - u8 reserved_25[0x2]; + u8 dct[0x1]; + u8 reserved_26[0x1]; + u8 eth_net_offloads[0x1]; u8 roce[0x1]; u8 atomic[0x1]; - u8 rsz_srq[0x1]; + u8 reserved_27[0x1]; u8 cq_oi[0x1]; u8 cq_resize[0x1]; u8 cq_moderation[0x1]; - u8 sniffer_rule_flow[0x1]; - u8 sniffer_rule_vport[0x1]; - u8 sniffer_rule_phy[0x1]; - u8 reserved_26[0x1]; + u8 reserved_28[0x3]; + u8 cq_eq_remap[0x1]; u8 pg[0x1]; u8 block_lb_mc[0x1]; - u8 reserved_27[0x3]; + u8 reserved_29[0x1]; + u8 scqe_break_moderation[0x1]; + u8 reserved_30[0x1]; u8 cd[0x1]; - u8 reserved_28[0x1]; + u8 reserved_31[0x1]; u8 apm[0x1]; - u8 reserved_29[0x7]; + u8 reserved_32[0x7]; u8 qkv[0x1]; u8 pkv[0x1]; - u8 reserved_30[0x4]; + u8 reserved_33[0x4]; u8 xrc[0x1]; u8 ud[0x1]; u8 uc[0x1]; u8 rc[0x1]; - u8 reserved_31[0xa]; + u8 reserved_34[0xa]; u8 uar_sz[0x6]; - u8 reserved_32[0x8]; + u8 reserved_35[0x8]; u8 log_pg_sz[0x8]; u8 bf[0x1]; - u8 reserved_33[0xa]; + u8 reserved_36[0x1]; + u8 pad_tx_eth_packet[0x1]; + u8 reserved_37[0x8]; u8 log_bf_reg_size[0x5]; - u8 reserved_34[0x10]; + u8 reserved_38[0x10]; - u8 reserved_35[0x10]; + u8 reserved_39[0x10]; u8 max_wqe_sz_sq[0x10]; - u8 reserved_36[0x10]; + u8 reserved_40[0x10]; u8 max_wqe_sz_rq[0x10]; - u8 reserved_37[0x10]; + u8 reserved_41[0x10]; u8 max_wqe_sz_sq_dc[0x10]; - u8 reserved_38[0x7]; + u8 reserved_42[0x7]; u8 max_qp_mcg[0x19]; - u8 reserved_39[0x18]; + u8 reserved_43[0x18]; u8 log_max_mcg[0x8]; - u8 reserved_40[0xb]; + u8 reserved_44[0x3]; + u8 log_max_transport_domain[0x5]; + u8 reserved_45[0x3]; u8 log_max_pd[0x5]; - u8 reserved_41[0xb]; + u8 reserved_46[0xb]; u8 log_max_xrcd[0x5]; - u8 reserved_42[0x20]; + u8 reserved_47[0x20]; - u8 reserved_43[0x3]; + u8 reserved_48[0x3]; u8 log_max_rq[0x5]; - u8 
reserved_44[0x3]; + u8 reserved_49[0x3]; u8 log_max_sq[0x5]; - u8 reserved_45[0x3]; + u8 reserved_50[0x3]; u8 log_max_tir[0x5]; - u8 reserved_46[0x3]; + u8 reserved_51[0x3]; u8 log_max_tis[0x5]; - u8 reserved_47[0x13]; - u8 log_max_rq_per_tir[0x5]; - u8 reserved_48[0x3]; + u8 basic_cyclic_rcv_wqe[0x1]; + u8 reserved_52[0x2]; + u8 log_max_rmp[0x5]; + u8 reserved_53[0x3]; + u8 log_max_rqt[0x5]; + u8 reserved_54[0x3]; + u8 log_max_rqt_size[0x5]; + u8 reserved_55[0x3]; u8 log_max_tis_per_sq[0x5]; - u8 reserved_49[0xe0]; + u8 reserved_56[0x3]; + u8 log_max_stride_sz_rq[0x5]; + u8 reserved_57[0x3]; + u8 log_min_stride_sz_rq[0x5]; + u8 reserved_58[0x3]; + u8 log_max_stride_sz_sq[0x5]; + u8 reserved_59[0x3]; + u8 log_min_stride_sz_sq[0x5]; + + u8 reserved_60[0x1b]; + u8 log_max_wq_sz[0x5]; + + u8 reserved_61[0xa0]; - u8 reserved_50[0x10]; + u8 reserved_62[0x3]; + u8 log_max_l2_table[0x5]; + u8 reserved_63[0x8]; u8 log_uar_page_sz[0x10]; - u8 reserved_51[0x100]; + u8 reserved_64[0x100]; - u8 reserved_52[0x1f]; + u8 reserved_65[0x1f]; u8 cqe_zip[0x1]; u8 cqe_zip_timeout[0x10]; u8 cqe_zip_max_num[0x10]; - u8 reserved_53[0x220]; + u8 reserved_66[0x220]; }; -struct mlx5_ifc_set_hca_cap_in_bits { - u8 opcode[0x10]; - u8 reserved_0[0x10]; +enum { + MLX5_DEST_FORMAT_STRUCT_DESTINATION_TYPE_FLOW_TABLE_ = 0x1, + MLX5_DEST_FORMAT_STRUCT_DESTINATION_TYPE_TIR = 0x2, +}; - u8 reserved_1[0x10]; - u8 op_mod[0x10]; +struct mlx5_ifc_dest_format_struct_bits { + u8 destination_type[0x8]; + u8 destination_id[0x18]; - u8 reserved_2[0x40]; + u8 reserved_0[0x20]; +}; + +struct mlx5_ifc_fte_match_param_bits { + struct mlx5_ifc_fte_match_set_lyr_2_4_bits outer_headers; + + struct mlx5_ifc_fte_match_set_misc_bits misc_parameters; + + struct mlx5_ifc_fte_match_set_lyr_2_4_bits inner_headers; - struct mlx5_ifc_cmd_hca_cap_bits hca_capability_struct; + u8 reserved_0[0xa00]; }; -struct mlx5_ifc_query_hca_cap_in_bits { - u8 opcode[0x10]; - u8 reserved_0[0x10]; +enum { + 
MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_SRC_IP = 0x0, + MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_DST_IP = 0x1, + MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_L4_SPORT = 0x2, + MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_L4_DPORT = 0x3, + MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_IPSEC_SPI = 0x4, +}; - u8 reserved_1[0x10]; - u8 op_mod[0x10]; +struct mlx5_ifc_rx_hash_field_select_bits { + u8 l3_prot_type[0x1]; + u8 l4_prot_type[0x1]; + u8 selected_fields[0x1e]; +}; - u8 reserved_2[0x40]; +enum { + MLX5_WQ_WQ_TYPE_WQ_LINKED_LIST = 0x0, + MLX5_WQ_WQ_TYPE_WQ_CYCLIC = 0x1, }; -struct mlx5_ifc_query_hca_cap_out_bits { - u8 status[0x8]; +enum { + MLX5_WQ_END_PADDING_MODE_END_PAD_NONE = 0x0, + MLX5_WQ_END_PADDING_MODE_END_PAD_ALIGN = 0x1, +}; + +struct mlx5_ifc_wq_bits { + u8 wq_type[0x4]; + u8 wq_signature[0x1]; + u8 end_padding_mode[0x2]; + u8 cd_slave[0x1]; u8 reserved_0[0x18]; - u8 syndrome[0x20]; + u8 hds_skip_first_sge[0x1]; + u8 log2_hds_buf_size[0x3]; + u8 reserved_1[0x7]; + u8 page_offset[0x5]; + u8 lwm[0x10]; - u8 reserved_1[0x40]; + u8 reserved_2[0x8]; + u8 pd[0x18]; + + u8 reserved_3[0x8]; + u8 uar_page[0x18]; + + u8 dbr_addr[0x40]; + + u8 hw_counter[0x20]; + + u8 sw_counter[0x20]; + + u8 reserved_4[0xc]; + u8 log_wq_stride[0x4]; + u8 reserved_5[0x3]; + u8 log_wq_pg_sz[0x5]; + u8 reserved_6[0x3]; + u8 log_wq_sz[0x5]; + + u8 reserved_7[0x4e0]; - u8 capability_struct[256][0x8]; + struct mlx5_ifc_cmd_pas_bits pas[0]; }; -struct mlx5_ifc_set_hca_cap_out_bits { - u8 status[0x8]; - u8 reserved_0[0x18]; +struct mlx5_ifc_rq_num_bits { + u8 reserved_0[0x8]; + u8 rq_num[0x18]; +}; - u8 syndrome[0x20]; +struct mlx5_ifc_mac_address_layout_bits { + u8 reserved_0[0x10]; + u8 mac_addr_47_32[0x10]; - u8 reserved_1[0x40]; + u8 mac_addr_31_0[0x20]; +}; + +struct mlx5_ifc_cong_control_r_roce_ecn_np_bits { + u8 reserved_0[0xa0]; + + u8 min_time_between_cnps[0x20]; + + u8 reserved_1[0x12]; + u8 cnp_dscp[0x6]; + u8 reserved_2[0x5]; + u8 cnp_802p_prio[0x3]; + + u8 reserved_3[0x720]; +}; 
+ +struct mlx5_ifc_cong_control_r_roce_ecn_rp_bits { + u8 reserved_0[0x60]; + + u8 reserved_1[0x4]; + u8 clamp_tgt_rate[0x1]; + u8 reserved_2[0x3]; + u8 clamp_tgt_rate_after_time_inc[0x1]; + u8 reserved_3[0x17]; + + u8 reserved_4[0x20]; + + u8 rpg_time_reset[0x20]; + + u8 rpg_byte_reset[0x20]; + + u8 rpg_threshold[0x20]; + + u8 rpg_max_rate[0x20]; + + u8 rpg_ai_rate[0x20]; + + u8 rpg_hai_rate[0x20]; + + u8 rpg_gd[0x20]; + + u8 rpg_min_dec_fac[0x20]; + + u8 rpg_min_rate[0x20]; + + u8 reserved_5[0xe0]; + + u8 rate_to_set_on_first_cnp[0x20]; + + u8 dce_tcp_g[0x20]; + + u8 dce_tcp_rtt[0x20]; + + u8 rate_reduce_monitor_period[0x20]; + + u8 reserved_6[0x20]; + + u8 initial_alpha_value[0x20]; + + u8 reserved_7[0x4a0]; +}; + +struct mlx5_ifc_cong_control_802_1qau_rp_bits { + u8 reserved_0[0x80]; + + u8 rppp_max_rps[0x20]; + + u8 rpg_time_reset[0x20]; + + u8 rpg_byte_reset[0x20]; + + u8 rpg_threshold[0x20]; + + u8 rpg_max_rate[0x20]; + + u8 rpg_ai_rate[0x20]; + + u8 rpg_hai_rate[0x20]; + + u8 rpg_gd[0x20]; + + u8 rpg_min_dec_fac[0x20]; + + u8 rpg_min_rate[0x20]; + + u8 reserved_1[0x640]; +}; + +enum { + MLX5_RESIZE_FIELD_SELECT_RESIZE_FIELD_SELECT_LOG_CQ_SIZE = 0x1, + MLX5_RESIZE_FIELD_SELECT_RESIZE_FIELD_SELECT_PAGE_OFFSET = 0x2, + MLX5_RESIZE_FIELD_SELECT_RESIZE_FIELD_SELECT_LOG_PAGE_SIZE = 0x4, +}; + +struct mlx5_ifc_resize_field_select_bits { + u8 resize_field_select[0x20]; +}; + +enum { + MLX5_MODIFY_FIELD_SELECT_MODIFY_FIELD_SELECT_CQ_PERIOD = 0x1, + MLX5_MODIFY_FIELD_SELECT_MODIFY_FIELD_SELECT_CQ_MAX_COUNT = 0x2, + MLX5_MODIFY_FIELD_SELECT_MODIFY_FIELD_SELECT_OI = 0x4, + MLX5_MODIFY_FIELD_SELECT_MODIFY_FIELD_SELECT_C_EQN = 0x8, +}; + +struct mlx5_ifc_modify_field_select_bits { + u8 modify_field_select[0x20]; +}; + +struct mlx5_ifc_field_select_r_roce_np_bits { + u8 field_select_r_roce_np[0x20]; +}; + +struct mlx5_ifc_field_select_r_roce_rp_bits { + u8 field_select_r_roce_rp[0x20]; +}; + +enum { + MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPPP_MAX_RPS = 
0x4, + MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_TIME_RESET = 0x8, + MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_BYTE_RESET = 0x10, + MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_THRESHOLD = 0x20, + MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_MAX_RATE = 0x40, + MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_AI_RATE = 0x80, + MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_HAI_RATE = 0x100, + MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_GD = 0x200, + MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_MIN_DEC_FAC = 0x400, + MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_MIN_RATE = 0x800, +}; + +struct mlx5_ifc_field_select_802_1qau_rp_bits { + u8 field_select_8021qaurp[0x20]; +}; + +struct mlx5_ifc_phys_layer_cntrs_bits { + u8 time_since_last_clear_high[0x20]; + + u8 time_since_last_clear_low[0x20]; + + u8 symbol_errors_high[0x20]; + + u8 symbol_errors_low[0x20]; + + u8 sync_headers_errors_high[0x20]; + + u8 sync_headers_errors_low[0x20]; + + u8 edpl_bip_errors_lane0_high[0x20]; + + u8 edpl_bip_errors_lane0_low[0x20]; + + u8 edpl_bip_errors_lane1_high[0x20]; + + u8 edpl_bip_errors_lane1_low[0x20]; + + u8 edpl_bip_errors_lane2_high[0x20]; + + u8 edpl_bip_errors_lane2_low[0x20]; + + u8 edpl_bip_errors_lane3_high[0x20]; + + u8 edpl_bip_errors_lane3_low[0x20]; + + u8 fc_fec_corrected_blocks_lane0_high[0x20]; + + u8 fc_fec_corrected_blocks_lane0_low[0x20]; + + u8 fc_fec_corrected_blocks_lane1_high[0x20]; + + u8 fc_fec_corrected_blocks_lane1_low[0x20]; + + u8 fc_fec_corrected_blocks_lane2_high[0x20]; + + u8 fc_fec_corrected_blocks_lane2_low[0x20]; + + u8 fc_fec_corrected_blocks_lane3_high[0x20]; + + u8 fc_fec_corrected_blocks_lane3_low[0x20]; + + u8 fc_fec_uncorrectable_blocks_lane0_high[0x20]; + + u8 fc_fec_uncorrectable_blocks_lane0_low[0x20]; + + u8 fc_fec_uncorrectable_blocks_lane1_high[0x20]; + + u8 fc_fec_uncorrectable_blocks_lane1_low[0x20]; + + u8 
fc_fec_uncorrectable_blocks_lane2_high[0x20]; + + u8 fc_fec_uncorrectable_blocks_lane2_low[0x20]; + + u8 fc_fec_uncorrectable_blocks_lane3_high[0x20]; + + u8 fc_fec_uncorrectable_blocks_lane3_low[0x20]; + + u8 rs_fec_corrected_blocks_high[0x20]; + + u8 rs_fec_corrected_blocks_low[0x20]; + + u8 rs_fec_uncorrectable_blocks_high[0x20]; + + u8 rs_fec_uncorrectable_blocks_low[0x20]; + + u8 rs_fec_no_errors_blocks_high[0x20]; + + u8 rs_fec_no_errors_blocks_low[0x20]; + + u8 rs_fec_single_error_blocks_high[0x20]; + + u8 rs_fec_single_error_blocks_low[0x20]; + + u8 rs_fec_corrected_symbols_total_high[0x20]; + + u8 rs_fec_corrected_symbols_total_low[0x20]; + + u8 rs_fec_corrected_symbols_lane0_high[0x20]; + + u8 rs_fec_corrected_symbols_lane0_low[0x20]; + + u8 rs_fec_corrected_symbols_lane1_high[0x20]; + + u8 rs_fec_corrected_symbols_lane1_low[0x20]; + + u8 rs_fec_corrected_symbols_lane2_high[0x20]; + + u8 rs_fec_corrected_symbols_lane2_low[0x20]; + + u8 rs_fec_corrected_symbols_lane3_high[0x20]; + + u8 rs_fec_corrected_symbols_lane3_low[0x20]; + + u8 link_down_events[0x20]; + + u8 successful_recovery_events[0x20]; + + u8 reserved_0[0x180]; +}; + +struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits { + u8 transmit_queue_high[0x20]; + + u8 transmit_queue_low[0x20]; + + u8 reserved_0[0x780]; +}; + +struct mlx5_ifc_eth_per_prio_grp_data_layout_bits { + u8 rx_octets_high[0x20]; + + u8 rx_octets_low[0x20]; + + u8 reserved_0[0xc0]; + + u8 rx_frames_high[0x20]; + + u8 rx_frames_low[0x20]; + + u8 tx_octets_high[0x20]; + + u8 tx_octets_low[0x20]; + + u8 reserved_1[0xc0]; + + u8 tx_frames_high[0x20]; + + u8 tx_frames_low[0x20]; + + u8 rx_pause_high[0x20]; + + u8 rx_pause_low[0x20]; + + u8 rx_pause_duration_high[0x20]; + + u8 rx_pause_duration_low[0x20]; + + u8 tx_pause_high[0x20]; + + u8 tx_pause_low[0x20]; + + u8 tx_pause_duration_high[0x20]; + + u8 tx_pause_duration_low[0x20]; + + u8 rx_pause_transition_high[0x20]; + + u8 rx_pause_transition_low[0x20]; + + u8 reserved_2[0x400]; 
+}; + +struct mlx5_ifc_eth_extended_cntrs_grp_data_layout_bits { + u8 port_transmit_wait_high[0x20]; + + u8 port_transmit_wait_low[0x20]; + + u8 reserved_0[0x780]; +}; + +struct mlx5_ifc_eth_3635_cntrs_grp_data_layout_bits { + u8 dot3stats_alignment_errors_high[0x20]; + + u8 dot3stats_alignment_errors_low[0x20]; + + u8 dot3stats_fcs_errors_high[0x20]; + + u8 dot3stats_fcs_errors_low[0x20]; + + u8 dot3stats_single_collision_frames_high[0x20]; + + u8 dot3stats_single_collision_frames_low[0x20]; + + u8 dot3stats_multiple_collision_frames_high[0x20]; + + u8 dot3stats_multiple_collision_frames_low[0x20]; + + u8 dot3stats_sqe_test_errors_high[0x20]; + + u8 dot3stats_sqe_test_errors_low[0x20]; + + u8 dot3stats_deferred_transmissions_high[0x20]; + + u8 dot3stats_deferred_transmissions_low[0x20]; + + u8 dot3stats_late_collisions_high[0x20]; + + u8 dot3stats_late_collisions_low[0x20]; + + u8 dot3stats_excessive_collisions_high[0x20]; + + u8 dot3stats_excessive_collisions_low[0x20]; + + u8 dot3stats_internal_mac_transmit_errors_high[0x20]; + + u8 dot3stats_internal_mac_transmit_errors_low[0x20]; + + u8 dot3stats_carrier_sense_errors_high[0x20]; + + u8 dot3stats_carrier_sense_errors_low[0x20]; + + u8 dot3stats_frame_too_longs_high[0x20]; + + u8 dot3stats_frame_too_longs_low[0x20]; + + u8 dot3stats_internal_mac_receive_errors_high[0x20]; + + u8 dot3stats_internal_mac_receive_errors_low[0x20]; + + u8 dot3stats_symbol_errors_high[0x20]; + + u8 dot3stats_symbol_errors_low[0x20]; + + u8 dot3control_in_unknown_opcodes_high[0x20]; + + u8 dot3control_in_unknown_opcodes_low[0x20]; + + u8 dot3in_pause_frames_high[0x20]; + + u8 dot3in_pause_frames_low[0x20]; + + u8 dot3out_pause_frames_high[0x20]; + + u8 dot3out_pause_frames_low[0x20]; + + u8 reserved_0[0x3c0]; +}; + +struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits { + u8 ether_stats_drop_events_high[0x20]; + + u8 ether_stats_drop_events_low[0x20]; + + u8 ether_stats_octets_high[0x20]; + + u8 ether_stats_octets_low[0x20]; + + u8 
ether_stats_pkts_high[0x20]; + + u8 ether_stats_pkts_low[0x20]; + + u8 ether_stats_broadcast_pkts_high[0x20]; + + u8 ether_stats_broadcast_pkts_low[0x20]; + + u8 ether_stats_multicast_pkts_high[0x20]; + + u8 ether_stats_multicast_pkts_low[0x20]; + + u8 ether_stats_crc_align_errors_high[0x20]; + + u8 ether_stats_crc_align_errors_low[0x20]; + + u8 ether_stats_undersize_pkts_high[0x20]; + + u8 ether_stats_undersize_pkts_low[0x20]; + + u8 ether_stats_oversize_pkts_high[0x20]; + + u8 ether_stats_oversize_pkts_low[0x20]; + + u8 ether_stats_fragments_high[0x20]; + + u8 ether_stats_fragments_low[0x20]; + + u8 ether_stats_jabbers_high[0x20]; + + u8 ether_stats_jabbers_low[0x20]; + + u8 ether_stats_collisions_high[0x20]; + + u8 ether_stats_collisions_low[0x20]; + + u8 ether_stats_pkts64octets_high[0x20]; + + u8 ether_stats_pkts64octets_low[0x20]; + + u8 ether_stats_pkts65to127octets_high[0x20]; + + u8 ether_stats_pkts65to127octets_low[0x20]; + + u8 ether_stats_pkts128to255octets_high[0x20]; + + u8 ether_stats_pkts128to255octets_low[0x20]; + + u8 ether_stats_pkts256to511octets_high[0x20]; + + u8 ether_stats_pkts256to511octets_low[0x20]; + + u8 ether_stats_pkts512to1023octets_high[0x20]; + + u8 ether_stats_pkts512to1023octets_low[0x20]; + + u8 ether_stats_pkts1024to1518octets_high[0x20]; + + u8 ether_stats_pkts1024to1518octets_low[0x20]; + + u8 ether_stats_pkts1519to2047octets_high[0x20]; + + u8 ether_stats_pkts1519to2047octets_low[0x20]; + + u8 ether_stats_pkts2048to4095octets_high[0x20]; + + u8 ether_stats_pkts2048to4095octets_low[0x20]; + + u8 ether_stats_pkts4096to8191octets_high[0x20]; + + u8 ether_stats_pkts4096to8191octets_low[0x20]; + + u8 ether_stats_pkts8192to10239octets_high[0x20]; + + u8 ether_stats_pkts8192to10239octets_low[0x20]; + + u8 reserved_0[0x280]; +}; + +struct mlx5_ifc_eth_2863_cntrs_grp_data_layout_bits { + u8 if_in_octets_high[0x20]; + + u8 if_in_octets_low[0x20]; + + u8 if_in_ucast_pkts_high[0x20]; + + u8 if_in_ucast_pkts_low[0x20]; + + u8 
if_in_discards_high[0x20]; + + u8 if_in_discards_low[0x20]; + + u8 if_in_errors_high[0x20]; + + u8 if_in_errors_low[0x20]; + + u8 if_in_unknown_protos_high[0x20]; + + u8 if_in_unknown_protos_low[0x20]; + + u8 if_out_octets_high[0x20]; + + u8 if_out_octets_low[0x20]; + + u8 if_out_ucast_pkts_high[0x20]; + + u8 if_out_ucast_pkts_low[0x20]; + + u8 if_out_discards_high[0x20]; + + u8 if_out_discards_low[0x20]; + + u8 if_out_errors_high[0x20]; + + u8 if_out_errors_low[0x20]; + + u8 if_in_multicast_pkts_high[0x20]; + + u8 if_in_multicast_pkts_low[0x20]; + + u8 if_in_broadcast_pkts_high[0x20]; + + u8 if_in_broadcast_pkts_low[0x20]; + + u8 if_out_multicast_pkts_high[0x20]; + + u8 if_out_multicast_pkts_low[0x20]; + + u8 if_out_broadcast_pkts_high[0x20]; + + u8 if_out_broadcast_pkts_low[0x20]; + + u8 reserved_0[0x480]; +}; + +struct mlx5_ifc_eth_802_3_cntrs_grp_data_layout_bits { + u8 a_frames_transmitted_ok_high[0x20]; + + u8 a_frames_transmitted_ok_low[0x20]; + + u8 a_frames_received_ok_high[0x20]; + + u8 a_frames_received_ok_low[0x20]; + + u8 a_frame_check_sequence_errors_high[0x20]; + + u8 a_frame_check_sequence_errors_low[0x20]; + + u8 a_alignment_errors_high[0x20]; + + u8 a_alignment_errors_low[0x20]; + + u8 a_octets_transmitted_ok_high[0x20]; + + u8 a_octets_transmitted_ok_low[0x20]; + + u8 a_octets_received_ok_high[0x20]; + + u8 a_octets_received_ok_low[0x20]; + + u8 a_multicast_frames_xmitted_ok_high[0x20]; + + u8 a_multicast_frames_xmitted_ok_low[0x20]; + + u8 a_broadcast_frames_xmitted_ok_high[0x20]; + + u8 a_broadcast_frames_xmitted_ok_low[0x20]; + + u8 a_multicast_frames_received_ok_high[0x20]; + + u8 a_multicast_frames_received_ok_low[0x20]; + + u8 a_broadcast_frames_received_ok_high[0x20]; + + u8 a_broadcast_frames_received_ok_low[0x20]; + + u8 a_in_range_length_errors_high[0x20]; + + u8 a_in_range_length_errors_low[0x20]; + + u8 a_out_of_range_length_field_high[0x20]; + + u8 a_out_of_range_length_field_low[0x20]; + + u8 a_frame_too_long_errors_high[0x20]; + + 
u8 a_frame_too_long_errors_low[0x20]; + + u8 a_symbol_error_during_carrier_high[0x20]; + + u8 a_symbol_error_during_carrier_low[0x20]; + + u8 a_mac_control_frames_transmitted_high[0x20]; + + u8 a_mac_control_frames_transmitted_low[0x20]; + + u8 a_mac_control_frames_received_high[0x20]; + + u8 a_mac_control_frames_received_low[0x20]; + + u8 a_unsupported_opcodes_received_high[0x20]; + + u8 a_unsupported_opcodes_received_low[0x20]; + + u8 a_pause_mac_ctrl_frames_received_high[0x20]; + + u8 a_pause_mac_ctrl_frames_received_low[0x20]; + + u8 a_pause_mac_ctrl_frames_transmitted_high[0x20]; + + u8 a_pause_mac_ctrl_frames_transmitted_low[0x20]; + + u8 reserved_0[0x300]; +}; + +struct mlx5_ifc_cmd_inter_comp_event_bits { + u8 command_completion_vector[0x20]; + + u8 reserved_0[0xc0]; +}; + +struct mlx5_ifc_stall_vl_event_bits { + u8 reserved_0[0x18]; + u8 port_num[0x1]; + u8 reserved_1[0x3]; + u8 vl[0x4]; + + u8 reserved_2[0xa0]; +}; + +struct mlx5_ifc_db_bf_congestion_event_bits { + u8 event_subtype[0x8]; + u8 reserved_0[0x8]; + u8 congestion_level[0x8]; + u8 reserved_1[0x8]; + + u8 reserved_2[0xa0]; +}; + +struct mlx5_ifc_gpio_event_bits { + u8 reserved_0[0x60]; + + u8 gpio_event_hi[0x20]; + + u8 gpio_event_lo[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_port_state_change_event_bits { + u8 reserved_0[0x40]; + + u8 port_num[0x4]; + u8 reserved_1[0x1c]; + + u8 reserved_2[0x80]; +}; + +struct mlx5_ifc_dropped_packet_logged_bits { + u8 reserved_0[0xe0]; +}; + +enum { + MLX5_CQ_ERROR_SYNDROME_CQ_OVERRUN = 0x1, + MLX5_CQ_ERROR_SYNDROME_CQ_ACCESS_VIOLATION_ERROR = 0x2, +}; + +struct mlx5_ifc_cq_error_bits { + u8 reserved_0[0x8]; + u8 cqn[0x18]; + + u8 reserved_1[0x20]; + + u8 reserved_2[0x18]; + u8 syndrome[0x8]; + + u8 reserved_3[0x80]; +}; + +struct mlx5_ifc_rdma_page_fault_event_bits { + u8 bytes_committed[0x20]; + + u8 r_key[0x20]; + + u8 reserved_0[0x10]; + u8 packet_len[0x10]; + + u8 rdma_op_len[0x20]; + + u8 rdma_va[0x40]; + + u8 reserved_1[0x5]; + u8 rdma[0x1]; 
+ u8 write[0x1]; + u8 requestor[0x1]; + u8 qp_number[0x18]; +}; + +struct mlx5_ifc_wqe_associated_page_fault_event_bits { + u8 bytes_committed[0x20]; + + u8 reserved_0[0x10]; + u8 wqe_index[0x10]; + + u8 reserved_1[0x10]; + u8 len[0x10]; + + u8 reserved_2[0x60]; + + u8 reserved_3[0x5]; + u8 rdma[0x1]; + u8 write_read[0x1]; + u8 requestor[0x1]; + u8 qpn[0x18]; +}; + +struct mlx5_ifc_qp_events_bits { + u8 reserved_0[0xa0]; + + u8 type[0x8]; + u8 reserved_1[0x18]; + + u8 reserved_2[0x8]; + u8 qpn_rqn_sqn[0x18]; +}; + +struct mlx5_ifc_dct_events_bits { + u8 reserved_0[0xc0]; + + u8 reserved_1[0x8]; + u8 dct_number[0x18]; +}; + +struct mlx5_ifc_comp_event_bits { + u8 reserved_0[0xc0]; + + u8 reserved_1[0x8]; + u8 cq_number[0x18]; +}; + +enum { + MLX5_QPC_STATE_RST = 0x0, + MLX5_QPC_STATE_INIT = 0x1, + MLX5_QPC_STATE_RTR = 0x2, + MLX5_QPC_STATE_RTS = 0x3, + MLX5_QPC_STATE_SQER = 0x4, + MLX5_QPC_STATE_ERR = 0x6, + MLX5_QPC_STATE_SQD = 0x7, + MLX5_QPC_STATE_SUSPENDED = 0x9, +}; + +enum { + MLX5_QPC_ST_RC = 0x0, + MLX5_QPC_ST_UC = 0x1, + MLX5_QPC_ST_UD = 0x2, + MLX5_QPC_ST_XRC = 0x3, + MLX5_QPC_ST_DCI = 0x5, + MLX5_QPC_ST_QP0 = 0x7, + MLX5_QPC_ST_QP1 = 0x8, + MLX5_QPC_ST_RAW_DATAGRAM = 0x9, + MLX5_QPC_ST_REG_UMR = 0xc, +}; + +enum { + MLX5_QPC_PM_STATE_ARMED = 0x0, + MLX5_QPC_PM_STATE_REARM = 0x1, + MLX5_QPC_PM_STATE_RESERVED = 0x2, + MLX5_QPC_PM_STATE_MIGRATED = 0x3, +}; + +enum { + MLX5_QPC_END_PADDING_MODE_SCATTER_AS_IS = 0x0, + MLX5_QPC_END_PADDING_MODE_PAD_TO_CACHE_LINE_ALIGNMENT = 0x1, +}; + +enum { + MLX5_QPC_MTU_256_BYTES = 0x1, + MLX5_QPC_MTU_512_BYTES = 0x2, + MLX5_QPC_MTU_1K_BYTES = 0x3, + MLX5_QPC_MTU_2K_BYTES = 0x4, + MLX5_QPC_MTU_4K_BYTES = 0x5, + MLX5_QPC_MTU_RAW_ETHERNET_QP = 0x7, +}; + +enum { + MLX5_QPC_ATOMIC_MODE_IB_SPEC = 0x1, + MLX5_QPC_ATOMIC_MODE_ONLY_8B = 0x2, + MLX5_QPC_ATOMIC_MODE_UP_TO_8B = 0x3, + MLX5_QPC_ATOMIC_MODE_UP_TO_16B = 0x4, + MLX5_QPC_ATOMIC_MODE_UP_TO_32B = 0x5, + MLX5_QPC_ATOMIC_MODE_UP_TO_64B = 0x6, + MLX5_QPC_ATOMIC_MODE_UP_TO_128B 
= 0x7, + MLX5_QPC_ATOMIC_MODE_UP_TO_256B = 0x8, +}; + +enum { + MLX5_QPC_CS_REQ_DISABLE = 0x0, + MLX5_QPC_CS_REQ_UP_TO_32B = 0x11, + MLX5_QPC_CS_REQ_UP_TO_64B = 0x22, +}; + +enum { + MLX5_QPC_CS_RES_DISABLE = 0x0, + MLX5_QPC_CS_RES_UP_TO_32B = 0x1, + MLX5_QPC_CS_RES_UP_TO_64B = 0x2, +}; + +struct mlx5_ifc_qpc_bits { + u8 state[0x4]; + u8 reserved_0[0x4]; + u8 st[0x8]; + u8 reserved_1[0x3]; + u8 pm_state[0x2]; + u8 reserved_2[0x7]; + u8 end_padding_mode[0x2]; + u8 reserved_3[0x2]; + + u8 wq_signature[0x1]; + u8 block_lb_mc[0x1]; + u8 atomic_like_write_en[0x1]; + u8 latency_sensitive[0x1]; + u8 reserved_4[0x1]; + u8 drain_sigerr[0x1]; + u8 reserved_5[0x2]; + u8 pd[0x18]; + + u8 mtu[0x3]; + u8 log_msg_max[0x5]; + u8 reserved_6[0x1]; + u8 log_rq_size[0x4]; + u8 log_rq_stride[0x3]; + u8 no_sq[0x1]; + u8 log_sq_size[0x4]; + u8 reserved_7[0x6]; + u8 rlky[0x1]; + u8 reserved_8[0x4]; + + u8 counter_set_id[0x8]; + u8 uar_page[0x18]; + + u8 reserved_9[0x8]; + u8 user_index[0x18]; + + u8 reserved_10[0x3]; + u8 log_page_size[0x5]; + u8 remote_qpn[0x18]; + + struct mlx5_ifc_ads_bits primary_address_path; + + struct mlx5_ifc_ads_bits secondary_address_path; + + u8 log_ack_req_freq[0x4]; + u8 reserved_11[0x4]; + u8 log_sra_max[0x3]; + u8 reserved_12[0x2]; + u8 retry_count[0x3]; + u8 rnr_retry[0x3]; + u8 reserved_13[0x1]; + u8 fre[0x1]; + u8 cur_rnr_retry[0x3]; + u8 cur_retry_count[0x3]; + u8 reserved_14[0x5]; + + u8 reserved_15[0x20]; + + u8 reserved_16[0x8]; + u8 next_send_psn[0x18]; + + u8 reserved_17[0x8]; + u8 cqn_snd[0x18]; + + u8 reserved_18[0x40]; + + u8 reserved_19[0x8]; + u8 last_acked_psn[0x18]; + + u8 reserved_20[0x8]; + u8 ssn[0x18]; + + u8 reserved_21[0x8]; + u8 log_rra_max[0x3]; + u8 reserved_22[0x1]; + u8 atomic_mode[0x4]; + u8 rre[0x1]; + u8 rwe[0x1]; + u8 rae[0x1]; + u8 reserved_23[0x1]; + u8 page_offset[0x6]; + u8 reserved_24[0x3]; + u8 cd_slave_receive[0x1]; + u8 cd_slave_send[0x1]; + u8 cd_master[0x1]; + + u8 reserved_25[0x3]; + u8 min_rnr_nak[0x5]; + u8 
next_rcv_psn[0x18]; + + u8 reserved_26[0x8]; + u8 xrcd[0x18]; + + u8 reserved_27[0x8]; + u8 cqn_rcv[0x18]; + + u8 dbr_addr[0x40]; + + u8 q_key[0x20]; + + u8 reserved_28[0x5]; + u8 rq_type[0x3]; + u8 srqn_rmpn[0x18]; + + u8 reserved_29[0x8]; + u8 rmsn[0x18]; + + u8 hw_sq_wqebb_counter[0x10]; + u8 sw_sq_wqebb_counter[0x10]; + + u8 hw_rq_counter[0x20]; + + u8 sw_rq_counter[0x20]; + + u8 reserved_30[0x20]; + + u8 reserved_31[0xf]; + u8 cgs[0x1]; + u8 cs_req[0x8]; + u8 cs_res[0x8]; + + u8 dc_access_key[0x40]; + + u8 reserved_32[0xc0]; +}; + +struct mlx5_ifc_roce_addr_layout_bits { + u8 source_l3_address[16][0x8]; + + u8 reserved_0[0x3]; + u8 vlan_valid[0x1]; + u8 vlan_id[0xc]; + u8 source_mac_47_32[0x10]; + + u8 source_mac_31_0[0x20]; + + u8 reserved_1[0x14]; + u8 roce_l3_type[0x4]; + u8 roce_version[0x8]; + + u8 reserved_2[0x20]; +}; + +union mlx5_ifc_hca_cap_union_bits { + struct mlx5_ifc_cmd_hca_cap_bits cmd_hca_cap; + struct mlx5_ifc_odp_cap_bits odp_cap; + struct mlx5_ifc_atomic_caps_bits atomic_caps; + struct mlx5_ifc_roce_cap_bits roce_cap; + struct mlx5_ifc_per_protocol_networking_offload_caps_bits per_protocol_networking_offload_caps; + struct mlx5_ifc_flow_table_nic_cap_bits flow_table_nic_cap; + u8 reserved_0[0x8000]; +}; + +enum { + MLX5_FLOW_CONTEXT_ACTION_ALLOW = 0x1, + MLX5_FLOW_CONTEXT_ACTION_DROP = 0x2, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST = 0x4, +}; + +struct mlx5_ifc_flow_context_bits { + u8 reserved_0[0x20]; + + u8 group_id[0x20]; + + u8 reserved_1[0x8]; + u8 flow_tag[0x18]; + + u8 reserved_2[0x10]; + u8 action[0x10]; + + u8 reserved_3[0x8]; + u8 destination_list_size[0x18]; + + u8 reserved_4[0x160]; + + struct mlx5_ifc_fte_match_param_bits match_value; + + u8 reserved_5[0x600]; + + struct mlx5_ifc_dest_format_struct_bits destination[0]; +}; + +enum { + MLX5_XRC_SRQC_STATE_GOOD = 0x0, + MLX5_XRC_SRQC_STATE_ERROR = 0x1, +}; + +struct mlx5_ifc_xrc_srqc_bits { + u8 state[0x4]; + u8 log_xrc_srq_size[0x4]; + u8 reserved_0[0x18]; + + u8 wq_signature[0x1]; + 
u8 cont_srq[0x1]; + u8 reserved_1[0x1]; + u8 rlky[0x1]; + u8 basic_cyclic_rcv_wqe[0x1]; + u8 log_rq_stride[0x3]; + u8 xrcd[0x18]; + + u8 page_offset[0x6]; + u8 reserved_2[0x2]; + u8 cqn[0x18]; + + u8 reserved_3[0x20]; + + u8 user_index_equal_xrc_srqn[0x1]; + u8 reserved_4[0x1]; + u8 log_page_size[0x6]; + u8 user_index[0x18]; + + u8 reserved_5[0x20]; + + u8 reserved_6[0x8]; + u8 pd[0x18]; + + u8 lwm[0x10]; + u8 wqe_cnt[0x10]; + + u8 reserved_7[0x40]; + + u8 db_record_addr_h[0x20]; + + u8 db_record_addr_l[0x1e]; + u8 reserved_8[0x2]; + + u8 reserved_9[0x80]; +}; + +struct mlx5_ifc_traffic_counter_bits { + u8 packets[0x40]; + + u8 octets[0x40]; +}; + +struct mlx5_ifc_tisc_bits { + u8 reserved_0[0xc]; + u8 prio[0x4]; + u8 reserved_1[0x10]; + + u8 reserved_2[0x100]; + + u8 reserved_3[0x8]; + u8 transport_domain[0x18]; + + u8 reserved_4[0x3c0]; +}; + +enum { + MLX5_TIRC_DISP_TYPE_DIRECT = 0x0, + MLX5_TIRC_DISP_TYPE_INDIRECT = 0x1, +}; + +enum { + MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO = 0x1, + MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO = 0x2, +}; + +enum { + MLX5_TIRC_RX_HASH_FN_HASH_NONE = 0x0, + MLX5_TIRC_RX_HASH_FN_HASH_INVERTED_XOR8 = 0x1, + MLX5_TIRC_RX_HASH_FN_HASH_TOEPLITZ = 0x2, +}; + +enum { + MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST_ = 0x1, + MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST_ = 0x2, +}; + +struct mlx5_ifc_tirc_bits { + u8 reserved_0[0x20]; + + u8 disp_type[0x4]; + u8 reserved_1[0x1c]; + + u8 reserved_2[0x40]; + + u8 reserved_3[0x4]; + u8 lro_timeout_period_usecs[0x10]; + u8 lro_enable_mask[0x4]; + u8 lro_max_ip_payload_size[0x8]; + + u8 reserved_4[0x40]; + + u8 reserved_5[0x8]; + u8 inline_rqn[0x18]; + + u8 rx_hash_symmetric[0x1]; + u8 reserved_6[0x1]; + u8 tunneled_offload_en[0x1]; + u8 reserved_7[0x5]; + u8 indirect_table[0x18]; + + u8 rx_hash_fn[0x4]; + u8 reserved_8[0x2]; + u8 self_lb_block[0x2]; + u8 transport_domain[0x18]; + + u8 rx_hash_toeplitz_key[10][0x20]; + + struct mlx5_ifc_rx_hash_field_select_bits rx_hash_field_selector_outer; + + struct 
mlx5_ifc_rx_hash_field_select_bits rx_hash_field_selector_inner; + + u8 reserved_9[0x4c0]; +}; + +enum { + MLX5_SRQC_STATE_GOOD = 0x0, + MLX5_SRQC_STATE_ERROR = 0x1, +}; + +struct mlx5_ifc_srqc_bits { + u8 state[0x4]; + u8 log_srq_size[0x4]; + u8 reserved_0[0x18]; + + u8 wq_signature[0x1]; + u8 cont_srq[0x1]; + u8 reserved_1[0x1]; + u8 rlky[0x1]; + u8 reserved_2[0x1]; + u8 log_rq_stride[0x3]; + u8 xrcd[0x18]; + + u8 page_offset[0x6]; + u8 reserved_3[0x2]; + u8 cqn[0x18]; + + u8 reserved_4[0x20]; + + u8 reserved_5[0x2]; + u8 log_page_size[0x6]; + u8 reserved_6[0x18]; + + u8 reserved_7[0x20]; + + u8 reserved_8[0x8]; + u8 pd[0x18]; + + u8 lwm[0x10]; + u8 wqe_cnt[0x10]; + + u8 reserved_9[0x40]; + + u8 db_record_addr_h[0x20]; + + u8 db_record_addr_l[0x1e]; + u8 reserved_10[0x2]; + + u8 reserved_11[0x80]; +}; + +enum { + MLX5_SQC_STATE_RST = 0x0, + MLX5_SQC_STATE_RDY = 0x1, + MLX5_SQC_STATE_ERR = 0x3, +}; + +struct mlx5_ifc_sqc_bits { + u8 rlky[0x1]; + u8 cd_master[0x1]; + u8 fre[0x1]; + u8 flush_in_error_en[0x1]; + u8 reserved_0[0x4]; + u8 state[0x4]; + u8 reserved_1[0x14]; + + u8 reserved_2[0x8]; + u8 user_index[0x18]; + + u8 reserved_3[0x8]; + u8 cqn[0x18]; + + u8 reserved_4[0xa0]; + + u8 tis_lst_sz[0x10]; + u8 reserved_5[0x10]; + + u8 reserved_6[0x40]; + + u8 reserved_7[0x8]; + u8 tis_num_0[0x18]; + + struct mlx5_ifc_wq_bits wq; +}; + +struct mlx5_ifc_rqtc_bits { + u8 reserved_0[0xa0]; + + u8 reserved_1[0x10]; + u8 rqt_max_size[0x10]; + + u8 reserved_2[0x10]; + u8 rqt_actual_size[0x10]; + + u8 reserved_3[0x6a0]; + + struct mlx5_ifc_rq_num_bits rq_num[0]; +}; + +enum { + MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE = 0x0, + MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_RMP = 0x1, +}; + +enum { + MLX5_RQC_STATE_RST = 0x0, + MLX5_RQC_STATE_RDY = 0x1, + MLX5_RQC_STATE_ERR = 0x3, +}; + +struct mlx5_ifc_rqc_bits { + u8 rlky[0x1]; + u8 reserved_0[0x2]; + u8 vsd[0x1]; + u8 mem_rq_type[0x4]; + u8 state[0x4]; + u8 reserved_1[0x1]; + u8 flush_in_error_en[0x1]; + u8 reserved_2[0x12]; + + u8 
reserved_3[0x8]; + u8 user_index[0x18]; + + u8 reserved_4[0x8]; + u8 cqn[0x18]; + + u8 counter_set_id[0x8]; + u8 reserved_5[0x18]; + + u8 reserved_6[0x8]; + u8 rmpn[0x18]; + + u8 reserved_7[0xe0]; + + struct mlx5_ifc_wq_bits wq; +}; + +enum { + MLX5_RMPC_STATE_RDY = 0x1, + MLX5_RMPC_STATE_ERR = 0x3, +}; + +struct mlx5_ifc_rmpc_bits { + u8 reserved_0[0x8]; + u8 state[0x4]; + u8 reserved_1[0x14]; + + u8 basic_cyclic_rcv_wqe[0x1]; + u8 reserved_2[0x1f]; + + u8 reserved_3[0x140]; + + struct mlx5_ifc_wq_bits wq; +}; + +enum { + MLX5_NIC_VPORT_CONTEXT_ALLOWED_LIST_TYPE_CURRENT_UC_MAC_ADDRESS = 0x0, +}; + +struct mlx5_ifc_nic_vport_context_bits { + u8 reserved_0[0x1f]; + u8 roce_en[0x1]; + + u8 reserved_1[0x760]; + + u8 reserved_2[0x5]; + u8 allowed_list_type[0x3]; + u8 reserved_3[0xc]; + u8 allowed_list_size[0xc]; + + struct mlx5_ifc_mac_address_layout_bits permanent_address; + + u8 reserved_4[0x20]; + + u8 current_uc_mac_address[0][0x40]; +}; + +enum { + MLX5_MKC_ACCESS_MODE_PA = 0x0, + MLX5_MKC_ACCESS_MODE_MTT = 0x1, + MLX5_MKC_ACCESS_MODE_KLMS = 0x2, +}; + +struct mlx5_ifc_mkc_bits { + u8 reserved_0[0x1]; + u8 free[0x1]; + u8 reserved_1[0xd]; + u8 small_fence_on_rdma_read_response[0x1]; + u8 umr_en[0x1]; + u8 a[0x1]; + u8 rw[0x1]; + u8 rr[0x1]; + u8 lw[0x1]; + u8 lr[0x1]; + u8 access_mode[0x2]; + u8 reserved_2[0x8]; + + u8 qpn[0x18]; + u8 mkey_7_0[0x8]; + + u8 reserved_3[0x20]; + + u8 length64[0x1]; + u8 bsf_en[0x1]; + u8 sync_umr[0x1]; + u8 reserved_4[0x2]; + u8 expected_sigerr_count[0x1]; + u8 reserved_5[0x1]; + u8 en_rinval[0x1]; + u8 pd[0x18]; + + u8 start_addr[0x40]; + + u8 len[0x40]; + + u8 bsf_octword_size[0x20]; + + u8 reserved_6[0x80]; + + u8 translations_octword_size[0x20]; + + u8 reserved_7[0x1b]; + u8 log_page_size[0x5]; + + u8 reserved_8[0x20]; +}; + +struct mlx5_ifc_pkey_bits { + u8 reserved_0[0x10]; + u8 pkey[0x10]; +}; + +struct mlx5_ifc_array128_auto_bits { + u8 array128_auto[16][0x8]; +}; + +struct mlx5_ifc_hca_vport_context_bits { + u8 
field_select[0x20]; + + u8 reserved_0[0xe0]; + + u8 sm_virt_aware[0x1]; + u8 has_smi[0x1]; + u8 has_raw[0x1]; + u8 grh_required[0x1]; + u8 reserved_1[0x10]; + u8 port_state_policy[0x4]; + u8 phy_port_state[0x4]; + u8 vport_state[0x4]; + + u8 reserved_2[0x60]; + + u8 port_guid[0x40]; + + u8 node_guid[0x40]; + + u8 cap_mask1[0x20]; + + u8 cap_mask1_field_select[0x20]; + + u8 cap_mask2[0x20]; + + u8 cap_mask2_field_select[0x20]; + + u8 reserved_3[0x80]; + + u8 lid[0x10]; + u8 reserved_4[0x4]; + u8 init_type_reply[0x4]; + u8 lmc[0x3]; + u8 subnet_timeout[0x5]; + + u8 sm_lid[0x10]; + u8 sm_sl[0x4]; + u8 reserved_5[0xc]; + + u8 qkey_violation_counter[0x10]; + u8 pkey_violation_counter[0x10]; + + u8 reserved_6[0xca0]; +}; + +enum { + MLX5_EQC_STATUS_OK = 0x0, + MLX5_EQC_STATUS_EQ_WRITE_FAILURE = 0xa, +}; + +enum { + MLX5_EQC_ST_ARMED = 0x9, + MLX5_EQC_ST_FIRED = 0xa, +}; + +struct mlx5_ifc_eqc_bits { + u8 status[0x4]; + u8 reserved_0[0x9]; + u8 ec[0x1]; + u8 oi[0x1]; + u8 reserved_1[0x5]; + u8 st[0x4]; + u8 reserved_2[0x8]; + + u8 reserved_3[0x20]; + + u8 reserved_4[0x14]; + u8 page_offset[0x6]; + u8 reserved_5[0x6]; + + u8 reserved_6[0x3]; + u8 log_eq_size[0x5]; + u8 uar_page[0x18]; + + u8 reserved_7[0x20]; + + u8 reserved_8[0x18]; + u8 intr[0x8]; + + u8 reserved_9[0x3]; + u8 log_page_size[0x5]; + u8 reserved_10[0x18]; + + u8 reserved_11[0x60]; + + u8 reserved_12[0x8]; + u8 consumer_counter[0x18]; + + u8 reserved_13[0x8]; + u8 producer_counter[0x18]; + + u8 reserved_14[0x80]; +}; + +enum { + MLX5_DCTC_STATE_ACTIVE = 0x0, + MLX5_DCTC_STATE_DRAINING = 0x1, + MLX5_DCTC_STATE_DRAINED = 0x2, +}; + +enum { + MLX5_DCTC_CS_RES_DISABLE = 0x0, + MLX5_DCTC_CS_RES_NA = 0x1, + MLX5_DCTC_CS_RES_UP_TO_64B = 0x2, +}; + +enum { + MLX5_DCTC_MTU_256_BYTES = 0x1, + MLX5_DCTC_MTU_512_BYTES = 0x2, + MLX5_DCTC_MTU_1K_BYTES = 0x3, + MLX5_DCTC_MTU_2K_BYTES = 0x4, + MLX5_DCTC_MTU_4K_BYTES = 0x5, +}; + +struct mlx5_ifc_dctc_bits { + u8 reserved_0[0x4]; + u8 state[0x4]; + u8 reserved_1[0x18]; + + 
u8 reserved_2[0x8]; + u8 user_index[0x18]; + + u8 reserved_3[0x8]; + u8 cqn[0x18]; + + u8 counter_set_id[0x8]; + u8 atomic_mode[0x4]; + u8 rre[0x1]; + u8 rwe[0x1]; + u8 rae[0x1]; + u8 atomic_like_write_en[0x1]; + u8 latency_sensitive[0x1]; + u8 rlky[0x1]; + u8 free_ar[0x1]; + u8 reserved_4[0xd]; + + u8 reserved_5[0x8]; + u8 cs_res[0x8]; + u8 reserved_6[0x3]; + u8 min_rnr_nak[0x5]; + u8 reserved_7[0x8]; + + u8 reserved_8[0x8]; + u8 srqn[0x18]; + + u8 reserved_9[0x8]; + u8 pd[0x18]; + + u8 tclass[0x8]; + u8 reserved_10[0x4]; + u8 flow_label[0x14]; + + u8 dc_access_key[0x40]; + + u8 reserved_11[0x5]; + u8 mtu[0x3]; + u8 port[0x8]; + u8 pkey_index[0x10]; + + u8 reserved_12[0x8]; + u8 my_addr_index[0x8]; + u8 reserved_13[0x8]; + u8 hop_limit[0x8]; + + u8 dc_access_key_violation_count[0x20]; + + u8 reserved_14[0x14]; + u8 dei_cfi[0x1]; + u8 eth_prio[0x3]; + u8 ecn[0x2]; + u8 dscp[0x6]; + + u8 reserved_15[0x40]; +}; + +enum { + MLX5_CQC_STATUS_OK = 0x0, + MLX5_CQC_STATUS_CQ_OVERFLOW = 0x9, + MLX5_CQC_STATUS_CQ_WRITE_FAIL = 0xa, +}; + +enum { + MLX5_CQC_CQE_SZ_64_BYTES = 0x0, + MLX5_CQC_CQE_SZ_128_BYTES = 0x1, +}; + +enum { + MLX5_CQC_ST_SOLICITED_NOTIFICATION_REQUEST_ARMED = 0x6, + MLX5_CQC_ST_NOTIFICATION_REQUEST_ARMED = 0x9, + MLX5_CQC_ST_FIRED = 0xa, +}; + +struct mlx5_ifc_cqc_bits { + u8 status[0x4]; + u8 reserved_0[0x4]; + u8 cqe_sz[0x3]; + u8 cc[0x1]; + u8 reserved_1[0x1]; + u8 scqe_break_moderation_en[0x1]; + u8 oi[0x1]; + u8 reserved_2[0x2]; + u8 cqe_zip_en[0x1]; + u8 mini_cqe_res_format[0x2]; + u8 st[0x4]; + u8 reserved_3[0x8]; + + u8 reserved_4[0x20]; + + u8 reserved_5[0x14]; + u8 page_offset[0x6]; + u8 reserved_6[0x6]; + + u8 reserved_7[0x3]; + u8 log_cq_size[0x5]; + u8 uar_page[0x18]; + + u8 reserved_8[0x4]; + u8 cq_period[0xc]; + u8 cq_max_count[0x10]; + + u8 reserved_9[0x18]; + u8 c_eqn[0x8]; + + u8 reserved_10[0x3]; + u8 log_page_size[0x5]; + u8 reserved_11[0x18]; + + u8 reserved_12[0x20]; + + u8 reserved_13[0x8]; + u8 last_notified_index[0x18]; + + u8 
reserved_14[0x8]; + u8 last_solicit_index[0x18]; + + u8 reserved_15[0x8]; + u8 consumer_counter[0x18]; + + u8 reserved_16[0x8]; + u8 producer_counter[0x18]; + + u8 reserved_17[0x40]; + + u8 dbr_addr[0x40]; +}; + +union mlx5_ifc_cong_control_roce_ecn_auto_bits { + struct mlx5_ifc_cong_control_802_1qau_rp_bits cong_control_802_1qau_rp; + struct mlx5_ifc_cong_control_r_roce_ecn_rp_bits cong_control_r_roce_ecn_rp; + struct mlx5_ifc_cong_control_r_roce_ecn_np_bits cong_control_r_roce_ecn_np; + u8 reserved_0[0x800]; +}; + +struct mlx5_ifc_query_adapter_param_block_bits { + u8 reserved_0[0xe0]; + + u8 reserved_1[0x10]; + u8 vsd_vendor_id[0x10]; + + u8 vsd[208][0x8]; + + u8 vsd_contd_psid[16][0x8]; +}; + +union mlx5_ifc_modify_field_select_resize_field_select_auto_bits { + struct mlx5_ifc_modify_field_select_bits modify_field_select; + struct mlx5_ifc_resize_field_select_bits resize_field_select; + u8 reserved_0[0x20]; +}; + +union mlx5_ifc_field_select_802_1_r_roce_auto_bits { + struct mlx5_ifc_field_select_802_1qau_rp_bits field_select_802_1qau_rp; + struct mlx5_ifc_field_select_r_roce_rp_bits field_select_r_roce_rp; + struct mlx5_ifc_field_select_r_roce_np_bits field_select_r_roce_np; + u8 reserved_0[0x20]; +}; + +union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits { + struct mlx5_ifc_eth_802_3_cntrs_grp_data_layout_bits eth_802_3_cntrs_grp_data_layout; + struct mlx5_ifc_eth_2863_cntrs_grp_data_layout_bits eth_2863_cntrs_grp_data_layout; + struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout; + struct mlx5_ifc_eth_3635_cntrs_grp_data_layout_bits eth_3635_cntrs_grp_data_layout; + struct mlx5_ifc_eth_extended_cntrs_grp_data_layout_bits eth_extended_cntrs_grp_data_layout; + struct mlx5_ifc_eth_per_prio_grp_data_layout_bits eth_per_prio_grp_data_layout; + struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits eth_per_traffic_grp_data_layout; + struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs; + u8 reserved_0[0x7c0]; +}; + +union 
mlx5_ifc_event_auto_bits { + struct mlx5_ifc_comp_event_bits comp_event; + struct mlx5_ifc_dct_events_bits dct_events; + struct mlx5_ifc_qp_events_bits qp_events; + struct mlx5_ifc_wqe_associated_page_fault_event_bits wqe_associated_page_fault_event; + struct mlx5_ifc_rdma_page_fault_event_bits rdma_page_fault_event; + struct mlx5_ifc_cq_error_bits cq_error; + struct mlx5_ifc_dropped_packet_logged_bits dropped_packet_logged; + struct mlx5_ifc_port_state_change_event_bits port_state_change_event; + struct mlx5_ifc_gpio_event_bits gpio_event; + struct mlx5_ifc_db_bf_congestion_event_bits db_bf_congestion_event; + struct mlx5_ifc_stall_vl_event_bits stall_vl_event; + struct mlx5_ifc_cmd_inter_comp_event_bits cmd_inter_comp_event; + u8 reserved_0[0xe0]; +}; + +struct mlx5_ifc_health_buffer_bits { + u8 reserved_0[0x100]; + + u8 assert_existptr[0x20]; + + u8 assert_callra[0x20]; + + u8 reserved_1[0x40]; + + u8 fw_version[0x20]; + + u8 hw_id[0x20]; + + u8 reserved_2[0x20]; + + u8 irisc_index[0x8]; + u8 synd[0x8]; + u8 ext_synd[0x10]; +}; + +struct mlx5_ifc_register_loopback_control_bits { + u8 no_lb[0x1]; + u8 reserved_0[0x7]; + u8 port[0x8]; + u8 reserved_1[0x10]; + + u8 reserved_2[0x60]; +}; + +struct mlx5_ifc_teardown_hca_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +enum { + MLX5_TEARDOWN_HCA_IN_PROFILE_GRACEFUL_CLOSE = 0x0, + MLX5_TEARDOWN_HCA_IN_PROFILE_PANIC_CLOSE = 0x1, +}; + +struct mlx5_ifc_teardown_hca_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x10]; + u8 profile[0x10]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_sqerr2rts_qp_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_sqerr2rts_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 qpn[0x18]; + + u8 reserved_3[0x20]; + + u8 
opt_param_mask[0x20]; + + u8 reserved_4[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_5[0x80]; +}; + +struct mlx5_ifc_sqd2rts_qp_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_sqd2rts_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 qpn[0x18]; + + u8 reserved_3[0x20]; + + u8 opt_param_mask[0x20]; + + u8 reserved_4[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_5[0x80]; +}; + +struct mlx5_ifc_set_roce_address_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_set_roce_address_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 roce_address_index[0x10]; + u8 reserved_2[0x10]; + + u8 reserved_3[0x20]; + + struct mlx5_ifc_roce_addr_layout_bits roce_address; +}; + +struct mlx5_ifc_set_mad_demux_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +enum { + MLX5_SET_MAD_DEMUX_IN_DEMUX_MODE_PASS_ALL = 0x0, + MLX5_SET_MAD_DEMUX_IN_DEMUX_MODE_SELECTIVE = 0x2, +}; + +struct mlx5_ifc_set_mad_demux_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x20]; + + u8 reserved_3[0x6]; + u8 demux_mode[0x2]; + u8 reserved_4[0x18]; +}; + +struct mlx5_ifc_set_l2_table_entry_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_set_l2_table_entry_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x60]; + + u8 reserved_3[0x8]; + u8 table_index[0x18]; + + u8 reserved_4[0x20]; + + u8 reserved_5[0x13]; + u8 vlan_valid[0x1]; + u8 vlan[0xc]; + + struct mlx5_ifc_mac_address_layout_bits mac_address; + + u8 reserved_6[0xc0]; +}; + +struct 
mlx5_ifc_set_issi_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_set_issi_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x10]; + u8 current_issi[0x10]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_set_hca_cap_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_set_hca_cap_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + + union mlx5_ifc_hca_cap_union_bits capability; +}; + +struct mlx5_ifc_set_fte_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_set_fte_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + + u8 table_type[0x8]; + u8 reserved_3[0x18]; + + u8 reserved_4[0x8]; + u8 table_id[0x18]; + + u8 reserved_5[0x40]; + + u8 flow_index[0x20]; + + u8 reserved_6[0xe0]; + + struct mlx5_ifc_flow_context_bits flow_context; +}; + +struct mlx5_ifc_rts2rts_qp_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_rts2rts_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 qpn[0x18]; + + u8 reserved_3[0x20]; + + u8 opt_param_mask[0x20]; + + u8 reserved_4[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_5[0x80]; +}; + +struct mlx5_ifc_rtr2rts_qp_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_rtr2rts_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 qpn[0x18]; + + u8 reserved_3[0x20]; + + u8 opt_param_mask[0x20]; + + u8 reserved_4[0x20]; + + struct 
mlx5_ifc_qpc_bits qpc; + + u8 reserved_5[0x80]; +}; + +struct mlx5_ifc_rst2init_qp_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_rst2init_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 qpn[0x18]; + + u8 reserved_3[0x20]; + + u8 opt_param_mask[0x20]; + + u8 reserved_4[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_5[0x80]; +}; + +struct mlx5_ifc_query_xrc_srq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + struct mlx5_ifc_xrc_srqc_bits xrc_srq_context_entry; + + u8 reserved_2[0x600]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_query_xrc_srq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 xrc_srqn[0x18]; + + u8 reserved_3[0x20]; +}; + +enum { + MLX5_QUERY_VPORT_STATE_OUT_STATE_DOWN = 0x0, + MLX5_QUERY_VPORT_STATE_OUT_STATE_UP = 0x1, +}; + +struct mlx5_ifc_query_vport_state_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x20]; + + u8 reserved_2[0x18]; + u8 admin_state[0x4]; + u8 state[0x4]; +}; + +enum { + MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT = 0x0, +}; + +struct mlx5_ifc_query_vport_state_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_2[0xf]; + u8 vport_number[0x10]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_query_vport_counter_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + struct mlx5_ifc_traffic_counter_bits received_errors; + + struct mlx5_ifc_traffic_counter_bits transmit_errors; + + struct mlx5_ifc_traffic_counter_bits received_ib_unicast; + + struct mlx5_ifc_traffic_counter_bits transmitted_ib_unicast; + + struct mlx5_ifc_traffic_counter_bits received_ib_multicast; 
+ + struct mlx5_ifc_traffic_counter_bits transmitted_ib_multicast; + + struct mlx5_ifc_traffic_counter_bits received_eth_broadcast; + + struct mlx5_ifc_traffic_counter_bits transmitted_eth_broadcast; + + struct mlx5_ifc_traffic_counter_bits received_eth_unicast; + + struct mlx5_ifc_traffic_counter_bits transmitted_eth_unicast; + + struct mlx5_ifc_traffic_counter_bits received_eth_multicast; + + struct mlx5_ifc_traffic_counter_bits transmitted_eth_multicast; + + u8 reserved_2[0xa00]; +}; + +enum { + MLX5_QUERY_VPORT_COUNTER_IN_OP_MOD_VPORT_COUNTERS = 0x0, +}; + +struct mlx5_ifc_query_vport_counter_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_2[0xf]; + u8 vport_number[0x10]; + + u8 reserved_3[0x60]; + + u8 clear[0x1]; + u8 reserved_4[0x1f]; + + u8 reserved_5[0x20]; +}; + +struct mlx5_ifc_query_tis_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + struct mlx5_ifc_tisc_bits tis_context; +}; + +struct mlx5_ifc_query_tis_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 tisn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_query_tir_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0xc0]; + + struct mlx5_ifc_tirc_bits tir_context; +}; + +struct mlx5_ifc_query_tir_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 tirn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_query_srq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + struct mlx5_ifc_srqc_bits srq_context_entry; + + u8 reserved_2[0x600]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_query_srq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 
srqn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_query_sq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0xc0]; + + struct mlx5_ifc_sqc_bits sq_context; +}; + +struct mlx5_ifc_query_sq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 sqn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_query_special_contexts_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x20]; + + u8 resd_lkey[0x20]; +}; + +struct mlx5_ifc_query_special_contexts_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; +}; + +struct mlx5_ifc_query_rqt_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0xc0]; + + struct mlx5_ifc_rqtc_bits rqt_context; +}; + +struct mlx5_ifc_query_rqt_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 rqtn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_query_rq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0xc0]; + + struct mlx5_ifc_rqc_bits rq_context; +}; + +struct mlx5_ifc_query_rq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 rqn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_query_roce_address_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + struct mlx5_ifc_roce_addr_layout_bits roce_address; +}; + +struct mlx5_ifc_query_roce_address_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 roce_address_index[0x10]; + u8 reserved_2[0x10]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_query_rmp_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 
syndrome[0x20]; + + u8 reserved_1[0xc0]; + + struct mlx5_ifc_rmpc_bits rmp_context; +}; + +struct mlx5_ifc_query_rmp_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 rmpn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_query_qp_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + u8 opt_param_mask[0x20]; + + u8 reserved_2[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_3[0x80]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_query_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 qpn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_query_q_counter_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + u8 rx_write_requests[0x20]; + + u8 reserved_2[0x20]; + + u8 rx_read_requests[0x20]; + + u8 reserved_3[0x20]; + + u8 rx_atomic_requests[0x20]; + + u8 reserved_4[0x20]; + + u8 rx_dct_connect[0x20]; + + u8 reserved_5[0x20]; + + u8 out_of_buffer[0x20]; + + u8 reserved_6[0x20]; + + u8 out_of_sequence[0x20]; + + u8 reserved_7[0x620]; +}; + +struct mlx5_ifc_query_q_counter_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x80]; + + u8 clear[0x1]; + u8 reserved_3[0x1f]; + + u8 reserved_4[0x18]; + u8 counter_set_id[0x8]; +}; + +struct mlx5_ifc_query_pages_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x10]; + u8 function_id[0x10]; + + u8 num_pages[0x20]; +}; + +enum { + MLX5_QUERY_PAGES_IN_OP_MOD_BOOT_PAGES = 0x1, + MLX5_QUERY_PAGES_IN_OP_MOD_INIT_PAGES = 0x2, + MLX5_QUERY_PAGES_IN_OP_MOD_REGULAR_PAGES = 0x3, +}; + +struct mlx5_ifc_query_pages_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x10]; + u8 function_id[0x10]; + + u8 
reserved_3[0x20]; +}; + +struct mlx5_ifc_query_nic_vport_context_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + struct mlx5_ifc_nic_vport_context_bits nic_vport_context; +}; + +struct mlx5_ifc_query_nic_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_2[0xf]; + u8 vport_number[0x10]; + + u8 reserved_3[0x5]; + u8 allowed_list_type[0x3]; + u8 reserved_4[0x18]; +}; + +struct mlx5_ifc_query_mkey_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + struct mlx5_ifc_mkc_bits memory_key_mkey_entry; + + u8 reserved_2[0x600]; + + u8 bsf0_klm0_pas_mtt0_1[16][0x8]; + + u8 bsf1_klm1_pas_mtt2_3[16][0x8]; +}; + +struct mlx5_ifc_query_mkey_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 mkey_index[0x18]; + + u8 pg_access[0x1]; + u8 reserved_3[0x1f]; +}; + +struct mlx5_ifc_query_mad_demux_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + u8 mad_dumux_parameters_block[0x20]; +}; + +struct mlx5_ifc_query_mad_demux_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; +}; + +struct mlx5_ifc_query_l2_table_entry_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0xa0]; + + u8 reserved_2[0x13]; + u8 vlan_valid[0x1]; + u8 vlan[0xc]; + + struct mlx5_ifc_mac_address_layout_bits mac_address; + + u8 reserved_3[0xc0]; +}; + +struct mlx5_ifc_query_l2_table_entry_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x60]; + + u8 reserved_3[0x8]; + u8 table_index[0x18]; + + u8 reserved_4[0x140]; +}; + +struct mlx5_ifc_query_issi_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 
syndrome[0x20]; + + u8 reserved_1[0x10]; + u8 current_issi[0x10]; + + u8 reserved_2[0xa0]; + + u8 supported_issi_reserved[76][0x8]; + u8 supported_issi_dw0[0x20]; +}; + +struct mlx5_ifc_query_issi_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; +}; + +struct mlx5_ifc_query_hca_vport_pkey_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + struct mlx5_ifc_pkey_bits pkey[0]; +}; + +struct mlx5_ifc_query_hca_vport_pkey_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_2[0xf]; + u8 vport_number[0x10]; + + u8 reserved_3[0x10]; + u8 pkey_index[0x10]; +}; + +struct mlx5_ifc_query_hca_vport_gid_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x20]; + + u8 gids_num[0x10]; + u8 reserved_2[0x10]; + + struct mlx5_ifc_array128_auto_bits gid[0]; +}; + +struct mlx5_ifc_query_hca_vport_gid_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_2[0xf]; + u8 vport_number[0x10]; + + u8 reserved_3[0x10]; + u8 gid_index[0x10]; +}; + +struct mlx5_ifc_query_hca_vport_context_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + struct mlx5_ifc_hca_vport_context_bits hca_vport_context; +}; + +struct mlx5_ifc_query_hca_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_2[0xf]; + u8 vport_number[0x10]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_query_hca_cap_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + union mlx5_ifc_hca_cap_union_bits capability; +}; + +struct mlx5_ifc_query_hca_cap_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 
reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; +}; + +struct mlx5_ifc_query_flow_table_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x80]; + + u8 reserved_2[0x8]; + u8 level[0x8]; + u8 reserved_3[0x8]; + u8 log_size[0x8]; + + u8 reserved_4[0x120]; +}; + +struct mlx5_ifc_query_flow_table_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + + u8 table_type[0x8]; + u8 reserved_3[0x18]; + + u8 reserved_4[0x8]; + u8 table_id[0x18]; + + u8 reserved_5[0x140]; +}; + +struct mlx5_ifc_query_fte_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x1c0]; + + struct mlx5_ifc_flow_context_bits flow_context; +}; + +struct mlx5_ifc_query_fte_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + + u8 table_type[0x8]; + u8 reserved_3[0x18]; + + u8 reserved_4[0x8]; + u8 table_id[0x18]; + + u8 reserved_5[0x40]; + + u8 flow_index[0x20]; + + u8 reserved_6[0xe0]; +}; + +enum { + MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_OUTER_HEADERS = 0x0, + MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS = 0x1, + MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_INNER_HEADERS = 0x2, +}; + +struct mlx5_ifc_query_flow_group_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0xa0]; + + u8 start_flow_index[0x20]; + + u8 reserved_2[0x20]; + + u8 end_flow_index[0x20]; + + u8 reserved_3[0xa0]; + + u8 reserved_4[0x18]; + u8 match_criteria_enable[0x8]; + + struct mlx5_ifc_fte_match_param_bits match_criteria; + + u8 reserved_5[0xe00]; +}; + +struct mlx5_ifc_query_flow_group_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + + u8 table_type[0x8]; + u8 reserved_3[0x18]; + + u8 reserved_4[0x8]; + u8 table_id[0x18]; + + u8 group_id[0x20]; + + u8 
reserved_5[0x120]; +}; + +struct mlx5_ifc_query_eq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + struct mlx5_ifc_eqc_bits eq_context_entry; + + u8 reserved_2[0x40]; + + u8 event_bitmask[0x40]; + + u8 reserved_3[0x580]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_query_eq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x18]; + u8 eq_number[0x8]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_query_dct_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + struct mlx5_ifc_dctc_bits dct_context_entry; + + u8 reserved_2[0x180]; +}; + +struct mlx5_ifc_query_dct_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 dctn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_query_cq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + struct mlx5_ifc_cqc_bits cq_context; + + u8 reserved_2[0x600]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_query_cq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 cqn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_query_cong_status_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x20]; + + u8 enable[0x1]; + u8 tag_enable[0x1]; + u8 reserved_2[0x1e]; +}; + +struct mlx5_ifc_query_cong_status_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x18]; + u8 priority[0x4]; + u8 cong_protocol[0x4]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_query_cong_statistics_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + u8 cur_flows[0x20]; + + u8 sum_flows[0x20]; + + u8 cnp_ignored_high[0x20]; + + u8 
cnp_ignored_low[0x20]; + + u8 cnp_handled_high[0x20]; + + u8 cnp_handled_low[0x20]; + + u8 reserved_2[0x100]; + + u8 time_stamp_high[0x20]; + + u8 time_stamp_low[0x20]; + + u8 accumulators_period[0x20]; + + u8 ecn_marked_roce_packets_high[0x20]; + + u8 ecn_marked_roce_packets_low[0x20]; + + u8 cnps_sent_high[0x20]; + + u8 cnps_sent_low[0x20]; + + u8 reserved_3[0x560]; +}; + +struct mlx5_ifc_query_cong_statistics_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 clear[0x1]; + u8 reserved_2[0x1f]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_query_cong_params_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + union mlx5_ifc_cong_control_roce_ecn_auto_bits congestion_parameters; +}; + +struct mlx5_ifc_query_cong_params_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x1c]; + u8 cong_protocol[0x4]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_query_adapter_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + struct mlx5_ifc_query_adapter_param_block_bits query_adapter_struct; +}; + +struct mlx5_ifc_query_adapter_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; +}; + +struct mlx5_ifc_qp_2rst_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_qp_2rst_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 qpn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_qp_2err_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_qp_2err_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 qpn[0x18]; + + 
u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_page_fault_resume_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_page_fault_resume_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 error[0x1]; + u8 reserved_2[0x4]; + u8 rdma[0x1]; + u8 read_write[0x1]; + u8 req_res[0x1]; + u8 qpn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_nop_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_nop_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; +}; + +struct mlx5_ifc_modify_vport_state_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_modify_vport_state_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_2[0xf]; + u8 vport_number[0x10]; + + u8 reserved_3[0x18]; + u8 admin_state[0x4]; + u8 reserved_4[0x4]; +}; + +struct mlx5_ifc_modify_tis_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_modify_tis_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 tisn[0x18]; + + u8 reserved_3[0x20]; + + u8 modify_bitmask[0x40]; + + u8 reserved_4[0x40]; + + struct mlx5_ifc_tisc_bits ctx; +}; + +struct mlx5_ifc_modify_tir_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_modify_tir_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 tirn[0x18]; + + u8 reserved_3[0x20]; + + u8 modify_bitmask[0x40]; + + u8 reserved_4[0x40]; + + struct mlx5_ifc_tirc_bits ctx; +}; + +struct 
mlx5_ifc_modify_sq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_modify_sq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 sq_state[0x4]; + u8 reserved_2[0x4]; + u8 sqn[0x18]; + + u8 reserved_3[0x20]; + + u8 modify_bitmask[0x40]; + + u8 reserved_4[0x40]; + + struct mlx5_ifc_sqc_bits ctx; +}; + +struct mlx5_ifc_modify_rqt_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_modify_rqt_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 rqtn[0x18]; + + u8 reserved_3[0x20]; + + u8 modify_bitmask[0x40]; + + u8 reserved_4[0x40]; + + struct mlx5_ifc_rqtc_bits ctx; +}; + +struct mlx5_ifc_modify_rq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_modify_rq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 rq_state[0x4]; + u8 reserved_2[0x4]; + u8 rqn[0x18]; + + u8 reserved_3[0x20]; + + u8 modify_bitmask[0x40]; + + u8 reserved_4[0x40]; + + struct mlx5_ifc_rqc_bits ctx; +}; + +struct mlx5_ifc_modify_rmp_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_modify_rmp_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 rmp_state[0x4]; + u8 reserved_2[0x4]; + u8 rmpn[0x18]; + + u8 reserved_3[0x20]; + + u8 modify_bitmask[0x40]; + + u8 reserved_4[0x40]; + + struct mlx5_ifc_rmpc_bits ctx; +}; + +struct mlx5_ifc_modify_nic_vport_context_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_modify_nic_vport_field_select_bits { + u8 reserved_0[0x1c]; + u8 permanent_address[0x1]; + u8 
addresses_list[0x1]; + u8 roce_en[0x1]; + u8 reserved_1[0x1]; +}; + +struct mlx5_ifc_modify_nic_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_2[0xf]; + u8 vport_number[0x10]; + + struct mlx5_ifc_modify_nic_vport_field_select_bits field_select; + + u8 reserved_3[0x780]; + + struct mlx5_ifc_nic_vport_context_bits nic_vport_context; +}; + +struct mlx5_ifc_modify_hca_vport_context_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_modify_hca_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_2[0xf]; + u8 vport_number[0x10]; + + u8 reserved_3[0x20]; + + struct mlx5_ifc_hca_vport_context_bits hca_vport_context; +}; + +struct mlx5_ifc_modify_cq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +enum { + MLX5_MODIFY_CQ_IN_OP_MOD_MODIFY_CQ = 0x0, + MLX5_MODIFY_CQ_IN_OP_MOD_RESIZE_CQ = 0x1, +}; + +struct mlx5_ifc_modify_cq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 cqn[0x18]; + + union mlx5_ifc_modify_field_select_resize_field_select_auto_bits modify_field_select_resize_field_select; + + struct mlx5_ifc_cqc_bits cq_context; + + u8 reserved_3[0x600]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_modify_cong_status_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_modify_cong_status_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x18]; + u8 priority[0x4]; + u8 cong_protocol[0x4]; + + u8 enable[0x1]; + u8 tag_enable[0x1]; + u8 reserved_3[0x1e]; +}; + +struct mlx5_ifc_modify_cong_params_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + 
+ u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_modify_cong_params_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x1c]; + u8 cong_protocol[0x4]; + + union mlx5_ifc_field_select_802_1_r_roce_auto_bits field_select; + + u8 reserved_3[0x80]; + + union mlx5_ifc_cong_control_roce_ecn_auto_bits congestion_parameters; +}; + +struct mlx5_ifc_manage_pages_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 output_num_entries[0x20]; + + u8 reserved_1[0x20]; + + u8 pas[0][0x40]; +}; + +enum { + MLX5_MANAGE_PAGES_IN_OP_MOD_ALLOCATION_FAIL = 0x0, + MLX5_MANAGE_PAGES_IN_OP_MOD_ALLOCATION_SUCCESS = 0x1, + MLX5_MANAGE_PAGES_IN_OP_MOD_HCA_RETURN_PAGES = 0x2, +}; + +struct mlx5_ifc_manage_pages_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x10]; + u8 function_id[0x10]; + + u8 input_num_entries[0x20]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_mad_ifc_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + u8 response_mad_packet[256][0x8]; +}; + +struct mlx5_ifc_mad_ifc_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 remote_lid[0x10]; + u8 reserved_2[0x8]; + u8 port[0x8]; + + u8 reserved_3[0x20]; + + u8 mad[256][0x8]; +}; + +struct mlx5_ifc_init_hca_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_init_hca_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; +}; + +struct mlx5_ifc_init2rtr_qp_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_init2rtr_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 qpn[0x18]; 
+ + u8 reserved_3[0x20]; + + u8 opt_param_mask[0x20]; + + u8 reserved_4[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_5[0x80]; +}; + +struct mlx5_ifc_init2init_qp_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_init2init_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 qpn[0x18]; + + u8 reserved_3[0x20]; + + u8 opt_param_mask[0x20]; + + u8 reserved_4[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_5[0x80]; +}; + +struct mlx5_ifc_get_dropped_packet_log_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + u8 packet_headers_log[128][0x8]; + + u8 packet_syndrome[64][0x8]; +}; + +struct mlx5_ifc_get_dropped_packet_log_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; +}; + +struct mlx5_ifc_gen_eqe_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x18]; + u8 eq_number[0x8]; + + u8 reserved_3[0x20]; + + u8 eqe[64][0x8]; +}; + +struct mlx5_ifc_gen_eq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_enable_hca_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x20]; +}; + +struct mlx5_ifc_enable_hca_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x10]; + u8 function_id[0x10]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_drain_dct_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_drain_dct_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 dctn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct 
mlx5_ifc_disable_hca_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x20]; +}; + +struct mlx5_ifc_disable_hca_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x10]; + u8 function_id[0x10]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_detach_from_mcg_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_detach_from_mcg_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 qpn[0x18]; + + u8 reserved_3[0x20]; + + u8 multicast_gid[16][0x8]; +}; + +struct mlx5_ifc_destroy_xrc_srq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_destroy_xrc_srq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 xrc_srqn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_destroy_tis_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_destroy_tis_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 tisn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_destroy_tir_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_destroy_tir_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 tirn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_destroy_srq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_destroy_srq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 
srqn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_destroy_sq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_destroy_sq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 sqn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_destroy_rqt_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_destroy_rqt_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 rqtn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_destroy_rq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_destroy_rq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 rqn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_destroy_rmp_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_destroy_rmp_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 rmpn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_destroy_qp_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_destroy_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 qpn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_destroy_psv_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_destroy_psv_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 psvn[0x18]; + + 
u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_destroy_mkey_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_destroy_mkey_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 mkey_index[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_destroy_flow_table_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_destroy_flow_table_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + + u8 table_type[0x8]; + u8 reserved_3[0x18]; + + u8 reserved_4[0x8]; + u8 table_id[0x18]; + + u8 reserved_5[0x140]; +}; + +struct mlx5_ifc_destroy_flow_group_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_destroy_flow_group_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + + u8 table_type[0x8]; + u8 reserved_3[0x18]; + + u8 reserved_4[0x8]; + u8 table_id[0x18]; + + u8 group_id[0x20]; + + u8 reserved_5[0x120]; +}; + +struct mlx5_ifc_destroy_eq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_destroy_eq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x18]; + u8 eq_number[0x8]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_destroy_dct_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_destroy_dct_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 dctn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_destroy_cq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 
syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_destroy_cq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 cqn[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_delete_vxlan_udp_dport_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_delete_vxlan_udp_dport_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x20]; + + u8 reserved_3[0x10]; + u8 vxlan_udp_port[0x10]; +}; + +struct mlx5_ifc_delete_l2_table_entry_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_delete_l2_table_entry_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x60]; + + u8 reserved_3[0x8]; + u8 table_index[0x18]; + + u8 reserved_4[0x140]; +}; + +struct mlx5_ifc_delete_fte_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_delete_fte_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + + u8 table_type[0x8]; + u8 reserved_3[0x18]; + + u8 reserved_4[0x8]; + u8 table_id[0x18]; + + u8 reserved_5[0x40]; + + u8 flow_index[0x20]; + + u8 reserved_6[0xe0]; +}; + +struct mlx5_ifc_dealloc_xrcd_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_dealloc_xrcd_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 xrcd[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_dealloc_uar_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_dealloc_uar_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + 
+ u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 uar[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_dealloc_transport_domain_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_dealloc_transport_domain_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 transport_domain[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_dealloc_q_counter_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_dealloc_q_counter_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x18]; + u8 counter_set_id[0x8]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_dealloc_pd_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_dealloc_pd_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 pd[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_create_xrc_srq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x8]; + u8 xrc_srqn[0x18]; + + u8 reserved_2[0x20]; +}; + +struct mlx5_ifc_create_xrc_srq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + + struct mlx5_ifc_xrc_srqc_bits xrc_srq_context_entry; + + u8 reserved_3[0x600]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_create_tis_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x8]; + u8 tisn[0x18]; + + u8 reserved_2[0x20]; +}; + +struct mlx5_ifc_create_tis_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0xc0]; + + struct mlx5_ifc_tisc_bits ctx; +}; + +struct 
mlx5_ifc_create_tir_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x8]; + u8 tirn[0x18]; + + u8 reserved_2[0x20]; +}; + +struct mlx5_ifc_create_tir_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0xc0]; + + struct mlx5_ifc_tirc_bits ctx; +}; + +struct mlx5_ifc_create_srq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x8]; + u8 srqn[0x18]; + + u8 reserved_2[0x20]; +}; + +struct mlx5_ifc_create_srq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + + struct mlx5_ifc_srqc_bits srq_context_entry; + + u8 reserved_3[0x600]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_create_sq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x8]; + u8 sqn[0x18]; + + u8 reserved_2[0x20]; +}; + +struct mlx5_ifc_create_sq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0xc0]; + + struct mlx5_ifc_sqc_bits ctx; +}; + +struct mlx5_ifc_create_rqt_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x8]; + u8 rqtn[0x18]; + + u8 reserved_2[0x20]; +}; + +struct mlx5_ifc_create_rqt_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0xc0]; + + struct mlx5_ifc_rqtc_bits rqt_context; +}; + +struct mlx5_ifc_create_rq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x8]; + u8 rqn[0x18]; + + u8 reserved_2[0x20]; +}; + +struct mlx5_ifc_create_rq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0xc0]; + + struct mlx5_ifc_rqc_bits ctx; +}; + +struct mlx5_ifc_create_rmp_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 
reserved_1[0x8]; + u8 rmpn[0x18]; + + u8 reserved_2[0x20]; +}; + +struct mlx5_ifc_create_rmp_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0xc0]; + + struct mlx5_ifc_rmpc_bits ctx; +}; + +struct mlx5_ifc_create_qp_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x8]; + u8 qpn[0x18]; + + u8 reserved_2[0x20]; +}; + +struct mlx5_ifc_create_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + + u8 opt_param_mask[0x20]; + + u8 reserved_3[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_4[0x80]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_create_psv_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + u8 reserved_2[0x8]; + u8 psv0_index[0x18]; + + u8 reserved_3[0x8]; + u8 psv1_index[0x18]; + + u8 reserved_4[0x8]; + u8 psv2_index[0x18]; + + u8 reserved_5[0x8]; + u8 psv3_index[0x18]; +}; + +struct mlx5_ifc_create_psv_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 num_psv[0x4]; + u8 reserved_2[0x4]; + u8 pd[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_create_mkey_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x8]; + u8 mkey_index[0x18]; + + u8 reserved_2[0x20]; +}; + +struct mlx5_ifc_create_mkey_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x20]; + + u8 pg_access[0x1]; + u8 reserved_3[0x1f]; + + struct mlx5_ifc_mkc_bits memory_key_mkey_entry; + + u8 reserved_4[0x80]; + + u8 translations_octword_actual_size[0x20]; + + u8 reserved_5[0x560]; + + u8 klm_pas_mtt[0][0x20]; +}; + +struct mlx5_ifc_create_flow_table_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x8]; + u8 table_id[0x18]; + + u8 reserved_2[0x20]; +}; + 
+struct mlx5_ifc_create_flow_table_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + + u8 table_type[0x8]; + u8 reserved_3[0x18]; + + u8 reserved_4[0x20]; + + u8 reserved_5[0x8]; + u8 level[0x8]; + u8 reserved_6[0x8]; + u8 log_size[0x8]; + + u8 reserved_7[0x120]; +}; + +struct mlx5_ifc_create_flow_group_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x8]; + u8 group_id[0x18]; + + u8 reserved_2[0x20]; +}; + +enum { + MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_OUTER_HEADERS = 0x0, + MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS = 0x1, + MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_INNER_HEADERS = 0x2, +}; + +struct mlx5_ifc_create_flow_group_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + + u8 table_type[0x8]; + u8 reserved_3[0x18]; + + u8 reserved_4[0x8]; + u8 table_id[0x18]; + + u8 reserved_5[0x20]; + + u8 start_flow_index[0x20]; + + u8 reserved_6[0x20]; + + u8 end_flow_index[0x20]; + + u8 reserved_7[0xa0]; + + u8 reserved_8[0x18]; + u8 match_criteria_enable[0x8]; + + struct mlx5_ifc_fte_match_param_bits match_criteria; + + u8 reserved_9[0xe00]; +}; + +struct mlx5_ifc_create_eq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x18]; + u8 eq_number[0x8]; + + u8 reserved_2[0x20]; +}; + +struct mlx5_ifc_create_eq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + + struct mlx5_ifc_eqc_bits eq_context_entry; + + u8 reserved_3[0x40]; + + u8 event_bitmask[0x40]; + + u8 reserved_4[0x580]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_create_dct_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x8]; + u8 dctn[0x18]; + + u8 reserved_2[0x20]; +}; + +struct mlx5_ifc_create_dct_in_bits { + u8 opcode[0x10]; 
+ u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + + struct mlx5_ifc_dctc_bits dct_context_entry; + + u8 reserved_3[0x180]; +}; + +struct mlx5_ifc_create_cq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x8]; + u8 cqn[0x18]; + + u8 reserved_2[0x20]; +}; + +struct mlx5_ifc_create_cq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + + struct mlx5_ifc_cqc_bits cq_context; + + u8 reserved_3[0x600]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_config_int_moderation_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x4]; + u8 min_delay[0xc]; + u8 int_vector[0x10]; + + u8 reserved_2[0x20]; +}; + +enum { + MLX5_CONFIG_INT_MODERATION_IN_OP_MOD_WRITE = 0x0, + MLX5_CONFIG_INT_MODERATION_IN_OP_MOD_READ = 0x1, +}; + +struct mlx5_ifc_config_int_moderation_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x4]; + u8 min_delay[0xc]; + u8 int_vector[0x10]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_attach_to_mcg_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_attach_to_mcg_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 qpn[0x18]; + + u8 reserved_3[0x20]; + + u8 multicast_gid[16][0x8]; +}; + +struct mlx5_ifc_arm_xrc_srq_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +enum { + MLX5_ARM_XRC_SRQ_IN_OP_MOD_XRC_SRQ = 0x1, +}; + +struct mlx5_ifc_arm_xrc_srq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 xrc_srqn[0x18]; + + u8 reserved_3[0x10]; + u8 lwm[0x10]; +}; + +struct mlx5_ifc_arm_rq_out_bits { + u8 status[0x8]; + u8 
reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +enum { + MLX5_ARM_RQ_IN_OP_MOD_SRQ_ = 0x1, +}; + +struct mlx5_ifc_arm_rq_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 srq_number[0x18]; + + u8 reserved_3[0x10]; + u8 lwm[0x10]; +}; + +struct mlx5_ifc_arm_dct_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_arm_dct_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x8]; + u8 dct_number[0x18]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_alloc_xrcd_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x8]; + u8 xrcd[0x18]; + + u8 reserved_2[0x20]; +}; + +struct mlx5_ifc_alloc_xrcd_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; +}; + +struct mlx5_ifc_alloc_uar_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x8]; + u8 uar[0x18]; + + u8 reserved_2[0x20]; +}; + +struct mlx5_ifc_alloc_uar_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; +}; + +struct mlx5_ifc_alloc_transport_domain_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x8]; + u8 transport_domain[0x18]; + + u8 reserved_2[0x20]; +}; + +struct mlx5_ifc_alloc_transport_domain_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; +}; + +struct mlx5_ifc_alloc_q_counter_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x18]; + u8 counter_set_id[0x8]; + + u8 reserved_2[0x20]; +}; + +struct mlx5_ifc_alloc_q_counter_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 
op_mod[0x10]; + + u8 reserved_2[0x40]; +}; + +struct mlx5_ifc_alloc_pd_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x8]; + u8 pd[0x18]; + + u8 reserved_2[0x20]; +}; + +struct mlx5_ifc_alloc_pd_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; +}; + +struct mlx5_ifc_add_vxlan_udp_dport_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_add_vxlan_udp_dport_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x20]; + + u8 reserved_3[0x10]; + u8 vxlan_udp_port[0x10]; +}; + +struct mlx5_ifc_access_register_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + u8 register_data[0][0x20]; +}; + +enum { + MLX5_ACCESS_REGISTER_IN_OP_MOD_WRITE = 0x0, + MLX5_ACCESS_REGISTER_IN_OP_MOD_READ = 0x1, +}; + +struct mlx5_ifc_access_register_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x10]; + u8 register_id[0x10]; + + u8 argument[0x20]; + + u8 register_data[0][0x20]; +}; + +struct mlx5_ifc_sltp_reg_bits { + u8 status[0x4]; + u8 version[0x4]; + u8 local_port[0x8]; + u8 pnat[0x2]; + u8 reserved_0[0x2]; + u8 lane[0x4]; + u8 reserved_1[0x8]; + + u8 reserved_2[0x20]; + + u8 reserved_3[0x7]; + u8 polarity[0x1]; + u8 ob_tap0[0x8]; + u8 ob_tap1[0x8]; + u8 ob_tap2[0x8]; + + u8 reserved_4[0xc]; + u8 ob_preemp_mode[0x4]; + u8 ob_reg[0x8]; + u8 ob_bias[0x8]; + + u8 reserved_5[0x20]; +}; + +struct mlx5_ifc_slrg_reg_bits { + u8 status[0x4]; + u8 version[0x4]; + u8 local_port[0x8]; + u8 pnat[0x2]; + u8 reserved_0[0x2]; + u8 lane[0x4]; + u8 reserved_1[0x8]; + + u8 time_to_link_up[0x10]; + u8 reserved_2[0xc]; + u8 grade_lane_speed[0x4]; + + u8 grade_version[0x8]; + u8 grade[0x18]; + + u8 reserved_3[0x4]; + u8 height_grade_type[0x4]; 
+ u8 height_grade[0x18]; + + u8 height_dz[0x10]; + u8 height_dv[0x10]; + + u8 reserved_4[0x10]; + u8 height_sigma[0x10]; + + u8 reserved_5[0x20]; + + u8 reserved_6[0x4]; + u8 phase_grade_type[0x4]; + u8 phase_grade[0x18]; + + u8 reserved_7[0x8]; + u8 phase_eo_pos[0x8]; + u8 reserved_8[0x8]; + u8 phase_eo_neg[0x8]; + + u8 ffe_set_tested[0x10]; + u8 test_errors_per_lane[0x10]; +}; + +struct mlx5_ifc_pvlc_reg_bits { + u8 reserved_0[0x8]; + u8 local_port[0x8]; + u8 reserved_1[0x10]; + + u8 reserved_2[0x1c]; + u8 vl_hw_cap[0x4]; + + u8 reserved_3[0x1c]; + u8 vl_admin[0x4]; + + u8 reserved_4[0x1c]; + u8 vl_operational[0x4]; +}; + +struct mlx5_ifc_pude_reg_bits { + u8 swid[0x8]; + u8 local_port[0x8]; + u8 reserved_0[0x4]; + u8 admin_status[0x4]; + u8 reserved_1[0x4]; + u8 oper_status[0x4]; + + u8 reserved_2[0x60]; +}; + +struct mlx5_ifc_ptys_reg_bits { + u8 reserved_0[0x8]; + u8 local_port[0x8]; + u8 reserved_1[0xd]; + u8 proto_mask[0x3]; + + u8 reserved_2[0x40]; + + u8 eth_proto_capability[0x20]; + + u8 ib_link_width_capability[0x10]; + u8 ib_proto_capability[0x10]; + + u8 reserved_3[0x20]; + + u8 eth_proto_admin[0x20]; + + u8 ib_link_width_admin[0x10]; + u8 ib_proto_admin[0x10]; + + u8 reserved_4[0x20]; + + u8 eth_proto_oper[0x20]; + + u8 ib_link_width_oper[0x10]; + u8 ib_proto_oper[0x10]; + + u8 reserved_5[0x20]; + + u8 eth_proto_lp_advertise[0x20]; + + u8 reserved_6[0x60]; +}; + +struct mlx5_ifc_ptas_reg_bits { + u8 reserved_0[0x20]; + + u8 algorithm_options[0x10]; + u8 reserved_1[0x4]; + u8 repetitions_mode[0x4]; + u8 num_of_repetitions[0x8]; + + u8 grade_version[0x8]; + u8 height_grade_type[0x4]; + u8 phase_grade_type[0x4]; + u8 height_grade_weight[0x8]; + u8 phase_grade_weight[0x8]; + + u8 gisim_measure_bits[0x10]; + u8 adaptive_tap_measure_bits[0x10]; + + u8 ber_bath_high_error_threshold[0x10]; + u8 ber_bath_mid_error_threshold[0x10]; + + u8 ber_bath_low_error_threshold[0x10]; + u8 one_ratio_high_threshold[0x10]; + + u8 one_ratio_high_mid_threshold[0x10]; + u8 
one_ratio_low_mid_threshold[0x10]; + + u8 one_ratio_low_threshold[0x10]; + u8 ndeo_error_threshold[0x10]; + + u8 mixer_offset_step_size[0x10]; + u8 reserved_2[0x8]; + u8 mix90_phase_for_voltage_bath[0x8]; + + u8 mixer_offset_start[0x10]; + u8 mixer_offset_end[0x10]; + + u8 reserved_3[0x15]; + u8 ber_test_time[0xb]; +}; + +struct mlx5_ifc_pspa_reg_bits { + u8 swid[0x8]; + u8 local_port[0x8]; + u8 sub_port[0x8]; + u8 reserved_0[0x8]; + + u8 reserved_1[0x20]; +}; + +struct mlx5_ifc_pqdr_reg_bits { + u8 reserved_0[0x8]; + u8 local_port[0x8]; + u8 reserved_1[0x5]; + u8 prio[0x3]; + u8 reserved_2[0x6]; + u8 mode[0x2]; + + u8 reserved_3[0x20]; + + u8 reserved_4[0x10]; + u8 min_threshold[0x10]; + + u8 reserved_5[0x10]; + u8 max_threshold[0x10]; + + u8 reserved_6[0x10]; + u8 mark_probability_denominator[0x10]; + + u8 reserved_7[0x60]; +}; + +struct mlx5_ifc_ppsc_reg_bits { + u8 reserved_0[0x8]; + u8 local_port[0x8]; + u8 reserved_1[0x10]; + + u8 reserved_2[0x60]; + + u8 reserved_3[0x1c]; + u8 wrps_admin[0x4]; + + u8 reserved_4[0x1c]; + u8 wrps_status[0x4]; + + u8 reserved_5[0x8]; + u8 up_threshold[0x8]; + u8 reserved_6[0x8]; + u8 down_threshold[0x8]; + + u8 reserved_7[0x20]; + + u8 reserved_8[0x1c]; + u8 srps_admin[0x4]; + + u8 reserved_9[0x1c]; + u8 srps_status[0x4]; + + u8 reserved_10[0x40]; +}; + +struct mlx5_ifc_pplr_reg_bits { + u8 reserved_0[0x8]; + u8 local_port[0x8]; + u8 reserved_1[0x10]; + + u8 reserved_2[0x8]; + u8 lb_cap[0x8]; + u8 reserved_3[0x8]; + u8 lb_en[0x8]; +}; + +struct mlx5_ifc_pplm_reg_bits { + u8 reserved_0[0x8]; + u8 local_port[0x8]; + u8 reserved_1[0x10]; + + u8 reserved_2[0x20]; + + u8 port_profile_mode[0x8]; + u8 static_port_profile[0x8]; + u8 active_port_profile[0x8]; + u8 reserved_3[0x8]; + + u8 retransmission_active[0x8]; + u8 fec_mode_active[0x18]; + + u8 reserved_4[0x20]; +}; + +struct mlx5_ifc_ppcnt_reg_bits { + u8 swid[0x8]; + u8 local_port[0x8]; + u8 pnat[0x2]; + u8 reserved_0[0x8]; + u8 grp[0x6]; + + u8 clr[0x1]; + u8 reserved_1[0x1c]; + 
u8 prio_tc[0x3]; + + union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits counter_set; +}; + +struct mlx5_ifc_ppad_reg_bits { + u8 reserved_0[0x3]; + u8 single_mac[0x1]; + u8 reserved_1[0x4]; + u8 local_port[0x8]; + u8 mac_47_32[0x10]; + + u8 mac_31_0[0x20]; + + u8 reserved_2[0x40]; +}; + +struct mlx5_ifc_pmtu_reg_bits { + u8 reserved_0[0x8]; + u8 local_port[0x8]; + u8 reserved_1[0x10]; + + u8 max_mtu[0x10]; + u8 reserved_2[0x10]; + + u8 admin_mtu[0x10]; + u8 reserved_3[0x10]; + + u8 oper_mtu[0x10]; + u8 reserved_4[0x10]; +}; + +struct mlx5_ifc_pmpr_reg_bits { + u8 reserved_0[0x8]; + u8 module[0x8]; + u8 reserved_1[0x10]; + + u8 reserved_2[0x18]; + u8 attenuation_5g[0x8]; + + u8 reserved_3[0x18]; + u8 attenuation_7g[0x8]; + + u8 reserved_4[0x18]; + u8 attenuation_12g[0x8]; +}; + +struct mlx5_ifc_pmpe_reg_bits { + u8 reserved_0[0x8]; + u8 module[0x8]; + u8 reserved_1[0xc]; + u8 module_status[0x4]; + + u8 reserved_2[0x60]; +}; + +struct mlx5_ifc_pmpc_reg_bits { + u8 module_state_updated[32][0x8]; +}; + +struct mlx5_ifc_pmlpn_reg_bits { + u8 reserved_0[0x4]; + u8 mlpn_status[0x4]; + u8 local_port[0x8]; + u8 reserved_1[0x10]; + + u8 e[0x1]; + u8 reserved_2[0x1f]; +}; + +struct mlx5_ifc_pmlp_reg_bits { + u8 rxtx[0x1]; + u8 reserved_0[0x7]; + u8 local_port[0x8]; + u8 reserved_1[0x8]; + u8 width[0x8]; + + u8 lane0_module_mapping[0x20]; + + u8 lane1_module_mapping[0x20]; + + u8 lane2_module_mapping[0x20]; + + u8 lane3_module_mapping[0x20]; + + u8 reserved_2[0x160]; +}; + +struct mlx5_ifc_pmaos_reg_bits { + u8 reserved_0[0x8]; + u8 module[0x8]; + u8 reserved_1[0x4]; + u8 admin_status[0x4]; + u8 reserved_2[0x4]; + u8 oper_status[0x4]; + + u8 ase[0x1]; + u8 ee[0x1]; + u8 reserved_3[0x1c]; + u8 e[0x2]; + + u8 reserved_4[0x40]; +}; + +struct mlx5_ifc_plpc_reg_bits { + u8 reserved_0[0x4]; + u8 profile_id[0xc]; + u8 reserved_1[0x4]; + u8 proto_mask[0x4]; + u8 reserved_2[0x8]; + + u8 reserved_3[0x10]; + u8 lane_speed[0x10]; + + u8 reserved_4[0x17]; + u8 lpbf[0x1]; + u8 
fec_mode_policy[0x8]; + + u8 retransmission_capability[0x8]; + u8 fec_mode_capability[0x18]; + + u8 retransmission_support_admin[0x8]; + u8 fec_mode_support_admin[0x18]; + + u8 retransmission_request_admin[0x8]; + u8 fec_mode_request_admin[0x18]; + + u8 reserved_5[0x80]; +}; + +struct mlx5_ifc_plib_reg_bits { + u8 reserved_0[0x8]; + u8 local_port[0x8]; + u8 reserved_1[0x8]; + u8 ib_port[0x8]; + + u8 reserved_2[0x60]; +}; + +struct mlx5_ifc_plbf_reg_bits { + u8 reserved_0[0x8]; + u8 local_port[0x8]; + u8 reserved_1[0xd]; + u8 lbf_mode[0x3]; + + u8 reserved_2[0x20]; +}; + +struct mlx5_ifc_pipg_reg_bits { + u8 reserved_0[0x8]; + u8 local_port[0x8]; + u8 reserved_1[0x10]; + + u8 dic[0x1]; + u8 reserved_2[0x19]; + u8 ipg[0x4]; + u8 reserved_3[0x2]; +}; + +struct mlx5_ifc_pifr_reg_bits { + u8 reserved_0[0x8]; + u8 local_port[0x8]; + u8 reserved_1[0x10]; + + u8 reserved_2[0xe0]; + + u8 port_filter[8][0x20]; + + u8 port_filter_update_en[8][0x20]; +}; + +struct mlx5_ifc_pfcc_reg_bits { + u8 reserved_0[0x8]; + u8 local_port[0x8]; + u8 reserved_1[0x10]; + + u8 ppan[0x4]; + u8 reserved_2[0x4]; + u8 prio_mask_tx[0x8]; + u8 reserved_3[0x8]; + u8 prio_mask_rx[0x8]; + + u8 pptx[0x1]; + u8 aptx[0x1]; + u8 reserved_4[0x6]; + u8 pfctx[0x8]; + u8 reserved_5[0x10]; + + u8 pprx[0x1]; + u8 aprx[0x1]; + u8 reserved_6[0x6]; + u8 pfcrx[0x8]; + u8 reserved_7[0x10]; + + u8 reserved_8[0x80]; +}; + +struct mlx5_ifc_pelc_reg_bits { + u8 op[0x4]; + u8 reserved_0[0x4]; + u8 local_port[0x8]; + u8 reserved_1[0x10]; + + u8 op_admin[0x8]; + u8 op_capability[0x8]; + u8 op_request[0x8]; + u8 op_active[0x8]; + + u8 admin[0x40]; + + u8 capability[0x40]; + + u8 request[0x40]; + + u8 active[0x40]; + + u8 reserved_2[0x80]; +}; + +struct mlx5_ifc_peir_reg_bits { + u8 reserved_0[0x8]; + u8 local_port[0x8]; + u8 reserved_1[0x10]; + + u8 reserved_2[0xc]; + u8 error_count[0x4]; + u8 reserved_3[0x10]; + + u8 reserved_4[0xc]; + u8 lane[0x4]; + u8 reserved_5[0x8]; + u8 error_type[0x8]; +}; + +struct 
mlx5_ifc_pcap_reg_bits { + u8 reserved_0[0x8]; + u8 local_port[0x8]; + u8 reserved_1[0x10]; + + u8 port_capability_mask[4][0x20]; +}; + +struct mlx5_ifc_paos_reg_bits { + u8 swid[0x8]; + u8 local_port[0x8]; + u8 reserved_0[0x4]; + u8 admin_status[0x4]; + u8 reserved_1[0x4]; + u8 oper_status[0x4]; + + u8 ase[0x1]; + u8 ee[0x1]; + u8 reserved_2[0x1c]; + u8 e[0x2]; + + u8 reserved_3[0x40]; +}; + +struct mlx5_ifc_pamp_reg_bits { + u8 reserved_0[0x8]; + u8 opamp_group[0x8]; + u8 reserved_1[0xc]; + u8 opamp_group_type[0x4]; + + u8 start_index[0x10]; + u8 reserved_2[0x4]; + u8 num_of_indices[0xc]; + + u8 index_data[18][0x10]; +}; + +struct mlx5_ifc_lane_2_module_mapping_bits { + u8 reserved_0[0x6]; + u8 rx_lane[0x2]; + u8 reserved_1[0x6]; + u8 tx_lane[0x2]; + u8 reserved_2[0x8]; + u8 module[0x8]; +}; + +struct mlx5_ifc_bufferx_reg_bits { + u8 reserved_0[0x6]; + u8 lossy[0x1]; + u8 epsb[0x1]; + u8 reserved_1[0xc]; + u8 size[0xc]; + + u8 xoff_threshold[0x10]; + u8 xon_threshold[0x10]; +}; + +struct mlx5_ifc_set_node_in_bits { + u8 node_description[64][0x8]; +}; + +struct mlx5_ifc_register_power_settings_bits { + u8 reserved_0[0x18]; + u8 power_settings_level[0x8]; + + u8 reserved_1[0x60]; +}; + +struct mlx5_ifc_register_host_endianness_bits { + u8 he[0x1]; + u8 reserved_0[0x1f]; + + u8 reserved_1[0x60]; +}; + +struct mlx5_ifc_umr_pointer_desc_argument_bits { + u8 reserved_0[0x20]; + + u8 mkey[0x20]; + + u8 addressh_63_32[0x20]; + + u8 addressl_31_0[0x20]; +}; + +struct mlx5_ifc_ud_adrs_vector_bits { + u8 dc_key[0x40]; + + u8 ext[0x1]; + u8 reserved_0[0x7]; + u8 destination_qp_dct[0x18]; + + u8 static_rate[0x4]; + u8 sl_eth_prio[0x4]; + u8 fl[0x1]; + u8 mlid[0x7]; + u8 rlid_udp_sport[0x10]; + + u8 reserved_1[0x20]; + + u8 rmac_47_16[0x20]; + + u8 rmac_15_0[0x10]; + u8 tclass[0x8]; + u8 hop_limit[0x8]; + + u8 reserved_2[0x1]; + u8 grh[0x1]; + u8 reserved_3[0x2]; + u8 src_addr_index[0x8]; + u8 flow_label[0x14]; + + u8 rgid_rip[16][0x8]; +}; + +struct 
mlx5_ifc_pages_req_event_bits { + u8 reserved_0[0x10]; + u8 function_id[0x10]; + + u8 num_pages[0x20]; + + u8 reserved_1[0xa0]; +}; + +struct mlx5_ifc_eqe_bits { + u8 reserved_0[0x8]; + u8 event_type[0x8]; + u8 reserved_1[0x8]; + u8 event_sub_type[0x8]; + + u8 reserved_2[0xe0]; + + union mlx5_ifc_event_auto_bits event_data; + + u8 reserved_3[0x10]; + u8 signature[0x8]; + u8 reserved_4[0x7]; + u8 owner[0x1]; +}; + +enum { + MLX5_CMD_QUEUE_ENTRY_TYPE_PCIE_CMD_IF_TRANSPORT = 0x7, +}; + +struct mlx5_ifc_cmd_queue_entry_bits { + u8 type[0x8]; + u8 reserved_0[0x18]; + + u8 input_length[0x20]; + + u8 input_mailbox_pointer_63_32[0x20]; + + u8 input_mailbox_pointer_31_9[0x17]; + u8 reserved_1[0x9]; + + u8 command_input_inline_data[16][0x8]; + + u8 command_output_inline_data[16][0x8]; + + u8 output_mailbox_pointer_63_32[0x20]; + + u8 output_mailbox_pointer_31_9[0x17]; + u8 reserved_2[0x9]; + + u8 output_length[0x20]; + + u8 token[0x8]; + u8 signature[0x8]; + u8 reserved_3[0x8]; + u8 status[0x7]; + u8 ownership[0x1]; +}; + +struct mlx5_ifc_cmd_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 command_output[0x20]; +}; + +struct mlx5_ifc_cmd_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 command[0][0x20]; +}; + +struct mlx5_ifc_cmd_if_box_bits { + u8 mailbox_data[512][0x8]; + + u8 reserved_0[0x180]; + + u8 next_pointer_63_32[0x20]; + + u8 next_pointer_31_10[0x16]; + u8 reserved_1[0xa]; + + u8 block_number[0x20]; + + u8 reserved_2[0x8]; + u8 token[0x8]; + u8 ctrl_signature[0x8]; + u8 signature[0x8]; +}; + +struct mlx5_ifc_mtt_bits { + u8 ptag_63_32[0x20]; + + u8 ptag_31_8[0x18]; + u8 reserved_0[0x6]; + u8 wr_en[0x1]; + u8 rd_en[0x1]; +}; + +enum { + MLX5_INITIAL_SEG_NIC_INTERFACE_FULL_DRIVER = 0x0, + MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED = 0x1, + MLX5_INITIAL_SEG_NIC_INTERFACE_NO_DRAM_NIC = 0x2, +}; + +enum { + MLX5_INITIAL_SEG_NIC_INTERFACE_SUPPORTED_FULL_DRIVER = 0x0, + 
MLX5_INITIAL_SEG_NIC_INTERFACE_SUPPORTED_DISABLED = 0x1, + MLX5_INITIAL_SEG_NIC_INTERFACE_SUPPORTED_NO_DRAM_NIC = 0x2, +}; + +enum { + MLX5_INITIAL_SEG_HEALTH_SYNDROME_FW_INTERNAL_ERR = 0x1, + MLX5_INITIAL_SEG_HEALTH_SYNDROME_DEAD_IRISC = 0x7, + MLX5_INITIAL_SEG_HEALTH_SYNDROME_HW_FATAL_ERR = 0x8, + MLX5_INITIAL_SEG_HEALTH_SYNDROME_FW_CRC_ERR = 0x9, + MLX5_INITIAL_SEG_HEALTH_SYNDROME_ICM_FETCH_PCI_ERR = 0xa, + MLX5_INITIAL_SEG_HEALTH_SYNDROME_ICM_PAGE_ERR = 0xb, + MLX5_INITIAL_SEG_HEALTH_SYNDROME_ASYNCHRONOUS_EQ_BUF_OVERRUN = 0xc, + MLX5_INITIAL_SEG_HEALTH_SYNDROME_EQ_IN_ERR = 0xd, + MLX5_INITIAL_SEG_HEALTH_SYNDROME_EQ_INV = 0xe, + MLX5_INITIAL_SEG_HEALTH_SYNDROME_FFSER_ERR = 0xf, + MLX5_INITIAL_SEG_HEALTH_SYNDROME_HIGH_TEMP_ERR = 0x10, +}; + +struct mlx5_ifc_initial_seg_bits { + u8 fw_rev_minor[0x10]; + u8 fw_rev_major[0x10]; + + u8 cmd_interface_rev[0x10]; + u8 fw_rev_subminor[0x10]; + + u8 reserved_0[0x40]; + + u8 cmdq_phy_addr_63_32[0x20]; + + u8 cmdq_phy_addr_31_12[0x14]; + u8 reserved_1[0x2]; + u8 nic_interface[0x2]; + u8 log_cmdq_size[0x4]; + u8 log_cmdq_stride[0x4]; + + u8 command_doorbell_vector[0x20]; + + u8 reserved_2[0xf00]; + + u8 initializing[0x1]; + u8 reserved_3[0x4]; + u8 nic_interface_supported[0x3]; + u8 reserved_4[0x18]; + + struct mlx5_ifc_health_buffer_bits health_buffer; + + u8 no_dram_nic_offset[0x20]; + + u8 reserved_5[0x6e40]; + + u8 reserved_6[0x1f]; + u8 clear_int[0x1]; + + u8 health_syndrome[0x8]; + u8 health_counter[0x18]; + + u8 reserved_7[0x17fc0]; +}; + +union mlx5_ifc_ports_control_registers_document_bits { + struct mlx5_ifc_bufferx_reg_bits bufferx_reg; + struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout; + struct mlx5_ifc_eth_2863_cntrs_grp_data_layout_bits eth_2863_cntrs_grp_data_layout; + struct mlx5_ifc_eth_3635_cntrs_grp_data_layout_bits eth_3635_cntrs_grp_data_layout; + struct mlx5_ifc_eth_802_3_cntrs_grp_data_layout_bits eth_802_3_cntrs_grp_data_layout; + struct 
mlx5_ifc_eth_extended_cntrs_grp_data_layout_bits eth_extended_cntrs_grp_data_layout; + struct mlx5_ifc_eth_per_prio_grp_data_layout_bits eth_per_prio_grp_data_layout; + struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits eth_per_traffic_grp_data_layout; + struct mlx5_ifc_lane_2_module_mapping_bits lane_2_module_mapping; + struct mlx5_ifc_pamp_reg_bits pamp_reg; + struct mlx5_ifc_paos_reg_bits paos_reg; + struct mlx5_ifc_pcap_reg_bits pcap_reg; + struct mlx5_ifc_peir_reg_bits peir_reg; + struct mlx5_ifc_pelc_reg_bits pelc_reg; + struct mlx5_ifc_pfcc_reg_bits pfcc_reg; + struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs; + struct mlx5_ifc_pifr_reg_bits pifr_reg; + struct mlx5_ifc_pipg_reg_bits pipg_reg; + struct mlx5_ifc_plbf_reg_bits plbf_reg; + struct mlx5_ifc_plib_reg_bits plib_reg; + struct mlx5_ifc_plpc_reg_bits plpc_reg; + struct mlx5_ifc_pmaos_reg_bits pmaos_reg; + struct mlx5_ifc_pmlp_reg_bits pmlp_reg; + struct mlx5_ifc_pmlpn_reg_bits pmlpn_reg; + struct mlx5_ifc_pmpc_reg_bits pmpc_reg; + struct mlx5_ifc_pmpe_reg_bits pmpe_reg; + struct mlx5_ifc_pmpr_reg_bits pmpr_reg; + struct mlx5_ifc_pmtu_reg_bits pmtu_reg; + struct mlx5_ifc_ppad_reg_bits ppad_reg; + struct mlx5_ifc_ppcnt_reg_bits ppcnt_reg; + struct mlx5_ifc_pplm_reg_bits pplm_reg; + struct mlx5_ifc_pplr_reg_bits pplr_reg; + struct mlx5_ifc_ppsc_reg_bits ppsc_reg; + struct mlx5_ifc_pqdr_reg_bits pqdr_reg; + struct mlx5_ifc_pspa_reg_bits pspa_reg; + struct mlx5_ifc_ptas_reg_bits ptas_reg; + struct mlx5_ifc_ptys_reg_bits ptys_reg; + struct mlx5_ifc_pude_reg_bits pude_reg; + struct mlx5_ifc_pvlc_reg_bits pvlc_reg; + struct mlx5_ifc_slrg_reg_bits slrg_reg; + struct mlx5_ifc_sltp_reg_bits sltp_reg; + u8 reserved_0[0x60e0]; +}; + +union mlx5_ifc_debug_enhancements_document_bits { + struct mlx5_ifc_health_buffer_bits health_buffer; + u8 reserved_0[0x200]; +}; + +union mlx5_ifc_uplink_pci_interface_document_bits { + struct mlx5_ifc_initial_seg_bits initial_seg; + u8 reserved_0[0x20060]; }; #endif /* 
MLX5_IFC_H */ diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 310b5f7fd6ae..f079fb1a31f7 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -134,13 +134,21 @@ enum { enum { MLX5_WQE_CTRL_CQ_UPDATE = 2 << 2, + MLX5_WQE_CTRL_CQ_UPDATE_AND_EQE = 3 << 2, MLX5_WQE_CTRL_SOLICITED = 1 << 1, }; enum { + MLX5_SEND_WQE_DS = 16, MLX5_SEND_WQE_BB = 64, }; +#define MLX5_SEND_WQEBB_NUM_DS (MLX5_SEND_WQE_BB / MLX5_SEND_WQE_DS) + +enum { + MLX5_SEND_WQE_MAX_WQEBBS = 16, +}; + enum { MLX5_WQE_FMR_PERM_LOCAL_READ = 1 << 27, MLX5_WQE_FMR_PERM_LOCAL_WRITE = 1 << 28, @@ -200,6 +208,23 @@ struct mlx5_wqe_ctrl_seg { #define MLX5_WQE_CTRL_WQE_INDEX_MASK 0x00ffff00 #define MLX5_WQE_CTRL_WQE_INDEX_SHIFT 8 +enum { + MLX5_ETH_WQE_L3_INNER_CSUM = 1 << 4, + MLX5_ETH_WQE_L4_INNER_CSUM = 1 << 5, + MLX5_ETH_WQE_L3_CSUM = 1 << 6, + MLX5_ETH_WQE_L4_CSUM = 1 << 7, +}; + +struct mlx5_wqe_eth_seg { + u8 rsvd0[4]; + u8 cs_flags; + u8 rsvd1; + __be16 mss; + __be32 rsvd2; + __be16 inline_hdr_sz; + u8 inline_hdr_start[2]; +}; + struct mlx5_wqe_xrc_seg { __be32 xrc_srqn; u8 rsvd[12]; -- cgit v1.2.3 From 938fe83c8dcbbf294d167e6163200a8540ae43c4 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 28 May 2015 22:28:41 +0300 Subject: net/mlx5_core: New device capabilities handling - Query all supported types of dev caps on driver load. - Store the Cap data outbox per cap type into driver private data. - Introduce new Macros to access/dump stored caps (using the auto generated data types). - Obsolete SW representation of dev caps (no need for SW copy for each cap). - Modify IB driver to use new macros for checking caps. Signed-off-by: Saeed Mahameed Signed-off-by: Amir Vadai Signed-off-by: David S. 
Miller --- include/linux/mlx5/device.h | 66 ++++++++++++++++++++++++++++++++++++++++----- include/linux/mlx5/driver.h | 58 +++++---------------------------------- 2 files changed, 65 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index feebed7b392b..4ee52bf1f959 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -59,6 +59,8 @@ #define MLX5_FLD_SZ_BYTES(typ, fld) (__mlx5_bit_sz(typ, fld) / 8) #define MLX5_ST_SZ_BYTES(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 8) #define MLX5_ST_SZ_DW(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 32) +#define MLX5_UN_SZ_BYTES(typ) (sizeof(union mlx5_ifc_##typ##_bits) / 8) +#define MLX5_UN_SZ_DW(typ) (sizeof(union mlx5_ifc_##typ##_bits) / 32) #define MLX5_BYTE_OFF(typ, fld) (__mlx5_bit_off(typ, fld) / 8) #define MLX5_ADDR_OF(typ, p, fld) ((char *)(p) + MLX5_BYTE_OFF(typ, fld)) @@ -322,13 +324,6 @@ enum { MLX5_CAP_OFF_CMDIF_CSUM = 46, }; -enum { - HCA_CAP_OPMOD_GET_MAX = 0, - HCA_CAP_OPMOD_GET_CUR = 1, - HCA_CAP_OPMOD_GET_ODP_MAX = 4, - HCA_CAP_OPMOD_GET_ODP_CUR = 5 -}; - struct mlx5_inbox_hdr { __be16 opcode; u8 rsvd[4]; @@ -1101,4 +1096,61 @@ enum { MLX5_RQC_RQ_TYPE_MEMORY_RQ_RPM = 0x1, }; +/* MLX5 DEV CAPs */ + +/* TODO: EAT.ME */ +enum mlx5_cap_mode { + HCA_CAP_OPMOD_GET_MAX = 0, + HCA_CAP_OPMOD_GET_CUR = 1, +}; + +enum mlx5_cap_type { + MLX5_CAP_GENERAL = 0, + MLX5_CAP_ETHERNET_OFFLOADS, + MLX5_CAP_ODP, + MLX5_CAP_ATOMIC, + MLX5_CAP_ROCE, + MLX5_CAP_IPOIB_OFFLOADS, + MLX5_CAP_EOIB_OFFLOADS, + MLX5_CAP_FLOW_TABLE, + /* NUM OF CAP Types */ + MLX5_CAP_NUM +}; + +/* GET Dev Caps macros */ +#define MLX5_CAP_GEN(mdev, cap) \ + MLX5_GET(cmd_hca_cap, mdev->hca_caps_cur[MLX5_CAP_GENERAL], cap) + +#define MLX5_CAP_GEN_MAX(mdev, cap) \ + MLX5_GET(cmd_hca_cap, mdev->hca_caps_max[MLX5_CAP_GENERAL], cap) + +#define MLX5_CAP_ETH(mdev, cap) \ + MLX5_GET(per_protocol_networking_offload_caps,\ + 
mdev->hca_caps_cur[MLX5_CAP_ETHERNET_OFFLOADS], cap) + +#define MLX5_CAP_ETH_MAX(mdev, cap) \ + MLX5_GET(per_protocol_networking_offload_caps,\ + mdev->hca_caps_max[MLX5_CAP_ETHERNET_OFFLOADS], cap) + +#define MLX5_CAP_ROCE(mdev, cap) \ + MLX5_GET(roce_cap, mdev->hca_caps_cur[MLX5_CAP_ROCE], cap) + +#define MLX5_CAP_ROCE_MAX(mdev, cap) \ + MLX5_GET(roce_cap, mdev->hca_caps_max[MLX5_CAP_ROCE], cap) + +#define MLX5_CAP_ATOMIC(mdev, cap) \ + MLX5_GET(atomic_caps, mdev->hca_caps_cur[MLX5_CAP_ATOMIC], cap) + +#define MLX5_CAP_ATOMIC_MAX(mdev, cap) \ + MLX5_GET(atomic_caps, mdev->hca_caps_max[MLX5_CAP_ATOMIC], cap) + +#define MLX5_CAP_FLOWTABLE(mdev, cap) \ + MLX5_GET(flow_table_nic_cap, mdev->hca_caps_cur[MLX5_CAP_FLOW_TABLE], cap) + +#define MLX5_CAP_FLOWTABLE_MAX(mdev, cap) \ + MLX5_GET(flow_table_nic_cap, mdev->hca_caps_max[MLX5_CAP_FLOW_TABLE], cap) + +#define MLX5_CAP_ODP(mdev, cap)\ + MLX5_GET(odp_cap, mdev->hca_caps_cur[MLX5_CAP_ODP], cap) + #endif /* MLX5_DEVICE_H */ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 3fd4fdc1ba16..6b9199163633 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -268,55 +268,7 @@ struct mlx5_cmd { struct mlx5_port_caps { int gid_table_len; int pkey_table_len; -}; - -struct mlx5_general_caps { - u8 log_max_eq; - u8 log_max_cq; - u8 log_max_qp; - u8 log_max_mkey; - u8 log_max_pd; - u8 log_max_srq; - u8 log_max_mrw_sz; - u8 log_max_bsf_list_size; - u8 log_max_klm_list_size; - u32 max_cqes; - int max_wqes; - u32 max_eqes; - u32 max_indirection; - int max_sq_desc_sz; - int max_rq_desc_sz; - int max_dc_sq_desc_sz; - u64 flags; - u16 stat_rate_support; - int log_max_msg; - int num_ports; - u8 log_max_ra_res_qp; - u8 log_max_ra_req_qp; - int max_srq_wqes; - int bf_reg_size; - int bf_regs_per_page; - struct mlx5_port_caps port[MLX5_MAX_PORTS]; - u8 ext_port_cap[MLX5_MAX_PORTS]; - int max_vf; - u32 reserved_lkey; - u8 local_ca_ack_delay; - u8 log_max_mcg; - u32 max_qp_mcg; - int 
min_page_sz; - int pd_cap; - u32 max_qp_counters; - u32 pkey_table_size; - u8 log_max_ra_req_dc; - u8 log_max_ra_res_dc; - u32 uar_sz; - u8 min_log_pg_sz; - u8 log_max_xrcd; - u16 log_uar_page_sz; -}; - -struct mlx5_caps { - struct mlx5_general_caps gen; + u8 ext_port_cap; }; struct mlx5_cmd_mailbox { @@ -521,7 +473,9 @@ struct mlx5_core_dev { u8 rev_id; char board_id[MLX5_BOARD_ID_LEN]; struct mlx5_cmd cmd; - struct mlx5_caps caps; + struct mlx5_port_caps port_caps[MLX5_MAX_PORTS]; + u32 hca_caps_cur[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)]; + u32 hca_caps_max[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)]; phys_addr_t iseg_base; struct mlx5_init_seg __iomem *iseg; void (*event) (struct mlx5_core_dev *dev, @@ -651,8 +605,8 @@ void mlx5_cmd_use_events(struct mlx5_core_dev *dev); void mlx5_cmd_use_polling(struct mlx5_core_dev *dev); int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr); int mlx5_cmd_status_to_err_v2(void *ptr); -int mlx5_core_get_caps(struct mlx5_core_dev *dev, struct mlx5_caps *caps, - u16 opmod); +int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type, + enum mlx5_cap_mode cap_mode); int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size, -- cgit v1.2.3 From adb0c9545bce6f1b1d563e988e6ee5531861d449 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 28 May 2015 22:28:42 +0300 Subject: net/mlx5_core: Implement access functions of ptys register fields Those registers will be used by the ethtool to set/get settings. Signed-off-by: Rana Shahout Signed-off-by: Saeed Mahameed Signed-off-by: Amir Vadai Signed-off-by: David S. 
Miller --- include/linux/mlx5/driver.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 6b9199163633..266d5498a270 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -504,6 +504,11 @@ enum { MLX5_COMP_EQ_SIZE = 1024, }; +enum { + MLX5_PTYS_IB = 1 << 0, + MLX5_PTYS_EN = 1 << 2, +}; + struct mlx5_db_pgdir { struct list_head list; DECLARE_BITMAP(bitmap, MLX5_DB_PER_PAGE); @@ -686,7 +691,16 @@ void mlx5_qp_debugfs_cleanup(struct mlx5_core_dev *dev); int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in, int size_in, void *data_out, int size_out, u16 reg_num, int arg, int write); + int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps); +int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys, + int ptys_size, int proto_mask); +int mlx5_query_port_proto_cap(struct mlx5_core_dev *dev, + u32 *proto_cap, int proto_mask); +int mlx5_query_port_proto_admin(struct mlx5_core_dev *dev, + u32 *proto_admin, int proto_mask); +int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin, + int proto_mask); int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq); void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq); -- cgit v1.2.3 From 4c916a798058c1acf5a980438416020932c24aca Mon Sep 17 00:00:00 2001 From: Rana Shahout Date: Thu, 28 May 2015 22:28:43 +0300 Subject: net/mlx5_core: Implement get/set port status Implement get/set port status low level functions to be exposed by the netdev. Signed-off-by: Rana Shahout Signed-off-by: Saeed Mahameed Signed-off-by: Amir Vadai Signed-off-by: David S. 
Miller --- include/linux/mlx5/driver.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 266d5498a270..6438444ab361 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -149,6 +149,11 @@ enum mlx5_dev_event { MLX5_DEV_EVENT_CLIENT_REREG, }; +enum mlx5_port_status { + MLX5_PORT_UP = 1 << 1, + MLX5_PORT_DOWN = 1 << 2, +}; + struct mlx5_uuar_info { struct mlx5_uar *uars; int num_uars; @@ -701,6 +706,9 @@ int mlx5_query_port_proto_admin(struct mlx5_core_dev *dev, u32 *proto_admin, int proto_mask); int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin, int proto_mask); +int mlx5_set_port_status(struct mlx5_core_dev *dev, + enum mlx5_port_status status); +int mlx5_query_port_status(struct mlx5_core_dev *dev, u8 *status); int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq); void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq); -- cgit v1.2.3 From 90b3e38d048f09b22fb50bcd460cea65fd00b2d7 Mon Sep 17 00:00:00 2001 From: Rana Shahout Date: Thu, 28 May 2015 22:28:44 +0300 Subject: net/mlx5_core: Modify CQ moderation parameters Introduce mlx5_core_modify_cq_moderation() to be used by the netdev, to set hardware coalescing. Signed-off-by: Rana Shahout Signed-off-by: Saeed Mahameed Signed-off-by: Amir Vadai Signed-off-by: David S. 
Miller --- include/linux/mlx5/cq.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 2695ced222df..abc4767695e4 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -169,6 +169,9 @@ int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, struct mlx5_query_cq_mbox_out *out); int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, struct mlx5_modify_cq_mbox_in *in, int in_sz); +int mlx5_core_modify_cq_moderation(struct mlx5_core_dev *dev, + struct mlx5_core_cq *cq, u16 cq_period, + u16 cq_max_count); int mlx5_debug_cq_add(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); void mlx5_debug_cq_remove(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); -- cgit v1.2.3 From e725440e75da8c4d617a31c4e38216acc55c24e3 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 28 May 2015 22:28:45 +0300 Subject: net/mlx5_core: Set/Query port MTU commands Introduce set/Query low level functions to access MTU in hardware. To be used by the netdev. Signed-off-by: Saeed Mahameed Signed-off-by: Amir Vadai Signed-off-by: David S. 
Miller --- include/linux/mlx5/driver.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 6438444ab361..51738472657e 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -710,6 +710,10 @@ int mlx5_set_port_status(struct mlx5_core_dev *dev, enum mlx5_port_status status); int mlx5_query_port_status(struct mlx5_core_dev *dev, u8 *status); +int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu); +int mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu); +int mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu); + int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq); void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq); int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq, -- cgit v1.2.3 From afb736e9330ad6b2b6935d2f53ded784eb73f12d Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Thu, 28 May 2015 22:28:47 +0300 Subject: net/mlx5: Ethernet resource handling files This patch contains the resource handling files: - flow_table.c: This file contains the code to handle the low level API to configure hardware flow table. It is separated from the flow_table_en.c, because it will be used in the future by Raw Ethernet QP in mlx5_ib too. - en_flow_table.[ch]: Ethernet flow steering handling. The flow table object contain a mapping between flow specs and TIRs. This mechanism will be used also to configure e-switch in the future, when SR-IOV support will be added. - transobj.[ch] - Low level functions to create/modify/destroy the transport objects: RQ/SQ/TIR/TIS - vport.[ch] - Handle attributes of a virtual port (vPort) in the embedded switch. Currently this switch is a passthrough, until SR-IOV support will be added. Signed-off-by: Amir Vadai Signed-off-by: David S. 
Miller --- include/linux/mlx5/flow_table.h | 54 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 include/linux/mlx5/flow_table.h (limited to 'include/linux') diff --git a/include/linux/mlx5/flow_table.h b/include/linux/mlx5/flow_table.h new file mode 100644 index 000000000000..5f922c6d4fc2 --- /dev/null +++ b/include/linux/mlx5/flow_table.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef MLX5_FLOW_TABLE_H +#define MLX5_FLOW_TABLE_H + +#include + +struct mlx5_flow_table_group { + u8 log_sz; + u8 match_criteria_enable; + u32 match_criteria[MLX5_ST_SZ_DW(fte_match_param)]; +}; + +void *mlx5_create_flow_table(struct mlx5_core_dev *dev, u8 level, u8 table_type, + u16 num_groups, + struct mlx5_flow_table_group *group); +void mlx5_destroy_flow_table(void *flow_table); +int mlx5_add_flow_table_entry(void *flow_table, u8 match_criteria_enable, + void *match_criteria, void *flow_context, + u32 *flow_index); +void mlx5_del_flow_table_entry(void *flow_table, u32 flow_index); +u32 mlx5_get_flow_table_id(void *flow_table); + +#endif /* MLX5_FLOW_TABLE_H */ -- cgit v1.2.3 From f62b8bb8f2d30582f30f51e85a8c0e1260125d7e Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Thu, 28 May 2015 22:28:48 +0300 Subject: net/mlx5: Extend mlx5_core to support ConnectX-4 Ethernet functionality This is the Ethernet part of the driver for the Mellanox ConnectX(R)-4 Single/Dual-Port Adapter supporting 100Gb/s with VPI. The driver extends the existing mlx5 driver with Ethernet functionality. This patch contains the driver entry points but does not include transmit and receive (see the previous patch in the series) routines. It also adds the option MLX5_CORE_EN to Kconfig to enable/disable the Ethernet functionality. Currently, Kconfig is programmed to make Ethernet and Infiniband functionality mutually exclusive. Also changed MLX5_INFINIBAND to be dependent on MLX5_CORE instead of selecting it, since MLX5_CORE could be selected without MLX5_INFINIBAND being selected. Signed-off-by: Amir Vadai Signed-off-by: David S. 
Miller --- include/linux/mlx5/device.h | 19 +++++++++++++++++++ include/linux/mlx5/driver.h | 1 + 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 4ee52bf1f959..b288c538347a 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1153,4 +1153,23 @@ enum mlx5_cap_type { #define MLX5_CAP_ODP(mdev, cap)\ MLX5_GET(odp_cap, mdev->hca_caps_cur[MLX5_CAP_ODP], cap) +enum { + MLX5_CMD_STAT_OK = 0x0, + MLX5_CMD_STAT_INT_ERR = 0x1, + MLX5_CMD_STAT_BAD_OP_ERR = 0x2, + MLX5_CMD_STAT_BAD_PARAM_ERR = 0x3, + MLX5_CMD_STAT_BAD_SYS_STATE_ERR = 0x4, + MLX5_CMD_STAT_BAD_RES_ERR = 0x5, + MLX5_CMD_STAT_RES_BUSY = 0x6, + MLX5_CMD_STAT_LIM_ERR = 0x8, + MLX5_CMD_STAT_BAD_RES_STATE_ERR = 0x9, + MLX5_CMD_STAT_IX_ERR = 0xa, + MLX5_CMD_STAT_NO_RES_ERR = 0xf, + MLX5_CMD_STAT_BAD_INP_LEN_ERR = 0x50, + MLX5_CMD_STAT_BAD_OUTP_LEN_ERR = 0x51, + MLX5_CMD_STAT_BAD_QP_STATE_ERR = 0x10, + MLX5_CMD_STAT_BAD_PKT_ERR = 0x30, + MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR = 0x40, +}; + #endif /* MLX5_DEVICE_H */ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 51738472657e..7fa26f03acc1 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -489,6 +489,7 @@ struct mlx5_core_dev { struct mlx5_priv priv; struct mlx5_profile *profile; atomic_t num_qps; + u32 issi; }; struct mlx5_db { -- cgit v1.2.3 From 07d783fd830a49008f3b2764ae7b6033ee1bf329 Mon Sep 17 00:00:00 2001 From: Peter Senna Tschudin Date: Tue, 19 May 2015 11:44:46 +0200 Subject: staging: goldfish: Fix pointer cast for 32 bits As the first argument of gf_write64() was of type unsigned long, and as some calls to gf_write64() were casting the first argument from void * to u64 the compiler and/or sparse were printing warnings for casts of wrong sizes when compiling for i386. This patch changes the type of the first argument of gf_write64() to const void *, and update calls to the function. 
This change fixed the warnings and allowed to remove casts from 3 calls to gf_write64(). In addition gf_write64() was renamed to gf_write_ptr() as the name was misleading because it only writes 32 bits on 32 bit systems. gf_write_dma_addr() was added to handle dma_addr_t values which is used at drivers/staging/goldfish/goldfish_audio.c. Signed-off-by: Dan Carpenter Signed-off-by: Peter Senna Tschudin Signed-off-by: Greg Kroah-Hartman --- include/linux/goldfish.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/goldfish.h b/include/linux/goldfish.h index 569236e6b2bc..93e080b39cf6 100644 --- a/include/linux/goldfish.h +++ b/include/linux/goldfish.h @@ -3,13 +3,24 @@ /* Helpers for Goldfish virtual platform */ -static inline void gf_write64(unsigned long data, - void __iomem *portl, void __iomem *porth) +static inline void gf_write_ptr(const void *ptr, void __iomem *portl, + void __iomem *porth) { - writel((u32)data, portl); + writel((u32)(unsigned long)ptr, portl); #ifdef CONFIG_64BIT - writel(data>>32, porth); + writel((unsigned long)ptr >> 32, porth); #endif } +static inline void gf_write_dma_addr(const dma_addr_t addr, + void __iomem *portl, + void __iomem *porth) +{ + writel((u32)addr, portl); +#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT + writel(addr >> 32, porth); +#endif +} + + #endif /* __LINUX_GOLDFISH_H */ -- cgit v1.2.3 From b144ce2d37619e05afdb0a15676500d76a64b1be Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 27 May 2015 17:17:27 -0700 Subject: mei: fix up uuid matching A previous commit, c93b76b34b4d ("mei: bus: report also uuid in module alias") caused a build error as I missed applying a needed patch to add some macros to uapi/linux/uuid.h. Instead of those additional macros, change the mei code to use the existing uuid structure directly. 
Fixes: c93b76b34b4d Cc: Tomas Winkler Cc: Samuel Ortiz Reported-by: Stephen Rothwell Signed-off-by: Greg Kroah-Hartman --- include/linux/mod_devicetable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 2d2b2b571d61..048c270822f9 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -614,7 +614,7 @@ struct ipack_device_id { */ struct mei_cl_device_id { char name[MEI_CL_NAME_SIZE]; - __u8 uuid[16]; + uuid_le uuid; kernel_ulong_t driver_info; }; -- cgit v1.2.3 From c66fa19c405a36673d4aab13658c8246413d5c0f Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Sun, 31 May 2015 09:30:16 +0300 Subject: net/mlx4: Add EQ pool Previously, mlx4_en allocated EQs and used them exclusively. This affected RoCE performance, as applications which are events sensitive were limited to use only the legacy EQs. Change that by introducing an EQ pool. This pool is managed by mlx4_core. EQs are assigned to ports (when there are limited number of EQs, multiple ports could be assigned to the same EQs). An exception to this rule is the ASYNC EQ which handles various events. Legacy EQs are completely removed as all EQs could be shared. When a consumer (mlx4_ib/mlx4_en) requests an EQ, it asks for EQ serving on a specific port. The core driver calculates which EQ should be assigned to that request. Because IRQs are shared between IB and Ethernet modules, their names only include the PCI device BDF address. Signed-off-by: Matan Barak Signed-off-by: Ido Shamay Signed-off-by: Or Gerlitz Signed-off-by: David S. 
Miller --- include/linux/mlx4/device.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 83e80ab94500..ad31e476873f 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -46,8 +46,9 @@ #define MAX_MSIX_P_PORT 17 #define MAX_MSIX 64 -#define MSIX_LEGACY_SZ 4 #define MIN_MSIX_P_PORT 5 +#define MLX4_IS_LEGACY_EQ_MODE(dev_cap) ((dev_cap).num_comp_vectors < \ + (dev_cap).num_ports * MIN_MSIX_P_PORT) #define MLX4_MAX_100M_UNITS_VAL 255 /* * work around: can't set values @@ -528,7 +529,6 @@ struct mlx4_caps { int num_eqs; int reserved_eqs; int num_comp_vectors; - int comp_pool; int num_mpts; int max_fmr_maps; int num_mtts; @@ -1332,10 +1332,13 @@ void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr, int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr); int mlx4_SYNC_TPT(struct mlx4_dev *dev); int mlx4_test_interrupts(struct mlx4_dev *dev); -int mlx4_assign_eq(struct mlx4_dev *dev, char *name, struct cpu_rmap *rmap, - int *vector); +u32 mlx4_get_eqs_per_port(struct mlx4_dev *dev, u8 port); +bool mlx4_is_eq_vector_valid(struct mlx4_dev *dev, u8 port, int vector); +struct cpu_rmap *mlx4_get_cpu_rmap(struct mlx4_dev *dev, int port); +int mlx4_assign_eq(struct mlx4_dev *dev, u8 port, int *vector); void mlx4_release_eq(struct mlx4_dev *dev, int vec); +int mlx4_is_eq_shared(struct mlx4_dev *dev, int vector); int mlx4_eq_get_irq(struct mlx4_dev *dev, int vec); int mlx4_get_phys_port_id(struct mlx4_dev *dev); -- cgit v1.2.3 From f0e6a326deec8b51ee12f82a34057efd3d0979b8 Mon Sep 17 00:00:00 2001 From: Abhishek Bist Date: Wed, 27 May 2015 23:54:19 +0530 Subject: USB: hcd.h : Removed an unnecessary function prototype usb_find_interface_driver() This function is used to call in early version of linux kernel in order to find out the interface used by a usb device. But now it's use is completely abolished. 
So, it would be appropriate to remove this obsolete function from the mainline kernel. Signed-off-by: Abhishek Bist Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/hcd.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 68b1e836dff1..c9aa7792de10 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -622,8 +622,6 @@ extern struct list_head usb_bus_list; extern struct mutex usb_bus_list_lock; extern wait_queue_head_t usb_kill_urb_queue; -extern int usb_find_interface_driver(struct usb_device *dev, - struct usb_interface *interface); #define usb_endpoint_out(ep_dir) (!((ep_dir) & USB_DIR_IN)) -- cgit v1.2.3 From 138c3f03b017e261316a4f1ec793e1ff74516def Mon Sep 17 00:00:00 2001 From: Nikhil Badola Date: Tue, 26 May 2015 17:15:29 +0530 Subject: drivers:usb:fsl: Add support for USB controller version-2.5 Add support for USB controller version-2.5 used in T4240 rev2.0, T1024, T1040, T2080, LS1021A Signed-off-by: Nikhil Badola Signed-off-by: Greg Kroah-Hartman --- include/linux/fsl_devices.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h index a82296af413f..2a2f56b292c1 100644 --- a/include/linux/fsl_devices.h +++ b/include/linux/fsl_devices.h @@ -24,6 +24,7 @@ #define FSL_USB_VER_1_6 1 #define FSL_USB_VER_2_2 2 #define FSL_USB_VER_2_4 3 +#define FSL_USB_VER_2_5 4 #include -- cgit v1.2.3 From 34d2e4584ae594eff29d1595d47d7d044e57f834 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 26 May 2015 13:28:48 +0900 Subject: serial: 8250: include <linux/serial_reg.h> from serial_8250.h The header file, include/linux/serial_8250.h, contains references to UART_LSR_BRK_ERROR_BITS and UART_MSR_ANY_DELTA that are defined in <linux/serial_reg.h>.
Signed-off-by: Masahiro Yamada Reviewed-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_8250.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index f0c68d88b6f4..ba82c07feb95 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -12,6 +12,7 @@ #define _LINUX_SERIAL_8250_H #include +#include #include /* -- cgit v1.2.3 From abf2e7d6e2e315b32ee00067a69aaad2cf4e1b3f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 28 May 2015 19:26:02 -0700 Subject: bpf: add missing rcu protection when releasing programs from prog_array Normally the program attachment place (like sockets, qdiscs) takes care of rcu protection and calls bpf_prog_put() after a grace period. The programs stored inside prog_array may not be attached anywhere, so prog_array needs to take care of preserving rcu protection. Otherwise bpf_tail_call() will race with bpf_prog_put(). To solve that introduce bpf_prog_put_rcu() helper function and use it in 3 places where unattached program can decrement refcnt: closing program fd, deleting/replacing program in prog_array. Fixes: 04fd61ab36ec ("bpf: allow bpf programs to tail-call other bpf programs") Reported-by: Martin Schwidefsky Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8821b9a8689e..5f520f5f087e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -123,7 +123,10 @@ struct bpf_prog_aux { const struct bpf_verifier_ops *ops; struct bpf_map **used_maps; struct bpf_prog *prog; - struct work_struct work; + union { + struct work_struct work; + struct rcu_head rcu; + }; }; struct bpf_array { @@ -153,6 +156,7 @@ void bpf_register_map_type(struct bpf_map_type_list *tl); struct bpf_prog *bpf_prog_get(u32 ufd); void bpf_prog_put(struct bpf_prog *prog); +void bpf_prog_put_rcu(struct bpf_prog *prog); struct bpf_map *bpf_map_get(struct fd f); void bpf_map_put(struct bpf_map *map); -- cgit v1.2.3 From 3b369bd212d5cabb46cff0e863298971b382bbd6 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Thu, 28 May 2015 15:38:32 +0300 Subject: ieee802154: Fix generation of random EUI-64 addresses. Currently, ieee802154_random_extended_addr() has a 50% chance of generating a group (multicast) address, while this function is used for generating station addresses (which can't be group addresses) for interfaces that don't have a hardware-provided address. Also, in case get_random_bytes() generates the EUI-64 address 00:00:00:00:00:00:00:00 (extremely unlikely), which is an invalid address, ieee802154_random_extended_addr() reacts by changing it to 01:00:00:00:00:00:00:00, which is an invalid station address as well, as it is a group address. This patch changes the address generation procedure to grab eight random bytes, treat that as an EUI-64, and then clear the Group address bit and set the Locally Administered bit, which is in line with how eth_random_addr() generates random EUI-48s. 
Signed-off-by: Lennert Buytenhek Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/linux/ieee802154.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h index 8872ca103d06..552210d0a46f 100644 --- a/include/linux/ieee802154.h +++ b/include/linux/ieee802154.h @@ -244,9 +244,9 @@ static inline void ieee802154_random_extended_addr(__le64 *addr) { get_random_bytes(addr, IEEE802154_EXTENDED_ADDR_LEN); - /* toggle some bit if we hit an invalid extended addr */ - if (!ieee802154_is_valid_extended_addr(*addr)) - ((u8 *)addr)[IEEE802154_EXTENDED_ADDR_LEN - 1] ^= 0x01; + /* clear the group bit, and set the locally administered bit */ + ((u8 *)addr)[IEEE802154_EXTENDED_ADDR_LEN - 1] &= ~0x01; + ((u8 *)addr)[IEEE802154_EXTENDED_ADDR_LEN - 1] |= 0x02; } #endif /* LINUX_IEEE802154_H */ -- cgit v1.2.3 From daf4e2c89254ed6eb8cf7ef60f614edebfdb9f3a Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Thu, 28 May 2015 15:38:43 +0300 Subject: ieee802154: Fix EUI-64 station address validation. Refuse to allow setting an EUI-64 group address as an interface address, as those are not valid station addresses. 
Signed-off-by: Lennert Buytenhek Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/linux/ieee802154.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h index 552210d0a46f..1dc1f4ed4001 100644 --- a/include/linux/ieee802154.h +++ b/include/linux/ieee802154.h @@ -225,15 +225,13 @@ static inline bool ieee802154_is_valid_psdu_len(const u8 len) * ieee802154_is_valid_psdu_len - check if extended addr is valid * @addr: extended addr to check */ -static inline bool ieee802154_is_valid_extended_addr(const __le64 addr) +static inline bool ieee802154_is_valid_extended_unicast_addr(const __le64 addr) { - /* These EUI-64 addresses are reserved by IEEE. 0xffffffffffffffff - * is used internally as extended to short address broadcast mapping. - * This is currently a workaround because neighbor discovery can't - * deal with short addresses types right now. + /* Bail out if the address is all zero, or if the group + * address bit is set. */ return ((addr != cpu_to_le64(0x0000000000000000ULL)) && - (addr != cpu_to_le64(0xffffffffffffffffULL))); + !(addr & cpu_to_le64(0x0100000000000000ULL))); } /** -- cgit v1.2.3 From 1a1bc59c5f7657387d1a4b45d63248fed55ab88c Mon Sep 17 00:00:00 2001 From: Varka Bhadram Date: Fri, 29 May 2015 10:56:55 +0530 Subject: cc2520: fix CC2591 handling This patch changes the way the cc2591-cc2520 combination is handled by moving the amplified variable from platform data to private data. This will be useful in other sections like tx power support.
Signed-off-by: Varka Bhadram Cc: Brad Campbell Signed-off-by: Marcel Holtmann --- include/linux/spi/cc2520.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/cc2520.h b/include/linux/spi/cc2520.h index e741e8baad92..85b8ee67e937 100644 --- a/include/linux/spi/cc2520.h +++ b/include/linux/spi/cc2520.h @@ -21,7 +21,6 @@ struct cc2520_platform_data { int sfd; int reset; int vreg; - bool amplified; }; #endif -- cgit v1.2.3 From 6c4e5f9c9ff41ea997fd0f345b3b2b88c113eb68 Mon Sep 17 00:00:00 2001 From: Keith Mange Date: Tue, 26 May 2015 14:23:01 -0700 Subject: Drivers: hv: vmbus:Update preferred vmbus protocol version to windows 10. Add support for Windows 10. Signed-off-by: Keith Mange Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- include/linux/hyperv.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 3932a993ff5a..4317cd1b69ed 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -160,16 +160,18 @@ hv_get_ringbuffer_availbytes(struct hv_ring_buffer_info *rbi, * 1 . 1 (Windows 7) * 2 . 4 (Windows 8) * 3 . 0 (Windows 8 R2) + * 4 . 0 (Windows 10) */ #define VERSION_WS2008 ((0 << 16) | (13)) #define VERSION_WIN7 ((1 << 16) | (1)) #define VERSION_WIN8 ((2 << 16) | (4)) #define VERSION_WIN8_1 ((3 << 16) | (0)) +#define VERSION_WIN10 ((4 << 16) | (0)) #define VERSION_INVAL -1 -#define VERSION_CURRENT VERSION_WIN8_1 +#define VERSION_CURRENT VERSION_WIN10 /* Make maximum size of pipe payload of 16K */ #define MAX_PIPE_DATA_PAYLOAD (sizeof(u8) * 16384) -- cgit v1.2.3 From 6fa45a22689722dac9f0e90c0931d4b34b334ede Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Wed, 20 May 2015 20:56:57 +0530 Subject: parport: add device-model to parport subsystem parport subsystem starts using the device-model. 
Drivers using the device-model have to define devmodel as true and should register the device with parport using parport_register_dev_model(). Tested-by: Jean Delvare Tested-by: Alan Cox Signed-off-by: Sudip Mukherjee Signed-off-by: Greg Kroah-Hartman --- include/linux/parport.h | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/parport.h b/include/linux/parport.h index c22f12547324..58e3c64c6b49 100644 --- a/include/linux/parport.h +++ b/include/linux/parport.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -145,6 +146,8 @@ struct pardevice { unsigned int flags; struct pardevice *next; struct pardevice *prev; + struct device dev; + bool devmodel; struct parport_state *state; /* saved status over preemption */ wait_queue_head_t wait_q; unsigned long int time; @@ -156,6 +159,8 @@ struct pardevice { void * sysctl_table; }; +#define to_pardevice(n) container_of(n, struct pardevice, dev) + /* IEEE1284 information */ /* IEEE1284 phases. These are exposed to userland through ppdev IOCTL @@ -195,7 +200,7 @@ struct parport { * This may unfortulately be null if the * port has a legacy driver. */ - + struct device bus_dev; /* to link with the bus */ struct parport *physport; /* If this is a non-default mux parport, i.e. 
we're a clone of a real @@ -245,15 +250,26 @@ struct parport { struct parport *slaves[3]; }; +#define to_parport_dev(n) container_of(n, struct parport, bus_dev) + #define DEFAULT_SPIN_TIME 500 /* us */ struct parport_driver { const char *name; void (*attach) (struct parport *); void (*detach) (struct parport *); + void (*match_port)(struct parport *); + int (*probe)(struct pardevice *); + struct device_driver driver; + bool devmodel; struct list_head list; }; +#define to_parport_driver(n) container_of(n, struct parport_driver, driver) + +int parport_bus_init(void); +void parport_bus_exit(void); + /* parport_register_port registers a new parallel port at the given address (if one does not already exist) and returns a pointer to it. This entails claiming the I/O region, IRQ and DMA. NULL is returned @@ -272,10 +288,20 @@ void parport_announce_port (struct parport *port); extern void parport_remove_port(struct parport *port); /* Register a new high-level driver. */ -extern int parport_register_driver (struct parport_driver *); + +int __must_check __parport_register_driver(struct parport_driver *, + struct module *, + const char *mod_name); +/* + * parport_register_driver must be a macro so that KBUILD_MODNAME can + * be expanded + */ +#define parport_register_driver(driver) \ + __parport_register_driver(driver, THIS_MODULE, KBUILD_MODNAME) /* Unregister a high-level driver. */ extern void parport_unregister_driver (struct parport_driver *); +void parport_unregister_driver(struct parport_driver *); /* If parport_register_driver doesn't fit your needs, perhaps * parport_find_xxx does. */ @@ -288,6 +314,15 @@ extern irqreturn_t parport_irq_handler(int irq, void *dev_id); /* Reference counting for ports. 
*/ extern struct parport *parport_get_port (struct parport *); extern void parport_put_port (struct parport *); +void parport_del_port(struct parport *); + +struct pardev_cb { + int (*preempt)(void *); + void (*wakeup)(void *); + void *private; + void (*irq_func)(void *); + unsigned int flags; +}; /* parport_register_device declares that a device is connected to a port, and tells the kernel all it needs to know. @@ -301,6 +336,10 @@ struct pardevice *parport_register_device(struct parport *port, void (*irq_func)(void *), int flags, void *handle); +struct pardevice * +parport_register_dev_model(struct parport *port, const char *name, + const struct pardev_cb *par_dev_cb, int cnt); + /* parport_unregister unlinks a device from the chain. */ extern void parport_unregister_device(struct pardevice *dev); -- cgit v1.2.3 From b84b1d522f979fb53ad347605e24b2940fa2ad99 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 29 Apr 2015 08:57:34 +0200 Subject: scsi: Do not set cmd_per_lun to 1 in the host template '0' is now used as the default cmd_per_lun value, so there's no need to explicitly set it to '1' in the host template. 
Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: James Bottomley --- include/linux/libata.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 8dad4a307bb8..1402291aab5e 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -134,7 +134,6 @@ enum { ATA_ALL_DEVICES = (1 << ATA_MAX_DEVICES) - 1, ATA_SHT_EMULATED = 1, - ATA_SHT_CMD_PER_LUN = 1, ATA_SHT_THIS_ID = -1, ATA_SHT_USE_CLUSTERING = 1, @@ -1354,7 +1353,6 @@ extern struct device_attribute *ata_common_sdev_attrs[]; .can_queue = ATA_DEF_QUEUE, \ .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ .this_id = ATA_SHT_THIS_ID, \ - .cmd_per_lun = ATA_SHT_CMD_PER_LUN, \ .emulated = ATA_SHT_EMULATED, \ .use_clustering = ATA_SHT_USE_CLUSTERING, \ .proc_name = drv_name, \ -- cgit v1.2.3 From 1c4b1d73bacc546ba4e42f7eb4cb88c54139820b Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 27 May 2015 13:57:46 +0200 Subject: tty: move linux/gsmmux.h to uapi linux/gsmmux.h defines a user interface and therefore should be installed with other headers. 
Make the file include: * linux/if.h for IFNAMSIZ * linux/ioctl.h for _IO* macros Signed-off-by: Jiri Slaby Cc: Alan Cox Signed-off-by: Greg Kroah-Hartman --- include/linux/gsmmux.h | 36 ------------------------------------ 1 file changed, 36 deletions(-) delete mode 100644 include/linux/gsmmux.h (limited to 'include/linux') diff --git a/include/linux/gsmmux.h b/include/linux/gsmmux.h deleted file mode 100644 index c25e9477f7c3..000000000000 --- a/include/linux/gsmmux.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef _LINUX_GSMMUX_H -#define _LINUX_GSMMUX_H - -struct gsm_config -{ - unsigned int adaption; - unsigned int encapsulation; - unsigned int initiator; - unsigned int t1; - unsigned int t2; - unsigned int t3; - unsigned int n2; - unsigned int mru; - unsigned int mtu; - unsigned int k; - unsigned int i; - unsigned int unused[8]; /* Padding for expansion without - breaking stuff */ -}; - -#define GSMIOC_GETCONF _IOR('G', 0, struct gsm_config) -#define GSMIOC_SETCONF _IOW('G', 1, struct gsm_config) - -struct gsm_netconfig { - unsigned int adaption; /* Adaption to use in network mode */ - unsigned short protocol;/* Protocol to use - only ETH_P_IP supported */ - unsigned short unused2; - char if_name[IFNAMSIZ]; /* interface name format string */ - __u8 unused[28]; /* For future use */ -}; - -#define GSMIOC_ENABLE_NET _IOW('G', 2, struct gsm_netconfig) -#define GSMIOC_DISABLE_NET _IO('G', 3) - - -#endif -- cgit v1.2.3 From 1f656ff3fdddc2f59649cc84b633b799908f1f7b Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Sat, 30 May 2015 23:37:48 -0700 Subject: Drivers: hv: vmbus: Implement NUMA aware CPU affinity for channels Channels/sub-channels can be affinitized to VCPUs in the guest. Implement this affinity in a way that is NUMA aware. The current protocol distributed the primary channels uniformly across all available CPUs. 
The new protocol is NUMA aware: primary channels are distributed across the available NUMA nodes while the sub-channels within a primary channel are distributed amongst CPUs within the NUMA node assigned to the primary channel. Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- include/linux/hyperv.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 4317cd1b69ed..30d3a1f79450 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -696,6 +696,11 @@ struct vmbus_channel { u32 target_vp; /* The corresponding CPUID in the guest */ u32 target_cpu; + /* + * State to manage the CPU affiliation of channels. + */ + struct cpumask alloced_cpus_in_node; + int numa_node; /* * Support for sub-channels. For high performance devices, * it will be useful to have multiple sub-channels to support -- cgit v1.2.3 From 17ca8cbf49be3aa94bb1c2b7ee6545fd70094eb4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 29 May 2015 23:23:06 +0200 Subject: ebpf: allow bpf_ktime_get_ns_proto also for networking As this is already exported from tracing side via commit d9847d310ab4 ("tracing: Allow BPF programs to call bpf_ktime_get_ns()"), we might as well want to move it to the core, so also networking users can make use of it, e.g. to measure diffs for certain flows from ingress/egress. Signed-off-by: Daniel Borkmann Cc: Alexei Starovoitov Cc: Ingo Molnar Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5f520f5f087e..ca854e5bb2f7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -186,5 +186,6 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; +extern const struct bpf_func_proto bpf_ktime_get_ns_proto; #endif /* _LINUX_BPF_H */ -- cgit v1.2.3 From 887ee43477e4e327dbcd2aabc2d78a5116ed8a33 Mon Sep 17 00:00:00 2001 From: Beomho Seo Date: Thu, 30 Apr 2015 13:07:43 +0900 Subject: hwmon: (ntc_thermistor) Add support for ncpXXwf104 This patch adds support for the ntc thermistor NCPXXWF104 series. Cc: Jean Delvare Cc: Guenter Roeck Signed-off-by: Beomho Seo Signed-off-by: Guenter Roeck --- include/linux/platform_data/ntc_thermistor.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/ntc_thermistor.h b/include/linux/platform_data/ntc_thermistor.h index 0a6de4ca4930..aed170588b74 100644 --- a/include/linux/platform_data/ntc_thermistor.h +++ b/include/linux/platform_data/ntc_thermistor.h @@ -27,6 +27,7 @@ enum ntc_thermistor_type { TYPE_NCPXXWB473, TYPE_NCPXXWL333, TYPE_B57330V2103, + TYPE_NCPXXWF104, }; struct ntc_thermistor_platform_data { -- cgit v1.2.3 From dfa13ebbe3340e538b988f5608efd9ff2ca7fc35 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 7 May 2015 13:10:12 +0300 Subject: mmc: host: Add facility to support re-tuning Currently, there is core support for tuning during initialization. There can also be a need to re-tune periodically (e.g. sdhci) or to re-tune after the host controller is powered off (e.g. after PM runtime suspend / resume) or to re-tune in response to CRC errors. 
The main requirements for re-tuning are: - ability to enable / disable re-tuning - ability to flag that re-tuning is needed - ability to re-tune before any request - ability to hold off re-tuning if the card is busy - ability to hold off re-tuning if re-tuning is in progress - ability to run a re-tuning timer To support those requirements 7 members are added to struct mmc_host: unsigned int can_retune:1; /* re-tuning can be used */ unsigned int doing_retune:1; /* re-tuning in progress */ unsigned int retune_now:1; /* do re-tuning at next req */ int need_retune; /* re-tuning is needed */ int hold_retune; /* hold off re-tuning */ unsigned int retune_period; /* re-tuning period in secs */ struct timer_list retune_timer; /* for periodic re-tuning */ need_retune is an integer so it can be set without needing synchronization. hold_retune is a integer to allow nesting. Various simple functions are provided to set / clear those variables. Subsequent patches take those functions into use. Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson --- include/linux/mmc/host.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index b5bedaec6223..f471193ef6d6 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -321,10 +322,18 @@ struct mmc_host { #ifdef CONFIG_MMC_DEBUG unsigned int removed:1; /* host is being removed */ #endif + unsigned int can_retune:1; /* re-tuning can be used */ + unsigned int doing_retune:1; /* re-tuning in progress */ + unsigned int retune_now:1; /* do re-tuning at next req */ int rescan_disable; /* disable card detection */ int rescan_entered; /* used with nonremovable devices */ + int need_retune; /* re-tuning is needed */ + int hold_retune; /* hold off re-tuning */ + unsigned int retune_period; /* re-tuning period in secs */ + struct timer_list retune_timer; 
/* for periodic re-tuning */ + bool trigger_card_event; /* card_event necessary */ struct mmc_card *card; /* device attached to this host */ @@ -513,4 +522,18 @@ static inline bool mmc_card_hs400(struct mmc_card *card) return card->host->ios.timing == MMC_TIMING_MMC_HS400; } +void mmc_retune_timer_stop(struct mmc_host *host); + +static inline void mmc_retune_needed(struct mmc_host *host) +{ + if (host->can_retune) + host->need_retune = 1; +} + +static inline void mmc_retune_recheck(struct mmc_host *host) +{ + if (host->hold_retune <= 1) + host->retune_now = 1; +} + #endif /* LINUX_MMC_HOST_H */ -- cgit v1.2.3 From 9f6e0bff2afb52a4c29f5ca8a4db01810357974e Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Wed, 6 May 2015 20:31:19 +0200 Subject: mmc: Add support for disabling write-protect detection It is not uncommon to see systems where there is no physical write-protect signal (e.g. when using eMMC or microSD card slots). For some controllers, which have a dedicated write-protection detection logic (like SDHCI controllers), the get_ro() callback can return bogus data in such a case. Instead of handling this on a per controller basis this patch adds a new capability flag to the MMC core that can be set to specify that the result of get_ro() is invalid. When the flag is set the core will not call get_ro() and assume that the card is always read-write. 
Signed-off-by: Lars-Peter Clausen Signed-off-by: Ulf Hansson --- include/linux/mmc/host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index f471193ef6d6..433eccb50838 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -286,6 +286,7 @@ struct mmc_host { MMC_CAP2_HS400_1_2V) #define MMC_CAP2_HSX00_1_2V (MMC_CAP2_HS200_1_2V_SDR | MMC_CAP2_HS400_1_2V) #define MMC_CAP2_SDIO_IRQ_NOTHREAD (1 << 17) +#define MMC_CAP2_NO_WRITE_PROTECT (1 << 18) /* No physical write protect pin, assume that card is always read-write */ mmc_pm_flag_t pm_caps; /* supported pm features */ -- cgit v1.2.3 From eff8f2f5df1c509c873cdc70c84eb2ee75b41e65 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Wed, 6 May 2015 20:31:22 +0200 Subject: mmc: dw_mmc: Use core to handle absent write protect line Use the new MMC_CAP2_NO_WRITE_PROTECT to let the core handle the case where no write protect line is present instead of having custom driver code to handle it. dw_mci_of_get_slot_quirks() is slightly refactored to directly modify the mmc_host capabilities instead of returning a quirk mask. 
Signed-off-by: Lars-Peter Clausen Signed-off-by: Jaehoon Chung Signed-off-by: Ulf Hansson --- include/linux/mmc/dw_mmc.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h index 12111993a317..5be97676f1fa 100644 --- a/include/linux/mmc/dw_mmc.h +++ b/include/linux/mmc/dw_mmc.h @@ -226,12 +226,6 @@ struct dw_mci_dma_ops { #define DW_MCI_QUIRK_HIGHSPEED BIT(2) /* Unreliable card detection */ #define DW_MCI_QUIRK_BROKEN_CARD_DETECTION BIT(3) -/* No write protect */ -#define DW_MCI_QUIRK_NO_WRITE_PROTECT BIT(4) - -/* Slot level quirks */ -/* This slot has no write protect */ -#define DW_MCI_SLOT_QUIRK_NO_WRITE_PROTECT BIT(0) struct dma_pdata; -- cgit v1.2.3 From b4f30a174e1fda8118eda038b5d8d5260db36ad5 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 6 Feb 2015 14:12:52 +0200 Subject: mmc: core: Allow card drive strength to be different to host Initialization of UHS-I modes for SD and SDIO cards employs a callback to allow the host driver to choose a drive strength value. Currently that assumes the card drive strength and host driver type must be the same value. Change to let the callback make that decision and return both the card drive strength and host driver type. 
Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson --- include/linux/mmc/host.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 433eccb50838..da33d18c66c8 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -132,7 +132,8 @@ struct mmc_host_ops { /* Prepare HS400 target operating frequency depending host driver */ int (*prepare_hs400_tuning)(struct mmc_host *host, struct mmc_ios *ios); - int (*select_drive_strength)(unsigned int max_dtr, int host_drv, int card_drv); + int (*select_drive_strength)(unsigned int max_dtr, int host_drv, + int card_drv, int *drv_type); void (*hw_reset)(struct mmc_host *host); void (*card_event)(struct mmc_host *host); -- cgit v1.2.3 From f168359efbb99d6f8591bb666d6510bb78df2d07 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 6 Feb 2015 14:12:54 +0200 Subject: mmc: core: Add 'card' to drive strength selection callback In preparation for supporting also eMMC drive strength, add the 'card' as a parameter so that the callback can distinguish different types of cards if necessary. 
Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson --- include/linux/mmc/host.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index da33d18c66c8..1369e54faeb7 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -132,7 +132,8 @@ struct mmc_host_ops { /* Prepare HS400 target operating frequency depending host driver */ int (*prepare_hs400_tuning)(struct mmc_host *host, struct mmc_ios *ios); - int (*select_drive_strength)(unsigned int max_dtr, int host_drv, + int (*select_drive_strength)(struct mmc_card *card, + unsigned int max_dtr, int host_drv, int card_drv, int *drv_type); void (*hw_reset)(struct mmc_host *host); void (*card_event)(struct mmc_host *host); -- cgit v1.2.3 From 3853a042325e8f497c199020979c4fc824528c6e Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 6 Feb 2015 14:12:56 +0200 Subject: mmc: core: Record card drive strength In preparation for adding drive strength support for eMMC, add drive_strength to struct mmc_card to record the card drive strength for UHS-I modes and HS200 / HS400. For eMMC this will be needed when switching between HS200 and HS400. 
Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson --- include/linux/mmc/card.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 19f0175c0afa..2f073d555793 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -305,6 +305,7 @@ struct mmc_card { unsigned int sd_bus_speed; /* Bus Speed Mode set for the card */ unsigned int mmc_avail_type; /* supported device type by both host and card */ + unsigned int drive_strength; /* for UHS-I, HS200 or HS400 */ struct dentry *debugfs_root; struct mmc_part part[MMC_NUM_PHY_PARTITION]; /* physical partitions */ -- cgit v1.2.3 From b097e07f57930eda774c83aa46e8e401686d01dc Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 6 Feb 2015 14:12:57 +0200 Subject: mmc: mmc: Read card's valid driver strength mask In preparation for supporting drive strength selection for eMMC, read the card's valid driver strengths. Note that though the SD spec uses the term "drive strength", the JEDEC eMMC spec uses the term "driver strength". 
Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson --- include/linux/mmc/card.h | 1 + include/linux/mmc/mmc.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 2f073d555793..4d3776d25925 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -97,6 +97,7 @@ struct mmc_ext_csd { u8 raw_erased_mem_count; /* 181 */ u8 raw_ext_csd_structure; /* 194 */ u8 raw_card_type; /* 196 */ + u8 raw_driver_strength; /* 197 */ u8 out_of_int_time; /* 198 */ u8 raw_pwr_cl_52_195; /* 200 */ u8 raw_pwr_cl_26_195; /* 201 */ diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h index 124f562118b8..4819cfbc3795 100644 --- a/include/linux/mmc/mmc.h +++ b/include/linux/mmc/mmc.h @@ -302,6 +302,7 @@ struct _mmc_csd { #define EXT_CSD_REV 192 /* RO */ #define EXT_CSD_STRUCTURE 194 /* RO */ #define EXT_CSD_CARD_TYPE 196 /* RO */ +#define EXT_CSD_DRIVER_STRENGTH 197 /* RO */ #define EXT_CSD_OUT_OF_INTERRUPT_TIME 198 /* RO */ #define EXT_CSD_PART_SWITCH_TIME 199 /* RO */ #define EXT_CSD_PWR_CL_52_195 200 /* RO */ -- cgit v1.2.3 From cc4f414c885cd04f7227ad9bcd6b18fd78d718d9 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 6 Feb 2015 14:12:58 +0200 Subject: mmc: mmc: Add driver strength selection Add the ability to set eMMC driver strength for HS200 and HS400. 
Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson --- include/linux/mmc/mmc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h index 4819cfbc3795..15f2c4a0a62c 100644 --- a/include/linux/mmc/mmc.h +++ b/include/linux/mmc/mmc.h @@ -391,6 +391,7 @@ struct _mmc_csd { #define EXT_CSD_TIMING_HS 1 /* High speed */ #define EXT_CSD_TIMING_HS200 2 /* HS200 */ #define EXT_CSD_TIMING_HS400 3 /* HS400 */ +#define EXT_CSD_DRV_STR_SHIFT 4 /* Driver Strength shift */ #define EXT_CSD_SEC_ER_EN BIT(0) #define EXT_CSD_SEC_BD_BLK_EN BIT(2) @@ -442,4 +443,6 @@ struct _mmc_csd { #define MMC_SWITCH_MODE_CLEAR_BITS 0x02 /* Clear bits which are 1 in value */ #define MMC_SWITCH_MODE_WRITE_BYTE 0x03 /* Set target to value */ +#define mmc_driver_type_mask(n) (1 << (n)) + #endif /* LINUX_MMC_MMC_H */ -- cgit v1.2.3 From e1bfad6d936d7149a83423e2a7244dd5771f27e7 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 6 Feb 2015 14:13:00 +0200 Subject: mmc: sdhci-pci: Add support for drive strength selection for SPT Implement the select_drive_strength callback to provide drive strength selection for Intel SPT. 
Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson --- include/linux/mmc/sdhci-pci-data.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmc/sdhci-pci-data.h b/include/linux/mmc/sdhci-pci-data.h index 8959604a13d3..fda15b6d4135 100644 --- a/include/linux/mmc/sdhci-pci-data.h +++ b/include/linux/mmc/sdhci-pci-data.h @@ -15,4 +15,6 @@ struct sdhci_pci_data { extern struct sdhci_pci_data *(*sdhci_pci_get_data)(struct pci_dev *pdev, int slotno); +extern int sdhci_pci_spt_drive_strength; + #endif -- cgit v1.2.3 From 225d59adf1c899176cce0fc80e42b1d1c12f109f Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Fri, 29 May 2015 18:14:21 +0200 Subject: iio: Specify supported modes for buffers For each buffer type specify the supported device modes for this buffer. This allows us for devices which support multiple different operating modes to pick the correct operating mode based on the modes supported by the attached buffers. It also prevents that buffers with conflicting modes are attached to a device at the same time or that a buffer with a non-supported mode is attached to a device (e.g. in-kernel callback buffer to a device only supporting hardware mode). Signed-off-by: Lars-Peter Clausen Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h index eb8622b78ec9..1600c55828e0 100644 --- a/include/linux/iio/buffer.h +++ b/include/linux/iio/buffer.h @@ -29,6 +29,7 @@ struct iio_buffer; * @set_length: set number of datums in buffer * @release: called when the last reference to the buffer is dropped, * should free all resources allocated by the buffer. 
+ * @modes: Supported operating modes by this buffer type * * The purpose of this structure is to make the buffer element * modular as event for a given driver, different usecases may require @@ -51,6 +52,8 @@ struct iio_buffer_access_funcs { int (*set_length)(struct iio_buffer *buffer, int length); void (*release)(struct iio_buffer *buffer); + + unsigned int modes; }; /** -- cgit v1.2.3 From 04fba7864ffcceae8a5f78d88ae1fd8d682a5123 Mon Sep 17 00:00:00 2001 From: Goffredo Baroncelli Date: Sat, 30 May 2015 11:00:26 +0200 Subject: HID: Export hid_field_extract() Rename the function extract() to hid_field_extract(), make it external linkage to allow the use from other modules. Suggested-by: Jiri Kosina Signed-off-by: Goffredo Baroncelli Reviewed-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- include/linux/hid.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 176b43670e5d..f17980de2662 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -815,6 +815,8 @@ void hid_disconnect(struct hid_device *hid); const struct hid_device_id *hid_match_id(struct hid_device *hdev, const struct hid_device_id *id); s32 hid_snto32(__u32 value, unsigned n); +__u32 hid_field_extract(const struct hid_device *hid, __u8 *report, + unsigned offset, unsigned n); /** * hid_device_io_start - enable HID input during probe, remove -- cgit v1.2.3 From 3fff99bc4e926d9602a7d6e8c008a0175a099ce4 Mon Sep 17 00:00:00 2001 From: Rojhalat Ibrahim Date: Wed, 13 May 2015 11:04:56 +0200 Subject: gpiolib: rename gpiod_set_array to gpiod_set_array_value There have been concerns that the function names gpiod_set_array() and gpiod_get_array() might be confusing to users. One might expect gpiod_get_array() to return array values, while it is actually the array counterpart of gpiod_get(). To be consistent with the single descriptor API we could rename gpiod_set_array() to gpiod_set_array_value(). 
This makes some function names a bit lengthy: gpiod_set_raw_array_value_cansleep(). Signed-off-by: Rojhalat Ibrahim Acked-by: Alexandre Courbot Signed-off-by: Linus Walleij --- include/linux/gpio/consumer.h | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 09a7fb0062a6..fd098169fe87 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -100,24 +100,25 @@ int gpiod_direction_output_raw(struct gpio_desc *desc, int value); /* Value get/set from non-sleeping context */ int gpiod_get_value(const struct gpio_desc *desc); void gpiod_set_value(struct gpio_desc *desc, int value); -void gpiod_set_array(unsigned int array_size, - struct gpio_desc **desc_array, int *value_array); +void gpiod_set_array_value(unsigned int array_size, + struct gpio_desc **desc_array, int *value_array); int gpiod_get_raw_value(const struct gpio_desc *desc); void gpiod_set_raw_value(struct gpio_desc *desc, int value); -void gpiod_set_raw_array(unsigned int array_size, - struct gpio_desc **desc_array, int *value_array); +void gpiod_set_raw_array_value(unsigned int array_size, + struct gpio_desc **desc_array, + int *value_array); /* Value get/set from sleeping context */ int gpiod_get_value_cansleep(const struct gpio_desc *desc); void gpiod_set_value_cansleep(struct gpio_desc *desc, int value); -void gpiod_set_array_cansleep(unsigned int array_size, - struct gpio_desc **desc_array, - int *value_array); +void gpiod_set_array_value_cansleep(unsigned int array_size, + struct gpio_desc **desc_array, + int *value_array); int gpiod_get_raw_value_cansleep(const struct gpio_desc *desc); void gpiod_set_raw_value_cansleep(struct gpio_desc *desc, int value); -void gpiod_set_raw_array_cansleep(unsigned int array_size, - struct gpio_desc **desc_array, - int *value_array); +void gpiod_set_raw_array_value_cansleep(unsigned int 
array_size, + struct gpio_desc **desc_array, + int *value_array); int gpiod_set_debounce(struct gpio_desc *desc, unsigned debounce); @@ -304,9 +305,9 @@ static inline void gpiod_set_value(struct gpio_desc *desc, int value) /* GPIO can never have been requested */ WARN_ON(1); } -static inline void gpiod_set_array(unsigned int array_size, - struct gpio_desc **desc_array, - int *value_array) +static inline void gpiod_set_array_value(unsigned int array_size, + struct gpio_desc **desc_array, + int *value_array) { /* GPIO can never have been requested */ WARN_ON(1); @@ -322,9 +323,9 @@ static inline void gpiod_set_raw_value(struct gpio_desc *desc, int value) /* GPIO can never have been requested */ WARN_ON(1); } -static inline void gpiod_set_raw_array(unsigned int array_size, - struct gpio_desc **desc_array, - int *value_array) +static inline void gpiod_set_raw_array_value(unsigned int array_size, + struct gpio_desc **desc_array, + int *value_array) { /* GPIO can never have been requested */ WARN_ON(1); @@ -341,7 +342,7 @@ static inline void gpiod_set_value_cansleep(struct gpio_desc *desc, int value) /* GPIO can never have been requested */ WARN_ON(1); } -static inline void gpiod_set_array_cansleep(unsigned int array_size, +static inline void gpiod_set_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, int *value_array) { @@ -360,7 +361,7 @@ static inline void gpiod_set_raw_value_cansleep(struct gpio_desc *desc, /* GPIO can never have been requested */ WARN_ON(1); } -static inline void gpiod_set_raw_array_cansleep(unsigned int array_size, +static inline void gpiod_set_raw_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, int *value_array) { -- cgit v1.2.3 From 5fbf65d5c9c0fd2e5c6c48d69ce34b1c5415f2fd Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 22 May 2015 15:19:37 +0900 Subject: pinctrl: remove useless const qualifier This "const" claims the get_function_groups callback never changes the given num_groups 
pointer. It is always true in C language, so not worth mentioning. Signed-off-by: Masahiro Yamada Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinmux.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinmux.h b/include/linux/pinctrl/pinmux.h index d3740fa7073f..ace60d775b20 100644 --- a/include/linux/pinctrl/pinmux.h +++ b/include/linux/pinctrl/pinmux.h @@ -69,7 +69,7 @@ struct pinmux_ops { int (*get_function_groups) (struct pinctrl_dev *pctldev, unsigned selector, const char * const **groups, - unsigned * const num_groups); + unsigned *num_groups); int (*set_mux) (struct pinctrl_dev *pctldev, unsigned func_selector, unsigned group_selector); int (*gpio_request_enable) (struct pinctrl_dev *pctldev, -- cgit v1.2.3 From b3da97ee581387cd42dafd76eb2ac23f2335cd92 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 22 May 2015 15:25:50 +0900 Subject: pinctrl: use "const struct ..." rather than "struct ... const" Only this member, pins, is defined as "struct ... const *", but the others in this struct, pctlops, pmxops, confops, etc. are defined as "const struct ... *". Swap the "struct pinctrl_pin_desc" and "const" for consistency. 
Signed-off-by: Masahiro Yamada Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinctrl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index 66e4697516de..9ba59fcba549 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -127,7 +127,7 @@ struct pinctrl_ops { */ struct pinctrl_desc { const char *name; - struct pinctrl_pin_desc const *pins; + const struct pinctrl_pin_desc *pins; unsigned int npins; const struct pinctrl_ops *pctlops; const struct pinmux_ops *pmxops; -- cgit v1.2.3 From 4af34b572a85c44c55491a10693535a79627c478 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 1 Jun 2015 11:04:26 +0200 Subject: drivers: soc: sunxi: Introduce SoC driver to map SRAMs The Allwinner SoCs have a handful of SRAM that can be either mapped to be accessible by devices or the CPU. That mapping is controlled by an SRAM controller, and that mapping might not be set by the bootloader, for example if the device wasn't used at all, or if we're using solutions like the U-Boot's Falcon Boot. We could also imagine changing this at runtime for example to change the mapping of these SRAMs to use them for suspend/resume or runtime memory rate change, if that ever happens. These use cases require some API in the kernel to control that mapping, exported through a drivers/soc driver. This driver also implements a debugfs file that shows the SRAM found in the system, the current mapping and the SRAM that have been claimed by some drivers in the kernel. 
Signed-off-by: Maxime Ripard Acked-by: Arnd Bergmann Acked-by: Hans de Goede Tested-by: Hans de Goede Signed-off-by: Arnd Bergmann --- include/linux/soc/sunxi/sunxi_sram.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 include/linux/soc/sunxi/sunxi_sram.h (limited to 'include/linux') diff --git a/include/linux/soc/sunxi/sunxi_sram.h b/include/linux/soc/sunxi/sunxi_sram.h new file mode 100644 index 000000000000..c5f663bba9c2 --- /dev/null +++ b/include/linux/soc/sunxi/sunxi_sram.h @@ -0,0 +1,19 @@ +/* + * Allwinner SoCs SRAM Controller Driver + * + * Copyright (C) 2015 Maxime Ripard + * + * Author: Maxime Ripard + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#ifndef _SUNXI_SRAM_H_ +#define _SUNXI_SRAM_H_ + +int sunxi_sram_claim(struct device *dev); +int sunxi_sram_release(struct device *dev); + +#endif /* _SUNXI_SRAM_H_ */ -- cgit v1.2.3 From f26cdc8536ad50fb802a0445f836b4f94ca09ae7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 1 Jun 2015 09:29:53 -0600 Subject: blk-mq: Shared tag enhancements Storage controllers may expose multiple block devices that share hardware resources managed by blk-mq. This patch enhances the shared tags so a low-level driver can access the shared resources not tied to the unshared h/w contexts. This way the LLD can dynamically add and delete disks and request queues without having to track all the request_queue hctx's to iterate outstanding tags. 
Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2056a99b92f8..37d1602c4f7a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -96,6 +96,7 @@ typedef void (exit_request_fn)(void *, struct request *, unsigned int, typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, bool); +typedef void (busy_tag_iter_fn)(struct request *, void *, bool); struct blk_mq_ops { /* @@ -182,6 +183,7 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *); struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, bool reserved); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); +struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags); enum { BLK_MQ_UNIQUE_TAG_BITS = 16, @@ -224,6 +226,8 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, void *priv); +void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, + void *priv); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); void blk_mq_freeze_queue_start(struct request_queue *q); -- cgit v1.2.3 From bdef7de4b8d9be4cf7bf5aea977f827310ab3ff0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 1 Jun 2015 14:56:09 -0700 Subject: net: Add priority to packet_offload objects. When we scan a packet for GRO processing, we want to see the most common packet types in the front of the offload_base list. So add a priority field so we can handle this properly. IPv4/IPv6 get the highest priority with the implicit zero priority field. Next comes ethernet with a priority of 10, and then we have the MPLS types with a priority of 15. 
Suggested-by: Eric Dumazet Suggested-by: Toshiaki Makita Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 51f8d2f5dc3f..6f5f71ff5169 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1997,6 +1997,7 @@ struct offload_callbacks { struct packet_offload { __be16 type; /* This is really htons(ether_type). */ + u16 priority; struct offload_callbacks callbacks; struct list_head list; }; -- cgit v1.2.3 From 66e5133f19e901a044fa5eaeeb6ecff4545839e5 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Mon, 1 Jun 2015 21:55:06 +0900 Subject: vlan: Add GRO support for non hardware accelerated vlan Currently packets with non-hardware-accelerated vlan cannot be handled by GRO. This causes low performance for 802.1ad and stacked vlan, as their vlan tags are currently not stripped by hardware. This patch adds GRO support for non-hardware-accelerated vlan and improves receive performance of them. Test Environment: vlan device (.1Q) on vlan device (.1ad) on ixgbe (82599) Result: - Before $ netperf -t TCP_STREAM -H 192.168.20.2 -l 60 Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 87380 16384 16384 60.00 5233.17 Rx side CPU usage: %usr %sys %irq %soft %idle 0.27 58.03 0.00 41.70 0.00 - After $ netperf -t TCP_STREAM -H 192.168.20.2 -l 60 Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 87380 16384 16384 60.00 7586.85 Rx side CPU usage: %usr %sys %irq %soft %idle 0.50 25.83 0.00 59.53 14.14 [ Register VLAN offloads with priority 10 -DaveM ] Signed-off-by: Toshiaki Makita Signed-off-by: David S. 
Miller --- include/linux/if_vlan.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index a40d29846ac2..67ce5bd3b56a 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -628,4 +628,24 @@ static inline netdev_features_t vlan_features_check(const struct sk_buff *skb, return features; } +/** + * compare_vlan_header - Compare two vlan headers + * @h1: Pointer to vlan header + * @h2: Pointer to vlan header + * + * Compare two vlan headers, returns 0 if equal. + * + * Please note that alignment of h1 & h2 are only guaranteed to be 16 bits. + */ +static inline unsigned long compare_vlan_header(const struct vlan_hdr *h1, + const struct vlan_hdr *h2) +{ +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) + return *(u32 *)h1 ^ *(u32 *)h2; +#else + return ((__force u32)h1->h_vlan_TCI ^ (__force u32)h2->h_vlan_TCI) | + ((__force u32)h1->h_vlan_encapsulated_proto ^ + (__force u32)h2->h_vlan_encapsulated_proto); +#endif +} #endif /* !(_LINUX_IF_VLAN_H_) */ -- cgit v1.2.3 From 3434d23b694e5cb6e44e966914563406c31c4053 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 21 May 2015 13:33:45 +0530 Subject: clockevents: Add helpers to check the state of a clockevent device Some clockevent drivers, once migrated to use per-state callbacks, need to check the state of the clockevent device in their callbacks or interrupt handler. Add accessor functions clockevent_state_*() to get this information. 
Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/04a717d490335c688dd7af899fbcede97e1bb8ee.1432192527.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 271fa4c8eb29..64214ad85af9 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -149,6 +149,32 @@ struct clock_event_device { struct module *owner; } ____cacheline_aligned; +/* Helpers to verify state of a clockevent device */ +static inline bool clockevent_state_detached(struct clock_event_device *dev) +{ + return dev->state == CLOCK_EVT_STATE_DETACHED; +} + +static inline bool clockevent_state_shutdown(struct clock_event_device *dev) +{ + return dev->state == CLOCK_EVT_STATE_SHUTDOWN; +} + +static inline bool clockevent_state_periodic(struct clock_event_device *dev) +{ + return dev->state == CLOCK_EVT_STATE_PERIODIC; +} + +static inline bool clockevent_state_oneshot(struct clock_event_device *dev) +{ + return dev->state == CLOCK_EVT_STATE_ONESHOT; +} + +static inline bool clockevent_state_oneshot_stopped(struct clock_event_device *dev) +{ + return dev->state == CLOCK_EVT_STATE_ONESHOT_STOPPED; +} + /* * Calculate a multiplication factor for scaled math, which is used to convert * nanoseconds based values to clock ticks: -- cgit v1.2.3 From a97e2d86a9b88ea9e9a280b594b80f0eec2c955b Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Sun, 31 May 2015 17:15:30 -0400 Subject: IB/core cleanup: Add const on args - device->process_mad The process_mad device function declares some parameters as "in". Make those parameters const and adjust the call tree under process_mad in the various drivers accordingly. 
Signed-off-by: Ira Weiny Reviewed-by: Hal Rosenstock Reviewed-by: Jason Gunthorpe Signed-off-by: Doug Ledford --- include/linux/mlx5/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 9a90e7523dc2..9ec7c93d6fa3 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -696,7 +696,7 @@ int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, u32 *mkey); int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn); int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn); -int mlx5_core_mad_ifc(struct mlx5_core_dev *dev, void *inb, void *outb, +int mlx5_core_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, u16 opmod, u8 port); void mlx5_pagealloc_init(struct mlx5_core_dev *dev); void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev); -- cgit v1.2.3 From 11f81becca04bb7d2826a9b65bb8d27b0a1bb543 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:15 -0400 Subject: page_writeback: revive cancel_dirty_page() in a restricted form cancel_dirty_page() had some issues and b9ea25152e56 ("page_writeback: clean up mess around cancel_dirty_page()") replaced it with account_page_cleaned() which makes the caller responsible for clearing the dirty bit; unfortunately, the planned changes for cgroup writeback support requires synchronization between dirty bit manipulation and stat updates. While we can open-code such synchronization in each account_page_cleaned() callsite, that's gonna be unnecessarily awkward and verbose. This patch revives cancel_dirty_page() but in a more restricted form. All it does is TestClearPageDirty() followed by account_page_cleaned() invocation if the page was dirty. This helper covers all account_page_cleaned() usages except for __delete_from_page_cache() which is a special case anyway and left alone. 
As this leaves no module user for account_page_cleaned(), EXPORT_SYMBOL() is dropped from it. This patch just revives cancel_dirty_page() as a trivial wrapper to replace equivalent usages and doesn't introduce any functional changes. Signed-off-by: Tejun Heo Cc: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0755b9fd03a7..a83cf3a6f78e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1215,6 +1215,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping); void account_page_cleaned(struct page *page, struct address_space *mapping); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); +void cancel_dirty_page(struct page *page); int clear_page_dirty_for_io(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); -- cgit v1.2.3 From c4843a7593a9df3ff5b1806084cefdfa81dd7c79 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 22 May 2015 17:13:16 -0400 Subject: memcg: add per cgroup dirty page accounting When modifying PG_Dirty on cached file pages, update the new MEM_CGROUP_STAT_DIRTY counter. This is done in the same places where global NR_FILE_DIRTY is managed. The new memcg stat is visible in the per memcg memory.stat cgroupfs file. The most recent past attempt at this was http://thread.gmane.org/gmane.linux.kernel.cgroups/8632 The new accounting supports future efforts to add per cgroup dirty page throttling and writeback. It also helps an administrator break down a container's memory usage and provides evidence to understand memcg oom kills (the new dirty count is included in memcg oom kill messages). The ability to move page accounting between memcg (memory.move_charge_at_immigrate) makes this accounting more complicated than the global counter. 
The existing mem_cgroup_{begin,end}_page_stat() lock is used to serialize move accounting with stat updates. Typical update operation: memcg = mem_cgroup_begin_page_stat(page) if (TestSetPageDirty()) { [...] mem_cgroup_update_page_stat(memcg) } mem_cgroup_end_page_stat(memcg) Summary of mem_cgroup_end_page_stat() overhead: - Without CONFIG_MEMCG it's a no-op - With CONFIG_MEMCG and no inter memcg task movement, it's just rcu_read_lock() - With CONFIG_MEMCG and inter memcg task movement, it's rcu_read_lock() + spin_lock_irqsave() A memcg parameter is added to several routines because their callers now grab mem_cgroup_begin_page_stat() which returns the memcg later needed by mem_cgroup_update_page_stat(). Because mem_cgroup_begin_page_stat() may disable interrupts, some adjustments are needed: - move __mark_inode_dirty() from __set_page_dirty() to its caller. __mark_inode_dirty() locking does not want interrupts disabled. - use spin_lock_irqsave(tree_lock) rather than spin_lock_irq() in __delete_from_page_cache(), replace_page_cache_page(), invalidate_complete_page2(), and __remove_mapping(). text data bss dec hex filename 8925147 1774832 1785856 12485835 be84cb vmlinux-!CONFIG_MEMCG-before 8925339 1774832 1785856 12486027 be858b vmlinux-!CONFIG_MEMCG-after +192 text bytes 8965977 1784992 1785856 12536825 bf4bf9 vmlinux-CONFIG_MEMCG-before 8966750 1784992 1785856 12537598 bf4efe vmlinux-CONFIG_MEMCG-after +773 text bytes Performance tests run on v4.0-rc1-36-g4f671fe2f952. Lower is better for all metrics, they're all wall clock or cycle counts. The read and write fault benchmarks just measure fault time, they do not include I/O time. 
* CONFIG_MEMCG not set: baseline patched kbuild 1m25.030000(+-0.088% 3 samples) 1m25.426667(+-0.120% 3 samples) dd write 100 MiB 0.859211561 +-15.10% 0.874162885 +-15.03% dd write 200 MiB 1.670653105 +-17.87% 1.669384764 +-11.99% dd write 1000 MiB 8.434691190 +-14.15% 8.474733215 +-14.77% read fault cycles 254.0(+-0.000% 10 samples) 253.0(+-0.000% 10 samples) write fault cycles 2021.2(+-3.070% 10 samples) 1984.5(+-1.036% 10 samples) * CONFIG_MEMCG=y root_memcg: baseline patched kbuild 1m25.716667(+-0.105% 3 samples) 1m25.686667(+-0.153% 3 samples) dd write 100 MiB 0.855650830 +-14.90% 0.887557919 +-14.90% dd write 200 MiB 1.688322953 +-12.72% 1.667682724 +-13.33% dd write 1000 MiB 8.418601605 +-14.30% 8.673532299 +-15.00% read fault cycles 266.0(+-0.000% 10 samples) 266.0(+-0.000% 10 samples) write fault cycles 2051.7(+-1.349% 10 samples) 2049.6(+-1.686% 10 samples) * CONFIG_MEMCG=y non-root_memcg: baseline patched kbuild 1m26.120000(+-0.273% 3 samples) 1m25.763333(+-0.127% 3 samples) dd write 100 MiB 0.861723964 +-15.25% 0.818129350 +-14.82% dd write 200 MiB 1.669887569 +-13.30% 1.698645885 +-13.27% dd write 1000 MiB 8.383191730 +-14.65% 8.351742280 +-14.52% read fault cycles 265.7(+-0.172% 10 samples) 267.0(+-0.000% 10 samples) write fault cycles 2070.6(+-1.512% 10 samples) 2084.4(+-2.148% 10 samples) As expected anon page faults are not affected by this patch. tj: Updated to apply on top of the recent cancel_dirty_page() changes. 
Signed-off-by: Sha Zhengju Signed-off-by: Greg Thelen Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/memcontrol.h | 1 + include/linux/mm.h | 6 ++++-- include/linux/pagemap.h | 3 ++- 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 72dff5fb0d0c..5fe6411b5e54 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -41,6 +41,7 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ + MEM_CGROUP_STAT_DIRTY, /* # of dirty pages in page cache */ MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, diff --git a/include/linux/mm.h b/include/linux/mm.h index a83cf3a6f78e..f48d979ced4b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1211,8 +1211,10 @@ int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); -void account_page_dirtied(struct page *page, struct address_space *mapping); -void account_page_cleaned(struct page *page, struct address_space *mapping); +void account_page_dirtied(struct page *page, struct address_space *mapping, + struct mem_cgroup *memcg); +void account_page_cleaned(struct page *page, struct address_space *mapping, + struct mem_cgroup *memcg); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); void cancel_dirty_page(struct page *page); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4b3736f7065c..fb0814ca65c7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -651,7 +651,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, 
int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void delete_from_page_cache(struct page *page); -extern void __delete_from_page_cache(struct page *page, void *shadow); +extern void __delete_from_page_cache(struct page *page, void *shadow, + struct mem_cgroup *memcg); int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); /* -- cgit v1.2.3 From eea8f41cc58849e354ecf8b95bd7f806e1d1f703 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:17 -0400 Subject: blkcg: move block/blk-cgroup.h to include/linux/blk-cgroup.h cgroup aware writeback support will require exposing some of blkcg details. In preparation, move block/blk-cgroup.h to include/linux/blk-cgroup.h. This patch is pure file move. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 603 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 603 insertions(+) create mode 100644 include/linux/blk-cgroup.h (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h new file mode 100644 index 000000000000..c567865b5f1d --- /dev/null +++ b/include/linux/blk-cgroup.h @@ -0,0 +1,603 @@ +#ifndef _BLK_CGROUP_H +#define _BLK_CGROUP_H +/* + * Common Block IO controller cgroup interface + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ + +#include +#include +#include +#include +#include +#include + +/* Max limits for throttle policy */ +#define THROTL_IOPS_MAX UINT_MAX + +/* CFQ specific, out here for blkcg->cfq_weight */ +#define CFQ_WEIGHT_MIN 10 +#define CFQ_WEIGHT_MAX 1000 +#define CFQ_WEIGHT_DEFAULT 500 + +#ifdef CONFIG_BLK_CGROUP + +enum blkg_rwstat_type { + BLKG_RWSTAT_READ, + BLKG_RWSTAT_WRITE, + BLKG_RWSTAT_SYNC, + BLKG_RWSTAT_ASYNC, + + BLKG_RWSTAT_NR, + 
BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, +}; + +struct blkcg_gq; + +struct blkcg { + struct cgroup_subsys_state css; + spinlock_t lock; + + struct radix_tree_root blkg_tree; + struct blkcg_gq *blkg_hint; + struct hlist_head blkg_list; + + /* TODO: per-policy storage in blkcg */ + unsigned int cfq_weight; /* belongs to cfq */ + unsigned int cfq_leaf_weight; +}; + +struct blkg_stat { + struct u64_stats_sync syncp; + uint64_t cnt; +}; + +struct blkg_rwstat { + struct u64_stats_sync syncp; + uint64_t cnt[BLKG_RWSTAT_NR]; +}; + +/* + * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a + * request_queue (q). This is used by blkcg policies which need to track + * information per blkcg - q pair. + * + * There can be multiple active blkcg policies and each has its private + * data on each blkg, the size of which is determined by + * blkcg_policy->pd_size. blkcg core allocates and frees such areas + * together with blkg and invokes pd_init/exit_fn() methods. + * + * Such private data must embed struct blkg_policy_data (pd) at the + * beginning and pd_size can't be smaller than pd. + */ +struct blkg_policy_data { + /* the blkg and policy id this per-policy data belongs to */ + struct blkcg_gq *blkg; + int plid; + + /* used during policy activation */ + struct list_head alloc_node; +}; + +/* association between a blk cgroup and a request queue */ +struct blkcg_gq { + /* Pointer to the associated request_queue */ + struct request_queue *q; + struct list_head q_node; + struct hlist_node blkcg_node; + struct blkcg *blkcg; + + /* all non-root blkcg_gq's are guaranteed to have access to parent */ + struct blkcg_gq *parent; + + /* request allocation list for this blkcg-q pair */ + struct request_list rl; + + /* reference count */ + atomic_t refcnt; + + /* is this blkg online? 
protected by both blkcg and q locks */ + bool online; + + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; + + struct rcu_head rcu_head; +}; + +typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); + +struct blkcg_policy { + int plid; + /* policy specific private data size */ + size_t pd_size; + /* cgroup files for the policy */ + struct cftype *cftypes; + + /* operations */ + blkcg_pol_init_pd_fn *pd_init_fn; + blkcg_pol_online_pd_fn *pd_online_fn; + blkcg_pol_offline_pd_fn *pd_offline_fn; + blkcg_pol_exit_pd_fn *pd_exit_fn; + blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; +}; + +extern struct blkcg blkcg_root; + +struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); +struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q); +int blkcg_init_queue(struct request_queue *q); +void blkcg_drain_queue(struct request_queue *q); +void blkcg_exit_queue(struct request_queue *q); + +/* Blkio controller policy registration */ +int blkcg_policy_register(struct blkcg_policy *pol); +void blkcg_policy_unregister(struct blkcg_policy *pol); +int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol); +void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol); + +void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, + u64 (*prfill)(struct seq_file *, + struct blkg_policy_data *, int), + const struct blkcg_policy *pol, int data, + bool show_total); +u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); +u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + const struct blkg_rwstat *rwstat); +u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int 
off); +u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + int off); + +u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, + int off); + +struct blkg_conf_ctx { + struct gendisk *disk; + struct blkcg_gq *blkg; + u64 v; +}; + +int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + const char *input, struct blkg_conf_ctx *ctx); +void blkg_conf_finish(struct blkg_conf_ctx *ctx); + + +static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct blkcg, css) : NULL; +} + +static inline struct blkcg *task_blkcg(struct task_struct *tsk) +{ + return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); +} + +static inline struct blkcg *bio_blkcg(struct bio *bio) +{ + if (bio && bio->bi_css) + return css_to_blkcg(bio->bi_css); + return task_blkcg(current); +} + +/** + * blkcg_parent - get the parent of a blkcg + * @blkcg: blkcg of interest + * + * Return the parent blkcg of @blkcg. Can be called anytime. + */ +static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) +{ + return css_to_blkcg(blkcg->css.parent); +} + +/** + * blkg_to_pdata - get policy private data + * @blkg: blkg of interest + * @pol: policy of interest + * + * Return pointer to private data associated with the @blkg-@pol pair. + */ +static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) +{ + return blkg ? blkg->pd[pol->plid] : NULL; +} + +/** + * pdata_to_blkg - get blkg associated with policy private data + * @pd: policy private data of interest + * + * @pd is policy private data. Determine the blkg it's associated with. + */ +static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) +{ + return pd ? 
pd->blkg : NULL; +} + +/** + * blkg_path - format cgroup path of blkg + * @blkg: blkg of interest + * @buf: target buffer + * @buflen: target buffer length + * + * Format the path of the cgroup of @blkg into @buf. + */ +static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) +{ + char *p; + + p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); + if (!p) { + strncpy(buf, "", buflen); + return -ENAMETOOLONG; + } + + memmove(buf, p, buf + buflen - p); + return 0; +} + +/** + * blkg_get - get a blkg reference + * @blkg: blkg to get + * + * The caller should be holding an existing reference. + */ +static inline void blkg_get(struct blkcg_gq *blkg) +{ + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); + atomic_inc(&blkg->refcnt); +} + +void __blkg_release_rcu(struct rcu_head *rcu); + +/** + * blkg_put - put a blkg reference + * @blkg: blkg to put + */ +static inline void blkg_put(struct blkcg_gq *blkg) +{ + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); + if (atomic_dec_and_test(&blkg->refcnt)) + call_rcu(&blkg->rcu_head, __blkg_release_rcu); +} + +struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, + bool update_hint); + +/** + * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU + * read locked. If called under either blkcg or queue lock, the iteration + * is guaranteed to include all and only online blkgs. The caller may + * update @pos_css by calling css_rightmost_descendant() to skip subtree. + * @p_blkg is included in the iteration and the first node to be visited. 
+ */ +#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q, false))) + +/** + * blkg_for_each_descendant_post - post-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Similar to blkg_for_each_descendant_pre() but performs post-order + * traversal instead. Synchronization rules are the same. @p_blkg is + * included in the iteration and the last node to be visited. + */ +#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q, false))) + +/** + * blk_get_rl - get request_list to use + * @q: request_queue of interest + * @bio: bio which will be attached to the allocated request (may be %NULL) + * + * The caller wants to allocate a request from @q to use for @bio. Find + * the request_list to use and obtain a reference on it. Should be called + * under queue_lock. This function is guaranteed to return non-%NULL + * request_list. + */ +static inline struct request_list *blk_get_rl(struct request_queue *q, + struct bio *bio) +{ + struct blkcg *blkcg; + struct blkcg_gq *blkg; + + rcu_read_lock(); + + blkcg = bio_blkcg(bio); + + /* bypass blkg lookup and use @q->root_rl directly for root */ + if (blkcg == &blkcg_root) + goto root_rl; + + /* + * Try to use blkg->rl. blkg lookup may fail under memory pressure + * or if either the blkcg or queue is going away. Fall back to + * root_rl in such cases. 
+ */ + blkg = blkg_lookup_create(blkcg, q); + if (unlikely(IS_ERR(blkg))) + goto root_rl; + + blkg_get(blkg); + rcu_read_unlock(); + return &blkg->rl; +root_rl: + rcu_read_unlock(); + return &q->root_rl; +} + +/** + * blk_put_rl - put request_list + * @rl: request_list to put + * + * Put the reference acquired by blk_get_rl(). Should be called under + * queue_lock. + */ +static inline void blk_put_rl(struct request_list *rl) +{ + /* root_rl may not have blkg set */ + if (rl->blkg && rl->blkg->blkcg != &blkcg_root) + blkg_put(rl->blkg); +} + +/** + * blk_rq_set_rl - associate a request with a request_list + * @rq: request of interest + * @rl: target request_list + * + * Associate @rq with @rl so that accounting and freeing can know the + * request_list @rq came from. + */ +static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) +{ + rq->rl = rl; +} + +/** + * blk_rq_rl - return the request_list a request came from + * @rq: request of interest + * + * Return the request_list @rq is allocated from. + */ +static inline struct request_list *blk_rq_rl(struct request *rq) +{ + return rq->rl; +} + +struct request_list *__blk_queue_next_rl(struct request_list *rl, + struct request_queue *q); +/** + * blk_queue_for_each_rl - iterate through all request_lists of a request_queue + * + * Should be used under queue_lock. + */ +#define blk_queue_for_each_rl(rl, q) \ + for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) + +static inline void blkg_stat_init(struct blkg_stat *stat) +{ + u64_stats_init(&stat->syncp); +} + +/** + * blkg_stat_add - add a value to a blkg_stat + * @stat: target blkg_stat + * @val: value to add + * + * Add @val to @stat. The caller is responsible for synchronizing calls to + * this function. 
+ */ +static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) +{ + u64_stats_update_begin(&stat->syncp); + stat->cnt += val; + u64_stats_update_end(&stat->syncp); +} + +/** + * blkg_stat_read - read the current value of a blkg_stat + * @stat: blkg_stat to read + * + * Read the current value of @stat. This function can be called without + * synchroniztion and takes care of u64 atomicity. + */ +static inline uint64_t blkg_stat_read(struct blkg_stat *stat) +{ + unsigned int start; + uint64_t v; + + do { + start = u64_stats_fetch_begin_irq(&stat->syncp); + v = stat->cnt; + } while (u64_stats_fetch_retry_irq(&stat->syncp, start)); + + return v; +} + +/** + * blkg_stat_reset - reset a blkg_stat + * @stat: blkg_stat to reset + */ +static inline void blkg_stat_reset(struct blkg_stat *stat) +{ + stat->cnt = 0; +} + +/** + * blkg_stat_merge - merge a blkg_stat into another + * @to: the destination blkg_stat + * @from: the source + * + * Add @from's count to @to. + */ +static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) +{ + blkg_stat_add(to, blkg_stat_read(from)); +} + +static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) +{ + u64_stats_init(&rwstat->syncp); +} + +/** + * blkg_rwstat_add - add a value to a blkg_rwstat + * @rwstat: target blkg_rwstat + * @rw: mask of REQ_{WRITE|SYNC} + * @val: value to add + * + * Add @val to @rwstat. The counters are chosen according to @rw. The + * caller is responsible for synchronizing calls to this function. 
+ */ +static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, + int rw, uint64_t val) +{ + u64_stats_update_begin(&rwstat->syncp); + + if (rw & REQ_WRITE) + rwstat->cnt[BLKG_RWSTAT_WRITE] += val; + else + rwstat->cnt[BLKG_RWSTAT_READ] += val; + if (rw & REQ_SYNC) + rwstat->cnt[BLKG_RWSTAT_SYNC] += val; + else + rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; + + u64_stats_update_end(&rwstat->syncp); +} + +/** + * blkg_rwstat_read - read the current values of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Read the current snapshot of @rwstat and return it as the return value. + * This function can be called without synchronization and takes care of + * u64 atomicity. + */ +static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) +{ + unsigned int start; + struct blkg_rwstat tmp; + + do { + start = u64_stats_fetch_begin_irq(&rwstat->syncp); + tmp = *rwstat; + } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start)); + + return tmp; +} + +/** + * blkg_rwstat_total - read the total count of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Return the total count of @rwstat regardless of the IO direction. This + * function can be called without synchronization and takes care of u64 + * atomicity. + */ +static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) +{ + struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); + + return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; +} + +/** + * blkg_rwstat_reset - reset a blkg_rwstat + * @rwstat: blkg_rwstat to reset + */ +static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) +{ + memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); +} + +/** + * blkg_rwstat_merge - merge a blkg_rwstat into another + * @to: the destination blkg_rwstat + * @from: the source + * + * Add @from's counts to @to. 
+ */ +static inline void blkg_rwstat_merge(struct blkg_rwstat *to, + struct blkg_rwstat *from) +{ + struct blkg_rwstat v = blkg_rwstat_read(from); + int i; + + u64_stats_update_begin(&to->syncp); + for (i = 0; i < BLKG_RWSTAT_NR; i++) + to->cnt[i] += v.cnt[i]; + u64_stats_update_end(&to->syncp); +} + +#else /* CONFIG_BLK_CGROUP */ + +struct cgroup; +struct blkcg; + +struct blkg_policy_data { +}; + +struct blkcg_gq { +}; + +struct blkcg_policy { +}; + +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } +static inline int blkcg_init_queue(struct request_queue *q) { return 0; } +static inline void blkcg_drain_queue(struct request_queue *q) { } +static inline void blkcg_exit_queue(struct request_queue *q) { } +static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } +static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } +static inline int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { return 0; } +static inline void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { } + +static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } + +static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) { return NULL; } +static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } +static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } +static inline void blkg_get(struct blkcg_gq *blkg) { } +static inline void blkg_put(struct blkcg_gq *blkg) { } + +static inline struct request_list *blk_get_rl(struct request_queue *q, + struct bio *bio) { return &q->root_rl; } +static inline void blk_put_rl(struct request_list *rl) { } +static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } +static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } + +#define blk_queue_for_each_rl(rl, q) \ + 
for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) + +#endif /* CONFIG_BLK_CGROUP */ +#endif /* _BLK_CGROUP_H */ -- cgit v1.2.3 From efa7d1c733d1d2c1a468b85126d70bad9fdf6ba8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:18 -0400 Subject: update !CONFIG_BLK_CGROUP dummies in include/linux/blk-cgroup.h The header file will be used more widely with the pending cgroup writeback support and the current set of dummy declarations aren't enough to handle different config combinations. Update as follows. * Drop the struct cgroup declaration. None of the dummy defs need it. * Define blkcg as an empty struct instead of just declaring it. * Wrap dummy function defs in CONFIG_BLOCK. Some functions use block data types and none of them are to be used w/o block enabled. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index c567865b5f1d..51f95b34d3f0 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -558,8 +558,8 @@ static inline void blkg_rwstat_merge(struct blkg_rwstat *to, #else /* CONFIG_BLK_CGROUP */ -struct cgroup; -struct blkcg; +struct blkcg { +}; struct blkg_policy_data { }; @@ -570,6 +570,8 @@ struct blkcg_gq { struct blkcg_policy { }; +#ifdef CONFIG_BLOCK + static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } static inline int blkcg_init_queue(struct request_queue *q) { return 0; } static inline void blkcg_drain_queue(struct request_queue *q) { } @@ -599,5 +601,6 @@ static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) +#endif /* CONFIG_BLOCK */ #endif /* CONFIG_BLK_CGROUP */ #endif /* _BLK_CGROUP_H */ -- cgit v1.2.3 From 56161634e4824380a67243a4cf3fa52eb1e5d836 Mon Sep 17 00:00:00 2001 
From: Tejun Heo Date: Fri, 22 May 2015 17:13:20 -0400 Subject: memcg: add mem_cgroup_root_css Add global mem_cgroup_root_css which points to the root memcg css. This will be used by cgroup writeback support. If memcg is disabled, it's defined as ERR_PTR(-EINVAL). Signed-off-by: Tejun Heo Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Jens Axboe --- include/linux/memcontrol.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5fe6411b5e54..294498f4f6fc 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -68,6 +68,8 @@ enum mem_cgroup_events_index { }; #ifdef CONFIG_MEMCG +extern struct cgroup_subsys_state *mem_cgroup_root_css; + void mem_cgroup_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx, unsigned int nr); @@ -196,6 +198,8 @@ void mem_cgroup_split_huge_fixup(struct page *head); #else /* CONFIG_MEMCG */ struct mem_cgroup; +#define mem_cgroup_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) + static inline void mem_cgroup_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx, unsigned int nr) -- cgit v1.2.3 From 496d5e7560dbb84399dbd92316fc33857aa83900 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:21 -0400 Subject: blkcg: add blkcg_root_css Add global constant blkcg_root_css which points to &blkcg_root.css. This will be used by cgroup writeback support. If blkcg is disabled, it's defined as ERR_PTR(-EINVAL). v2: The declarations moved to include/linux/blk-cgroup.h as suggested by Vivek. 
Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Jens Axboe Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 51f95b34d3f0..65f0c178fd04 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -134,6 +134,7 @@ struct blkcg_policy { }; extern struct blkcg blkcg_root; +extern struct cgroup_subsys_state * const blkcg_root_css; struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, @@ -570,6 +571,8 @@ struct blkcg_gq { struct blkcg_policy { }; +#define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) + #ifdef CONFIG_BLOCK static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -- cgit v1.2.3 From ec438699a9ae0856c2ce20a50dd39cdc7e92a732 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:22 -0400 Subject: cgroup, block: implement task_get_css() and use it in bio_associate_current() bio_associate_current() currently open codes task_css() and css_tryget_online() to find and pin $current's blkcg css. Abstract it into task_get_css() which is implemented from cgroup side. As a task is always associated with an online css for every subsystem except while the css_set update is propagating, task_get_css() retries till css_tryget_online() succeeds. This is a cleanup and shouldn't lead to noticeable behavior changes. 
Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Jens Axboe Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/cgroup.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b9cb94c3102a..e7da0aa65b2d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -773,6 +773,31 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task, return task_css_check(task, subsys_id, false); } +/** + * task_get_css - find and get the css for (task, subsys) + * @task: the target task + * @subsys_id: the target subsystem ID + * + * Find the css for the (@task, @subsys_id) combination, increment a + * reference on and return it. This function is guaranteed to return a + * valid css. + */ +static inline struct cgroup_subsys_state * +task_get_css(struct task_struct *task, int subsys_id) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + while (true) { + css = task_css(task, subsys_id); + if (likely(css_tryget_online(css))) + break; + cpu_relax(); + } + rcu_read_unlock(); + return css; +} + /** * task_css_is_root - test whether a task belongs to the root css * @task: the target task -- cgit v1.2.3 From fd383c2d3cae146337cea809de0d622b8b887e6c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:23 -0400 Subject: blkcg: implement task_get_blkcg_css() Implement a wrapper around task_get_css() to acquire the blkcg css for a given task. The wrapper is necessary for cgroup writeback support as there will be places outside blkcg proper trying to acquire blkcg_css and blkio_cgrp_id will be undefined when !CONFIG_BLK_CGROUP. 
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 65f0c178fd04..4dc643f2046e 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -195,6 +195,12 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) return task_blkcg(current); } +static inline struct cgroup_subsys_state * +task_get_blkcg_css(struct task_struct *task) +{ + return task_get_css(task, blkio_cgrp_id); +} + /** * blkcg_parent - get the parent of a blkcg * @blkcg: blkcg of interest @@ -573,6 +579,12 @@ struct blkcg_policy { #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) +static inline struct cgroup_subsys_state * +task_get_blkcg_css(struct task_struct *task) +{ + return NULL; +} + #ifdef CONFIG_BLOCK static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -- cgit v1.2.3 From 1d933cf096e3aea15f1aec8297657b7a846fab63 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:24 -0400 Subject: blkcg: implement bio_associate_blkcg() Currently, a bio can only be associated with the io_context and blkcg of %current using bio_associate_current(). This is too restrictive for cgroup writeback support. Implement bio_associate_blkcg() which associates a bio with the specified blkcg. bio_associate_blkcg() leaves the io_context unassociated. bio_associate_current() is updated so that it considers a bio as already associated if it has a blkcg_css, instead of an io_context, associated with it. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/bio.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index f0291cf64cc5..5e963a6d7c14 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -482,9 +482,12 @@ extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); #ifdef CONFIG_BLK_CGROUP +int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); int bio_associate_current(struct bio *bio); void bio_disassociate_task(struct bio *bio); #else /* CONFIG_BLK_CGROUP */ +static inline int bio_associate_blkcg(struct bio *bio, + struct cgroup_subsys_state *blkcg_css) { return 0; } static inline int bio_associate_current(struct bio *bio) { return -ENOENT; } static inline void bio_disassociate_task(struct bio *bio) { } #endif /* CONFIG_BLK_CGROUP */ -- cgit v1.2.3 From ad7fa852d3d2816d68a138ebc5bc8967aeb7fd86 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 27 May 2015 20:00:02 -0400 Subject: memcg: implement mem_cgroup_css_from_page() Implement mem_cgroup_css_from_page() which returns the cgroup_subsys_state of the memcg associated with a given page on the default hierarchy. This will be used by cgroup writeback support. This function assumes that page->mem_cgroup association doesn't change until the page is released, which is true on the default hierarchy as long as replace_page_cache_page() is not used. As the only user of replace_page_cache_page() is FUSE which won't support cgroup writeback for the time being, this works for now, and replace_page_cache_page() will soon be updated so that the invariant actually holds. Note that the RCU protected page->mem_cgroup access is consistent with other usages across memcg but ultimately incorrect. These unlocked accesses are missing required barriers. 
page->mem_cgroup should be made an RCU pointer and updated and accessed using RCU operations. v4: Instead of triggering WARN, return the root css on the traditional hierarchies. This makes the function a lot easier to deal with especially as there's no light way to synchronize against hierarchy rebinding. v3: s/mem_cgroup_migrate()/mem_cgroup_css_from_page()/ v2: Trigger WARN if the function is used on the traditional hierarchies and add comment about the assumed invariant. Signed-off-by: Tejun Heo Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Jens Axboe --- include/linux/memcontrol.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 294498f4f6fc..637ef626008e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -115,6 +115,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, } extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg); +extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page); struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, struct mem_cgroup *, -- cgit v1.2.3 From 4452226ea276e74fc3e252c88d9bb7e8f8e44bf0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:26 -0400 Subject: writeback: move backing_dev_info->state into bdi_writeback Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback) and the role of the separation is unclear. For cgroup support for writeback IOs, a bdi will be updated to host multiple wb's where each wb serves writeback IOs of a different cgroup on the bdi. To achieve that, a wb should carry all states necessary for servicing writeback IOs for a cgroup independently. This patch moves bdi->state into wb. * enum bdi_state is renamed to wb_state and the prefix of all enums is changed from BDI_ to WB_. 
* Explicit zeroing of bdi->state is removed without adding zeroing of wb->state as the whole data structure is zeroed on init anyway. * As there's still only one bdi_writeback per backing_dev_info, all uses of bdi->state are mechanically replaced with bdi->wb.state introducing no behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Wu Fengguang Cc: drbd-dev@lists.linbit.com Cc: Neil Brown Cc: Alasdair Kergon Cc: Mike Snitzer Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index aff923ae8c4b..eb14f988a63e 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -25,13 +25,13 @@ struct device; struct dentry; /* - * Bits in backing_dev_info.state + * Bits in bdi_writeback.state */ -enum bdi_state { - BDI_async_congested, /* The async (write) queue is getting full */ - BDI_sync_congested, /* The sync queue is getting full */ - BDI_registered, /* bdi_register() was done */ - BDI_writeback_running, /* Writeback is in progress */ +enum wb_state { + WB_async_congested, /* The async (write) queue is getting full */ + WB_sync_congested, /* The sync queue is getting full */ + WB_registered, /* bdi_register() was done */ + WB_writeback_running, /* Writeback is in progress */ }; typedef int (congested_fn)(void *, int); @@ -49,6 +49,7 @@ enum bdi_stat_item { struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ + unsigned long state; /* Always use atomic bitops on this */ unsigned long last_old_flush; /* last old data flush */ struct delayed_work dwork; /* work item used for writeback */ @@ -62,7 +63,6 @@ struct bdi_writeback { struct backing_dev_info { struct list_head bdi_list; unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ - unsigned long state; /* Always use atomic bitops on this */ unsigned int 
capabilities; /* Device capabilities */ congested_fn *congested_fn; /* Function pointer if device is md/dm */ void *congested_data; /* Pointer to aux data for congested func */ @@ -250,23 +250,23 @@ static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) { if (bdi->congested_fn) return bdi->congested_fn(bdi->congested_data, bdi_bits); - return (bdi->state & bdi_bits); + return (bdi->wb.state & bdi_bits); } static inline int bdi_read_congested(struct backing_dev_info *bdi) { - return bdi_congested(bdi, 1 << BDI_sync_congested); + return bdi_congested(bdi, 1 << WB_sync_congested); } static inline int bdi_write_congested(struct backing_dev_info *bdi) { - return bdi_congested(bdi, 1 << BDI_async_congested); + return bdi_congested(bdi, 1 << WB_async_congested); } static inline int bdi_rw_congested(struct backing_dev_info *bdi) { - return bdi_congested(bdi, (1 << BDI_sync_congested) | - (1 << BDI_async_congested)); + return bdi_congested(bdi, (1 << WB_sync_congested) | + (1 << WB_async_congested)); } enum { -- cgit v1.2.3 From 93f78d882865cb90020d0f80a9523c99cf46924c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:27 -0400 Subject: writeback: move backing_dev_info->bdi_stat[] into bdi_writeback Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback) and the role of the separation is unclear. For cgroup support for writeback IOs, a bdi will be updated to host multiple wb's where each wb serves writeback IOs of a different cgroup on the bdi. To achieve that, a wb should carry all states necessary for servicing writeback IOs for a cgroup independently. This patch moves bdi->bdi_stat[] into wb. * enum bdi_stat_item is renamed to wb_stat_item and the prefix of all enums is changed from BDI_ to WB_. * BDI_STAT_BATCH() -> WB_STAT_BATCH() * [__]{add|inc|dec|sum}_wb_stat(bdi, ...) -> [__]{add|inc}_wb_stat(wb, ...) 
* bdi_stat[_error]() -> wb_stat[_error]() * bdi_writeout_inc() -> wb_writeout_inc() * stat init is moved to bdi_wb_init() and bdi_wb_exit() is added and frees stat. * As there's still only one bdi_writeback per backing_dev_info, all uses of bdi->stat[] are mechanically replaced with bdi->wb.stat[] introducing no behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Wu Fengguang Cc: Miklos Szeredi Cc: Trond Myklebust Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 68 +++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index eb14f988a63e..fe7a907a4e16 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -36,15 +36,15 @@ enum wb_state { typedef int (congested_fn)(void *, int); -enum bdi_stat_item { - BDI_RECLAIMABLE, - BDI_WRITEBACK, - BDI_DIRTIED, - BDI_WRITTEN, - NR_BDI_STAT_ITEMS +enum wb_stat_item { + WB_RECLAIMABLE, + WB_WRITEBACK, + WB_DIRTIED, + WB_WRITTEN, + NR_WB_STAT_ITEMS }; -#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) +#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ @@ -58,6 +58,8 @@ struct bdi_writeback { struct list_head b_more_io; /* parked for more writeback */ struct list_head b_dirty_time; /* time stamps are dirty */ spinlock_t list_lock; /* protects the b_* lists */ + + struct percpu_counter stat[NR_WB_STAT_ITEMS]; }; struct backing_dev_info { @@ -69,8 +71,6 @@ struct backing_dev_info { char *name; - struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; - unsigned long bw_time_stamp; /* last time write bw is updated */ unsigned long dirtied_stamp; unsigned long written_stamp; /* pages written at bw_time_stamp */ @@ -137,78 +137,74 @@ static inline int wb_has_dirty_io(struct bdi_writeback *wb) !list_empty(&wb->b_more_io); } -static inline void __add_bdi_stat(struct 
backing_dev_info *bdi, - enum bdi_stat_item item, s64 amount) +static inline void __add_wb_stat(struct bdi_writeback *wb, + enum wb_stat_item item, s64 amount) { - __percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH); + __percpu_counter_add(&wb->stat[item], amount, WB_STAT_BATCH); } -static inline void __inc_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline void __inc_wb_stat(struct bdi_writeback *wb, + enum wb_stat_item item) { - __add_bdi_stat(bdi, item, 1); + __add_wb_stat(wb, item, 1); } -static inline void inc_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { unsigned long flags; local_irq_save(flags); - __inc_bdi_stat(bdi, item); + __inc_wb_stat(wb, item); local_irq_restore(flags); } -static inline void __dec_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline void __dec_wb_stat(struct bdi_writeback *wb, + enum wb_stat_item item) { - __add_bdi_stat(bdi, item, -1); + __add_wb_stat(wb, item, -1); } -static inline void dec_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { unsigned long flags; local_irq_save(flags); - __dec_bdi_stat(bdi, item); + __dec_wb_stat(wb, item); local_irq_restore(flags); } -static inline s64 bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { - return percpu_counter_read_positive(&bdi->bdi_stat[item]); + return percpu_counter_read_positive(&wb->stat[item]); } -static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline s64 __wb_stat_sum(struct bdi_writeback *wb, + enum wb_stat_item item) { - return percpu_counter_sum_positive(&bdi->bdi_stat[item]); + return percpu_counter_sum_positive(&wb->stat[item]); } -static inline s64 
bdi_stat_sum(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item) { s64 sum; unsigned long flags; local_irq_save(flags); - sum = __bdi_stat_sum(bdi, item); + sum = __wb_stat_sum(wb, item); local_irq_restore(flags); return sum; } -extern void bdi_writeout_inc(struct backing_dev_info *bdi); +extern void wb_writeout_inc(struct bdi_writeback *wb); /* * maximal error of a stat counter. */ -static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi) +static inline unsigned long wb_stat_error(struct bdi_writeback *wb) { #ifdef CONFIG_SMP - return nr_cpu_ids * BDI_STAT_BATCH; + return nr_cpu_ids * WB_STAT_BATCH; #else return 1; #endif -- cgit v1.2.3 From a88a341a73be4ef035ca26170c849f002797da27 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:28 -0400 Subject: writeback: move bandwidth related fields from backing_dev_info into bdi_writeback Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback) and the role of the separation is unclear. For cgroup support for writeback IOs, a bdi will be updated to host multiple wb's where each wb serves writeback IOs of a different cgroup on the bdi. To achieve that, a wb should carry all states necessary for servicing writeback IOs for a cgroup independently. This patch moves bandwidth related fields from backing_dev_info into bdi_writeback. * The moved fields are: bw_time_stamp, dirtied_stamp, written_stamp, write_bandwidth, avg_write_bandwidth, dirty_ratelimit, balanced_dirty_ratelimit, completions and dirty_exceeded. * writeback_chunk_size() and over_bground_thresh() now take @wb instead of @bdi. * bdi_writeout_fraction(bdi, ...) -> wb_writeout_fraction(wb, ...) bdi_dirty_limit(bdi, ...) -> wb_dirty_limit(wb, ...) bdi_position_ratio(bdi, ...) -> wb_position_ratio(wb, ...) bdi_update_write_bandwidth(bdi, ...) -> wb_update_write_bandwidth(wb, ...) [__]bdi_update_bandwidth(bdi, ...)
-> [__]wb_update_bandwidth(wb, ...) bdi_{max|min}_pause(bdi, ...) -> wb_{max|min}_pause(wb, ...) bdi_dirty_limits(bdi, ...) -> wb_dirty_limits(wb, ...) * Init/exits of the relocated fields are moved to bdi_wb_init/exit() respectively. Note that explicit zeroing is dropped in the process as wb's are cleared in entirety anyway. * As there's still only one bdi_writeback per backing_dev_info, all uses of bdi->stat[] are mechanically replaced with bdi->wb.stat[] introducing no behavior changes. v2: Typo in description fixed as suggested by Jan. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Wu Fengguang Cc: Jaegeuk Kim Cc: Steven Whitehouse Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 20 ++++++++++---------- include/linux/writeback.h | 19 +++++++++---------- 2 files changed, 19 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index fe7a907a4e16..2ab06049d812 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -60,16 +60,6 @@ struct bdi_writeback { spinlock_t list_lock; /* protects the b_* lists */ struct percpu_counter stat[NR_WB_STAT_ITEMS]; -}; - -struct backing_dev_info { - struct list_head bdi_list; - unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ - unsigned int capabilities; /* Device capabilities */ - congested_fn *congested_fn; /* Function pointer if device is md/dm */ - void *congested_data; /* Pointer to aux data for congested func */ - - char *name; unsigned long bw_time_stamp; /* last time write bw is updated */ unsigned long dirtied_stamp; @@ -88,6 +78,16 @@ struct backing_dev_info { struct fprop_local_percpu completions; int dirty_exceeded; +}; + +struct backing_dev_info { + struct list_head bdi_list; + unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ + unsigned int capabilities; /* Device capabilities */ + congested_fn *congested_fn; /* Function pointer if device is md/dm */ 
+ void *congested_data; /* Pointer to aux data for congested func */ + + char *name; unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index b2dd371ec0ca..a6b9db7fcee8 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -155,16 +155,15 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); -unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, - unsigned long dirty); - -void __bdi_update_bandwidth(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long start_time); +unsigned long wb_dirty_limit(struct bdi_writeback *wb, unsigned long dirty); + +void __wb_update_bandwidth(struct bdi_writeback *wb, + unsigned long thresh, + unsigned long bg_thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, + unsigned long start_time); void page_writeback_init(void); void balance_dirty_pages_ratelimited(struct address_space *mapping); -- cgit v1.2.3 From f0054bb1e1f3be03cc33369df640db97f10f6172 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:30 -0400 Subject: writeback: move backing_dev_info->wb_lock and ->worklist into bdi_writeback Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback) and the role of the separation is unclear. For cgroup support for writeback IOs, a bdi will be updated to host multiple wb's where each wb serves writeback IOs of a different cgroup on the bdi. To achieve that, a wb should carry all states necessary for servicing writeback IOs for a cgroup independently. This patch moves bdi->wb_lock and ->worklist into wb. * The lock protects bdi->worklist and bdi->wb.dwork scheduling. 
While moving, rename it to wb->work_lock as wb->wb_lock is confusing. Also, move wb->dwork downwards so that it's colocated with the new ->work_lock and ->work_list fields. * bdi_writeback_workfn() -> wb_workfn() bdi_wakeup_thread_delayed(bdi) -> wb_wakeup_delayed(wb) bdi_wakeup_thread(bdi) -> wb_wakeup(wb) bdi_queue_work(bdi, ...) -> wb_queue_work(wb, ...) __bdi_start_writeback(bdi, ...) -> __wb_start_writeback(wb, ...) get_next_work_item(bdi) -> get_next_work_item(wb) * bdi_wb_shutdown() is renamed to wb_shutdown() and now takes @wb. The function contained parts which belong to the containing bdi rather than the wb itself - testing cap_writeback_dirty and bdi_remove_from_list() invocation. Those are moved to bdi_unregister(). * bdi_wb_{init|exit}() are renamed to wb_{init|exit}(). Initializations of the moved bdi->wb_lock and ->work_list are relocated from bdi_init() to wb_init(). * As there's still only one bdi_writeback per backing_dev_info, all uses of bdi->state are mechanically replaced with bdi->wb.state introducing no behavior changes. 
Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Wu Fengguang Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 2ab06049d812..d796f49ce87a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -52,7 +52,6 @@ struct bdi_writeback { unsigned long state; /* Always use atomic bitops on this */ unsigned long last_old_flush; /* last old data flush */ - struct delayed_work dwork; /* work item used for writeback */ struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ @@ -78,6 +77,10 @@ struct bdi_writeback { struct fprop_local_percpu completions; int dirty_exceeded; + + spinlock_t work_lock; /* protects work_list & dwork scheduling */ + struct list_head work_list; + struct delayed_work dwork; /* work item used for writeback */ }; struct backing_dev_info { @@ -93,9 +96,6 @@ struct backing_dev_info { unsigned int max_ratio, max_prop_frac; struct bdi_writeback wb; /* default writeback info for this bdi */ - spinlock_t wb_lock; /* protects work_list & wb.dwork scheduling */ - - struct list_head work_list; struct device *dev; @@ -121,9 +121,9 @@ int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); -void bdi_writeback_workfn(struct work_struct *work); +void wb_workfn(struct work_struct *work); int bdi_has_dirty_io(struct backing_dev_info *bdi); -void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); +void wb_wakeup_delayed(struct bdi_writeback *wb); extern spinlock_t bdi_lock; extern struct list_head bdi_list; -- cgit v1.2.3 From 66114cad64bf76a155fec1f0fff0de771cf909d5 Mon 
Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:32 -0400 Subject: writeback: separate out include/linux/backing-dev-defs.h With the planned cgroup writeback support, backing-dev related declarations will be more widely used across block and cgroup; unfortunately, including backing-dev.h from include/linux/blkdev.h makes cyclic include dependency quite likely. This patch separates out backing-dev-defs.h which only has the essential definitions and updates blkdev.h to include it. c files which need access to more backing-dev details now include backing-dev.h directly. This takes backing-dev.h off the common include dependency chain making it a lot easier to use it across block and cgroup. v2: fs/fat build failure fixed. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 106 +++++++++++++++++++++++++++++++++++++++ include/linux/backing-dev.h | 102 +------------------------------------ include/linux/blkdev.h | 2 +- 3 files changed, 108 insertions(+), 102 deletions(-) create mode 100644 include/linux/backing-dev-defs.h (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h new file mode 100644 index 000000000000..aa18c4bd43c1 --- /dev/null +++ b/include/linux/backing-dev-defs.h @@ -0,0 +1,106 @@ +#ifndef __LINUX_BACKING_DEV_DEFS_H +#define __LINUX_BACKING_DEV_DEFS_H + +#include +#include +#include +#include +#include +#include + +struct page; +struct device; +struct dentry; + +/* + * Bits in bdi_writeback.state + */ +enum wb_state { + WB_async_congested, /* The async (write) queue is getting full */ + WB_sync_congested, /* The sync queue is getting full */ + WB_registered, /* bdi_register() was done */ + WB_writeback_running, /* Writeback is in progress */ +}; + +typedef int (congested_fn)(void *, int); + +enum wb_stat_item { + WB_RECLAIMABLE, + WB_WRITEBACK, + WB_DIRTIED, + WB_WRITTEN, + NR_WB_STAT_ITEMS +}; + +#define 
WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) + +struct bdi_writeback { + struct backing_dev_info *bdi; /* our parent bdi */ + + unsigned long state; /* Always use atomic bitops on this */ + unsigned long last_old_flush; /* last old data flush */ + + struct list_head b_dirty; /* dirty inodes */ + struct list_head b_io; /* parked for writeback */ + struct list_head b_more_io; /* parked for more writeback */ + struct list_head b_dirty_time; /* time stamps are dirty */ + spinlock_t list_lock; /* protects the b_* lists */ + + struct percpu_counter stat[NR_WB_STAT_ITEMS]; + + unsigned long bw_time_stamp; /* last time write bw is updated */ + unsigned long dirtied_stamp; + unsigned long written_stamp; /* pages written at bw_time_stamp */ + unsigned long write_bandwidth; /* the estimated write bandwidth */ + unsigned long avg_write_bandwidth; /* further smoothed write bw */ + + /* + * The base dirty throttle rate, re-calculated on every 200ms. + * All the bdi tasks' dirty rate will be curbed under it. + * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit + * in small steps and is much more smooth/stable than the latter. 
+ */ + unsigned long dirty_ratelimit; + unsigned long balanced_dirty_ratelimit; + + struct fprop_local_percpu completions; + int dirty_exceeded; + + spinlock_t work_lock; /* protects work_list & dwork scheduling */ + struct list_head work_list; + struct delayed_work dwork; /* work item used for writeback */ +}; + +struct backing_dev_info { + struct list_head bdi_list; + unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ + unsigned int capabilities; /* Device capabilities */ + congested_fn *congested_fn; /* Function pointer if device is md/dm */ + void *congested_data; /* Pointer to aux data for congested func */ + + char *name; + + unsigned int min_ratio; + unsigned int max_ratio, max_prop_frac; + + struct bdi_writeback wb; /* default writeback info for this bdi */ + + struct device *dev; + + struct timer_list laptop_mode_wb_timer; + +#ifdef CONFIG_DEBUG_FS + struct dentry *debug_dir; + struct dentry *debug_stats; +#endif +}; + +enum { + BLK_RW_ASYNC = 0, + BLK_RW_SYNC = 1, +}; + +void clear_bdi_congested(struct backing_dev_info *bdi, int sync); +void set_bdi_congested(struct backing_dev_info *bdi, int sync); + +#endif /* __LINUX_BACKING_DEV_DEFS_H */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index d796f49ce87a..5e39f7a8efed 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -8,104 +8,11 @@ #ifndef _LINUX_BACKING_DEV_H #define _LINUX_BACKING_DEV_H -#include -#include -#include #include #include #include -#include #include -#include -#include -#include - -struct page; -struct device; -struct dentry; - -/* - * Bits in bdi_writeback.state - */ -enum wb_state { - WB_async_congested, /* The async (write) queue is getting full */ - WB_sync_congested, /* The sync queue is getting full */ - WB_registered, /* bdi_register() was done */ - WB_writeback_running, /* Writeback is in progress */ -}; - -typedef int (congested_fn)(void *, int); - -enum wb_stat_item { - WB_RECLAIMABLE, - WB_WRITEBACK, - 
WB_DIRTIED, - WB_WRITTEN, - NR_WB_STAT_ITEMS -}; - -#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) - -struct bdi_writeback { - struct backing_dev_info *bdi; /* our parent bdi */ - - unsigned long state; /* Always use atomic bitops on this */ - unsigned long last_old_flush; /* last old data flush */ - - struct list_head b_dirty; /* dirty inodes */ - struct list_head b_io; /* parked for writeback */ - struct list_head b_more_io; /* parked for more writeback */ - struct list_head b_dirty_time; /* time stamps are dirty */ - spinlock_t list_lock; /* protects the b_* lists */ - - struct percpu_counter stat[NR_WB_STAT_ITEMS]; - - unsigned long bw_time_stamp; /* last time write bw is updated */ - unsigned long dirtied_stamp; - unsigned long written_stamp; /* pages written at bw_time_stamp */ - unsigned long write_bandwidth; /* the estimated write bandwidth */ - unsigned long avg_write_bandwidth; /* further smoothed write bw */ - - /* - * The base dirty throttle rate, re-calculated on every 200ms. - * All the bdi tasks' dirty rate will be curbed under it. - * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit - * in small steps and is much more smooth/stable than the latter. 
- */ - unsigned long dirty_ratelimit; - unsigned long balanced_dirty_ratelimit; - - struct fprop_local_percpu completions; - int dirty_exceeded; - - spinlock_t work_lock; /* protects work_list & dwork scheduling */ - struct list_head work_list; - struct delayed_work dwork; /* work item used for writeback */ -}; - -struct backing_dev_info { - struct list_head bdi_list; - unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ - unsigned int capabilities; /* Device capabilities */ - congested_fn *congested_fn; /* Function pointer if device is md/dm */ - void *congested_data; /* Pointer to aux data for congested func */ - - char *name; - - unsigned int min_ratio; - unsigned int max_ratio, max_prop_frac; - - struct bdi_writeback wb; /* default writeback info for this bdi */ - - struct device *dev; - - struct timer_list laptop_mode_wb_timer; - -#ifdef CONFIG_DEBUG_FS - struct dentry *debug_dir; - struct dentry *debug_stats; -#endif -}; +#include struct backing_dev_info *inode_to_bdi(struct inode *inode); @@ -265,13 +172,6 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi) (1 << WB_async_congested)); } -enum { - BLK_RW_ASYNC = 0, - BLK_RW_SYNC = 1, -}; - -void clear_bdi_congested(struct backing_dev_info *bdi, int sync); -void set_bdi_congested(struct backing_dev_info *bdi, int sync); long congestion_wait(int sync, long timeout); long wait_iff_congested(struct zone *zone, int sync, long timeout); int pdflush_proc_obsolete(struct ctl_table *table, int write, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ccaa9aecd593..60d2726a6b62 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From a212b105b07d75b48b1a166378282e8a77fbf53d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:33 -0400 Subject: bdi: make inode_to_bdi() inline Now that bdi definitions are moved to backing-dev-defs.h, 
backing-dev.h can include blkdev.h and inline inode_to_bdi() without worrying about introducing a circular include dependency. The function gets called from hot paths and is fairly trivial. This patch makes inode_to_bdi() and sb_is_blkdev_sb(), which the function calls, inline. blockdev_superblock and noop_backing_dev_info are EXPORT_GPL'd to allow the inline functions to be used from modules. While at it, make sb_is_blkdev_sb() return bool instead of int. v2: Fixed typo in description as suggested by Jan. Signed-off-by: Tejun Heo Reviewed-by: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 18 ++++++++++++++++-- include/linux/fs.h | 8 +++++++- 2 files changed, 23 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5e39f7a8efed..785782034e86 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -11,11 +11,10 @@ #include #include #include +#include #include #include -struct backing_dev_info *inode_to_bdi(struct inode *inode); - int __must_check bdi_init(struct backing_dev_info *bdi); void bdi_destroy(struct backing_dev_info *bdi); @@ -149,6 +148,21 @@ extern struct backing_dev_info noop_backing_dev_info; int writeback_in_progress(struct backing_dev_info *bdi); +static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) +{ + struct super_block *sb; + + if (!inode) + return &noop_backing_dev_info; + + sb = inode->i_sb; +#ifdef CONFIG_BLOCK + if (sb_is_blkdev_sb(sb)) + return blk_get_backing_dev_info(I_BDEV(inode)); +#endif + return sb->s_bdi; +} + static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) { if (bdi->congested_fn) diff --git a/include/linux/fs.h b/include/linux/fs.h index 1ef63900243c..ce100b87fba3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2240,7 +2240,13 @@ extern struct super_block *freeze_bdev(struct block_device *); extern void emergency_thaw_all(void);
extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); -extern int sb_is_blkdev_sb(struct super_block *sb); + +extern struct super_block *blockdev_superblock; + +static inline bool sb_is_blkdev_sb(struct super_block *sb) +{ + return sb == blockdev_superblock; +} #else static inline void bd_forget(struct inode *inode) {} static inline int sync_blockdev(struct block_device *bdev) { return 0; } -- cgit v1.2.3 From 4aa9c692e052cf6db99db62a8fe0543e5c455da7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:35 -0400 Subject: bdi: separate out congested state into a separate struct Currently, a wb's (bdi_writeback) congestion state is carried in its ->state field; however, cgroup writeback support will require multiple wb's sharing the same congestion state. This patch separates out congestion state into its own struct - struct bdi_writeback_congested. A new field wb field, wb_congested, points to its associated congested struct. The default wb, bdi->wb, always points to bdi->wb_congested. While this patch adds a layer of indirection, it doesn't introduce any behavior changes. 
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 14 ++++++++++++-- include/linux/backing-dev.h | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index aa18c4bd43c1..9e9eafa5f5aa 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -16,12 +16,15 @@ struct dentry; * Bits in bdi_writeback.state */ enum wb_state { - WB_async_congested, /* The async (write) queue is getting full */ - WB_sync_congested, /* The sync queue is getting full */ WB_registered, /* bdi_register() was done */ WB_writeback_running, /* Writeback is in progress */ }; +enum wb_congested_state { + WB_async_congested, /* The async (write) queue is getting full */ + WB_sync_congested, /* The sync queue is getting full */ +}; + typedef int (congested_fn)(void *, int); enum wb_stat_item { @@ -34,6 +37,10 @@ enum wb_stat_item { #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) +struct bdi_writeback_congested { + unsigned long state; /* WB_[a]sync_congested flags */ +}; + struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ @@ -48,6 +55,8 @@ struct bdi_writeback { struct percpu_counter stat[NR_WB_STAT_ITEMS]; + struct bdi_writeback_congested *congested; + unsigned long bw_time_stamp; /* last time write bw is updated */ unsigned long dirtied_stamp; unsigned long written_stamp; /* pages written at bw_time_stamp */ @@ -84,6 +93,7 @@ struct backing_dev_info { unsigned int max_ratio, max_prop_frac; struct bdi_writeback wb; /* default writeback info for this bdi */ + struct bdi_writeback_congested wb_congested; struct device *dev; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 785782034e86..bfdaa18ba0a1 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -167,7 +167,7 @@ static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) { if 
(bdi->congested_fn) return bdi->congested_fn(bdi->congested_data, bdi_bits); - return (bdi->wb.state & bdi_bits); + return (bdi->wb.congested->state & bdi_bits); } static inline int bdi_read_congested(struct backing_dev_info *bdi) -- cgit v1.2.3 From 89e9b9e07a390c50980d10aa37a04631db5a23ab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:36 -0400 Subject: writeback: add {CONFIG|BDI_CAP|FS}_CGROUP_WRITEBACK cgroup writeback requires support from both bdi and filesystem sides. Add BDI_CAP_CGROUP_WRITEBACK and FS_CGROUP_WRITEBACK to indicate support and enable BDI_CAP_CGROUP_WRITEBACK on block based bdi's by default. Also, define CONFIG_CGROUP_WRITEBACK which is enabled if both MEMCG and BLK_CGROUP are enabled. inode_cgwb_enabled() which determines whether a given inode's both bdi and fs support cgroup writeback is added. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 32 +++++++++++++++++++++++++++++++- include/linux/fs.h | 1 + 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index bfdaa18ba0a1..6bb31234e6a9 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -134,12 +134,15 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_NO_WRITEBACK: Don't write pages back * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. + * + * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback. 
*/ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 #define BDI_CAP_NO_ACCT_WB 0x00000004 #define BDI_CAP_STABLE_WRITES 0x00000008 #define BDI_CAP_STRICTLIMIT 0x00000010 +#define BDI_CAP_CGROUP_WRITEBACK 0x00000020 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) @@ -229,4 +232,31 @@ static inline int bdi_sched_wait(void *word) return 0; } -#endif /* _LINUX_BACKING_DEV_H */ +#ifdef CONFIG_CGROUP_WRITEBACK + +/** + * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode + * @inode: inode of interest + * + * cgroup writeback requires support from both the bdi and filesystem. + * Test whether @inode has both. + */ +static inline bool inode_cgwb_enabled(struct inode *inode) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + + return bdi_cap_account_dirty(bdi) && + (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) && + (inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline bool inode_cgwb_enabled(struct inode *inode) +{ + return false; +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + +#endif /* _LINUX_BACKING_DEV_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index ce100b87fba3..74e0ae0626a8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1897,6 +1897,7 @@ struct file_system_type { #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */ +#define FS_CGROUP_WRITEBACK 32 /* Supports cgroup-aware writeback */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. 
*/ struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); -- cgit v1.2.3 From 52ebea749aaed195245701a8f90a23d672c7a933 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:37 -0400 Subject: writeback: make backing_dev_info host cgroup-specific bdi_writebacks For the planned cgroup writeback support, on each bdi (backing_dev_info), each memcg will be served by a separate wb (bdi_writeback). This patch updates bdi so that a bdi can host multiple wbs (bdi_writebacks). On the default hierarchy, blkcg implicitly enables memcg. This allows using memcg's page ownership for attributing writeback IOs, and every memcg - blkcg combination can be served by its own wb by assigning a dedicated wb to each memcg. This means that there may be multiple wb's of a bdi mapped to the same blkcg. As congested state is per blkcg - bdi combination, those wb's should share the same congested state. This is achieved by tracking congested state via bdi_writeback_congested structs which are keyed by blkcg. bdi->wb remains unchanged and will keep serving the root cgroup. cgwb's (cgroup wb's) for non-root cgroups are created on-demand or looked up while dirtying an inode according to the memcg of the page being dirtied or current task. Each cgwb is indexed on bdi->cgwb_tree by its memcg id. Once an inode is associated with its wb, it can be retrieved using inode_to_wb(). Currently, none of the filesystems has FS_CGROUP_WRITEBACK and all pages will keep being associated with bdi->wb. v3: inode_attach_wb() in account_page_dirtied() moved inside mapping_cap_account_dirty() block where it's known to be !NULL. Also, an unnecessary NULL check before kfree() removed. Both detected by the kbuild bot. v2: Updated so that wb association is per inode and wb is per memcg rather than blkcg. 
Signed-off-by: Tejun Heo Cc: kbuild test robot Cc: Dan Carpenter Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 59 +++++++++++- include/linux/backing-dev.h | 195 +++++++++++++++++++++++++++++++++++++++ include/linux/blk-cgroup.h | 4 + include/linux/fs.h | 4 + include/linux/memcontrol.h | 4 + 5 files changed, 263 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 9e9eafa5f5aa..a1e9c407a59a 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -2,8 +2,11 @@ #define __LINUX_BACKING_DEV_DEFS_H #include +#include +#include #include #include +#include #include #include #include @@ -37,10 +40,43 @@ enum wb_stat_item { #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) +/* + * For cgroup writeback, multiple wb's may map to the same blkcg. Those + * wb's can operate mostly independently but should share the congested + * state. To facilitate such sharing, the congested state is tracked using + * the following struct which is created on demand, indexed by blkcg ID on + * its bdi, and refcounted. + */ struct bdi_writeback_congested { unsigned long state; /* WB_[a]sync_congested flags */ + +#ifdef CONFIG_CGROUP_WRITEBACK + struct backing_dev_info *bdi; /* the associated bdi */ + atomic_t refcnt; /* nr of attached wb's and blkg */ + int blkcg_id; /* ID of the associated blkcg */ + struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */ +#endif }; +/* + * Each wb (bdi_writeback) can perform writeback operations, is measured + * and throttled, independently. Without cgroup writeback, each bdi + * (bdi_writeback) is served by its embedded bdi->wb. + * + * On the default hierarchy, blkcg implicitly enables memcg. 
This allows + * using memcg's page ownership for attributing writeback IOs, and every + * memcg - blkcg combination can be served by its own wb by assigning a + * dedicated wb to each memcg, which enables isolation across different + * cgroups and propagation of IO back pressure down from the IO layer upto + * the tasks which are generating the dirty pages to be written back. + * + * A cgroup wb is indexed on its bdi by the ID of the associated memcg, + * refcounted with the number of inodes attached to it, and pins the memcg + * and the corresponding blkcg. As the corresponding blkcg for a memcg may + * change as blkcg is disabled and enabled higher up in the hierarchy, a wb + * is tested for blkcg after lookup and removed from index on mismatch so + * that a new wb for the combination can be created. + */ struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ @@ -78,6 +114,19 @@ struct bdi_writeback { spinlock_t work_lock; /* protects work_list & dwork scheduling */ struct list_head work_list; struct delayed_work dwork; /* work item used for writeback */ + +#ifdef CONFIG_CGROUP_WRITEBACK + struct percpu_ref refcnt; /* used only for !root wb's */ + struct cgroup_subsys_state *memcg_css; /* the associated memcg */ + struct cgroup_subsys_state *blkcg_css; /* and blkcg */ + struct list_head memcg_node; /* anchored at memcg->cgwb_list */ + struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */ + + union { + struct work_struct release_work; + struct rcu_head rcu; + }; +#endif }; struct backing_dev_info { @@ -92,9 +141,13 @@ struct backing_dev_info { unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; - struct bdi_writeback wb; /* default writeback info for this bdi */ - struct bdi_writeback_congested wb_congested; - + struct bdi_writeback wb; /* the root writeback info for this bdi */ + struct bdi_writeback_congested wb_congested; /* its congested state */ +#ifdef CONFIG_CGROUP_WRITEBACK + struct radix_tree_root cgwb_tree; /* 
radix tree of active cgroup wbs */ + struct rb_root cgwb_congested_tree; /* their congested states */ + atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */ +#endif struct device *dev; struct timer_list laptop_mode_wb_timer; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 6bb31234e6a9..8ae59df2e3d1 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -13,6 +13,7 @@ #include #include #include +#include #include int __must_check bdi_init(struct backing_dev_info *bdi); @@ -234,6 +235,16 @@ static inline int bdi_sched_wait(void *word) #ifdef CONFIG_CGROUP_WRITEBACK +struct bdi_writeback_congested * +wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp); +void wb_congested_put(struct bdi_writeback_congested *congested); +struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css, + gfp_t gfp); +void __inode_attach_wb(struct inode *inode, struct page *page); +void wb_memcg_offline(struct mem_cgroup *memcg); +void wb_blkcg_offline(struct blkcg *blkcg); + /** * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode * @inode: inode of interest @@ -250,6 +261,135 @@ static inline bool inode_cgwb_enabled(struct inode *inode) (inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK); } +/** + * wb_tryget - try to increment a wb's refcount + * @wb: bdi_writeback to get + */ +static inline bool wb_tryget(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + return percpu_ref_tryget(&wb->refcnt); + return true; +} + +/** + * wb_get - increment a wb's refcount + * @wb: bdi_writeback to get + */ +static inline void wb_get(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + percpu_ref_get(&wb->refcnt); +} + +/** + * wb_put - decrement a wb's refcount + * @wb: bdi_writeback to put + */ +static inline void wb_put(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + percpu_ref_put(&wb->refcnt); +} + +/** + * wb_find_current 
- find wb for %current on a bdi + * @bdi: bdi of interest + * + * Find the wb of @bdi which matches both the memcg and blkcg of %current. + * Must be called under rcu_read_lock() which protects the returend wb. + * NULL if not found. + */ +static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) +{ + struct cgroup_subsys_state *memcg_css; + struct bdi_writeback *wb; + + memcg_css = task_css(current, memory_cgrp_id); + if (!memcg_css->parent) + return &bdi->wb; + + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + + /* + * %current's blkcg equals the effective blkcg of its memcg. No + * need to use the relatively expensive cgroup_get_e_css(). + */ + if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id))) + return wb; + return NULL; +} + +/** + * wb_get_create_current - get or create wb for %current on a bdi + * @bdi: bdi of interest + * @gfp: allocation mask + * + * Equivalent to wb_get_create() on %current's memcg. This function is + * called from a relatively hot path and optimizes the common cases using + * wb_find_current(). + */ +static inline struct bdi_writeback * +wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) +{ + struct bdi_writeback *wb; + + rcu_read_lock(); + wb = wb_find_current(bdi); + if (wb && unlikely(!wb_tryget(wb))) + wb = NULL; + rcu_read_unlock(); + + if (unlikely(!wb)) { + struct cgroup_subsys_state *memcg_css; + + memcg_css = task_get_css(current, memory_cgrp_id); + wb = wb_get_create(bdi, memcg_css, gfp); + css_put(memcg_css); + } + return wb; +} + +/** + * inode_attach_wb - associate an inode with its wb + * @inode: inode of interest + * @page: page being dirtied (may be NULL) + * + * If @inode doesn't have its wb, associate it with the wb matching the + * memcg of @page or, if @page is NULL, %current. May be called w/ or w/o + * @inode->i_lock. 
+ */ +static inline void inode_attach_wb(struct inode *inode, struct page *page) +{ + if (!inode->i_wb) + __inode_attach_wb(inode, page); +} + +/** + * inode_detach_wb - disassociate an inode from its wb + * @inode: inode of interest + * + * @inode is being freed. Detach from its wb. + */ +static inline void inode_detach_wb(struct inode *inode) +{ + if (inode->i_wb) { + wb_put(inode->i_wb); + inode->i_wb = NULL; + } +} + +/** + * inode_to_wb - determine the wb of an inode + * @inode: inode of interest + * + * Returns the wb @inode is currently associated with. + */ +static inline struct bdi_writeback *inode_to_wb(struct inode *inode) +{ + return inode->i_wb; +} + #else /* CONFIG_CGROUP_WRITEBACK */ static inline bool inode_cgwb_enabled(struct inode *inode) @@ -257,6 +397,61 @@ static inline bool inode_cgwb_enabled(struct inode *inode) return false; } +static inline struct bdi_writeback_congested * +wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) +{ + return bdi->wb.congested; +} + +static inline void wb_congested_put(struct bdi_writeback_congested *congested) +{ +} + +static inline bool wb_tryget(struct bdi_writeback *wb) +{ + return true; +} + +static inline void wb_get(struct bdi_writeback *wb) +{ +} + +static inline void wb_put(struct bdi_writeback *wb) +{ +} + +static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) +{ + return &bdi->wb; +} + +static inline struct bdi_writeback * +wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) +{ + return &bdi->wb; +} + +static inline void inode_attach_wb(struct inode *inode, struct page *page) +{ +} + +static inline void inode_detach_wb(struct inode *inode) +{ +} + +static inline struct bdi_writeback *inode_to_wb(struct inode *inode) +{ + return &inode_to_bdi(inode)->wb; +} + +static inline void wb_memcg_offline(struct mem_cgroup *memcg) +{ +} + +static inline void wb_blkcg_offline(struct blkcg *blkcg) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ #endif 
/* _LINUX_BACKING_DEV_H */ diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 4dc643f2046e..3033eb173eb4 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -53,6 +53,10 @@ struct blkcg { /* TODO: per-policy storage in blkcg */ unsigned int cfq_weight; /* belongs to cfq */ unsigned int cfq_leaf_weight; + +#ifdef CONFIG_CGROUP_WRITEBACK + struct list_head cgwb_list; +#endif }; struct blkg_stat { diff --git a/include/linux/fs.h b/include/linux/fs.h index 74e0ae0626a8..67a42ec95065 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -35,6 +35,7 @@ #include struct backing_dev_info; +struct bdi_writeback; struct export_operations; struct hd_geometry; struct iovec; @@ -635,6 +636,9 @@ struct inode { struct hlist_node i_hash; struct list_head i_wb_list; /* backing dev IO list */ +#ifdef CONFIG_CGROUP_WRITEBACK + struct bdi_writeback *i_wb; /* the associated cgroup wb */ +#endif struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; union { diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 637ef626008e..662a953ea8ad 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -388,6 +388,10 @@ enum { OVER_LIMIT, }; +#ifdef CONFIG_CGROUP_WRITEBACK +struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); +#endif + struct sock; #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) void sock_update_memcg(struct sock *sk); -- cgit v1.2.3 From ce7acfeaf0363c8b75810908448f61af04d38f91 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:38 -0400 Subject: writeback, blkcg: associate each blkcg_gq with the corresponding bdi_writeback_congested A blkg (blkcg_gq) can be congested and decongested independently from other blkgs on the same request_queue. Accordingly, for cgroup writeback support, the congestion status at bdi (backing_dev_info) should be split and updated separately from matching blkg's. 
This patch prepares by adding blkg->wb_congested and associating a blkg with its matching per-blkcg bdi_writeback_congested on creation. v2: Updated to associate bdi_writeback_congested instead of bdi_writeback. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 3033eb173eb4..07a32b813ed8 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -99,6 +99,12 @@ struct blkcg_gq { struct hlist_node blkcg_node; struct blkcg *blkcg; + /* + * Each blkg gets congested separately and the congestion state is + * propagated to the matching bdi_writeback_congested. + */ + struct bdi_writeback_congested *wb_congested; + /* all non-root blkcg_gq's are guaranteed to have access to parent */ struct blkcg_gq *parent; -- cgit v1.2.3 From ec8a6f2643923ee5b74d24fa8d134240379f436b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:41 -0400 Subject: writeback: make congestion functions per bdi_writeback Currently, all congestion functions take bdi (backing_dev_info) and always operate on the root wb (bdi->wb) and the congestion state from the block layer is propagated only for the root blkcg. This patch introduces {set|clear}_wb_congested() and wb_congested() which take a bdi_writeback_congested and bdi_writeback respectively. The bdi counterparts are now wrappers invoking the wb based functions on @bdi->wb. While converting clear_bdi_congested() to clear_wb_congested(), the local variable declaration order between @wqh and @bit is swapped for cosmetic reason. This patch just adds the new wb based functions. The following patches will apply them. v2: Updated for bdi_writeback_congested.
Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 14 +++++++++++-- include/linux/backing-dev.h | 45 +++++++++++++++++++++++----------------- 2 files changed, 38 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index a1e9c407a59a..eb386766b5f3 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -163,7 +163,17 @@ enum { BLK_RW_SYNC = 1, }; -void clear_bdi_congested(struct backing_dev_info *bdi, int sync); -void set_bdi_congested(struct backing_dev_info *bdi, int sync); +void clear_wb_congested(struct bdi_writeback_congested *congested, int sync); +void set_wb_congested(struct bdi_writeback_congested *congested, int sync); + +static inline void clear_bdi_congested(struct backing_dev_info *bdi, int sync) +{ + clear_wb_congested(bdi->wb.congested, sync); +} + +static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync) +{ + set_wb_congested(bdi->wb.congested, sync); +} #endif /* __LINUX_BACKING_DEV_DEFS_H */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 8ae59df2e3d1..2c498a2a8268 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -167,27 +167,13 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) return sb->s_bdi; } -static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) +static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) { - if (bdi->congested_fn) - return bdi->congested_fn(bdi->congested_data, bdi_bits); - return (bdi->wb.congested->state & bdi_bits); -} - -static inline int bdi_read_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, 1 << WB_sync_congested); -} - -static inline int bdi_write_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, 1 << WB_async_congested); -} + struct 
backing_dev_info *bdi = wb->bdi; -static inline int bdi_rw_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, (1 << WB_sync_congested) | - (1 << WB_async_congested)); + if (bdi->congested_fn) + return bdi->congested_fn(bdi->congested_data, cong_bits); + return wb->congested->state & cong_bits; } long congestion_wait(int sync, long timeout); @@ -454,4 +440,25 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) #endif /* CONFIG_CGROUP_WRITEBACK */ +static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits) +{ + return wb_congested(&bdi->wb, cong_bits); +} + +static inline int bdi_read_congested(struct backing_dev_info *bdi) +{ + return bdi_congested(bdi, 1 << WB_sync_congested); +} + +static inline int bdi_write_congested(struct backing_dev_info *bdi) +{ + return bdi_congested(bdi, 1 << WB_async_congested); +} + +static inline int bdi_rw_congested(struct backing_dev_info *bdi) +{ + return bdi_congested(bdi, (1 << WB_sync_congested) | + (1 << WB_async_congested)); +} + #endif /* _LINUX_BACKING_DEV_H */ -- cgit v1.2.3 From d40f75a06dd675808eed385d490ba9468200b23f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:42 -0400 Subject: writeback, blkcg: restructure blk_{set|clear}_queue_congested() blk_{set|clear}_queue_congested() take @q and set or clear, respectively, the congestion state of its bdi's root wb. Because bdi used to be able to handle congestion state only on the root wb, the callers of those functions tested whether the congestion is on the root blkcg and skipped if not. This is cumbersome and makes implementation of per cgroup bdi_writeback congestion state propagation difficult. This patch renames blk_{set|clear}_queue_congested() to blk_{set|clear}_congested(), and makes them take request_list instead of request_queue and test whether the specified request_list is the root one before updating bdi_writeback congestion state. 
This makes the tests in the callers unnecessary and simplifies them. As there are no external users of these functions, the definitions are moved from include/linux/blkdev.h to block/blk-core.c. This patch doesn't introduce any noticeable behavior difference. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 60d2726a6b62..ab4a27852f1b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -790,25 +790,6 @@ extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern void blk_queue_bio(struct request_queue *q, struct bio *bio); -/* - * A queue has just exitted congestion. Note this in the global counter of - * congested queues, and wake up anyone who was waiting for requests to be - * put back. - */ -static inline void blk_clear_queue_congested(struct request_queue *q, int sync) -{ - clear_bdi_congested(&q->backing_dev_info, sync); -} - -/* - * A queue has just entered congestion. Flag that in the queue's VM-visible - * state flags and increment the global gounter of congested queues. - */ -static inline void blk_set_queue_congested(struct request_queue *q, int sync) -{ - set_bdi_congested(&q->backing_dev_info, sync); -} - extern void blk_start_queue(struct request_queue *q); extern void blk_stop_queue(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); -- cgit v1.2.3 From 703c270887bb5106c4c46a00cc7477d30d5e04f5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:44 -0400 Subject: writeback: implement and use inode_congested() In several places, bdi_congested() and its wrappers are used to determine whether more IOs should be issued. With cgroup writeback support, this question can't be answered solely based on the bdi (backing_dev_info). 
It's dependent on whether the filesystem and bdi support cgroup writeback and the blkcg the inode is associated with. This patch implements inode_congested() and its wrappers which take @inode and determines the congestion state considering cgroup writeback. The new functions replace bdi_*congested() calls in places where the query is about specific inode and task. There are several filesystem users which also fit this criteria but they should be updated when each filesystem implements cgroup writeback support. v2: Now that a given inode is associated with only one wb, congestion state can be determined independent from the asking task. Drop @task. Spotted by Vivek. Also, converted to take @inode instead of @mapping and renamed to inode_congested(). Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 2c498a2a8268..6f0882105f95 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -230,6 +230,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, void __inode_attach_wb(struct inode *inode, struct page *page); void wb_memcg_offline(struct mem_cgroup *memcg); void wb_blkcg_offline(struct blkcg *blkcg); +int inode_congested(struct inode *inode, int cong_bits); /** * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode @@ -438,8 +439,29 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) { } +static inline int inode_congested(struct inode *inode, int cong_bits) +{ + return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); +} + #endif /* CONFIG_CGROUP_WRITEBACK */ +static inline int inode_read_congested(struct inode *inode) +{ + return inode_congested(inode, 1 << WB_sync_congested); +} + +static inline int inode_write_congested(struct inode *inode) +{ + return 
inode_congested(inode, 1 << WB_async_congested); +} + +static inline int inode_rw_congested(struct inode *inode) +{ + return inode_congested(inode, (1 << WB_sync_congested) | + (1 << WB_async_congested)); +} + static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits) { return wb_congested(&bdi->wb, cong_bits); -- cgit v1.2.3 From d6c10f1fc8626dc55946f4768ae322b4c57b07dd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:45 -0400 Subject: writeback: implement WB_has_dirty_io wb_state flag Currently, wb_has_dirty_io() determines whether a wb (bdi_writeback) has any dirty inode by testing all three IO lists on each invocation without actively keeping track. For cgroup writeback support, a single bdi will host multiple wb's each of which will host dirty inodes separately and we'll need to make bdi_has_dirty_io(), which currently only represents the root wb, aggregate has_dirty_io from all member wb's, which requires tracking transitions in has_dirty_io state on each wb. This patch introduces inode_wb_list_{move|del}_locked() to consolidate IO list operations leaving queue_io() the only other function which directly manipulates IO lists (via move_expired_inodes()). All three functions are updated to call wb_io_lists_[de]populated() which keep track of whether the wb has dirty inodes or not and record it using the new WB_has_dirty_io flag. inode_wb_list_move_locked()'s return value indicates whether the wb had no dirty inodes before. mark_inode_dirty() is restructured so that the return value of inode_wb_list_move_locked() can be used for deciding whether to wake up the wb. While at it, change {bdi|wb}_has_dirty_io()'s return values to bool. These functions were returning 0 and 1 before. Also, add a comment explaining the synchronization of wb_state flags. v2: Updated to accommodate b_dirty_time.
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 1 + include/linux/backing-dev.h | 8 +++----- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index eb386766b5f3..7a94b7850b7c 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -21,6 +21,7 @@ struct dentry; enum wb_state { WB_registered, /* bdi_register() was done */ WB_writeback_running, /* Writeback is in progress */ + WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ }; enum wb_congested_state { diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 6f0882105f95..3c8403c012ce 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -29,7 +29,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); void wb_workfn(struct work_struct *work); -int bdi_has_dirty_io(struct backing_dev_info *bdi); +bool bdi_has_dirty_io(struct backing_dev_info *bdi); void wb_wakeup_delayed(struct bdi_writeback *wb); extern spinlock_t bdi_lock; @@ -37,11 +37,9 @@ extern struct list_head bdi_list; extern struct workqueue_struct *bdi_wq; -static inline int wb_has_dirty_io(struct bdi_writeback *wb) +static inline bool wb_has_dirty_io(struct bdi_writeback *wb) { - return !list_empty(&wb->b_dirty) || - !list_empty(&wb->b_io) || - !list_empty(&wb->b_more_io); + return test_bit(WB_has_dirty_io, &wb->state); } static inline void __add_wb_stat(struct bdi_writeback *wb, -- cgit v1.2.3 From 766a9d6e60578f1ef6de71f89f022084f8bffc82 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:46 -0400 Subject: writeback: implement backing_dev_info->tot_write_bandwidth cgroup writeback support needs to keep track of the sum of avg_write_bandwidth of all wb's 
(bdi_writeback's) with dirty inodes to distribute write workload. This patch adds bdi->tot_write_bandwidth and updates inode_wb_list_move_locked(), inode_wb_list_del_locked() and wb_update_write_bandwidth() to adjust it as wb's gain and lose dirty inodes and its avg_write_bandwidth gets updated. As the update events are not synchronized with each other, bdi->tot_write_bandwidth is an atomic_long_t. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 7a94b7850b7c..d631a61f4023 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -142,6 +142,8 @@ struct backing_dev_info { unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; + atomic_long_t tot_write_bandwidth; /* sum of active avg_write_bw */ + struct bdi_writeback wb; /* the root writeback info for this bdi */ struct bdi_writeback_congested wb_congested; /* its congested state */ #ifdef CONFIG_CGROUP_WRITEBACK -- cgit v1.2.3 From 95a46c65e3c09edb9f17dabf2dc16670cd328739 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:47 -0400 Subject: writeback: make bdi_has_dirty_io() take multiple bdi_writeback's into account bdi_has_dirty_io() used to only reflect whether the root wb (bdi_writeback) has dirty inodes. For cgroup writeback support, it needs to take all active wb's into account. If any wb on the bdi has dirty inodes, bdi_has_dirty_io() should return true. To achieve that, as inode_wb_list_{move|del}_locked() now keep track of the dirty state transition of each wb, the number of dirty wbs can be counted in the bdi; however, bdi is already aggregating wb->avg_write_bandwidth which can easily be guaranteed to be > 0 when there are any dirty inodes by ensuring wb->avg_write_bandwidth can't dip below 1. 
bdi_has_dirty_io() can simply test whether bdi->tot_write_bandwidth is zero or not. While this bumps the value of wb->avg_write_bandwidth to one when it used to be zero, this shouldn't cause any meaningful behavior difference. bdi_has_dirty_io() is made an inline function which tests whether ->tot_write_bandwidth is non-zero. Also, WARN_ON_ONCE()'s on its value are added to inode_wb_list_{move|del}_locked(). Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 8 ++++++-- include/linux/backing-dev.h | 10 +++++++++- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index d631a61f4023..8c857d723023 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -98,7 +98,7 @@ struct bdi_writeback { unsigned long dirtied_stamp; unsigned long written_stamp; /* pages written at bw_time_stamp */ unsigned long write_bandwidth; /* the estimated write bandwidth */ - unsigned long avg_write_bandwidth; /* further smoothed write bw */ + unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */ /* * The base dirty throttle rate, re-calculated on every 200ms. @@ -142,7 +142,11 @@ struct backing_dev_info { unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; - atomic_long_t tot_write_bandwidth; /* sum of active avg_write_bw */ + /* + * Sum of avg_write_bw of wbs with dirty inodes. > 0 if there are + * any dirty wbs, which is depended upon by bdi_has_dirty(). 
+ */ + atomic_long_t tot_write_bandwidth; struct bdi_writeback wb; /* the root writeback info for this bdi */ struct bdi_writeback_congested wb_congested; /* its congested state */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 3c8403c012ce..0839e44105bd 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -29,7 +29,6 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); void wb_workfn(struct work_struct *work); -bool bdi_has_dirty_io(struct backing_dev_info *bdi); void wb_wakeup_delayed(struct bdi_writeback *wb); extern spinlock_t bdi_lock; @@ -42,6 +41,15 @@ static inline bool wb_has_dirty_io(struct bdi_writeback *wb) return test_bit(WB_has_dirty_io, &wb->state); } +static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi) +{ + /* + * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are + * any dirty wbs. See wb_update_write_bandwidth(). + */ + return atomic_long_read(&bdi->tot_write_bandwidth); +} + static inline void __add_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item, s64 amount) { -- cgit v1.2.3 From ebe41ab0c79d5633123f6faa3265a1a63c5f22d8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:50 -0400 Subject: writeback: implement bdi_for_each_wb() This will be used to implement bdi-wide operations which should be distributed across all its cgroup bdi_writebacks. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 63 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0839e44105bd..c7979806baee 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -383,6 +383,61 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return inode->i_wb; } +struct wb_iter { + int start_blkcg_id; + struct radix_tree_iter tree_iter; + void **slot; +}; + +static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter, + struct backing_dev_info *bdi) +{ + struct radix_tree_iter *titer = &iter->tree_iter; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (iter->start_blkcg_id >= 0) { + iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id); + iter->start_blkcg_id = -1; + } else { + iter->slot = radix_tree_next_slot(iter->slot, titer, 0); + } + + if (!iter->slot) + iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0); + if (iter->slot) + return *iter->slot; + return NULL; +} + +static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter, + struct backing_dev_info *bdi, + int start_blkcg_id) +{ + iter->start_blkcg_id = start_blkcg_id; + + if (start_blkcg_id) + return __wb_iter_next(iter, bdi); + else + return &bdi->wb; +} + +/** + * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order + * @wb_cur: cursor struct bdi_writeback pointer + * @bdi: bdi to walk wb's of + * @iter: pointer to struct wb_iter to be used as iteration buffer + * @start_blkcg_id: blkcg ID to start iteration from + * + * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending + * blkcg ID order starting from @start_blkcg_id. @iter is struct wb_iter + * to be used as temp storage during iteration. rcu_read_lock() must be + * held throughout iteration. 
+ */ +#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \ + for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id); \ + (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi)) + #else /* CONFIG_CGROUP_WRITEBACK */ static inline bool inode_cgwb_enabled(struct inode *inode) @@ -445,6 +500,14 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) { } +struct wb_iter { + int next_id; +}; + +#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \ + for ((iter)->next_id = (start_blkcg_id); \ + ({ (wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); ) + static inline int inode_congested(struct inode *inode, int cong_bits) { return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); -- cgit v1.2.3 From c00ddad39f512b1a81e25b7892217ce10efab0f1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:51 -0400 Subject: writeback: remove bdi_start_writeback() bdi_start_writeback() is a thin wrapper on top of __wb_start_writeback() which is used only by laptop_mode_timer_fn(). This patch removes bdi_start_writeback(), renames __wb_start_writeback() to wb_start_writeback() and makes laptop_mode_timer_fn() use it instead. This doesn't cause any functional difference and will ease making laptop_mode_timer_fn() cgroup writeback aware. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c7979806baee..0ff40c228bee 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -25,8 +25,8 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); void bdi_unregister(struct backing_dev_info *bdi); int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); -void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - enum wb_reason reason); +void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, + bool range_cyclic, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); void wb_workfn(struct work_struct *work); void wb_wakeup_delayed(struct bdi_writeback *wb); -- cgit v1.2.3 From bc05873dccd27d75d6acdf812c3edfb181f1ba17 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:53 -0400 Subject: writeback: make writeback_in_progress() take bdi_writeback instead of backing_dev_info writeback_in_progress() currently takes @bdi and returns whether writeback is in progress on its root wb (bdi_writeback). In preparation for cgroup writeback support, make it take wb instead. While at it, make it an inline function. This patch doesn't make any functional difference. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0ff40c228bee..f04956c900ec 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -156,7 +156,17 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); extern struct backing_dev_info noop_backing_dev_info; -int writeback_in_progress(struct backing_dev_info *bdi); +/** + * writeback_in_progress - determine whether there is writeback in progress + * @wb: bdi_writeback of interest + * + * Determine whether there is writeback waiting to be handled against a + * bdi_writeback. + */ +static inline bool writeback_in_progress(struct bdi_writeback *wb) +{ + return test_bit(WB_writeback_running, &wb->state); +} static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) { -- cgit v1.2.3 From 9ecf4866c018aeb304a7b49216c4d183665becb7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:54 -0400 Subject: writeback: make bdi_start_background_writeback() take bdi_writeback instead of backing_dev_info bdi_start_background_writeback() currently takes @bdi and kicks the root wb (bdi_writeback). In preparation for cgroup writeback support, make it take wb instead. This patch doesn't make any functional difference. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index f04956c900ec..9cc11e5b97ca 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -27,7 +27,7 @@ void bdi_unregister(struct backing_dev_info *bdi); int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, bool range_cyclic, enum wb_reason reason); -void bdi_start_background_writeback(struct backing_dev_info *bdi); +void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); void wb_wakeup_delayed(struct bdi_writeback *wb); -- cgit v1.2.3 From cc395d7f1f7b9c740ab6d367ef1f6eb248595dff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:58 -0400 Subject: writeback: implement bdi_wait_for_completion() The completion of a wb_writeback_work can be waited upon by setting its ->done to a struct completion and waiting on it; however, for cgroup writeback support, it's necessary to issue multiple work items to multiple bdi_writebacks and wait for the completion of all. This patch implements wb_completion which can wait for multiple work items and replaces the struct completion with it. It can be defined using DEFINE_WB_COMPLETION_ONSTACK(), used for multiple work items and waited for by wb_wait_for_completion(). Nobody currently issues multiple work items and this patch doesn't introduce any behavior changes. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 8c857d723023..97a92fa0cdb5 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -155,6 +155,8 @@ struct backing_dev_info { struct rb_root cgwb_congested_tree; /* their congested states */ atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */ #endif + wait_queue_head_t wb_waitq; + struct device *dev; struct timer_list laptop_mode_wb_timer; -- cgit v1.2.3 From f30a7d0cc8d9096d6728fadd0ab024e648010ec0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:14:00 -0400 Subject: writeback: restructure try_writeback_inodes_sb[_nr]() try_writeback_inodes_sb_nr() wraps writeback_inodes_sb_nr() so that it handles s_umount locking and skips if writeback is already in progress. The in progress test is performed on the root wb (bdi_writeback) which isn't sufficient for cgroup writeback support. The test must be done per-wb. To prepare for the change, this patch factors out __writeback_inodes_sb_nr() from writeback_inodes_sb_nr() and adds @skip_if_busy and moves the in progress test right before queueing the wb_writeback_work. try_writeback_inodes_sb_nr() now just grabs s_umount and invokes __writeback_inodes_sb_nr() with asserted @skip_if_busy. This way, later addition of multiple wb handling can skip only the wb's which already have writeback in progress. This swaps the order between in progress test and s_umount test which can flip the return value when writeback is in progress and s_umount is being held by someone else but this shouldn't cause any meaningful difference. It's a fringe condition and the return value is an unsynchronized hint anyway. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/writeback.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a6b9db7fcee8..23af355d5471 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -93,9 +93,9 @@ struct bdi_writeback; void writeback_inodes_sb(struct super_block *, enum wb_reason reason); void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); -int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); -int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, - enum wb_reason reason); +bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); +bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, + enum wb_reason reason); void sync_inodes_sb(struct super_block *); void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); -- cgit v1.2.3 From bafc0dba1e20d84578d7098d32caf63441e5743d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Jun 2015 08:37:23 -0600 Subject: buffer, writeback: make __block_write_full_page() honor cgroup writeback [__]block_write_full_page() is used to implement ->writepage in various filesystems. All writeback logic is now updated to handle cgroup writeback and the block cgroup to issue IOs for is encoded in writeback_control and can be retrieved from the inode; however, [__]block_write_full_page() currently ignores the blkcg indicated by inode and issues all bio's without explicit blkcg association. This patch adds submit_bh_blkcg() which associates the bio with the specified blkio cgroup before issuing and uses it in __block_write_full_page() so that the issued bio's are associated with inode_to_wb_blkcg_css(inode). v2: Updated for per-inode wb association. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Andrew Morton Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 9cc11e5b97ca..e9d7373f5f93 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -393,6 +393,12 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return inode->i_wb; } +static inline struct cgroup_subsys_state * +inode_to_wb_blkcg_css(struct inode *inode) +{ + return inode_to_wb(inode)->blkcg_css; +} + struct wb_iter { int start_blkcg_id; struct radix_tree_iter tree_iter; @@ -510,6 +516,12 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) { } +static inline struct cgroup_subsys_state * +inode_to_wb_blkcg_css(struct inode *inode) +{ + return blkcg_root_css; +} + struct wb_iter { int next_id; }; -- cgit v1.2.3 From 0d960a383ae7aa791b2833e122ba7519d264cf92 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:19 -0400 Subject: writeback: clean up wb_dirty_limit() The function name wb_dirty_limit(), its argument @dirty and the local variable @wb_dirty are mortally confusing given that the function calculates per-wb threshold value not dirty pages, especially given that @dirty and @wb_dirty are used elsewhere for dirty pages. Let's rename the function to wb_calc_thresh() and wb_dirty to wb_thresh. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/writeback.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 23af355d5471..0435c85d4cfa 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -155,7 +155,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); -unsigned long wb_dirty_limit(struct bdi_writeback *wb, unsigned long dirty); +unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); void __wb_update_bandwidth(struct bdi_writeback *wb, unsigned long thresh, -- cgit v1.2.3 From 8a73179956e649df0d4b3250db17734f272d8266 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:20 -0400 Subject: writeback: reorganize [__]wb_update_bandwidth() __wb_update_bandwidth() is called from two places - fs/fs-writeback.c::balance_dirty_pages() and mm/page-writeback.c::wb_writeback(). The latter updates only the write bandwidth while the former also deals with the dirty ratelimit. The two callsites are distinguished by whether @thresh parameter is zero or not, which is cryptic. In addition, the two files define their own different versions of wb_update_bandwidth() on top of __wb_update_bandwidth(), which is confusing to say the least. This patch cleans up [__]wb_update_bandwidth() in the following ways. * __wb_update_bandwidth() now takes explicit @update_ratelimit parameter to gate dirty ratelimit handling. * mm/page-writeback.c::wb_update_bandwidth() is flattened into its caller - balance_dirty_pages(). * fs/fs-writeback.c::wb_update_bandwidth() is moved to mm/page-writeback.c and __wb_update_bandwidth() is made static. * While at it, add a lockdep assertion to __wb_update_bandwidth(). 
Except for the lockdep addition, this is pure reorganization and doesn't introduce any behavioral changes. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/writeback.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 0435c85d4cfa..80adf3d88d9d 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -157,14 +157,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int, void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); -void __wb_update_bandwidth(struct bdi_writeback *wb, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long start_time); - +void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time); void page_writeback_init(void); void balance_dirty_pages_ratelimited(struct address_space *mapping); -- cgit v1.2.3 From 380c27ca33ebecc9da35aa90c8b3a9154f90aac2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:21 -0400 Subject: writeback: implement wb_domain Dirtyable memory is distributed to a wb (bdi_writeback) according to the relative bandwidth the wb is writing out in the whole system. This distribution is global - each wb is measured against all other wb's and gets the proportionately sized portion of the memory in the whole system. For cgroup writeback, the amount of dirtyable memory is scoped by memcg and thus each wb would need to be measured and controlled in its memcg. IOW, a wb will belong to two writeback domains - the global and memcg domains. Currently, what constitutes the global writeback domain are scattered across a number of global states. This patch starts collecting them into struct wb_domain. 
* fprop_global which serves as the basis for proportional bandwidth measurement and its period timer are moved into struct wb_domain. * global_wb_domain hosts the states for the global domain. * While at it, flatten wb_writeout_fraction() into its callers. This thin wrapper doesn't provide any actual benefits while getting in the way. This is pure reorganization and doesn't introduce any behavioral changes. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/writeback.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 80adf3d88d9d..3148db1296a2 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -7,6 +7,7 @@ #include #include #include +#include DECLARE_PER_CPU(int, dirty_throttle_leaks); @@ -86,6 +87,36 @@ struct writeback_control { unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ }; +/* + * A wb_domain represents a domain that wb's (bdi_writeback's) belong to + * and are measured against each other in. There always is one global + * domain, global_wb_domain, that every wb in the system is a member of. + * This allows measuring the relative bandwidth of each wb to distribute + * dirtyable memory accordingly. + */ +struct wb_domain { + /* + * Scale the writeback cache size proportional to the relative + * writeout speed. + * + * We do this by keeping a floating proportion between BDIs, based + * on page writeback completions [end_page_writeback()]. Those + * devices that write out pages fastest will get the larger share, + * while the slower will get a smaller share. + * + * We use page writeout completions because we are interested in + * getting rid of dirty pages. Having them written out is the + * primary goal. 
+ * + * We introduce a concept of time, a period over which we measure + * these events, because demand can/will vary over time. The length + * of this period itself is measured in page writeback completions. + */ + struct fprop_global completions; + struct timer_list period_timer; /* timer for aging of completions */ + unsigned long period_time; +}; + /* * fs/fs-writeback.c */ @@ -120,6 +151,7 @@ static inline void laptop_sync_completion(void) { } #endif void throttle_vm_writeout(gfp_t gfp_mask); bool zone_dirty_ok(struct zone *zone); +int wb_domain_init(struct wb_domain *dom, gfp_t gfp); extern unsigned long global_dirty_limit; -- cgit v1.2.3 From dcc25ae76eb7b8ff883eaaab57e30e8f2f085be3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:22 -0400 Subject: writeback: move global_dirty_limit into wb_domain This patch is a part of the series to define wb_domain which represents a domain that wb's (bdi_writeback's) belong to and are measured against each other in. This will enable IO backpressure propagation for cgroup writeback. global_dirty_limit exists to regulate the global dirty threshold which is a property of the wb_domain. This patch moves hard_dirty_limit, dirty_lock, and update_time into wb_domain. This is pure reorganization and doesn't introduce any behavioral changes. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/writeback.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 3148db1296a2..5fdd4e1805e6 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -95,6 +95,8 @@ struct writeback_control { * dirtyable memory accordingly. */ struct wb_domain { + spinlock_t lock; + /* * Scale the writeback cache size proportional to the relative * writeout speed. 
@@ -115,6 +117,19 @@ struct wb_domain { struct fprop_global completions; struct timer_list period_timer; /* timer for aging of completions */ unsigned long period_time; + + /* + * The dirtyable memory and dirty threshold could be suddenly + * knocked down by a large amount (eg. on the startup of KVM in a + * swapless system). This may throw the system into deep dirty + * exceeded state and throttle heavy/light dirtiers alike. To + * retain good responsiveness, maintain global_dirty_limit for + * tracking slowly down to the knocked down dirty threshold. + * + * Both fields are protected by ->lock. + */ + unsigned long dirty_limit_tstamp; + unsigned long dirty_limit; }; /* @@ -153,7 +168,7 @@ void throttle_vm_writeout(gfp_t gfp_mask); bool zone_dirty_ok(struct zone *zone); int wb_domain_init(struct wb_domain *dom, gfp_t gfp); -extern unsigned long global_dirty_limit; +extern struct wb_domain global_wb_domain; /* These are exported to sysctl. */ extern int dirty_background_ratio; -- cgit v1.2.3 From aa661bbe1e61ce80ca4ae98804f673ede94b0827 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:31 -0400 Subject: writeback: move over_bground_thresh() to mm/page-writeback.c and rename it to wb_over_bg_thresh(). The function is closely tied to the dirty throttling mechanism implemented in page-writeback.c. This relocation will allow future updates necessary for cgroup writeback support. While at it, add function comment. This is pure reorganization and doesn't introduce any behavioral changes. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/writeback.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 5fdd4e1805e6..b57c2786b5aa 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -207,6 +207,7 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time); void page_writeback_init(void); void balance_dirty_pages_ratelimited(struct address_space *mapping); +bool wb_over_bg_thresh(struct bdi_writeback *wb); typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, void *data); -- cgit v1.2.3 From 841710aa6e4acd066ab9fe8c8cb6f4e4e6709d83 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:33 -0400 Subject: writeback: implement memcg wb_domain Dirtyable memory is distributed to a wb (bdi_writeback) according to the relative bandwidth the wb is writing out in the whole system. This distribution is global - each wb is measured against all other wb's and gets the proportionately sized portion of the memory in the whole system. For cgroup writeback, the amount of dirtyable memory is scoped by memcg and thus each wb would need to be measured and controlled in its memcg. IOW, a wb will belong to two writeback domains - the global and memcg domains. The previous patches laid the groundwork to support the two wb_domains and this patch implements memcg wb_domain. memcg->cgwb_domain is initialized on css online and destroyed on css release, wb->memcg_completions is added, and __wb_writeout_inc() is updated to increment completions against both global and memcg wb_domains. The following patches will update balance_dirty_pages() and its subroutines to actually consider memcg wb_domain for throttling. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 1 + include/linux/memcontrol.h | 12 +++++++++++- include/linux/writeback.h | 3 +++ 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 97a92fa0cdb5..8d470b73824f 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -118,6 +118,7 @@ struct bdi_writeback { #ifdef CONFIG_CGROUP_WRITEBACK struct percpu_ref refcnt; /* used only for !root wb's */ + struct fprop_local_percpu memcg_completions; struct cgroup_subsys_state *memcg_css; /* the associated memcg */ struct cgroup_subsys_state *blkcg_css; /* and blkcg */ struct list_head memcg_node; /* anchored at memcg->cgwb_list */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 662a953ea8ad..e3177bed23ea 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -389,8 +389,18 @@ enum { }; #ifdef CONFIG_CGROUP_WRITEBACK + struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); -#endif +struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) +{ + return NULL; +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ struct sock; #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index b57c2786b5aa..04a3786c456f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -167,6 +167,9 @@ static inline void laptop_sync_completion(void) { } void throttle_vm_writeout(gfp_t gfp_mask); bool zone_dirty_ok(struct zone *zone); int wb_domain_init(struct wb_domain *dom, gfp_t gfp); +#ifdef CONFIG_CGROUP_WRITEBACK +void wb_domain_exit(struct wb_domain *dom); +#endif extern struct wb_domain 
global_wb_domain; -- cgit v1.2.3 From 2529bb3aadc40a93e642f5f3650f63379a964467 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:34 -0400 Subject: writeback: reset wb_domain->dirty_limit[_tstmp] when memcg domain size changes The amount of available memory to a memcg wb_domain can change as memcg configuration changes. A domain's ->dirty_limit exists to smooth out sudden drops in dirty threshold; however, when a domain's size actually drops significantly, it hinders the dirty throttling from adjusting to the new configuration leading to unexpected behaviors including unnecessary OOM kills. This patch resolves the issue by adding wb_domain_size_changed() which resets ->dirty_limit[_tstmp] and making memcg call it on configuration changes. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/writeback.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 04a3786c456f..3b73e97ecfc7 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -132,6 +132,26 @@ struct wb_domain { unsigned long dirty_limit; }; +/** + * wb_domain_size_changed - memory available to a wb_domain has changed + * @dom: wb_domain of interest + * + * This function should be called when the amount of memory available to + * @dom has changed. It resets @dom's dirty limit parameters to prevent + * the past values which don't match the current configuration from skewing + * dirty throttling. Without this, when memory size of a wb_domain is + * greatly reduced, the dirty throttling logic may allow too many pages to + * be dirtied leading to consecutive unnecessary OOMs and may get stuck in + * that situation. 
+ */ +static inline void wb_domain_size_changed(struct wb_domain *dom) +{ + spin_lock(&dom->lock); + dom->dirty_limit_tstamp = jiffies; + dom->dirty_limit = 0; + spin_unlock(&dom->lock); +} + /* * fs/fs-writeback.c */ -- cgit v1.2.3 From c2aa723a6093633ae4ec15b08a4db276643cab3e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:35 -0400 Subject: writeback: implement memcg writeback domain based throttling While cgroup writeback support now connects memcg and blkcg so that writeback IOs are properly attributed and controlled, the IO back pressure propagation mechanism implemented in balance_dirty_pages() and its subroutines wasn't aware of cgroup writeback. Processes belonging to a memcg may have access to only subset of total memory available in the system and not factoring this into dirty throttling rendered it completely ineffective for processes under memcg limits and memcg ended up building a separate ad-hoc degenerate mechanism directly into vmscan code to limit page dirtying. The previous patches updated balance_dirty_pages() and its subroutines so that they can deal with multiple wb_domain's (writeback domains) and defined per-memcg wb_domain. Processes belonging to a non-root memcg are bound to two wb_domains, global wb_domain and memcg wb_domain, and should be throttled according to IO pressures from both domains. This patch updates dirty throttling code so that it repeats similar calculations for the two domains - the differences between the two are few and minor - and applies the lower of the two sets of resulting constraints. wb_over_bg_thresh(), which controls when background writeback terminates, is also updated to consider both global and memcg wb_domains. It returns true if dirty is over bg_thresh for either domain. 
This makes the dirty throttling mechanism operational for memcg domains including writeback-bandwidth-proportional dirty page distribution inside them but the ad-hoc memcg throttling mechanism in vmscan is still in place. The next patch will rip it out. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/memcontrol.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e3177bed23ea..c3eb19e2bc1c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -392,6 +392,8 @@ enum { struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); +void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail, + unsigned long *pdirty, unsigned long *pwriteback); #else /* CONFIG_CGROUP_WRITEBACK */ @@ -400,6 +402,13 @@ static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) return NULL; } +static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb, + unsigned long *pavail, + unsigned long *pdirty, + unsigned long *pwriteback) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ struct sock; -- cgit v1.2.3 From 21c6321fbb3a3787af07f1bc031d713a707fb69c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 May 2015 14:50:49 -0400 Subject: writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb() Currently, majority of cgroup writeback support including all the above functions are implemented in include/linux/backing-dev.h and mm/backing-dev.c; however, the portion closely related to writeback logic implemented in include/linux/writeback.h and mm/page-writeback.c will expand to support foreign writeback detection and correction. 
This patch moves wb[_try]_get() and wb_put() to include/linux/backing-dev-defs.h so that they can be used from writeback.h and inode_{attach|detach}_wb() to writeback.h and page-writeback.c. This is pure reorganization and doesn't introduce any functional changes. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 50 ++++++++++++++++++++++++ include/linux/backing-dev.h | 82 ---------------------------------------- include/linux/writeback.h | 46 ++++++++++++++++++++++ 3 files changed, 96 insertions(+), 82 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 8d470b73824f..e047b496a0b9 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -186,4 +186,54 @@ static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync) set_wb_congested(bdi->wb.congested, sync); } +#ifdef CONFIG_CGROUP_WRITEBACK + +/** + * wb_tryget - try to increment a wb's refcount + * @wb: bdi_writeback to get + */ +static inline bool wb_tryget(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + return percpu_ref_tryget(&wb->refcnt); + return true; +} + +/** + * wb_get - increment a wb's refcount + * @wb: bdi_writeback to get + */ +static inline void wb_get(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + percpu_ref_get(&wb->refcnt); +} + +/** + * wb_put - decrement a wb's refcount + * @wb: bdi_writeback to put + */ +static inline void wb_put(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + percpu_ref_put(&wb->refcnt); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline bool wb_tryget(struct bdi_writeback *wb) +{ + return true; +} + +static inline void wb_get(struct bdi_writeback *wb) +{ +} + +static inline void wb_put(struct bdi_writeback *wb) +{ +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + #endif /* __LINUX_BACKING_DEV_DEFS_H */ diff --git 
a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e9d7373f5f93..5c978a924157 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -243,7 +243,6 @@ void wb_congested_put(struct bdi_writeback_congested *congested); struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp); -void __inode_attach_wb(struct inode *inode, struct page *page); void wb_memcg_offline(struct mem_cgroup *memcg); void wb_blkcg_offline(struct blkcg *blkcg); int inode_congested(struct inode *inode, int cong_bits); @@ -264,37 +263,6 @@ static inline bool inode_cgwb_enabled(struct inode *inode) (inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK); } -/** - * wb_tryget - try to increment a wb's refcount - * @wb: bdi_writeback to get - */ -static inline bool wb_tryget(struct bdi_writeback *wb) -{ - if (wb != &wb->bdi->wb) - return percpu_ref_tryget(&wb->refcnt); - return true; -} - -/** - * wb_get - increment a wb's refcount - * @wb: bdi_writeback to get - */ -static inline void wb_get(struct bdi_writeback *wb) -{ - if (wb != &wb->bdi->wb) - percpu_ref_get(&wb->refcnt); -} - -/** - * wb_put - decrement a wb's refcount - * @wb: bdi_writeback to put - */ -static inline void wb_put(struct bdi_writeback *wb) -{ - if (wb != &wb->bdi->wb) - percpu_ref_put(&wb->refcnt); -} - /** * wb_find_current - find wb for %current on a bdi * @bdi: bdi of interest @@ -353,35 +321,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return wb; } -/** - * inode_attach_wb - associate an inode with its wb - * @inode: inode of interest - * @page: page being dirtied (may be NULL) - * - * If @inode doesn't have its wb, associate it with the wb matching the - * memcg of @page or, if @page is NULL, %current. May be called w/ or w/o - * @inode->i_lock. 
- */ -static inline void inode_attach_wb(struct inode *inode, struct page *page) -{ - if (!inode->i_wb) - __inode_attach_wb(inode, page); -} - -/** - * inode_detach_wb - disassociate an inode from its wb - * @inode: inode of interest - * - * @inode is being freed. Detach from its wb. - */ -static inline void inode_detach_wb(struct inode *inode) -{ - if (inode->i_wb) { - wb_put(inode->i_wb); - inode->i_wb = NULL; - } -} - /** * inode_to_wb - determine the wb of an inode * @inode: inode of interest @@ -471,19 +410,6 @@ static inline void wb_congested_put(struct bdi_writeback_congested *congested) { } -static inline bool wb_tryget(struct bdi_writeback *wb) -{ - return true; -} - -static inline void wb_get(struct bdi_writeback *wb) -{ -} - -static inline void wb_put(struct bdi_writeback *wb) -{ -} - static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) { return &bdi->wb; @@ -495,14 +421,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return &bdi->wb; } -static inline void inode_attach_wb(struct inode *inode, struct page *page) -{ -} - -static inline void inode_detach_wb(struct inode *inode) -{ -} - static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { return &inode_to_bdi(inode)->wb; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 3b73e97ecfc7..6726b7e56beb 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -8,6 +8,7 @@ #include #include #include +#include DECLARE_PER_CPU(int, dirty_throttle_leaks); @@ -173,6 +174,51 @@ static inline void wait_on_inode(struct inode *inode) wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE); } +#ifdef CONFIG_CGROUP_WRITEBACK + +void __inode_attach_wb(struct inode *inode, struct page *page); + +/** + * inode_attach_wb - associate an inode with its wb + * @inode: inode of interest + * @page: page being dirtied (may be NULL) + * + * If @inode doesn't have its wb, associate it with the wb matching the + * memcg of @page 
or, if @page is NULL, %current. May be called w/ or w/o + * @inode->i_lock. + */ +static inline void inode_attach_wb(struct inode *inode, struct page *page) +{ + if (!inode->i_wb) + __inode_attach_wb(inode, page); +} + +/** + * inode_detach_wb - disassociate an inode from its wb + * @inode: inode of interest + * + * @inode is being freed. Detach from its wb. + */ +static inline void inode_detach_wb(struct inode *inode) +{ + if (inode->i_wb) { + wb_put(inode->i_wb); + inode->i_wb = NULL; + } +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline void inode_attach_wb(struct inode *inode, struct page *page) +{ +} + +static inline void inode_detach_wb(struct inode *inode) +{ +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + /* * mm/page-writeback.c */ -- cgit v1.2.3 From b16b1deb553adcd7b3b7ce3e6d6fd1b923f314da Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Jun 2015 08:39:48 -0600 Subject: writeback: make writeback_control track the inode being written back Currently, for cgroup writeback, the IO submission paths directly associate the bio's with the blkcg from inode_to_wb_blkcg_css(); however, it'd be necessary to keep more writeback context to implement foreign inode writeback detection. wbc (writeback_control) is the natural fit for the extra context - it persists throughout the writeback of each inode and is passed all the way down to IO submission paths. This patch adds wbc_attach_and_unlock_inode(), wbc_detach_inode(), and wbc_attach_fdatawrite_inode() which are used to associate wbc with the inode being written back. IO submission paths now use wbc_init_bio() instead of directly associating bio's with blkcg themselves. This leaves inode_to_wb_blkcg_css() w/o any user. The function is removed. wbc currently only tracks the associated wb (bdi_writeback). Future patches will add more for foreign inode detection. The association is established under i_lock which will be depended upon when migrating foreign inodes to other wb's. 
As currently, once established, inode to wb association never changes, going through wbc when initializing bio's doesn't cause any behavior changes. v2: submit_blk_blkcg() now checks whether the wbc is associated with a wb before dereferencing it. This can happen when pageout() is writing pages directly without going through the usual writeback path. As pageout() path is single-threaded, we don't want it to be blocked behind a slow cgroup and ultimately want it to delegate actual writing to the usual writeback path. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 12 -------- include/linux/writeback.h | 68 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5c978a924157..b1d2489a6536 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -332,12 +332,6 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return inode->i_wb; } -static inline struct cgroup_subsys_state * -inode_to_wb_blkcg_css(struct inode *inode) -{ - return inode_to_wb(inode)->blkcg_css; -} - struct wb_iter { int start_blkcg_id; struct radix_tree_iter tree_iter; @@ -434,12 +428,6 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) { } -static inline struct cgroup_subsys_state * -inode_to_wb_blkcg_css(struct inode *inode) -{ - return blkcg_root_css; -} - struct wb_iter { int next_id; }; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 6726b7e56beb..8f964e558af5 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -86,6 +86,9 @@ struct writeback_control { unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ +#ifdef CONFIG_CGROUP_WRITEBACK + struct 
bdi_writeback *wb; /* wb this writeback is issued under */ +#endif }; /* @@ -176,7 +179,14 @@ static inline void wait_on_inode(struct inode *inode) #ifdef CONFIG_CGROUP_WRITEBACK +#include +#include + void __inode_attach_wb(struct inode *inode, struct page *page); +void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) + __releases(&inode->i_lock); +void wbc_detach_inode(struct writeback_control *wbc); /** * inode_attach_wb - associate an inode with its wb @@ -207,6 +217,44 @@ static inline void inode_detach_wb(struct inode *inode) } } +/** + * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite + * @wbc: writeback_control of interest + * @inode: target inode + * + * This function is to be used by __filemap_fdatawrite_range(), which is an + * alternative entry point into writeback code, and first ensures @inode is + * associated with a bdi_writeback and attaches it to @wbc. + */ +static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, + struct inode *inode) +{ + spin_lock(&inode->i_lock); + inode_attach_wb(inode, NULL); + wbc_attach_and_unlock_inode(wbc, inode); +} + +/** + * wbc_init_bio - writeback specific initializtion of bio + * @wbc: writeback_control for the writeback in progress + * @bio: bio to be initialized + * + * @bio is a part of the writeback in progress controlled by @wbc. Perform + * writeback specific initialization. This is used to apply the cgroup + * writeback context. + */ +static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) +{ + /* + * pageout() path doesn't attach @wbc to the inode being written + * out. This is intentional as we don't want the function to block + * behind a slow cgroup. Ultimately, we want pageout() to kick off + * regular writeback instead of writing things out itself. 
+ */ + if (wbc->wb) + bio_associate_blkcg(bio, wbc->wb->blkcg_css); +} + #else /* CONFIG_CGROUP_WRITEBACK */ static inline void inode_attach_wb(struct inode *inode, struct page *page) @@ -217,6 +265,26 @@ static inline void inode_detach_wb(struct inode *inode) { } +static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) + __releases(&inode->i_lock) +{ + spin_unlock(&inode->i_lock); +} + +static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, + struct inode *inode) +{ +} + +static inline void wbc_detach_inode(struct writeback_control *wbc) +{ +} + +static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ /* -- cgit v1.2.3 From 2a81490811d0296d390c571bb64eaa93e5ed7def Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 May 2015 14:50:51 -0400 Subject: writeback: implement foreign cgroup inode detection As concurrent write sharing of an inode is expected to be very rare and memcg only tracks page ownership on a first-use basis, severely confining the usefulness of such sharing, cgroup writeback tracks ownership per-inode. While the support for concurrent write sharing of an inode is deemed unnecessary, an inode being written to by different cgroups at different points in time is a lot more common, and, more importantly, charging only by first-use can too readily lead to grossly incorrect behaviors (a single foreign page can lead to gigabytes of writeback to be incorrectly attributed). To resolve this issue, cgroup writeback detects the majority dirtier of an inode and will transfer the ownership to it. To avoid unnecessary oscillation, the detection mechanism keeps track of history and gives out the switch verdict only if the foreign usage pattern is stable over a certain amount of time and/or writeback attempts. The detection mechanism has fairly low space and computation overhead.
It adds 8 bytes to struct inode (one int and two u16's) and a minimal amount of calculation per IO. The detection mechanism converges to the correct answer usually in several seconds of IO time when there's a clear majority dirtier. Even when there isn't, it can reach an acceptable answer fairly quickly under most circumstances. Please see wbc_detach_inode() for more details. This patch only implements detection. Following patches will implement actual switching. v2: wbc_account_io() now checks whether the wbc is associated with a wb before dereferencing it. This can happen when pageout() is writing pages directly without going through the usual writeback path. As pageout() path is single-threaded, we don't want it to be blocked behind a slow cgroup and ultimately want it to delegate actual writing to the usual writeback path. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/fs.h | 5 +++++ include/linux/writeback.h | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 67a42ec95065..740126d7c44e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -638,6 +638,11 @@ struct inode { struct list_head i_wb_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ + + /* foreign inode detection, see wbc_detach_inode() */ + int i_wb_frn_winner; + u16 i_wb_frn_avg_time; + u16 i_wb_frn_history; #endif struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 8f964e558af5..b333c945e571 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -88,6 +88,15 @@ struct writeback_control { unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb; /* wb this writeback is issued under */ +
struct inode *inode; /* inode being written out */ + + /* foreign inode detection, see wbc_detach_inode() */ + int wb_id; /* current wb id */ + int wb_lcand_id; /* last foreign candidate wb id */ + int wb_tcand_id; /* this foreign candidate wb id */ + size_t wb_bytes; /* bytes written by current wb */ + size_t wb_lcand_bytes; /* bytes written by last candidate */ + size_t wb_tcand_bytes; /* bytes written by this candidate */ #endif }; @@ -187,6 +196,8 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) __releases(&inode->i_lock); void wbc_detach_inode(struct writeback_control *wbc); +void wbc_account_io(struct writeback_control *wbc, struct page *page, + size_t bytes); /** * inode_attach_wb - associate an inode with its wb @@ -285,6 +296,11 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { } +static inline void wbc_account_io(struct writeback_control *wbc, + struct page *page, size_t bytes) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ /* -- cgit v1.2.3 From 682aa8e1a6a1504a4caaa62e6c2c9daae3757210 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 May 2015 14:50:53 -0400 Subject: writeback: implement unlocked_inode_to_wb transaction and use it for stat updates The mechanism for detecting whether an inode should switch its wb (bdi_writeback) association is now in place. This patch builds the framework for the actual switching. This patch adds a new inode flag I_WB_SWITCH, which has two functions. First, the easy one, it ensures that there's only one switching in progress for a given inode. Second, it's used as a mechanism to synchronize wb stat updates. The two stats, WB_RECLAIMABLE and WB_WRITEBACK, aren't event counters but track the current number of dirty pages and pages under writeback respectively.
As such, when an inode is moved from one wb to another, the inode's portion of those stats has to be transferred together; unfortunately, this is a bit tricky as those stat updates are percpu operations which are performed without holding any lock in some places. This patch solves the problem in a way similar to memcg. Each such lockless stat update is wrapped in a transaction surrounded by unlocked_inode_to_wb_begin/end(). During normal operation, they map to rcu_read_lock/unlock(); however, if I_WB_SWITCH is asserted, mapping->tree_lock is grabbed across the transaction. In turn, the switching path sets I_WB_SWITCH and waits for an RCU grace period to pass before actually starting to switch, which guarantees that all stat update paths are synchronizing against mapping->tree_lock. This patch still doesn't implement the actual switching. v3: Updated on top of the recent cancel_dirty_page() updates. unlocked_inode_to_wb_begin() now nests inside mem_cgroup_begin_page_stat() to match the locking order. v2: The i_wb access transaction will be used for !stat accesses too. Function names and comments updated accordingly.
s/inode_wb_stat_unlocked_{begin|end}/unlocked_inode_to_wb_{begin|end}/ s/switch_wb/switch_wbs/ Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 54 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 6 +++++ include/linux/mm.h | 3 ++- 3 files changed, 62 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index b1d2489a6536..73ffa32e58ee 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -332,6 +332,50 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return inode->i_wb; } +/** + * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction + * @inode: target inode + * @lockedp: temp bool output param, to be passed to the end function + * + * The caller wants to access the wb associated with @inode but isn't + * holding inode->i_lock, mapping->tree_lock or wb->list_lock. This + * function determines the wb associated with @inode and ensures that the + * association doesn't change until the transaction is finished with + * unlocked_inode_to_wb_end(). + * + * The caller must call unlocked_inode_to_wb_end() with *@lockdep + * afterwards and can't sleep during transaction. IRQ may or may not be + * disabled on return. + */ +static inline struct bdi_writeback * +unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) +{ + rcu_read_lock(); + + /* + * Paired with store_release in inode_switch_wb_work_fn() and + * ensures that we see the new wb if we see cleared I_WB_SWITCH. 
+ */ + *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; + + if (unlikely(*lockedp)) + spin_lock_irq(&inode->i_mapping->tree_lock); + return inode_to_wb(inode); +} + +/** + * unlocked_inode_to_wb_end - end inode wb access transaction + * @inode: target inode + * @locked: *@lockedp from unlocked_inode_to_wb_begin() + */ +static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) +{ + if (unlikely(locked)) + spin_unlock_irq(&inode->i_mapping->tree_lock); + + rcu_read_unlock(); +} + struct wb_iter { int start_blkcg_id; struct radix_tree_iter tree_iter; @@ -420,6 +464,16 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return &inode_to_bdi(inode)->wb; } +static inline struct bdi_writeback * +unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) +{ + return inode_to_wb(inode); +} + +static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) +{ +} + static inline void wb_memcg_offline(struct mem_cgroup *memcg) { } diff --git a/include/linux/fs.h b/include/linux/fs.h index 740126d7c44e..b5e1dcfbc5e3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1815,6 +1815,11 @@ struct super_operations { * * I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit(). * + * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to + * synchronize competing switching instances and to tell + * wb stat updates to grab mapping->tree_lock. See + * inode_switch_wb_work_fn() for details. + * * Q: What is the difference between I_WILL_FREE and I_FREEING? 
*/ #define I_DIRTY_SYNC (1 << 0) @@ -1834,6 +1839,7 @@ struct super_operations { #define I_DIRTY_TIME (1 << 11) #define __I_DIRTY_TIME_EXPIRED 12 #define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED) +#define I_WB_SWITCH (1 << 13) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) diff --git a/include/linux/mm.h b/include/linux/mm.h index f48d979ced4b..4024543b4203 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -27,6 +27,7 @@ struct anon_vma_chain; struct file_ra_state; struct user_struct; struct writeback_control; +struct bdi_writeback; #ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; @@ -1214,7 +1215,7 @@ int redirty_page_for_writepage(struct writeback_control *wbc, void account_page_dirtied(struct page *page, struct address_space *mapping, struct mem_cgroup *memcg); void account_page_cleaned(struct page *page, struct address_space *mapping, - struct mem_cgroup *memcg); + struct mem_cgroup *memcg, struct bdi_writeback *wb); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); void cancel_dirty_page(struct page *page); -- cgit v1.2.3 From aaa2cacf8184e2a92accb8e443b1608d65f9a13f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 May 2015 14:50:55 -0400 Subject: writeback: add lockdep annotation to inode_to_wb() With the previous three patches, all operations which acquire wb from inode are either under one of inode->i_lock, mapping->tree_lock or wb->list_lock or protected by unlocked_inode_to_wb transaction. This will be depended upon by foreign inode wb switching. This patch adds lockdep assertion to inode_to_wb() so that usages outside the above list locks can be caught easily. There are three exceptions. * locked_inode_to_wb_and_lock_list() is holding wb->list_lock but the wb may not be the inode's. Ensuring that is the function's role after all. Updated to deref inode->i_wb directly. 
* inode_wb_stat_unlocked_begin() is usually protected by combination of !I_WB_SWITCH and rcu_read_lock(). Updated to deref inode->i_wb directly. * inode_congested() wants to test whether inode->i_wb is set before starting the transaction. Added inode_to_wb_is_valid() which tests inode->i_wb directly. v5: might_lock() removed. It annotates that the lock is grabbed w/ irq enabled which isn't the case and triggering lockdep warning spuriously. v4: might_lock() added to unlocked_inode_to_wb_begin(). v3: inode_congested() conversion added. v2: locked_inode_to_wb_and_lock_list() was missing in the first version. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 73ffa32e58ee..dfce80869145 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -321,14 +321,34 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return wb; } +/** + * inode_to_wb_is_valid - test whether an inode has a wb associated + * @inode: inode of interest + * + * Returns %true if @inode has a wb associated. May be called without any + * locking. + */ +static inline bool inode_to_wb_is_valid(struct inode *inode) +{ + return inode->i_wb; +} + /** * inode_to_wb - determine the wb of an inode * @inode: inode of interest * - * Returns the wb @inode is currently associated with. + * Returns the wb @inode is currently associated with. The caller must be + * holding either @inode->i_lock, @inode->i_mapping->tree_lock, or the + * associated wb's list_lock. 
*/ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { +#ifdef CONFIG_LOCKDEP + WARN_ON_ONCE(debug_locks && + (!lockdep_is_held(&inode->i_lock) && + !lockdep_is_held(&inode->i_mapping->tree_lock) && + !lockdep_is_held(&inode->i_wb->list_lock))); +#endif return inode->i_wb; } @@ -360,7 +380,12 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) if (unlikely(*lockedp)) spin_lock_irq(&inode->i_mapping->tree_lock); - return inode_to_wb(inode); + + /* + * Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock. + * inode_to_wb() will bark. Deref directly. + */ + return inode->i_wb; } /** @@ -459,6 +484,11 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return &bdi->wb; } +static inline bool inode_to_wb_is_valid(struct inode *inode) +{ + return true; +} + static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { return &inode_to_bdi(inode)->wb; -- cgit v1.2.3 From e8a7abf5a5bd302a1e06a3c21a629eaa4cba57d6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 May 2015 14:50:57 -0400 Subject: writeback: disassociate inodes from dying bdi_writebacks For the purpose of foreign inode detection, wb's (bdi_writeback's) are identified by the associated memcg ID. As we create a separate wb for each memcg, this is enough to identify the active wb's; however, when blkcg is enabled or disabled higher up in the hierarchy, the mapping between memcg and blkcg changes which in turn creates a new wb to service the new mapping. The old wb is unlinked from index and released after all references are drained. The foreign inode detection logic can't detect this condition because both the old and new wb's point to the same memcg and thus never decides to move inodes attached to the old wb to the new one. This patch adds logic to initiate switching immediately in wbc_attach_and_unlock_inode() if the associated wb is dying. 
We can make the usual foreign detection logic to distinguish the different wb's mapped to the memcg but the dying wb is never gonna be in active service again and there's no point in tracking the usage history and reaching the switch verdict after enough data points are collected. It's already known that the wb has to be switched. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index e047b496a0b9..a48d90e3bcbb 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -219,6 +219,17 @@ static inline void wb_put(struct bdi_writeback *wb) percpu_ref_put(&wb->refcnt); } +/** + * wb_dying - is a wb dying? + * @wb: bdi_writeback of interest + * + * Returns whether @wb is unlinked and being drained. + */ +static inline bool wb_dying(struct bdi_writeback *wb) +{ + return percpu_ref_is_dying(&wb->refcnt); +} + #else /* CONFIG_CGROUP_WRITEBACK */ static inline bool wb_tryget(struct bdi_writeback *wb) @@ -234,6 +245,11 @@ static inline void wb_put(struct bdi_writeback *wb) { } +static inline bool wb_dying(struct bdi_writeback *wb) +{ + return false; +} + #endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* __LINUX_BACKING_DEV_DEFS_H */ -- cgit v1.2.3 From be3ef76e9d9b97962c70bd6351787d29071ae481 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 2 Jun 2015 14:30:11 +0200 Subject: clockevents: Rename state to state_use_accessors The only sensible way to make abuse of core internal fields obvious and easy to grep for. 
Signed-off-by: Thomas Gleixner Cc: Viresh Kumar Cc: Peter Zijlstra --- include/linux/clockchips.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 64214ad85af9..597a1e836f22 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -87,7 +87,7 @@ enum clock_event_state { * @mult: nanosecond to cycles multiplier * @shift: nanoseconds to cycles divisor (power of two) * @mode: operating mode, relevant only to ->set_mode(), OBSOLETE - * @state: current state of the device, assigned by the core code + * @state_use_accessors:current state of the device, assigned by the core code * @features: features * @retries: number of forced programming retries * @set_mode: legacy set mode function, only for modes <= CLOCK_EVT_MODE_RESUME. @@ -117,7 +117,7 @@ struct clock_event_device { u32 mult; u32 shift; enum clock_event_mode mode; - enum clock_event_state state; + enum clock_event_state state_use_accessors; unsigned int features; unsigned long retries; @@ -152,27 +152,27 @@ struct clock_event_device { /* Helpers to verify state of a clockevent device */ static inline bool clockevent_state_detached(struct clock_event_device *dev) { - return dev->state == CLOCK_EVT_STATE_DETACHED; + return dev->state_use_accessors == CLOCK_EVT_STATE_DETACHED; } static inline bool clockevent_state_shutdown(struct clock_event_device *dev) { - return dev->state == CLOCK_EVT_STATE_SHUTDOWN; + return dev->state_use_accessors == CLOCK_EVT_STATE_SHUTDOWN; } static inline bool clockevent_state_periodic(struct clock_event_device *dev) { - return dev->state == CLOCK_EVT_STATE_PERIODIC; + return dev->state_use_accessors == CLOCK_EVT_STATE_PERIODIC; } static inline bool clockevent_state_oneshot(struct clock_event_device *dev) { - return dev->state == CLOCK_EVT_STATE_ONESHOT; + return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT; } static inline bool 
clockevent_state_oneshot_stopped(struct clock_event_device *dev) { - return dev->state == CLOCK_EVT_STATE_ONESHOT_STOPPED; + return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT_STOPPED; } /* -- cgit v1.2.3 From 24bbd929e6b9e62afd263c42b4318d3b603c956c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 1 Jun 2015 13:40:31 +0200 Subject: of/fdt: split off FDT self reservation from memreserve processing This splits off the reservation of the memory occupied by the FDT binary itself from the processing of the memory reservations it contains. This is necessary because the physical address of the FDT, which is needed to perform the reservation, may not be known to the FDT driver core, i.e., it may be mapped outside the linear direct mapping, in which case __pa() returns a bogus value. Cc: Russell King Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Acked-by: Rob Herring Acked-by: Mark Rutland Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- include/linux/of_fdt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 587ee507965d..fd627a58068f 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -64,6 +64,7 @@ extern int early_init_dt_scan_chosen(unsigned long node, const char *uname, extern int early_init_dt_scan_memory(unsigned long node, const char *uname, int depth, void *data); extern void early_init_fdt_scan_reserved_mem(void); +extern void early_init_fdt_reserve_self(void); extern void early_init_dt_add_memory_arch(u64 base, u64 size); extern int early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size, bool no_map); @@ -91,6 +92,7 @@ extern u64 fdt_translate_address(const void *blob, int node_offset); extern void of_fdt_limit_memory(int limit); #else /* CONFIG_OF_FLATTREE */ static inline void early_init_fdt_scan_reserved_mem(void) {} +static inline void early_init_fdt_reserve_self(void) {} static inline 
const char *of_flat_dt_get_machine_name(void) { return NULL; } static inline void unflatten_device_tree(void) {} static inline void unflatten_and_copy_device_tree(void) {} -- cgit v1.2.3 From cfaed10d1f27d036b72bbdc6b1e59ea28c38ec7f Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 1 Jun 2015 11:15:25 -0500 Subject: scatterlist: introduce sg_nents_for_len When performing a dma_map_sg() call, the number of sg entries to map is required. Using sg_nents to retrieve the number of sg entries will return the total number of entries in the sg list up to the entry marked as the end. If there happen to be unused entries in the list, these will still be counted. Some dma_map_sg() implementations will not handle the unused entries correctly (lib/swiotlb.c) and execute a BUG_ON. The sg_nents_for_len() function will traverse the sg list and return the number of entries required to satisfy the supplied length argument. This can then be supplied to the dma_map_sg() call to successfully map the sg. 
Signed-off-by: Tom Lendacky Signed-off-by: Herbert Xu --- include/linux/scatterlist.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index ed8f9e70df9b..a0edb992c9c3 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -221,6 +221,7 @@ static inline void *sg_virt(struct scatterlist *sg) } int sg_nents(struct scatterlist *sg); +int sg_nents_for_len(struct scatterlist *sg, u64 len); struct scatterlist *sg_next(struct scatterlist *); struct scatterlist *sg_last(struct scatterlist *s, unsigned int); void sg_init_table(struct scatterlist *, unsigned int); -- cgit v1.2.3 From d6472302f242559d45dcf4ebace62508dc4d8aeb Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 2 Jun 2015 19:01:38 +1000 Subject: x86/mm: Decouple <linux/vmalloc.h> from <asm/io.h> Nothing in <asm/io.h> uses anything from <linux/vmalloc.h>, so remove it from there and fix up the resulting build problems triggered on x86 {64|32}-bit {def|allmod|allno}configs. The breakages were triggering in places where x86 builds relied on vmalloc() facilities but did not include <linux/vmalloc.h> explicitly and relied on the implicit inclusion via <asm/io.h>. Also add: - <linux/init.h> to <linux/io.h> - <asm/pgtable_types.h> to <asm/io.h> ... which were two other implicit header file dependencies. Suggested-by: David Miller Signed-off-by: Stephen Rothwell [ Tidied up the changelog. ] Acked-by: David Miller Acked-by: Takashi Iwai Acked-by: Viresh Kumar Acked-by: Vinod Koul Cc: Andrew Morton Cc: Anton Vorontsov Cc: Boris Ostrovsky Cc: Colin Cross Cc: David Vrabel Cc: H. Peter Anvin Cc: Haiyang Zhang Cc: James E.J. Bottomley Cc: Jaroslav Kysela Cc: K. Y. Srinivasan Cc: Kees Cook Cc: Konrad Rzeszutek Wilk Cc: Kristen Carlson Accardi Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J. 
Wysocki Cc: Suma Ramars Cc: Thomas Gleixner Cc: Tony Luck Signed-off-by: Ingo Molnar --- include/linux/io.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/io.h b/include/linux/io.h index 04cce4da3685..fb5a99800e77 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -19,6 +19,7 @@ #define _LINUX_IO_H #include <linux/types.h> +#include <linux/init.h> #include <asm/io.h> #include <asm/page.h> -- cgit v1.2.3 From da7049f834c3582c1ed1a04889bda5b4121973c0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 26 May 2015 13:49:07 -0400 Subject: svcrdma: Remove svc_rdma_xdr_decode_deferred_req() svc_rdma_xdr_decode_deferred_req() indexes an array with an un-byte-swapped value off the wire. Fortunately this function isn't used anywhere, so simply remove it. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index df8edf8ec914..8ad9b6d9d4e0 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -182,7 +182,6 @@ struct svcxprt_rdma { /* svc_rdma_marshal.c */ extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *); -extern int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *); extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, struct rpcrdma_msg *, enum rpcrdma_errcode, u32 *); -- cgit v1.2.3 From e842f2903908934187af7232fb5b21da527d1757 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 4 Jun 2015 09:18:18 +1000 Subject: dax: don't abuse get_block mapping for endio callbacks dax_fault() currently relies on the get_block callback to attach an io completion callback to the mapping buffer head so that it can run unwritten extent conversion after zeroing allocated blocks. Instead of this hack, pass the conversion callback directly into dax_fault() similar to the get_block callback. 
When the filesystem allocates unwritten extents, it will set the buffer_unwritten() flag, and hence the dax_fault code can call the completion function in the contexts where it is necessary without overloading the mapping buffer head. Note: The changes to ext4 to use this interface are suspect at best. In fact, the way ext4 did this end_io assignment in the first place looks suspect because it only set a completion callback when there wasn't already some other write() call taking place on the same inode. The ext4 end_io code looks rather intricate and fragile with all its reference counting and passing to different contexts for modification via inode private pointers that aren't protected by locks... Signed-off-by: Dave Chinner Acked-by: Jan Kara Signed-off-by: Dave Chinner --- include/linux/fs.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 35ec87e490b1..c9b4cca9e08d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -70,6 +70,7 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, ssize_t bytes, void *private); +typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate); #define MAY_EXEC 0x00000001 #define MAY_WRITE 0x00000002 @@ -2627,9 +2628,10 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t, int dax_clear_blocks(struct inode *, sector_t block, long size); int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); -int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); +int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, + dax_iodone_t); int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); -#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) +#define 
dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod) #ifdef CONFIG_BLOCK typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, -- cgit v1.2.3 From ce5c5d554dc47a4fb4360c84b72231fea081e7a0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 4 Jun 2015 09:18:18 +1000 Subject: dax: expose __dax_fault for filesystems with locking constraints Some filesystems cannot call dax_fault() directly because they have different locking and/or allocation constraints in the page fault IO path. To handle this, we need to follow the same model as the generic block_page_mkwrite code, where the internals are exposed via __block_page_mkwrite() so that filesystems can wrap the correct locking and operations around the outside. This is loosely based on a patch originally from Matthew Wilcox. Unlike the original patch, it does not change ext4 code, error returns or unwritten extent conversion handling. It also adds a __dax_mkwrite() wrapper for .page_mkwrite implementations to do the right thing, too. 
Signed-off-by: Dave Chinner Reviewed-by: Jan Kara Signed-off-by: Dave Chinner --- include/linux/fs.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c9b4cca9e08d..5784377e7c56 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2630,8 +2630,11 @@ int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, dax_iodone_t); +int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, + dax_iodone_t); int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); -#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod) +#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod) +#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod) #ifdef CONFIG_BLOCK typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, -- cgit v1.2.3 From 12f7c14aa602f15ad60e5a9da459271f63b92917 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Thu, 4 Jun 2015 00:01:21 +0900 Subject: crypto: doc - Fix typo in crypto-API.xml This patch fixes some typos found in crypto-API.xml. Because the file is generated from comments in the sources, I had to fix the typos in the sources. Signed-off-by: Masanari Iida Acked-by: Stephan Mueller Signed-off-by: Herbert Xu --- include/linux/crypto.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 7d290a91c6f9..25a4b71d6d1f 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -451,7 +451,7 @@ struct compress_alg { * transformation algorithm. * @cra_type: Type of the cryptographic transformation. This is a pointer to * struct crypto_type, which implements callbacks common for all - * trasnformation types. 
There are multiple options: + * transformation types. There are multiple options: * &crypto_blkcipher_type, &crypto_ablkcipher_type, * &crypto_ahash_type, &crypto_aead_type, &crypto_rng_type. * This field might be empty. In that case, there are no common -- cgit v1.2.3 From 64d6067057d9658acb8675afcfba549abdb7fc16 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 7 May 2015 11:36:11 +0200 Subject: KVM: x86: stubs for SMM support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds the interface between x86.c and the emulator: the SMBASE register, a new emulator flag, the RSM instruction. It also adds a new request bit that will be used by the KVM_SMI ioctl. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a8bcbc9c6078..b019fee6d941 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -134,6 +134,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_ENABLE_IBS 23 #define KVM_REQ_DISABLE_IBS 24 #define KVM_REQ_APIC_PAGE_RELOAD 25 +#define KVM_REQ_SMI 26 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 -- cgit v1.2.3 From 9e7c8f8c62c1e1cda203b5bfaba4575b141e42e7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 4 Jun 2015 16:22:16 -0400 Subject: signals: don't abuse __flush_signals() in selinux_bprm_committed_creds() selinux_bprm_committed_creds()->__flush_signals() is not right, we shouldn't clear TIF_SIGPENDING unconditionally. There can be other reasons for signal_pending(): freezing(), JOBCTL_PENDING_MASK, and potentially more. Also change this code to check fatal_signal_pending() rather than SIGNAL_GROUP_EXIT, it looks a bit better. Now we can kill __flush_signals() before it finds another buggy user. 
Note: this code looks racy, we can flush a signal which was sent after the task SID has been updated. Signed-off-by: Oleg Nesterov Signed-off-by: Paul Moore --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8222ae40ecb0..4f84aade8b4d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2373,7 +2373,6 @@ extern void sched_dead(struct task_struct *p); extern void proc_caches_init(void); extern void flush_signals(struct task_struct *); -extern void __flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); -- cgit v1.2.3 From 30b7e246a6222f1fbad39b1451273375306fe1e2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 4 Jun 2015 11:21:10 -0400 Subject: svcrdma: Keep rpcrdma_msg fields in network byte-order Fields in struct rpcrdma_msg are __be32. Don't byte-swap these fields when decoding RPC calls and then swap them back for the reply. For the most part, they can be left alone. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 8ad9b6d9d4e0..c03ca0a1b743 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -184,7 +184,7 @@ struct svcxprt_rdma { extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *); extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, struct rpcrdma_msg *, - enum rpcrdma_errcode, u32 *); + enum rpcrdma_errcode, __be32 *); extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int); extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int); extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int, -- cgit v1.2.3 From b7e0b9a965a116341b4ef86ab98ea2843b218271 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 4 Jun 2015 11:21:20 -0400 Subject: svcrdma: Replace GFP_KERNEL in a loop with GFP_NOFAIL At the 2015 LSF/MM, it was requested that memory allocation call sites that request GFP_KERNEL allocations in a loop should be annotated with __GFP_NOFAIL. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index c03ca0a1b743..d26384b22126 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -211,7 +211,6 @@ extern int svc_rdma_sendto(struct svc_rqst *); extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *); extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, enum rpcrdma_errcode); -struct page *svc_rdma_get_page(void); extern int svc_rdma_post_recv(struct svcxprt_rdma *); extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); -- cgit v1.2.3 From 0380a3f37540ad0582b3c749a74fc127af914689 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 4 Jun 2015 11:21:32 -0400 Subject: svcrdma: Add a separate "max data segs" macro for svcrdma The server and client maximum are architecturally independent. Allow changing one without affecting the other. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index d26384b22126..cb94ee4181d4 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -172,6 +172,13 @@ struct svcxprt_rdma { #define RDMAXPRT_SQ_PENDING 2 #define RDMAXPRT_CONN_PENDING 3 +#define RPCRDMA_MAX_SVC_SEGS (64) /* server max scatter/gather */ +#if RPCSVC_MAXPAYLOAD < (RPCRDMA_MAX_SVC_SEGS << PAGE_SHIFT) +#define RPCRDMA_MAXPAYLOAD RPCSVC_MAXPAYLOAD +#else +#define RPCRDMA_MAXPAYLOAD (RPCRDMA_MAX_SVC_SEGS << PAGE_SHIFT) +#endif + #define RPCRDMA_LISTEN_BACKLOG 10 /* The default ORD value is based on two outstanding full-size writes with a * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ. */ -- cgit v1.2.3 From 42aecaa9bb2bd57eb8d61b4565cee5d3640863fb Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:39 -0700 Subject: net: Get skb hash over flow_keys structure This patch changes flow hashing to use jhash2 over the flow_keys structure instead just doing jhash_3words over src, dst, and ports. This method will allow us take more input into the hashing function so that we can include full IPv6 addresses, VLAN, flow labels etc. without needing to resort to xor'ing which makes for a poor hash. Acked-by: Jiri Pirko Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6b41c15efa27..cc612fc0a894 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1943,7 +1943,7 @@ static inline void skb_probe_transport_header(struct sk_buff *skb, if (skb_transport_header_was_set(skb)) return; else if (skb_flow_dissect_flow_keys(skb, &keys)) - skb_set_transport_header(skb, keys.basic.thoff); + skb_set_transport_header(skb, keys.control.thoff); else skb_set_transport_header(skb, offset_hint); } -- cgit v1.2.3 From 01949d0109ee5fae33752f0db99a36f1619e1873 Mon Sep 17 00:00:00 2001 From: Haggai Abramovsky Date: Thu, 4 Jun 2015 19:30:38 +0300 Subject: net/mlx5_core: Enable XRCs and SRQs when using ISSI > 0 When working in ISSI > 0 mode, the model exposed by the device for XRCs and SRQs is different. XRCs use XRC SRQs and plain SRQs are based on RMP (Receive Memory Pool). Add helper functions to create, modify, query, and arm XRC SRQs and RMPs. Signed-off-by: Haggai Abramovsky Signed-off-by: Or Gerlitz Signed-off-by: David S. 
Miller --- include/linux/mlx5/driver.h | 6 +++++- include/linux/mlx5/mlx5_ifc.h | 16 ++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 7fa26f03acc1..ba9f212c94bb 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -339,6 +339,8 @@ struct mlx5_core_mr { enum mlx5_res_type { MLX5_RES_QP, + MLX5_RES_SRQ, + MLX5_RES_XSRQ, }; struct mlx5_core_rsc_common { @@ -348,6 +350,7 @@ struct mlx5_core_rsc_common { }; struct mlx5_core_srq { + struct mlx5_core_rsc_common common; /* must be first */ u32 srqn; int max; int max_gs; @@ -640,7 +643,8 @@ struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev, void mlx5_free_cmd_mailbox_chain(struct mlx5_core_dev *dev, struct mlx5_cmd_mailbox *head); int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, - struct mlx5_create_srq_mbox_in *in, int inlen); + struct mlx5_create_srq_mbox_in *in, int inlen, + int is_xrc); int mlx5_core_destroy_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq); int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, struct mlx5_query_srq_mbox_out *out); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index b27e9f6e090a..dbe2b32c0539 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2022,12 +2022,9 @@ struct mlx5_ifc_srqc_bits { u8 reserved_9[0x40]; - u8 db_record_addr_h[0x20]; - - u8 db_record_addr_l[0x1e]; - u8 reserved_10[0x2]; + u8 dbr_addr[0x40]; - u8 reserved_11[0x80]; + u8 reserved_10[0x80]; }; enum { @@ -4167,6 +4164,13 @@ struct mlx5_ifc_modify_rmp_out_bits { u8 reserved_1[0x40]; }; +struct mlx5_ifc_rmp_bitmask_bits { + u8 reserved[0x20]; + + u8 reserved1[0x1f]; + u8 lwm[0x1]; +}; + struct mlx5_ifc_modify_rmp_in_bits { u8 opcode[0x10]; u8 reserved_0[0x10]; @@ -4180,7 +4184,7 @@ struct mlx5_ifc_modify_rmp_in_bits { u8 
reserved_3[0x20]; - u8 modify_bitmask[0x40]; + struct mlx5_ifc_rmp_bitmask_bits bitmask; u8 reserved_4[0x40]; -- cgit v1.2.3 From d18a9470f89727f870db944a36223bf1bb15bdc1 Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Thu, 4 Jun 2015 19:30:40 +0300 Subject: net/mlx5_core: Make the vport helpers available for the IB driver too Move the vport header file to be under include/linux/mlx5, such that the mlx5 IB can use it as well. Also add nic_ prefix to the vport NIC commands to differentiate between HCA vport commands and NIC vport commands. Signed-off-by: Majd Dibbiny Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/vport.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 include/linux/mlx5/vport.h (limited to 'include/linux') diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h new file mode 100644 index 000000000000..99d0e9f85432 --- /dev/null +++ b/include/linux/mlx5/vport.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MLX5_VPORT_H__ +#define __MLX5_VPORT_H__ + +#include <linux/mlx5/driver.h> + +u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod); +void mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u8 *addr); + +#endif /* __MLX5_VPORT_H__ */ -- cgit v1.2.3 From 707c4602cda6624940761b66a4119f1909492385 Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Thu, 4 Jun 2015 19:30:41 +0300 Subject: net/mlx5_core: Add new query HCA vport commands Added the implementation for the following commands: 1. QUERY_HCA_VPORT_GID 2. QUERY_HCA_VPORT_PKEY 3. QUERY_HCA_VPORT_CONTEXT They will be needed when we move to work with ISSI > 0 in the IB driver too. Signed-off-by: Majd Dibbiny Signed-off-by: Or Gerlitz Signed-off-by: David S. 
Miller --- include/linux/mlx5/device.h | 13 +++++++++++++ include/linux/mlx5/driver.h | 45 +++++++++++++++++++++++++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 23 ++++++++++++++-------- include/linux/mlx5/vport.h | 14 ++++++++++++++ 4 files changed, 87 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index b288c538347a..b2c43508a737 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -99,6 +99,12 @@ __mlx5_mask(typ, fld)) #define MLX5_GET64(typ, p, fld) be64_to_cpu(*((__be64 *)(p) + __mlx5_64_off(typ, fld))) +#define MLX5_GET64_PR(typ, p, fld) ({ \ + u64 ___t = MLX5_GET64(typ, p, fld); \ + pr_debug(#fld " = 0x%llx\n", ___t); \ + ___t; \ +}) + enum { MLX5_MAX_COMMANDS = 32, MLX5_CMD_DATA_BLOCK_SIZE = 512, @@ -1172,4 +1178,11 @@ enum { MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR = 0x40, }; +static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz) +{ + if (pkey_sz > MLX5_MAX_LOG_PKEY_TABLE) + return 0; + return MLX5_MIN_PKEY_TABLE_SIZE << pkey_sz; +} + #endif /* MLX5_DEVICE_H */ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ba9f212c94bb..8ab8b8af5c32 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -553,6 +553,41 @@ struct mlx5_pas { u8 log_sz; }; +enum port_state_policy { + MLX5_AAA_000 +}; + +enum phy_port_state { + MLX5_AAA_111 +}; + +struct mlx5_hca_vport_context { + u32 field_select; + bool sm_virt_aware; + bool has_smi; + bool has_raw; + enum port_state_policy policy; + enum phy_port_state phys_state; + enum ib_port_state vport_state; + u8 port_physical_state; + u64 sys_image_guid; + u64 port_guid; + u64 node_guid; + u32 cap_mask1; + u32 cap_mask1_perm; + u32 cap_mask2; + u32 cap_mask2_perm; + u16 lid; + u8 init_type_reply; /* bitmask: see ib spec 14.2.5.6 InitTypeReply */ + u8 lmc; + u8 subnet_timeout; + u16 sm_lid; + u8 sm_sl; + u16 qkey_violation_counter; + u16 pkey_violation_counter; + bool 
grh_required; +}; + static inline void *mlx5_buf_offset(struct mlx5_buf *buf, int offset) { return buf->direct.buf + offset; @@ -792,4 +827,14 @@ struct mlx5_profile { } mr_cache[MAX_MR_CACHE_ENTRIES]; }; +static inline int mlx5_get_gid_table_len(u16 param) +{ + if (param > 4) { + pr_warn("gid table length is zero\n"); + return 0; + } + + return 8 * (1 << param); +} + #endif /* MLX5_DRIVER_H */ diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index dbe2b32c0539..f06d054ad021 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2221,12 +2221,15 @@ struct mlx5_ifc_hca_vport_context_bits { u8 has_smi[0x1]; u8 has_raw[0x1]; u8 grh_required[0x1]; - u8 reserved_1[0x10]; - u8 port_state_policy[0x4]; - u8 phy_port_state[0x4]; + u8 reserved_1[0xc]; + u8 port_physical_state[0x4]; + u8 vport_state_policy[0x4]; + u8 port_state[0x4]; u8 vport_state[0x4]; - u8 reserved_2[0x60]; + u8 reserved_2[0x20]; + + u8 system_image_guid[0x40]; u8 port_guid[0x40]; @@ -3490,7 +3493,8 @@ struct mlx5_ifc_query_hca_vport_pkey_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_2[0xf]; + u8 reserved_2[0xb]; + u8 port_num[0x4]; u8 vport_number[0x10]; u8 reserved_3[0x10]; @@ -3519,7 +3523,8 @@ struct mlx5_ifc_query_hca_vport_gid_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_2[0xf]; + u8 reserved_2[0xb]; + u8 port_num[0x4]; u8 vport_number[0x10]; u8 reserved_3[0x10]; @@ -3545,7 +3550,8 @@ struct mlx5_ifc_query_hca_vport_context_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_2[0xf]; + u8 reserved_2[0xb]; + u8 port_num[0x4]; u8 vport_number[0x10]; u8 reserved_3[0x20]; @@ -4243,7 +4249,8 @@ struct mlx5_ifc_modify_hca_vport_context_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_2[0xf]; + u8 reserved_2[0xb]; + u8 port_num[0x4]; u8 vport_number[0x10]; u8 reserved_3[0x20]; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 99d0e9f85432..67882a834efb 100644 --- 
a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -37,5 +37,19 @@ u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod); void mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u8 *addr); +int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport, + u8 port_num, u16 vf_num, u16 gid_index, + union ib_gid *gid); +int mlx5_query_hca_vport_pkey(struct mlx5_core_dev *dev, u8 other_vport, + u8 port_num, u16 vf_num, u16 pkey_index, + u16 *pkey); +int mlx5_query_hca_vport_context(struct mlx5_core_dev *dev, + u8 other_vport, u8 port_num, + u16 vf_num, + struct mlx5_hca_vport_context *rep); +int mlx5_query_hca_vport_system_image_guid(struct mlx5_core_dev *dev, + __be64 *sys_image_guid); +int mlx5_query_hca_vport_node_guid(struct mlx5_core_dev *dev, + u64 *node_guid); #endif /* __MLX5_VPORT_H__ */ -- cgit v1.2.3 From 211e6c80e5a68ef39a81484583e8efbf9774627d Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Thu, 4 Jun 2015 19:30:42 +0300 Subject: net/mlx5_core: Get vendor-id using the query adapter command Add two wrapper functions to the query adapter command: 1. mlx5_query_board_id -- replaces the old mlx5_cmd_query_adapter. 2. mlx5_core_query_vendor_id -- retrieves the vendor_id from the query_adapter command. Signed-off-by: Majd Dibbiny Signed-off-by: Or Gerlitz Signed-off-by: David S. 
Miller --- include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8ab8b8af5c32..b90fb9336d21 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -817,6 +817,7 @@ struct mlx5_interface { void *mlx5_get_protocol_dev(struct mlx5_core_dev *mdev, int protocol); int mlx5_register_interface(struct mlx5_interface *intf); void mlx5_unregister_interface(struct mlx5_interface *intf); +int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id); struct mlx5_profile { u64 mask; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f06d054ad021..6d2f6fee041c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2470,9 +2470,12 @@ union mlx5_ifc_cong_control_roce_ecn_auto_bits { }; struct mlx5_ifc_query_adapter_param_block_bits { - u8 reserved_0[0xe0]; + u8 reserved_0[0xc0]; - u8 reserved_1[0x10]; + u8 reserved_1[0x8]; + u8 ieee_vendor_id[0x18]; + + u8 reserved_2[0x10]; u8 vsd_vendor_id[0x10]; u8 vsd[208][0x8]; -- cgit v1.2.3 From e760152d08da78aa160e68ac90bf8f3f10aff462 Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Thu, 4 Jun 2015 19:30:43 +0300 Subject: net/mlx5_core: Use port number in the query port mtu helpers Extend the function prototypes for max and operational mtu to take the local port number. In the Ethernet driver this is hard coded to one, since ConnectX4 Ethernet devices are always function-per-port. The IB driver also serves older devices (ConnectIB) which are not function-per-port, and hence the port can vary. Signed-off-by: Majd Dibbiny Signed-off-by: Or Gerlitz Signed-off-by: David S. 
Miller --- include/linux/mlx5/driver.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index b90fb9336d21..cd09784b6999 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -751,8 +751,10 @@ int mlx5_set_port_status(struct mlx5_core_dev *dev, int mlx5_query_port_status(struct mlx5_core_dev *dev, u8 *status); int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu); -int mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu); -int mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu); +int mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu, + u8 local_port); +int mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu, + u8 local_port); int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq); void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq); -- cgit v1.2.3 From a05bdefa4081d43f9c86c3bb693d0492a21590da Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Thu, 4 Jun 2015 19:30:44 +0300 Subject: net/mlx5_core: Use port number when querying port ptys Until now, mlx5_query_port_ptys always queried port number one. Added new argument in the function's prototype so we can also query the second port. This will be needed when the helper will be invoked from the IB driver on non FPP (Function-Per-Port) devices. Signed-off-by: Majd Dibbiny Signed-off-by: Or Gerlitz Signed-off-by: David S. 
Miller --- include/linux/mlx5/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index cd09784b6999..e4b814f64014 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -739,7 +739,7 @@ int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in, int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps); int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys, - int ptys_size, int proto_mask); + int ptys_size, int proto_mask, u8 local_port); int mlx5_query_port_proto_cap(struct mlx5_core_dev *dev, u32 *proto_cap, int proto_mask); int mlx5_query_port_proto_admin(struct mlx5_core_dev *dev, -- cgit v1.2.3 From a124d13ef59e09941fc0924fd7c29ae6d7cd77a3 Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Thu, 4 Jun 2015 19:30:45 +0300 Subject: net/mlx5_core: Add more query port helpers Add the following helpers: 1. mlx5_query_port_proto_oper -- queries the port speed port mask 2. mlx5_query_port_link_width_oper - queries the port link width bitmask 3. mlx5_query_port_vl_hw_cap - queries the Virtual Lanes supported on this port These helpers will be used from the IB driver when working in ISSI > 0 mode. Signed-off-by: Majd Dibbiny Signed-off-by: Or Gerlitz Signed-off-by: David S. 
Miller --- include/linux/mlx5/driver.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index e4b814f64014..6093bde16b94 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -107,6 +107,7 @@ enum { MLX5_REG_PUDE = 0x5009, MLX5_REG_PMPE = 0x5010, MLX5_REG_PELC = 0x500e, + MLX5_REG_PVLC = 0x500f, MLX5_REG_PMLP = 0, /* TBD */ MLX5_REG_NODE_DESC = 0x6001, MLX5_REG_HOST_ENDIANNESS = 0x7004, @@ -744,6 +745,11 @@ int mlx5_query_port_proto_cap(struct mlx5_core_dev *dev, u32 *proto_cap, int proto_mask); int mlx5_query_port_proto_admin(struct mlx5_core_dev *dev, u32 *proto_admin, int proto_mask); +int mlx5_query_port_link_width_oper(struct mlx5_core_dev *dev, + u8 *link_width_oper, u8 local_port); +int mlx5_query_port_proto_oper(struct mlx5_core_dev *dev, + u8 *proto_oper, int proto_mask, + u8 local_port); int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin, int proto_mask); int mlx5_set_port_status(struct mlx5_core_dev *dev, @@ -755,6 +761,8 @@ int mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu, u8 local_port); int mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu, u8 local_port); +int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev, + u8 *vl_hw_cap, u8 local_port); int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq); void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq); -- cgit v1.2.3 From 03fbf488cece461468d3abb795f5e5f055e00040 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Thu, 4 Jun 2015 16:55:10 +0300 Subject: spi: pxa2xx: Differentiate Intel LPSS types Intel LPSS SPI properties differ between platforms. Now private registers offset 0x400 or 0x800 is autodetected but there is need to support also other offset and handle a few other differences. 
Prepare for that by splitting the LPSS_SSP type into compatible hardware types and set it now based on PCI or ACPI ID. That type will be used to set properties that differ between current and upcoming platforms. Signed-off-by: Jarkko Nikula Signed-off-by: Mark Brown --- include/linux/pxa2xx_ssp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h index dab545bb66b3..95a4b3bd7a5c 100644 --- a/include/linux/pxa2xx_ssp.h +++ b/include/linux/pxa2xx_ssp.h @@ -194,8 +194,9 @@ enum pxa_ssp_type { PXA168_SSP, PXA910_SSP, CE4100_SSP, - LPSS_SSP, QUARK_X1000_SSP, + LPSS_LPT_SSP, + LPSS_BYT_SSP, }; struct ssp_device { -- cgit v1.2.3 From dccf7369652f3934456345aab6a92fa905177886 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Thu, 4 Jun 2015 16:55:11 +0300 Subject: spi: pxa2xx: Prepare for new Intel LPSS SPI type Some of the Intel LPSS SPI properties will be different in upcoming platforms compared to existing Lynxpoint and BayTrail/Braswell. LPSS SPI private registers will be at different offset and there will be changes in individual registers and default FIFO thresholds too. Add configuration for these differences and use them in runtime based on LPSS SSP type. With this change private registers offset autodetection becomes needless. 
Signed-off-by: Jarkko Nikula Signed-off-by: Mark Brown --- include/linux/pxa2xx_ssp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h index 95a4b3bd7a5c..0485bab061fd 100644 --- a/include/linux/pxa2xx_ssp.h +++ b/include/linux/pxa2xx_ssp.h @@ -195,7 +195,7 @@ enum pxa_ssp_type { PXA910_SSP, CE4100_SSP, QUARK_X1000_SSP, - LPSS_LPT_SSP, + LPSS_LPT_SSP, /* Keep LPSS types sorted with lpss_platforms[] */ LPSS_BYT_SSP, }; -- cgit v1.2.3 From 8e73485c7959fd25650761eab04db1e72ea14c23 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 17 May 2015 13:58:53 +0200 Subject: KVM: add vcpu-specific functions to read/write/translate GFNs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need to hide SMRAM from guests not running in SMM. Therefore, all uses of kvm_read_guest* and kvm_write_guest* must be changed to use different address spaces, depending on whether the VCPU is in system management mode. We need to introduce a new family of functions for this purpose. For now, the VCPU-based functions have the same behavior as the existing per-VM ones, they just accept a different type for the first argument. Later however they will be changed to use one of many "struct kvm_memslots" stored in struct kvm, through an architecture hook. VM-based functions will unconditionally use the first memslots pointer. Whenever possible, this patch introduces slot-based functions with an __ prefix, with two wrappers for generic and vcpu-based actions. The exceptions are kvm_read_guest and kvm_write_guest, which are copied into the new functions kvm_vcpu_read_guest and kvm_vcpu_write_guest. 
Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b019fee6d941..ba1ea43998e4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -471,6 +471,11 @@ static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm) || lockdep_is_held(&kvm->slots_lock)); } +static inline struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu) +{ + return kvm_memslots(vcpu->kvm); +} + static inline struct kvm_memory_slot * id_to_memslot(struct kvm_memslots *slots, int id) { @@ -576,6 +581,25 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); +struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); +struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); +pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn); +pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); +struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn); +unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); +unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); +int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, + int len); +int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, + unsigned long len); +int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, + unsigned long len); +int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void *data, + int offset, int len); +int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, + unsigned long len); +void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); + void kvm_vcpu_block(struct kvm_vcpu *vcpu); void 
kvm_vcpu_kick(struct kvm_vcpu *vcpu); int kvm_vcpu_yield_to(struct kvm_vcpu *target); -- cgit v1.2.3 From f481b069e674378758c73761827e83ab05c46b52 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 17 May 2015 17:30:37 +0200 Subject: KVM: implement multiple address spaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only two ioctls have to be modified; the address space id is placed in the higher 16 bits of their slot id argument. As of this patch, no architecture defines more than one address space; x86 will be the first. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ba1ea43998e4..9564fd78c547 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -44,6 +44,10 @@ /* Two fragments for cross MMIO pages. */ #define KVM_MAX_MMIO_FRAGMENTS 2 +#ifndef KVM_ADDRESS_SPACE_NUM +#define KVM_ADDRESS_SPACE_NUM 1 +#endif + /* * For the normal pfn, the highest 12 bits should be zero, * so we can mask bit 62 ~ bit 52 to indicate the error pfn, @@ -331,6 +335,13 @@ struct kvm_kernel_irq_routing_entry { #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) #endif +#ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE +static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) +{ + return 0; +} +#endif + /* * Note: * memslots are not sorted by id anymore, please use id_to_memslot() @@ -349,7 +360,7 @@ struct kvm { spinlock_t mmu_lock; struct mutex slots_lock; struct mm_struct *mm; /* userspace tied to this vm */ - struct kvm_memslots *memslots; + struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM]; struct srcu_struct srcu; struct srcu_struct irq_srcu; #ifdef CONFIG_KVM_APIC_ARCHITECTURE @@ -464,16 +475,23 @@ void kvm_exit(void); void kvm_get_kvm(struct kvm *kvm); void kvm_put_kvm(struct 
kvm *kvm); -static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm) +static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id) { - return rcu_dereference_check(kvm->memslots, + return rcu_dereference_check(kvm->memslots[as_id], srcu_read_lock_held(&kvm->srcu) || lockdep_is_held(&kvm->slots_lock)); } +static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm) +{ + return __kvm_memslots(kvm, 0); +} + static inline struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu) { - return kvm_memslots(vcpu->kvm); + int as_id = kvm_arch_vcpu_memslots_id(vcpu); + + return __kvm_memslots(vcpu->kvm, as_id); } static inline struct kvm_memory_slot * -- cgit v1.2.3 From 3f21c265cd5f7ae867cc0e86a1f6d5093f1963cc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Jun 2015 10:57:37 -0600 Subject: block: add blk_set_queue_dying() to blkdev.h We export this function and NVMe wants to use it, but for some reason it was never added to the block header. Do that. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ccaa9aecd593..a31380c35918 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1024,6 +1024,7 @@ bool __must_check blk_get_queue(struct request_queue *); struct request_queue *blk_alloc_queue(gfp_t); struct request_queue *blk_alloc_queue_node(gfp_t, int); extern void blk_put_queue(struct request_queue *); +extern void blk_set_queue_dying(struct request_queue *); /* * block layer runtime pm functions -- cgit v1.2.3 From a5768aa887fb636f0cc4c83a2f1242506aaf50f6 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 1 Jun 2015 14:28:14 -0600 Subject: NVMe: Automatic namespace rescan Namespaces may be dynamically allocated and deleted or attached and detached. This has the driver rescan the device for namespace changes after each device reset or namespace change asynchronous event. 
There could potentially be many detached namespaces that we don't want polluting /dev/ with unusable block handles, so this will delete disks if the namespace is not active as indicated by the response from identify namespace. This also skips adding the disk if no capacity is provisioned to the namespace in the first place. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 986bf8ad8e93..c0d94ed8ce9a 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -92,6 +92,7 @@ struct nvme_dev { work_func_t reset_workfn; struct work_struct reset_work; struct work_struct probe_work; + struct work_struct scan_work; char name[12]; char serial[20]; char model[40]; -- cgit v1.2.3 From d691f9e8d4405c334aa10d556e73c8bf44cb0e01 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 4 Jun 2015 10:11:54 -0700 Subject: bpf: allow programs to write to certain skb fields allow programs read/write skb->mark, tc_index fields and ((struct qdisc_skb_cb *)cb)->data. mark and tc_index are generically useful in TC. cb[0]-cb[4] are primarily used to pass arguments from one program to another called via bpf_tail_call() which can be seen in sockex3_kern.c example. All fields of 'struct __sk_buff' are readable to socket and tc_cls_act progs. mark, tc_index are writeable from tc_cls_act only. cb[0]-cb[4] are writeable by both sockets and tc_cls_act. Add verifier tests and improve sample code. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ca854e5bb2f7..2235aee8096a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -105,7 +105,8 @@ struct bpf_verifier_ops { */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type); - u32 (*convert_ctx_access)(int dst_reg, int src_reg, int ctx_off, + u32 (*convert_ctx_access)(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, struct bpf_insn *insn); }; -- cgit v1.2.3 From 4eaca0a887eaee04fc7a3866d0f5b51b34030dfa Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 4 Jun 2015 17:39:08 +0200 Subject: preempt: Use preempt_schedule_context() as the official tracing preemption point preempt_schedule_context() is a tracing safe preemption point but it's only used when CONFIG_CONTEXT_TRACKING=y. Other configs have tracing recursion issues since commit: b30f0e3ffedf ("sched/preempt: Optimize preemption operations on __schedule() callers") introduced function based preempt_count_*() ops. Let's make it available on all configs and give it a more appropriate name for its new position. Reported-by: Fengguang Wu Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1433432349-1021-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index a1a00e14c14f..7686dd63bc35 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -204,15 +204,11 @@ do { \ #ifdef CONFIG_PREEMPT -#ifndef CONFIG_CONTEXT_TRACKING -#define __preempt_schedule_context() __preempt_schedule() -#endif - #define preempt_enable_notrace() \ do { \ barrier(); \ if (unlikely(__preempt_count_dec_and_test())) \ - __preempt_schedule_context(); \ + __preempt_schedule_notrace(); \ } while (0) #else #define preempt_enable_notrace() \ -- cgit v1.2.3 From 9a92e3dc6ad02208a014d0d8404ebbd697e3d5ef Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 4 Jun 2015 17:39:09 +0200 Subject: preempt: Reorganize the notrace definitions a bit preempt.h has two separate "#ifdef CONFIG_PREEMPT" sections: one to define preempt_enable() and another to define preempt_enable_notrace(). Let's gather both. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Fengguang Wu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1433432349-1021-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 7686dd63bc35..0f1534acaf60 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -175,48 +175,46 @@ do { \ __preempt_schedule(); \ } while (0) +#define preempt_enable_notrace() \ +do { \ + barrier(); \ + if (unlikely(__preempt_count_dec_and_test())) \ + __preempt_schedule_notrace(); \ +} while (0) + #define preempt_check_resched() \ do { \ if (should_resched()) \ __preempt_schedule(); \ } while (0) -#else +#else /* !CONFIG_PREEMPT */ #define preempt_enable() \ do { \ barrier(); \ preempt_count_dec(); \ } while (0) -#define preempt_check_resched() do { } while (0) -#endif - -#define preempt_disable_notrace() \ -do { \ - __preempt_count_inc(); \ - barrier(); \ -} while (0) -#define preempt_enable_no_resched_notrace() \ +#define preempt_enable_notrace() \ do { \ barrier(); \ __preempt_count_dec(); \ } while (0) -#ifdef CONFIG_PREEMPT +#define preempt_check_resched() do { } while (0) +#endif /* CONFIG_PREEMPT */ -#define preempt_enable_notrace() \ +#define preempt_disable_notrace() \ do { \ + __preempt_count_inc(); \ barrier(); \ - if (unlikely(__preempt_count_dec_and_test())) \ - __preempt_schedule_notrace(); \ } while (0) -#else -#define preempt_enable_notrace() \ + +#define preempt_enable_no_resched_notrace() \ do { \ barrier(); \ __preempt_count_dec(); \ } while (0) -#endif #else /* !CONFIG_PREEMPT_COUNT */ -- cgit v1.2.3 From 21509084f999d7accd32e45961ef76853112e978 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 6 May 2015 15:33:49 -0400 Subject: perf/x86/intel: Handle multiple records in the PEBS buffer When the PEBS interrupt 
threshold is larger than one record and the machine supports multiple PEBS events, the records of these events are mixed up and we need to demultiplex them. Demuxing the records is hard because the hardware is deficient. The hardware has two issues that, when combined, create impossible scenarios to demux. The first issue is that the 'status' field of the PEBS record is a copy of the GLOBAL_STATUS MSR at PEBS assist time. To see why this is a problem let us first describe the regular PEBS cycle: A) the CTRn value reaches 0: - the corresponding bit in GLOBAL_STATUS gets set - we start arming the hardware assist < some unspecified amount of time later -- this could cover multiple events of interest > B) the hardware assist is armed, any next event will trigger it C) a matching event happens: - the hardware assist triggers and generates a PEBS record this includes a copy of GLOBAL_STATUS at this moment - if we auto-reload we (re)set CTRn - we clear the relevant bit in GLOBAL_STATUS Now consider the following chain of events: A0, B0, A1, C0 The event generated for counter 0 will include a status with counter 1 set, even though it's not at all related to the record. A similar thing can happen with a !PEBS event if it just happens to overflow at the right moment. The second issue is that the hardware will only emit one record for two or more counters if the event that triggers the assist is 'close'. The 'close' can be several cycles. In some cases even the complete assist, if the event is something that doesn't need retirement. For instance, consider this chain of events: A0, B0, A1, B1, C01 Where C01 is an event that triggers both hardware assists, we will generate but a single record, but again with both counters listed in the status field. This time the record pertains to both events. Note that these two cases are different but indistinguishable with the data as generated. 
Therefore demuxing records with multiple PEBS bits (we can safely ignore status bits for !PEBS counters) is impossible. Furthermore we cannot emit the record to both events because that might cause a data leak -- the events might not have the same privileges -- so what this patch does is discard such events. The assumption/hope is that such discards will be rare. Here lists some possible ways you may get high discard rate. - when you count the same thing multiple times. But it is not a useful configuration. - you can be unfortunate if you measure with a userspace only PEBS event along with either a kernel or unrestricted PEBS event. Imagine the event triggering and setting the overflow flag right before entering the kernel. Then all kernel side events will end up with multiple bits set. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang [ Changelog improvements. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: eranian@google.com Link: http://lkml.kernel.org/r/1430940834-8964-4-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 06580028cee6..5f192e1bc98e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -730,6 +730,19 @@ extern int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); +extern void perf_event_output(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs); + +extern void +perf_event_header__init_id(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event); +extern void +perf_event__output_id_sample(struct perf_event *event, + struct perf_output_handle *handle, + struct perf_sample_data 
*sample); + static inline bool is_sampling_event(struct perf_event *event) { return event->attr.sample_period != 0; -- cgit v1.2.3 From f38b0dbb491a6987e198aa6b428db8692a6480f8 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Sun, 10 May 2015 15:13:14 -0400 Subject: perf/x86/intel: Introduce PERF_RECORD_LOST_SAMPLES After enlarging the PEBS interrupt threshold, there may be some mixed up PEBS samples which are discarded by the kernel. This patch makes the kernel emit a PERF_RECORD_LOST_SAMPLES record with the number of possible discarded records when it is impossible to demux the samples. It makes sure the user is not left in the dark about such discards. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: eranian@google.com Link: http://lkml.kernel.org/r/1431285195-14269-8-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5f192e1bc98e..a204d5266f5f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -743,6 +743,9 @@ perf_event__output_id_sample(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *sample); +extern void +perf_log_lost_samples(struct perf_event *event, u64 lost); + static inline bool is_sampling_event(struct perf_event *event) { return event->attr.sample_period != 0; -- cgit v1.2.3 From 7cf7fa529d0b6b514949cc67b39e3ce406c37006 Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Sun, 7 Jun 2015 15:44:23 +0300 Subject: net/mlx5_core: Fix static checker warnings around system guid query flow Fix static checker warnings in the flow of system guid query. 
Fixes: 707c4602cda6 ('net/mlx5_core: Add new query HCA vport commands') Signed-off-by: Majd Dibbiny Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/vport.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 67882a834efb..967e0fd06e89 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -48,7 +48,7 @@ int mlx5_query_hca_vport_context(struct mlx5_core_dev *dev, u16 vf_num, struct mlx5_hca_vport_context *rep); int mlx5_query_hca_vport_system_image_guid(struct mlx5_core_dev *dev, - __be64 *sys_image_guid); + u64 *sys_image_guid); int mlx5_query_hca_vport_node_guid(struct mlx5_core_dev *dev, u64 *node_guid); -- cgit v1.2.3 From cb4a316752709be4a644f070440a8be470d92b7d Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 6 Jun 2015 10:02:14 +1000 Subject: cgroup: use bitmask to filter for_each_subsys Add a new macro for_each_subsys_which that allows all enabled cgroup subsystems to be filtered by a bitmask, such that mask & (1 << ssid) determines if the subsystem is to be processed in the loop body (where ssid is the unique id of the subsystem). Also replace the need_forkexit_callback with two separate bitmasks for each callback to make (ss->{fork,exit}) checks unnecessary. tj: add a short comment for "if (!CGROUP_SUBSYS_COUNT)". 
Signed-off-by: Aleksa Sarai --- include/linux/cgroup-defs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 26d1cea7929f..c5588c438448 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -490,6 +490,8 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) #else /* CONFIG_CGROUPS */ +#define CGROUP_SUBSYS_COUNT 0 + static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {} static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} -- cgit v1.2.3 From 3846c15820a1841225d0245afda4875af23dfbbe Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Fri, 5 Jun 2015 15:14:54 -0400 Subject: efi: Work around ia64 build problem with ESRT driver So, I'm told this problem exists in the world: > Subject: Build error in -next due to 'efi: Add esrt support' > > Building ia64:defconfig ... failed > -------------- > Error log: > > drivers/firmware/efi/esrt.c:28:31: fatal error: asm/early_ioremap.h: No such file or directory > I'm not really sure how it's okay that we have things in asm-generic on some platforms but not others - is having it the same everywhere not the whole point of asm-generic? That said, ia64 doesn't have early_ioremap.h . So instead, since it's difficult to imagine new IA64 machines with UEFI 2.5, just don't build this code there. To me this looks like a workaround - doing something like: generic-y += early_ioremap.h in arch/ia64/include/asm/Kbuild would appear to be more correct, but ia64 has its own early_memremap() decl in arch/ia64/include/asm/io.h , and it's a macro. So adding the above /and/ requiring that asm/io.h be included /after/ asm/early_ioremap.h in all cases would fix it, but that's pretty ugly as well. Since I'm not going to spend the rest of my life rectifying ia64 headers vs "generic" headers that aren't generic, it's much simpler to just not build there. 
Note that I've only actually tried to build this patch on x86_64, but esrt.o still gets built there, and that would seem to demonstrate that the conditional building is working correctly at all the places the code built before. I no longer have any ia64 machines handy to test that the exclusion actually works there. Signed-off-by: Peter Jones Acked-by: Tony Luck Reviewed-by: Guenter Roeck (Compile-)Tested-by: Guenter Roeck Signed-off-by: Matt Fleming --- include/linux/efi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 024c27e7c0fa..2092965afca3 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -879,7 +879,11 @@ static inline efi_status_t efi_query_variable_store(u32 attributes, unsigned lon #endif extern void __iomem *efi_lookup_mapped_addr(u64 phys_addr); extern int efi_config_init(efi_config_table_type_t *arch_tables); +#ifdef CONFIG_EFI_ESRT extern void __init efi_esrt_init(void); +#else +static inline void efi_esrt_init(void) { } +#endif extern int efi_config_parse_tables(void *config_tables, int count, int sz, efi_config_table_type_t *arch_tables); extern u64 efi_get_iobase (void); -- cgit v1.2.3 From a8077d6573530a91d5674a28cdedbed39c391ff0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Sun, 7 Jun 2015 13:15:30 +0200 Subject: bcma: make calls to PCI hostmode functions config-safe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- include/linux/bcma/bcma_driver_pci.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bcma/bcma_driver_pci.h b/include/linux/bcma/bcma_driver_pci.h index 5ba6918ca20b..9657f11d48a7 100644 --- a/include/linux/bcma/bcma_driver_pci.h +++ b/include/linux/bcma/bcma_driver_pci.h @@ -246,7 +246,18 @@ static inline void bcma_core_pci_power_save(struct bcma_bus *bus, 
bool up) } #endif +#ifdef CONFIG_BCMA_DRIVER_PCI_HOSTMODE extern int bcma_core_pci_pcibios_map_irq(const struct pci_dev *dev); extern int bcma_core_pci_plat_dev_init(struct pci_dev *dev); +#else +static inline int bcma_core_pci_pcibios_map_irq(const struct pci_dev *dev) +{ + return -ENOTSUPP; +} +static inline int bcma_core_pci_plat_dev_init(struct pci_dev *dev) +{ + return -ENOTSUPP; +} +#endif #endif /* LINUX_BCMA_DRIVER_PCI_H_ */ -- cgit v1.2.3 From 01d72a95188880b22190e937ed8718ed4b45bdce Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 4 Jun 2015 16:38:17 -0500 Subject: PCI: Remove unused pci_dma_burst_advice() pci_dma_burst_advice() was added by e24c2d963a60 ("[PATCH] PCI: DMA bursting advice") but apparently never used. Remove it. Signed-off-by: Bjorn Helgaas Acked-by: Michal Simek # microblaze CC: David S. Miller --- include/linux/pci.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 353db8dc4c6e..08fb4e307d68 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1197,15 +1197,6 @@ int pci_set_vga_state(struct pci_dev *pdev, bool decode, #define pci_pool_alloc(pool, flags, handle) dma_pool_alloc(pool, flags, handle) #define pci_pool_free(pool, vaddr, addr) dma_pool_free(pool, vaddr, addr) -enum pci_dma_burst_strategy { - PCI_DMA_BURST_INFINITY, /* make bursts as large as possible, - strategy_parameter is N/A */ - PCI_DMA_BURST_BOUNDARY, /* disconnect at every strategy_parameter - byte boundaries */ - PCI_DMA_BURST_MULTIPLE, /* disconnect at some multiple of - strategy_parameter byte boundaries */ -}; - struct msix_entry { u32 vector; /* kernel uses to write allocated vector */ u16 entry; /* driver uses to specify entry, OS writes */ @@ -1430,8 +1421,6 @@ static inline int pci_request_regions(struct pci_dev *dev, const char *res_name) { return -EIO; } static inline void pci_release_regions(struct pci_dev *dev) { } -#define pci_dma_burst_advice(pdev, 
strat, strategy_parameter) do { } while (0) - static inline void pci_block_cfg_access(struct pci_dev *dev) { } static inline int pci_block_cfg_access_in_atomic(struct pci_dev *dev) { return 0; } -- cgit v1.2.3 From d711b8b30c803b1b2aedf6a3474758798078f9e1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 6 Jun 2015 11:30:00 +0200 Subject: hrtimers: Make sure hrtimer_resolution is unsigned int MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... in the !CONFIG_HIGH_RES_TIMERS case too. And thus fix warnings like this one: net/sched/sch_api.c: In function ‘psched_show’: net/sched/sch_api.c:1891:6: warning: format ‘%x’ expects argument of type ‘unsigned int’, but argument 6 has type ‘long int’ [-Wformat=] (u32)NSEC_PER_SEC / hrtimer_resolution); Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1433583000-32090-1-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner Cc: Thomas Gleixner --- include/linux/hrtimer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 470d876c2eda..3f82a7edc03d 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -309,7 +309,7 @@ extern unsigned int hrtimer_resolution; # define MONOTONIC_RES_NSEC LOW_RES_NSEC # define KTIME_MONOTONIC_RES KTIME_LOW_RES -#define hrtimer_resolution LOW_RES_NSEC +#define hrtimer_resolution (unsigned int)LOW_RES_NSEC static inline void hrtimer_peek_ahead_timers(void) { } -- cgit v1.2.3 From 2c1296d92ac0367364bcb73a43c12a0bdfbfee75 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 28 May 2015 18:41:32 +0200 Subject: iommu: Add iommu_get_domain_for_dev function This function can be used to request the current domain a device is attached to. 
Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 0546b8710ce3..683a1c4b15e7 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -193,6 +193,7 @@ extern int iommu_attach_device(struct iommu_domain *domain, struct device *dev); extern void iommu_detach_device(struct iommu_domain *domain, struct device *dev); +extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot); extern size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, @@ -332,6 +333,11 @@ static inline void iommu_detach_device(struct iommu_domain *domain, { } +static inline struct iommu_domain *iommu_get_domain_for_dev(struct device *dev) +{ + return NULL; +} + static inline int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, int gfp_order, int prot) { -- cgit v1.2.3 From a1015c2b99b94cf521603b41debf167114031456 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 28 May 2015 18:41:33 +0200 Subject: iommu: Introduce direct mapped region handling Add two new functions to the IOMMU-API to allow the IOMMU drivers to export the requirements for direct mapped regions per device. This is useful for exporting the information in Intel VT-d's RMRR entries or AMD-Vi's unity mappings. 
Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 683a1c4b15e7..689499904166 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -114,6 +114,20 @@ enum iommu_attr { DOMAIN_ATTR_MAX, }; +/** + * struct iommu_dm_region - descriptor for a direct mapped memory region + * @list: Linked list pointers + * @start: System physical start address of the region + * @length: Length of the region in bytes + * @prot: IOMMU Protection flags (READ/WRITE/...) + */ +struct iommu_dm_region { + struct list_head list; + phys_addr_t start; + size_t length; + int prot; +}; + #ifdef CONFIG_IOMMU_API /** @@ -159,6 +173,10 @@ struct iommu_ops { int (*domain_set_attr)(struct iommu_domain *domain, enum iommu_attr attr, void *data); + /* Request/Free a list of direct mapping requirements for a device */ + void (*get_dm_regions)(struct device *dev, struct list_head *list); + void (*put_dm_regions)(struct device *dev, struct list_head *list); + /* Window handling functions */ int (*domain_window_enable)(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t paddr, u64 size, int prot); @@ -205,6 +223,9 @@ extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t io extern void iommu_set_fault_handler(struct iommu_domain *domain, iommu_fault_handler_t handler, void *token); +extern void iommu_get_dm_regions(struct device *dev, struct list_head *list); +extern void iommu_put_dm_regions(struct device *dev, struct list_head *list); + extern int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group); extern void iommu_detach_group(struct iommu_domain *domain, @@ -379,6 +400,16 @@ static inline void iommu_set_fault_handler(struct iommu_domain *domain, { } +static inline void iommu_get_dm_regions(struct device *dev, + struct list_head *list) +{ +} + +static inline void 
iommu_put_dm_regions(struct device *dev, + struct list_head *list) +{ +} + static inline int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) { -- cgit v1.2.3 From 6827ca83695d5e41ad31b0719788ee65f00ca4b3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 28 May 2015 18:41:35 +0200 Subject: iommu: Add function to query the default domain of a group This will be used to handle unity mappings in the iommu drivers. Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 689499904166..b944b2be4fa2 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -249,6 +249,7 @@ extern int iommu_group_unregister_notifier(struct iommu_group *group, struct notifier_block *nb); extern int iommu_group_id(struct iommu_group *group); extern struct iommu_group *iommu_group_get_for_dev(struct device *dev); +extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr, void *data); -- cgit v1.2.3 From 5179f0ce2f96f155e3bda93b3b82f912dbaddad2 Mon Sep 17 00:00:00 2001 From: Steve Twiss Date: Mon, 8 Jun 2015 16:26:20 -0700 Subject: Input: add OnKey driver for DA9063 MFD part This adds OnKey driver support for DA9063. 
Signed-off-by: Steve Twiss Signed-off-by: Dmitry Torokhov --- include/linux/mfd/da9063/pdata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mfd/da9063/pdata.h b/include/linux/mfd/da9063/pdata.h index 95c8742215a7..612383bd80ae 100644 --- a/include/linux/mfd/da9063/pdata.h +++ b/include/linux/mfd/da9063/pdata.h @@ -103,6 +103,7 @@ struct da9063; struct da9063_pdata { int (*init)(struct da9063 *da9063); int irq_base; + bool key_power; unsigned flags; struct da9063_regulators_pdata *regulators_pdata; struct led_platform_data *leds_pdata; -- cgit v1.2.3 From ae60d6a0e3a9197d37f8c8c4584a8ecd18518cd6 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Thu, 28 May 2015 19:09:55 +0200 Subject: time: Refactor usecs_to_jiffies Refactor the usecs_to_jiffies conditional code part in time.c and jiffies.h putting it into conditional functions rather than #ifdefs to improve readability. This is analogous to the msecs_to_jiffies() cleanup in commit ca42aaf0c861 ("time: Refactor msecs_to_jiffies") Signed-off-by: Nicholas Mc Guire Cc: Masahiro Yamada Cc: Sam Ravnborg Cc: Joe Perches Cc: John Stultz Cc: Andrew Hunter Cc: Paul Turner Cc: Michal Marek Link: http://lkml.kernel.org/r/1432832996-12129-1-git-send-email-hofrat@osadl.org Signed-off-by: Thomas Gleixner --- include/linux/jiffies.h | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 3bde5eb8568b..a316ebea0a89 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -362,7 +362,32 @@ static inline unsigned long msecs_to_jiffies(const unsigned int m) } } -extern unsigned long usecs_to_jiffies(const unsigned int u); +extern unsigned long __usecs_to_jiffies(const unsigned int u); +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) +static inline unsigned long _usecs_to_jiffies(const unsigned int u) +{ + return (u + (USEC_PER_SEC / HZ) - 1) / 
(USEC_PER_SEC / HZ); +} +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) +static inline unsigned long _usecs_to_jiffies(const unsigned int u) +{ + return u * (HZ / USEC_PER_SEC); +} +static inline unsigned long _usecs_to_jiffies(const unsigned int u) +{ +#else +static inline unsigned long _usecs_to_jiffies(const unsigned int u) +{ + return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) + >> USEC_TO_HZ_SHR32; +} +#endif + +static inline unsigned long usecs_to_jiffies(const unsigned int u) +{ + return __usecs_to_jiffies(u); +} + extern unsigned long timespec_to_jiffies(const struct timespec *value); extern void jiffies_to_timespec(const unsigned long jiffies, struct timespec *value); -- cgit v1.2.3 From c569a23d65ac2900d9998d3fe04044fe95be6b2f Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Thu, 28 May 2015 19:09:56 +0200 Subject: time: Allow gcc to fold usecs_to_jiffies(constant) To allow constant folding in usecs_to_jiffies() conditionally calls the HZ dependent _usecs_to_jiffies() helpers or, when gcc can not figure out constant folding, __usecs_to_jiffies, which is the renamed original usecs_to_jiffies() function. 
Signed-off-by: Nicholas Mc Guire Cc: Masahiro Yamada Cc: Sam Ravnborg Cc: Joe Perches Cc: John Stultz Cc: Andrew Hunter Cc: Paul Turner Cc: Michal Marek Link: http://lkml.kernel.org/r/1432832996-12129-2-git-send-email-hofrat@osadl.org Signed-off-by: Thomas Gleixner --- include/linux/jiffies.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index a316ebea0a89..535fd3bb1ba8 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -383,9 +383,37 @@ static inline unsigned long _usecs_to_jiffies(const unsigned int u) } #endif +/** + * usecs_to_jiffies: - convert microseconds to jiffies + * @u: time in microseconds + * + * conversion is done as follows: + * + * - 'too large' values [that would result in larger than + * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. + * + * - all other values are converted to jiffies by either multiplying + * the input value by a factor or dividing it with a factor and + * handling any 32-bit overflows as for msecs_to_jiffies. + * + * usecs_to_jiffies() checks for the passed in value being a constant + * via __builtin_constant_p() allowing gcc to eliminate most of the + * code, __usecs_to_jiffies() is called if the value passed does not + * allow constant folding and the actual conversion must be done at + * runtime. + * the HZ range specific helpers _usecs_to_jiffies() are called both + * directly here and from __msecs_to_jiffies() in the case where + * constant folding is not possible. 
+ */ static inline unsigned long usecs_to_jiffies(const unsigned int u) { - return __usecs_to_jiffies(u); + if (__builtin_constant_p(u)) { + if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + return _usecs_to_jiffies(u); + } else { + return __usecs_to_jiffies(u); + } } extern unsigned long timespec_to_jiffies(const struct timespec *value); -- cgit v1.2.3 From ed06aeefdac348cfb91a3db5fe1067e3202afd70 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Tue, 9 Jun 2015 22:26:05 +0200 Subject: nfc: st-nci: Rename st21nfcb to st-nci STMicroelectronics NFC NCI chips family is extending with the new ST21NFCC using the AMS AS39230 RF booster. The st21nfcb driver is relevant for this solution and might be with future products. Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- include/linux/platform_data/st-nci.h | 29 +++++++++++++++++++++++++++++ include/linux/platform_data/st21nfcb.h | 29 ----------------------------- include/linux/platform_data/st_nci.h | 29 +++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 29 deletions(-) create mode 100644 include/linux/platform_data/st-nci.h delete mode 100644 include/linux/platform_data/st21nfcb.h create mode 100644 include/linux/platform_data/st_nci.h (limited to 'include/linux') diff --git a/include/linux/platform_data/st-nci.h b/include/linux/platform_data/st-nci.h new file mode 100644 index 000000000000..d9d400a297bd --- /dev/null +++ b/include/linux/platform_data/st-nci.h @@ -0,0 +1,29 @@ +/* + * Driver include for ST NCI NFC chip family. + * + * Copyright (C) 2014-2015 STMicroelectronics SAS. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _ST_NCI_H_ +#define _ST_NCI_H_ + +#define ST_NCI_DRIVER_NAME "st_nci" + +struct st_nci_nfc_platform_data { + unsigned int gpio_reset; + unsigned int irq_polarity; +}; + +#endif /* _ST_NCI_H_ */ diff --git a/include/linux/platform_data/st21nfcb.h b/include/linux/platform_data/st21nfcb.h deleted file mode 100644 index b023373d9874..000000000000 --- a/include/linux/platform_data/st21nfcb.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Driver include for the ST21NFCB NFC chip. - * - * Copyright (C) 2014 STMicroelectronics SAS. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. 
- */ - -#ifndef _ST21NFCB_NCI_H_ -#define _ST21NFCB_NCI_H_ - -#define ST21NFCB_NCI_DRIVER_NAME "st21nfcb_nci" - -struct st21nfcb_nfc_platform_data { - unsigned int gpio_reset; - unsigned int irq_polarity; -}; - -#endif /* _ST21NFCB_NCI_H_ */ diff --git a/include/linux/platform_data/st_nci.h b/include/linux/platform_data/st_nci.h new file mode 100644 index 000000000000..d9d400a297bd --- /dev/null +++ b/include/linux/platform_data/st_nci.h @@ -0,0 +1,29 @@ +/* + * Driver include for ST NCI NFC chip family. + * + * Copyright (C) 2014-2015 STMicroelectronics SAS. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _ST_NCI_H_ +#define _ST_NCI_H_ + +#define ST_NCI_DRIVER_NAME "st_nci" + +struct st_nci_nfc_platform_data { + unsigned int gpio_reset; + unsigned int irq_polarity; +}; + +#endif /* _ST_NCI_H_ */ -- cgit v1.2.3 From 205a525c334295e3cd4cc7755fd2c0398e3a787f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 9 Jun 2015 18:19:39 +0800 Subject: random: Add callback API for random pool readiness The get_blocking_random_bytes API is broken because the wait can be arbitrarily long (potentially forever) so there is no safe way of calling it from within the kernel. This patch replaces it with a callback API instead. The callback is invoked potentially from interrupt context so the user needs to schedule their own work thread if necessary. 
In addition to adding callbacks, they can also be removed as otherwise this opens up a way for user-space to allocate kernel memory with no bound (by opening algif_rng descriptors and then closing them). Signed-off-by: Herbert Xu --- include/linux/random.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 796267d56901..30e2aca0b16a 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -6,8 +6,15 @@ #ifndef _LINUX_RANDOM_H #define _LINUX_RANDOM_H +#include <linux/list.h> #include <uapi/linux/random.h> +struct random_ready_callback { + struct list_head list; + void (*func)(struct random_ready_callback *rdy); + struct module *owner; +}; + extern void add_device_randomness(const void *, unsigned int); extern void add_input_randomness(unsigned int type, unsigned int code, unsigned int value); @@ -15,6 +22,8 @@ extern void add_interrupt_randomness(int irq, int irq_flags); extern void get_random_bytes(void *buf, int nbytes); extern void get_blocking_random_bytes(void *buf, int nbytes); +extern int add_random_ready_callback(struct random_ready_callback *rdy); +extern void del_random_ready_callback(struct random_ready_callback *rdy); extern void get_random_bytes_arch(void *buf, int nbytes); void generate_random_uuid(unsigned char uuid_out[16]); extern int random_int_secret_init(void); -- cgit v1.2.3 From c2719503f5e1e6213d716bb078bdad01e28ebcbf Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 9 Jun 2015 18:19:42 +0800 Subject: random: Remove kernel blocking API This patch removes the kernel blocking API as it has been completely replaced by the callback API. 
Signed-off-by: Herbert Xu --- include/linux/random.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 30e2aca0b16a..e651874df2c9 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -21,7 +21,6 @@ extern void add_input_randomness(unsigned int type, unsigned int code, extern void add_interrupt_randomness(int irq, int irq_flags); extern void get_random_bytes(void *buf, int nbytes); -extern void get_blocking_random_bytes(void *buf, int nbytes); extern int add_random_ready_callback(struct random_ready_callback *rdy); extern void del_random_ready_callback(struct random_ready_callback *rdy); extern void get_random_bytes_arch(void *buf, int nbytes); -- cgit v1.2.3 From 0bb979472a7401022109e81dd89d777adea58710 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Jun 2015 08:01:20 -0600 Subject: cfq-iosched: fix the setting of IOPS mode on SSDs A previous commit wanted to make CFQ default to IOPS mode on non-rotational storage, however it did so when the queue was initialized and the non-rotational flag is only set later on in the probe. Add an elevator hook that gets called off the add_disk() path, at that point we know that feature probing has finished, and we can reliably check for the various flags that drivers can set. 
Fixes: 41c0126b ("block: Make CFQ default to IOPS mode on SSDs") Tested-by: Romain Francoise Signed-off-by: Jens Axboe --- include/linux/elevator.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 45a91474487d..638b324f0291 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -39,6 +39,7 @@ typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct reques typedef int (elevator_init_fn) (struct request_queue *, struct elevator_type *e); typedef void (elevator_exit_fn) (struct elevator_queue *); +typedef void (elevator_registered_fn) (struct request_queue *); struct elevator_ops { @@ -68,6 +69,7 @@ struct elevator_ops elevator_init_fn *elevator_init_fn; elevator_exit_fn *elevator_exit_fn; + elevator_registered_fn *elevator_registered_fn; }; #define ELV_NAME_MAX (16) -- cgit v1.2.3 From 5c6e3a97e969e978368df83239583771c936efea Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 8 Jun 2015 10:09:48 +0900 Subject: power_supply: sysfs: Bring back write to writeable properties The fix for the NULL pointer exception related to calling uevent before probe has finished had the side effect of marking all writeable properties as non-writeable. This happened because the check for whether a property is writeable is done before the initial increase of the power supply usage counter, while at the same time using a wrapper over property_is_writeable(). The wrapper returns ENODEV if the usage counter is still 0. The call trace looked like: device probe: power_supply_register() use_cnt = 0; device_add() create sysfs entries power_supply_attr_is_visible() power_supply_property_is_writeable() if (use_cnt == 0) return -ENODEV; use_cnt++; Replace the usage of wrapper with direct call to property_is_writeable() from driver. This should be safe call during device probe because implementations of this callback just return 0/1 for different properties and they do not access any of the driver's internal data. 
Fixes: 8e59c7f23410 ("power_supply: Fix NULL pointer dereference during bq27x00_battery probe") Signed-off-by: Krzysztof Kozlowski Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index a80f1fd01ddb..0395bcb18ddb 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -206,6 +206,11 @@ struct power_supply_desc { int (*set_property)(struct power_supply *psy, enum power_supply_property psp, const union power_supply_propval *val); + /* + * property_is_writeable() will be called during registration + * of power supply. If this happens during device probe then it must + * not access internal data of device (because probe did not end). + */ int (*property_is_writeable)(struct power_supply *psy, enum power_supply_property psp); void (*external_power_changed)(struct power_supply *psy); -- cgit v1.2.3 From fe27e1dfe9962b07215ee01445926306ddbb7c25 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 9 Jun 2015 23:37:56 +0200 Subject: power: Add devm_power_supply_get_by_phandle() helper function This commit adds a resource-managed version of the power_supply_get_by_phandle() function. 
Signed-off-by: Hans de Goede Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 0395bcb18ddb..ef9f1592185d 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -292,10 +292,15 @@ extern void power_supply_put(struct power_supply *psy); #ifdef CONFIG_OF extern struct power_supply *power_supply_get_by_phandle(struct device_node *np, const char *property); +extern struct power_supply *devm_power_supply_get_by_phandle( + struct device *dev, const char *property); #else /* !CONFIG_OF */ static inline struct power_supply * power_supply_get_by_phandle(struct device_node *np, const char *property) { return NULL; } +static inline struct power_supply * +devm_power_supply_get_by_phandle(struct device *dev, const char *property) +{ return NULL; } #endif /* CONFIG_OF */ extern void power_supply_changed(struct power_supply *psy); extern int power_supply_am_i_supplied(struct power_supply *psy); -- cgit v1.2.3 From b064a8fa77dfead647564c46ac8fc5b13bd1ab73 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 10 Jun 2015 01:33:36 +0200 Subject: ACPI / init: Switch over platform to the ACPI mode later Commit 73f7d1ca3263 "ACPI / init: Run acpi_early_init() before timekeeping_init()" moved the ACPI subsystem initialization, including the ACPI mode enabling, to an earlier point in the initialization sequence, to allow the timekeeping subsystem use ACPI early. Unfortunately, that resulted in boot regressions on some systems and the early ACPI initialization was moved toward its original position in the kernel initialization code by commit c4e1acbb35e4 "ACPI / init: Invoke early ACPI initialization later". However, that turns out to be insufficient, as boot is still broken on the Tyan S8812 mainboard. 
To fix that issue, split the ACPI early initialization code into two pieces so the majority of it is still located in acpi_early_init() and the part switching over the platform into the ACPI mode goes into a new function, acpi_subsystem_init(), executed at the original early ACPI initialization spot. That fixes the Tyan S8812 boot problem, but still allows ACPI tables to be loaded earlier which is useful to the EFI code in efi_enter_virtual_mode(). Link: https://bugzilla.kernel.org/show_bug.cgi?id=97141 Fixes: 73f7d1ca3263 "ACPI / init: Run acpi_early_init() before timekeeping_init()" Reported-and-tested-by: Marius Tolzmann Signed-off-by: Rafael J. Wysocki Acked-by: Toshi Kani Reviewed-by: Hanjun Guo Reviewed-by: Lee, Chun-Yi --- include/linux/acpi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index e4da5e35e29c..4550be3bb63b 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -440,6 +440,7 @@ extern acpi_status acpi_pci_osc_control_set(acpi_handle handle, #define ACPI_OST_SC_INSERT_NOT_SUPPORTED 0x82 extern void acpi_early_init(void); +extern void acpi_subsystem_init(void); extern int acpi_nvs_register(__u64 start, __u64 size); @@ -494,6 +495,7 @@ static inline const char *acpi_dev_name(struct acpi_device *adev) } static inline void acpi_early_init(void) { } +static inline void acpi_subsystem_init(void) { } static inline int early_acpi_boot_init(void) { -- cgit v1.2.3 From 4f822c625f9c68267ffee7519519e304858a46c3 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 10 Jun 2015 18:07:57 -0700 Subject: net: phy: broadcom: include phy.h for brcmphy.h We utilize inline functions from the PHY library, make sure that we do include phy.h in brcmphy.h in order for the code including brcmphy.h not to have to resolve this inclusion dependency. 
Fixes: 705314797b8b ("net: phy: broadcom: move shadow 0x1C register accessors to brcmphy.h") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 656da2a12ffe..abb6106f839d 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -1,6 +1,8 @@ #ifndef _LINUX_BRCMPHY_H #define _LINUX_BRCMPHY_H +#include <linux/phy.h> + #define PHY_ID_BCM50610 0x0143bd60 #define PHY_ID_BCM50610M 0x0143bd70 #define PHY_ID_BCM5241 0x0143bc30 -- cgit v1.2.3 From 8bc84b79265f66d5af736473db582e74b28e099d Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 10 Jun 2015 18:07:58 -0700 Subject: net: phy: broadcom: define Broadcom pseudo-PHY address in brcmphy.h Define the pseudo-PHY address (30) which is used by all Broadcom Ethernet switches in a shared header file. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index abb6106f839d..697ca7795bd9 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -3,6 +3,11 @@ #include <linux/phy.h> +/* All Broadcom Ethernet switches have a pseudo-PHY at address 30 which is used + * to configure the switch internal registers via MDIO accesses. + */ +#define BRCM_PSEUDO_PHY_ADDR 30 + #define PHY_ID_BCM50610 0x0143bd60 #define PHY_ID_BCM50610M 0x0143bd70 #define PHY_ID_BCM5241 0x0143bc30 -- cgit v1.2.3 From d290f1e70d85a9a4d124594c6a3d769329960bdc Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 28 May 2015 18:41:36 +0200 Subject: iommu: Introduce iommu_request_dm_for_dev() This function can be called by an IOMMU driver to request that a device's default domain is direct mapped. 
Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index b944b2be4fa2..dc767f7c3704 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -225,6 +225,7 @@ extern void iommu_set_fault_handler(struct iommu_domain *domain, extern void iommu_get_dm_regions(struct device *dev, struct list_head *list); extern void iommu_put_dm_regions(struct device *dev, struct list_head *list); +extern int iommu_request_dm_for_dev(struct device *dev); extern int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group); @@ -411,6 +412,11 @@ static inline void iommu_put_dm_regions(struct device *dev, { } +static inline int iommu_request_dm_for_dev(struct device *dev) +{ + return -ENODEV; +} + static inline int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) { -- cgit v1.2.3 From dfabde206aa10ae71a89ba75e68b1f58a6336a05 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Mon, 11 May 2015 17:08:50 +0100 Subject: mailbox: Add ability for clients to request channels by name This patch supplies a new framework API; mbox_request_channel_byname(). It works by supplying the usual client pointer as the first argument and a string as the second. The API will search the client's node for a 'mbox-names' property then request a channel in the normal way using the requested string's index as the expected second 'index' argument. 
Signed-off-by: Lee Jones Signed-off-by: Jassi Brar --- include/linux/mailbox_client.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mailbox_client.h b/include/linux/mailbox_client.h index 1726ccbd8009..44348710953f 100644 --- a/include/linux/mailbox_client.h +++ b/include/linux/mailbox_client.h @@ -40,6 +40,8 @@ struct mbox_client { void (*tx_done)(struct mbox_client *cl, void *mssg, int r); }; +struct mbox_chan *mbox_request_channel_byname(struct mbox_client *cl, + const char *name); struct mbox_chan *mbox_request_channel(struct mbox_client *cl, int index); int mbox_send_message(struct mbox_chan *chan, void *mssg); void mbox_client_txdone(struct mbox_chan *chan, int r); /* atomic */ -- cgit v1.2.3 From dc14bdef8762a8098b1da881b611d722e24fe787 Mon Sep 17 00:00:00 2001 From: Vincent Cuissard Date: Thu, 11 Jun 2015 14:00:19 +0200 Subject: NFC: nfcmrvl: add platform_data and DT configuration Declare nfcmrvl platform_data structure and few DT parameters for nfcmrvl driver. Signed-off-by: Vincent Cuissard Signed-off-by: Samuel Ortiz --- include/linux/platform_data/nfcmrvl.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 include/linux/platform_data/nfcmrvl.h (limited to 'include/linux') diff --git a/include/linux/platform_data/nfcmrvl.h b/include/linux/platform_data/nfcmrvl.h new file mode 100644 index 000000000000..106cfe5ed589 --- /dev/null +++ b/include/linux/platform_data/nfcmrvl.h @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2015, Marvell International Ltd. + * + * This software file (the "File") is distributed by Marvell International + * Ltd. under the terms of the GNU General Public License Version 2, June 1991 + * (the "License"). You may use, redistribute and/or modify this File in + * accordance with the terms and conditions of the License, a copy of which + * is available on the worldwide web at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt. 
+ * + * THE FILE IS DISTRIBUTED AS-IS, WITHOUT WARRANTY OF ANY KIND, AND THE + * IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE + * ARE EXPRESSLY DISCLAIMED. The License provides additional details about + * this warranty disclaimer. + */ + +#ifndef _NFCMRVL_PTF_H_ +#define _NFCMRVL_PTF_H_ + +struct nfcmrvl_platform_data { + /* + * Generic + */ + + /* GPIO that is wired to RESET_N signal */ + unsigned int reset_n_io; + /* Tell if transport is muxed in HCI one */ + unsigned int hci_muxed; +}; + +#endif /* _NFCMRVL_PTF_H_ */ -- cgit v1.2.3 From e097dc624f784debbde49701a493bf920bc422c7 Mon Sep 17 00:00:00 2001 From: Vincent Cuissard Date: Thu, 11 Jun 2015 14:00:20 +0200 Subject: NFC: nfcmrvl: add UART driver Add support of Marvell NFC chip controlled over UART Signed-off-by: Vincent Cuissard Signed-off-by: Samuel Ortiz --- include/linux/platform_data/nfcmrvl.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/nfcmrvl.h b/include/linux/platform_data/nfcmrvl.h index 106cfe5ed589..ac91707dabcb 100644 --- a/include/linux/platform_data/nfcmrvl.h +++ b/include/linux/platform_data/nfcmrvl.h @@ -26,6 +26,15 @@ struct nfcmrvl_platform_data { unsigned int reset_n_io; /* Tell if transport is muxed in HCI one */ unsigned int hci_muxed; + + /* + * UART specific + */ + + /* Tell if UART needs flow control at init */ + unsigned int flow_control; + /* Tell if firmware supports break control for power management */ + unsigned int break_control; }; #endif /* _NFCMRVL_PTF_H_ */ -- cgit v1.2.3 From facc9699f0fe7d65a92cc09e175662659306066d Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 11 Jun 2015 14:47:27 +0300 Subject: net/mlx5e: Fix HW MTU settings Previously we configured HW MTU to be netdev->mtu, actually we need to configure netdev->mtu + (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN). 
Also, query MTU can not fail, hence make the relevant helper a void function, and add mlx5e_set_dev_port_mtu, a helper function to handle MTU setting. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/driver.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 6093bde16b94..c0930f8d7021 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -756,11 +756,11 @@ int mlx5_set_port_status(struct mlx5_core_dev *dev, enum mlx5_port_status status); int mlx5_query_port_status(struct mlx5_core_dev *dev, u8 *status); -int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu); -int mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu, - u8 local_port); -int mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu, - u8 local_port); +int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu, u8 port); +void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu, u8 port); +void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu, + u8 port); + int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev, u8 *vl_hw_cap, u8 local_port); -- cgit v1.2.3 From fc11fbf9a785b25c5d07f05a30d4169ec39818da Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 11 Jun 2015 14:47:28 +0300 Subject: net/mlx5e: Add HW cacheline start padding Enable HW cacheline start padding and align RX WQE size to cacheline while considering HW start padding. Also, fix dma_unmap call to use the correct SKB data buffer size. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. 
Miller --- include/linux/mlx5/device.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index b2c43508a737..b943cd9e2097 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -131,6 +131,10 @@ enum { MLX5_INLINE_SEG = 0x80000000, }; +enum { + MLX5_HW_START_PADDING = MLX5_INLINE_SEG, +}; + enum { MLX5_MIN_PKEY_TABLE_SIZE = 128, MLX5_MAX_LOG_PKEY_TABLE = 5, -- cgit v1.2.3 From 833f32d763028c1bb371c64f457788b933773b3e Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 11 Jun 2015 15:54:55 -0700 Subject: time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge Currently, leapsecond adjustments are done at tick time. As a result, the leapsecond was applied at the first timer tick *after* the leapsecond (~1-10ms late depending on HZ), rather than exactly on the second edge. This was in part historical from back when we were always tick based, but correcting this since has been avoided since it adds extra conditional checks in the gettime fastpath, which has performance overhead. However, it was recently pointed out that ABS_TIME CLOCK_REALTIME timers set for right after the leapsecond could fire a second early, since some timers may be expired before we trigger the timekeeping timer, which then applies the leapsecond. This isn't quite as bad as it sounds, since behaviorally it is similar to what is possible w/ ntpd made leapsecond adjustments done w/o using the kernel discipline. Where due to latencies, timers may fire just prior to the settimeofday call. (Also, one should note that all applications using CLOCK_REALTIME timers should always be careful, since they are prone to quirks from settimeofday() disturbances.) However, the purpose of having the kernel do the leap adjustment is to avoid such latencies, so I think this is worth fixing. 
So in order to properly keep those timers from firing a second early, this patch modifies the ntp and timekeeping logic so that we keep enough state so that the update_base_offsets_now accessor, which provides the hrtimer core the current time, can check and apply the leapsecond adjustment on the second edge. This prevents the hrtimer core from expiring timers too early. This patch does not modify any other time read path, so no additional overhead is incurred. However, this also means that the leap-second continues to be applied at tick time for all other read-paths. Apologies to Richard Cochran, who pushed for similar changes years ago, which I resisted due to the concerns about the performance overhead. While I suspect this isn't extremely critical, folks who care about strict leap-second correctness will likely want to watch this. Potentially a -stable candidate eventually. Originally-suggested-by: Richard Cochran Reported-by: Daniel Bristot de Oliveira Reported-by: Prarit Bhargava Signed-off-by: John Stultz Cc: Richard Cochran Cc: Jan Kara Cc: Jiri Bohac Cc: Shuah Khan Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/time64.h | 1 + include/linux/timekeeper_internal.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/time64.h b/include/linux/time64.h index 12d4e82b0276..77b5df2acd2a 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -29,6 +29,7 @@ struct timespec64 { #define FSEC_PER_SEC 1000000000000000LL /* Located here for timespec[64]_valid_strict */ +#define TIME64_MAX ((s64)~((u64)1 << 63)) #define KTIME_MAX ((s64)~((u64)1 << 63)) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index e1f5a1136554..25247220b4b7 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ 
-50,6 +50,7 @@ struct tk_read_base { * @offs_tai: Offset clock monotonic -> clock tai * @tai_offset: The current UTC to TAI offset in seconds * @clock_was_set_seq: The sequence number of clock was set events + * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second * @raw_time: Monotonic raw base time in timespec64 format * @cycle_interval: Number of clock cycles in one NTP interval * @xtime_interval: Number of clock shifted nano seconds in one NTP @@ -90,6 +91,7 @@ struct timekeeper { ktime_t offs_tai; s32 tai_offset; unsigned int clock_was_set_seq; + ktime_t next_leap_ktime; struct timespec64 raw_time; /* The following members are for timekeeping internal use */ -- cgit v1.2.3 From 3bf17472226b0041b0c61363bd57a5cadbe620c4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Jun 2015 13:20:29 +0800 Subject: iommu: dmar: Extend struct irte for VT-d Posted-Interrupts The IRTE (Interrupt Remapping Table Entry) is either an entry for remapped or for posted interrupts. The hardware distinguishes between remapped and posted entries by bit 15 in the low 64 bits of the IRTE. If cleared the entry is remapped, if set it's posted. The entries have common fields and, dependent on the posted bit, fields with different meanings. 
Extend struct irte to handle the differences between remap and posted mode by having three structs in the unions: - Shared - Remapped - Posted Signed-off-by: Thomas Gleixner Signed-off-by: Feng Wu Acked-by: Joerg Roedel Cc: jiang.liu@linux.intel.com Cc: iommu@lists.linux-foundation.org Cc: dwmw2@infradead.org Link: http://lkml.kernel.org/r/1433827237-3382-3-git-send-email-feng.wu@intel.com Signed-off-by: Thomas Gleixner --- include/linux/dmar.h | 70 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 84737565c1fd..0dbcabcb5f0d 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -185,33 +185,73 @@ static inline int dmar_device_remove(void *handle) struct irte { union { + /* Shared between remapped and posted mode*/ struct { - __u64 present : 1, - fpd : 1, - dst_mode : 1, - redir_hint : 1, - trigger_mode : 1, - dlvry_mode : 3, - avail : 4, - __reserved_1 : 4, - vector : 8, - __reserved_2 : 8, - dest_id : 32; + __u64 present : 1, /* 0 */ + fpd : 1, /* 1 */ + __res0 : 6, /* 2 - 7 */ + avail : 4, /* 8 - 11 */ + __res1 : 3, /* 12 - 14 */ + pst : 1, /* 15 */ + vector : 8, /* 16 - 23 */ + __res2 : 40; /* 24 - 63 */ + }; + + /* Remapped mode */ + struct { + __u64 r_present : 1, /* 0 */ + r_fpd : 1, /* 1 */ + dst_mode : 1, /* 2 */ + redir_hint : 1, /* 3 */ + trigger_mode : 1, /* 4 */ + dlvry_mode : 3, /* 5 - 7 */ + r_avail : 4, /* 8 - 11 */ + r_res0 : 4, /* 12 - 15 */ + r_vector : 8, /* 16 - 23 */ + r_res1 : 8, /* 24 - 31 */ + dest_id : 32; /* 32 - 63 */ + }; + + /* Posted mode */ + struct { + __u64 p_present : 1, /* 0 */ + p_fpd : 1, /* 1 */ + p_res0 : 6, /* 2 - 7 */ + p_avail : 4, /* 8 - 11 */ + p_res1 : 2, /* 12 - 13 */ + p_urgent : 1, /* 14 */ + p_pst : 1, /* 15 */ + p_vector : 8, /* 16 - 23 */ + p_res2 : 14, /* 24 - 37 */ + pda_l : 26; /* 38 - 63 */ }; __u64 low; }; union { + /* Shared between remapped and posted 
mode*/ struct { - __u64 sid : 16, - sq : 2, - svt : 2, - __reserved_3 : 44; + __u64 sid : 16, /* 64 - 79 */ + sq : 2, /* 80 - 81 */ + svt : 2, /* 82 - 83 */ + __res3 : 44; /* 84 - 127 */ + }; + + /* Posted mode*/ + struct { + __u64 p_sid : 16, /* 64 - 79 */ + p_sq : 2, /* 80 - 81 */ + p_svt : 2, /* 82 - 83 */ + p_res3 : 12, /* 84 - 95 */ + pda_h : 32; /* 96 - 127 */ }; __u64 high; }; }; +#define PDA_LOW_BIT 26 +#define PDA_HIGH_BIT 32 + enum { IRQ_REMAP_XAPIC_MODE, IRQ_REMAP_X2APIC_MODE, -- cgit v1.2.3 From bf56027ff4d9e75bf668ae990fe6204d00a23002 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Jun 2015 13:20:30 +0800 Subject: iommu: dmar: Provide helper to copy shared irte fields Instead of open coding, provide a helper function to copy the shared irte fields. Signed-off-by: Thomas Gleixner Cc: jiang.liu@linux.intel.com Cc: iommu@lists.linux-foundation.org Cc: joro@8bytes.org Cc: dwmw2@infradead.org Link: http://lkml.kernel.org/r/1433827237-3382-4-git-send-email-feng.wu@intel.com Signed-off-by: Thomas Gleixner --- include/linux/dmar.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 0dbcabcb5f0d..e9bc9292bd3a 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -249,6 +249,18 @@ struct irte { }; }; +static inline void dmar_copy_shared_irte(struct irte *dst, struct irte *src) +{ + dst->present = src->present; + dst->fpd = src->fpd; + dst->avail = src->avail; + dst->pst = src->pst; + dst->vector = src->vector; + dst->sid = src->sid; + dst->sq = src->sq; + dst->svt = src->svt; +} + #define PDA_LOW_BIT 26 #define PDA_HIGH_BIT 32 -- cgit v1.2.3 From 07c09787b26db724c94a912a572a9a4fa66008f3 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Tue, 9 Jun 2015 13:20:34 +0800 Subject: iommu, x86: Add cap_pi_support() to detect VT-d PI capability Add helper function to detect VT-d Posted-Interrupts capability. 
Signed-off-by: Feng Wu Reviewed-by: Jiang Liu Reviewed-by: Thomas Gleixner Acked-by: David Woodhouse Acked-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: dwmw2@infradead.org Link: http://lkml.kernel.org/r/1433827237-3382-8-git-send-email-feng.wu@intel.com Signed-off-by: Thomas Gleixner --- include/linux/intel-iommu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 0af9b03e2b1c..0c251be39836 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -87,6 +87,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val) /* * Decoding Capability Register */ +#define cap_pi_support(c) (((c) >> 59) & 1) #define cap_read_drain(c) (((c) >> 55) & 1) #define cap_write_drain(c) (((c) >> 54) & 1) #define cap_max_amask_val(c) (((c) >> 48) & 0x3f) -- cgit v1.2.3 From b6a00fae9760a49114016e4764d09e522a5ba5b6 Mon Sep 17 00:00:00 2001 From: Tim Kryger Date: Tue, 26 May 2015 13:08:16 -0700 Subject: pwm: Add pwmchip_add_with_polarity() API Add a new function to register a PWM chip with channels that have their initial polarity as specified by an additional parameter. This benefits drivers of controllers that by default operate with inversed polarity by removing the need to modify the polarity during initialization. 
Signed-off-by: Tim Kryger Signed-off-by: Jonathan Richardson [thierry.reding@gmail.com: export pwmchip_add_with_polarity()] Signed-off-by: Thierry Reding --- include/linux/pwm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index cfe2d8df5be0..36262d08a9da 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -182,6 +182,8 @@ struct pwm_chip { int pwm_set_chip_data(struct pwm_device *pwm, void *data); void *pwm_get_chip_data(struct pwm_device *pwm); +int pwmchip_add_with_polarity(struct pwm_chip *chip, + enum pwm_polarity polarity); int pwmchip_add(struct pwm_chip *chip); int pwmchip_remove(struct pwm_chip *chip); struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip, @@ -217,6 +219,11 @@ static inline int pwmchip_add(struct pwm_chip *chip) return -EINVAL; } +static inline int pwmchip_add_inversed(struct pwm_chip *chip) +{ + return -EINVAL; +} + static inline int pwmchip_remove(struct pwm_chip *chip) { return -EINVAL; -- cgit v1.2.3 From 22a10bca280073f81e9e2d9fed6f90a3bcf00236 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 11 Jun 2015 17:37:03 -0700 Subject: regulator: Add system_load constraint Some regulators have a fixed load that isn't captured by consumers that the kernel knows about. Add a constraint to support this. Signed-off-by: Stephen Boyd Signed-off-by: Mark Brown --- include/linux/regulator/machine.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index b07562e082c4..01526559c8c3 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -75,6 +75,7 @@ struct regulator_state { * * @min_uA: Smallest current consumers may set. * @max_uA: Largest current consumers may set. + * @system_load: Load that isn't captured by any consumer requests. * * @valid_modes_mask: Mask of modes which may be configured by consumers. 
* @valid_ops_mask: Operations which may be performed by consumers. @@ -112,6 +113,8 @@ struct regulation_constraints { int min_uA; int max_uA; + int system_load; + /* valid regulator operating modes for this machine */ unsigned int valid_modes_mask; -- cgit v1.2.3 From 72b31f7271df34c6aab36c01305287924826678f Mon Sep 17 00:00:00 2001 From: Bernhard Thaler Date: Sat, 30 May 2015 15:27:40 +0200 Subject: netfilter: bridge: detect NAT66 correctly and change MAC address IPv4 iptables allows to REDIRECT/DNAT/SNAT any traffic over a bridge. e.g. REDIRECT $ sysctl -w net.bridge.bridge-nf-call-iptables=1 $ iptables -t nat -A PREROUTING -p tcp -m tcp --dport 8080 \ -j REDIRECT --to-ports 81 This does not work with ip6tables on a bridge in NAT66 scenario because the REDIRECT/DNAT/SNAT is not correctly detected. The bridge pre-routing (finish) netfilter hook has to check for a possible redirect and then fix the destination mac address. This allows to use the ip6tables rules for local REDIRECT/DNAT/SNAT REDIRECT similar to the IPv4 iptables version. e.g. REDIRECT $ sysctl -w net.bridge.bridge-nf-call-ip6tables=1 $ ip6tables -t nat -A PREROUTING -p tcp -m tcp --dport 8080 \ -j REDIRECT --to-ports 81 This patch makes it possible to use IPv6 NAT66 on a bridge. It was tested on a bridge with two interfaces using SNAT/DNAT NAT66 rules. 
Reported-by: Artie Hamilton Signed-off-by: Sven Eckelmann [bernhard.thaler@wvnet.at: rebased, add indirect call to ip6_route_input()] [bernhard.thaler@wvnet.at: rebased, split into separate patches] Signed-off-by: Bernhard Thaler Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv6.h | 1 + include/linux/skbuff.h | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 64dad1cc1a4b..e2d19694ee8f 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -25,6 +25,7 @@ void ipv6_netfilter_fini(void); struct nf_ipv6_ops { int (*chk_addr)(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict); + void (*route_input)(struct sk_buff *skb); }; extern const struct nf_ipv6_ops __rcu *nf_ipv6_ops; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index cc612fc0a894..f70fc0e6bf7b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -36,6 +36,7 @@ #include #include #include +#include /* A. Checksumming of received packets by device. * @@ -179,7 +180,10 @@ struct nf_bridge_info { struct net_device *physoutdev; char neigh_header[8]; }; - __be32 ipv4_daddr; + union { + __be32 ipv4_daddr; + struct in6_addr ipv6_daddr; + }; }; #endif -- cgit v1.2.3 From 411ffb4fde80705a9a8db4c2d38dbeef6f5bd689 Mon Sep 17 00:00:00 2001 From: Bernhard Thaler Date: Sat, 30 May 2015 15:28:28 +0200 Subject: netfilter: bridge: refactor frag_max_size Currently frag_max_size is member of br_input_skb_cb and copied back and forth using IPCB(skb) and BR_INPUT_SKB_CB(skb) each time it is changed or used. Attach frag_max_size to nf_bridge_info and set value in pre_routing and forward functions. Use its value in forward and xmit functions. 
Signed-off-by: Bernhard Thaler Signed-off-by: Pablo Neira Ayuso --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f70fc0e6bf7b..32b105e682b3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -174,6 +174,7 @@ struct nf_bridge_info { BRNF_PROTO_PPPOE } orig_proto:8; bool pkt_otherhost; + __u16 frag_max_size; unsigned int mask; struct net_device *physindev; union { -- cgit v1.2.3 From 23c779b9f9161d6568d3b2fca06e70ad182c480c Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 11 Jun 2015 17:37:04 -0700 Subject: regulator: Add pull down support Some regulators need to be configured to pull down a resistor when the regulator is disabled. Add an op (set_pull_down) and a DT property + constraint to support this. Signed-off-by: Stephen Boyd Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 5 +++++ include/linux/regulator/machine.h | 2 ++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index fffa688ac3a7..76144a337ff7 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -121,6 +121,9 @@ struct regulator_linear_range { * @set_suspend_mode: Set the operating mode for the regulator when the * system is suspended. * + * @set_pull_down: Configure the regulator to pull down when the regulator + * is disabled. + * * This struct describes regulator operations which can be implemented by * regulator chip drivers. 
*/ @@ -187,6 +190,8 @@ struct regulator_ops { /* set regulator suspend operating mode (defined in consumer.h) */ int (*set_suspend_mode) (struct regulator_dev *, unsigned int mode); + + int (*set_pull_down) (struct regulator_dev *); }; /* diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index 01526559c8c3..8ffb0619a03c 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -87,6 +87,7 @@ struct regulator_state { * applied. * @apply_uV: Apply the voltage constraint when initialising. * @ramp_disable: Disable ramp delay when initialising or when setting voltage. + * @pull_down: Enable pull down when regulator is disabled. * * @input_uV: Input voltage for regulator when supplied by another regulator. * @@ -141,6 +142,7 @@ struct regulation_constraints { unsigned boot_on:1; /* bootloader/firmware enabled regulator */ unsigned apply_uV:1; /* apply uV constraint if min == max */ unsigned ramp_disable:1; /* disable ramp delay */ + unsigned pull_down:1; /* pull down resistor when regulator off */ }; /** -- cgit v1.2.3 From efb6de9b4ba0092b2c55f6a52d16294a8a698edd Mon Sep 17 00:00:00 2001 From: Bernhard Thaler Date: Sat, 30 May 2015 15:30:16 +0200 Subject: netfilter: bridge: forward IPv6 fragmented packets IPv6 fragmented packets are not forwarded on an ethernet bridge with netfilter ip6_tables loaded. e.g. steps to reproduce 1) create a simple bridge like this modprobe br_netfilter brctl addbr br0 brctl addif br0 eth0 brctl addif br0 eth2 ifconfig eth0 up ifconfig eth2 up ifconfig br0 up 2) place a host with an IPv6 address on each side of the bridge set IPv6 address on host A: ip -6 addr add fd01:2345:6789:1::1/64 dev eth0 set IPv6 address on host B: ip -6 addr add fd01:2345:6789:1::2/64 dev eth0 3) run a simple ping command on host A with packets > MTU ping6 -s 4000 fd01:2345:6789:1::2 4) wait some time and run e.g. 
"ip6tables -t nat -nvL" on the bridge IPv6 fragmented packets traverse the bridge cleanly until somebody runs. "ip6tables -t nat -nvL". As soon as it is run (and netfilter modules are loaded) IPv6 fragmented packets do not traverse the bridge any more (you see no more responses in ping's output). After applying this patch IPv6 fragmented packets traverse the bridge cleanly in above scenario. Signed-off-by: Bernhard Thaler [pablo@netfilter.org: small changes to br_nf_dev_queue_xmit] Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv6.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index e2d19694ee8f..8b7d28f3aada 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -26,6 +26,8 @@ struct nf_ipv6_ops { int (*chk_addr)(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict); void (*route_input)(struct sk_buff *skb); + int (*fragment)(struct sock *sk, struct sk_buff *skb, + int (*output)(struct sock *, struct sk_buff *)); }; extern const struct nf_ipv6_ops __rcu *nf_ipv6_ops; -- cgit v1.2.3 From 33b1f31392861947fa2a2a57c3a39ab63b8c9f9d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 5 Jun 2015 13:28:38 +0200 Subject: net: ip_fragment: remove BRIDGE_NETFILTER mtu special handling since commit d6b915e29f4adea9 ("ip_fragment: don't forward defragmented DF packet") the largest fragment size is available in the IPCB. Therefore we no longer need to care about 'encapsulation' overhead of stripped PPPOE/VLAN headers since ip_do_fragment doesn't use device mtu in such cases. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index f2fdb5a52070..6d80fc686323 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -20,13 +20,6 @@ enum nf_br_hook_priorities { #define BRNF_BRIDGED_DNAT 0x02 #define BRNF_NF_BRIDGE_PREROUTING 0x08 -static inline unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) -{ - if (skb->nf_bridge->orig_proto == BRNF_PROTO_PPPOE) - return PPPOE_SES_HLEN; - return 0; -} - int br_handle_frame_finish(struct sock *sk, struct sk_buff *skb); static inline void br_drop_fake_rtable(struct sk_buff *skb) -- cgit v1.2.3 From 57f66b78860968fc7eddc9ce25f8e57f7e5000bd Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 11 Jun 2015 17:37:05 -0700 Subject: regulator: Add soft start support Some regulators support a "soft start" feature where the voltage ramps up slowly when the regulator is enabled. Add an op (set_soft_start) and a DT property + constraint to support this. Signed-off-by: Stephen Boyd Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 2 ++ include/linux/regulator/machine.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 76144a337ff7..e0635d0894aa 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -161,6 +161,8 @@ struct regulator_ops { unsigned int old_selector, unsigned int new_selector); + int (*set_soft_start) (struct regulator_dev *); + /* report regulator status ... most other accessors report * control inputs, this reports results of combining inputs * from Linux (and other sources) with the actual load. 
diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index 8ffb0619a03c..7f7d0a3fe1e1 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -142,6 +142,7 @@ struct regulation_constraints { unsigned boot_on:1; /* bootloader/firmware enabled regulator */ unsigned apply_uV:1; /* apply uV constraint if min == max */ unsigned ramp_disable:1; /* disable ramp delay */ + unsigned soft_start:1; /* ramp voltage slowly */ unsigned pull_down:1; /* pull down resistor when regulator off */ }; -- cgit v1.2.3 From 36e4f839de59b6216a16cdf5c1d3263f4dbd9421 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 11 Jun 2015 17:37:06 -0700 Subject: regulator: Add input current limit support Some regulators can limit their input current (typically annotated as ilim). Add an op (set_input_current_limit) and a DT property + constraint to support this. Signed-off-by: Stephen Boyd Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 3 +++ include/linux/regulator/machine.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index e0635d0894aa..125264f8be93 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -91,6 +91,7 @@ struct regulator_linear_range { * @set_current_limit: Configure a limit for a current-limited regulator. * The driver should select the current closest to max_uA. * @get_current_limit: Get the configured limit for a current-limited regulator. + * @set_input_current_limit: Configure an input limit. * * @set_mode: Set the configured operating mode for the regulator. * @get_mode: Get the configured operating mode for the regulator. 
@@ -145,6 +146,8 @@ struct regulator_ops { int min_uA, int max_uA); int (*get_current_limit) (struct regulator_dev *); + int (*set_input_current_limit) (struct regulator_dev *, int lim_uA); + /* enable/disable regulator */ int (*enable) (struct regulator_dev *); int (*disable) (struct regulator_dev *); diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index 7f7d0a3fe1e1..85a3b457de51 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -75,6 +75,7 @@ struct regulator_state { * * @min_uA: Smallest current consumers may set. * @max_uA: Largest current consumers may set. + * @ilim_uA: Maximum input current. * @system_load: Load that isn't captured by any consumer requests. * * @valid_modes_mask: Mask of modes which may be configured by consumers. @@ -113,6 +114,7 @@ struct regulation_constraints { /* current output range (inclusive) - for current control */ int min_uA; int max_uA; + int ilim_uA; int system_load; -- cgit v1.2.3 From 71ae0dff02d756e4d2ca710b79f2ff5390029a5f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Jun 2015 01:34:54 +0200 Subject: netfilter: xtables: use percpu rule counters The binary arp/ip/ip6tables ruleset is stored per cpu. The only reason left as to why we need percpu duplication are the rule counters embedded into ipt_entry et al -- since each cpu has its own copy of the rules, all counters can be lockless. The downside is that the more cpus are supported, the more memory is required. Rules are not just duplicated per online cpu but for each possible cpu, i.e. if maxcpu is 144, then rule is duplicated 144 times, not for the e.g. 64 cores present. To save some memory and also improve utilization of shared caches it would be preferable to only store the rule blob once. So we first need to separate counters and the rule blob. Instead of using entry->counters, allocate this percpu and store the percpu address in entry->counters.pcnt on CONFIG_SMP. 
This change makes no sense as-is; it is merely an intermediate step to remove the percpu duplication of the rule set in a followup patch. Suggested-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Reported-by: Marcelo Ricardo Leitner Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 49 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 09f38206c18f..b77ab9f17641 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -353,6 +353,55 @@ static inline unsigned long ifname_compare_aligned(const char *_a, return ret; } + +/* On SMP, ip(6)t_entry->counters.pcnt holds address of the + * real (percpu) counter. On !SMP, its just the packet count, + * so nothing needs to be done there. + * + * xt_percpu_counter_alloc returns the address of the percpu + * counter, or 0 on !SMP. + * + * Hence caller must use IS_ERR_VALUE to check for error, this + * allows us to return 0 for single core systems without forcing + * callers to deal with SMP vs. NONSMP issues. 
+ */ +static inline u64 xt_percpu_counter_alloc(void) +{ + if (nr_cpu_ids > 1) { + void __percpu *res = alloc_percpu(struct xt_counters); + + if (res == NULL) + return (u64) -ENOMEM; + + return (__force u64) res; + } + + return 0; +} +static inline void xt_percpu_counter_free(u64 pcnt) +{ + if (nr_cpu_ids > 1) + free_percpu((void __percpu *) pcnt); +} + +static inline struct xt_counters * +xt_get_this_cpu_counter(struct xt_counters *cnt) +{ + if (nr_cpu_ids > 1) + return this_cpu_ptr((void __percpu *) cnt->pcnt); + + return cnt; +} + +static inline struct xt_counters * +xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu) +{ + if (nr_cpu_ids > 1) + return per_cpu_ptr((void __percpu *) cnt->pcnt, cpu); + + return cnt; +} + struct nf_hook_ops *xt_hook_link(const struct xt_table *, nf_hookfn *); void xt_hook_unlink(const struct xt_table *, struct nf_hook_ops *); -- cgit v1.2.3 From 482cfc318559e2527dfd8513582d2fdb276e47c2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Jun 2015 01:34:55 +0200 Subject: netfilter: xtables: avoid percpu ruleset duplication We store the rule blob per (possible) cpu. Unfortunately this means we can waste a lot of memory on big smp machines. The ipt_entry structure ('rule head') is 112 bytes, so e.g. with maxcpu=64 one single rule eats close to 8k RAM. Since the previous patch made counters percpu it appears there is nothing left in the rule blob that needs to be percpu. On my test system (144 possible cpus, 400k dummy rules) this change saves close to 9 gigabytes of RAM. 
Reported-by: Marcelo Ricardo Leitner Acked-by: Jesper Dangaard Brouer Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index b77ab9f17641..9969d79dcde1 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -224,9 +224,9 @@ struct xt_table_info { unsigned int stacksize; unsigned int __percpu *stackptr; void ***jumpstack; - /* ipt_entry tables: one per CPU */ + /* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */ - void *entries[1]; + void *entries; }; #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \ -- cgit v1.2.3 From 87d001ef5366c4a24f7a1340246c4ce68190581c Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 27 May 2015 16:01:52 +0200 Subject: dmaengine: Move icg helpers to global header Now that we can have ICGs set for both the source and destination (using the icg field of struct data_chunk) or for only the source or the destination (using the dst_icg or src_icg respectively), and that these fields can be ignored depending on other parameters (src_inc, src_sgl, etc.), the logic to get the actual ICG value can be quite tricky. The XDMAC driver was already implementing it, but since we will need it in other drivers, we can move it to the main header file. 
Signed-off-by: Maxime Ripard Acked-by: Ludovic Desroches Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index ad419757241f..499c530bcbaa 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -874,6 +874,33 @@ static inline int dma_maxpq(struct dma_device *dma, enum dma_ctrl_flags flags) BUG(); } +static inline size_t dmaengine_get_icg(bool inc, bool sgl, size_t icg, + size_t dir_icg) +{ + if (inc) { + if (dir_icg) + return dir_icg; + else if (sgl) + return icg; + } + + return 0; +} + +static inline size_t dmaengine_get_dst_icg(struct dma_interleaved_template *xt, + struct data_chunk *chunk) +{ + return dmaengine_get_icg(xt->dst_inc, xt->dst_sgl, + chunk->icg, chunk->dst_icg); +} + +static inline size_t dmaengine_get_src_icg(struct dma_interleaved_template *xt, + struct data_chunk *chunk) +{ + return dmaengine_get_icg(xt->src_inc, xt->src_sgl, + chunk->icg, chunk->src_icg); +} + /* --- public DMA engine API --- */ #ifdef CONFIG_DMA_ENGINE -- cgit v1.2.3 From 4983a501afede12f95d26e1e213f8f2e9eda1871 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 18 May 2015 13:46:15 +0200 Subject: dmaengine: Revert "drivers/dma: remove unused support for MEMSET operations" This reverts commit 48a9db462d99494583dad829969616ac90a8df4e. Some platforms actually need support for the memset operations. Bring it back. 
Signed-off-by: Maxime Ripard Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index ad419757241f..19face3168b4 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -65,6 +65,7 @@ enum dma_transaction_type { DMA_PQ, DMA_XOR_VAL, DMA_PQ_VAL, + DMA_MEMSET, DMA_INTERRUPT, DMA_SG, DMA_PRIVATE, @@ -570,6 +571,7 @@ struct dma_tx_state { * @copy_align: alignment shift for memcpy operations * @xor_align: alignment shift for xor operations * @pq_align: alignment shift for pq operations + * @fill_align: alignment shift for memset operations * @dev_id: unique device ID * @dev: struct device reference for dma mapping api * @src_addr_widths: bit mask of src addr widths the device supports @@ -588,6 +590,7 @@ struct dma_tx_state { * @device_prep_dma_xor_val: prepares a xor validation operation * @device_prep_dma_pq: prepares a pq operation * @device_prep_dma_pq_val: prepares a pqzero_sum operation + * @device_prep_dma_memset: prepares a memset operation * @device_prep_dma_interrupt: prepares an end of chain interrupt operation * @device_prep_slave_sg: prepares a slave dma operation * @device_prep_dma_cyclic: prepare a cyclic dma operation suitable for audio. 
@@ -620,6 +623,7 @@ struct dma_device { u8 copy_align; u8 xor_align; u8 pq_align; + u8 fill_align; #define DMA_HAS_PQ_CONTINUE (1 << 15) int dev_id; @@ -650,6 +654,9 @@ struct dma_device { struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src, unsigned int src_cnt, const unsigned char *scf, size_t len, enum sum_check_flags *pqres, unsigned long flags); + struct dma_async_tx_descriptor *(*device_prep_dma_memset)( + struct dma_chan *chan, dma_addr_t dest, int value, size_t len, + unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_interrupt)( struct dma_chan *chan, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_sg)( @@ -745,6 +752,17 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_interleaved_dma( return chan->device->device_prep_interleaved_dma(chan, xt, flags); } +static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memset( + struct dma_chan *chan, dma_addr_t dest, int value, size_t len, + unsigned long flags) +{ + if (!chan || !chan->device) + return NULL; + + return chan->device->device_prep_dma_memset(chan, dest, value, + len, flags); +} + static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_sg( struct dma_chan *chan, struct scatterlist *dst_sg, unsigned int dst_nents, @@ -820,6 +838,12 @@ static inline bool is_dma_pq_aligned(struct dma_device *dev, size_t off1, return dmaengine_check_align(dev->pq_align, off1, off2, len); } +static inline bool is_dma_fill_aligned(struct dma_device *dev, size_t off1, + size_t off2, size_t len) +{ + return dmaengine_check_align(dev->fill_align, off1, off2, len); +} + static inline void dma_set_maxpq(struct dma_device *dma, int maxpq, int has_pq_continue) { -- cgit v1.2.3 From 7bbf1dd24b17b9ec4f47c43ce4e05bf190745553 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 1 Jun 2015 16:05:10 +0800 Subject: genirq: Enhance irq_data_to_desc() to support hierarchy irqdomain For irq associated with hierarchy irqdomains, there will be multiple irq_datas 
for one irq_desc. So enhance irq_data_to_desc() to support hierarchy irqdomain. Also export irq_data_to_desc() as an inline function for later reuse. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Marc Zyngier Link: http://lkml.kernel.org/r/1433145945-789-2-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- include/linux/irqdesc.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index dd1109fb241e..a113a8dc7438 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -93,6 +93,15 @@ struct irq_desc { extern struct irq_desc irq_desc[NR_IRQS]; #endif +static inline struct irq_desc *irq_data_to_desc(struct irq_data *data) +{ +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + return irq_to_desc(data->irq); +#else + return container_of(data, struct irq_desc, irq_data); +#endif +} + static inline struct irq_data *irq_desc_get_irq_data(struct irq_desc *desc) { return &desc->irq_data; -- cgit v1.2.3 From 0d0b4c866bcce647f40d73efe5e90aeeb079050a Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 1 Jun 2015 16:05:12 +0800 Subject: genirq: Introduce struct irq_common_data to host shared irq data With the introduction of hierarchy irqdomain, struct irq_data becomes per-chip instead of per-irq and there may be multiple irq_datas associated with the same irq. Some per-irq data stored in struct irq_data now may get duplicated into multiple irq_datas, and causes inconsistent view. So introduce struct irq_common_data to host per-irq common data and to achieve consistent view among irq_chips. 
Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Jason Cooper Cc: Kevin Cernekee Cc: Arnd Bergmann Cc: Marc Zyngier Link: http://lkml.kernel.org/r/1433145945-789-4-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 54 +++++++++++++++++++++++++++++-------------------- include/linux/irqdesc.h | 3 ++- 2 files changed, 34 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 48cb7d1aa58f..3c7fbe44edae 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -126,13 +126,21 @@ struct msi_desc; struct irq_domain; /** - * struct irq_data - per irq and irq chip data passed down to chip functions + * struct irq_common_data - per irq data shared by all irqchips + * @state_use_accessors: status information for irq chip functions. + * Use accessor functions to deal with it + */ +struct irq_common_data { + unsigned int state_use_accessors; +}; + +/** + * struct irq_data - per irq chip data passed down to chip functions * @mask: precomputed bitmask for accessing the chip registers * @irq: interrupt number * @hwirq: hardware interrupt number, local to the interrupt domain * @node: node index useful for balancing - * @state_use_accessors: status information for irq chip functions. - * Use accessor functions to deal with it + * @common: point to data shared by all irqchips * @chip: low level interrupt hardware access * @domain: Interrupt translation domain; responsible for mapping * between hwirq number and linux irq number. 
@@ -153,7 +161,7 @@ struct irq_data { unsigned int irq; unsigned long hwirq; unsigned int node; - unsigned int state_use_accessors; + struct irq_common_data *common; struct irq_chip *chip; struct irq_domain *domain; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY @@ -166,7 +174,7 @@ struct irq_data { }; /* - * Bit masks for irq_data.state + * Bit masks for irq_common_data.state_use_accessors * * IRQD_TRIGGER_MASK - Mask for the trigger type bits * IRQD_SETAFFINITY_PENDING - Affinity setting is pending @@ -198,34 +206,36 @@ enum { IRQD_WAKEUP_ARMED = (1 << 19), }; +#define __irqd_to_state(d) ((d)->common->state_use_accessors) + static inline bool irqd_is_setaffinity_pending(struct irq_data *d) { - return d->state_use_accessors & IRQD_SETAFFINITY_PENDING; + return __irqd_to_state(d) & IRQD_SETAFFINITY_PENDING; } static inline bool irqd_is_per_cpu(struct irq_data *d) { - return d->state_use_accessors & IRQD_PER_CPU; + return __irqd_to_state(d) & IRQD_PER_CPU; } static inline bool irqd_can_balance(struct irq_data *d) { - return !(d->state_use_accessors & (IRQD_PER_CPU | IRQD_NO_BALANCING)); + return !(__irqd_to_state(d) & (IRQD_PER_CPU | IRQD_NO_BALANCING)); } static inline bool irqd_affinity_was_set(struct irq_data *d) { - return d->state_use_accessors & IRQD_AFFINITY_SET; + return __irqd_to_state(d) & IRQD_AFFINITY_SET; } static inline void irqd_mark_affinity_was_set(struct irq_data *d) { - d->state_use_accessors |= IRQD_AFFINITY_SET; + __irqd_to_state(d) |= IRQD_AFFINITY_SET; } static inline u32 irqd_get_trigger_type(struct irq_data *d) { - return d->state_use_accessors & IRQD_TRIGGER_MASK; + return __irqd_to_state(d) & IRQD_TRIGGER_MASK; } /* @@ -233,43 +243,43 @@ static inline u32 irqd_get_trigger_type(struct irq_data *d) */ static inline void irqd_set_trigger_type(struct irq_data *d, u32 type) { - d->state_use_accessors &= ~IRQD_TRIGGER_MASK; - d->state_use_accessors |= type & IRQD_TRIGGER_MASK; + __irqd_to_state(d) &= ~IRQD_TRIGGER_MASK; + __irqd_to_state(d) |= type & 
IRQD_TRIGGER_MASK; } static inline bool irqd_is_level_type(struct irq_data *d) { - return d->state_use_accessors & IRQD_LEVEL; + return __irqd_to_state(d) & IRQD_LEVEL; } static inline bool irqd_is_wakeup_set(struct irq_data *d) { - return d->state_use_accessors & IRQD_WAKEUP_STATE; + return __irqd_to_state(d) & IRQD_WAKEUP_STATE; } static inline bool irqd_can_move_in_process_context(struct irq_data *d) { - return d->state_use_accessors & IRQD_MOVE_PCNTXT; + return __irqd_to_state(d) & IRQD_MOVE_PCNTXT; } static inline bool irqd_irq_disabled(struct irq_data *d) { - return d->state_use_accessors & IRQD_IRQ_DISABLED; + return __irqd_to_state(d) & IRQD_IRQ_DISABLED; } static inline bool irqd_irq_masked(struct irq_data *d) { - return d->state_use_accessors & IRQD_IRQ_MASKED; + return __irqd_to_state(d) & IRQD_IRQ_MASKED; } static inline bool irqd_irq_inprogress(struct irq_data *d) { - return d->state_use_accessors & IRQD_IRQ_INPROGRESS; + return __irqd_to_state(d) & IRQD_IRQ_INPROGRESS; } static inline bool irqd_is_wakeup_armed(struct irq_data *d) { - return d->state_use_accessors & IRQD_WAKEUP_ARMED; + return __irqd_to_state(d) & IRQD_WAKEUP_ARMED; } @@ -280,12 +290,12 @@ static inline bool irqd_is_wakeup_armed(struct irq_data *d) */ static inline void irqd_set_chained_irq_inprogress(struct irq_data *d) { - d->state_use_accessors |= IRQD_IRQ_INPROGRESS; + __irqd_to_state(d) |= IRQD_IRQ_INPROGRESS; } static inline void irqd_clr_chained_irq_inprogress(struct irq_data *d) { - d->state_use_accessors &= ~IRQD_IRQ_INPROGRESS; + __irqd_to_state(d) &= ~IRQD_IRQ_INPROGRESS; } static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index a113a8dc7438..c52d1480f272 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -17,7 +17,7 @@ struct pt_regs; /** * struct irq_desc - interrupt descriptor - * @irq_data: per irq and chip data passed down to chip functions + * @irq_common_data: per irq and 
chip data passed down to chip functions * @kstat_irqs: irq stats per cpu * @handle_irq: highlevel irq-events handler * @preflow_handler: handler called before the flow handler (currently used by sparc) @@ -47,6 +47,7 @@ struct pt_regs; * @name: flow handler name for /proc/interrupts output */ struct irq_desc { + struct irq_common_data irq_common_data; struct irq_data irq_data; unsigned int __percpu *kstat_irqs; irq_flow_handler_t handle_irq; -- cgit v1.2.3 From 6783011b48096b9a0c239d0f7645f93070b6eefd Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 1 Jun 2015 16:05:13 +0800 Subject: genirq: Introduce helper function irq_data_get_node() Introduce helper function irq_data_get_node() and variants thereof to hide struct irq_data implementation details. Convert the core code to use them. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Jason Cooper Cc: Kevin Cernekee Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/1433145945-789-5-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 3c7fbe44edae..b3b82a5344c8 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -640,6 +640,11 @@ static inline u32 irq_get_trigger_type(unsigned int irq) return d ? 
irqd_get_trigger_type(d) : 0; } +static inline int irq_data_get_node(struct irq_data *d) +{ + return d->node; +} + unsigned int arch_dynirq_lower_bound(unsigned int from); int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, -- cgit v1.2.3 From c64301a230a64dfc2fcf4581cd98a2d703f3c057 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 1 Jun 2015 16:05:23 +0800 Subject: genirq: Introduce helper function irq_data_get_affinity_mask() Introduce helper function irq_data_get_affinity_mask() and irq_get_affinity_mask() to hide implementation details, so we could move field 'affinity' from struct irq_data into struct irq_common_data later. Signed-off-by: Jiang Liu Acked-by: Russell King Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Jason Cooper Cc: Kevin Cernekee Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/1433145945-789-15-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index b3b82a5344c8..1e0ccef205ed 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -645,6 +645,18 @@ static inline int irq_data_get_node(struct irq_data *d) return d->node; } +static inline struct cpumask *irq_get_affinity_mask(int irq) +{ + struct irq_data *d = irq_get_irq_data(irq); + + return d ? 
d->affinity : NULL; +} + +static inline struct cpumask *irq_data_get_affinity_mask(struct irq_data *d) +{ + return d->affinity; +} + unsigned int arch_dynirq_lower_bound(unsigned int from); int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, -- cgit v1.2.3 From 52033cfb5aab2a54e238e93c9e52f61c2c5708aa Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 11 Jun 2015 16:35:26 +0300 Subject: IB/mlx4: Add mmap call to map the hardware clock In order to read the HCA's cycle counter efficiently in user space, we need to map the HCA's register. This is done through mmap call. Signed-off-by: Matan Barak Signed-off-by: Or Gerlitz Signed-off-by: Doug Ledford --- include/linux/mlx4/device.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 83e80ab94500..f94984fb8bb2 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -829,6 +829,12 @@ struct mlx4_dev { struct mlx4_vf_dev *dev_vfs; }; +struct mlx4_clock_params { + u64 offset; + u8 bar; + u8 size; +}; + struct mlx4_eqe { u8 reserved1; u8 type; @@ -1485,4 +1491,7 @@ int mlx4_ACCESS_PTYS_REG(struct mlx4_dev *dev, enum mlx4_access_reg_method method, struct mlx4_ptys_reg *ptys_reg); +int mlx4_get_internal_clock_params(struct mlx4_dev *dev, + struct mlx4_clock_params *params); + #endif /* MLX4_DEVICE_H */ -- cgit v1.2.3 From c0300089fd2dbeebef5ab9b6d66b4e6cedf8500a Mon Sep 17 00:00:00 2001 From: Yijing Wang Date: Tue, 28 Apr 2015 17:32:34 +0800 Subject: PCI: Remove unused pci_scan_bus_parented() No one uses pci_scan_bus_parented() any more, remove it. 
Signed-off-by: Yijing Wang Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 353db8dc4c6e..1ec0d5d9723c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -773,8 +773,6 @@ void pcibios_bus_to_resource(struct pci_bus *bus, struct resource *res, void pcibios_scan_specific_bus(int busn); struct pci_bus *pci_find_bus(int domain, int busnr); void pci_bus_add_devices(const struct pci_bus *bus); -struct pci_bus *pci_scan_bus_parented(struct device *parent, int bus, - struct pci_ops *ops, void *sysdata); struct pci_bus *pci_scan_bus(int bus, struct pci_ops *ops, void *sysdata); struct pci_bus *pci_create_root_bus(struct device *parent, int bus, struct pci_ops *ops, void *sysdata, -- cgit v1.2.3 From 73b6ecdb93e8e77752cae9077c424fcdc6f23c39 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Fri, 12 Jun 2015 11:10:06 +0900 Subject: extcon: Redefine the unique id of supported external connectors without 'enum extcon' type This patch just redefines the unique id of supported external connectors without the 'enum extcon' type, because the unique id would be used in devicetree files (*.dts) to indicate the specific external connectors, like the key numbers of the input framework. So, I plan to move these definitions to the following header file, which includes the unique id of supported external connectors. 
- include/dt-bindings/extcon/extcon.h Fixes: 2a9de9c0f08d ("extcon: Use the unique id for external connector instead of string") Signed-off-by: Chanwoo Choi Signed-off-by: Greg Kroah-Hartman --- include/linux/extcon.h | 90 +++++++++++++++++++++++--------------------------- 1 file changed, 42 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/extcon.h b/include/linux/extcon.h index a7b224b20ecc..b16d929fa75f 100644 --- a/include/linux/extcon.h +++ b/include/linux/extcon.h @@ -30,41 +30,35 @@ #include #include -enum extcon { - EXTCON_NONE = 0x0, - - /* USB external connector */ - EXTCON_USB = 0x1, - EXTCON_USB_HOST = 0x2, - - /* Charger external connector */ - EXTCON_TA = 0x10, - EXTCON_FAST_CHARGER = 0x11, - EXTCON_SLOW_CHARGER = 0x12, - EXTCON_CHARGE_DOWNSTREAM = 0x13, - - /* Audio/Video external connector */ - EXTCON_LINE_IN = 0x20, - EXTCON_LINE_OUT = 0x21, - EXTCON_MICROPHONE = 0x22, - EXTCON_HEADPHONE = 0x23, - - EXTCON_HDMI = 0x30, - EXTCON_MHL = 0x31, - EXTCON_DVI = 0x32, - EXTCON_VGA = 0x33, - EXTCON_SPDIF_IN = 0x34, - EXTCON_SPDIF_OUT = 0x35, - EXTCON_VIDEO_IN = 0x36, - EXTCON_VIDEO_OUT = 0x37, - - /* Etc external connector */ - EXTCON_DOCK = 0x50, - EXTCON_JIG = 0x51, - EXTCON_MECHANICAL = 0x52, - - EXTCON_END, -}; +/* + * Define the unique id of supported external connectors + */ +#define EXTCON_NONE 0 + +#define EXTCON_USB 1 /* USB connector */ +#define EXTCON_USB_HOST 2 + +#define EXTCON_TA 3 /* Charger connector */ +#define EXTCON_FAST_CHARGER 4 +#define EXTCON_SLOW_CHARGER 5 +#define EXTCON_CHARGE_DOWNSTREAM 6 + +#define EXTCON_LINE_IN 7 /* Audio/Video connector */ +#define EXTCON_LINE_OUT 8 +#define EXTCON_MICROPHONE 9 +#define EXTCON_HEADPHONE 10 +#define EXTCON_HDMI 11 +#define EXTCON_MHL 12 +#define EXTCON_DVI 13 +#define EXTCON_VGA 14 +#define EXTCON_SPDIF_IN 15 +#define EXTCON_SPDIF_OUT 16 +#define EXTCON_VIDEO_IN 17 +#define EXTCON_VIDEO_OUT 18 + +#define EXTCON_DOCK 19 /* Misc connector */ +#define 
EXTCON_JIG 20 +#define EXTCON_MECHANICAL 21 struct extcon_cable; @@ -105,7 +99,7 @@ struct extcon_cable; struct extcon_dev { /* Optional user initializing data */ const char *name; - const enum extcon *supported_cable; + const unsigned int *supported_cable; const u32 *mutually_exclusive; /* Optional callbacks to override class functions */ @@ -182,10 +176,10 @@ extern struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name); /* * Following APIs control the memory of extcon device. */ -extern struct extcon_dev *extcon_dev_allocate(const enum extcon *cable); +extern struct extcon_dev *extcon_dev_allocate(const unsigned int *cable); extern void extcon_dev_free(struct extcon_dev *edev); extern struct extcon_dev *devm_extcon_dev_allocate(struct device *dev, - const enum extcon *cable); + const unsigned int *cable); extern void devm_extcon_dev_free(struct device *dev, struct extcon_dev *edev); /* @@ -206,8 +200,8 @@ extern int extcon_update_state(struct extcon_dev *edev, u32 mask, u32 state); * get/set_cable_state access each bit of the 32b encoded state value. * They are used to access the status of each cable based on the cable_name. */ -extern int extcon_get_cable_state_(struct extcon_dev *edev, enum extcon id); -extern int extcon_set_cable_state_(struct extcon_dev *edev, enum extcon id, +extern int extcon_get_cable_state_(struct extcon_dev *edev, unsigned int id); +extern int extcon_set_cable_state_(struct extcon_dev *edev, unsigned int id, bool cable_state); extern int extcon_get_cable_state(struct extcon_dev *edev, @@ -234,9 +228,9 @@ extern int extcon_unregister_interest(struct extcon_specific_cable_nb *nb); * we do not recommend to use this for normal 'notifiee' device drivers who * want to be notified by a specific external port of the notifier. 
*/ -extern int extcon_register_notifier(struct extcon_dev *edev, enum extcon id, +extern int extcon_register_notifier(struct extcon_dev *edev, unsigned int id, struct notifier_block *nb); -extern int extcon_unregister_notifier(struct extcon_dev *edev, enum extcon id, +extern int extcon_unregister_notifier(struct extcon_dev *edev, unsigned int id, struct notifier_block *nb); /* @@ -266,7 +260,7 @@ static inline int devm_extcon_dev_register(struct device *dev, static inline void devm_extcon_dev_unregister(struct device *dev, struct extcon_dev *edev) { } -static inline struct extcon_dev *extcon_dev_allocate(const enum extcon *cable) +static inline struct extcon_dev *extcon_dev_allocate(const unsigned int *cable) { return ERR_PTR(-ENOSYS); } @@ -274,7 +268,7 @@ static inline struct extcon_dev *extcon_dev_allocate(const enum extcon *cable) static inline void extcon_dev_free(struct extcon_dev *edev) { } static inline struct extcon_dev *devm_extcon_dev_allocate(struct device *dev, - const enum extcon *cable) + const unsigned int *cable) { return ERR_PTR(-ENOSYS); } @@ -298,13 +292,13 @@ static inline int extcon_update_state(struct extcon_dev *edev, u32 mask, } static inline int extcon_get_cable_state_(struct extcon_dev *edev, - enum extcon id) + unsigned int id) { return 0; } static inline int extcon_set_cable_state_(struct extcon_dev *edev, - enum extcon id, bool cable_state) + unsigned int id, bool cable_state) { return 0; } @@ -327,14 +321,14 @@ static inline struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name) } static inline int extcon_register_notifier(struct extcon_dev *edev, - enum extcon id, + unsigned int id, struct notifier_block *nb) { return 0; } static inline int extcon_unregister_notifier(struct extcon_dev *edev, - enum extcon id, + unsigned int id, struct notifier_block *nb) { return 0; -- cgit v1.2.3 From ef73f886b53548d83d71a439f51a0c13ea6c1dae Mon Sep 17 00:00:00 2001 From: Dmitry Kalinkin Date: Thu, 28 May 2015 15:07:04 +0300 Subject: vme: 
export vme_check_window() Signed-off-by: Dmitry Kalinkin Cc: Igor Alekseev Signed-off-by: Greg Kroah-Hartman --- include/linux/vme.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vme.h b/include/linux/vme.h index 79242e9c06b8..c0131358f351 100644 --- a/include/linux/vme.h +++ b/include/linux/vme.h @@ -120,6 +120,8 @@ void vme_free_consistent(struct vme_resource *, size_t, void *, dma_addr_t); size_t vme_get_size(struct vme_resource *); +int vme_check_window(u32 aspace, unsigned long long vme_base, + unsigned long long size); struct vme_resource *vme_slave_request(struct vme_dev *, u32, u32); int vme_slave_set(struct vme_resource *, int, unsigned long long, -- cgit v1.2.3 From 1e98a0f08abddde87f0f93237f10629ecb4880ef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Jun 2015 19:31:32 -0700 Subject: flow_dissector: fix ipv6 dst, hop-by-hop and routing ext hdrs __skb_header_pointer() returns a pointer that must be checked. Fixes infinite loop reported by Alexei, and add __must_check to catch these errors earlier. Fixes: 6a74fcf426f5 ("flow_dissector: add support for dst, hop-by-hop and routing ext hdrs") Reported-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: Eric Dumazet Acked-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index cc612fc0a894..a7acc92aa668 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2743,8 +2743,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum); -static inline void *__skb_header_pointer(const struct sk_buff *skb, int offset, - int len, void *data, int hlen, void *buffer) +static inline void * __must_check +__skb_header_pointer(const struct sk_buff *skb, int offset, + int len, void *data, int hlen, void *buffer) { if (hlen - offset >= len) return data + offset; @@ -2756,8 +2757,8 @@ static inline void *__skb_header_pointer(const struct sk_buff *skb, int offset, return buffer; } -static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, - int len, void *buffer) +static inline void * __must_check +skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) { return __skb_header_pointer(skb, offset, len, skb->data, skb_headlen(skb), buffer); -- cgit v1.2.3 From aaeb6e24f5b6cb6a664fbdec6e08b65c3173c1b3 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Fri, 12 Jun 2015 21:07:54 +0200 Subject: netfilter: ipset: Use MSEC_PER_SEC consistently Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set_timeout.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set_timeout.h b/include/linux/netfilter/ipset/ip_set_timeout.h index 83c2f9e0886c..3c8842bcedaa 100644 --- a/include/linux/netfilter/ipset/ip_set_timeout.h +++ b/include/linux/netfilter/ipset/ip_set_timeout.h @@ -61,7 +61,7 @@ ip_set_timeout_set(unsigned long *timeout, u32 t) return; } - *timeout = msecs_to_jiffies(t * 1000) + jiffies; + *timeout = msecs_to_jiffies(t * 
MSEC_PER_SEC) + jiffies; if (*timeout == IPSET_ELEM_PERMANENT) /* Bingo! :-) */ (*timeout)--; @@ -71,7 +71,7 @@ static inline u32 ip_set_timeout_get(unsigned long *timeout) { return *timeout == IPSET_ELEM_PERMANENT ? 0 : - jiffies_to_msecs(*timeout - jiffies)/1000; + jiffies_to_msecs(*timeout - jiffies)/MSEC_PER_SEC; } #endif /* __KERNEL__ */ -- cgit v1.2.3 From f690cbaed9fe4d77592e24139db7ad790641c4fd Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Fri, 12 Jun 2015 22:11:00 +0200 Subject: netfilter: ipset: Fix cidr handling for hash:*net* types Commit "Simplify cidr handling for hash:*net* types" broke the cidr handling for the hash:*net* types when the sets were used by the SET target: entries with invalid cidr values were added to the sets. Reported by Jonathan Johnson. Testsuite entry is added to verify the fix. Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index ffdfdc24952a..a6fe1ce96437 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -545,8 +545,6 @@ ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, { .bytes = ULLONG_MAX, .packets = ULLONG_MAX, \ .timeout = (set)->timeout } -#define IP_SET_INIT_CIDR(a, b) ((a) ? (a) : (b)) - #define IPSET_CONCAT(a, b) a##b #define IPSET_TOKEN(a, b) IPSET_CONCAT(a, b) -- cgit v1.2.3 From c4c997839cf92cb1037e43a85cdb4cbf44ed39a5 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sat, 13 Jun 2015 11:59:45 +0200 Subject: netfilter: ipset: Fix parallel resizing and listing of the same set When elements added to a hash:* type of set and resizing triggered, parallel listing could start to list the original set (before resizing) and "continue" with listing the new set. Fix it by references and using the original hash table for listing. 
Therefore the destroying of the original hash table may happen from the resizing or listing functions. Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index a6fe1ce96437..5674b6ac6646 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -176,6 +176,9 @@ struct ip_set_type_variant { /* List elements */ int (*list)(const struct ip_set *set, struct sk_buff *skb, struct netlink_callback *cb); + /* Keep listing private when resizing runs parallel */ + void (*uref)(struct ip_set *set, struct netlink_callback *cb, + bool start); /* Return true if "b" set is the same as "a" * according to the create set parameters */ @@ -380,12 +383,12 @@ ip_set_init_counter(struct ip_set_counter *counter, /* Netlink CB args */ enum { - IPSET_CB_NET = 0, - IPSET_CB_DUMP, - IPSET_CB_INDEX, - IPSET_CB_ARG0, + IPSET_CB_NET = 0, /* net namespace */ + IPSET_CB_DUMP, /* dump single set/all sets */ + IPSET_CB_INDEX, /* set index */ + IPSET_CB_PRIVATE, /* set private data */ + IPSET_CB_ARG0, /* type specific */ IPSET_CB_ARG1, - IPSET_CB_ARG2, }; /* register and unregister set references */ -- cgit v1.2.3 From b57b2d1fa53fe8563bdfc66a33b844463b9af285 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sat, 13 Jun 2015 14:22:25 +0200 Subject: netfilter: ipset: Prepare the ipset core to use RCU at set level Replace rwlock_t with spinlock_t in "struct ip_set" and change the locking accordingly. Convert the comment extension into an RCU-aware object. Also, simplify the timeout routines.
Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set.h | 9 ++++-- include/linux/netfilter/ipset/ip_set_comment.h | 38 ++++++++++++++++++-------- include/linux/netfilter/ipset/ip_set_timeout.h | 25 +++++++---------- 3 files changed, 44 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 5674b6ac6646..19b4969a25fe 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -108,8 +108,13 @@ struct ip_set_counter { atomic64_t packets; }; +struct ip_set_comment_rcu { + struct rcu_head rcu; + char str[0]; +}; + struct ip_set_comment { - char *str; + struct ip_set_comment_rcu __rcu *c; }; struct ip_set_skbinfo { @@ -226,7 +231,7 @@ struct ip_set { /* The name of the set */ char name[IPSET_MAXNAMELEN]; /* Lock protecting the set data */ - rwlock_t lock; + spinlock_t lock; /* References to the set */ u32 ref; /* The core set type */ diff --git a/include/linux/netfilter/ipset/ip_set_comment.h b/include/linux/netfilter/ipset/ip_set_comment.h index 21217ea008d7..8d0248525957 100644 --- a/include/linux/netfilter/ipset/ip_set_comment.h +++ b/include/linux/netfilter/ipset/ip_set_comment.h @@ -16,41 +16,57 @@ ip_set_comment_uget(struct nlattr *tb) return nla_data(tb); } +/* Called from uadd only, protected by the set spinlock. + * The kadt functions don't use the comment extensions in any way. + */ static inline void ip_set_init_comment(struct ip_set_comment *comment, const struct ip_set_ext *ext) { + struct ip_set_comment_rcu *c = rcu_dereference_protected(comment->c, 1); size_t len = ext->comment ? 
strlen(ext->comment) : 0; - if (unlikely(comment->str)) { - kfree(comment->str); - comment->str = NULL; + if (unlikely(c)) { + kfree_rcu(c, rcu); + rcu_assign_pointer(comment->c, NULL); } if (!len) return; if (unlikely(len > IPSET_MAX_COMMENT_SIZE)) len = IPSET_MAX_COMMENT_SIZE; - comment->str = kzalloc(len + 1, GFP_ATOMIC); - if (unlikely(!comment->str)) + c = kzalloc(sizeof(*c) + len + 1, GFP_ATOMIC); + if (unlikely(!c)) return; - strlcpy(comment->str, ext->comment, len + 1); + strlcpy(c->str, ext->comment, len + 1); + rcu_assign_pointer(comment->c, c); } +/* Used only when dumping a set, protected by rcu_read_lock_bh() */ static inline int ip_set_put_comment(struct sk_buff *skb, struct ip_set_comment *comment) { - if (!comment->str) + struct ip_set_comment_rcu *c = rcu_dereference_bh(comment->c); + + if (!c) return 0; - return nla_put_string(skb, IPSET_ATTR_COMMENT, comment->str); + return nla_put_string(skb, IPSET_ATTR_COMMENT, c->str); } +/* Called from uadd/udel, flush or the garbage collectors protected + * by the set spinlock. + * Called when the set is destroyed and when there can't be any user + * of the set data anymore. 
+ */ static inline void ip_set_comment_free(struct ip_set_comment *comment) { - if (unlikely(!comment->str)) + struct ip_set_comment_rcu *c; + + c = rcu_dereference_protected(comment->c, 1); + if (unlikely(!c)) return; - kfree(comment->str); - comment->str = NULL; + kfree_rcu(c, rcu); + rcu_assign_pointer(comment->c, NULL); } #endif diff --git a/include/linux/netfilter/ipset/ip_set_timeout.h b/include/linux/netfilter/ipset/ip_set_timeout.h index 3c8842bcedaa..1d6a935c1ac5 100644 --- a/include/linux/netfilter/ipset/ip_set_timeout.h +++ b/include/linux/netfilter/ipset/ip_set_timeout.h @@ -40,31 +40,26 @@ ip_set_timeout_uget(struct nlattr *tb) } static inline bool -ip_set_timeout_test(unsigned long timeout) +ip_set_timeout_expired(unsigned long *t) { - return timeout == IPSET_ELEM_PERMANENT || - time_is_after_jiffies(timeout); -} - -static inline bool -ip_set_timeout_expired(unsigned long *timeout) -{ - return *timeout != IPSET_ELEM_PERMANENT && - time_is_before_jiffies(*timeout); + return *t != IPSET_ELEM_PERMANENT && time_is_before_jiffies(*t); } static inline void -ip_set_timeout_set(unsigned long *timeout, u32 t) +ip_set_timeout_set(unsigned long *timeout, u32 value) { - if (!t) { + unsigned long t; + + if (!value) { *timeout = IPSET_ELEM_PERMANENT; return; } - *timeout = msecs_to_jiffies(t * MSEC_PER_SEC) + jiffies; - if (*timeout == IPSET_ELEM_PERMANENT) + t = msecs_to_jiffies(value * MSEC_PER_SEC) + jiffies; + if (t == IPSET_ELEM_PERMANENT) /* Bingo! 
:-) */ - (*timeout)--; + t--; + *timeout = t; } static inline u32 -- cgit v1.2.3 From ca0f6a5cd99e0c6ba4bb78dc402817f636370f26 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sat, 13 Jun 2015 19:45:33 +0200 Subject: netfilter: ipset: Fix coding styles reported by checkpatch.pl Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 19b4969a25fe..48bb01edcf30 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -349,12 +349,11 @@ ip_set_put_skbinfo(struct sk_buff *skb, struct ip_set_skbinfo *skbinfo) cpu_to_be64((u64)skbinfo->skbmark << 32 | skbinfo->skbmarkmask))) || (skbinfo->skbprio && - nla_put_net32(skb, IPSET_ATTR_SKBPRIO, + nla_put_net32(skb, IPSET_ATTR_SKBPRIO, cpu_to_be32(skbinfo->skbprio))) || (skbinfo->skbqueue && - nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, + nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, cpu_to_be16(skbinfo->skbqueue))); - } static inline void -- cgit v1.2.3 From c751ad0dd640f4ce9269acd7a54de5ba8092e99e Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Fri, 12 Jun 2015 15:48:06 -0700 Subject: regulator: Add docbook for soft start The docbook for these members is missing. Add them. 
Warning(include/linux/regulator/machine.h:147): No description found for parameter 'soft_start' Warning(include/linux/regulator/driver.h:197): No description found for parameter 'set_soft_start' Reported-by: kbuild test robot Signed-off-by: Stephen Boyd Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 1 + include/linux/regulator/machine.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index e0635d0894aa..9398d31f9531 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -111,6 +111,7 @@ struct regulator_linear_range { * to stabilise after being set to a new value, in microseconds. * The function provides the from and to voltage selector, the * function should return the worst case. + * @set_soft_start: Enable soft start for the regulator. * * @set_suspend_voltage: Set the voltage for the regulator when the system * is suspended. diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index 7f7d0a3fe1e1..1258275d3751 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -87,6 +87,7 @@ struct regulator_state { * applied. * @apply_uV: Apply the voltage constraint when initialising. * @ramp_disable: Disable ramp delay when initialising or when setting voltage. + * @soft_start: Enable soft start so that voltage ramps slowly. * @pull_down: Enable pull down when regulator is disabled. * * @input_uV: Input voltage for regulator when supplied by another regulator. -- cgit v1.2.3 From d45337328b1fb86a8f045f6c9938e9e08e6d7134 Mon Sep 17 00:00:00 2001 From: Vincent Wan Date: Thu, 11 Jun 2015 20:11:45 +0800 Subject: pci_ids: Add AMD KERNCZ device ID support The KERNCZ is new AMD SB/FCH generation name, like HUDSON2. 0x790b is the device ID for this generation. 
Signed-off-by: Wan ZongShun Acked-by: Bjorn Helgaas Signed-off-by: Ulf Hansson --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 2f7b9a40f627..cb63a7b522ef 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -579,6 +579,7 @@ #define PCI_DEVICE_ID_AMD_HUDSON2_SATA_IDE 0x7800 #define PCI_DEVICE_ID_AMD_HUDSON2_SMBUS 0x780b #define PCI_DEVICE_ID_AMD_HUDSON2_IDE 0x780c +#define PCI_DEVICE_ID_AMD_KERNCZ_SMBUS 0x790b #define PCI_VENDOR_ID_TRIDENT 0x1023 #define PCI_DEVICE_ID_TRIDENT_4DWAVE_DX 0x2000 -- cgit v1.2.3 From 32be6d3e362b896c81aae7c635d44e5a91406ce2 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Fri, 12 Jun 2015 10:58:46 -0400 Subject: crypto: nx - move include/linux/nx842.h into drivers/crypto/nx/nx-842.h Move the contents of the include/linux/nx842.h header file into the drivers/crypto/nx/nx-842.h header file. Remove the nx842.h header file and its entry in the MAINTAINERS file. The include/linux/nx842.h header originally was there because the crypto/842.c driver needed it to communicate with the nx-842 hw driver. However, that crypto compression driver was moved into the drivers/crypto/nx/ directory, and now can directly include the nx-842.h header. Nothing else needs the public include/linux/nx842.h header file, as all use of the nx-842 hardware driver will be through the "842-nx" crypto compression driver, since the direct nx-842 api is very limited in the buffer alignments and sizes that it will accept, and the crypto compression interface handles those limitations and allows any alignment and size buffers. 
Signed-off-by: Dan Streetman Signed-off-by: Herbert Xu --- include/linux/nx842.h | 24 ------------------------ 1 file changed, 24 deletions(-) delete mode 100644 include/linux/nx842.h (limited to 'include/linux') diff --git a/include/linux/nx842.h b/include/linux/nx842.h deleted file mode 100644 index 4ddf68d9c0d4..000000000000 --- a/include/linux/nx842.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef __NX842_H__ -#define __NX842_H__ - -#define __NX842_PSERIES_MEM_COMPRESS (10240) -#define __NX842_POWERNV_MEM_COMPRESS (1024) - -#define NX842_MEM_COMPRESS (max_t(unsigned int, \ - __NX842_PSERIES_MEM_COMPRESS, __NX842_POWERNV_MEM_COMPRESS)) - -struct nx842_constraints { - int alignment; - int multiple; - int minimum; - int maximum; -}; - -int nx842_constraints(struct nx842_constraints *constraints); - -int nx842_compress(const unsigned char *in, unsigned int in_len, - unsigned char *out, unsigned int *out_len, void *wrkmem); -int nx842_decompress(const unsigned char *in, unsigned int in_len, - unsigned char *out, unsigned int *out_len, void *wrkmem); - -#endif -- cgit v1.2.3 From e7b707f96820161820a864d2ee81740a14da6b93 Mon Sep 17 00:00:00 2001 From: Gwendal Grignou Date: Wed, 20 May 2015 11:31:27 +0200 Subject: mfd: cros_ec: Remove parent field Parent and device were pointing to the same device structure. Parent is unused, removed. Signed-off-by: Gwendal Grignou Tested-by: Stephen Barber Tested-by: Heiko Stuebner Tested-by: Gwendal Grignou Reviewed-by: Puthikorn Voravootivat Signed-off-by: Javier Martinez Canillas Signed-off-by: Lee Jones --- include/linux/mfd/cros_ec.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h index 324a34683971..14cf522123dd 100644 --- a/include/linux/mfd/cros_ec.h +++ b/include/linux/mfd/cros_ec.h @@ -85,7 +85,6 @@ struct cros_ec_command { * to using dword. 
* @din_size: size of din buffer to allocate (zero to use static din) * @dout_size: size of dout buffer to allocate (zero to use static dout) - * @parent: pointer to parent device (e.g. i2c or spi device) * @wake_enabled: true if this device can wake the system from sleep * @cmd_xfer: send command to EC and get response * Returns the number of bytes received if the communication succeeded, but @@ -113,7 +112,6 @@ struct cros_ec_device { uint8_t *dout; int din_size; int dout_size; - struct device *parent; bool wake_enabled; int (*cmd_xfer)(struct cros_ec_device *ec, struct cros_ec_command *msg); -- cgit v1.2.3 From a841178445bb72a3d566b4e6ab9d19e9b002eb47 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Tue, 9 Jun 2015 13:04:42 +0200 Subject: mfd: cros_ec: Use a zero-length array for command data Commit 1b84f2a4cd4a ("mfd: cros_ec: Use fixed size arrays to transfer data with the EC") modified the struct cros_ec_command fields to not use pointers for the input and output buffers and use fixed length arrays instead. This change was made because the cros_ec ioctl API uses that struct cros_ec_command to allow user-space to send commands to the EC and to get data from the EC. So using pointers made the API not 64-bit safe. Unfortunately this approach was not flexible enough for all the use-cases since there may be a need to send larger commands on newer versions of the EC command protocol. So, to avoid choosing a constant length that is either too big for most commands (wasting memory and CPU cycles on copies from and to user-space) or too small for some big commands, use a zero-length array that is both 64-bit safe and flexible. The same buffer is used for both output and input data so the maximum of these values should be used to allocate it.
Suggested-by: Gwendal Grignou Signed-off-by: Javier Martinez Canillas Tested-by: Heiko Stuebner Acked-by: Lee Jones Acked-by: Olof Johansson Signed-off-by: Lee Jones --- include/linux/mfd/cros_ec.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h index 14cf522123dd..7eee38abd02a 100644 --- a/include/linux/mfd/cros_ec.h +++ b/include/linux/mfd/cros_ec.h @@ -42,8 +42,7 @@ enum { * @outsize: Outgoing length in bytes * @insize: Max number of bytes to accept from EC * @result: EC's response to the command (separate from communication failure) - * @outdata: Outgoing data to EC - * @indata: Where to put the incoming data from EC + * @data: Where to put the incoming data from EC and outgoing data to EC */ struct cros_ec_command { uint32_t version; @@ -51,8 +50,7 @@ struct cros_ec_command { uint32_t outsize; uint32_t insize; uint32_t result; - uint8_t outdata[EC_PROTO2_MAX_PARAM_SIZE]; - uint8_t indata[EC_PROTO2_MAX_PARAM_SIZE]; + uint8_t data[0]; }; /** -- cgit v1.2.3 From 256ab950bdaa8797b7bac8fc11a567030d486304 Mon Sep 17 00:00:00 2001 From: Stephen Barber Date: Tue, 9 Jun 2015 13:04:43 +0200 Subject: mfd: cros_ec: rev cros_ec_commands.h Update cros_ec_commands.h to the latest version in the EC firmware sources and add power domain and passthru commands. Also, update lightbar to use new command names. 
Signed-off-by: Stephen Barber Reviewed-by: Randall Spangler Signed-off-by: Javier Martinez Canillas Tested-by: Heiko Stuebner Reviewed-by: Gwendal Grignou Tested-by: Gwendal Grignou Acked-by: Lee Jones Acked-by: Olof Johansson Signed-off-by: Lee Jones --- include/linux/mfd/cros_ec_commands.h | 277 ++++++++++++++++++++++++++++++++--- 1 file changed, 255 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h index a49cd41feea7..13b630c10d4c 100644 --- a/include/linux/mfd/cros_ec_commands.h +++ b/include/linux/mfd/cros_ec_commands.h @@ -515,7 +515,7 @@ struct ec_host_response { /* * Notes on commands: * - * Each command is an 8-byte command value. Commands which take params or + * Each command is an 16-bit command value. Commands which take params or * return response data specify structs for that data. If no struct is * specified, the command does not input or output data, respectively. * Parameter/response length is implicit in the structs. Some underlying @@ -966,7 +966,7 @@ struct rgb_s { /* List of tweakable parameters. NOTE: It's __packed so it can be sent in a * host command, but the alignment is the same regardless. Keep it that way. 
*/ -struct lightbar_params { +struct lightbar_params_v0 { /* Timing */ int32_t google_ramp_up; int32_t google_ramp_down; @@ -1000,32 +1000,81 @@ struct lightbar_params { struct rgb_s color[8]; /* 0-3 are Google colors */ } __packed; +struct lightbar_params_v1 { + /* Timing */ + int32_t google_ramp_up; + int32_t google_ramp_down; + int32_t s3s0_ramp_up; + int32_t s0_tick_delay[2]; /* AC=0/1 */ + int32_t s0a_tick_delay[2]; /* AC=0/1 */ + int32_t s0s3_ramp_down; + int32_t s3_sleep_for; + int32_t s3_ramp_up; + int32_t s3_ramp_down; + int32_t tap_tick_delay; + int32_t tap_display_time; + + /* Tap-for-battery params */ + uint8_t tap_pct_red; + uint8_t tap_pct_green; + uint8_t tap_seg_min_on; + uint8_t tap_seg_max_on; + uint8_t tap_seg_osc; + uint8_t tap_idx[3]; + + /* Oscillation */ + uint8_t osc_min[2]; /* AC=0/1 */ + uint8_t osc_max[2]; /* AC=0/1 */ + uint8_t w_ofs[2]; /* AC=0/1 */ + + /* Brightness limits based on the backlight and AC. */ + uint8_t bright_bl_off_fixed[2]; /* AC=0/1 */ + uint8_t bright_bl_on_min[2]; /* AC=0/1 */ + uint8_t bright_bl_on_max[2]; /* AC=0/1 */ + + /* Battery level thresholds */ + uint8_t battery_threshold[LB_BATTERY_LEVELS - 1]; + + /* Map [AC][battery_level] to color index */ + uint8_t s0_idx[2][LB_BATTERY_LEVELS]; /* AP is running */ + uint8_t s3_idx[2][LB_BATTERY_LEVELS]; /* AP is sleeping */ + + /* Color palette */ + struct rgb_s color[8]; /* 0-3 are Google colors */ +} __packed; + struct ec_params_lightbar { uint8_t cmd; /* Command (see enum lightbar_command) */ union { struct { /* no args */ - } dump, off, on, init, get_seq, get_params, version; + } dump, off, on, init, get_seq, get_params_v0, get_params_v1, + version, get_brightness, get_demo; - struct num { + struct { uint8_t num; - } brightness, seq, demo; + } set_brightness, seq, demo; - struct reg { + struct { uint8_t ctrl, reg, value; } reg; - struct rgb { + struct { uint8_t led, red, green, blue; - } rgb; + } set_rgb; + + struct { + uint8_t led; + } get_rgb; - struct 
lightbar_params set_params; + struct lightbar_params_v0 set_params_v0; + struct lightbar_params_v1 set_params_v1; }; } __packed; struct ec_response_lightbar { union { - struct dump { + struct { struct { uint8_t reg; uint8_t ic0; @@ -1033,20 +1082,26 @@ struct ec_response_lightbar { } vals[23]; } dump; - struct get_seq { + struct { uint8_t num; - } get_seq; + } get_seq, get_brightness, get_demo; - struct lightbar_params get_params; + struct lightbar_params_v0 get_params_v0; + struct lightbar_params_v1 get_params_v1; - struct version { + struct { uint32_t num; uint32_t flags; } version; + struct { + uint8_t red, green, blue; + } get_rgb; + struct { /* no return params */ - } off, on, init, brightness, seq, reg, rgb, demo, set_params; + } off, on, init, set_brightness, seq, reg, set_rgb, + demo, set_params_v0, set_params_v1; }; } __packed; @@ -1056,15 +1111,20 @@ enum lightbar_command { LIGHTBAR_CMD_OFF = 1, LIGHTBAR_CMD_ON = 2, LIGHTBAR_CMD_INIT = 3, - LIGHTBAR_CMD_BRIGHTNESS = 4, + LIGHTBAR_CMD_SET_BRIGHTNESS = 4, LIGHTBAR_CMD_SEQ = 5, LIGHTBAR_CMD_REG = 6, - LIGHTBAR_CMD_RGB = 7, + LIGHTBAR_CMD_SET_RGB = 7, LIGHTBAR_CMD_GET_SEQ = 8, LIGHTBAR_CMD_DEMO = 9, - LIGHTBAR_CMD_GET_PARAMS = 10, - LIGHTBAR_CMD_SET_PARAMS = 11, + LIGHTBAR_CMD_GET_PARAMS_V0 = 10, + LIGHTBAR_CMD_SET_PARAMS_V0 = 11, LIGHTBAR_CMD_VERSION = 12, + LIGHTBAR_CMD_GET_BRIGHTNESS = 13, + LIGHTBAR_CMD_GET_RGB = 14, + LIGHTBAR_CMD_GET_DEMO = 15, + LIGHTBAR_CMD_GET_PARAMS_V1 = 16, + LIGHTBAR_CMD_SET_PARAMS_V1 = 17, LIGHTBAR_NUM_CMDS }; @@ -1421,8 +1481,40 @@ struct ec_response_rtc { /*****************************************************************************/ /* Port80 log access */ +/* Maximum entries that can be read/written in a single command */ +#define EC_PORT80_SIZE_MAX 32 + /* Get last port80 code from previous boot */ #define EC_CMD_PORT80_LAST_BOOT 0x48 +#define EC_CMD_PORT80_READ 0x48 + +enum ec_port80_subcmd { + EC_PORT80_GET_INFO = 0, + EC_PORT80_READ_BUFFER, +}; + +struct 
ec_params_port80_read { + uint16_t subcmd; + union { + struct { + uint32_t offset; + uint32_t num_entries; + } read_buffer; + }; +} __packed; + +struct ec_response_port80_read { + union { + struct { + uint32_t writes; + uint32_t history_size; + uint32_t last_boot; + } get_info; + struct { + uint16_t codes[EC_PORT80_SIZE_MAX]; + } data; + }; +} __packed; struct ec_response_port80_last_boot { uint16_t code; @@ -1782,6 +1874,7 @@ struct ec_params_gpio_set { /* Get GPIO value */ #define EC_CMD_GPIO_GET 0x93 +/* Version 0 of input params and response */ struct ec_params_gpio_get { char name[32]; } __packed; @@ -1789,6 +1882,38 @@ struct ec_response_gpio_get { uint8_t val; } __packed; +/* Version 1 of input params and response */ +struct ec_params_gpio_get_v1 { + uint8_t subcmd; + union { + struct { + char name[32]; + } get_value_by_name; + struct { + uint8_t index; + } get_info; + }; +} __packed; + +struct ec_response_gpio_get_v1 { + union { + struct { + uint8_t val; + } get_value_by_name, get_count; + struct { + uint8_t val; + char name[32]; + uint32_t flags; + } get_info; + }; +} __packed; + +enum gpio_get_subcmd { + EC_GPIO_GET_BY_NAME = 0, + EC_GPIO_GET_COUNT = 1, + EC_GPIO_GET_INFO = 2, +}; + /*****************************************************************************/ /* I2C commands. Only available when flash write protect is unlocked. */ @@ -1857,13 +1982,21 @@ struct ec_params_charge_control { /*****************************************************************************/ /* - * Cut off battery power output if the battery supports. + * Cut off battery power immediately or after the host has shut down. * - * For unsupported battery, just don't implement this command and lets EC - * return EC_RES_INVALID_COMMAND. + * return EC_RES_INVALID_COMMAND if unsupported by a board/battery. + * EC_RES_SUCCESS if the command was successful. + * EC_RES_ERROR if the cut off command failed. 
*/ + #define EC_CMD_BATTERY_CUT_OFF 0x99 +#define EC_BATTERY_CUTOFF_FLAG_AT_SHUTDOWN (1 << 0) + +struct ec_params_battery_cutoff { + uint8_t flags; +} __packed; + /*****************************************************************************/ /* USB port mux control. */ @@ -2141,6 +2274,32 @@ struct ec_params_sb_wr_block { uint16_t data[32]; } __packed; +/*****************************************************************************/ +/* Battery vendor parameters + * + * Get or set vendor-specific parameters in the battery. Implementations may + * differ between boards or batteries. On a set operation, the response + * contains the actual value set, which may be rounded or clipped from the + * requested value. + */ + +#define EC_CMD_BATTERY_VENDOR_PARAM 0xb4 + +enum ec_battery_vendor_param_mode { + BATTERY_VENDOR_PARAM_MODE_GET = 0, + BATTERY_VENDOR_PARAM_MODE_SET, +}; + +struct ec_params_battery_vendor_param { + uint32_t param; + uint32_t value; + uint8_t mode; +} __packed; + +struct ec_response_battery_vendor_param { + uint32_t value; +} __packed; + /*****************************************************************************/ /* System commands */ @@ -2336,6 +2495,80 @@ struct ec_params_reboot_ec { #endif /* !__ACPI__ */ +/*****************************************************************************/ +/* + * PD commands + * + * These commands are for PD MCU communication. 
+ */ + +/* EC to PD MCU exchange status command */ +#define EC_CMD_PD_EXCHANGE_STATUS 0x100 + +/* Status of EC being sent to PD */ +struct ec_params_pd_status { + int8_t batt_soc; /* battery state of charge */ +} __packed; + +/* Status of PD being sent back to EC */ +struct ec_response_pd_status { + int8_t status; /* PD MCU status */ + uint32_t curr_lim_ma; /* input current limit */ +} __packed; + +/* Set USB type-C port role and muxes */ +#define EC_CMD_USB_PD_CONTROL 0x101 + +enum usb_pd_control_role { + USB_PD_CTRL_ROLE_NO_CHANGE = 0, + USB_PD_CTRL_ROLE_TOGGLE_ON = 1, /* == AUTO */ + USB_PD_CTRL_ROLE_TOGGLE_OFF = 2, + USB_PD_CTRL_ROLE_FORCE_SINK = 3, + USB_PD_CTRL_ROLE_FORCE_SOURCE = 4, +}; + +enum usb_pd_control_mux { + USB_PD_CTRL_MUX_NO_CHANGE = 0, + USB_PD_CTRL_MUX_NONE = 1, + USB_PD_CTRL_MUX_USB = 2, + USB_PD_CTRL_MUX_DP = 3, + USB_PD_CTRL_MUX_DOCK = 4, + USB_PD_CTRL_MUX_AUTO = 5, +}; + +struct ec_params_usb_pd_control { + uint8_t port; + uint8_t role; + uint8_t mux; +} __packed; + +/*****************************************************************************/ +/* + * Passthru commands + * + * Some platforms have sub-processors chained to each other. For example. + * + * AP <--> EC <--> PD MCU + * + * The top 2 bits of the command number are used to indicate which device the + * command is intended for. Device 0 is always the device receiving the + * command; other device mapping is board-specific. + * + * When a device receives a command to be passed to a sub-processor, it passes + * it on with the device number set back to 0. This allows the sub-processor + * to remain blissfully unaware of whether the command originated on the next + * device up the chain, or was passed through from the AP. 
+ * + * In the above example, if the AP wants to send command 0x0002 to the PD MCU, + * AP sends command 0x4002 to the EC + * EC sends command 0x0002 to the PD MCU + * EC forwards PD MCU response back to the AP + */ + +/* Offset and max command number for sub-device n */ +#define EC_CMD_PASSTHRU_OFFSET(n) (0x4000 * (n)) +#define EC_CMD_PASSTHRU_MAX(n) (EC_CMD_PASSTHRU_OFFSET(n) + 0x3fff) + /*****************************************************************************/ /* * Deprecated constants. These constants have been renamed for clarity. The -- cgit v1.2.3 From 2c7589af3c4dee844e6a4174f2aa8996cf837604 Mon Sep 17 00:00:00 2001 From: Stephen Barber Date: Tue, 9 Jun 2015 13:04:45 +0200 Subject: mfd: cros_ec: add proto v3 skeleton Add support in cros_ec.c to handle EC host command protocol v3. For v3+, probe for maximum shared protocol version and max request, response, and passthrough sizes. For now, this will always fall back to v2, since there is no bus-specific code for handling proto v3 packets. Signed-off-by: Stephen Barber Signed-off-by: Javier Martinez Canillas Reviewed-by: Gwendal Grignou Tested-by: Gwendal Grignou Tested-by: Heiko Stuebner Acked-by: Lee Jones Acked-by: Olof Johansson Signed-off-by: Lee Jones --- include/linux/mfd/cros_ec.h | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h index 7eee38abd02a..59d909434efd 100644 --- a/include/linux/mfd/cros_ec.h +++ b/include/linux/mfd/cros_ec.h @@ -21,6 +21,15 @@ #include #include +/* + * Max bus-specific overhead incurred by request/responses. + * I2C requires 1 additional byte for requests. + * I2C requires 2 additional bytes for responses. + * */ +#define EC_PROTO_VERSION_UNKNOWN 0 +#define EC_MAX_REQUEST_OVERHEAD 1 +#define EC_MAX_RESPONSE_OVERHEAD 2 + /* * Command interface between EC and AP, for LPC, I2C and SPI interfaces. 
*/ @@ -88,6 +97,7 @@ struct cros_ec_command { * Returns the number of bytes received if the communication succeeded, but * that doesn't mean the EC was happy with the command. The caller * should check msg.result for the EC's result code. + * @pkt_xfer: send packet to EC and get response * @lock: one transaction at a time */ struct cros_ec_device { @@ -104,15 +114,21 @@ struct cros_ec_device { unsigned int bytes, void *dest); /* These are used to implement the platform-specific interface */ + u16 max_request; + u16 max_response; + u16 max_passthru; + u16 proto_version; void *priv; int irq; - uint8_t *din; - uint8_t *dout; + u8 *din; + u8 *dout; int din_size; int dout_size; bool wake_enabled; int (*cmd_xfer)(struct cros_ec_device *ec, struct cros_ec_command *msg); + int (*pkt_xfer)(struct cros_ec_device *ec, + struct cros_ec_command *msg); struct mutex lock; }; @@ -194,4 +210,12 @@ int cros_ec_remove(struct cros_ec_device *ec_dev); */ int cros_ec_register(struct cros_ec_device *ec_dev); +/** + * cros_ec_query_all - Query the protocol version supported by the ChromeOS EC + * + * @ec_dev: Device to query + * @return 0 if ok, -ve on error + */ +int cros_ec_query_all(struct cros_ec_device *ec_dev); + #endif /* __LINUX_MFD_CROS_EC_H */ -- cgit v1.2.3 From d365407079d33106f76bd486a863de05eb5ae95d Mon Sep 17 00:00:00 2001 From: Stephen Barber Date: Tue, 9 Jun 2015 13:04:46 +0200 Subject: mfd: cros_ec: add bus-specific proto v3 code Add proto v3 support to the SPI, I2C, and LPC. 
Signed-off-by: Stephen Barber Signed-off-by: Javier Martinez Canillas Tested-by: Heiko Stuebner Reviewed-by: Gwendal Grignou Tested-by: Gwendal Grignou Acked-by: Lee Jones Acked-by: Olof Johansson Signed-off-by: Lee Jones --- include/linux/mfd/cros_ec.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h index 59d909434efd..92e13aaa450c 100644 --- a/include/linux/mfd/cros_ec.h +++ b/include/linux/mfd/cros_ec.h @@ -21,6 +21,12 @@ #include #include +/* + * The EC is unresponsive for a time after a reboot command. Add a + * simple delay to make sure that the bus stays locked. + */ +#define EC_REBOOT_DELAY_MS 50 + /* * Max bus-specific overhead incurred by request/responses. * I2C requires 1 additional byte for requests. -- cgit v1.2.3 From 57b33ff077beebb68481a2b6b8e5fe58ca998169 Mon Sep 17 00:00:00 2001 From: Gwendal Grignou Date: Tue, 9 Jun 2015 13:04:47 +0200 Subject: mfd: cros_ec: Support multiple EC in a system Chromebooks can have more than one Embedded Controller so the cros_ec device id has to be incremented for each EC registered. Add a new structure to represent multiple EC as different char devices (e.g: /dev/cros_ec, /dev/cros_pd). It connects to cros_ec_device and allows sysfs interface for cros_pd. Also reduce number of allocated objects, make chromeos sysfs class object a static and add refcounting to prevent object deletion while command is in progress. 
Signed-off-by: Gwendal Grignou Reviewed-by: Dmitry Torokhov Signed-off-by: Javier Martinez Canillas Tested-by: Heiko Stuebner Acked-by: Lee Jones Acked-by: Olof Johansson Signed-off-by: Lee Jones --- include/linux/mfd/cros_ec.h | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h index 92e13aaa450c..da72671a42fa 100644 --- a/include/linux/mfd/cros_ec.h +++ b/include/linux/mfd/cros_ec.h @@ -17,10 +17,14 @@ #define __LINUX_MFD_CROS_EC_H #include +#include #include #include #include +#define CROS_EC_DEV_NAME "cros_ec" +#define CROS_EC_DEV_PD_NAME "cros_pd" + /* * The EC is unresponsive for a time after a reboot command. Add a * simple delay to make sure that the bus stays locked. @@ -71,11 +75,8 @@ struct cros_ec_command { /** * struct cros_ec_device - Information about a ChromeOS EC device * - * @ec_name: name of EC device (e.g. 'chromeos-ec') * @phys_name: name of physical comms layer (e.g. 
'i2c-4') * @dev: Device pointer for physical comms device - * @vdev: Device pointer for virtual comms device - * @cdev: Character device structure for virtual comms device * @was_wake_device: true if this device was set to wake the system from * sleep at the last suspend * @cmd_readmem: direct read of the EC memory-mapped region, if supported @@ -87,6 +88,7 @@ struct cros_ec_command { * * @priv: Private data * @irq: Interrupt to use + * @id: Device id * @din: input buffer (for data from EC) * @dout: output buffer (for data to EC) * \note @@ -109,11 +111,8 @@ struct cros_ec_command { struct cros_ec_device { /* These are used by other drivers that want to talk to the EC */ - const char *ec_name; const char *phys_name; struct device *dev; - struct device *vdev; - struct cdev cdev; bool was_wake_device; struct class *cros_class; int (*cmd_readmem)(struct cros_ec_device *ec, unsigned int offset, @@ -138,6 +137,35 @@ struct cros_ec_device { struct mutex lock; }; +/* struct cros_ec_platform - ChromeOS EC platform information + * + * @ec_name: name of EC device (e.g. 'cros-ec', 'cros-pd', ...) + * used in /dev/ and sysfs. + * @cmd_offset: offset to apply for each command. Set when + * registering a device behind another one. + */ +struct cros_ec_platform { + const char *ec_name; + u16 cmd_offset; +}; + +/* + * struct cros_ec_dev - ChromeOS EC device entry point + * + * @class_dev: Device structure used in sysfs + * @cdev: Character device structure in /dev + * @ec_dev: cros_ec_device structure to talk to the physical device + * @dev: pointer to the platform device + * @cmd_offset: offset to apply for each command. 
+ */ +struct cros_ec_dev { + struct device class_dev; + struct cdev cdev; + struct cros_ec_device *ec_dev; + struct device *dev; + u16 cmd_offset; +}; + /** * cros_ec_suspend - Handle a suspend operation for the ChromeOS EC device * @@ -224,4 +252,8 @@ int cros_ec_register(struct cros_ec_device *ec_dev); */ int cros_ec_query_all(struct cros_ec_device *ec_dev); +/* sysfs stuff */ +extern struct attribute_group cros_ec_attr_group; +extern struct attribute_group cros_ec_lightbar_attr_group; + #endif /* __LINUX_MFD_CROS_EC_H */ -- cgit v1.2.3 From d0562674838c08ff142c0e9a8e12634e133c4361 Mon Sep 17 00:00:00 2001 From: "Suthikulpanit, Suravee" Date: Wed, 10 Jun 2015 11:08:52 -0500 Subject: ACPI / scan: Parse _CCA and setup device coherency This patch implements support for ACPI _CCA object, which is introduced in ACPIv5.1, can be used for specifying device DMA coherency attribute. The parsing logic traverses device namespace to parse coherency information, and stores it in acpi_device_flags. Then uses it to call arch_setup_dma_ops() when creating each device enumerated in DSDT during ACPI scan. This patch also introduces acpi_dma_is_coherent(), which provides an interface for device drivers to check the coherency information similarly to the of_dma_is_coherent(). Signed-off-by: Mark Salter Signed-off-by: Suravee Suthikulpanit Signed-off-by: Rafael J. 
Wysocki --- include/linux/acpi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index e4da5e35e29c..d46a48c21c67 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -569,6 +569,11 @@ static inline int acpi_device_modalias(struct device *dev, return -ENODEV; } +static inline bool acpi_check_dma(struct acpi_device *adev, bool *coherent) +{ + return false; +} + #define ACPI_PTR(_ptr) (NULL) #endif /* !CONFIG_ACPI */ -- cgit v1.2.3 From 05ca556003b1d6b4df0b8831e4c07fad7f5bdd2c Mon Sep 17 00:00:00 2001 From: "Suthikulpanit, Suravee" Date: Wed, 10 Jun 2015 11:08:54 -0500 Subject: device property: Introduces device_dma_is_coherent() Currently, device drivers, which support both OF and ACPI, need to call two separate APIs, of_dma_is_coherent() and acpi_dma_is_coherent()) to determine device coherency attribute. This patch simplifies this process by introducing a new device property API, device_dma_is_coherent(), which calls the appropriate interface based on the booting architecture. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Rafael J. Wysocki --- include/linux/property.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index de8bdf417a35..76ebde9c11d4 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -164,4 +164,6 @@ struct property_set { void device_add_property_set(struct device *dev, struct property_set *pset); +bool device_dma_is_coherent(struct device *dev); + #endif /* _LINUX_PROPERTY_H_ */ -- cgit v1.2.3 From 711bdde6a884354ddae8da2fcb495b2a9364cc90 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Jun 2015 09:57:30 -0700 Subject: netfilter: x_tables: remove XT_TABLE_INFO_SZ and a dereference. After Florian patches, there is no need for XT_TABLE_INFO_SZ anymore : Only one copy of table is kept, instead of one copy per cpu. 
We also can avoid a dereference if we put table data right after xt_table_info. It reduces register pressure and helps compiler. Then, we attempt a kmalloc() if total size is under order-3 allocation, to reduce TLB pressure, as in many cases, rules fit in 32 KB. Signed-off-by: Eric Dumazet Cc: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 9969d79dcde1..95693c4cebdd 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -225,12 +225,9 @@ struct xt_table_info { unsigned int __percpu *stackptr; void ***jumpstack; - /* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */ - void *entries; + unsigned char entries[0] __aligned(8); }; -#define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \ - + nr_cpu_ids * sizeof(char *)) int xt_register_target(struct xt_target *target); void xt_unregister_target(struct xt_target *target); int xt_register_targets(struct xt_target *target, unsigned int n); -- cgit v1.2.3 From 6f6a6fda294506dfe0e3e0a253bb2d2923f28f0a Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 15 Jun 2015 14:36:01 -0400 Subject: jbd2: fix ocfs2 corrupt when updating journal superblock fails If updating journal superblock fails after journal data has been flushed, the error is omitted and this will mislead the caller as a normal case. In ocfs2, the checkpoint will be treated successfully and the other node can get the lock to update. Since the sb_start is still pointing to the old log block, it will rewrite the journal data during journal recovery by the other node. Thus the new updates will be overwritten and ocfs2 corrupts. So in above case we have to return the error, and ocfs2_commit_cache will take care of the error and prevent the other node to do update first. 
And only after recovering journal it can do the new updates. The issue discussion mail can be found at: https://oss.oracle.com/pipermail/ocfs2-devel/2015-June/010856.html http://comments.gmane.org/gmane.comp.file-systems.ext4/48841 [ Fixed bug in patch which allowed a non-negative error return from jbd2_cleanup_journal_tail() to leak out of jbd2_fjournal_flush(); this was causing xfstests ext4/306 to fail. -- Ted ] Reported-by: Yiwen Jiang Signed-off-by: Joseph Qi Signed-off-by: Theodore Ts'o Tested-by: Yiwen Jiang Cc: Junxiao Bi Cc: stable@vger.kernel.org --- include/linux/jbd2.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 20e7f78041c8..edb640ae9a94 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1035,7 +1035,7 @@ struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal); int jbd2_journal_next_log_block(journal_t *, unsigned long long *); int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, unsigned long *block); -void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); +int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); /* Commit management */ @@ -1157,7 +1157,7 @@ extern int jbd2_journal_recover (journal_t *journal); extern int jbd2_journal_wipe (journal_t *, int); extern int jbd2_journal_skip_recovery (journal_t *); extern void jbd2_journal_update_sb_errno(journal_t *); -extern void jbd2_journal_update_sb_log_tail (journal_t *, tid_t, +extern int jbd2_journal_update_sb_log_tail (journal_t *, tid_t, unsigned long, int); extern void __jbd2_journal_abort_hard (journal_t *); extern void jbd2_journal_abort (journal_t *, int); -- cgit v1.2.3 From ffeedafbf0236f03aeb2e8db273b3e5ae5f5bc89 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 12 Jun 2015 19:39:12 -0700 Subject: bpf: 
introduce current->pid, tgid, uid, gid, comm accessors eBPF programs attached to kprobes need to filter based on current->pid, uid and other fields, so introduce helper functions: u64 bpf_get_current_pid_tgid(void) Return: current->tgid << 32 | current->pid u64 bpf_get_current_uid_gid(void) Return: current_gid << 32 | current_uid bpf_get_current_comm(char *buf, int size_of_buf) stores current->comm into buf They can be used from the programs attached to TC as well to classify packets based on current task fields. Update tracex2 example to print histogram of write syscalls for each process instead of aggregated for all. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2235aee8096a..1b9a3f5b27f6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -188,5 +188,8 @@ extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct bpf_func_proto bpf_ktime_get_ns_proto; +extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; +extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; +extern const struct bpf_func_proto bpf_get_current_comm_proto; #endif /* _LINUX_BPF_H */ -- cgit v1.2.3 From 0756ea3e85139d23a8148ebaa95411c2f0aa4f11 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 12 Jun 2015 19:39:13 -0700 Subject: bpf: allow networking programs to use bpf_trace_printk() for debugging bpf_trace_printk() is a helper function used to debug eBPF programs. Let socket and TC programs use it as well. Note, it's DEBUG ONLY helper. If it's used in the program, the kernel will print warning banner to make sure users don't use it in production. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1b9a3f5b27f6..4383476a0d48 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -150,6 +150,7 @@ struct bpf_array { u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); void bpf_prog_array_map_clear(struct bpf_map *map); bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); +const struct bpf_func_proto *bpf_get_trace_printk_proto(void); #ifdef CONFIG_BPF_SYSCALL void bpf_register_prog_type(struct bpf_prog_type_list *tl); -- cgit v1.2.3 From 9464ca650008615d28d9aaf7de99b4edf61f0109 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Jun 2015 19:44:48 -0700 Subject: net: make u64_stats_init() a function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using a function instead of a macro is cleaner and remove following W=1 warnings (extract) In file included from net/ipv6/ip6_vti.c:29:0: net/ipv6/ip6_vti.c: In function ‘vti6_dev_init_gen’: include/linux/netdevice.h:2029:18: warning: variable ‘stat’ set but not used [-Wunused-but-set-variable] typeof(type) *stat; \ ^ net/ipv6/ip6_vti.c:862:16: note: in expansion of macro ‘netdev_alloc_pcpu_stats’ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); ^ CC [M] net/ipv6/sit.o In file included from net/ipv6/sit.c:30:0: net/ipv6/sit.c: In function ‘ipip6_tunnel_init’: include/linux/netdevice.h:2029:18: warning: variable ‘stat’ set but not used [-Wunused-but-set-variable] typeof(type) *stat; \ ^ Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/u64_stats_sync.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index 4b4439e75f45..df89c9bcba7d 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -68,11 +68,12 @@ struct u64_stats_sync { }; +static inline void u64_stats_init(struct u64_stats_sync *syncp) +{ #if BITS_PER_LONG == 32 && defined(CONFIG_SMP) -# define u64_stats_init(syncp) seqcount_init(syncp.seq) -#else -# define u64_stats_init(syncp) do { } while (0) + seqcount_init(&syncp->seq); #endif +} static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) { -- cgit v1.2.3 From 47d8417f5914012c794684f651213ffae1b91619 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 15 Jun 2015 17:58:58 +0300 Subject: net/mlx4_core: Add sink counter Reserve the last valid counter index for "sink" counter, when a new counter cannot be allocated, the driver will use this counter. In order to avoid allocating this counter on any other flow, fix the indices bitmap allocation range, and reserve the sink counter index. Add macro for the sink counter index and replace all appearances of the index with the macro. Signed-off-by: Eran Ben Elisha Signed-off-by: Hadar Hen Zion Signed-off-by: Or Gerlitz Signed-off-by: David S. 
Miller --- include/linux/mlx4/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index ad31e476873f..312c50420dde 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -957,6 +957,7 @@ struct mlx4_mad_ifc { ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE)) #define MLX4_INVALID_SLAVE_ID 0xFF +#define MLX4_SINK_COUNTER_INDEX(dev) (dev->caps.max_counters - 1) void handle_port_mgmt_change_event(struct work_struct *work); -- cgit v1.2.3 From 6de5f7f6a1fa2288552d46b3effbb6d5571413e5 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 15 Jun 2015 17:59:02 +0300 Subject: net/mlx4_core: Allocate default counter per port Default counter per port will be allocated at the mlx4 core driver load. Every QP opened by the Ethernet driver will be attached to the port's default counter. This is an infrastructure step to collect VF statistics from the PF. Signed-off-by: Eran Ben Elisha Signed-off-by: Hadar Hen Zion Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 312c50420dde..4820080ac394 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -1348,6 +1348,7 @@ int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port); int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx); void mlx4_counter_free(struct mlx4_dev *dev, u32 idx); +int mlx4_get_default_counter_index(struct mlx4_dev *dev, int port); void mlx4_set_admin_guid(struct mlx4_dev *dev, __be64 guid, int entry, int port); -- cgit v1.2.3 From 9616982f3fcc9e6577d7f41009c4ef2df19a71ec Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 15 Jun 2015 17:59:05 +0300 Subject: net/mlx4_core: Add helper to query counters This is an infrastructure step for querying VF and PF counters. 
This code was in the IB driver, move it to the mlx4 core driver so it will be accessible for more use cases. Signed-off-by: Eran Ben Elisha Signed-off-by: Hadar Hen Zion Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx4/cmd.h | 3 +++ include/linux/mlx4/device.h | 8 ++++++++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index f62e7cf227c6..5dffc869988b 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -35,6 +35,7 @@ #include #include +#include enum { /* initialization and general commands */ @@ -300,6 +301,8 @@ static inline int mlx4_cmd_imm(struct mlx4_dev *dev, u64 in_param, u64 *out_para struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev); void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox); +int mlx4_get_counter_stats(struct mlx4_dev *dev, int counter_index, + struct mlx4_counter *counter_stats, int reset); u32 mlx4_comm_get_version(void); int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u64 mac); int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 4820080ac394..efe80c754b2f 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -771,6 +771,14 @@ union mlx4_ext_av { struct mlx4_eth_av eth; }; +/* Counters should saturate once they reach their maximum value */ +#define ASSIGN_32BIT_COUNTER(counter, value) do { \ + if ((value) > U32_MAX) \ + counter = cpu_to_be32(U32_MAX); \ + else \ + counter = cpu_to_be32(value); \ +} while (0) + struct mlx4_counter { u8 reserved1[3]; u8 counter_mode; -- cgit v1.2.3 From 3b766cd832328fcb87db3507e7b98cf42f21689d Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 15 Jun 2015 17:59:07 +0300 Subject: net/core: Add reading VF statistics through the PF netdevice Add ndo_get_vf_stats where the PF retrieves and 
fills the VFs traffic statistics. We encode the VF stats in a nested manner to allow for future extensions. Signed-off-by: Eran Ben Elisha Signed-off-by: Hadar Hen Zion Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/if_link.h | 9 +++++++++ include/linux/netdevice.h | 4 ++++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_link.h b/include/linux/if_link.h index da4929927f69..ae5d0d22955d 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -5,6 +5,15 @@ /* We don't want this structure exposed to user space */ +struct ifla_vf_stats { + __u64 rx_packets; + __u64 tx_packets; + __u64 rx_bytes; + __u64 tx_bytes; + __u64 broadcast; + __u64 multicast; +}; + struct ifla_vf_info { __u32 vf; __u8 mac[32]; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6f5f71ff5169..e20979dfd6a9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1100,6 +1100,10 @@ struct net_device_ops { struct ifla_vf_info *ivf); int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state); + int (*ndo_get_vf_stats)(struct net_device *dev, + int vf, + struct ifla_vf_stats + *vf_stats); int (*ndo_set_vf_port)(struct net_device *dev, int vf, struct nlattr *port[]); -- cgit v1.2.3 From 62a890557f57e6cbebe9cc6c32aef045405d4fa2 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 15 Jun 2015 17:59:08 +0300 Subject: net/mlx4_en: Support ndo_get_vf_stats Implement the ndo to gather VF statistics through the PF. All counters related to this VF are stored in a per slave list, run over the slave's list and collect all statistics. Signed-off-by: Eran Ben Elisha Signed-off-by: Hadar Hen Zion Signed-off-by: Or Gerlitz Signed-off-by: David S. 
Miller --- include/linux/mlx4/cmd.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index 5dffc869988b..58391f2e0414 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -36,6 +36,7 @@ #include #include #include +#include enum { /* initialization and general commands */ @@ -303,6 +304,8 @@ void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbo int mlx4_get_counter_stats(struct mlx4_dev *dev, int counter_index, struct mlx4_counter *counter_stats, int reset); +int mlx4_get_vf_stats(struct mlx4_dev *dev, int port, int vf_idx, + struct ifla_vf_stats *vf_stats); u32 mlx4_comm_get_version(void); int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u64 mac); int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos); -- cgit v1.2.3 From eb4cb008529ca08e0d8c0fa54e8f739520197a65 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Mon, 15 Jun 2015 11:26:18 -0400 Subject: sock_diag: define destruction multicast groups These groups will contain socket-destruction events for AF_INET/AF_INET6, IPPROTO_TCP/IPPROTO_UDP. Near the end of socket destruction, a check for listeners is performed. In the presence of a listener, rather than completely cleanup the socket, a unit of work will be added to a private work queue which will first broadcast information about the socket and then finish the cleanup operation. Signed-off-by: Craig Gallek Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/sock_diag.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h index 083ac388098e..fddebc617469 100644 --- a/include/linux/sock_diag.h +++ b/include/linux/sock_diag.h @@ -1,7 +1,10 @@ #ifndef __SOCK_DIAG_H__ #define __SOCK_DIAG_H__ +#include #include +#include +#include #include struct sk_buff; @@ -11,6 +14,7 @@ struct sock; struct sock_diag_handler { __u8 family; int (*dump)(struct sk_buff *skb, struct nlmsghdr *nlh); + int (*get_info)(struct sk_buff *skb, struct sock *sk); }; int sock_diag_register(const struct sock_diag_handler *h); @@ -26,4 +30,42 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attr); int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk, struct sk_buff *skb, int attrtype); +static inline +enum sknetlink_groups sock_diag_destroy_group(const struct sock *sk) +{ + switch (sk->sk_family) { + case AF_INET: + switch (sk->sk_protocol) { + case IPPROTO_TCP: + return SKNLGRP_INET_TCP_DESTROY; + case IPPROTO_UDP: + return SKNLGRP_INET_UDP_DESTROY; + default: + return SKNLGRP_NONE; + } + case AF_INET6: + switch (sk->sk_protocol) { + case IPPROTO_TCP: + return SKNLGRP_INET6_TCP_DESTROY; + case IPPROTO_UDP: + return SKNLGRP_INET6_UDP_DESTROY; + default: + return SKNLGRP_NONE; + } + default: + return SKNLGRP_NONE; + } +} + +static inline +bool sock_diag_has_destroy_listeners(const struct sock *sk) +{ + const struct net *n = sock_net(sk); + const enum sknetlink_groups group = sock_diag_destroy_group(sk); + + return group != SKNLGRP_NONE && n->diag_nlsk && + netlink_has_listeners(n->diag_nlsk, group); +} +void sock_diag_broadcast_destroy(struct sock *sk); + #endif -- cgit v1.2.3 From 3fd22af808f4d7455ba91596d334438c7ee0f889 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Mon, 15 Jun 2015 11:26:19 -0400 Subject: sock_diag: specify info_size per inet protocol 
Previously, there was no clear distinction between the inet protocols that used struct tcp_info to report information and those that didn't. This change adds a specific size attribute to the inet_diag_handler struct which defines these interfaces. This will make dispatching sock_diag get_info requests identical for all inet protocols in a following patch. Tested: ss -au Tested: ss -at Signed-off-by: Craig Gallek Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/inet_diag.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index ac48b10c9395..0e707f0c1a3e 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -24,6 +24,7 @@ struct inet_diag_handler { struct inet_diag_msg *r, void *info); __u16 idiag_type; + __u16 idiag_info_size; }; struct inet_connection_sock; -- cgit v1.2.3 From d37e296979ed1652aec6850e2d736bd0ebf0cdb1 Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Mon, 15 Jun 2015 13:18:36 -0700 Subject: MPILIB: add mpi_read_buf() and mpi_get_size() helpers Added a mpi_read_buf() helper function to export MPI to a buf provided by the user, and a mpi_get_size() helper, that tells the user how big the buf is. Changed mpi_free to use kzfree instead of kfree because it is used to free crypto keys. 
Signed-off-by: Tadeusz Struk Signed-off-by: Herbert Xu --- include/linux/mpi.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mpi.h b/include/linux/mpi.h index 5af1b81def49..641b7d6fd096 100644 --- a/include/linux/mpi.h +++ b/include/linux/mpi.h @@ -81,6 +81,8 @@ MPI mpi_read_from_buffer(const void *buffer, unsigned *ret_nread); int mpi_fromstr(MPI val, const char *str); u32 mpi_get_keyid(MPI a, u32 *keyid); void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign); +int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes, + int *sign); void *mpi_get_secure_buffer(MPI a, unsigned *nbytes, int *sign); int mpi_set_buffer(MPI a, const void *buffer, unsigned nbytes, int sign); @@ -142,4 +144,17 @@ int mpi_rshift(MPI x, MPI a, unsigned n); /*-- mpi-inv.c --*/ int mpi_invm(MPI x, MPI u, MPI v); +/* inline functions */ + +/** + * mpi_get_size() - returns max size required to store the number + * + * @a: A multi precision integer for which we want to allocate a buffer + * + * Return: size required to store the number + */ +static inline unsigned int mpi_get_size(MPI a) +{ + return a->nlimbs * BYTES_PER_MPI_LIMB; +} #endif /*G10_MPI_H */ -- cgit v1.2.3 From c7cfc94096db28d3072b402c224eb50349926e24 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 1 Jun 2015 16:05:10 +0800 Subject: genirq: Enhance irq_data_to_desc() to support hierarchy irqdomain For irq associated with hierarchy irqdomains, there will be multiple irq_datas for one irq_desc. So enhance irq_data_to_desc() to support hierarchy irqdomain. Also export irq_data_to_desc() as an inline function for later reuse. 
Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Marc Zyngier Link: http://lkml.kernel.org/r/1433145945-789-2-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- include/linux/irqdesc.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index dd1109fb241e..a113a8dc7438 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -93,6 +93,15 @@ struct irq_desc { extern struct irq_desc irq_desc[NR_IRQS]; #endif +static inline struct irq_desc *irq_data_to_desc(struct irq_data *data) +{ +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + return irq_to_desc(data->irq); +#else + return container_of(data, struct irq_desc, irq_data); +#endif +} + static inline struct irq_data *irq_desc_get_irq_data(struct irq_desc *desc) { return &desc->irq_data; -- cgit v1.2.3 From 4158c2eca3c77ed3cccdcaeab153aad4e433369c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 12 Jun 2015 10:14:02 +0200 Subject: iommu/vt-d: Detect pre enabled translation Add code to detect whether translation is already enabled in the IOMMU. Save this state in a flags field added to struct intel_iommu. 
Tested-by: ZhenHua Li Tested-by: Baoquan He Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index a240e61a7700..b85b81ad5eba 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -320,6 +320,9 @@ enum { MAX_SR_DMAR_REGS }; +#define VTD_FLAG_TRANS_PRE_ENABLED (1 << 0) +#define VTD_FLAG_IRQ_REMAP_PRE_ENABLED (1 << 1) + struct intel_iommu { void __iomem *reg; /* Pointer to hardware regs, virtual addr */ u64 reg_phys; /* physical address of hw register set */ @@ -351,6 +354,7 @@ struct intel_iommu { #endif struct device *iommu_dev; /* IOMMU-sysfs device */ int node; + u32 flags; /* Software defined flags */ }; static inline void __iommu_flush_cache( -- cgit v1.2.3 From af3b358e48115588d905cc07a47b3f356e0d01d1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 12 Jun 2015 15:00:21 +0200 Subject: iommu/vt-d: Copy IR table from old kernel when in kdump mode When we are booting into a kdump kernel and find IR enabled, copy over the contents of the previous IR table so that spurious interrupts will not be target aborted. 
Tested-by: ZhenHua Li Tested-by: Baoquan He Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index b85b81ad5eba..9e14edcf7f6e 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -296,6 +296,7 @@ struct q_inval { /* 1MB - maximum possible interrupt remapping table size */ #define INTR_REMAP_PAGE_ORDER 8 #define INTR_REMAP_TABLE_REG_SIZE 0xf +#define INTR_REMAP_TABLE_REG_SIZE_MASK 0xf #define INTR_REMAP_TABLE_ENTRIES 65536 -- cgit v1.2.3 From 9f3520c3115b451ac1301779fc3c769d94907a70 Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Fri, 8 May 2015 18:19:05 +1000 Subject: wait: introduce wait_event_exclusive_cmd It's just a variant of wait_event_cmd(), with exclusive flag being set. For cases like RAID5, which puts many processes to sleep until 1/4 resources are free, a wake_up wakes up all processes to run, but there is one process being able to get the resource as it's protected by a spin lock. That ends up introducing heavy lock contentions, and hurts performance badly. Here introduce wait_event_exclusive_cmd to relieve the lock contention naturally by letting wake_up just wake up one process. 
Cc: Ingo Molnar Cc: Peter Zijlstra v2: its assumed that wait*() and __wait*() have the same arguments - peterz Acked-by: Peter Zijlstra (Intel) Signed-off-by: Yuanhan Liu Signed-off-by: NeilBrown --- include/linux/wait.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 2db83349865b..db78c7204947 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -358,6 +358,19 @@ do { \ __ret; \ }) +#define __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2) \ + (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 1, 0, \ + cmd1; schedule(); cmd2) +/* + * Just like wait_event_cmd(), except it sets exclusive flag + */ +#define wait_event_exclusive_cmd(wq, condition, cmd1, cmd2) \ +do { \ + if (condition) \ + break; \ + __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2); \ +} while (0) + #define __wait_event_cmd(wq, condition, cmd1, cmd2) \ (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ cmd1; schedule(); cmd2) -- cgit v1.2.3 From 3c339ab83fc09d9d91fb7e8b4a60e8ddc91de417 Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Tue, 16 Jun 2015 10:30:55 -0700 Subject: crypto: akcipher - add PKE API Add Public Key Encryption API. Signed-off-by: Tadeusz Struk Made CRYPTO_AKCIPHER invisible like other type config options. 
Signed-off-by: Herbert Xu --- include/linux/crypto.h | 1 + include/linux/cryptouser.h | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 25a4b71d6d1f..0e3f71a73e3b 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -53,6 +53,7 @@ #define CRYPTO_ALG_TYPE_SHASH 0x00000009 #define CRYPTO_ALG_TYPE_AHASH 0x0000000a #define CRYPTO_ALG_TYPE_RNG 0x0000000c +#define CRYPTO_ALG_TYPE_AKCIPHER 0x0000000d #define CRYPTO_ALG_TYPE_PCOMPRESS 0x0000000f #define CRYPTO_ALG_TYPE_HASH_MASK 0x0000000e diff --git a/include/linux/cryptouser.h b/include/linux/cryptouser.h index 4abf2ea6a887..36efbbbf2f83 100644 --- a/include/linux/cryptouser.h +++ b/include/linux/cryptouser.h @@ -43,6 +43,7 @@ enum crypto_attr_type_t { CRYPTOCFGA_REPORT_COMPRESS, /* struct crypto_report_comp */ CRYPTOCFGA_REPORT_RNG, /* struct crypto_report_rng */ CRYPTOCFGA_REPORT_CIPHER, /* struct crypto_report_cipher */ + CRYPTOCFGA_REPORT_AKCIPHER, /* struct crypto_report_akcipher */ __CRYPTOCFGA_MAX #define CRYPTOCFGA_MAX (__CRYPTOCFGA_MAX - 1) @@ -101,5 +102,9 @@ struct crypto_report_rng { unsigned int seedsize; }; +struct crypto_report_akcipher { + char type[CRYPTO_MAX_NAME]; +}; + #define CRYPTO_REPORT_MAXSIZE (sizeof(struct crypto_user_alg) + \ sizeof(struct crypto_report_blkcipher)) -- cgit v1.2.3 From 46b15caa7cb19b0f6e3bc8ebaee5bc1bb2e35110 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Jun 2015 18:48:31 -0400 Subject: vfs, writeback: replace FS_CGROUP_WRITEBACK with SB_I_CGROUPWB FS_CGROUP_WRITEBACK indicates whether a file_system_type supports cgroup writeback; however, different super_blocks of the same file_system_type may or may not support cgroup writeback depending on filesystem options. This patch replaces FS_CGROUP_WRITEBACK with a per-super_block flag. 
super_block->s_flags carries some internal flags in the high bits but it's exposed to userland through uapi header and running out of space anyway. This patch adds a new field super_block->s_iflags to carry kernel-internal flags. It is currently only used by the new SB_I_CGROUPWB flag whose concatenated and abbreviated name is for consistency with other super_block flags. ext2_fill_super() is updated to set SB_I_CGROUPWB. v2: Added super_block->s_iflags instead of stealing another high bit from sb->s_flags as suggested by Christoph and Jan. Signed-off-by: Tejun Heo Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Cc: Christoph Hellwig Cc: Jan Kara Cc: linux-ext4@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 2 +- include/linux/fs.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index dfce80869145..a13181a42b9a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -260,7 +260,7 @@ static inline bool inode_cgwb_enabled(struct inode *inode) return bdi_cap_account_dirty(bdi) && (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) && - (inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK); + (inode->i_sb->s_iflags & SB_I_CGROUPWB); } /** diff --git a/include/linux/fs.h b/include/linux/fs.h index b5e1dcfbc5e3..2c5e33a5b2af 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1241,6 +1241,8 @@ struct mm_struct; #define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */ #define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */ +/* sb->s_iflags */ +#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ /* Possible states of 'frozen' field */ enum { @@ -1279,6 +1281,7 @@ struct super_block { const struct quotactl_ops *s_qcop; const struct export_operations *s_export_op; unsigned long s_flags; + unsigned long s_iflags; /* internal SB_I_* 
flags */ unsigned long s_magic; struct dentry *s_root; struct rw_semaphore s_umount; @@ -1912,7 +1915,6 @@ struct file_system_type { #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */ -#define FS_CGROUP_WRITEBACK 32 /* Supports cgroup-aware writeback */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); -- cgit v1.2.3 From a9bd32a8b4c4c2670f9ed8cae63f9378b6df3ded Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Fri, 13 Mar 2015 11:09:45 -0700 Subject: msm: msm_fb: Remove dead code This code is no longer used now that mach-msm has been removed. Delete it. Cc: Jean-Christophe Plagniol-Villard Cc: Tomi Valkeinen Cc: David Brown Cc: Bryan Huntsman Cc: Daniel Walker Signed-off-by: Stephen Boyd Acked-by: Jean-Christophe PLAGNIOL-VILLARD Signed-off-by: Tomi Valkeinen --- include/linux/platform_data/video-msm_fb.h | 146 ----------------------------- 1 file changed, 146 deletions(-) delete mode 100644 include/linux/platform_data/video-msm_fb.h (limited to 'include/linux') diff --git a/include/linux/platform_data/video-msm_fb.h b/include/linux/platform_data/video-msm_fb.h deleted file mode 100644 index 31449be3eadb..000000000000 --- a/include/linux/platform_data/video-msm_fb.h +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Internal shared definitions for various MSM framebuffer parts. - * - * Copyright (C) 2007 Google Incorporated - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - */ - -#ifndef _MSM_FB_H_ -#define _MSM_FB_H_ - -#include - -struct mddi_info; - -struct msm_fb_data { - int xres; /* x resolution in pixels */ - int yres; /* y resolution in pixels */ - int width; /* disply width in mm */ - int height; /* display height in mm */ - unsigned output_format; -}; - -struct msmfb_callback { - void (*func)(struct msmfb_callback *); -}; - -enum { - MSM_MDDI_PMDH_INTERFACE, - MSM_MDDI_EMDH_INTERFACE, - MSM_EBI2_INTERFACE, -}; - -#define MSMFB_CAP_PARTIAL_UPDATES (1 << 0) - -struct msm_panel_data { - /* turns off the fb memory */ - int (*suspend)(struct msm_panel_data *); - /* turns on the fb memory */ - int (*resume)(struct msm_panel_data *); - /* turns off the panel */ - int (*blank)(struct msm_panel_data *); - /* turns on the panel */ - int (*unblank)(struct msm_panel_data *); - void (*wait_vsync)(struct msm_panel_data *); - void (*request_vsync)(struct msm_panel_data *, struct msmfb_callback *); - void (*clear_vsync)(struct msm_panel_data *); - /* from the enum above */ - unsigned interface_type; - /* data to be passed to the fb driver */ - struct msm_fb_data *fb_data; - - /* capabilities supported by the panel */ - uint32_t caps; -}; - -struct msm_mddi_client_data { - void (*suspend)(struct msm_mddi_client_data *); - void (*resume)(struct msm_mddi_client_data *); - void (*activate_link)(struct msm_mddi_client_data *); - void (*remote_write)(struct msm_mddi_client_data *, uint32_t val, - uint32_t reg); - uint32_t (*remote_read)(struct msm_mddi_client_data *, uint32_t reg); - void (*auto_hibernate)(struct msm_mddi_client_data *, int); - /* custom data that needs to be passed from the board file to a - * particular client */ - void *private_client_data; - struct resource *fb_resource; - /* from the list above */ - unsigned interface_type; -}; - -struct msm_mddi_platform_data { - unsigned int clk_rate; - void (*power_client)(struct msm_mddi_client_data *, int on); - - /* fixup the 
mfr name, product id */ - void (*fixup)(uint16_t *mfr_name, uint16_t *product_id); - - struct resource *fb_resource; /*optional*/ - /* number of clients in the list that follows */ - int num_clients; - /* array of client information of clients */ - struct { - unsigned product_id; /* mfr id in top 16 bits, product id - * in lower 16 bits - */ - char *name; /* the device name will be the platform - * device name registered for the client, - * it should match the name of the associated - * driver - */ - unsigned id; /* id for mddi client device node, will also - * be used as device id of panel devices, if - * the client device will have multiple panels - * space must be left here for them - */ - void *client_data; /* required private client data */ - unsigned int clk_rate; /* optional: if the client requires a - * different mddi clk rate - */ - } client_platform_data[]; -}; - -struct mdp_blit_req; -struct fb_info; -struct mdp_device { - struct device dev; - void (*dma)(struct mdp_device *mpd, uint32_t addr, - uint32_t stride, uint32_t w, uint32_t h, uint32_t x, - uint32_t y, struct msmfb_callback *callback, int interface); - void (*dma_wait)(struct mdp_device *mdp); - int (*blit)(struct mdp_device *mdp, struct fb_info *fb, - struct mdp_blit_req *req); - void (*set_grp_disp)(struct mdp_device *mdp, uint32_t disp_id); -}; - -struct class_interface; -int register_mdp_client(struct class_interface *class_intf); - -/**** private client data structs go below this line ***/ - -struct msm_mddi_bridge_platform_data { - /* from board file */ - int (*init)(struct msm_mddi_bridge_platform_data *, - struct msm_mddi_client_data *); - int (*uninit)(struct msm_mddi_bridge_platform_data *, - struct msm_mddi_client_data *); - /* passed to panel for use by the fb driver */ - int (*blank)(struct msm_mddi_bridge_platform_data *, - struct msm_mddi_client_data *); - int (*unblank)(struct msm_mddi_bridge_platform_data *, - struct msm_mddi_client_data *); - struct msm_fb_data fb_data; -}; - - 
- -#endif -- cgit v1.2.3 From 208489032bdd8d4a7de50f3057c175058f271956 Mon Sep 17 00:00:00 2001 From: Chaotian Jing Date: Mon, 15 Jun 2015 19:20:48 +0800 Subject: mmc: mediatek: Add Mediatek MMC driver Add Mediatek MMC driver code Support eMMC/SD/SDIO Signed-off-by: Chaotian Jing Signed-off-by: Ulf Hansson --- include/linux/mmc/core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index de722d4e9d61..258daf914c6d 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -121,6 +121,7 @@ struct mmc_data { struct mmc_request *mrq; /* associated request */ unsigned int sg_len; /* size of scatter list */ + int sg_count; /* mapped sg entries */ struct scatterlist *sg; /* I/O scatter list */ s32 host_cookie; /* host private data */ }; -- cgit v1.2.3 From a1a56aaa0735c7821e85c37268a7c12e132415cb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Jun 2015 18:10:13 -0700 Subject: netfilter: x_tables: align per cpu xt_counter Let's force a 16 bytes alignment on xt_counter percpu allocations, so that bytes and packets sit in same cache line. xt_counter being exported to user space, we cannot add __align(16) on the structure itself. Signed-off-by: Eric Dumazet Cc: Florian Westphal Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 95693c4cebdd..1c97a2204379 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -356,7 +356,8 @@ static inline unsigned long ifname_compare_aligned(const char *_a, * so nothing needs to be done there. * * xt_percpu_counter_alloc returns the address of the percpu - * counter, or 0 on !SMP. + * counter, or 0 on !SMP. 
We force an alignment of 16 bytes + * so that bytes/packets share a common cache line. * * Hence caller must use IS_ERR_VALUE to check for error, this * allows us to return 0 for single core systems without forcing @@ -365,7 +366,8 @@ static inline unsigned long ifname_compare_aligned(const char *_a, static inline u64 xt_percpu_counter_alloc(void) { if (nr_cpu_ids > 1) { - void __percpu *res = alloc_percpu(struct xt_counters); + void __percpu *res = __alloc_percpu(sizeof(struct xt_counters), + sizeof(struct xt_counters)); if (res == NULL) return (u64) -ENOMEM; -- cgit v1.2.3 From 3b0f95be143bea1aa47beb20134ef82e4e4068dc Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 16 Jun 2015 23:06:20 +0100 Subject: irq: Add irq_set_chained_handler_and_data() Driver authors seem to get the ordering of irq_set_chained_handler() and irq_set_handler_data() wrong - ordering the former before the latter. This opens a race window where, if there is an interrupt pending, the handler will be called between these two calls, potentially resulting in an oops. Provide a single interface to set both of these together, especially as that's commonly what is required. Signed-off-by: Russell King Cc: Alexandre Courbot Cc: Hans Ulli Kroll Cc: Jason Cooper Cc: Lee Jones Cc: Linus Walleij Cc: Thierry Reding Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/E1Z4yzs-0002Rw-4B@rmk-PC.arm.linux.org.uk Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index de3213d271ff..42861d28fc2a 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -525,6 +525,15 @@ irq_set_chained_handler(unsigned int irq, irq_flow_handler_t handle) __irq_set_handler(irq, handle, 1, NULL); } +/* + * Set a highlevel chained flow handler and its data for a given IRQ. 
+ * (a chained handler is automatically enabled and set to + * IRQ_NOREQUEST, IRQ_NOPROBE, and IRQ_NOTHREAD) + */ +void +irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, + void *data); + void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set); static inline void irq_set_status_flags(unsigned int irq, unsigned long set) -- cgit v1.2.3 From 0f1b414d190724617eb1cdd615592fa8cd9d0b50 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 18 Jun 2015 18:32:02 +0200 Subject: ACPI / PNP: Avoid conflicting resource reservations Commit b9a5e5e18fbf "ACPI / init: Fix the ordering of acpi_reserve_resources()" overlooked the fact that the memory and/or I/O regions reserved by acpi_reserve_resources() may conflict with those reserved by the PNP "system" driver. If that conflict actually takes place, it causes the reservations made by the "system" driver to fail while before commit b9a5e5e18fbf all reservations made by it and by acpi_reserve_resources() would be successful. In turn, that allows the resources that haven't been reserved by the "system" driver to be used by others (e.g. PCI) which sometimes leads to functional problems (up to and including boot failures). To fix that issue, introduce a common resource reservation routine, acpi_reserve_region(), to be used by both acpi_reserve_resources() and the "system" driver, that will track all resources reserved by it and avoid making conflicting requests. Link: https://bugzilla.kernel.org/show_bug.cgi?id=99831 Link: http://marc.info/?t=143389402600001&r=1&w=2 Fixes: b9a5e5e18fbf "ACPI / init: Fix the ordering of acpi_reserve_resources()" Reported-by: Roland Dreier Cc: All applicable Signed-off-by: Rafael J. 
Wysocki --- include/linux/acpi.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index e4da5e35e29c..299763df1b27 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -332,6 +332,9 @@ int acpi_check_region(resource_size_t start, resource_size_t n, int acpi_resources_are_enforced(void); +int acpi_reserve_region(u64 start, unsigned int length, u8 space_id, + unsigned long flags, char *desc); + #ifdef CONFIG_HIBERNATION void __init acpi_no_s4_hw_signature(void); #endif @@ -525,6 +528,13 @@ static inline int acpi_check_region(resource_size_t start, resource_size_t n, return 0; } +static inline int acpi_reserve_region(u64 start, unsigned int length, + u8 space_id, unsigned long flags, + char *desc) +{ + return -ENXIO; +} + struct acpi_table_header; static inline int acpi_table_parse(char *id, int (*handler)(struct acpi_table_header *)) -- cgit v1.2.3 From a263653ed798216c0069922d7b5237ca49436007 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Jun 2015 10:28:27 -0500 Subject: netfilter: don't pull include/linux/netfilter.h from netns headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This pulls the full hook netfilter definitions from all those that include net_namespace.h. Instead let's just include the bare minimum required in the new linux/netfilter_defs.h file, and use it from the netfilter netns header files. I also needed to include in.h and in6.h from linux/netfilter.h otherwise we hit this compilation error: In file included from include/linux/netfilter_defs.h:4:0, from include/net/netns/netfilter.h:4, from include/net/net_namespace.h:22, from include/linux/netdevice.h:43, from net/netfilter/nfnetlink_queue_core.c:23: include/uapi/linux/netfilter.h:76:17: error: field ‘in’ has incomplete type struct in_addr in; And also explicit include linux/netfilter.h in several spots. 
Signed-off-by: Pablo Neira Ayuso Signed-off-by: Eric W. Biederman --- include/linux/netfilter.h | 6 ++---- include/linux/netfilter_defs.h | 9 +++++++++ 2 files changed, 11 insertions(+), 4 deletions(-) create mode 100644 include/linux/netfilter_defs.h (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index f5ff5d156da8..00050dfd9f23 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -10,7 +10,8 @@ #include #include #include -#include +#include + #ifdef CONFIG_NETFILTER static inline int NF_DROP_GETERR(int verdict) { @@ -38,9 +39,6 @@ static inline void nf_inet_addr_mask(const union nf_inet_addr *a1, int netfilter_init(void); -/* Largest hook number + 1 */ -#define NF_MAX_HOOKS 8 - struct sk_buff; struct nf_hook_ops; diff --git a/include/linux/netfilter_defs.h b/include/linux/netfilter_defs.h new file mode 100644 index 000000000000..d3a7f8597e82 --- /dev/null +++ b/include/linux/netfilter_defs.h @@ -0,0 +1,9 @@ +#ifndef __LINUX_NETFILTER_CORE_H_ +#define __LINUX_NETFILTER_CORE_H_ + +#include + +/* Largest hook number + 1, see uapi/linux/netfilter_decnet.h */ +#define NF_MAX_HOOKS 8 + +#endif -- cgit v1.2.3 From dcb8f5c8139ef945cdfd55900fae265c4dbefc02 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 17 Jun 2015 23:58:28 +0200 Subject: netfilter: xtables: fix warnings on 32bit platforms On 32bit archs gcc complains due to cast from void* to u64. Add intermediate casts to long to silence these warnings. 
include/linux/netfilter/x_tables.h:376:10: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] include/linux/netfilter/x_tables.h:384:15: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] include/linux/netfilter/x_tables.h:391:23: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] include/linux/netfilter/x_tables.h:400:22: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] Fixes: 71ae0dff02d756e ("netfilter: xtables: use percpu rule counters") Reported-by: kbuild test robot Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 1c97a2204379..286098a5667f 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -372,7 +372,7 @@ static inline u64 xt_percpu_counter_alloc(void) if (res == NULL) return (u64) -ENOMEM; - return (__force u64) res; + return (u64) (__force unsigned long) res; } return 0; @@ -380,14 +380,14 @@ static inline u64 xt_percpu_counter_alloc(void) static inline void xt_percpu_counter_free(u64 pcnt) { if (nr_cpu_ids > 1) - free_percpu((void __percpu *) pcnt); + free_percpu((void __percpu *) (unsigned long) pcnt); } static inline struct xt_counters * xt_get_this_cpu_counter(struct xt_counters *cnt) { if (nr_cpu_ids > 1) - return this_cpu_ptr((void __percpu *) cnt->pcnt); + return this_cpu_ptr((void __percpu *) (unsigned long) cnt->pcnt); return cnt; } @@ -396,7 +396,7 @@ static inline struct xt_counters * xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu) { if (nr_cpu_ids > 1) - return per_cpu_ptr((void __percpu *) cnt->pcnt, cpu); + return per_cpu_ptr((void __percpu *) (unsigned long) cnt->pcnt, cpu); return cnt; } -- cgit v1.2.3 From 
fb02915f47181e824339d91f8e385fd4bd746d6a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Jun 2015 16:54:28 -0400 Subject: kernfs: make kernfs_get_inode() public Move kernfs_get_inode() prototype from fs/kernfs/kernfs-internal.h to include/linux/kernfs.h. It obtains the matching inode for a kernfs_node. It will be used by cgroup for inode based permission checks for now but is generally useful. Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman --- include/linux/kernfs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 71ecdab1671b..e6b2f7db9c0c 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -277,6 +277,7 @@ void kernfs_put(struct kernfs_node *kn); struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry); struct kernfs_root *kernfs_root_from_sb(struct super_block *sb); +struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn); struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv); @@ -352,6 +353,10 @@ static inline struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) { return NULL; } +static inline struct inode * +kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) +{ return NULL; } + static inline struct kernfs_root * kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv) -- cgit v1.2.3 From 187fe84067bd377047cfcb7f2bbc7c9dc12d290c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Jun 2015 16:54:28 -0400 Subject: cgroup: require write perm on common ancestor when moving processes on the default hierarchy On traditional hierarchies, if a task has write access to "tasks" or "cgroup.procs" file of a cgroup and its euid agrees with the target, it can move the target to the cgroup; however, consider the following scenario. 
The owner of each cgroup is in the parentheses. R (root) - 0 (root) - 00 (user1) - 000 (user1) | \ 001 (user1) \ 1 (root) - 10 (user1) The subtrees of 00 and 10 are delegated to user1; however, while both subtrees may belong to the same user, it is clear that the two subtrees are to be isolated - they're under completely separate resource limits imposed by 0 and 1, respectively. Note that 0 and 1 aren't strictly necessary but added to ease illustrating the issue. If user1 is allowed to move processes between the two subtrees, the intention of the hierarchy - keeping a given group of processes under a subtree with certain resource restrictions while delegating management of the subtree - can be circumvented by user1. This happens because migration permission check doesn't consider the hierarchical nature of cgroups. To fix the issue, this patch adds an extra permission requirement when userland tries to migrate a process in the default hierarchy - the issuing task must have write access to the common ancestor of "cgroup.procs" file of the ancestor in addition to the destination's. Conceptually, the issuer must be able to move the target process from the source cgroup to the common ancestor of source and destination cgroups and then to the destination. As long as delegation is done in a proper top-down way, this guarantees that a delegatee can't smuggle processes across disjoint delegation domains. The next patch will add documentation on the delegation model on the default hierarchy. v2: Fixed missing !ret test. Spotted by Li Zefan. 
Signed-off-by: Tejun Heo Acked-by: Johannes Weiner Cc: Li Zefan --- include/linux/cgroup-defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index c5588c438448..93755a629299 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -220,6 +220,7 @@ struct cgroup { int populated_cnt; struct kernfs_node *kn; /* cgroup kernfs entry */ + struct kernfs_node *procs_kn; /* kn for "cgroup.procs" */ struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ /* -- cgit v1.2.3 From c04dca02bc73096435a5c36efd5ccb2171edcbe1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 11 Jun 2015 14:46:44 +0200 Subject: hrtimer: Remove HRTIMER_STATE_MIGRATE I do not understand HRTIMER_STATE_MIGRATE. Unless I am totally confused it looks buggy and simply unneeded. migrate_hrtimer_list() sets it to keep hrtimer_active() == T, but this is not enough: this can fool, say, hrtimer_is_queued() in dequeue_signal(). Can't migrate_hrtimer_list() simply use HRTIMER_STATE_ENQUEUED? This fixes the race and we can kill STATE_MIGRATE. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: ktkhai@parallels.com Cc: rostedt@goodmis.org Cc: juri.lelli@gmail.com Cc: pang.xunlei@linaro.org Cc: wanpeng.li@linux.intel.com Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20150611124743.072387650@infradead.org Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 3f82a7edc03d..2f9e57d3d126 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -70,17 +70,13 @@ enum hrtimer_restart { * the handling of the timer. * * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state - * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. 
This - * also affects HRTIMER_STATE_MIGRATE where the preservation is not - * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is - * enqueued on the new cpu. + * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. * * All state transitions are protected by cpu_base->lock. */ #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 #define HRTIMER_STATE_CALLBACK 0x02 -#define HRTIMER_STATE_MIGRATE 0x04 /** * struct hrtimer - the basic hrtimer structure -- cgit v1.2.3 From a7c6f571ff51cc77d90dd54968f7c5c938c43998 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2015 14:46:46 +0200 Subject: seqcount: Rename write_seqcount_barrier() I'll shortly be introducing another seqcount primitive that's useful to provide ordering semantics and would like to use the write_seqcount_barrier() name for that. Seeing how there's only one user of the current primitive, let's rename it to invalidate, as that appears to be what it's doing. While there, employ lockdep_assert_held() instead of assert_spin_locked() to not generate debug code for regular kernels. 
Signed-off-by: Peter Zijlstra (Intel) Cc: ktkhai@parallels.com Cc: rostedt@goodmis.org Cc: juri.lelli@gmail.com Cc: pang.xunlei@linaro.org Cc: Oleg Nesterov Cc: wanpeng.li@linux.intel.com Cc: Paul McKenney Cc: Al Viro Cc: Linus Torvalds Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20150611124743.279926217@infradead.org Signed-off-by: Thomas Gleixner --- include/linux/seqlock.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5f68d0a391ce..c07e3a536099 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -266,13 +266,13 @@ static inline void write_seqcount_end(seqcount_t *s) } /** - * write_seqcount_barrier - invalidate in-progress read-side seq operations + * write_seqcount_invalidate - invalidate in-progress read-side seq operations * @s: pointer to seqcount_t * - * After write_seqcount_barrier, no read-side seq operations will complete + * After write_seqcount_invalidate, no read-side seq operations will complete * successfully and see data older than this. */ -static inline void write_seqcount_barrier(seqcount_t *s) +static inline void write_seqcount_invalidate(seqcount_t *s) { smp_wmb(); s->sequence+=2; -- cgit v1.2.3 From c4bfa3f5f906aee2e084c5b1fb15caf876338ef8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 17 Jun 2015 14:29:24 +0200 Subject: seqcount: Introduce raw_write_seqcount_barrier() Introduce raw_write_seqcount_barrier(), a new construct that can be used to provide write barrier semantics in seqcount read loops instead of the usual consistency guarantee. raw_write_seqcount_barrier() is equivalent to: raw_write_seqcount_begin(); raw_write_seqcount_end(); But avoids issuing two back-to-back smp_wmb() instructions. This construct works because the read side will 'stall' when observing odd values. 
This means that -- referring to the example in the comment below -- even though there is no (matching) read barrier between the loads of X and Y, we cannot observe !x && !y, because: - if we observe Y == false we must observe the first sequence increment, which makes us loop, until - we observe !(seq & 1) -- the second sequence increment -- at which time we must also observe Y == true. Suggested-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: umgwanakikbuti@gmail.com Cc: ktkhai@parallels.com Cc: rostedt@goodmis.org Cc: juri.lelli@gmail.com Cc: pang.xunlei@linaro.org Cc: oleg@redhat.com Cc: wanpeng.li@linux.intel.com Cc: Al Viro Cc: Linus Torvalds Cc: Paul E. McKenney Link: http://lkml.kernel.org/r/20150617122924.GP3644@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- include/linux/seqlock.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index c07e3a536099..486e685a226a 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -233,6 +233,47 @@ static inline void raw_write_seqcount_end(seqcount_t *s) s->sequence++; } +/** + * raw_write_seqcount_barrier - do a seq write barrier + * @s: pointer to seqcount_t + * + * This can be used to provide an ordering guarantee instead of the + * usual consistency guarantee. It is one wmb cheaper, because we can + * collapse the two back-to-back wmb()s. 
+ * + * seqcount_t seq; + * bool X = true, Y = false; + * + * void read(void) + * { + * bool x, y; + * + * do { + * int s = read_seqcount_begin(&seq); + * + * x = X; y = Y; + * + * } while (read_seqcount_retry(&seq, s)); + * + * BUG_ON(!x && !y); + * } + * + * void write(void) + * { + * Y = true; + * + * raw_write_seqcount_barrier(seq); + * + * X = false; + * } + */ +static inline void raw_write_seqcount_barrier(seqcount_t *s) +{ + s->sequence++; + smp_wmb(); + s->sequence++; +} + /* * raw_write_seqcount_latch - redirect readers to even/odd copy * @s: pointer to seqcount_t -- cgit v1.2.3 From 887d9dc989eb0154492e41e7c07492edbb088ba1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2015 14:46:48 +0200 Subject: hrtimer: Allow hrtimer::function() to free the timer Currently an hrtimer callback function cannot free its own timer because __run_hrtimer() still needs to clear HRTIMER_STATE_CALLBACK after it. Freeing the timer would result in a clear use-after-free. Solve this by using a scheme similar to regular timers; track the current running timer in hrtimer_clock_base::running. 
Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: ktkhai@parallels.com Cc: rostedt@goodmis.org Cc: juri.lelli@gmail.com Cc: pang.xunlei@linaro.org Cc: wanpeng.li@linux.intel.com Cc: Al Viro Cc: Linus Torvalds Cc: Paul McKenney Cc: Oleg Nesterov Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20150611124743.471563047@infradead.org Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 41 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 2f9e57d3d126..5db055821ef3 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -53,30 +53,25 @@ enum hrtimer_restart { * * 0x00 inactive * 0x01 enqueued into rbtree - * 0x02 callback function running - * 0x04 timer is migrated to another cpu * - * Special cases: - * 0x03 callback function running and enqueued - * (was requeued on another CPU) - * 0x05 timer was migrated on CPU hotunplug + * The callback state is not part of the timer->state because clearing it would + * mean touching the timer after the callback, this makes it impossible to free + * the timer from the callback function. * - * The "callback function running and enqueued" status is only possible on - * SMP. It happens for example when a posix timer expired and the callback + * Therefore we track the callback state in: + * + * timer->base->cpu_base->running == timer + * + * On SMP it is possible to have a "callback function running and enqueued" + * status. It happens for example when a posix timer expired and the callback * queued a signal. Between dropping the lock which protects the posix timer * and reacquiring the base lock of the hrtimer, another CPU can deliver the - * signal and rearm the timer. We have to preserve the callback running state, - * as otherwise the timer could be removed before the softirq code finishes the - * the handling of the timer. 
- * - * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state - * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. + * signal and rearm the timer. * * All state transitions are protected by cpu_base->lock. */ #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 -#define HRTIMER_STATE_CALLBACK 0x02 /** * struct hrtimer - the basic hrtimer structure @@ -163,6 +158,8 @@ enum hrtimer_base_type { * struct hrtimer_cpu_base - the per cpu clock bases * @lock: lock protecting the base and associated clock bases * and timers + * @seq: seqcount around __run_hrtimer + * @running: pointer to the currently running hrtimer * @cpu: cpu number * @active_bases: Bitfield to mark bases with active timers * @clock_was_set_seq: Sequence counter of clock was set events @@ -184,6 +181,8 @@ enum hrtimer_base_type { */ struct hrtimer_cpu_base { raw_spinlock_t lock; + seqcount_t seq; + struct hrtimer *running; unsigned int cpu; unsigned int active_bases; unsigned int clock_was_set_seq; @@ -391,15 +390,7 @@ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); extern u64 hrtimer_get_next_event(void); -/* - * A timer is active, when it is enqueued into the rbtree or the - * callback function is running or it's in the state of being migrated - * to another cpu. 
- */ -static inline int hrtimer_active(const struct hrtimer *timer) -{ - return timer->state != HRTIMER_STATE_INACTIVE; -} +extern bool hrtimer_active(const struct hrtimer *timer); /* * Helper function to check, whether the timer is on one of the queues @@ -415,7 +406,7 @@ static inline int hrtimer_is_queued(struct hrtimer *timer) */ static inline int hrtimer_callback_running(struct hrtimer *timer) { - return timer->state & HRTIMER_STATE_CALLBACK; + return timer->base->cpu_base->running == timer; } /* Forward a hrtimer so it expires after now: */ -- cgit v1.2.3 From a24fc60d63da2b0b31bf7c876d12a51ed4b778bd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2015 14:46:53 +0200 Subject: lockdep: Implement lock pinning Add a lockdep annotation that WARNs if you 'accidentally' unlock a lock. This is especially helpful for code with callbacks, where the upper layer assumes a lock remains taken but a lower layer thinks it maybe can drop and reacquire the lock. By unwittingly breaking up the lock, races can be introduced. Lock pinning is a lockdep annotation that helps with this, when you lockdep_pin_lock() a held lock, any unlock without a lockdep_unpin_lock() will produce a WARN. Think of this as a relative of lockdep_assert_held(), except you don't only assert it's held now, but ensure it stays held until you release your assertion. RFC: a possible alternative API would be something like: int cookie = lockdep_pin_lock(&foo); ... lockdep_unpin_lock(&foo, cookie); Where we pick a random number for the pin_count; this makes it impossible to sneak a lock break in without also passing the right cookie along. I've not done this because it ends up generating code for !LOCKDEP, esp. if you need to pass the cookie around for some reason.
Signed-off-by: Peter Zijlstra (Intel) Cc: ktkhai@parallels.com Cc: rostedt@goodmis.org Cc: juri.lelli@gmail.com Cc: pang.xunlei@linaro.org Cc: oleg@redhat.com Cc: wanpeng.li@linux.intel.com Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20150611124743.906731065@infradead.org Signed-off-by: Thomas Gleixner --- include/linux/lockdep.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 066ba4157541..c5b6b5830acf 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -255,6 +255,7 @@ struct held_lock { unsigned int check:1; /* see lock_acquire() comment */ unsigned int hardirqs_off:1; unsigned int references:12; /* 32 bits */ + unsigned int pin_count; }; /* @@ -354,6 +355,9 @@ extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask); extern void lockdep_clear_current_reclaim_state(void); extern void lockdep_trace_alloc(gfp_t mask); +extern void lock_pin_lock(struct lockdep_map *lock); +extern void lock_unpin_lock(struct lockdep_map *lock); + # define INIT_LOCKDEP .lockdep_recursion = 0, .lockdep_reclaim_gfp = 0, #define lockdep_depth(tsk) (debug_locks ? 
(tsk)->lockdep_depth : 0) @@ -368,6 +372,9 @@ extern void lockdep_trace_alloc(gfp_t mask); #define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) +#define lockdep_pin_lock(l) lock_pin_lock(&(l)->dep_map) +#define lockdep_unpin_lock(l) lock_unpin_lock(&(l)->dep_map) + #else /* !CONFIG_LOCKDEP */ static inline void lockdep_off(void) @@ -420,6 +427,9 @@ struct lock_class_key { }; #define lockdep_recursing(tsk) (0) +#define lockdep_pin_lock(l) do { (void)(l); } while (0) +#define lockdep_unpin_lock(l) do { (void)(l); } while (0) + #endif /* !LOCKDEP */ #ifdef CONFIG_LOCK_STAT -- cgit v1.2.3 From 7f3b62cf945dc3e57fbd693022a5651206ce85b0 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 16 Jun 2015 16:27:43 +0200 Subject: acpi-video-detect: Remove the unused acpi_video_dmi_demote_vendor() function Remove the now unused acpi_video_dmi_demote_vendor() function, this was never a proper counter part of acpi_video_dmi_promote_vendor() since the calls to acpi_video_dmi_promote_vendor() are not counted. Signed-off-by: Hans de Goede Acked-by: Darren Hart Signed-off-by: Rafael J. 
Wysocki --- include/linux/acpi.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index e4da5e35e29c..01bffd30c6c7 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -248,7 +248,6 @@ extern bool wmi_has_guid(const char *guid); extern long acpi_video_get_capabilities(acpi_handle graphics_dev_handle); extern long acpi_is_video_device(acpi_handle handle); extern void acpi_video_dmi_promote_vendor(void); -extern void acpi_video_dmi_demote_vendor(void); extern int acpi_video_backlight_support(void); extern int acpi_video_display_switch_support(void); @@ -268,10 +267,6 @@ static inline void acpi_video_dmi_promote_vendor(void) { } -static inline void acpi_video_dmi_demote_vendor(void) -{ -} - static inline int acpi_video_backlight_support(void) { return 0; -- cgit v1.2.3 From fb105d964226ce4834b45d7e3d9f339aa716ed70 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 16 Jun 2015 16:27:44 +0200 Subject: acpi-video-detect: Make acpi_video_get_capabilities a private function acpi_video_get_capabilities() is only used inside video_detect.c so make it static. While at it also remove the prototype for the non existent acpi_video_display_switch_support function from acpi.h Signed-off-by: Hans de Goede Acked-by: Darren Hart Signed-off-by: Rafael J. 
Wysocki --- include/linux/acpi.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 01bffd30c6c7..88c92a03a77e 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -245,19 +245,12 @@ extern bool wmi_has_guid(const char *guid); #if defined(CONFIG_ACPI_VIDEO) || defined(CONFIG_ACPI_VIDEO_MODULE) -extern long acpi_video_get_capabilities(acpi_handle graphics_dev_handle); extern long acpi_is_video_device(acpi_handle handle); extern void acpi_video_dmi_promote_vendor(void); extern int acpi_video_backlight_support(void); -extern int acpi_video_display_switch_support(void); #else -static inline long acpi_video_get_capabilities(acpi_handle graphics_dev_handle) -{ - return 0; -} - static inline long acpi_is_video_device(acpi_handle handle) { return 0; @@ -272,11 +265,6 @@ static inline int acpi_video_backlight_support(void) return 0; } -static inline int acpi_video_display_switch_support(void) -{ - return 0; -} - #endif /* defined(CONFIG_ACPI_VIDEO) || defined(CONFIG_ACPI_VIDEO_MODULE) */ extern int acpi_blacklisted(void); -- cgit v1.2.3 From adc8bb8e0fe005ed29366e6c4621652481878214 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 16 Jun 2015 16:27:45 +0200 Subject: acpi-video-detect: Move acpi_is_video_device() to acpi/scan.c This allows video_detect.c to be built as a module, this is a preparation patch for the backlight interface selection logic cleanup. Note this commit also causes acpi_is_video_device() to always be built independently of CONFIG_ACPI_VIDEO, as there is no reason to make its building depend on CONFIG_ACPI_VIDEO. Signed-off-by: Hans de Goede Acked-by: Darren Hart Signed-off-by: Rafael J.
Wysocki --- include/linux/acpi.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 88c92a03a77e..7cb3b0bc4a7e 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -243,19 +243,15 @@ extern bool wmi_has_guid(const char *guid); #define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VENDOR 0x0400 #define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VIDEO 0x0800 +extern long acpi_is_video_device(acpi_handle handle); + #if defined(CONFIG_ACPI_VIDEO) || defined(CONFIG_ACPI_VIDEO_MODULE) -extern long acpi_is_video_device(acpi_handle handle); extern void acpi_video_dmi_promote_vendor(void); extern int acpi_video_backlight_support(void); #else -static inline long acpi_is_video_device(acpi_handle handle) -{ - return 0; -} - static inline void acpi_video_dmi_promote_vendor(void) { } -- cgit v1.2.3 From a87878bafa1f82c20eddaf2d23780b194c35ccf5 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 16 Jun 2015 16:27:46 +0200 Subject: acpi-video-detect: Move acpi_osi_is_win8 to osl.c acpi_osi_is_win8 needs access to acpi_gbl_osi_data which is not exported, so move it to osl.c. Alternatively we could export acpi_gbl_osi_data but that seems undesirable. This allows video_detect.c to be built as a module, besides that acpi_osi_is_win8() is something which does not really belong in video_detect.c in the first place. Signed-off-by: Hans de Goede Acked-by: Darren Hart Signed-off-by: Rafael J.
Wysocki --- include/linux/acpi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 7cb3b0bc4a7e..913a1c19818a 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -266,6 +266,7 @@ static inline int acpi_video_backlight_support(void) extern int acpi_blacklisted(void); extern void acpi_dmi_osi_linux(int enable, const struct dmi_system_id *d); extern void acpi_osi_setup(char *str); +extern bool acpi_osi_is_win8(void); #ifdef CONFIG_ACPI_NUMA int acpi_get_node(acpi_handle handle); -- cgit v1.2.3 From 14ca7a47d0ab2a7a35faab130e6d9682f8ff1a46 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 16 Jun 2015 16:27:47 +0200 Subject: acpi-video-detect: video: Make video_detect code part of the video module This is a preparation patch for the backlight interface selection logic cleanup, there are 2 reasons to not always build the video_detect code into the kernel: 1) In order for the video_detect.c to also deal with / select native backlight interfaces on win8 systems, instead of doing this in video.c where it does not belong, video_detect.c needs to call into the backlight class code. Which cannot be done if it is builtin and the backlight class is not. 2) Currently all the platform/x86 drivers which have quirks to prefer the vendor driver over acpi-video call acpi_video_unregister_backlight() to remove the acpi-video backlight interface, this logic really belongs in video_detect.c, which will cause video_detect.c to depend on symbols of video.c and video.c already depends on video_detect.c symbols, so they really need to be a single module.
Note that this commit makes 2 changes so as to maintain 100% kernel commandline compatibility: 1) The __setup call for the acpi_backlight= handling is moved to acpi/util.c as __setup may only be used by code which is always builtin 2) video.c is renamed to acpi_video.c so that it can be combined with video_detect.c into video.ko This commit also makes changes to drivers/platform/x86/Kconfig to ensure that drivers which use acpi_video_backlight_support() from video_detect.c, will not be built-in when acpi_video is not built in. This also changes some "select" uses to "depends on" to avoid dependency loops. Signed-off-by: Hans de Goede Acked-by: Darren Hart Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 913a1c19818a..f097c0a2718e 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -243,6 +243,7 @@ extern bool wmi_has_guid(const char *guid); #define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VENDOR 0x0400 #define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VIDEO 0x0800 +extern char acpi_video_backlight_string[]; extern long acpi_is_video_device(acpi_handle handle); #if defined(CONFIG_ACPI_VIDEO) || defined(CONFIG_ACPI_VIDEO_MODULE) -- cgit v1.2.3 From d0a530ba424ec1be7630f7fce2db9860b9429b8f Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 16 Jun 2015 16:28:12 +0200 Subject: acpi-video-detect: Remove old API Remove the old backlight interface selection API now that all drivers have been ported to the new API. Signed-off-by: Hans de Goede Acked-by: Darren Hart Signed-off-by: Rafael J.
Wysocki --- include/linux/acpi.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index f097c0a2718e..5966d1d9280e 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -245,25 +245,6 @@ extern bool wmi_has_guid(const char *guid); extern char acpi_video_backlight_string[]; extern long acpi_is_video_device(acpi_handle handle); - -#if defined(CONFIG_ACPI_VIDEO) || defined(CONFIG_ACPI_VIDEO_MODULE) - -extern void acpi_video_dmi_promote_vendor(void); -extern int acpi_video_backlight_support(void); - -#else - -static inline void acpi_video_dmi_promote_vendor(void) -{ -} - -static inline int acpi_video_backlight_support(void) -{ - return 0; -} - -#endif /* defined(CONFIG_ACPI_VIDEO) || defined(CONFIG_ACPI_VIDEO_MODULE) */ - extern int acpi_blacklisted(void); extern void acpi_dmi_osi_linux(int enable, const struct dmi_system_id *d); extern void acpi_osi_setup(char *str); -- cgit v1.2.3 From edf18b9108f5025f9e83b2c167c9122954acbc62 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 18 Jun 2015 14:00:48 +0800 Subject: crypto: api - Add CRYPTO_MINALIGN_ATTR to struct crypto_alg The struct crypto_alg is embedded into various type-specific structs such as aead_alg. This is then used as part of instances such as struct aead_instance. It is also embedded into the generic struct crypto_instance. In order to ensure that struct aead_instance can be converted to struct crypto_instance when necessary, we need to ensure that crypto_alg is aligned properly. This patch adds an alignment attribute to struct crypto_alg to ensure this. 
Signed-off-by: Herbert Xu --- include/linux/crypto.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 0e3f71a73e3b..964e5735a6a9 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -513,7 +513,7 @@ struct crypto_alg { void (*cra_destroy)(struct crypto_alg *alg); struct module *cra_module; -}; +} CRYPTO_MINALIGN_ATTR; /* * Algorithm registration interface. -- cgit v1.2.3 From 68722101ec3a0e179408a13708dd020e04f54aab Mon Sep 17 00:00:00 2001 From: George Beshers Date: Thu, 18 Jun 2015 10:25:13 -0500 Subject: locking/lockdep: Remove hard coded array size dependency An apparent oversight left a hardcoded '4' in place when LOCKSTAT_POINTS was introduced. The contention_point[] and contending_point[] arrays in the structs lock_class and lock_class_stats need to be the same size for the loops in lock_stats() to be correct. This patch allows LOCKSTAT_POINTS to be changed without affecting the correctness of the code. Signed-off-by: George Beshers Cc: Andrew Morton Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 066ba4157541..2722111591a3 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -130,8 +130,8 @@ enum bounce_type { }; struct lock_class_stats { - unsigned long contention_point[4]; - unsigned long contending_point[4]; + unsigned long contention_point[LOCKSTAT_POINTS]; + unsigned long contending_point[LOCKSTAT_POINTS]; struct lock_time read_waittime; struct lock_time write_waittime; struct lock_time read_holdtime; -- cgit v1.2.3 From b17718d02f54b90978d0e0146368b512b11c3e84 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Jun 2015 17:30:23 +0200 Subject: sched/stop_machine: Fix deadlock between multiple stop_two_cpus() Jiri reported a machine stuck in multi_cpu_stop() with migrate_swap_stop() as function and with the following src,dst cpu pairs: {11, 4} {13, 11} { 4, 13} 4 11 13 cpuM: queue(4 ,13) *Ma cpuN: queue(13,11) *N Na *M Mb cpuO: queue(11, 4) *O Oa *Nb *Ob Where *X denotes the cpu running the queueing of cpu-X and X[ab] denotes the first/second queued work. You'll observe the top of the workqueue for each cpu: 4,11,13 to be work from cpus: M, O, N resp. IOW. deadlock. Do away with the queueing trickery and introduce lg_double_lock() to lock both CPUs and fully serialize the stop_two_cpus() callers instead of the partial (and buggy) serialization we have now. Reported-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150605153023.GH19282@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/lglock.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lglock.h b/include/linux/lglock.h index 0081f000e34b..c92ebd100d9b 100644 --- a/include/linux/lglock.h +++ b/include/linux/lglock.h @@ -52,10 +52,15 @@ struct lglock { static struct lglock name = { .lock = &name ## _lock } void lg_lock_init(struct lglock *lg, char *name); + void lg_local_lock(struct lglock *lg); void lg_local_unlock(struct lglock *lg); void lg_local_lock_cpu(struct lglock *lg, int cpu); void lg_local_unlock_cpu(struct lglock *lg, int cpu); + +void lg_double_lock(struct lglock *lg, int cpu1, int cpu2); +void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2); + void lg_global_lock(struct lglock *lg); void lg_global_unlock(struct lglock *lg); -- cgit v1.2.3 From 1dabbcec2c0a36fe43509d06499b9e512e70a028 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 26 May 2015 22:50:28 +0000 Subject: timer: Use hlist for the timer wheel hash buckets This reduces the size of struct tvec_base by 50% and results in slightly smaller code as well. 
Before: struct tvec_base: size: 8256, cachelines: 129 text data bss dec hex filename 17698 13297 8256 39251 9953 ../build/kernel/time/timer.o After: struct tvec_base: 4160, cachelines: 65 text data bss dec hex filename 17491 9201 4160 30852 7884 ../build/kernel/time/timer.o Signed-off-by: Thomas Gleixner Reviewed-by: Viresh Kumar Cc: Peter Zijlstra Cc: Paul McKenney Cc: Frederic Weisbecker Cc: Eric Dumazet Cc: John Stultz Cc: Joonwoo Park Cc: Wenbo Wang Link: http://lkml.kernel.org/r/20150526224511.854731214@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/timer.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index fbb80e0030bf..064ee24d3f38 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -14,7 +14,7 @@ struct timer_list { * All fields that change during normal runtime grouped to the * same cacheline */ - struct list_head entry; + struct hlist_node entry; unsigned long expires; struct tvec_base *base; @@ -71,7 +71,7 @@ extern struct tvec_base boot_tvec_bases; #define TIMER_FLAG_MASK 0x3LU #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \ - .entry = { .prev = TIMER_ENTRY_STATIC }, \ + .entry = { .next = TIMER_ENTRY_STATIC }, \ .function = (_function), \ .expires = (_expires), \ .data = (_data), \ @@ -168,7 +168,7 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, */ static inline int timer_pending(const struct timer_list * timer) { - return timer->entry.next != NULL; + return timer->entry.pprev != NULL; } extern void add_timer_on(struct timer_list *timer, int cpu); -- cgit v1.2.3 From 0eeda71bc30d74f66f8231f45621d5ace3419186 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 26 May 2015 22:50:29 +0000 Subject: timer: Replace timer base by a cpu index Instead of storing a pointer to the per cpu tvec_base we can simply cache a CPU index in the timer_list and use that to get hold of the 
correct per cpu tvec_base. This is only used in lock_timer_base() and the slightly larger code is peanuts versus the spinlock operation and the d-cache foot print of the timer wheel. Aside of that this allows to get rid of following nuisances: - boot_tvec_base That statically allocated 4k bss data is just kept around so the timer has a home when it gets statically initialized. It serves no other purpose. With the CPU index we assign the timer to CPU0 at static initialization time and therefore can avoid the whole boot_tvec_base dance. That also simplifies the init code, which just can use the per cpu base. Before: text data bss dec hex filename 17491 9201 4160 30852 7884 ../build/kernel/time/timer.o After: text data bss dec hex filename 17440 9193 0 26633 6809 ../build/kernel/time/timer.o - Overloading the base pointer with various flags The CPU index has enough space to hold the flags (deferrable, irqsafe) so we can get rid of the extra masking and bit fiddling with the base pointer. As a benefit we reduce the size of struct timer_list on 64 bit machines. 4 - 8 bytes, a size reduction up to 15% per struct timer_list, which is a real win as we have tons of them embedded in other structs. This changes also the newly added deferrable printout of the timer start trace point to capture and print all timer->flags, which allows us to decode the target cpu of the timer as well. We might have used bitfields for this, but that would change the static initializers and the init function for no value to accommodate big endian bitfields.
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Paul McKenney Cc: Frederic Weisbecker Cc: Eric Dumazet Cc: Viresh Kumar Cc: John Stultz Cc: Joonwoo Park Cc: Wenbo Wang Cc: Steven Rostedt Cc: Badhri Jagan Sridharan Link: http://lkml.kernel.org/r/20150526224511.950084301@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/timer.h | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 064ee24d3f38..4a0d52bc2073 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -14,27 +14,23 @@ struct timer_list { * All fields that change during normal runtime grouped to the * same cacheline */ - struct hlist_node entry; - unsigned long expires; - struct tvec_base *base; - - void (*function)(unsigned long); - unsigned long data; - - int slack; + struct hlist_node entry; + unsigned long expires; + void (*function)(unsigned long); + unsigned long data; + u32 flags; + int slack; #ifdef CONFIG_TIMER_STATS - int start_pid; - void *start_site; - char start_comm[16]; + int start_pid; + void *start_site; + char start_comm[16]; #endif #ifdef CONFIG_LOCKDEP - struct lockdep_map lockdep_map; + struct lockdep_map lockdep_map; #endif }; -extern struct tvec_base boot_tvec_bases; - #ifdef CONFIG_LOCKDEP /* * NB: because we have to copy the lockdep_map, setting the lockdep_map key @@ -49,9 +45,6 @@ extern struct tvec_base boot_tvec_bases; #endif /* - * Note that all tvec_bases are at least 4 byte aligned and lower two bits - * of base in timer_list is guaranteed to be zero. Use them for flags. - * * A deferrable timer will work normally when the system is busy, but * will not cause a CPU to come out of idle just to service it; instead, * the timer will be serviced when the CPU eventually wakes up with a @@ -65,17 +58,18 @@ extern struct tvec_base boot_tvec_bases; * workqueue locking issues. 
It's not meant for executing random crap * with interrupts disabled. Abuse is monitored! */ -#define TIMER_DEFERRABLE 0x1LU -#define TIMER_IRQSAFE 0x2LU - -#define TIMER_FLAG_MASK 0x3LU +#define TIMER_CPUMASK 0x0007FFFF +#define TIMER_MIGRATING 0x00080000 +#define TIMER_BASEMASK (TIMER_CPUMASK | TIMER_MIGRATING) +#define TIMER_DEFERRABLE 0x00100000 +#define TIMER_IRQSAFE 0x00200000 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ .function = (_function), \ .expires = (_expires), \ .data = (_data), \ - .base = (void *)((unsigned long)&boot_tvec_bases + (_flags)), \ + .flags = (_flags), \ .slack = -1, \ __TIMER_LOCKDEP_MAP_INITIALIZER( \ __FILE__ ":" __stringify(__LINE__)) \ -- cgit v1.2.3 From c74441a17eb975b604e339ca6c11b9ab9aaca11f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 26 May 2015 22:50:31 +0000 Subject: timer: Stats: Simplify the flags handling Simplify the handling of the flag storage for the timer statistics. No intermediate storage anymore. Just hand over the flags field. I left the printout of 'deferrable' for now because changing this would be an ABI update and I have no idea how strong people feel about that. OTOH, I wonder whether we should kill the whole timer stats stuff because all of that information can be retrieved via ftrace/perf as well. 
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Paul McKenney Cc: Frederic Weisbecker Cc: Eric Dumazet Cc: Viresh Kumar Cc: John Stultz Cc: Joonwoo Park Cc: Wenbo Wang Link: http://lkml.kernel.org/r/20150526224512.046626248@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/timer.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 4a0d52bc2073..ff0689b6e297 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -188,13 +188,10 @@ extern void set_timer_slack(struct timer_list *time, int slack_hz); extern int timer_stats_active; -#define TIMER_STATS_FLAG_DEFERRABLE 0x1 - extern void init_timer_stats(void); extern void timer_stats_update_stats(void *timer, pid_t pid, void *startf, - void *timerf, char *comm, - unsigned int timer_flag); + void *timerf, char *comm, u32 flags); extern void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr); -- cgit v1.2.3 From bc7a34b8b9ebfb0f4b8a35a72a0b134fd6c5ef50 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 26 May 2015 22:50:33 +0000 Subject: timer: Reduce timer migration overhead if disabled Eric reported that the timer_migration sysctl is not really nice performance wise as it needs to check at every timer insertion whether the feature is enabled or not. Further the check does not live in the timer code, so we have an extra function call which checks an extra cache line to figure out that it is disabled. We can do better and store that information in the per cpu (hr)timer bases. I pondered to use a static key, but that's a nightmare to update from the nohz code and the timer base cache line is hot anyway when we select a timer base. The old logic enabled the timer migration unconditionally if CONFIG_NO_HZ was set even if nohz was disabled on the kernel command line. With this modification, we start off with migration disabled. 
The user visible sysctl is still set to enabled. If the kernel switches to NOHZ migration is enabled, if the user did not disable it via the sysctl prior to the switch. If nohz=off is on the kernel command line, migration stays disabled no matter what. Before: 47.76% hog [.] main 14.84% [kernel] [k] _raw_spin_lock_irqsave 9.55% [kernel] [k] _raw_spin_unlock_irqrestore 6.71% [kernel] [k] mod_timer 6.24% [kernel] [k] lock_timer_base.isra.38 3.76% [kernel] [k] detach_if_pending 3.71% [kernel] [k] del_timer 2.50% [kernel] [k] internal_add_timer 1.51% [kernel] [k] get_nohz_timer_target 1.28% [kernel] [k] __internal_add_timer 0.78% [kernel] [k] timerfn 0.48% [kernel] [k] wake_up_nohz_cpu After: 48.10% hog [.] main 15.25% [kernel] [k] _raw_spin_lock_irqsave 9.76% [kernel] [k] _raw_spin_unlock_irqrestore 6.50% [kernel] [k] mod_timer 6.44% [kernel] [k] lock_timer_base.isra.38 3.87% [kernel] [k] detach_if_pending 3.80% [kernel] [k] del_timer 2.67% [kernel] [k] internal_add_timer 1.33% [kernel] [k] __internal_add_timer 0.73% [kernel] [k] timerfn 0.54% [kernel] [k] wake_up_nohz_cpu Reported-by: Eric Dumazet Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Paul McKenney Cc: Frederic Weisbecker Cc: Viresh Kumar Cc: John Stultz Cc: Joonwoo Park Cc: Wenbo Wang Link: http://lkml.kernel.org/r/20150526224512.127050787@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 2 ++ include/linux/sched.h | 6 +----- include/linux/sched/sysctl.h | 12 ------------ include/linux/timer.h | 9 +++++++++ 4 files changed, 12 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 5db055821ef3..69551020bb97 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -163,6 +163,7 @@ enum hrtimer_base_type { * @cpu: cpu number * @active_bases: Bitfield to mark bases with active timers * @clock_was_set_seq: Sequence counter of clock was set events + * @migration_enabled: The migration of 
hrtimers to other cpus is enabled * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() * @next_timer: Pointer to the first expiring timer @@ -186,6 +187,7 @@ struct hrtimer_cpu_base { unsigned int cpu; unsigned int active_bases; unsigned int clock_was_set_seq; + bool migration_enabled; #ifdef CONFIG_HIGH_RES_TIMERS unsigned int in_hrtirq : 1, hres_active : 1, diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a2e6122734..d7151460b0cf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -335,14 +335,10 @@ extern int runqueue_is_locked(int cpu); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_balance_enter_idle(int cpu); extern void set_cpu_sd_state_idle(void); -extern int get_nohz_timer_target(int pinned); +extern int get_nohz_timer_target(void); #else static inline void nohz_balance_enter_idle(int cpu) { } static inline void set_cpu_sd_state_idle(void) { } -static inline int get_nohz_timer_target(int pinned) -{ - return smp_processor_id(); -} #endif /* diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 596a0e007c62..c9e4731cf10b 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -57,24 +57,12 @@ extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; extern unsigned int sysctl_sched_time_avg; -extern unsigned int sysctl_timer_migration; extern unsigned int sysctl_sched_shares_window; int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); #endif -#ifdef CONFIG_SCHED_DEBUG -static inline unsigned int get_sysctl_timer_migration(void) -{ - return sysctl_timer_migration; -} -#else -static inline unsigned int get_sysctl_timer_migration(void) -{ - return 1; -} -#endif /* * control realtime throttling: diff --git a/include/linux/timer.h b/include/linux/timer.h index 
ff0689b6e297..61aa61dc410c 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -238,6 +238,15 @@ extern void run_local_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#include + +extern unsigned int sysctl_timer_migration; +int timer_migration_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif + unsigned long __round_jiffies(unsigned long j, int cpu); unsigned long __round_jiffies_relative(unsigned long j, int cpu); unsigned long round_jiffies(unsigned long j); -- cgit v1.2.3 From 683be13a284720205228e29207ef11a1c3c322b9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 26 May 2015 22:50:35 +0000 Subject: timer: Minimize nohz off overhead If nohz is disabled on the kernel command line the [hr]timer code still calls wake_up_nohz_cpu() and tick_nohz_full_cpu(), a pretty pointless exercise. Cache nohz_active in [hr]timer per cpu bases and avoid the overhead. Before: 48.10% hog [.] main 15.25% [kernel] [k] _raw_spin_lock_irqsave 9.76% [kernel] [k] _raw_spin_unlock_irqrestore 6.50% [kernel] [k] mod_timer 6.44% [kernel] [k] lock_timer_base.isra.38 3.87% [kernel] [k] detach_if_pending 3.80% [kernel] [k] del_timer 2.67% [kernel] [k] internal_add_timer 1.33% [kernel] [k] __internal_add_timer 0.73% [kernel] [k] timerfn 0.54% [kernel] [k] wake_up_nohz_cpu After: 48.73% hog [.] main 15.36% [kernel] [k] _raw_spin_lock_irqsave 9.77% [kernel] [k] _raw_spin_unlock_irqrestore 6.61% [kernel] [k] lock_timer_base.isra.38 6.42% [kernel] [k] mod_timer 3.90% [kernel] [k] detach_if_pending 3.76% [kernel] [k] del_timer 2.41% [kernel] [k] internal_add_timer 1.39% [kernel] [k] __internal_add_timer 0.76% [kernel] [k] timerfn We probably should have a cached value for nohz full in the per cpu bases as well to avoid the cpumask check. The base cache line is hot already, the cpumask not necessarily. 
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Paul McKenney Cc: Frederic Weisbecker Cc: Eric Dumazet Cc: Viresh Kumar Cc: John Stultz Cc: Joonwoo Park Cc: Wenbo Wang Link: http://lkml.kernel.org/r/20150526224512.207378134@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 69551020bb97..76dd4f0da5ca 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -164,6 +164,7 @@ enum hrtimer_base_type { * @active_bases: Bitfield to mark bases with active timers * @clock_was_set_seq: Sequence counter of clock was set events * @migration_enabled: The migration of hrtimers to other cpus is enabled + * @nohz_active: The nohz functionality is enabled * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() * @next_timer: Pointer to the first expiring timer @@ -188,6 +189,7 @@ struct hrtimer_cpu_base { unsigned int active_bases; unsigned int clock_was_set_seq; bool migration_enabled; + bool nohz_active; #ifdef CONFIG_HIGH_RES_TIMERS unsigned int in_hrtirq : 1, hres_active : 1, -- cgit v1.2.3 From 1796dcce2daacc125f2d60afc3f631ca29e36684 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 3 May 2015 18:57:10 +0900 Subject: rtc: interface: Fix coding style violations Fix issues reported by checkpatch: ERROR: open brace '{' following struct go on the same line ERROR: "foo* bar" should be "foo *bar" Additionally adjust alignment of wrapped function arguments. 
Signed-off-by: Krzysztof Kozlowski Signed-off-by: Alexandre Belloni --- include/linux/rtc.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 8dcf6825fa88..b0709f80dfb0 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -101,8 +101,7 @@ struct rtc_timer { /* flags */ #define RTC_DEV_BUSY 0 -struct rtc_device -{ +struct rtc_device { struct device dev; struct module *owner; @@ -198,10 +197,10 @@ int rtc_register(rtc_task_t *task); int rtc_unregister(rtc_task_t *task); int rtc_control(rtc_task_t *t, unsigned int cmd, unsigned long arg); -void rtc_timer_init(struct rtc_timer *timer, void (*f)(void* p), void* data); -int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer* timer, - ktime_t expires, ktime_t period); -int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer* timer); +void rtc_timer_init(struct rtc_timer *timer, void (*f)(void *p), void *data); +int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer *timer, + ktime_t expires, ktime_t period); +int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer *timer); void rtc_timer_do_work(struct work_struct *work); static inline bool is_leap_year(unsigned int year) -- cgit v1.2.3 From 73744a64aab872703b851b9678a7f488b507eb81 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 3 May 2015 18:57:11 +0900 Subject: rtc: interface: Remove unused return value from rtc_timer_cancel() The rtc_timer_cancel() always returns 0 and cannot fail (calls only other void-returning functions). 
Signed-off-by: Krzysztof Kozlowski Signed-off-by: Alexandre Belloni --- include/linux/rtc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index b0709f80dfb0..587017e7939c 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -200,7 +200,7 @@ int rtc_control(rtc_task_t *t, unsigned int cmd, unsigned long arg); void rtc_timer_init(struct rtc_timer *timer, void (*f)(void *p), void *data); int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer *timer, ktime_t expires, ktime_t period); -int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer *timer); +void rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer *timer); void rtc_timer_do_work(struct work_struct *work); static inline bool is_leap_year(unsigned int year) -- cgit v1.2.3 From 1387fe7d292b66677dae31d25a8e3c953bf21748 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Tue, 12 May 2015 11:31:02 +0200 Subject: MIPS: BCM47xx: Extract all boardflags to new u32 fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For years we planned to get rid of old u16 fields, let's start doing it with MIPS code. This process will take some time, it requires doing the same in ssb/bcma and then switching all drivers to new fields. This will be handled in separated patches submitted to appropriate trees. 
Signed-off-by: Rafał Miłecki Cc: linux-mips@linux-mips.org Cc: Hauke Mehrtens Patchwork: https://patchwork.linux-mips.org/patch/10026/ Signed-off-by: Ralf Baechle --- include/linux/ssb/ssb.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h index 4568a5cc9ab8..ee90e32a0607 100644 --- a/include/linux/ssb/ssb.h +++ b/include/linux/ssb/ssb.h @@ -88,11 +88,14 @@ struct ssb_sprom { u32 ofdm5glpo; /* 5.2GHz OFDM power offset */ u32 ofdm5gpo; /* 5.3GHz OFDM power offset */ u32 ofdm5ghpo; /* 5.8GHz OFDM power offset */ + u32 boardflags; + u32 boardflags2; + u32 boardflags3; + /* TODO: Switch all drivers to new u32 fields and drop below ones */ u16 boardflags_lo; /* Board flags (bits 0-15) */ u16 boardflags_hi; /* Board flags (bits 16-31) */ u16 boardflags2_lo; /* Board flags (bits 32-47) */ u16 boardflags2_hi; /* Board flags (bits 48-63) */ - /* TODO store board flags in a single u64 */ struct ssb_sprom_core_pwr_info core_pwr_info[4]; -- cgit v1.2.3 From 6e122ac0053d071976686dd04cdd60ea8039bb7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Tue, 12 May 2015 11:54:48 +0200 Subject: MIPS: BCM47xx: Extract info about et2 interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New devices may have more than 1 Ethernet core (device). We should extract info about them to make it available to Ethernet drivers. 
Signed-off-by: Rafał Miłecki Cc: linux-mips@linux-mips.org Cc: Hauke Mehrtens Cc: Hante Meuleman Cc: Ian Kent Patchwork: https://patchwork.linux-mips.org/patch/10027/ Signed-off-by: Ralf Baechle --- include/linux/ssb/ssb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h index ee90e32a0607..c3d1a525bacc 100644 --- a/include/linux/ssb/ssb.h +++ b/include/linux/ssb/ssb.h @@ -29,10 +29,13 @@ struct ssb_sprom { u8 il0mac[6] __aligned(sizeof(u16)); /* MAC address for 802.11b/g */ u8 et0mac[6] __aligned(sizeof(u16)); /* MAC address for Ethernet */ u8 et1mac[6] __aligned(sizeof(u16)); /* MAC address for 802.11a */ + u8 et2mac[6] __aligned(sizeof(u16)); /* MAC address for extra Ethernet */ u8 et0phyaddr; /* MII address for enet0 */ u8 et1phyaddr; /* MII address for enet1 */ + u8 et2phyaddr; /* MII address for enet2 */ u8 et0mdcport; /* MDIO for enet0 */ u8 et1mdcport; /* MDIO for enet1 */ + u8 et2mdcport; /* MDIO for enet2 */ u16 dev_id; /* Device ID overriding e.g. PCI ID */ u16 board_rev; /* Board revision number from SPROM. */ u16 board_num; /* Board number from SPROM. */ -- cgit v1.2.3 From 44e08e7099c8de226606cfc989b45d6fa27f507f Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sun, 24 May 2015 16:11:31 +0100 Subject: MIPS/IRQCHIP: Move Ingenic SoC intc driver to drivers/irqchip Move the driver for Ingenic SoC interrupt controllers into drivers/irqchip where it belongs. 
Signed-off-by: Paul Burton Cc: Lars-Peter Clausen Cc: Thomas Gleixner Cc: Jason Cooper Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: Brian Norris Patchwork: https://patchwork.linux-mips.org/patch/10147/ Signed-off-by: Ralf Baechle --- include/linux/irqchip/ingenic.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 include/linux/irqchip/ingenic.h (limited to 'include/linux') diff --git a/include/linux/irqchip/ingenic.h b/include/linux/irqchip/ingenic.h new file mode 100644 index 000000000000..0ee319a4029d --- /dev/null +++ b/include/linux/irqchip/ingenic.h @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2010, Lars-Peter Clausen + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef __LINUX_IRQCHIP_INGENIC_H__ +#define __LINUX_IRQCHIP_INGENIC_H__ + +#include + +extern void ingenic_intc_irq_suspend(struct irq_data *data); +extern void ingenic_intc_irq_resume(struct irq_data *data); + +#endif -- cgit v1.2.3 From 55cab93bcf1422ab4298edc65c349c4304b4884e Mon Sep 17 00:00:00 2001 From: Hante Meuleman Date: Thu, 21 May 2015 15:27:23 +0200 Subject: mips: bcm47xx: allow retrieval of complete nvram contents Host platforms such as routers supported by OpenWrt can support NVRAM reading directly from internal NVRAM store. The brcmfmac for one requires the complete nvram contents to select what needs to be sent to wireless device. 
Signed-off-by: Arend van Spriel Signed-off-by: Hante Meuleman Reviewed-by: Arend Van Spriel Reviewed-by: Franky (Zhenhui) Lin Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Daniel (Deognyoun) Kim Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/10093/ Signed-off-by: Ralf Baechle --- include/linux/bcm47xx_nvram.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bcm47xx_nvram.h b/include/linux/bcm47xx_nvram.h index b12b07e75929..c73927c66c2c 100644 --- a/include/linux/bcm47xx_nvram.h +++ b/include/linux/bcm47xx_nvram.h @@ -10,11 +10,17 @@ #include #include +#include #ifdef CONFIG_BCM47XX int bcm47xx_nvram_init_from_mem(u32 base, u32 lim); int bcm47xx_nvram_getenv(const char *name, char *val, size_t val_len); int bcm47xx_nvram_gpio_pin(const char *name); +char *bcm47xx_nvram_get_contents(size_t *val_len); +static inline void bcm47xx_nvram_release_contents(char *nvram) +{ + vfree(nvram); +}; #else static inline int bcm47xx_nvram_init_from_mem(u32 base, u32 lim) { @@ -29,6 +35,15 @@ static inline int bcm47xx_nvram_gpio_pin(const char *name) { return -ENOTSUPP; }; + +static inline char *bcm47xx_nvram_get_contents(size_t *val_len) +{ + return NULL; +}; + +static inline void bcm47xx_nvram_release_contents(char *nvram) +{ +}; #endif #endif /* __BCM47XX_NVRAM_H */ -- cgit v1.2.3 From 2ddf3a792218cddd30140b1f8b32cb6e2d67921f Mon Sep 17 00:00:00 2001 From: Alban Bedel Date: Sun, 31 May 2015 02:18:24 +0200 Subject: MIPS: ath79: Add OF support to the GPIO driver Replace the simple GPIO chip registration by a platform driver and make ath79_gpio_init() just register the device. 
Signed-off-by: Alban Bedel Cc: linux-mips@linux-mips.org Signed-off-by: Ralf Baechle --- include/linux/platform_data/gpio-ath79.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 include/linux/platform_data/gpio-ath79.h (limited to 'include/linux') diff --git a/include/linux/platform_data/gpio-ath79.h b/include/linux/platform_data/gpio-ath79.h new file mode 100644 index 000000000000..88b0db7bee74 --- /dev/null +++ b/include/linux/platform_data/gpio-ath79.h @@ -0,0 +1,19 @@ +/* + * Atheros AR7XXX/AR9XXX GPIO controller platform data + * + * Copyright (C) 2015 Alban Bedel + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __LINUX_PLATFORM_DATA_GPIO_ATH79_H +#define __LINUX_PLATFORM_DATA_GPIO_ATH79_H + +struct ath79_gpio_platform_data { + unsigned ngpios; + bool oe_inverted; +}; + +#endif -- cgit v1.2.3 From f6e734a8c162297953d7bfc0f3f6bf4f8c33d72f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Wed, 10 Jun 2015 23:05:08 +0200 Subject: MIPS: BCM47xx: Move NVRAM driver to the drivers/firmware/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After Broadcom switched from MIPS to ARM for their home routers we need to have NVRAM driver in some common place (not arch/mips/). As explained in Kconfig, this driver is responsible for parsing SoC configuration data that is passed to the kernel in flash from the bootloader firmware called "CFE". We were thinking about putting it in bus directory, however there are two possible buses for MIPS: drivers/ssb/ and drivers/bcma/. So this won't fit there and this is why I would like to move this driver to the drivers/firmware/. 
Signed-off-by: Rafał Miłecki Reviewed-by: Paul Walmsley Cc: linux-mips@linux-mips.org Cc: Hauke Mehrtens Cc: Seiji Aguchi Cc: Greg Kroah-Hartman Cc: Ard Biesheuvel Cc: Mike Waychison Cc: Roy Franz Cc: Matt Fleming Cc: Linus Torvalds Patchwork: https://patchwork.linux-mips.org/patch/10544/ Signed-off-by: Ralf Baechle --- include/linux/bcm47xx_nvram.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bcm47xx_nvram.h b/include/linux/bcm47xx_nvram.h index c73927c66c2c..2793652fbf66 100644 --- a/include/linux/bcm47xx_nvram.h +++ b/include/linux/bcm47xx_nvram.h @@ -12,7 +12,7 @@ #include #include -#ifdef CONFIG_BCM47XX +#ifdef CONFIG_BCM47XX_NVRAM int bcm47xx_nvram_init_from_mem(u32 base, u32 lim); int bcm47xx_nvram_getenv(const char *name, char *val, size_t val_len); int bcm47xx_nvram_gpio_pin(const char *name); -- cgit v1.2.3 From d0497524658e37956737d7dbee73cc42120255dc Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 21 Jun 2015 19:11:44 +0800 Subject: crypto: user - Move cryptouser.h to uapi The header file cryptouser.h only contains information that is exported to user-space. Signed-off-by: Herbert Xu --- include/linux/cryptouser.h | 110 --------------------------------------------- 1 file changed, 110 deletions(-) delete mode 100644 include/linux/cryptouser.h (limited to 'include/linux') diff --git a/include/linux/cryptouser.h b/include/linux/cryptouser.h deleted file mode 100644 index 36efbbbf2f83..000000000000 --- a/include/linux/cryptouser.h +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Crypto user configuration API. - * - * Copyright (C) 2011 secunet Security Networks AG - * Copyright (C) 2011 Steffen Klassert - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. 
- * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - */ - -/* Netlink configuration messages. */ -enum { - CRYPTO_MSG_BASE = 0x10, - CRYPTO_MSG_NEWALG = 0x10, - CRYPTO_MSG_DELALG, - CRYPTO_MSG_UPDATEALG, - CRYPTO_MSG_GETALG, - __CRYPTO_MSG_MAX -}; -#define CRYPTO_MSG_MAX (__CRYPTO_MSG_MAX - 1) -#define CRYPTO_NR_MSGTYPES (CRYPTO_MSG_MAX + 1 - CRYPTO_MSG_BASE) - -#define CRYPTO_MAX_NAME CRYPTO_MAX_ALG_NAME - -/* Netlink message attributes. */ -enum crypto_attr_type_t { - CRYPTOCFGA_UNSPEC, - CRYPTOCFGA_PRIORITY_VAL, /* __u32 */ - CRYPTOCFGA_REPORT_LARVAL, /* struct crypto_report_larval */ - CRYPTOCFGA_REPORT_HASH, /* struct crypto_report_hash */ - CRYPTOCFGA_REPORT_BLKCIPHER, /* struct crypto_report_blkcipher */ - CRYPTOCFGA_REPORT_AEAD, /* struct crypto_report_aead */ - CRYPTOCFGA_REPORT_COMPRESS, /* struct crypto_report_comp */ - CRYPTOCFGA_REPORT_RNG, /* struct crypto_report_rng */ - CRYPTOCFGA_REPORT_CIPHER, /* struct crypto_report_cipher */ - CRYPTOCFGA_REPORT_AKCIPHER, /* struct crypto_report_akcipher */ - __CRYPTOCFGA_MAX - -#define CRYPTOCFGA_MAX (__CRYPTOCFGA_MAX - 1) -}; - -struct crypto_user_alg { - char cru_name[CRYPTO_MAX_ALG_NAME]; - char cru_driver_name[CRYPTO_MAX_ALG_NAME]; - char cru_module_name[CRYPTO_MAX_ALG_NAME]; - __u32 cru_type; - __u32 cru_mask; - __u32 cru_refcnt; - __u32 cru_flags; -}; - -struct crypto_report_larval { - char type[CRYPTO_MAX_NAME]; -}; - -struct crypto_report_hash { - char type[CRYPTO_MAX_NAME]; - unsigned int blocksize; - unsigned int digestsize; -}; - -struct crypto_report_cipher { - char type[CRYPTO_MAX_ALG_NAME]; - 
unsigned int blocksize; - unsigned int min_keysize; - unsigned int max_keysize; -}; - -struct crypto_report_blkcipher { - char type[CRYPTO_MAX_NAME]; - char geniv[CRYPTO_MAX_NAME]; - unsigned int blocksize; - unsigned int min_keysize; - unsigned int max_keysize; - unsigned int ivsize; -}; - -struct crypto_report_aead { - char type[CRYPTO_MAX_NAME]; - char geniv[CRYPTO_MAX_NAME]; - unsigned int blocksize; - unsigned int maxauthsize; - unsigned int ivsize; -}; - -struct crypto_report_comp { - char type[CRYPTO_MAX_NAME]; -}; - -struct crypto_report_rng { - char type[CRYPTO_MAX_NAME]; - unsigned int seedsize; -}; - -struct crypto_report_akcipher { - char type[CRYPTO_MAX_NAME]; -}; - -#define CRYPTO_REPORT_MAXSIZE (sizeof(struct crypto_user_alg) + \ - sizeof(struct crypto_report_blkcipher)) -- cgit v1.2.3 From 3e90950d36f19e5477becd5acb02e9b8d8c8956f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 22 Jun 2015 10:31:40 +0800 Subject: crypto: algif_aead - Temporarily disable all AEAD algorithms As the AEAD conversion is still ongoing, we do not yet wish to export legacy AEAD implementations to user-space, as their calling convention will change. This patch actually disables all AEAD algorithms because some of them (e.g., cryptd) will need to be modified to propagate this flag. Subsequent patches will reenable them on an individual basis. Signed-off-by: Herbert Xu --- include/linux/crypto.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 964e5735a6a9..81ef938b0a8e 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -101,6 +101,12 @@ */ #define CRYPTO_ALG_INTERNAL 0x00002000 +/* + * Temporary flag used to prevent legacy AEAD implementations from + * being used by user-space. + */ +#define CRYPTO_ALG_AEAD_NEW 0x00004000 + /* * Transform masks and values (for crt_flags). 
*/ -- cgit v1.2.3 From 8ccd0d0ca04147e91890c373677f1e741dda2631 Mon Sep 17 00:00:00 2001 From: Hyungwon Hwang Date: Fri, 12 Jun 2015 21:59:01 +0900 Subject: of: add helper for getting endpoint node of specific identifiers When there are multiple ports or multiple endpoints in a port, they have to be distinguished by the value of reg property. It is common. The drivers can get the specific endpoint in the specific port via this function. Now the drivers have to implement this code in themselves or have to force the order of dt nodes to get the right node. Signed-off-by: Hyungwon Hwang Acked-by: Rob Herring Signed-off-by: Inki Dae --- include/linux/of_graph.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_graph.h b/include/linux/of_graph.h index 7bc92e050608..1c1d5b901a72 100644 --- a/include/linux/of_graph.h +++ b/include/linux/of_graph.h @@ -45,6 +45,8 @@ int of_graph_parse_endpoint(const struct device_node *node, struct device_node *of_graph_get_port_by_id(struct device_node *node, u32 id); struct device_node *of_graph_get_next_endpoint(const struct device_node *parent, struct device_node *previous); +struct device_node *of_graph_get_endpoint_by_regs( + const struct device_node *parent, int port_reg, int reg); struct device_node *of_graph_get_remote_port_parent( const struct device_node *node); struct device_node *of_graph_get_remote_port(const struct device_node *node); @@ -69,6 +71,12 @@ static inline struct device_node *of_graph_get_next_endpoint( return NULL; } +struct device_node *of_graph_get_endpoint_by_regs( + const struct device_node *parent, int port_reg, int reg) +{ + return NULL; +} + static inline struct device_node *of_graph_get_remote_port_parent( const struct device_node *node) { -- cgit v1.2.3 From 7ce7b26f84cfcbcb04f526f56f685a56ccddf355 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 27 Apr 2015 21:54:13 +0900 Subject: mfd: Constify regmap and irq configuration data Constify 
in various drivers configuration data which is not modified: - regmap_irq_chip, - individual regmap_irq's in array, - regmap_config, - irq_domain_ops, Signed-off-by: Krzysztof Kozlowski Signed-off-by: Lee Jones --- include/linux/mfd/da9055/core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/da9055/core.h b/include/linux/mfd/da9055/core.h index 956afa445998..5dc743fd63a6 100644 --- a/include/linux/mfd/da9055/core.h +++ b/include/linux/mfd/da9055/core.h @@ -89,6 +89,6 @@ static inline int da9055_reg_update(struct da9055 *da9055, unsigned char reg, int da9055_device_init(struct da9055 *da9055); void da9055_device_exit(struct da9055 *da9055); -extern struct regmap_config da9055_regmap_config; +extern const struct regmap_config da9055_regmap_config; #endif /* __DA9055_CORE_H */ -- cgit v1.2.3 From 5c188d748216f67c928d67a42f14b5569b6404a5 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 27 Apr 2015 10:18:14 -0700 Subject: mfd: twl4030-power: Fix pmic for boards that need AC charger disabled I noticed the PMIC configuration on 37xx-evm won't actually shut down the voltages during off-idle. Turns out 37xx-evm needs the AC charger state transitions disabled like we are doing for SDP and LDP in the legacy booting case. Let's fix this for device tree based booting by setting up the quirk flag based on the compatible flag. And let's also use the existing define for STARTON_CHG. Note that SDP and EVM do not have the PMIC clken wired to gate the oscillator while LDP has.
Signed-off-by: Tony Lindgren Signed-off-by: Lee Jones --- include/linux/i2c/twl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/i2c/twl.h b/include/linux/i2c/twl.h index 0bc03f100d04..9ad7828d9d34 100644 --- a/include/linux/i2c/twl.h +++ b/include/linux/i2c/twl.h @@ -675,6 +675,7 @@ struct twl4030_power_data { struct twl4030_resconfig *board_config; #define TWL4030_RESCONFIG_UNDEF ((u8)-1) bool use_poweroff; /* Board is wired for TWL poweroff */ + bool ac_charger_quirk; /* Disable AC charger on board */ }; extern int twl4030_remove_script(u8 flags); -- cgit v1.2.3 From 8be4efad81d814b607cbdad47176f426be83ba75 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Wed, 6 May 2015 19:07:16 +0200 Subject: mfd: max77686: Remove unused struct max77686_opmode_data The defined struct max77686_opmode_data is used neither by the max77686 mfd driver nor by the drivers for its sub-devices. Signed-off-by: Javier Martinez Canillas Reviewed-by: Chanwoo Choi Reviewed-by: Krzysztof Kozlowski Signed-off-by: Lee Jones --- include/linux/mfd/max77686.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/max77686.h b/include/linux/mfd/max77686.h index bb995ab9a575..d4b72d519115 100644 --- a/include/linux/mfd/max77686.h +++ b/include/linux/mfd/max77686.h @@ -125,9 +125,4 @@ enum max77686_opmode { MAX77686_OPMODE_STANDBY, }; -struct max77686_opmode_data { - int id; - int mode; -}; - #endif /* __LINUX_MFD_MAX77686_H */ -- cgit v1.2.3 From e6cb73410a6db70eab266f15b7e25053a45b842d Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 11 May 2015 13:58:09 +0100 Subject: mfd: arizona: Add better support for system suspend Allow the chip to completely power off if we enter runtime suspend and there is no jack detection active. This is helpful for systems where system suspend might remove the supplies to the CODEC, without informing us.
Note the powering off is done in runtime suspend rather than system suspend, because we need to hold reset until the first time DCVDD is powered anyway (which would be in runtime resume), and we might as well save the extra power. Signed-off-by: Charles Keepax Signed-off-by: Lee Jones --- include/linux/mfd/arizona/core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mfd/arizona/core.h b/include/linux/mfd/arizona/core.h index 16a498f48169..847bdaf47f1d 100644 --- a/include/linux/mfd/arizona/core.h +++ b/include/linux/mfd/arizona/core.h @@ -117,6 +117,7 @@ struct arizona { int num_core_supplies; struct regulator_bulk_data core_supplies[ARIZONA_MAX_CORE_SUPPLIES]; struct regulator *dcvdd; + bool has_fully_powered_off; struct arizona_pdata pdata; -- cgit v1.2.3 From fc027d138b79537e5353f3d3bad2bcaac787cd17 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Fri, 1 May 2015 16:15:12 +0100 Subject: mfd: arizona: Split INx_MODE into two fields Later arizona silicon has the single/differential selector in a different register, and IN1_MODE only selects between analogue or digital. Prepare for this by splitting the INx_MODE definition into two fields. 
Signed-off-by: Richard Fitzgerald Signed-off-by: Lee Jones --- include/linux/mfd/arizona/pdata.h | 5 ++++- include/linux/mfd/arizona/registers.h | 27 ++++++++++++++++++--------- 2 files changed, 22 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h index 1789cb0f4f17..f6722677e6d0 100644 --- a/include/linux/mfd/arizona/pdata.h +++ b/include/linux/mfd/arizona/pdata.h @@ -156,7 +156,10 @@ struct arizona_pdata { /** MICBIAS configurations */ struct arizona_micbias micbias[ARIZONA_MAX_MICBIAS]; - /** Mode of input structures */ + /** + * Mode of input structures + * One of the ARIZONA_INMODE_xxx values + */ int inmode[ARIZONA_MAX_INPUT]; /** Mode for outputs */ diff --git a/include/linux/mfd/arizona/registers.h b/include/linux/mfd/arizona/registers.h index aacc10d7789c..3499d36e6067 100644 --- a/include/linux/mfd/arizona/registers.h +++ b/include/linux/mfd/arizona/registers.h @@ -2515,9 +2515,12 @@ #define ARIZONA_IN1_DMIC_SUP_MASK 0x1800 /* IN1_DMIC_SUP - [12:11] */ #define ARIZONA_IN1_DMIC_SUP_SHIFT 11 /* IN1_DMIC_SUP - [12:11] */ #define ARIZONA_IN1_DMIC_SUP_WIDTH 2 /* IN1_DMIC_SUP - [12:11] */ -#define ARIZONA_IN1_MODE_MASK 0x0600 /* IN1_MODE - [10:9] */ -#define ARIZONA_IN1_MODE_SHIFT 9 /* IN1_MODE - [10:9] */ -#define ARIZONA_IN1_MODE_WIDTH 2 /* IN1_MODE - [10:9] */ +#define ARIZONA_IN1_MODE_MASK 0x0400 /* IN1_MODE - [10] */ +#define ARIZONA_IN1_MODE_SHIFT 10 /* IN1_MODE - [10] */ +#define ARIZONA_IN1_MODE_WIDTH 1 /* IN1_MODE - [10] */ +#define ARIZONA_IN1_SINGLE_ENDED_MASK 0x0200 /* IN1_MODE - [9] */ +#define ARIZONA_IN1_SINGLE_ENDED_SHIFT 9 /* IN1_MODE - [9] */ +#define ARIZONA_IN1_SINGLE_ENDED_WIDTH 1 /* IN1_MODE - [9] */ #define ARIZONA_IN1L_PGA_VOL_MASK 0x00FE /* IN1L_PGA_VOL - [7:1] */ #define ARIZONA_IN1L_PGA_VOL_SHIFT 1 /* IN1L_PGA_VOL - [7:1] */ #define ARIZONA_IN1L_PGA_VOL_WIDTH 7 /* IN1L_PGA_VOL - [7:1] */ @@ -2588,9 +2591,12 @@ #define ARIZONA_IN2_DMIC_SUP_MASK 
0x1800 /* IN2_DMIC_SUP - [12:11] */ #define ARIZONA_IN2_DMIC_SUP_SHIFT 11 /* IN2_DMIC_SUP - [12:11] */ #define ARIZONA_IN2_DMIC_SUP_WIDTH 2 /* IN2_DMIC_SUP - [12:11] */ -#define ARIZONA_IN2_MODE_MASK 0x0600 /* IN2_MODE - [10:9] */ -#define ARIZONA_IN2_MODE_SHIFT 9 /* IN2_MODE - [10:9] */ -#define ARIZONA_IN2_MODE_WIDTH 2 /* IN2_MODE - [10:9] */ +#define ARIZONA_IN2_MODE_MASK 0x0400 /* IN2_MODE - [10] */ +#define ARIZONA_IN2_MODE_SHIFT 10 /* IN2_MODE - [10] */ +#define ARIZONA_IN2_MODE_WIDTH 1 /* IN2_MODE - [10] */ +#define ARIZONA_IN2_SINGLE_ENDED_MASK 0x0200 /* IN2_MODE - [9] */ +#define ARIZONA_IN2_SINGLE_ENDED_SHIFT 9 /* IN2_MODE - [9] */ +#define ARIZONA_IN2_SINGLE_ENDED_WIDTH 1 /* IN2_MODE - [9] */ #define ARIZONA_IN2L_PGA_VOL_MASK 0x00FE /* IN2L_PGA_VOL - [7:1] */ #define ARIZONA_IN2L_PGA_VOL_SHIFT 1 /* IN2L_PGA_VOL - [7:1] */ #define ARIZONA_IN2L_PGA_VOL_WIDTH 7 /* IN2L_PGA_VOL - [7:1] */ @@ -2661,9 +2667,12 @@ #define ARIZONA_IN3_DMIC_SUP_MASK 0x1800 /* IN3_DMIC_SUP - [12:11] */ #define ARIZONA_IN3_DMIC_SUP_SHIFT 11 /* IN3_DMIC_SUP - [12:11] */ #define ARIZONA_IN3_DMIC_SUP_WIDTH 2 /* IN3_DMIC_SUP - [12:11] */ -#define ARIZONA_IN3_MODE_MASK 0x0600 /* IN3_MODE - [10:9] */ -#define ARIZONA_IN3_MODE_SHIFT 9 /* IN3_MODE - [10:9] */ -#define ARIZONA_IN3_MODE_WIDTH 2 /* IN3_MODE - [10:9] */ +#define ARIZONA_IN3_MODE_MASK 0x0400 /* IN3_MODE - [10] */ +#define ARIZONA_IN3_MODE_SHIFT 10 /* IN3_MODE - [10] */ +#define ARIZONA_IN3_MODE_WIDTH 1 /* IN3_MODE - [10] */ +#define ARIZONA_IN3_SINGLE_ENDED_MASK 0x0200 /* IN3_MODE - [9] */ +#define ARIZONA_IN3_SINGLE_ENDED_SHIFT 9 /* IN3_MODE - [9] */ +#define ARIZONA_IN3_SINGLE_ENDED_WIDTH 1 /* IN3_MODE - [9] */ #define ARIZONA_IN3L_PGA_VOL_MASK 0x00FE /* IN3L_PGA_VOL - [7:1] */ #define ARIZONA_IN3L_PGA_VOL_SHIFT 1 /* IN3L_PGA_VOL - [7:1] */ #define ARIZONA_IN3L_PGA_VOL_WIDTH 7 /* IN3L_PGA_VOL - [7:1] */ -- cgit v1.2.3 From 7e2d67e94ab05f70d2a73d00283f46778386ca52 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Wed, 13 
May 2015 16:52:25 +0100 Subject: mfd: arizona: Add stub for wm5102_patch() For the WM5102 there is a dependency in the core code on wm5102_patch() which only exists when CONFIG_MFD_WM5102 is defined. To avoid having to sprinkle #ifdefs around the code it is given an alternative empty stub version when CONFIG_MFD_WM5102 is deselected Signed-off-by: Richard Fitzgerald Signed-off-by: Lee Jones --- include/linux/mfd/arizona/core.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/arizona/core.h b/include/linux/mfd/arizona/core.h index 847bdaf47f1d..2f434f4f79a1 100644 --- a/include/linux/mfd/arizona/core.h +++ b/include/linux/mfd/arizona/core.h @@ -154,7 +154,15 @@ int arizona_request_irq(struct arizona *arizona, int irq, char *name, void arizona_free_irq(struct arizona *arizona, int irq, void *data); int arizona_set_irq_wake(struct arizona *arizona, int irq, int on); +#ifdef CONFIG_MFD_WM5102 int wm5102_patch(struct arizona *arizona); +#else +static inline int wm5102_patch(struct arizona *arizona) +{ + return 0; +} +#endif + int wm5110_patch(struct arizona *arizona); int wm8997_patch(struct arizona *arizona); -- cgit v1.2.3 From dfe816c5e37272f2f3c1311f0e9934e1b4229261 Mon Sep 17 00:00:00 2001 From: Pankaj Gupta Date: Fri, 19 Jun 2015 19:47:53 +0530 Subject: macvtap: Increase limit of macvtap queues Macvtap should be compatible with tuntap for maximum number of queues. commit 'baf71c5c1f80d82e92924050a60b5baaf97e3094 (tuntap: Increase the number of queues in tun.)' removes the limitations and increases number of queues in tuntap. Now, Its safe to increase number of queues in Macvtap as well. This patch also modifies 'macvtap_del_queues' function to avoid extra memory allocation in stack. Changes from v1->v2 : Michael S. Tsirkin, Jason Wang : Better way to use linked list to avoid use of extra memory in stack. Sergei Shtylyov : Specify dependent commit's summary. Signed-off-by: Pankaj Gupta Signed-off-by: David S. 
Miller --- include/linux/if_macvlan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 6f6929ea8a0c..a4ccc3122f93 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -29,7 +29,7 @@ struct macvtap_queue; * Maximum times a macvtap device can be opened. This can be used to * configure the number of receive queue, e.g. for multiqueue virtio. */ -#define MAX_MACVTAP_QUEUES 16 +#define MAX_MACVTAP_QUEUES 256 #define MACVLAN_MC_FILTER_BITS 8 #define MACVLAN_MC_FILTER_SZ (1 << MACVLAN_MC_FILTER_BITS) -- cgit v1.2.3 From 7d4f8d871ab15bd50a5771382ca2c9355b38d73c Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Mon, 22 Jun 2015 00:27:17 -0700 Subject: switchdev; add VLAN support for port's bridge_getlink One more missing piece of the puzzle. Add vlan dump support to switchdev port's bridge_getlink. iproute2 "bridge vlan show" cmd already knows how to show the vlans installed on the bridge and the device , but (until now) no one implemented the port vlan part of the netlink PF_BRIDGE:RTM_GETLINK msg. Before this patch, "bridge vlan show": $ bridge -c vlan show port vlan ids sw1p1 30-34 << bridge side vlans 57 sw1p1 << device side vlans (missing) sw1p2 57 sw1p2 sw1p3 sw1p4 br0 None (When the port is bridged, the output repeats the vlan list for the vlans on the bridge side of the port and the vlans on the device side of the port. The listing above show no vlans for the device side even though they are installed). After this patch: $ bridge -c vlan show port vlan ids sw1p1 30-34 << bridge side vlan 57 sw1p1 30-34 << device side vlans 57 3840 PVID sw1p2 57 sw1p2 57 3840 PVID sw1p3 3842 PVID sw1p4 3843 PVID br0 None I re-used ndo_dflt_bridge_getlink to add vlan fill call-back func. switchdev support adds an obj dump for VLAN objects, using the same call-back scheme as FDB dump. Support included for both compressed and un-compressed vlan dumps. 
Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index a2324fb45cf4..39adaa9529eb 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -114,5 +114,9 @@ extern int ndo_dflt_fdb_del(struct ndmsg *ndm, extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u16 mode, - u32 flags, u32 mask, int nlflags); + u32 flags, u32 mask, int nlflags, + u32 filter_mask, + int (*vlan_fill)(struct sk_buff *skb, + struct net_device *dev, + u32 filter_mask)); #endif /* __LINUX_RTNETLINK_H */ -- cgit v1.2.3 From cca0ba2df3d4000bacd9b7807d46ffafac62d53a Mon Sep 17 00:00:00 2001 From: Hyungwon Hwang Date: Thu, 28 May 2015 16:25:15 +0900 Subject: backlight: Change the return type of backlight_update_status() to int Backlight device returns the result of update_status(), but backlight_update_status() ignores it. So the consumers cannot confirm the result of their function call. This patch makes the result to be returned back for consumers. 
Signed-off-by: Hyungwon Hwang Acked-by: Jingoo Han Signed-off-by: Lee Jones --- include/linux/backlight.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backlight.h b/include/linux/backlight.h index adb14a8616df..1e7a69adbe6f 100644 --- a/include/linux/backlight.h +++ b/include/linux/backlight.h @@ -117,12 +117,16 @@ struct backlight_device { int use_count; }; -static inline void backlight_update_status(struct backlight_device *bd) +static inline int backlight_update_status(struct backlight_device *bd) { + int ret = -ENOENT; + mutex_lock(&bd->update_lock); if (bd->ops && bd->ops->update_status) - bd->ops->update_status(bd); + ret = bd->ops->update_status(bd); mutex_unlock(&bd->update_lock); + + return ret; } extern struct backlight_device *backlight_device_register(const char *name, -- cgit v1.2.3 From ce0bdb849ad46e4b4e4cae6913b447ae9938bdcf Mon Sep 17 00:00:00 2001 From: Inki Dae Date: Tue, 23 Jun 2015 16:06:44 +0900 Subject: of: fix a build error to of_graph_get_endpoint_by_regs function This patch fixes the below build error reported by Stephen, Stephen reported: After merging the drm-exynos tree, today's linux-next build (x86_64 allmodconfig) failed like this: drivers/media/i2c/adv7604.o: In function `of_graph_get_endpoint_by_regs': adv7604.c:(.text+0x586c): multiple definition of `of_graph_get_endpoint_by_regs' drivers/media/i2c/adv7343.o:adv7343.c:(.text+0xa13): first defined here drivers/media/platform/soc_camera/atmel-isi.o: In function `of_graph_get_endpoint_by_regs': atmel-isi.c:(.text+0x1ec9): multiple definition of `of_graph_get_endpoint_by_regs' drivers/media/platform/soc_camera/soc_camera.o:soc_camera.c:(.text+0x2ce3): first defined here drivers/media/platform/soc_camera/rcar_vin.o: In function `of_graph_get_endpoint_by_regs': rcar_vin.c:(.text+0x307c): multiple definition of `of_graph_get_endpoint_by_regs' drivers/media/platform/soc_camera/soc_camera.o:soc_camera.c:(.text+0x2ce3): 
first defined here Caused by commit: a0f7001c18ca ("of: add helper for getting endpoint node of specific identifiers") To fix the error, this patch declares of_graph_get_endpoint_by_regs function with "static inline". Signed-off-by: Inki Dae Signed-off-by: Dave Airlie --- include/linux/of_graph.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of_graph.h b/include/linux/of_graph.h index 1c1d5b901a72..f8bcd0e21a26 100644 --- a/include/linux/of_graph.h +++ b/include/linux/of_graph.h @@ -71,7 +71,7 @@ static inline struct device_node *of_graph_get_next_endpoint( return NULL; } -struct device_node *of_graph_get_endpoint_by_regs( +static inline struct device_node *of_graph_get_endpoint_by_regs( const struct device_node *parent, int port_reg, int reg) { return NULL; -- cgit v1.2.3 From 0eeb075fad736fb92620af995c47c204bbb5e829 Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Tue, 23 Jun 2015 13:45:37 -0400 Subject: net: ipv4 sysctl option to ignore routes when nexthop link is down This feature is only enabled with the new per-interface or ipv4 global sysctls called 'ignore_routes_with_linkdown'. net.ipv4.conf.all.ignore_routes_with_linkdown = 0 net.ipv4.conf.default.ignore_routes_with_linkdown = 0 net.ipv4.conf.lo.ignore_routes_with_linkdown = 0 ... When the above sysctls are set, will report to userspace that a route is dead and will no longer resolve to this nexthop when performing a fib lookup. This will signal to userspace that the route will not be selected. The signalling of a RTNH_F_DEAD is only passed to userspace if the sysctl is enabled and link is down. This was done as without it the netlink listeners would have no idea whether or not a nexthop would be selected. The kernel only sets RTNH_F_DEAD internally if the interface has IFF_UP cleared. 
With the new sysctl set, the following behavior can be observed (interface p8p1 is link-down): default via 10.0.5.2 dev p9p1 10.0.5.0/24 dev p9p1 proto kernel scope link src 10.0.5.15 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 dead linkdown 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 1 dead linkdown 90.0.0.0/24 via 70.0.0.2 dev p7p1 metric 2 90.0.0.1 via 70.0.0.2 dev p7p1 src 70.0.0.1 cache local 80.0.0.1 dev lo src 80.0.0.1 cache 80.0.0.2 via 10.0.5.2 dev p9p1 src 10.0.5.15 cache While the route does remain in the table (so it can be modified if needed rather than being wiped away as it would be if IFF_UP was cleared), the proper next-hop is chosen automatically when the link is down. Now interface p8p1 is linked-up: default via 10.0.5.2 dev p9p1 10.0.5.0/24 dev p9p1 proto kernel scope link src 10.0.5.15 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 1 90.0.0.0/24 via 70.0.0.2 dev p7p1 metric 2 192.168.56.0/24 dev p2p1 proto kernel scope link src 192.168.56.2 90.0.0.1 via 80.0.0.2 dev p8p1 src 80.0.0.1 cache local 80.0.0.1 dev lo src 80.0.0.1 cache 80.0.0.2 dev p8p1 src 80.0.0.1 cache and the output changes to what one would expect. If the sysctl is not set, the following output would be expected when p8p1 is down: default via 10.0.5.2 dev p9p1 10.0.5.0/24 dev p9p1 proto kernel scope link src 10.0.5.15 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 linkdown 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 1 linkdown 90.0.0.0/24 via 70.0.0.2 dev p7p1 metric 2 Since the dead flag does not appear, there should be no expectation that the kernel would skip using this route due to link being down. v2: Split kernel changes into 2 patches, this actually makes a behavioral change if the sysctl is set. 
Also took suggestion from Alex to simplify code by only checking sysctl during fib lookup and suggestion from Scott to add a per-interface sysctl. v3: Code clean-ups to make it more readable and efficient as well as a reverse path check fix. v4: Drop binary sysctl v5: Whitespace fixups from Dave v6: Style changes from Dave and checkpatch suggestions v7: One more checkpatch fixup Signed-off-by: Andy Gospodarek Signed-off-by: Dinesh Dutt Acked-by: Scott Feldman Signed-off-by: David S. Miller --- include/linux/inetdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 0a21fbefdfbe..a4328cea376a 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -120,6 +120,9 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev) || (!IN_DEV_FORWARD(in_dev) && \ IN_DEV_ORCONF((in_dev), ACCEPT_REDIRECTS))) +#define IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) \ + IN_DEV_CONF_GET((in_dev), IGNORE_ROUTES_WITH_LINKDOWN) + #define IN_DEV_ARPFILTER(in_dev) IN_DEV_ORCONF((in_dev), ARPFILTER) #define IN_DEV_ARP_ACCEPT(in_dev) IN_DEV_ORCONF((in_dev), ARP_ACCEPT) #define IN_DEV_ARP_ANNOUNCE(in_dev) IN_DEV_MAXCONF((in_dev), ARP_ANNOUNCE) -- cgit v1.2.3 From 9200025724619d83f9fc366281f0bde36afe6e5a Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Fri, 12 Jun 2015 10:04:10 +0800 Subject: rtc: Introduce rtc_tm_sub() helper function There're many sites need comparing the two rtc_time variants for many rtc drivers, especially in the instances of rtc_class_ops::set_alarm(). So add this common helper function to make things easy. 
Suggested-by: Arnd Bergmann Signed-off-by: Xunlei Pang Signed-off-by: Alexandre Belloni --- include/linux/rtc.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 587017e7939c..b36160321458 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -24,6 +24,14 @@ extern void rtc_time64_to_tm(time64_t time, struct rtc_time *tm); ktime_t rtc_tm_to_ktime(struct rtc_time tm); struct rtc_time rtc_ktime_to_tm(ktime_t kt); +/* + * rtc_tm_sub - Return the difference in seconds. + */ +static inline time64_t rtc_tm_sub(struct rtc_time *lhs, struct rtc_time *rhs) +{ + return rtc_tm_to_time64(lhs) - rtc_tm_to_time64(rhs); +} + /** * Deprecated. Use rtc_time64_to_tm(). */ -- cgit v1.2.3 From c86a6c28957a9e8e9a71582a32e96971ad411ffe Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Fri, 12 Jun 2015 11:10:18 +0800 Subject: rtc: interface: Remove rtc_set_mmss() Now rtc_set_mmss() has no users, just remove it. We still have rtc_set_time() doing similar things. 
Signed-off-by: Xunlei Pang Signed-off-by: Alexandre Belloni --- include/linux/rtc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index b36160321458..3359f0422c6b 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -168,7 +168,6 @@ extern void devm_rtc_device_unregister(struct device *dev, extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm); extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm); -extern int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs); extern int rtc_set_ntp_time(struct timespec64 now); int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm); extern int rtc_read_alarm(struct rtc_device *rtc, -- cgit v1.2.3 From c3cddc4c296c3a1498df7181015cbcea178a0546 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 24 Jun 2015 16:54:45 -0700 Subject: fsnotify: remove obsolete documentation should_send_event is no longer part of struct fsnotify_ops, so remove it. Signed-off-by: Nikolay Borisov Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fsnotify_backend.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 0f313f93c586..65a517dd32f7 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -84,8 +84,6 @@ struct fsnotify_fname; * Each group much define these ops. The fsnotify infrastructure will call * these operations for each relevant group. * - * should_send_event - given a group, inode, and mask this function determines - * if the group is interested in this event. * handle_event - main call for a group to handle an fs event * free_group_priv - called when a group refcnt hits 0 to clean up the private union * freeing_mark - called when a mark is being destroyed for some reason. 
The group -- cgit v1.2.3 From 5286d20c4eb7b0c29217f8756652609df74f5489 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 24 Jun 2015 16:54:51 -0700 Subject: configfs: unexport/make static config_item_init() config_item_init() is only used in item.c Signed-off-by: Fabian Frederick Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/configfs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/configfs.h b/include/linux/configfs.h index 34025df61829..c9e5c57e4edf 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -71,7 +71,6 @@ static inline char *config_item_name(struct config_item * item) return item->ci_name; } -extern void config_item_init(struct config_item *); extern void config_item_init_type_name(struct config_item *item, const char *name, struct config_item_type *type); -- cgit v1.2.3 From b5242e98c1cb834feb1e84026f09a4796b49eb4d Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Wed, 24 Jun 2015 16:55:42 -0700 Subject: smpboot: allow excluding cpus from the smpboot threads This patch series allows the watchdog to run by default only on the housekeeping cores when nohz_full is in effect; this seems to be a good compromise short of turning it off completely (since the nohz_full cores can't tolerate a watchdog). To provide customizability, we add /proc/sys/kernel/watchdog_cpumask so that the set of cores running the watchdog can be tuned to different values after bootup. To implement this customizability, we add a new smpboot_update_cpumask_percpu_thread() API to the smpboot_thread subsystem that lets us park or unpark "unwanted" threads. And now that threads can be parked for long periods of time, we tweak the /proc/<pid>/stat and /proc/<pid>/status code so parked threads aren't reported as running, which is otherwise confusing. This patch (of 3): This change allows some cores to be excluded from running the smp_hotplug_thread tasks.
The following commit to update kernel/watchdog.c to use this functionality is the motivating example, and more information on the motivation is provided there. A new smp_hotplug_thread field is introduced, "cpumask", which is cpumask field managed by the smpboot subsystem that indicates whether or not the given smp_hotplug_thread should run on that core; the cpumask is checked when deciding whether to unpark the thread. To limit the cpumask to less than cpu_possible, you must call smpboot_update_cpumask_percpu_thread() after registering. Signed-off-by: Chris Metcalf Cc: Don Zickus Cc: Ingo Molnar Cc: Ulrich Obergfell Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smpboot.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h index d600afb21926..da3c593f9845 100644 --- a/include/linux/smpboot.h +++ b/include/linux/smpboot.h @@ -27,6 +27,8 @@ struct smpboot_thread_data; * @pre_unpark: Optional unpark function, called before the thread is * unparked (cpu online). This is not guaranteed to be * called on the target cpu of the thread. Careful! + * @cpumask: Internal state. To update which threads are unparked, + * call smpboot_update_cpumask_percpu_thread(). * @selfparking: Thread is not parked by the park function. 
* @thread_comm: The base name of the thread */ @@ -41,11 +43,14 @@ struct smp_hotplug_thread { void (*park)(unsigned int cpu); void (*unpark)(unsigned int cpu); void (*pre_unpark)(unsigned int cpu); + cpumask_var_t cpumask; bool selfparking; const char *thread_comm; }; int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread); void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); +int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, + const struct cpumask *); #endif -- cgit v1.2.3 From fe4ba3c34352b7e8068b7f18eb233444aed17011 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Wed, 24 Jun 2015 16:55:45 -0700 Subject: watchdog: add watchdog_cpumask sysctl to assist nohz Change the default behavior of watchdog so it only runs on the housekeeping cores when nohz_full is enabled at build and boot time. Allow modifying the set of cores the watchdog is currently running on with a new kernel.watchdog_cpumask sysctl. In the current system, the watchdog subsystem runs a periodic timer that schedules the watchdog kthread to run. However, nohz_full cores are designed to allow userspace application code running on those cores to have 100% access to the CPU. So the watchdog system prevents the nohz_full application code from being able to run the way it wants to, thus the motivation to suppress the watchdog on nohz_full cores, which this patchset provides by default. However, if we disable the watchdog globally, then the housekeeping cores can't benefit from the watchdog functionality. So we allow disabling it only on some cores. See Documentation/lockup-watchdogs.txt for more information. 
[jhubbard@nvidia.com: fix a watchdog crash in some configurations] Signed-off-by: Chris Metcalf Acked-by: Don Zickus Cc: Ingo Molnar Cc: Ulrich Obergfell Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nmi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 3d46fb4708e0..f94da0e65dea 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -67,6 +67,7 @@ extern int nmi_watchdog_enabled; extern int soft_watchdog_enabled; extern int watchdog_user_enabled; extern int watchdog_thresh; +extern unsigned long *watchdog_cpumask_bits; extern int sysctl_softlockup_all_cpu_backtrace; struct ctl_table; extern int proc_watchdog(struct ctl_table *, int , @@ -77,6 +78,8 @@ extern int proc_soft_watchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); extern int proc_watchdog_thresh(struct ctl_table *, int , void __user *, size_t *, loff_t *); +extern int proc_watchdog_cpumask(struct ctl_table *, int, + void __user *, size_t *, loff_t *); #endif #ifdef CONFIG_HAVE_ACPI_APEI_NMI -- cgit v1.2.3 From 4066c33d0308f87e9a3b0c7fafb9141c0bfbfa77 Mon Sep 17 00:00:00 2001 From: Gavin Guo Date: Wed, 24 Jun 2015 16:55:54 -0700 Subject: mm/slab_common: support the slub_debug boot option on specific object size The slub_debug=PU,kmalloc-xx cannot work because in the create_kmalloc_caches() the s->name is created after the create_kmalloc_cache() is called. The name is NULL in the create_kmalloc_cache() so the kmem_cache_flags() would not set the slub_debug flags to the s->flags. The fix here set up a kmalloc_names string array for the initialization purpose and delete the dynamic name creation of kmalloc_caches. 
[akpm@linux-foundation.org: s/kmalloc_names/kmalloc_info/, tweak comment text] Signed-off-by: Gavin Guo Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index ffd24c830151..96f0ea506b5c 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -153,8 +153,30 @@ size_t ksize(const void *); #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN #define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN #define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN) +/* + * The KMALLOC_LOOP_LOW is the definition for the for loop index start number + * to create the kmalloc_caches object in create_kmalloc_caches(). The first + * and the second are 96 and 192. You can see that in the kmalloc_index(), if + * the KMALLOC_MIN_SIZE <= 32, then return 1 (96). If KMALLOC_MIN_SIZE <= 64, + * then return 2 (192). If the KMALLOC_MIN_SIZE is bigger than 64, we don't + * need to initialize 96 and 192. Go directly to start the KMALLOC_SHIFT_LOW. + */ +#if KMALLOC_MIN_SIZE <= 32 +#define KMALLOC_LOOP_LOW 1 +#elif KMALLOC_MIN_SIZE <= 64 +#define KMALLOC_LOOP_LOW 2 +#else +#define KMALLOC_LOOP_LOW KMALLOC_SHIFT_LOW +#endif + #else #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) +/* + * The KMALLOC_MIN_SIZE of slub/slab/slob is 2^3/2^5/2^3. So, even slab is used. + * The KMALLOC_MIN_SIZE <= 32. The kmalloc-96 and kmalloc-192 should also be + * initialized. + */ +#define KMALLOC_LOOP_LOW 1 #endif /* -- cgit v1.2.3 From 1ed58b6051b67e5cfe9e465fb60bf7d5f55e0a64 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 24 Jun 2015 16:55:59 -0700 Subject: linux/slab.h: fix three off-by-one typos in comment The first is a keyboard-off-by-one, the other two the ordinary mathy kind. 
Signed-off-by: Rasmus Villemoes Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 96f0ea506b5c..9de2fdc8b5e4 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -262,8 +262,8 @@ extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; * belongs to. * 0 = zero alloc * 1 = 65 .. 96 bytes - * 2 = 120 .. 192 bytes - * n = 2^(n-1) .. 2^n -1 + * 2 = 129 .. 192 bytes + * n = 2^(n-1)+1 .. 2^n */ static __always_inline int kmalloc_index(size_t size) { -- cgit v1.2.3 From 2ae416b142b625c58c9ccb039aa3ef48ad0e9bae Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Wed, 24 Jun 2015 16:56:16 -0700 Subject: mm: new mm hook framework CRIU is recreating the process memory layout by remapping the checkpointee memory area on top of the current process (criu). This includes remapping the vDSO to the place it has at checkpoint time. However some architectures like powerpc are keeping a reference to the vDSO base address to build the signal return stack frame by calling the vDSO sigreturn service. So once the vDSO has been moved, this reference is no longer valid and the signal frames built later are not usable. This patch series introduces a new mm hook framework, and a new arch_remap hook which is called when mremap is done and the mm lock is still held. The next patch is adding the vDSO remap and unmap tracking to the powerpc architecture. This patch (of 3): This patch introduces a new set of header files to manage mm hooks: - per architecture empty header file (arch/x/include/asm/mm-arch-hooks.h) - a generic header (include/linux/mm-arch-hooks.h) An architecture which needs to override a hook has to redefine it in its header file, while an architecture which doesn't need to has nothing to do.
The default hooks are defined in the generic header and are used when the architecture does not define them. In a next step, mm hooks defined in include/asm-generic/mm_hooks.h should be moved here. Signed-off-by: Laurent Dufour Suggested-by: Andrew Morton Cc: "Kirill A. Shutemov" Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Cc: Pavel Emelyanov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm-arch-hooks.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 include/linux/mm-arch-hooks.h (limited to 'include/linux') diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h new file mode 100644 index 000000000000..63005e367abd --- /dev/null +++ b/include/linux/mm-arch-hooks.h @@ -0,0 +1,16 @@ +/* + * Generic mm no-op hooks. + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef _LINUX_MM_ARCH_HOOKS_H +#define _LINUX_MM_ARCH_HOOKS_H + +#include <asm/mm-arch-hooks.h> + +#endif /* _LINUX_MM_ARCH_HOOKS_H */ -- cgit v1.2.3 From 4abad2ca4a4dbdd4a218c12451231ab628f2e60c Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Wed, 24 Jun 2015 16:56:19 -0700 Subject: mm: new arch_remap() hook Some architectures would like to be triggered when a memory area is moved through the mremap system call. This patch introduces a new arch_remap() mm hook which is placed in the path of mremap, and is called before the old area is unmapped (and the arch_unmap() hook is called). Signed-off-by: Laurent Dufour Cc: "Kirill A.
Shutemov" Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Cc: Pavel Emelyanov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm-arch-hooks.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h index 63005e367abd..4efc3f56e6df 100644 --- a/include/linux/mm-arch-hooks.h +++ b/include/linux/mm-arch-hooks.h @@ -13,4 +13,13 @@ #include <asm/mm-arch-hooks.h> +#ifndef arch_remap +static inline void arch_remap(struct mm_struct *mm, + unsigned long old_start, unsigned long old_end, + unsigned long new_start, unsigned long new_end) +{ +} +#define arch_remap arch_remap +#endif + #endif /* _LINUX_MM_ARCH_HOOKS_H */ -- cgit v1.2.3 From a9919c79359df9a706ff9e14fc0a93cd15791c9b Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 24 Jun 2015 16:56:28 -0700 Subject: mm: only define hashdist variable when needed For !CONFIG_NUMA, hashdist will always be 0, since its setter is otherwise compiled out. So we can save 4 bytes of data and some .text (although mostly in __init functions) by only defining it for CONFIG_NUMA. Signed-off-by: Rasmus Villemoes Acked-by: David Rientjes Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 0995c2de8162..f589222bfa87 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -357,12 +357,12 @@ extern void *alloc_large_system_hash(const char *tablename, /* Only NUMA needs hash distribution. 64bit NUMA architectures have * sufficient vmalloc space.
*/ -#if defined(CONFIG_NUMA) && defined(CONFIG_64BIT) -#define HASHDIST_DEFAULT 1 +#ifdef CONFIG_NUMA +#define HASHDIST_DEFAULT IS_ENABLED(CONFIG_64BIT) +extern int hashdist; /* Distribute hashes across NUMA nodes? */ #else -#define HASHDIST_DEFAULT 0 +#define hashdist (0) #endif -extern int hashdist; /* Distribute hashes across NUMA nodes? */ #endif /* _LINUX_BOOTMEM_H */ -- cgit v1.2.3 From c761471b58e6138938ebc6eafec20b2f60cb3397 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 24 Jun 2015 16:56:33 -0700 Subject: mm: avoid tail page refcounting on non-THP compound pages Reintroduce 8d63d99a5dfb ("mm: avoid tail page refcounting on non-THP compound pages") after removing bogus VM_BUG_ON_PAGE() in put_unrefcounted_compound_page(). THP uses tail page refcounting to be able to split huge pages at any time. Tail page refcounting is not needed for other users of compound pages and it's harmful because of overhead. We try to exclude non-THP pages from tail page refcounting using __compound_tail_refcounted() check. It excludes most common non-THP compound pages: SL*B and hugetlb, but it doesn't catch rest of __GFP_COMP users -- drivers. And it's not only about overhead. Drivers might want to use compound pages to get refcounting semantics suitable for mapping high-order pages to userspace. But tail page refcounting breaks it. Tail page refcounting uses ->_mapcount in tail pages to store GUP pins on them. It means GUP pins would affect page_mapcount() for tail pages. It's not a problem for THP, because it never maps tail pages. But unlike THP, drivers map parts of compound pages with PTEs and it makes page_mapcount() be called for tail pages. In particular, GUP pins would shift PSS up and affect /proc/kpagecount for such pages. But, I'm not aware about anything which can lead to crash or other serious misbehaviour. 
Since currently all THP pages are anonymous and all driver pages are not, we can fix the __compound_tail_refcounted() check by requiring PageAnon() to enable tail page refcounting. Signed-off-by: Kirill A. Shutemov Acked-by: Hugh Dickins Reviewed-by: Andrea Arcangeli Reported-by: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0755b9fd03a7..8b086070c3a5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -499,7 +499,7 @@ static inline int page_count(struct page *page) static inline bool __compound_tail_refcounted(struct page *page) { - return !PageSlab(page) && !PageHeadHuge(page); + return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page); } /* -- cgit v1.2.3 From ead07f6a867b5b1b41cf703735e8b39094987a7d Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 24 Jun 2015 16:56:48 -0700 Subject: mm/memory-failure: introduce get_hwpoison_page() for consistent refcount handling memory_failure() can run in 2 different modes (specified by MF_COUNT_INCREASED) from a page refcount perspective. When MF_COUNT_INCREASED is set, memory_failure() assumes that the caller takes a refcount of the target page. And if cleared, memory_failure() takes it on its own. In current code, however, refcounting is done differently in each caller. For example, madvise_hwpoison() uses get_user_pages_fast() and hwpoison_inject() uses get_page_unless_zero(). So this inconsistent refcounting causes refcount failure especially for thp tail pages. Typical user visible effects are like memory leak or VM_BUG_ON_PAGE(!page_count(page)) in isolate_lru_page(). To fix this refcounting issue, this patch introduces get_hwpoison_page() to handle thp tail pages in the same manner for each caller of hwpoison code.
memory_failure() might fail to split thp and in such a case it returns without completing page isolation. This is not good because PageHWPoison on the thp is still set and there's no easy way to unpoison such thps. So this patch tries to roll back any action to the thp in "non anonymous thp" case and "thp split failed" case, expecting an MCE(SRAR) generated by later access afterward will properly free such thps. [akpm@linux-foundation.org: fix CONFIG_HWPOISON_INJECT=m] Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8b086070c3a5..cd9df66138a1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2146,6 +2146,7 @@ enum mf_flags { extern int memory_failure(unsigned long pfn, int trapno, int flags); extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); extern int unpoison_memory(unsigned long pfn); +extern int get_hwpoison_page(struct page *page); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p, int access); -- cgit v1.2.3 From 16e951966f05da5ccd650104176f6ba289f7fa20 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 24 Jun 2015 16:57:07 -0700 Subject: mm: oom_kill: clean up victim marking and exiting interfaces Rename unmark_oom_victim() to exit_oom_victim(). Marking and unmarking are related in functionality, but the interface is not symmetrical at all: one is an internal OOM killer function used during the killing, the other is for an OOM victim to signal its own death on exit later on. This has locking implications, see follow-up changes. While at it, rename mark_tsk_oom_victim() to mark_oom_victim(), which is easier on the eye.
Signed-off-by: Johannes Weiner Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Tetsuo Handa Cc: Andrea Arcangeli Cc: Dave Chinner Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 44b2f6f7bbd8..a8e6a498cbcb 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -47,9 +47,7 @@ static inline bool oom_task_origin(const struct task_struct *p) return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN); } -extern void mark_tsk_oom_victim(struct task_struct *tsk); - -extern void unmark_oom_victim(void); +extern void mark_oom_victim(struct task_struct *tsk); extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, @@ -75,6 +73,9 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *mask, bool force_kill); + +extern void exit_oom_victim(void); + extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); -- cgit v1.2.3 From dc56401fc9f25e8f93899991ec858c98a331d88c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 24 Jun 2015 16:57:19 -0700 Subject: mm: oom_kill: simplify OOM killer locking The zonelist locking and the oom_sem are two overlapping locks that are used to serialize global OOM killing against different things. The historical zonelist locking serializes OOM kills from allocations with overlapping zonelists against each other to prevent killing more tasks than necessary in the same memory domain. Only when neither tasklists nor zonelists from two concurrent OOM kills overlap (tasks in separate memcgs bound to separate nodes) are OOM kills allowed to execute in parallel. 
The younger oom_sem is a read-write lock to serialize OOM killing against the PM code trying to disable the OOM killer altogether. However, the OOM killer is a fairly cold error path, there is really no reason to optimize for highly performant and concurrent OOM kills. And the oom_sem is just flat-out redundant. Replace both locking schemes with a single global mutex serializing OOM kills regardless of context. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: David Rientjes Cc: Tetsuo Handa Cc: Andrea Arcangeli Cc: Dave Chinner Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index a8e6a498cbcb..7deecb7bca5e 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -32,6 +32,8 @@ enum oom_scan_t { /* Thread is the potential origin of an oom condition; kill first on oom */ #define OOM_FLAG_ORIGIN ((__force oom_flags_t)0x1) +extern struct mutex oom_lock; + static inline void set_current_oom_origin(void) { current->signal->oom_flags |= OOM_FLAG_ORIGIN; @@ -60,9 +62,6 @@ extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct mem_cgroup *memcg, nodemask_t *nodemask, const char *message); -extern bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_flags); -extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags); - extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, int order, const nodemask_t *nodemask, struct mem_cgroup *memcg); -- cgit v1.2.3 From cc637b1704d78b068c2eb700eec384c69ea56cdf Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Wed, 24 Jun 2015 16:57:30 -0700 Subject: memory-failure: export page_type and action result Export 'outcome' and 'action_page_type' to mm.h, so we can use these enums outside.
This patch is preparation for adding trace events for memory-failure recovery action. Signed-off-by: Xie XiuQi Acked-by: Naoya Horiguchi Cc: Chen Gong Cc: Jim Davis Cc: Steven Rostedt Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index cd9df66138a1..986a9a221ee0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2153,6 +2153,40 @@ extern void shake_page(struct page *p, int access); extern atomic_long_t num_poisoned_pages; extern int soft_offline_page(struct page *page, int flags); + +/* + * Error handlers for various types of pages. + */ +enum mf_outcome { + MF_IGNORED, /* Error: cannot be handled */ + MF_FAILED, /* Error: handling failed */ + MF_DELAYED, /* Will be handled later */ + MF_RECOVERED, /* Successfully recovered */ +}; + +enum mf_action_page_type { + MF_MSG_KERNEL, + MF_MSG_KERNEL_HIGH_ORDER, + MF_MSG_SLAB, + MF_MSG_DIFFERENT_COMPOUND, + MF_MSG_POISONED_HUGE, + MF_MSG_HUGE, + MF_MSG_FREE_HUGE, + MF_MSG_UNMAP_FAILED, + MF_MSG_DIRTY_SWAPCACHE, + MF_MSG_CLEAN_SWAPCACHE, + MF_MSG_DIRTY_MLOCKED_LRU, + MF_MSG_CLEAN_MLOCKED_LRU, + MF_MSG_DIRTY_UNEVICTABLE_LRU, + MF_MSG_CLEAN_UNEVICTABLE_LRU, + MF_MSG_DIRTY_LRU, + MF_MSG_CLEAN_LRU, + MF_MSG_TRUNCATED_LRU, + MF_MSG_BUDDY, + MF_MSG_BUDDY_2ND, + MF_MSG_UNKNOWN, +}; + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) extern void clear_huge_page(struct page *page, unsigned long addr, -- cgit v1.2.3 From cc3e2af42e7b7e0457b93bf17c19b44c635cd40c Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Wed, 24 Jun 2015 16:57:33 -0700 Subject: memory-failure: change type of action_result's param 3 to enum Change type of action_result's param 3 to enum for type consistency, and rename mf_outcome to mf_result for clarity.
Signed-off-by: Xie XiuQi Acked-by: Naoya Horiguchi Cc: Chen Gong Cc: Jim Davis Cc: Steven Rostedt Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 986a9a221ee0..24ad583596d1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2157,7 +2157,7 @@ extern int soft_offline_page(struct page *page, int flags); /* * Error handlers for various types of pages. */ -enum mf_outcome { +enum mf_result { MF_IGNORED, /* Error: cannot be handled */ MF_FAILED, /* Error: handling failed */ MF_DELAYED, /* Will be handled later */ -- cgit v1.2.3 From 8809aa2d28d74111ff2f1928edaa4e9845c97a7d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 24 Jun 2015 16:57:44 -0700 Subject: mm: clarify that the function operates on hugepage pte We have confusing functions to clear pmd, pmd_clear_* and pmd_clear. Add _huge_ to pmdp_clear functions so that we are clear that they operate on hugepage pte. We don't bother about other functions like pmdp_set_wrprotect, pmdp_clear_flush_young, because they operate on PTE bits and hence indicate they are operating on hugepage ptes Signed-off-by: Aneesh Kumar K.V Acked-by: Kirill A. 
Shutemov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Andrea Arcangeli Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 95243d28a0ee..61cd67f4d788 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -324,25 +324,25 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) ___pte; \ }) -#define pmdp_clear_flush_notify(__vma, __haddr, __pmd) \ +#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd) \ ({ \ unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ struct mm_struct *___mm = (__vma)->vm_mm; \ pmd_t ___pmd; \ \ - ___pmd = pmdp_clear_flush(__vma, __haddr, __pmd); \ + ___pmd = pmdp_huge_clear_flush(__vma, __haddr, __pmd); \ mmu_notifier_invalidate_range(___mm, ___haddr, \ ___haddr + HPAGE_PMD_SIZE); \ \ ___pmd; \ }) -#define pmdp_get_and_clear_notify(__mm, __haddr, __pmd) \ +#define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd) \ ({ \ unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ pmd_t ___pmd; \ \ - ___pmd = pmdp_get_and_clear(__mm, __haddr, __pmd); \ + ___pmd = pmdp_huge_get_and_clear(__mm, __haddr, __pmd); \ mmu_notifier_invalidate_range(__mm, ___haddr, \ ___haddr + HPAGE_PMD_SIZE); \ \ @@ -428,8 +428,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_flush_notify ptep_clear_flush -#define pmdp_clear_flush_notify pmdp_clear_flush -#define pmdp_get_and_clear_notify pmdp_get_and_clear +#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush +#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear #define set_pte_at_notify set_pte_at #endif /* CONFIG_MMU_NOTIFIER */ -- cgit 
v1.2.3 From fc6daaf93151877748f8096af6b3fddb147f22d6 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 24 Jun 2015 16:58:09 -0700 Subject: mm/memblock: add extra "flags" to memblock to allow selection of memory based on attribute Some high end Intel Xeon systems report uncorrectable memory errors as a recoverable machine check. Linux has included code for some time to process these and just signal the affected processes (or even recover completely if the error was in a read only page that can be replaced by reading from disk). But we have no recovery path for errors encountered during kernel code execution. Except for some very specific cases we are unlikely to ever be able to recover. Enter memory mirroring. Actually the 3rd generation of memory mirroring. Gen1: All memory is mirrored Pro: No s/w enabling - h/w just gets good data from other side of the mirror Con: Halves effective memory capacity available to OS/applications Gen2: Partial memory mirror - just mirror memory behind some memory controllers Pro: Keep more of the capacity Con: Nightmare to enable. Have to choose between allocating from mirrored memory for safety vs. NUMA local memory for performance Gen3: Address range partial memory mirror - some mirror on each memory controller Pro: Can tune the amount of mirror and keep NUMA performance Con: I have to write memory management code to implement The current plan is just to use mirrored memory for kernel allocations. This has been broken into two phases: 1) This patch series - find the mirrored memory, use it for boot time allocations 2) Wade into mm/page_alloc.c and define a ZONE_MIRROR to pick up the unused mirrored memory from mm/memblock.c and only give it out to select kernel allocations (this is still being scoped because page_alloc.c is scary). This patch (of 3): Add extra "flags" to memblock to allow selection of memory based on attribute.
No functional changes Signed-off-by: Tony Luck Cc: Xishi Qiu Cc: Hanjun Guo Cc: Xiexiuqi Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Yinghai Lu Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 9497ec7c77ea..7aeec0cb4c27 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -21,7 +21,10 @@ #define INIT_PHYSMEM_REGIONS 4 /* Definition of memblock flags. */ -#define MEMBLOCK_HOTPLUG 0x1 /* hotpluggable region */ +enum { + MEMBLOCK_NONE = 0x0, /* No special request */ + MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */ +}; struct memblock_region { phys_addr_t base; @@ -61,7 +64,7 @@ extern bool movable_node_enabled; phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end, - int nid); + int nid, ulong flags); phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align); phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); @@ -85,11 +88,13 @@ int memblock_remove_range(struct memblock_type *type, phys_addr_t base, phys_addr_t size); -void __next_mem_range(u64 *idx, int nid, struct memblock_type *type_a, +void __next_mem_range(u64 *idx, int nid, ulong flags, + struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, phys_addr_t *out_end, int *out_nid); -void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a, +void __next_mem_range_rev(u64 *idx, int nid, ulong flags, + struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, phys_addr_t *out_end, int *out_nid); @@ -100,16 +105,17 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a, * @type_a: ptr to memblock_type to 
iterate * @type_b: ptr to memblock_type which excludes from the iteration * @nid: node selector, %NUMA_NO_NODE for all nodes + * @flags: pick from blocks based on memory attributes * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL */ -#define for_each_mem_range(i, type_a, type_b, nid, \ +#define for_each_mem_range(i, type_a, type_b, nid, flags, \ p_start, p_end, p_nid) \ - for (i = 0, __next_mem_range(&i, nid, type_a, type_b, \ + for (i = 0, __next_mem_range(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid); \ i != (u64)ULLONG_MAX; \ - __next_mem_range(&i, nid, type_a, type_b, \ + __next_mem_range(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid)) /** @@ -119,17 +125,18 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a, * @type_a: ptr to memblock_type to iterate * @type_b: ptr to memblock_type which excludes from the iteration * @nid: node selector, %NUMA_NO_NODE for all nodes + * @flags: pick from blocks based on memory attributes * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL */ -#define for_each_mem_range_rev(i, type_a, type_b, nid, \ +#define for_each_mem_range_rev(i, type_a, type_b, nid, flags, \ p_start, p_end, p_nid) \ for (i = (u64)ULLONG_MAX, \ - __next_mem_range_rev(&i, nid, type_a, type_b, \ + __next_mem_range_rev(&i, nid, flags, type_a, type_b,\ p_start, p_end, p_nid); \ i != (u64)ULLONG_MAX; \ - __next_mem_range_rev(&i, nid, type_a, type_b, \ + __next_mem_range_rev(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid)) #ifdef CONFIG_MOVABLE_NODE @@ -181,13 +188,14 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, * @p_start: ptr to phys_addr_t for start address of the range, 
can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL + * @flags: pick from blocks based on memory attributes * * Walks over free (memory && !reserved) areas of memblock. Available as * soon as memblock is initialized. */ -#define for_each_free_mem_range(i, nid, p_start, p_end, p_nid) \ +#define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid) \ for_each_mem_range(i, &memblock.memory, &memblock.reserved, \ - nid, p_start, p_end, p_nid) + nid, flags, p_start, p_end, p_nid) /** * for_each_free_mem_range_reverse - rev-iterate through free memblock areas @@ -196,13 +204,15 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL + * @flags: pick from blocks based on memory attributes * * Walks over free (memory && !reserved) areas of memblock in reverse * order. Available as soon as memblock is initialized. 
*/ -#define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid) \ +#define for_each_free_mem_range_reverse(i, nid, flags, p_start, p_end, \ + p_nid) \ for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ - nid, p_start, p_end, p_nid) + nid, flags, p_start, p_end, p_nid) static inline void memblock_set_region_flags(struct memblock_region *r, unsigned long flags) @@ -273,7 +283,8 @@ static inline bool memblock_bottom_up(void) { return false; } #define MEMBLOCK_ALLOC_ACCESSIBLE 0 phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, - phys_addr_t start, phys_addr_t end); + phys_addr_t start, phys_addr_t end, + ulong flags); phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr); phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align, -- cgit v1.2.3 From a3f5bafcc04aaf62990e0cf3ced1cc6d8dc6fe95 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 24 Jun 2015 16:58:12 -0700 Subject: mm/memblock: allocate boot time data structures from mirrored memory Try to allocate all boot time kernel data structures from mirrored memory. If we run out of mirrored memory print warnings, but fall back to using non-mirrored memory to make sure that we still boot. By number of bytes, most of what we allocate at boot time is the page structures. 64 bytes per 4K page on x86_64 ... or about 1.5% of total system memory. For workloads where the bulk of memory is allocated to applications this may represent a useful improvement to system availability since 1.5% of total memory might be a third of the memory allocated to the kernel. Signed-off-by: Tony Luck Cc: Xishi Qiu Cc: Hanjun Guo Cc: Xiexiuqi Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Yinghai Lu Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 7aeec0cb4c27..0215ffd63069 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -24,6 +24,7 @@ enum { MEMBLOCK_NONE = 0x0, /* No special request */ MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */ + MEMBLOCK_MIRROR = 0x2, /* mirrored region */ }; struct memblock_region { @@ -78,6 +79,8 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size); void memblock_trim_memory(phys_addr_t align); int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); +int memblock_mark_mirror(phys_addr_t base, phys_addr_t size); +ulong choose_memblock_flags(void); /* Low level functions */ int memblock_add_range(struct memblock_type *type, @@ -160,6 +163,11 @@ static inline bool movable_node_is_enabled(void) } #endif +static inline bool memblock_is_mirror(struct memblock_region *m) +{ + return m->flags & MEMBLOCK_MIRROR; +} + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); -- cgit v1.2.3 From b05b9f5f9dcf593a0e9327676b78e6c17b4218e8 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 24 Jun 2015 16:58:15 -0700 Subject: x86, mirror: x86 enabling - find mirrored memory ranges UEFI GetMemoryMap() uses a new attribute bit to mark mirrored memory address ranges. See UEFI 2.5 spec pages 157-158: http://www.uefi.org/sites/default/files/resources/UEFI%202_5.pdf On EFI enabled systems scan the memory map and tell memblock about any mirrored ranges. Signed-off-by: Tony Luck Cc: Xishi Qiu Cc: Hanjun Guo Cc: Xiexiuqi Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Yinghai Lu Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/efi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 2092965afca3..5f19efe4eb3f 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -96,6 +96,8 @@ typedef struct { #define EFI_MEMORY_WP ((u64)0x0000000000001000ULL) /* write-protect */ #define EFI_MEMORY_RP ((u64)0x0000000000002000ULL) /* read-protect */ #define EFI_MEMORY_XP ((u64)0x0000000000004000ULL) /* execute-protect */ +#define EFI_MEMORY_MORE_RELIABLE \ + ((u64)0x0000000000010000ULL) /* higher reliability */ #define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */ #define EFI_MEMORY_DESCRIPTOR_VERSION 1 @@ -868,6 +870,7 @@ extern void efi_enter_virtual_mode (void); /* switch EFI to virtual mode, if pos extern void efi_late_init(void); extern void efi_free_boot_services(void); extern efi_status_t efi_query_variable_store(u32 attributes, unsigned long size); +extern void efi_find_mirror(void); #else static inline void efi_late_init(void) {} static inline void efi_free_boot_services(void) {} -- cgit v1.2.3 From d1dc6f1bcf1e998e7ce65fc120da371ab047a999 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 24 Jun 2015 16:58:18 -0700 Subject: frontswap: allow multiple backends Change frontswap single pointer to a singly linked list of frontswap implementations. Update Xen tmem implementation as register no longer returns anything. Frontswap only keeps track of a single implementation; any implementation that registers second (or later) will replace the previously registered implementation, and gets a pointer to the previous implementation that the new implementation is expected to pass all frontswap functions to if it can't handle the function itself. 
However that method doesn't really make much sense, as passing that work on to every implementation adds unnecessary work to implementations; instead, frontswap should simply keep a list of all registered implementations and try each implementation for any function. Most importantly, neither of the two currently existing frontswap implementations in the kernel actually do anything with any previous frontswap implementation that they replace when registering. This allows frontswap to successfully manage multiple implementations by keeping a list of them all. Signed-off-by: Dan Streetman Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: David Vrabel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 8293262401de..e65ef959546c 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -6,16 +6,16 @@ #include struct frontswap_ops { - void (*init)(unsigned); - int (*store)(unsigned, pgoff_t, struct page *); - int (*load)(unsigned, pgoff_t, struct page *); - void (*invalidate_page)(unsigned, pgoff_t); - void (*invalidate_area)(unsigned); + void (*init)(unsigned); /* this swap type was just swapon'ed */ + int (*store)(unsigned, pgoff_t, struct page *); /* store a page */ + int (*load)(unsigned, pgoff_t, struct page *); /* load a page */ + void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */ + void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */ + struct frontswap_ops *next; /* private pointer to next ops */ }; extern bool frontswap_enabled; -extern struct frontswap_ops * - frontswap_register_ops(struct frontswap_ops *ops); +extern void frontswap_register_ops(struct frontswap_ops *ops); extern void frontswap_shrink(unsigned long); extern unsigned long frontswap_curr_pages(void); extern void frontswap_writethrough(bool); 
-- cgit v1.2.3 From 8a8c35fadfaf55629a37ef1a8ead1b8fb32581d2 Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Wed, 24 Jun 2015 16:58:51 -0700 Subject: mm: kmemleak_alloc_percpu() should follow the gfp from per_alloc() Beginning at commit d52d3997f843 ("ipv6: Create percpu rt6_info"), the following INFO splat is logged: =============================== [ INFO: suspicious RCU usage. ] 4.1.0-rc7-next-20150612 #1 Not tainted ------------------------------- kernel/sched/core.c:7318 Illegal context switch in RCU-bh read-side critical section! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 3 locks held by systemd/1: #0: (rtnl_mutex){+.+.+.}, at: [] rtnetlink_rcv+0x1f/0x40 #1: (rcu_read_lock_bh){......}, at: [] ipv6_add_addr+0x62/0x540 #2: (addrconf_hash_lock){+...+.}, at: [] ipv6_add_addr+0x184/0x540 stack backtrace: CPU: 0 PID: 1 Comm: systemd Not tainted 4.1.0-rc7-next-20150612 #1 Hardware name: TOSHIBA TECRA A50-A/TECRA A50-A, BIOS Version 4.20 04/17/2014 Call Trace: dump_stack+0x4c/0x6e lockdep_rcu_suspicious+0xe7/0x120 ___might_sleep+0x1d5/0x1f0 __might_sleep+0x4d/0x90 kmem_cache_alloc+0x47/0x250 create_object+0x39/0x2e0 kmemleak_alloc_percpu+0x61/0xe0 pcpu_alloc+0x370/0x630 Additional backtrace lines are truncated. In addition, the above splat is followed by several "BUG: sleeping function called from invalid context at mm/slub.c:1268" outputs. As suggested by Martin KaFai Lau, these are the clue to the fix. Routine kmemleak_alloc_percpu() always uses GFP_KERNEL for its allocations, whereas it should follow the gfp from its callers. 
Reviewed-by: Catalin Marinas Reviewed-by: Kamalesh Babulal Acked-by: Martin KaFai Lau Signed-off-by: Larry Finger Cc: Martin KaFai Lau Cc: Catalin Marinas Cc: Tejun Heo Cc: Christoph Lameter Cc: [3.18+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmemleak.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index e705467ddb47..d0a1f99e24e3 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -28,7 +28,8 @@ extern void kmemleak_init(void) __ref; extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) __ref; -extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) __ref; +extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, + gfp_t gfp) __ref; extern void kmemleak_free(const void *ptr) __ref; extern void kmemleak_free_part(const void *ptr, size_t size) __ref; extern void kmemleak_free_percpu(const void __percpu *ptr) __ref; @@ -71,7 +72,8 @@ static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, gfp_t gfp) { } -static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) +static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, + gfp_t gfp) { } static inline void kmemleak_free(const void *ptr) -- cgit v1.2.3 From b94d5230d06eb930be82e67fb1a9a58271e78297 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 19 May 2015 22:54:31 -0400 Subject: libnvdimm, nfit: initial libnvdimm infrastructure and NFIT support A struct nvdimm_bus is the anchor device for registering nvdimm resources and interfaces, for example, a character control device, nvdimm devices, and I/O region devices. The ACPI NFIT (NVDIMM Firmware Interface Table) is one possible platform description for such non-volatile memory resources in a system. 
The nfit.ko driver attaches to the "ACPI0012" device that indicates the presence of the NFIT and parses the table to register a struct nvdimm_bus instance. Cc: Cc: Lv Zheng Cc: Robert Moore Cc: Rafael J. Wysocki Acked-by: Jeff Moyer Acked-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Tested-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 include/linux/libnvdimm.h (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h new file mode 100644 index 000000000000..2b3c63950c91 --- /dev/null +++ b/include/linux/libnvdimm.h @@ -0,0 +1,34 @@ +/* + * libnvdimm - Non-volatile-memory Devices Subsystem + * + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#ifndef __LIBNVDIMM_H__ +#define __LIBNVDIMM_H__ +struct nvdimm; +struct nvdimm_bus_descriptor; +typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, + struct nvdimm *nvdimm, unsigned int cmd, void *buf, + unsigned int buf_len); + +struct nvdimm_bus_descriptor { + unsigned long dsm_mask; + char *provider_name; + ndctl_fn ndctl; +}; + +struct device; +struct nvdimm_bus; +struct nvdimm_bus *nvdimm_bus_register(struct device *parent, + struct nvdimm_bus_descriptor *nfit_desc); +void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); +#endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3 From 45def22c1fab85764646746ce38d45b2f3281fa5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 26 Apr 2015 19:26:48 -0400 Subject: libnvdimm: control character device and nvdimm_bus sysfs attributes The control device for a nvdimm_bus is registered as an "nd" class device. The expectation is that there will usually only be one "nd" bus registered under /sys/class/nd. However, we allow for the possibility of multiple buses and they will be listed in discovery order as ndctl0...ndctlN. This character device hosts the ioctl for passing control messages. The initial command set has a 1:1 correlation with the commands listed by the "NFIT DSM Example" document [1], but this scheme is extensible to future command sets. Note, nd_ioctl() and the backing ->ndctl() implementation are defined in a subsequent patch. This is simply the initial registrations and sysfs attributes. [1]: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf Cc: Neil Brown Cc: Greg KH Cc: Cc: Robert Moore Cc: Rafael J. Wysocki Acked-by: Christoph Hellwig Acked-by: Rafael J.
Wysocki Tested-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 2b3c63950c91..d375cdc4abd5 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -14,6 +14,8 @@ */ #ifndef __LIBNVDIMM_H__ #define __LIBNVDIMM_H__ +extern struct attribute_group nvdimm_bus_attribute_group; + struct nvdimm; struct nvdimm_bus_descriptor; typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, @@ -21,14 +23,16 @@ typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, unsigned int buf_len); struct nvdimm_bus_descriptor { + const struct attribute_group **attr_groups; unsigned long dsm_mask; char *provider_name; ndctl_fn ndctl; }; struct device; -struct nvdimm_bus; struct nvdimm_bus *nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc); void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); +struct nvdimm_bus *to_nvdimm_bus(struct device *dev); +struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); #endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3 From e6dfb2de47768efe8cc37c9a1863d2aff81440fb Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 25 Apr 2015 03:56:17 -0400 Subject: libnvdimm, nfit: dimm/memory-devices Enable nvdimm devices to be registered on a nvdimm_bus. The kernel assigned device id for nvdimm devices is dynamic. If userspace needs a more static identifier it should consult a provider-specific attribute. In the case where NFIT is the provider, the 'nmemX/nfit/handle' or 'nmemX/nfit/serial' attributes may be used for this purpose. Cc: Neil Brown Cc: Cc: Greg KH Cc: Robert Moore Cc: Rafael J. Wysocki Acked-by: Christoph Hellwig Acked-by: Rafael J. 
Wysocki Tested-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index d375cdc4abd5..07787f0dd7de 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -14,6 +14,12 @@ */ #ifndef __LIBNVDIMM_H__ #define __LIBNVDIMM_H__ + +enum { + /* when a dimm supports both PMEM and BLK access a label is required */ + NDD_ALIASING = 1 << 0, +}; + extern struct attribute_group nvdimm_bus_attribute_group; struct nvdimm; @@ -34,5 +40,10 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc); void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); struct nvdimm_bus *to_nvdimm_bus(struct device *dev); +struct nvdimm *to_nvdimm(struct device *dev); struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); +const char *nvdimm_name(struct nvdimm *nvdimm); +void *nvdimm_provider_data(struct nvdimm *nvdimm); +struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, + const struct attribute_group **groups, unsigned long flags); #endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3 From 62232e45f4a265abb43f0acf16e58f5d0b6e1ec9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 8 Jun 2015 14:27:06 -0400 Subject: libnvdimm: control (ioctl) messages for nvdimm_bus and nvdimm devices Most discovery/configuration of the nvdimm-subsystem is done via sysfs attributes. However, some nvdimm_bus instances, particularly the ACPI.NFIT bus, define a small set of messages that can be passed to the platform. For convenience we derive the initial libnvdimm-ioctl command formats directly from the NFIT DSM Interface Example formats. 
ND_CMD_SMART: media health and diagnostics ND_CMD_GET_CONFIG_SIZE: size of the label space ND_CMD_GET_CONFIG_DATA: read label space ND_CMD_SET_CONFIG_DATA: write label space ND_CMD_VENDOR: vendor-specific command passthrough ND_CMD_ARS_CAP: report address-range-scrubbing capabilities ND_CMD_ARS_START: initiate scrubbing ND_CMD_ARS_STATUS: report on scrubbing state ND_CMD_SMART_THRESHOLD: configure alarm thresholds for smart events If a platform later defines different commands than this set it is straightforward to extend support to those formats. Most of the commands target a specific dimm. However, the address-range-scrubbing commands target the bus. The 'commands' attribute in sysfs of an nvdimm_bus, or nvdimm, enumerate the supported commands for that object. Cc: Cc: Robert Moore Cc: Rafael J. Wysocki Reported-by: Nicholas Moulin Acked-by: Christoph Hellwig Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 07787f0dd7de..a39235819af3 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -14,13 +14,22 @@ */ #ifndef __LIBNVDIMM_H__ #define __LIBNVDIMM_H__ +#include +#include enum { /* when a dimm supports both PMEM and BLK access a label is required */ NDD_ALIASING = 1 << 0, + + /* need to set a limit somewhere, but yes, this is likely overkill */ + ND_IOCTL_MAX_BUFLEN = SZ_4M, + ND_CMD_MAX_ELEM = 4, + ND_CMD_MAX_ENVELOPE = 16, + ND_CMD_ARS_STATUS_MAX = SZ_4K, }; extern struct attribute_group nvdimm_bus_attribute_group; +extern struct attribute_group nvdimm_attribute_group; struct nvdimm; struct nvdimm_bus_descriptor; @@ -35,6 +44,14 @@ struct nvdimm_bus_descriptor { ndctl_fn ndctl; }; +struct nd_cmd_desc { + int in_num; + int out_num; + u32 in_sizes[ND_CMD_MAX_ELEM]; + int out_sizes[ND_CMD_MAX_ELEM]; +}; + +struct nvdimm_bus; struct device; struct 
nvdimm_bus *nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc); @@ -45,5 +62,13 @@ struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); const char *nvdimm_name(struct nvdimm *nvdimm); void *nvdimm_provider_data(struct nvdimm *nvdimm); struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, - const struct attribute_group **groups, unsigned long flags); + const struct attribute_group **groups, unsigned long flags, + unsigned long *dsm_mask); +const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd); +const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd); +u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd, + const struct nd_cmd_desc *desc, int idx, void *buf); +u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, + const struct nd_cmd_desc *desc, int idx, const u32 *in_field, + const u32 *out_field); #endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3 From 4d88a97aa9e8cfa6460aab119c5da60ad2267423 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 31 May 2015 14:41:48 -0400 Subject: libnvdimm, nvdimm: dimm driver and base libnvdimm device-driver infrastructure * Implement the device-model infrastructure for loading modules and attaching drivers to nvdimm devices. This is a simple association of a nd-device-type number with a driver that has a bitmask of supported device types. To facilitate userspace bind/unbind operations 'modalias' and 'devtype', that also appear in the uevent, are added as generic sysfs attributes for all nvdimm devices. The reason for the device-type number is to support sub-types within a given parent devtype, be it a vendor-specific sub-type or otherwise. * The first consumer of this infrastructure is the driver for dimm devices. It simply uses control messages to retrieve and store the configuration-data image (label set) from each dimm. Note: nd_device_register() arranges for asynchronous registration of nvdimm bus devices by default. 
Cc: Greg KH Cc: Neil Brown Acked-by: Christoph Hellwig Tested-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 2 ++ include/linux/nd.h | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 include/linux/nd.h (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index a39235819af3..d3ebccf4ea8b 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -30,6 +30,7 @@ enum { extern struct attribute_group nvdimm_bus_attribute_group; extern struct attribute_group nvdimm_attribute_group; +extern struct attribute_group nd_device_attribute_group; struct nvdimm; struct nvdimm_bus_descriptor; @@ -71,4 +72,5 @@ u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd, u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, const struct nd_cmd_desc *desc, int idx, const u32 *in_field, const u32 *out_field); +int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count); #endif /* __LIBNVDIMM_H__ */ diff --git a/include/linux/nd.h b/include/linux/nd.h new file mode 100644 index 000000000000..e074f67e53a3 --- /dev/null +++ b/include/linux/nd.h @@ -0,0 +1,39 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#ifndef __LINUX_ND_H__ +#define __LINUX_ND_H__ +#include +#include + +struct nd_device_driver { + struct device_driver drv; + unsigned long type; + int (*probe)(struct device *dev); + int (*remove)(struct device *dev); +}; + +static inline struct nd_device_driver *to_nd_device_driver( + struct device_driver *drv) +{ + return container_of(drv, struct nd_device_driver, drv); +} + +#define MODULE_ALIAS_ND_DEVICE(type) \ + MODULE_ALIAS("nd:t" __stringify(type) "*") +#define ND_DEVICE_MODALIAS_FMT "nd:t%d" + +int __must_check __nd_driver_register(struct nd_device_driver *nd_drv, + struct module *module, const char *mod_name); +#define nd_driver_register(driver) \ + __nd_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) +#endif /* __LINUX_ND_H__ */ -- cgit v1.2.3 From 1f7df6f88b9245a7f2d0f8ecbc97dc88c8d0d8e1 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 9 Jun 2015 20:13:14 -0400 Subject: libnvdimm, nfit: regions (block-data-window, persistent memory, volatile memory) A "region" device represents the maximum capacity of a BLK range (mmio block-data-window(s)), or a PMEM range (DAX-capable persistent memory or volatile memory), without regard for aliasing. Aliasing, in the dimm-local address space (DPA), is resolved by metadata on a dimm to designate which exclusive interface will access the aliased DPA ranges. Support for the per-dimm metadata/label arrives in a subsequent patch. The name format of "region" devices is "regionN" where, like dimms, N is a global ida index assigned at discovery time. This id is not reliable across reboots nor in the presence of hotplug. Look to attributes of the region or static id-data of the sub-namespace to generate a persistent name. However, if the platform configuration does not change it is reasonable to expect the same region id to be assigned at the next boot. 
"region"s have 2 generic attributes "size", and "mapping"s where: - size: the BLK accessible capacity or the span of the system physical address range in the case of PMEM. - mappingN: a tuple describing a dimm's contribution to the region's capacity in the format (nvdimm-name,dpa,size). For a PMEM-region there will be at least one mapping per dimm in the interleave set. For a BLK-region there is only "mapping0" listing the starting DPA of the BLK-region and the available DPA capacity of that space (matches "size" above). The max number of mappings per "region" is hard coded per the constraints of sysfs attribute groups. That said the number of mappings per region should never exceed the maximum number of possible dimms in the system. If the current number turns out to not be enough then the "mappings" attribute clarifies how many there are supposed to be. "32 should be enough for anybody...". Cc: Neil Brown Cc: Cc: Greg KH Cc: Robert Moore Cc: Rafael J. Wysocki Acked-by: Christoph Hellwig Acked-by: Rafael J. 
Wysocki Tested-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index d3ebccf4ea8b..39e7e606092a 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -26,11 +26,14 @@ enum { ND_CMD_MAX_ELEM = 4, ND_CMD_MAX_ENVELOPE = 16, ND_CMD_ARS_STATUS_MAX = SZ_4K, + ND_MAX_MAPPINGS = 32, }; extern struct attribute_group nvdimm_bus_attribute_group; extern struct attribute_group nvdimm_attribute_group; extern struct attribute_group nd_device_attribute_group; +extern struct attribute_group nd_region_attribute_group; +extern struct attribute_group nd_mapping_attribute_group; struct nvdimm; struct nvdimm_bus_descriptor; @@ -38,6 +41,12 @@ typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf, unsigned int buf_len); +struct nd_mapping { + struct nvdimm *nvdimm; + u64 start; + u64 size; +}; + struct nvdimm_bus_descriptor { const struct attribute_group **attr_groups; unsigned long dsm_mask; @@ -52,6 +61,14 @@ struct nd_cmd_desc { int out_sizes[ND_CMD_MAX_ELEM]; }; +struct nd_region_desc { + struct resource *res; + struct nd_mapping *nd_mapping; + u16 num_mappings; + const struct attribute_group **attr_groups; + void *provider_data; +}; + struct nvdimm_bus; struct device; struct nvdimm_bus *nvdimm_bus_register(struct device *parent, @@ -59,9 +76,11 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent, void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); struct nvdimm_bus *to_nvdimm_bus(struct device *dev); struct nvdimm *to_nvdimm(struct device *dev); +struct nd_region *to_nd_region(struct device *dev); struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); const char *nvdimm_name(struct nvdimm *nvdimm); void *nvdimm_provider_data(struct nvdimm *nvdimm); +void *nd_region_provider_data(struct 
nd_region *nd_region); struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, const struct attribute_group **groups, unsigned long flags, unsigned long *dsm_mask); @@ -73,4 +92,10 @@ u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, const struct nd_cmd_desc *desc, int idx, const u32 *in_field, const u32 *out_field); int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count); +struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc); +struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc); +struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc); #endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3 From 3d88002e4a7bd40f355550284c6cd140e6fe29dc Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 31 May 2015 15:02:11 -0400 Subject: libnvdimm: support for legacy (non-aliasing) nvdimms The libnvdimm region driver is an intermediary driver that translates non-volatile "region"s into "namespace" sub-devices that are surfaced by persistent memory block-device drivers (PMEM and BLK). ACPI 6 introduces the concept that a given nvdimm may simultaneously offer multiple access modes to its media through direct PMEM load/store access, or windowed BLK mode. Existing nvdimms mostly implement a PMEM interface, some offer a BLK-like mode, but never both as ACPI 6 defines. If an nvdimm is single interfaced, then there is no need for dimm metadata labels. For these devices we can take the region boundaries directly to create a child namespace device (nd_namespace_io). 
Acked-by: Christoph Hellwig Tested-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 7 +++++-- include/linux/nd.h | 10 ++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 39e7e606092a..37f966aff386 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -71,8 +71,11 @@ struct nd_region_desc { struct nvdimm_bus; struct device; -struct nvdimm_bus *nvdimm_bus_register(struct device *parent, - struct nvdimm_bus_descriptor *nfit_desc); +struct module; +struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, + struct nvdimm_bus_descriptor *nfit_desc, struct module *module); +#define nvdimm_bus_register(parent, desc) \ + __nvdimm_bus_register(parent, desc, THIS_MODULE) void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); struct nvdimm_bus *to_nvdimm_bus(struct device *dev); struct nvdimm *to_nvdimm(struct device *dev); diff --git a/include/linux/nd.h b/include/linux/nd.h index e074f67e53a3..da70e9962197 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -26,6 +26,16 @@ static inline struct nd_device_driver *to_nd_device_driver( struct device_driver *drv) { return container_of(drv, struct nd_device_driver, drv); +}; + +struct nd_namespace_io { + struct device dev; + struct resource res; +}; + +static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev) +{ + return container_of(dev, struct nd_namespace_io, dev); } #define MODULE_ALIAS_ND_DEVICE(type) \ -- cgit v1.2.3 From eaf961536e1622ad21247ac8d44acd48ba65566e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 1 May 2015 13:11:27 -0400 Subject: libnvdimm, nfit: add interleave-set state-tracking infrastructure On platforms that have firmware support for reading/writing per-dimm label space, a portion of the dimm may be accessible via an interleave set PMEM mapping in addition to the dimm's BLK (block-data-window aperture(s)) 
interface. A label, stored in a "configuration data region" on the dimm, disambiguates which dimm addresses are accessed through which exclusive interface. Add infrastructure that allows the kernel to block modifications to a label in the set while any member dimm is active. Note that this is meant only for enforcing "no modifications of active labels" via the coarse ioctl command. Adding/deleting namespaces from an active interleave set is always possible via sysfs. Another aspect of tracking interleave sets is tracking their integrity when DIMMs in a set are physically re-ordered. For this purpose we generate an "interleave-set cookie" that can be recorded in a label and validated against the current configuration. It is the bus provider implementation's responsibility to calculate the interleave set cookie and attach it to a given region. Cc: Neil Brown Cc: Cc: Greg KH Cc: Robert Moore Cc: Rafael J. Wysocki Acked-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 37f966aff386..1b627b109360 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -61,11 +61,16 @@ struct nd_cmd_desc { int out_sizes[ND_CMD_MAX_ELEM]; }; +struct nd_interleave_set { + u64 cookie; +}; + struct nd_region_desc { struct resource *res; struct nd_mapping *nd_mapping; u16 num_mappings; const struct attribute_group **attr_groups; + struct nd_interleave_set *nd_set; void *provider_data; }; @@ -101,4 +106,5 @@ struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc); struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc); +u64 nd_fletcher64(void *addr, size_t len, bool le); #endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3 From bf9bccc14c05dae8caba29df6187c731710f5380 Mon Sep 17 
00:00:00 2001 From: Dan Williams Date: Wed, 17 Jun 2015 17:14:46 -0400 Subject: libnvdimm: pmem label sets and namespace instantiation. A complete label set is a PMEM-label per-dimm per-interleave-set where all the UUIDs match and the interleave set cookie matches the hosting interleave set. Present sysfs attributes for manipulation of a PMEM-namespace's 'alt_name', 'uuid', and 'size' attributes. A later patch will make these settings persistent by writing back the label. Note that PMEM allocations grow forwards from the start of an interleave set (lowest dimm-physical-address (DPA)). BLK-namespaces that alias with a PMEM interleave set will grow allocations backward from the highest DPA. Cc: Greg KH Cc: Neil Brown Acked-by: Christoph Hellwig Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 10 ++++++++++ include/linux/nd.h | 24 ++++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 1b627b109360..c130972e08c4 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -41,10 +41,20 @@ typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf, unsigned int buf_len); +struct nd_namespace_label; +struct nvdimm_drvdata; struct nd_mapping { struct nvdimm *nvdimm; + struct nd_namespace_label **labels; u64 start; u64 size; + /* + * @ndd is for private use at region enable / disable time for + * get_ndd() + put_ndd(), all other nd_mapping to ndd + * conversions use to_ndd() which respects enabled state of the + * nvdimm. 
+ */ + struct nvdimm_drvdata *ndd; }; struct nvdimm_bus_descriptor { diff --git a/include/linux/nd.h b/include/linux/nd.h index da70e9962197..255c38a83083 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -28,16 +28,40 @@ static inline struct nd_device_driver *to_nd_device_driver( return container_of(drv, struct nd_device_driver, drv); }; +/** + * struct nd_namespace_io - infrastructure for loading an nd_pmem instance + * @dev: namespace device created by the nd region driver + * @res: struct resource conversion of a NFIT SPA table + */ struct nd_namespace_io { struct device dev; struct resource res; }; +/** + * struct nd_namespace_pmem - namespace device for dimm-backed interleaved memory + * @nsio: device and system physical address range to drive + * @alt_name: namespace name supplied in the dimm label + * @uuid: namespace name supplied in the dimm label + */ +struct nd_namespace_pmem { + struct nd_namespace_io nsio; + char *alt_name; + u8 *uuid; +}; + static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev) { return container_of(dev, struct nd_namespace_io, dev); } +static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev) +{ + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + + return container_of(nsio, struct nd_namespace_pmem, nsio); +} + #define MODULE_ALIAS_ND_DEVICE(type) \ MODULE_ALIAS("nd:t" __stringify(type) "*") #define ND_DEVICE_MODALIAS_FMT "nd:t%d" -- cgit v1.2.3 From 1b40e09a1232de537b193fa1b6b3ef16d3a1e397 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 1 May 2015 13:34:01 -0400 Subject: libnvdimm: blk labels and namespace instantiation A blk label set describes a namespace comprised of one or more discontiguous dpa ranges on a single dimm. They may alias with one or more pmem interleave sets that include the given dimm. This is the runtime/volatile configuration infrastructure for sysfs manipulation of 'alt_name', 'uuid', 'size', and 'sector_size'. 
A later patch will make these settings persistent by writing back the label(s). Unlike pmem namespaces, multiple blk namespaces can be created per region. Once a blk namespace has been created a new seed device (unconfigured child of a parent blk region) is instantiated. As long as a region has 'available_size' != 0 new child namespaces may be created. Cc: Greg KH Cc: Neil Brown Acked-by: Christoph Hellwig Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 3 +++ include/linux/nd.h | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index c130972e08c4..a59dca17b3aa 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -27,6 +27,9 @@ enum { ND_CMD_MAX_ENVELOPE = 16, ND_CMD_ARS_STATUS_MAX = SZ_4K, ND_MAX_MAPPINGS = 32, + + /* mark newly adjusted resources as requiring a label update */ + DPA_RESOURCE_ADJUSTED = 1 << 0, }; extern struct attribute_group nvdimm_bus_attribute_group; diff --git a/include/linux/nd.h b/include/linux/nd.h index 255c38a83083..23276ea91690 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -50,6 +50,26 @@ struct nd_namespace_pmem { u8 *uuid; }; +/** + * struct nd_namespace_blk - namespace for dimm-bounded persistent memory + * @dev: namespace device creation by the nd region driver + * @alt_name: namespace name supplied in the dimm label + * @uuid: namespace name supplied in the dimm label + * @id: ida allocated id + * @lbasize: blk namespaces have a native sector size when btt not present + * @num_resources: number of dpa extents to claim + * @res: discontiguous dpa extents for given dimm + */ +struct nd_namespace_blk { + struct device dev; + char *alt_name; + u8 *uuid; + int id; + unsigned long lbasize; + int num_resources; + struct resource **res; +}; + static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev) { return container_of(dev, struct nd_namespace_io, dev); @@ -62,6 
+82,11 @@ static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev) return container_of(nsio, struct nd_namespace_pmem, nsio); } +static inline struct nd_namespace_blk *to_nd_namespace_blk(struct device *dev) +{ + return container_of(dev, struct nd_namespace_blk, dev); +} + #define MODULE_ALIAS_ND_DEVICE(type) \ MODULE_ALIAS("nd:t" __stringify(type) "*") #define ND_DEVICE_MODALIAS_FMT "nd:t%d" -- cgit v1.2.3 From d7f96f97c4031fa4ffdb7801f9aae23e96170a6f Mon Sep 17 00:00:00 2001 From: Ivan Khoronzhuk Date: Thu, 25 Jun 2015 09:06:56 +0200 Subject: firmware: dmi_scan: add SMBIOS entry and DMI tables Some utils, like dmidecode and smbios, need to access SMBIOS entry table area in order to get information like SMBIOS version, size, etc. Currently it's done via /dev/mem. But for situations where /dev/mem usage is disabled, the utils have to use dmi sysfs instead, which doesn't represent SMBIOS entry and adds code/delay redundancy when direct access for table is needed. So this patch creates dmi/tables and adds SMBIOS entry point to allow utils in question to work correctly without /dev/mem. Also patch adds raw dmi table to simplify dmi table processing in user space, as proposed by Jean Delvare. 
Tested-by: Roy Franz Signed-off-by: Ivan Khoronzhuk Signed-off-by: Jean Delvare --- include/linux/dmi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmi.h b/include/linux/dmi.h index f820f0a336c9..2f9f98827c0a 100644 --- a/include/linux/dmi.h +++ b/include/linux/dmi.h @@ -2,6 +2,7 @@ #define __DMI_H__ #include +#include #include /* enum dmi_field is in mod_devicetable.h */ @@ -93,6 +94,7 @@ struct dmi_dev_onboard { int devfn; }; +extern struct kobject *dmi_kobj; extern int dmi_check_system(const struct dmi_system_id *list); const struct dmi_system_id *dmi_first_match(const struct dmi_system_id *list); extern const char * dmi_get_system_info(int field); -- cgit v1.2.3 From 9ea650c804404e55dbf17c928203141282e759ab Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Thu, 25 Jun 2015 09:06:57 +0200 Subject: firmware: dmi: struct dmi_header should be packed Apparently the compiler does fine without it, but it feels safer and clearer to add the missing attribute. Signed-off-by: Jean Delvare --- include/linux/dmi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmi.h b/include/linux/dmi.h index 2f9f98827c0a..5055ac34142d 100644 --- a/include/linux/dmi.h +++ b/include/linux/dmi.h @@ -75,7 +75,7 @@ struct dmi_header { u8 type; u8 length; u16 handle; -}; +} __packed; struct dmi_device { struct list_head list; -- cgit v1.2.3 From 8c2f7e8658df1d3b7cbfa62706941d14c715823a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 25 Jun 2015 04:20:04 -0400 Subject: libnvdimm: infrastructure for btt devices NVDIMM namespaces, in addition to accepting "struct bio" based requests, also have the capability to perform byte-aligned accesses. By default only the bio/block interface is used. However, if another driver can make effective use of the byte-aligned capability it can claim namespace interface and use the byte-aligned ->rw_bytes() interface. 
The BTT driver is the first consumer of this mechanism to allow adding atomic sector update semantics to a pmem or blk namespace. This patch is the sysfs infrastructure to allow configuring a BTT instance for a namespace. Enabling that BTT and performing i/o is in a subsequent patch. Cc: Greg KH Cc: Neil Brown Signed-off-by: Dan Williams --- include/linux/nd.h | 63 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nd.h b/include/linux/nd.h index 23276ea91690..507e47c86737 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -12,6 +12,7 @@ */ #ifndef __LINUX_ND_H__ #define __LINUX_ND_H__ +#include #include #include @@ -28,13 +29,33 @@ static inline struct nd_device_driver *to_nd_device_driver( return container_of(drv, struct nd_device_driver, drv); }; +/** + * struct nd_namespace_common - core infrastructure of a namespace + * @force_raw: ignore other personalities for the namespace (e.g. 
btt) + * @dev: device model node + * @claim: when set, another personality has taken ownership of the namespace + * @rw_bytes: access the raw namespace capacity with byte-aligned transfers + */ +struct nd_namespace_common { + int force_raw; + struct device dev; + struct device *claim; + int (*rw_bytes)(struct nd_namespace_common *, resource_size_t offset, + void *buf, size_t size, int rw); +}; + +static inline struct nd_namespace_common *to_ndns(struct device *dev) +{ + return container_of(dev, struct nd_namespace_common, dev); +} + /** * struct nd_namespace_io - infrastructure for loading an nd_pmem instance * @dev: namespace device created by the nd region driver * @res: struct resource conversion of a NFIT SPA table */ struct nd_namespace_io { - struct device dev; + struct nd_namespace_common common; struct resource res; }; @@ -52,7 +73,6 @@ struct nd_namespace_pmem { /** * struct nd_namespace_blk - namespace for dimm-bounded persistent memory - * @dev: namespace device creation by the nd region driver * @alt_name: namespace name supplied in the dimm label * @uuid: namespace name supplied in the dimm label * @id: ida allocated id @@ -61,7 +81,7 @@ struct nd_namespace_pmem { * @res: discontiguous dpa extents for given dimm */ struct nd_namespace_blk { - struct device dev; + struct nd_namespace_common common; char *alt_name; u8 *uuid; int id; @@ -72,7 +92,7 @@ struct nd_namespace_blk { static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev) { - return container_of(dev, struct nd_namespace_io, dev); + return container_of(dev, struct nd_namespace_io, common.dev); } static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev) @@ -84,7 +104,40 @@ static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev) static inline struct nd_namespace_blk *to_nd_namespace_blk(struct device *dev) { - return container_of(dev, struct nd_namespace_blk, dev); + return container_of(dev, struct nd_namespace_blk, common.dev); 
+} + +/** + * nvdimm_read_bytes() - synchronously read bytes from an nvdimm namespace + * @ndns: device to read + * @offset: namespace-relative starting offset + * @buf: buffer to fill + * @size: transfer length + * + * @buf is up-to-date upon return from this routine. + */ +static inline int nvdimm_read_bytes(struct nd_namespace_common *ndns, + resource_size_t offset, void *buf, size_t size) +{ + return ndns->rw_bytes(ndns, offset, buf, size, READ); +} + +/** + * nvdimm_write_bytes() - synchronously write bytes to an nvdimm namespace + * @ndns: device to write + * @offset: namespace-relative starting offset + * @buf: buffer to drain + * @size: transfer length + * + * NVDIMM Namespace disks do not implement sectors internally. Depending on + * the @ndns, the contents of @buf may be in cpu cache, platform buffers, + * or on backing memory media upon return from this routine. Flushing + * to media is handled internal to the @ndns driver, if at all. + */ +static inline int nvdimm_write_bytes(struct nd_namespace_common *ndns, + resource_size_t offset, void *buf, size_t size) +{ + return ndns->rw_bytes(ndns, offset, buf, size, WRITE); } #define MODULE_ALIAS_ND_DEVICE(type) \ MODULE_ALIAS("nd:t" __stringify(type) "*") #define ND_DEVICE_MODALIAS_FMT "nd:t%d" -- cgit v1.2.3 From 479305fd7172503772575997eb6f1b0a2bb4a107 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Thu, 25 Jun 2015 15:00:40 -0700 Subject: zpool: remove zpool_evict() Remove zpool_evict() helper function. As zbud is currently the only zpool implementation that supports eviction, add zpool and zpool_ops references to struct zbud_pool and directly call zpool_ops->evict(zpool, handle) on eviction. Currently zpool provides the zpool_evict helper which locks the zpool list lock and searches through all pools to find the specific one matching the caller, and calls the corresponding zpool_ops->evict function. 
However, this is unnecessary, as the zbud pool can simply keep a reference to the zpool that created it, as well as the zpool_ops, and directly call the zpool_ops->evict function, when it needs to evict a page. This avoids a spinlock and list search in zpool for each eviction. Signed-off-by: Dan Streetman Cc: Seth Jennings Cc: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zpool.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 56529b34dc63..d30eff3d84d5 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -81,7 +81,8 @@ struct zpool_driver { atomic_t refcount; struct list_head list; - void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops); + void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops, + struct zpool *zpool); void (*destroy)(void *pool); int (*malloc)(void *pool, size_t size, gfp_t gfp, @@ -102,6 +103,4 @@ void zpool_register_driver(struct zpool_driver *driver); int zpool_unregister_driver(struct zpool_driver *driver); -int zpool_evict(void *pool, unsigned long handle); - #endif -- cgit v1.2.3 From f6d133f877c8bb0a0934dc8c521c758ee771e901 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 25 Jun 2015 15:01:00 -0700 Subject: compiler-gcc.h: neatening - Move the inline and noinline blocks together - Comment neatening - Alignment of __attribute__ uses - Consistent naming of __must_be_array macro argument - Multiline macro neatening Signed-off-by: Joe Perches Cc: Andi Kleen Cc: Michal Marek Cc: Segher Boessenkool Cc: Sasha Levin Cc: Anton Blanchard Cc: Alan Modra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 85 +++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 
371e560d13cf..5c2c14e3c647 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -5,9 +5,9 @@ /* * Common definitions for all gcc versions go here. */ -#define GCC_VERSION (__GNUC__ * 10000 \ - + __GNUC_MINOR__ * 100 \ - + __GNUC_PATCHLEVEL__) +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) /* Optimization barrier */ @@ -46,55 +46,63 @@ * the inline assembly constraint from =g to =r, in this particular * case either is valid. */ -#define RELOC_HIDE(ptr, off) \ - ({ unsigned long __ptr; \ - __asm__ ("" : "=r"(__ptr) : "0"(ptr)); \ - (typeof(ptr)) (__ptr + (off)); }) +#define RELOC_HIDE(ptr, off) \ +({ \ + unsigned long __ptr; \ + __asm__ ("" : "=r"(__ptr) : "0"(ptr)); \ + (typeof(ptr)) (__ptr + (off)); \ +}) /* Make the optimizer believe the variable can be manipulated arbitrarily. */ -#define OPTIMIZER_HIDE_VAR(var) __asm__ ("" : "=r" (var) : "0" (var)) +#define OPTIMIZER_HIDE_VAR(var) \ + __asm__ ("" : "=r" (var) : "0" (var)) #ifdef __CHECKER__ -#define __must_be_array(arr) 0 +#define __must_be_array(a) 0 #else /* &a[0] degrades to a pointer: a different type from an array */ -#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) +#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) #endif /* * Force always-inline if the user requests it so via the .config, * or if gcc is too old: */ -#if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ +#if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ !defined(CONFIG_OPTIMIZE_INLINING) || (__GNUC__ < 4) -# define inline inline __attribute__((always_inline)) notrace -# define __inline__ __inline__ __attribute__((always_inline)) notrace -# define __inline __inline __attribute__((always_inline)) notrace +#define inline inline __attribute__((always_inline)) notrace +#define __inline__ __inline__ __attribute__((always_inline)) notrace +#define __inline __inline __attribute__((always_inline)) notrace #else /* A 
lot of inline functions can cause havoc with function tracing */ -# define inline inline notrace -# define __inline__ __inline__ notrace -# define __inline __inline notrace +#define inline inline notrace +#define __inline__ __inline__ notrace +#define __inline __inline notrace #endif -#define __deprecated __attribute__((deprecated)) -#define __packed __attribute__((packed)) -#define __weak __attribute__((weak)) -#define __alias(symbol) __attribute__((alias(#symbol))) +#define __always_inline inline __attribute__((always_inline)) +#define noinline __attribute__((noinline)) + +#define __deprecated __attribute__((deprecated)) +#define __packed __attribute__((packed)) +#define __weak __attribute__((weak)) +#define __alias(symbol) __attribute__((alias(#symbol))) /* - * it doesn't make sense on ARM (currently the only user of __naked) to trace - * naked functions because then mcount is called without stack and frame pointer - * being set up and there is no chance to restore the lr register to the value - * before mcount was called. + * it doesn't make sense on ARM (currently the only user of __naked) + * to trace naked functions because then mcount is called without + * stack and frame pointer being set up and there is no chance to + * restore the lr register to the value before mcount was called. + * + * The asm() bodies of naked functions often depend on standard calling + * conventions, therefore they must be noinline and noclone. * - * The asm() bodies of naked functions often depend on standard calling conventions, - * therefore they must be noinline and noclone. GCC 4.[56] currently fail to enforce - * this, so we must do so ourselves. See GCC PR44290. + * GCC 4.[56] currently fail to enforce this, so we must do so ourselves. + * See GCC PR44290. 
*/ -#define __naked __attribute__((naked)) noinline __noclone notrace +#define __naked __attribute__((naked)) noinline __noclone notrace -#define __noreturn __attribute__((noreturn)) +#define __noreturn __attribute__((noreturn)) /* * From the GCC manual: @@ -106,14 +114,13 @@ * would be. * [...] */ -#define __pure __attribute__((pure)) -#define __aligned(x) __attribute__((aligned(x))) -#define __printf(a, b) __attribute__((format(printf, a, b))) -#define __scanf(a, b) __attribute__((format(scanf, a, b))) -#define noinline __attribute__((noinline)) -#define __attribute_const__ __attribute__((__const__)) -#define __maybe_unused __attribute__((unused)) -#define __always_unused __attribute__((unused)) +#define __pure __attribute__((pure)) +#define __aligned(x) __attribute__((aligned(x))) +#define __printf(a, b) __attribute__((format(printf, a, b))) +#define __scanf(a, b) __attribute__((format(scanf, a, b))) +#define __attribute_const__ __attribute__((__const__)) +#define __maybe_unused __attribute__((unused)) +#define __always_unused __attribute__((unused)) #define __gcc_header(x) #x #define _gcc_header(x) __gcc_header(linux/compiler-gcc##x.h) @@ -129,5 +136,3 @@ * code */ #define uninitialized_var(x) x = x - -#define __always_inline inline __attribute__((always_inline)) -- cgit v1.2.3 From cb984d101b30eb7478d32df56a0023e4603cba7f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 25 Jun 2015 15:01:02 -0700 Subject: compiler-gcc: integrate the various compiler-gcc[345].h files As gcc major version numbers are going to advance rather rapidly in the future, there's no real value in separate files for each compiler version. Deduplicate some of the macros #defined in each file too. Neaten comments using normal kernel commenting style. 
Signed-off-by: Joe Perches Cc: Andi Kleen Cc: Michal Marek Cc: Segher Boessenkool Cc: Sasha Levin Cc: Anton Blanchard Cc: Alan Modra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 120 ++++++++++++++++++++++++++++++++++++++++-- include/linux/compiler-gcc3.h | 23 -------- include/linux/compiler-gcc4.h | 91 -------------------------------- include/linux/compiler-gcc5.h | 67 ----------------------- 4 files changed, 116 insertions(+), 185 deletions(-) delete mode 100644 include/linux/compiler-gcc3.h delete mode 100644 include/linux/compiler-gcc4.h delete mode 100644 include/linux/compiler-gcc5.h (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 5c2c14e3c647..dfaa7b3e9ae9 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -122,10 +122,122 @@ #define __maybe_unused __attribute__((unused)) #define __always_unused __attribute__((unused)) -#define __gcc_header(x) #x -#define _gcc_header(x) __gcc_header(linux/compiler-gcc##x.h) -#define gcc_header(x) _gcc_header(x) -#include gcc_header(__GNUC__) +/* gcc version specific checks */ + +#if GCC_VERSION < 30200 +# error Sorry, your compiler is too old - please upgrade it. 
+#endif + +#if GCC_VERSION < 30300 +# define __used __attribute__((__unused__)) +#else +# define __used __attribute__((__used__)) +#endif + +#ifdef CONFIG_GCOV_KERNEL +# if GCC_VERSION < 30400 +# error "GCOV profiling support for gcc versions below 3.4 not included" +# endif /* __GNUC_MINOR__ */ +#endif /* CONFIG_GCOV_KERNEL */ + +#if GCC_VERSION >= 30400 +#define __must_check __attribute__((warn_unused_result)) +#endif + +#if GCC_VERSION >= 40000 + +/* GCC 4.1.[01] miscompiles __weak */ +#ifdef __KERNEL__ +# if GCC_VERSION >= 40100 && GCC_VERSION <= 40101 +# error Your version of gcc miscompiles the __weak directive +# endif +#endif + +#define __used __attribute__((__used__)) +#define __compiler_offsetof(a, b) \ + __builtin_offsetof(a, b) + +#if GCC_VERSION >= 40100 && GCC_VERSION < 40600 +# define __compiletime_object_size(obj) __builtin_object_size(obj, 0) +#endif + +#if GCC_VERSION >= 40300 +/* Mark functions as cold. gcc will assume any path leading to a call + * to them will be unlikely. This means a lot of manual unlikely()s + * are unnecessary now for any paths leading to the usual suspects + * like BUG(), printk(), panic() etc. [but let's keep them for now for + * older compilers] + * + * Early snapshots of gcc 4.3 don't support this and we can't detect this + * in the preprocessor, but we can live with this because they're unreleased. + * Maketime probing would be overkill here. 
+ * + * gcc also has a __attribute__((__hot__)) to move hot functions into + * a special section, but I don't see any sense in this right now in + * the kernel context + */ +#define __cold __attribute__((__cold__)) + +#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) + +#ifndef __CHECKER__ +# define __compiletime_warning(message) __attribute__((warning(message))) +# define __compiletime_error(message) __attribute__((error(message))) +#endif /* __CHECKER__ */ +#endif /* GCC_VERSION >= 40300 */ + +#if GCC_VERSION >= 40500 +/* + * Mark a position in code as unreachable. This can be used to + * suppress control flow warnings after asm blocks that transfer + * control elsewhere. + * + * Early snapshots of gcc 4.5 don't support this and we can't detect + * this in the preprocessor, but we can live with this because they're + * unreleased. Really, we need to have autoconf for the kernel. + */ +#define unreachable() __builtin_unreachable() + +/* Mark a function definition as prohibited from being cloned. */ +#define __noclone __attribute__((__noclone__)) + +#endif /* GCC_VERSION >= 40500 */ + +#if GCC_VERSION >= 40600 +/* + * Tell the optimizer that something else uses this function or variable. + */ +#define __visible __attribute__((externally_visible)) +#endif + +/* + * GCC 'asm goto' miscompiles certain code sequences: + * + * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 + * + * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. + * + * (asm goto is automatically volatile - the naming reflects this.) + */ +#define asm_volatile_goto(x...) 
do { asm goto(x); asm (""); } while (0) + +#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP +#if GCC_VERSION >= 40400 +#define __HAVE_BUILTIN_BSWAP32__ +#define __HAVE_BUILTIN_BSWAP64__ +#endif +#if GCC_VERSION >= 40800 || (defined(__powerpc__) && GCC_VERSION >= 40600) +#define __HAVE_BUILTIN_BSWAP16__ +#endif +#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ + +#if GCC_VERSION >= 50000 +#define KASAN_ABI_VERSION 4 +#elif GCC_VERSION >= 40902 +#define KASAN_ABI_VERSION 3 +#endif + +#endif /* gcc version >= 40000 specific checks */ #if !defined(__noclone) #define __noclone /* not needed */ diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h deleted file mode 100644 index 7d89febe4d79..000000000000 --- a/include/linux/compiler-gcc3.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef __LINUX_COMPILER_H -#error "Please don't include directly, include instead." -#endif - -#if GCC_VERSION < 30200 -# error Sorry, your compiler is too old - please upgrade it. -#endif - -#if GCC_VERSION >= 30300 -# define __used __attribute__((__used__)) -#else -# define __used __attribute__((__unused__)) -#endif - -#if GCC_VERSION >= 30400 -#define __must_check __attribute__((warn_unused_result)) -#endif - -#ifdef CONFIG_GCOV_KERNEL -# if GCC_VERSION < 30400 -# error "GCOV profiling support for gcc versions below 3.4 not included" -# endif /* __GNUC_MINOR__ */ -#endif /* CONFIG_GCOV_KERNEL */ diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h deleted file mode 100644 index 769e19864632..000000000000 --- a/include/linux/compiler-gcc4.h +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef __LINUX_COMPILER_H -#error "Please don't include directly, include instead." 
-#endif - -/* GCC 4.1.[01] miscompiles __weak */ -#ifdef __KERNEL__ -# if GCC_VERSION >= 40100 && GCC_VERSION <= 40101 -# error Your version of gcc miscompiles the __weak directive -# endif -#endif - -#define __used __attribute__((__used__)) -#define __must_check __attribute__((warn_unused_result)) -#define __compiler_offsetof(a,b) __builtin_offsetof(a,b) - -#if GCC_VERSION >= 40100 && GCC_VERSION < 40600 -# define __compiletime_object_size(obj) __builtin_object_size(obj, 0) -#endif - -#if GCC_VERSION >= 40300 -/* Mark functions as cold. gcc will assume any path leading to a call - to them will be unlikely. This means a lot of manual unlikely()s - are unnecessary now for any paths leading to the usual suspects - like BUG(), printk(), panic() etc. [but let's keep them for now for - older compilers] - - Early snapshots of gcc 4.3 don't support this and we can't detect this - in the preprocessor, but we can live with this because they're unreleased. - Maketime probing would be overkill here. - - gcc also has a __attribute__((__hot__)) to move hot functions into - a special section, but I don't see any sense in this right now in - the kernel context */ -#define __cold __attribute__((__cold__)) - -#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) - -#ifndef __CHECKER__ -# define __compiletime_warning(message) __attribute__((warning(message))) -# define __compiletime_error(message) __attribute__((error(message))) -#endif /* __CHECKER__ */ -#endif /* GCC_VERSION >= 40300 */ - -#if GCC_VERSION >= 40500 -/* - * Mark a position in code as unreachable. This can be used to - * suppress control flow warnings after asm blocks that transfer - * control elsewhere. - * - * Early snapshots of gcc 4.5 don't support this and we can't detect - * this in the preprocessor, but we can live with this because they're - * unreleased. Really, we need to have autoconf for the kernel. 
- */ -#define unreachable() __builtin_unreachable() - -/* Mark a function definition as prohibited from being cloned. */ -#define __noclone __attribute__((__noclone__)) - -#endif /* GCC_VERSION >= 40500 */ - -#if GCC_VERSION >= 40600 -/* - * Tell the optimizer that something else uses this function or variable. - */ -#define __visible __attribute__((externally_visible)) -#endif - -/* - * GCC 'asm goto' miscompiles certain code sequences: - * - * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 - * - * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. - * - * (asm goto is automatically volatile - the naming reflects this.) - */ -#define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0) - -#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP -#if GCC_VERSION >= 40400 -#define __HAVE_BUILTIN_BSWAP32__ -#define __HAVE_BUILTIN_BSWAP64__ -#endif -#if GCC_VERSION >= 40800 || (defined(__powerpc__) && GCC_VERSION >= 40600) -#define __HAVE_BUILTIN_BSWAP16__ -#endif -#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ - -#if GCC_VERSION >= 40902 -#define KASAN_ABI_VERSION 3 -#endif diff --git a/include/linux/compiler-gcc5.h b/include/linux/compiler-gcc5.h deleted file mode 100644 index efee493714eb..000000000000 --- a/include/linux/compiler-gcc5.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef __LINUX_COMPILER_H -#error "Please don't include directly, include instead." -#endif - -#define __used __attribute__((__used__)) -#define __must_check __attribute__((warn_unused_result)) -#define __compiler_offsetof(a, b) __builtin_offsetof(a, b) - -/* Mark functions as cold. gcc will assume any path leading to a call - to them will be unlikely. This means a lot of manual unlikely()s - are unnecessary now for any paths leading to the usual suspects - like BUG(), printk(), panic() etc. 
[but let's keep them for now for - older compilers] - - Early snapshots of gcc 4.3 don't support this and we can't detect this - in the preprocessor, but we can live with this because they're unreleased. - Maketime probing would be overkill here. - - gcc also has a __attribute__((__hot__)) to move hot functions into - a special section, but I don't see any sense in this right now in - the kernel context */ -#define __cold __attribute__((__cold__)) - -#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) - -#ifndef __CHECKER__ -# define __compiletime_warning(message) __attribute__((warning(message))) -# define __compiletime_error(message) __attribute__((error(message))) -#endif /* __CHECKER__ */ - -/* - * Mark a position in code as unreachable. This can be used to - * suppress control flow warnings after asm blocks that transfer - * control elsewhere. - * - * Early snapshots of gcc 4.5 don't support this and we can't detect - * this in the preprocessor, but we can live with this because they're - * unreleased. Really, we need to have autoconf for the kernel. - */ -#define unreachable() __builtin_unreachable() - -/* Mark a function definition as prohibited from being cloned. */ -#define __noclone __attribute__((__noclone__)) - -/* - * Tell the optimizer that something else uses this function or variable. - */ -#define __visible __attribute__((externally_visible)) - -/* - * GCC 'asm goto' miscompiles certain code sequences: - * - * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 - * - * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. - * - * (asm goto is automatically volatile - the naming reflects this.) - */ -#define asm_volatile_goto(x...) 
do { asm goto(x); asm (""); } while (0) - -#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP -#define __HAVE_BUILTIN_BSWAP32__ -#define __HAVE_BUILTIN_BSWAP64__ -#define __HAVE_BUILTIN_BSWAP16__ -#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ - -#define KASAN_ABI_VERSION 4 -- cgit v1.2.3 From b86a50c3b5414eafdbee7f34af4a201a4a7817c2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 Jun 2015 15:01:05 -0700 Subject: compiler-intel: fix wrong compiler barrier() macro Cleanup commit 73679e508201 ("compiler-intel.h: Remove duplicate definition") removed the double definition of __memory_barrier() intrinsics. However, in doing so, it also removed the preceding #undef barrier by accident, meaning, the actual barrier() macro from compiler-gcc.h with inline asm is still in place as __GNUC__ is provided. Subsequently, barrier() can never be defined as __memory_barrier() from compiler.h since it already has a definition in place and if we trust the comment in compiler-intel.h, ecc doesn't support gcc specific asm statements. I don't have an ecc at hand (unsure if that's still used in the field?) and only found this by accident during code review, a revert of that cleanup would be simplest option. Fixes: 73679e508201 ("compiler-intel.h: Remove duplicate definition") Signed-off-by: Daniel Borkmann Reviewed-by: Pranith Kumar Cc: Pranith Kumar Cc: H. Peter Anvin Cc: mancha security Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-intel.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-intel.h b/include/linux/compiler-intel.h index 0c9a2f2c2802..d4c71132d07f 100644 --- a/include/linux/compiler-intel.h +++ b/include/linux/compiler-intel.h @@ -13,10 +13,12 @@ /* Intel ECC compiler doesn't support gcc specific asm stmts. * It uses intrinsics to do the equivalent things. 
*/ +#undef barrier #undef barrier_data #undef RELOC_HIDE #undef OPTIMIZER_HIDE_VAR +#define barrier() __memory_barrier() #define barrier_data(ptr) barrier() #define RELOC_HIDE(ptr, off) \ -- cgit v1.2.3 From 8c7fbe5795a016259445a61e072eb0118aaf6a61 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 25 Jun 2015 15:01:16 -0700 Subject: stddef.h: move offsetofend inside #ifndef/#endif guard, neaten Commit 3876488444e7 ("include/stddef.h: Move offsetofend() from vfio.h to a generic kernel header") added offsetofend outside the normal include #ifndef/#endif guard. Move it inside. Miscellanea: o remove unnecessary blank line o standardize offsetof macros whitespace style Signed-off-by: Joe Perches Cc: Denys Vlasenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/stddef.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stddef.h b/include/linux/stddef.h index 076af437284d..9c61c7cda936 100644 --- a/include/linux/stddef.h +++ b/include/linux/stddef.h @@ -3,7 +3,6 @@ #include - #undef NULL #define NULL ((void *)0) @@ -14,10 +13,9 @@ enum { #undef offsetof #ifdef __compiler_offsetof -#define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER) +#define offsetof(TYPE, MEMBER) __compiler_offsetof(TYPE, MEMBER) #else -#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) -#endif +#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) #endif /** @@ -28,3 +26,5 @@ enum { */ #define offsetofend(TYPE, MEMBER) \ (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) + +#endif -- cgit v1.2.3 From 3033f14ab78c326871a4902591c2518410add24a Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Thu, 25 Jun 2015 15:01:19 -0700 Subject: clone: support passing tls argument via C rather than pt_regs magic clone has some of the quirkiest syscall handling in the kernel, with a pile of special cases, historical curiosities, and architecture-specific calling conventions. 
In particular, clone with CLONE_SETTLS accepts a parameter "tls" that the C entry point completely ignores and some assembly entry points overwrite; instead, the low-level arch-specific code pulls the tls parameter out of the arch-specific register captured as part of pt_regs on entry to the kernel. That's a massive hack, and it makes the arch-specific code only work when called via the specific existing syscall entry points; because of this hack, any new clone-like system call would have to accept an identical tls argument in exactly the same arch-specific position, rather than providing a unified system call entry point across architectures. The first patch allows architectures to handle the tls argument via normal C parameter passing, if they opt in by selecting HAVE_COPY_THREAD_TLS. The second patch makes 32-bit and 64-bit x86 opt into this. These two patches came out of the clone4 series, which isn't ready for this merge window, but these first two cleanup patches were entirely uncontroversial and have acks. I'd like to go ahead and submit these two so that other architectures can begin building on top of this and opting into HAVE_COPY_THREAD_TLS. However, I'm also happy to wait and send these through the next merge window (along with v3 of clone4) if anyone would prefer that. This patch (of 2): clone with CLONE_SETTLS accepts an argument to set the thread-local storage area for the new thread. sys_clone declares an int argument tls_val in the appropriate point in the argument list (based on the various CLONE_BACKWARDS variants), but doesn't actually use or pass along that argument. Instead, sys_clone calls do_fork, which calls copy_process, which calls the arch-specific copy_thread, and copy_thread pulls the corresponding syscall argument out of the pt_regs captured at kernel entry (knowing what argument of clone that architecture passes tls in). 
Apart from being awful and inscrutable, that also only works because only one code path into copy_thread can pass the CLONE_SETTLS flag, and that code path comes from sys_clone with its architecture-specific argument-passing order. This prevents introducing a new version of the clone system call without propagating the same architecture-specific position of the tls argument. However, there's no reason to pull the argument out of pt_regs when sys_clone could just pass it down via C function call arguments. Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt into, and a new copy_thread_tls that accepts the tls parameter as an additional unsigned long (syscall-argument-sized) argument. Change sys_clone's tls argument to an unsigned long (which does not change the ABI), and pass that down to copy_thread_tls. Architectures that don't opt into copy_thread_tls will continue to ignore the C argument to sys_clone in favor of the pt_regs captured at kernel entry, and thus will be unable to introduce new versions of the clone syscall. Patch co-authored by Josh Triplett and Thiago Macieira. Signed-off-by: Josh Triplett Acked-by: Andy Lutomirski Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thiago Macieira Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 15 +++++++++++++++ include/linux/syscalls.h | 6 +++--- 2 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6633e83e608a..93ed0b682adb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2556,8 +2556,22 @@ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct */ extern void mm_release(struct task_struct *, struct mm_struct *); +#ifdef CONFIG_HAVE_COPY_THREAD_TLS +extern int copy_thread_tls(unsigned long, unsigned long, unsigned long, + struct task_struct *, unsigned long); +#else extern int copy_thread(unsigned long, unsigned long, unsigned long, struct task_struct *); + +/* Architectures that haven't opted into copy_thread_tls get the tls argument + * via pt_regs, so ignore the tls argument passed via C. 
*/ +static inline int copy_thread_tls( + unsigned long clone_flags, unsigned long sp, unsigned long arg, + struct task_struct *p, unsigned long tls) +{ + return copy_thread(clone_flags, sp, arg, p); +} +#endif extern void flush_thread(void); extern void exit_thread(void); @@ -2576,6 +2590,7 @@ extern int do_execveat(int, struct filename *, const char __user * const __user *, const char __user * const __user *, int); +extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long); extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 76d1e38aabe1..bb51becf23f8 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -827,15 +827,15 @@ asmlinkage long sys_syncfs(int fd); asmlinkage long sys_fork(void); asmlinkage long sys_vfork(void); #ifdef CONFIG_CLONE_BACKWARDS -asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int, +asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, unsigned long, int __user *); #else #ifdef CONFIG_CLONE_BACKWARDS3 asmlinkage long sys_clone(unsigned long, unsigned long, int, int __user *, - int __user *, int); + int __user *, unsigned long); #else asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, - int __user *, int); + int __user *, unsigned long); #endif #endif -- cgit v1.2.3 From d43ff430f434d862db59582c0f1f02382a678036 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 25 Jun 2015 15:01:24 -0700 Subject: printk: guard the amount written per line by devkmsg_read() This patchset updates netconsole so that it can emit messages with the same header as used in /dev/kmsg which gives netconsole receiver full log information which enables things like structured logging and detection of lost messages.
This patch (of 7): devkmsg_read() uses 8k buffer and assumes that the formatted output message won't overrun which seems safe given LOG_LINE_MAX, the current use of dict and the escaping method being used; however, we're planning to use devkmsg formatting wider and accounting for the buffer size properly isn't that complicated. This patch defines CONSOLE_EXT_LOG_MAX as 8192 and updates devkmsg_read() so that it limits output accordingly. Signed-off-by: Tejun Heo Cc: David Miller Cc: Kay Sievers Reviewed-by: Petr Mladek Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 9b30871c9149..58b1fec40d37 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -30,6 +30,8 @@ static inline const char *printk_skip_level(const char *buffer) return buffer; } +#define CONSOLE_EXT_LOG_MAX 8192 + /* printk's without a loglevel use this.. */ #define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT -- cgit v1.2.3 From 6fe29354befe4c46eb308b662155d4d8017358e1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 25 Jun 2015 15:01:30 -0700 Subject: printk: implement support for extended console drivers printk log_buf keeps various metadata for each message including its sequence number and timestamp. The metadata is currently available only through /dev/kmsg and stripped out before passed onto console drivers. We want this metadata to be available to console drivers too so that console consumers can get full information including the metadata and dictionary, which among other things can be used to detect whether messages got lost in transit. This patch implements support for extended console drivers. Consoles can indicate that they want extended messages by setting the new CON_EXTENDED flag and they'll be fed messages formatted the same way as /dev/kmsg.
"<level>,<sequnum>,<timestamp>,<contflag>;<message text>\n" If extended consoles exist, in-kernel fragment assembly is disabled. This ensures that all messages emitted to consoles have full metadata including sequence number. The contflag carries enough information to reassemble the fragments from the reader side trivially. Note that this only affects /dev/kmsg. Regular console and /proc/kmsg outputs are not affected by this change. * Extended message formatting for console drivers is enabled iff there are registered extended consoles. * Comment describing /dev/kmsg message format updated to add missing contflag field and help distinguishing variable from verbatim terms. Signed-off-by: Tejun Heo Cc: David Miller Cc: Kay Sievers Reviewed-by: Petr Mladek Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/console.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 9f50fb413c11..bd194343c346 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -115,6 +115,7 @@ static inline int con_debug_leave(void) #define CON_BOOT (8) #define CON_ANYTIME (16) /* Safe to call when cpu is offline */ #define CON_BRL (32) /* Used for a braille device */ +#define CON_EXTENDED (64) /* Use the extended output format a la /dev/kmsg */ struct console { char name[16]; -- cgit v1.2.3 From 3ea4331c60be3eee4c97e5ddabad95399f879b76 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 25 Jun 2015 15:01:47 -0700 Subject: check_syslog_permissions() cleanup Patch fixes drawbacks in check_syslog_permissions() noticed by AKPM: "from_file handling makes me cry. That's not a boolean - it's an enumerated value with two values currently defined. But the code in check_syslog_permissions() treats it as a boolean and also hardwires the knowledge that SYSLOG_FROM_PROC == 1 (or == `true`). And the name is wrong: it should be called from_proc to match SYSLOG_FROM_PROC."
Signed-off-by: Vasily Averin Cc: Kees Cook Cc: Josh Boyer Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syslog.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/syslog.h b/include/linux/syslog.h index 4b7b875a7ce1..c3a7f0cc3a27 100644 --- a/include/linux/syslog.h +++ b/include/linux/syslog.h @@ -47,12 +47,12 @@ #define SYSLOG_FROM_READER 0 #define SYSLOG_FROM_PROC 1 -int do_syslog(int type, char __user *buf, int count, bool from_file); +int do_syslog(int type, char __user *buf, int count, int source); #ifdef CONFIG_PRINTK -int check_syslog_permissions(int type, bool from_file); +int check_syslog_permissions(int type, int source); #else -static inline int check_syslog_permissions(int type, bool from_file) +static inline int check_syslog_permissions(int type, int source) { return 0; } -- cgit v1.2.3 From 94df290404cd0da8016698bf3f398410f29d9a64 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 25 Jun 2015 15:02:22 -0700 Subject: lib/string.c: introduce strreplace() Strings are sometimes sanitized by replacing a certain character (often '/') by another (often '!'). In a few places, this is done the same way Schlemiel the Painter would do it. Others are slightly smarter but still do multiple strchr() calls. Introduce strreplace() to do this using a single function call and a single pass over the string. One would expect the return value to be one of three things: void, s, or the number of replacements made. I chose the fourth, returning a pointer to the end of the string. This is more likely to be useful (for example allowing the caller to avoid a strlen call). 
Signed-off-by: Rasmus Villemoes Cc: "Theodore Ts'o" Cc: Greg Kroah-Hartman Cc: Neil Brown Cc: Steven Rostedt Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index e40099e585c9..a8d90db9c4b0 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -111,6 +111,7 @@ extern int memcmp(const void *,const void *,__kernel_size_t); extern void * memchr(const void *,int,__kernel_size_t); #endif void *memchr_inv(const void *s, int c, size_t n); +char *strreplace(char *s, char old, char new); extern void kfree_const(const void *x); -- cgit v1.2.3 From 78d8e58a086b214dddf1fd463e20a7e1d82d7866 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 26 Jun 2015 10:01:13 -0400 Subject: Revert "block, dm: don't copy bios for request clones" This reverts commit 5f1b670d0bef508a5554d92525f5f6d00d640b38. Justification for revert as reported in this dm-devel post: https://www.redhat.com/archives/dm-devel/2015-June/msg00160.html this change should not be pushed to mainline yet. Firstly, Christoph has a newer version of the patch that fixes silent data corruption problem: https://www.redhat.com/archives/dm-devel/2015-May/msg00229.html And the new version still depends on LLDDs to always complete requests to the end when error happens, while block API doesn't enforce such a requirement. If the assumption is ever broken, the inconsistency between request and bio (e.g. 
rq->__sector and rq->bio) will cause silent data corruption: https://www.redhat.com/archives/dm-devel/2015-June/msg00022.html Reported-by: Junichi Nomura Signed-off-by: Mike Snitzer --- include/linux/blk_types.h | 2 -- include/linux/blkdev.h | 6 +++++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 6ab9d12d1f17..7303b3405520 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -192,7 +192,6 @@ enum rq_flag_bits { __REQ_HASHED, /* on IO scheduler merge hash */ __REQ_MQ_INFLIGHT, /* track inflight for MQ */ __REQ_NO_TIMEOUT, /* requests may never expire */ - __REQ_CLONE, /* cloned bios */ __REQ_NR_BITS, /* stops here */ }; @@ -247,6 +246,5 @@ enum rq_flag_bits { #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) #define REQ_NO_TIMEOUT (1ULL << __REQ_NO_TIMEOUT) -#define REQ_CLONE (1ULL << __REQ_CLONE) #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 776d2ee43ba6..41c0fb573dff 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -775,7 +775,11 @@ extern void blk_add_request_payload(struct request *rq, struct page *page, unsigned int len); extern int blk_rq_check_limits(struct request_queue *q, struct request *rq); extern int blk_lld_busy(struct request_queue *q); -extern void blk_rq_prep_clone(struct request *rq, struct request *rq_src); +extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, + struct bio_set *bs, gfp_t gfp_mask, + int (*bio_ctr)(struct bio *, struct bio *, void *), + void *data); +extern void blk_rq_unprep_clone(struct request *rq); extern int blk_insert_cloned_request(struct request_queue *q, struct request *rq); extern void blk_delay_queue(struct request_queue *, unsigned long); -- cgit v1.2.3 From 5212e11fde4d40fa627668b4f2222d20db488f71 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 25 Jun 
2015 04:20:32 -0400 Subject: nd_btt: atomic sector updates BTT stands for Block Translation Table, and is a way to provide power fail sector atomicity semantics for block devices that have the ability to perform byte granularity IO. It relies on the capability of libnvdimm namespace devices to do byte aligned IO. The BTT works as a stacked block device, and reserves a chunk of space from the backing device for its accounting metadata. It is a bio-based driver because all IO is done synchronously, and there is no queuing or asynchronous completions at either the device or the driver level. The BTT uses 'lanes' to index into various 'on-disk' data structures, and lanes also act as a synchronization mechanism in case there are more CPUs than available lanes. We did a comparison between two lane lock strategies - first where we kept an atomic counter around that tracked which was the last lane that was used, and 'our' lane was determined by atomically incrementing that. That way, for the nr_cpus > nr_lanes case, theoretically, no CPU would be blocked waiting for a lane. The other strategy was to use the cpu number we're scheduled on and hash it to a lane number. Theoretically, this could block an IO that could've otherwise run using a different, free lane. But some fio workloads showed that the direct cpu -> lane hash performed faster than tracking 'last lane' - my reasoning is the cache thrash caused by moving the atomic variable made that approach slower than simply waiting out the in-progress IO. This supports the conclusion that the driver can be a very simple bio-based one that does synchronous IOs instead of queuing. Cc: Andy Lutomirski Cc: Boaz Harrosh Cc: H.
Peter Anvin Cc: Jens Axboe Cc: Ingo Molnar Cc: Christoph Hellwig Cc: Neil Brown Cc: Jeff Moyer Cc: Dave Chinner Cc: Greg KH [jmoyer: fix nmi watchdog timeout in btt_map_init] [jmoyer: move btt initialization to module load path] [jmoyer: fix memory leak in the btt initialization path] [jmoyer: Don't overwrite corrupted arenas] Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index a59dca17b3aa..531d99dfac68 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -85,6 +85,7 @@ struct nd_region_desc { const struct attribute_group **attr_groups; struct nd_interleave_set *nd_set; void *provider_data; + int num_lanes; }; struct nvdimm_bus; -- cgit v1.2.3 From 047fc8a1f9a6330eacc80374dff087e20dc2304b Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 25 Jun 2015 04:21:02 -0400 Subject: libnvdimm, nfit, nd_blk: driver for BLK-mode access persistent memory The libnvdimm implementation handles allocating dimm address space (DPA) between PMEM and BLK mode interfaces. After DPA has been allocated from a BLK-region to a BLK-namespace the nd_blk driver attaches to handle I/O as a struct bio based block device. Unlike PMEM, BLK is required to handle platform specific details like mmio register formats and memory controller interleave. For this reason the libnvdimm generic nd_blk driver calls back into the bus provider to carry out the I/O. This initial implementation handles the BLK interface defined by the ACPI 6 NFIT [1] and the NVDIMM DSM Interface Example [2] composed from DCR (dimm control region), BDW (block data window), IDT (interleave descriptor) NFIT structures and the hardware register format. [1]: http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf [2]: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf Cc: Andy Lutomirski Cc: Boaz Harrosh Cc: H. 
Peter Anvin Cc: Jens Axboe Cc: Ingo Molnar Cc: Christoph Hellwig Signed-off-by: Ross Zwisler Acked-by: Rafael J. Wysocki Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 531d99dfac68..7fc1b25bdb5d 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -14,6 +14,7 @@ */ #ifndef __LIBNVDIMM_H__ #define __LIBNVDIMM_H__ +#include <linux/kernel.h> #include <linux/sizes.h> #include <linux/types.h> @@ -89,8 +90,24 @@ struct nd_region_desc { }; struct nvdimm_bus; -struct device; struct module; +struct device; +struct nd_blk_region; +struct nd_blk_region_desc { + int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); + void (*disable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); + int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, + void *iobuf, u64 len, int rw); + struct nd_region_desc ndr_desc; +}; + +static inline struct nd_blk_region_desc *to_blk_region_desc( + struct nd_region_desc *ndr_desc) +{ + return container_of(ndr_desc, struct nd_blk_region_desc, ndr_desc); + +} + struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc, struct module *module); #define nvdimm_bus_register(parent, desc) \ @@ -99,10 +116,10 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); struct nvdimm_bus *to_nvdimm_bus(struct device *dev); struct nvdimm *to_nvdimm(struct device *dev); struct nd_region *to_nd_region(struct device *dev); +struct nd_blk_region *to_nd_blk_region(struct device *dev); struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); const char *nvdimm_name(struct nvdimm *nvdimm); void *nvdimm_provider_data(struct nvdimm *nvdimm); -void *nd_region_provider_data(struct nd_region *nd_region); struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, const struct attribute_group **groups,
unsigned long flags, unsigned long *dsm_mask); @@ -120,5 +137,11 @@ struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc); struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc); +void *nd_region_provider_data(struct nd_region *nd_region); +void *nd_blk_region_provider_data(struct nd_blk_region *ndbr); +void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data); +struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr); +unsigned int nd_region_acquire_lane(struct nd_region *nd_region); +void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane); u64 nd_fletcher64(void *addr, size_t len, bool le); #endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3 From 581388209405902b56d055f644b4dd124a206112 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 23 Jun 2015 20:08:34 -0400 Subject: libnvdimm, nfit: handle unarmed dimms, mark namespaces read-only Upon detection of an unarmed dimm in a region, arrange for descendant BTT, PMEM, or BLK instances to be read-only. A dimm is primarily marked "unarmed" via flags passed by platform firmware (NFIT). The flags in the NFIT memory device sub-structure indicate the state of the data on the nvdimm relative to its energy source or last "flush to persistence". For the most part there is nothing the driver can do but advertise the state of these flags in sysfs and emit a message if firmware indicates that the contents of the device may be corrupted. However, for the case of ACPI_NFIT_MEM_ARMED, the driver can arrange for the block devices incorporating that nvdimm to be marked read-only. This is a safe default as the data is still available and new writes are held off until the administrator either forces read-write mode, or the energy source becomes armed. 
A 'read_only' attribute is added to REGION devices to allow for overriding the default read-only policy of all descendant block devices. Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 7fc1b25bdb5d..dc799a29ed1a 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -21,6 +21,8 @@ enum { /* when a dimm supports both PMEM and BLK access a label is required */ NDD_ALIASING = 1 << 0, + /* unarmed memory devices may not persist writes */ + NDD_UNARMED = 1 << 1, /* need to set a limit somewhere, but yes, this is likely overkill */ ND_IOCTL_MAX_BUFLEN = SZ_4M, -- cgit v1.2.3 From 99759869faf15471cfce251bc138848d8af7d162 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Fri, 19 Jun 2015 17:14:15 -0600 Subject: acpi: Add acpi_map_pxm_to_online_node() The kernel initializes CPU & memory's NUMA topology from ACPI SRAT table. Some other ACPI tables, such as NFIT and DMAR, also contain proximity IDs for their device's NUMA topology. This information can be used to improve performance of these devices. This patch introduces acpi_map_pxm_to_online_node(), which is similar to acpi_map_pxm_to_node(), but always returns an online node. When the mapped node from a given proximity ID is offline, it looks up the node distance table and returns the nearest online node. ACPI device drivers, which are called after the NUMA initialization has completed in the kernel, can call this interface to obtain their device NUMA topology from ACPI tables. Such drivers do not have to deal with offline nodes. A node may be offline when a device proximity ID is unique, SRAT memory entry does not exist, or NUMA is disabled, ex. "numa=off" on x86. This patch also moves the pxm range check from acpi_get_node() to acpi_map_pxm_to_node(). Signed-off-by: Toshi Kani Acked-by: Rafael J. 
Wysocki Signed-off-by: Dan Williams --- include/linux/acpi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index e4da5e35e29c..1b3bbb11d11c 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -289,8 +289,13 @@ extern void acpi_dmi_osi_linux(int enable, const struct dmi_system_id *d); extern void acpi_osi_setup(char *str); #ifdef CONFIG_ACPI_NUMA +int acpi_map_pxm_to_online_node(int pxm); int acpi_get_node(acpi_handle handle); #else +static inline int acpi_map_pxm_to_online_node(int pxm) +{ + return 0; +} static inline int acpi_get_node(acpi_handle handle) { return 0; -- cgit v1.2.3 From 41d7a6d637e1440f5410cb43c25a3c41255540c5 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Fri, 19 Jun 2015 12:18:33 -0600 Subject: libnvdimm: Set numa_node to NVDIMM devices ACPI NFIT table has System Physical Address Range Structure entries that describe a proximity ID of each range when ACPI_NFIT_PROXIMITY_VALID is set in the flags. Change acpi_nfit_register_region() to map a proximity ID to its node ID, and set it to a new numa_node field of nd_region_desc, which is then conveyed to the nd_region device. The device core arranges for btt and namespace devices to inherit their node from their parent region.
Signed-off-by: Toshi Kani [djbw: move set_dev_node() from region.c to bus.c] Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index dc799a29ed1a..30b3deaafd51 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -89,6 +89,7 @@ struct nd_region_desc { struct nd_interleave_set *nd_set; void *provider_data; int num_lanes; + int numa_node; }; struct nvdimm_bus; -- cgit v1.2.3 From 74ae66c3b14ffa94c8d2dea201cdf8e6203d13d5 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Fri, 19 Jun 2015 12:18:34 -0600 Subject: libnvdimm: Add sysfs numa_node to NVDIMM devices Add support of sysfs 'numa_node' to I/O-related NVDIMM devices under /sys/bus/nd/devices, regionN, namespaceN.0, and bttN.x. An example of numa_node values on a 2-socket system with a single NVDIMM range on each socket is shown below. /sys/bus/nd/devices |-- btt0.0/numa_node:0 |-- btt1.0/numa_node:1 |-- btt1.1/numa_node:1 |-- namespace0.0/numa_node:0 |-- namespace1.0/numa_node:1 |-- region0/numa_node:0 |-- region1/numa_node:1 These numa_node files are then linked under the block class of their device names. /sys/class/block/pmem0/device/numa_node:0 /sys/class/block/pmem1s/device/numa_node:1 This enables numactl(8) to accept 'block:' and 'file:' paths of pmem and btt devices as shown in the examples below. 
numactl --preferred block:pmem0 --show numactl --preferred file:/dev/pmem1s --show Signed-off-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 30b3deaafd51..75e3af01ee32 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -38,6 +38,7 @@ enum { extern struct attribute_group nvdimm_bus_attribute_group; extern struct attribute_group nvdimm_attribute_group; extern struct attribute_group nd_device_attribute_group; +extern struct attribute_group nd_numa_attribute_group; extern struct attribute_group nd_region_attribute_group; extern struct attribute_group nd_mapping_attribute_group; -- cgit v1.2.3 From 61031952f4c89dba1065f7a5b9419badb112554c Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 25 Jun 2015 03:08:39 -0400 Subject: arch, x86: pmem api for ensuring durability of persistent memory updates Based on an original patch by Ross Zwisler [1]. Writes to persistent memory have the potential to be posted to cpu cache, cpu write buffers, and platform write buffers (memory controller) before being committed to persistent media. Provide apis, memcpy_to_pmem(), wmb_pmem(), and memremap_pmem(), to write data to pmem and assert that it is durable in PMEM (a persistent linear address range). A '__pmem' attribute is added so sparse can track proper usage of pointers to pmem. This continues the status quo of pmem being x86 only for 4.2, but reworks to ioremap, and wider implementation of memremap() will enable other archs in 4.3. [1]: https://lists.01.org/pipermail/linux-nvdimm/2015-May/000932.html Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Signed-off-by: Ross Zwisler [djbw: various reworks] Signed-off-by: Dan Williams --- include/linux/compiler.h | 2 + include/linux/pmem.h | 153 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 155 insertions(+) create mode 100644 include/linux/pmem.h (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 867722591be2..9a528d945498 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -21,6 +21,7 @@ # define __rcu __attribute__((noderef, address_space(4))) #else # define __rcu +# define __pmem __attribute__((noderef, address_space(5))) #endif extern void __chk_user_ptr(const volatile void __user *); extern void __chk_io_ptr(const volatile void __iomem *); @@ -42,6 +43,7 @@ extern void __chk_io_ptr(const volatile void __iomem *); # define __cond_lock(x,c) (c) # define __percpu # define __rcu +# define __pmem #endif /* Indirect macros required for expanded argument pasting, eg. __LINE__. */ diff --git a/include/linux/pmem.h b/include/linux/pmem.h new file mode 100644 index 000000000000..f6481a0b1d4f --- /dev/null +++ b/include/linux/pmem.h @@ -0,0 +1,153 @@ +/* + * Copyright(c) 2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#ifndef __PMEM_H__ +#define __PMEM_H__ + +#include <linux/io.h> + +#ifdef CONFIG_ARCH_HAS_PMEM_API +#include <asm/cacheflush.h> +#else +static inline void arch_wmb_pmem(void) +{ + BUG(); +} + +static inline bool __arch_has_wmb_pmem(void) +{ + return false; +} + +static inline void __pmem *arch_memremap_pmem(resource_size_t offset, + unsigned long size) +{ + return NULL; +} + +static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, + size_t n) +{ + BUG(); +} +#endif + +/* + * Architectures that define ARCH_HAS_PMEM_API must provide + * implementations for arch_memremap_pmem(), arch_memcpy_to_pmem(), + * arch_wmb_pmem(), and __arch_has_wmb_pmem(). + */ + +static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size) +{ + memcpy(dst, (void __force const *) src, size); +} + +static inline void memunmap_pmem(void __pmem *addr) +{ + iounmap((void __force __iomem *) addr); +} + +/** + * arch_has_wmb_pmem - true if wmb_pmem() ensures durability + * + * For a given cpu implementation within an architecture it is possible + * that wmb_pmem() resolves to a nop. In the case this returns + * false, pmem api users are unable to ensure durability and may want to + * fall back to a different data consistency model, or otherwise notify + * the user. + */ +static inline bool arch_has_wmb_pmem(void) +{ + if (IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API)) + return __arch_has_wmb_pmem(); + return false; +} + +static inline bool arch_has_pmem_api(void) +{ + return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && arch_has_wmb_pmem(); +} + +/* + * These defaults seek to offer decent performance and minimize the + * window between i/o completion and writes being durable on media. + * However, it is undefined / architecture specific whether + * default_memremap_pmem + default_memcpy_to_pmem is sufficient for + * making data durable relative to i/o completion.
+ */ +static void default_memcpy_to_pmem(void __pmem *dst, const void *src, + size_t size) +{ + memcpy((void __force *) dst, src, size); +} + +static void __pmem *default_memremap_pmem(resource_size_t offset, + unsigned long size) +{ + /* TODO: convert to ioremap_wt() */ + return (void __pmem __force *)ioremap_nocache(offset, size); +} + +/** + * memremap_pmem - map physical persistent memory for pmem api + * @offset: physical address of persistent memory + * @size: size of the mapping + * + * Establish a mapping of the architecture specific memory type expected + * by memcpy_to_pmem() and wmb_pmem(). For example, it may be + * the case that an uncacheable or writethrough mapping is sufficient, + * or a writeback mapping provided memcpy_to_pmem() and + * wmb_pmem() arrange for the data to be written through the + * cache to persistent media. + */ +static inline void __pmem *memremap_pmem(resource_size_t offset, + unsigned long size) +{ + if (arch_has_pmem_api()) + return arch_memremap_pmem(offset, size); + return default_memremap_pmem(offset, size); +} + +/** + * memcpy_to_pmem - copy data to persistent memory + * @dst: destination buffer for the copy + * @src: source buffer for the copy + * @n: length of the copy in bytes + * + * Perform a memory copy that results in the destination of the copy + * being effectively evicted from, or never written to, the processor + * cache hierarchy after the copy completes. After memcpy_to_pmem() + * data may still reside in cpu or platform buffers, so this operation + * must be followed by a wmb_pmem(). 
+ */ +static inline void memcpy_to_pmem(void __pmem *dst, const void *src, size_t n) +{ + if (arch_has_pmem_api()) + arch_memcpy_to_pmem(dst, src, n); + else + default_memcpy_to_pmem(dst, src, n); +} + +/** + * wmb_pmem - synchronize writes to persistent memory + * + * After a series of memcpy_to_pmem() operations this drains data from + * cpu write buffers and any platform (memory controller) buffers to + * ensure that written data is durable on persistent memory media. + */ +static inline void wmb_pmem(void) +{ + if (arch_has_pmem_api()) + arch_wmb_pmem(); +} +#endif /* __PMEM_H__ */ -- cgit v1.2.3 From a9730fca9946f3697410479e0ef1bd759ba00a77 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 29 Jun 2015 09:28:08 -0500 Subject: Fix kmalloc slab creation sequence This patch restores the slab creation sequence that was broken by commit 4066c33d0308f8 and also reverts the portions that introduced the KMALLOC_LOOP_XXX macros. Those can never really work since the slab creation is much more complex than just going from a minimum to a maximum number. The latest upstream kernel boots cleanly on my machine with a 64 bit x86 configuration under KVM using either SLAB or SLUB. Fixes: 4066c33d0308f8 ("support the slub_debug boot option") Reported-by: Theodore Ts'o Signed-off-by: Christoph Lameter Signed-off-by: Linus Torvalds --- include/linux/slab.h | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 9de2fdc8b5e4..a99f0e5243e1 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -153,30 +153,8 @@ size_t ksize(const void *); #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN #define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN #define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN) -/* - * The KMALLOC_LOOP_LOW is the definition for the for loop index start number - * to create the kmalloc_caches object in create_kmalloc_caches(). The first - * and the second are 96 and 192. 
You can see that in the kmalloc_index(), if - * the KMALLOC_MIN_SIZE <= 32, then return 1 (96). If KMALLOC_MIN_SIZE <= 64, - * then return 2 (192). If the KMALLOC_MIN_SIZE is bigger than 64, we don't - * need to initialize 96 and 192. Go directly to start the KMALLOC_SHIFT_LOW. - */ -#if KMALLOC_MIN_SIZE <= 32 -#define KMALLOC_LOOP_LOW 1 -#elif KMALLOC_MIN_SIZE <= 64 -#define KMALLOC_LOOP_LOW 2 -#else -#define KMALLOC_LOOP_LOW KMALLOC_SHIFT_LOW -#endif - #else #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) -/* - * The KMALLOC_MIN_SIZE of slub/slab/slob is 2^3/2^5/2^3. So, even slab is used. - * The KMALLOC_MIN_SIZE <= 32. The kmalloc-96 and kmalloc-192 should also be - * initialized. - */ -#define KMALLOC_LOOP_LOW 1 #endif /* -- cgit v1.2.3