From 29ebbba744cf8951202b5f4ea62b4a297f4662c1 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Thu, 31 Mar 2016 19:03:25 +0300
Subject: IB/mlx5: Expose correct max_sge_rd limit

commit 986ef95ecdd3eb6fa29433e68faa94c7624083be upstream.

mlx5 devices (Connect-IB, ConnectX-4, ConnectX-4-LX) have a limitation
where rdma read work queue entries cannot exceed 512 bytes.
An rdma_read wqe needs to fit in 512 bytes:
- wqe control segment (16 bytes)
- rdma segment (16 bytes)
- scatter elements (16 bytes each)

So max_sge_rd should be: (512 - 16 - 16) / 16 = 30.

Reported-by: Christoph Hellwig
Tested-by: Christoph Hellwig
Signed-off-by: Sagi Grimberg
Signed-off-by: Leon Romanovsky
Signed-off-by: Doug Ledford
Signed-off-by: Greg Kroah-Hartman
---
 include/linux/mlx5/device.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 0b473cbfa7ef..a91b67b18a73 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -334,6 +334,17 @@ enum {
 	MLX5_CAP_OFF_CMDIF_CSUM		= 46,
 };
 
+enum {
+	/*
+	 * Max wqe size for rdma read is 512 bytes, so this
+	 * limits our max_sge_rd as the wqe needs to fit:
+	 * - ctrl segment (16 bytes)
+	 * - rdma segment (16 bytes)
+	 * - scatter elements (16 bytes each)
+	 */
+	MLX5_MAX_SGE_RD	= (512 - 16 - 16) / 16
+};
+
 struct mlx5_inbox_hdr {
 	__be16		opcode;
 	u8		rsvd[4];
-- 
cgit v1.2.3
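As a quick cross-check of the arithmetic above, the following standalone sketch verifies the scatter/gather budget at compile time. It is a user-space illustration only, not driver code, and the macro names are invented stand-ins rather than the mlx5 driver's actual defines.

```c
/* Standalone sketch: verify max_sge_rd = (512 - 16 - 16) / 16 = 30.
 * Macro names below are illustrative, not the mlx5 driver's real defines.
 */
#include <assert.h>

#define RDMA_READ_WQE_MAX_SIZE	512	/* hardware limit cited in the commit */
#define WQE_CTRL_SEG_SIZE	16	/* wqe control segment                */
#define WQE_RADDR_SEG_SIZE	16	/* rdma (remote address) segment      */
#define WQE_DATA_SEG_SIZE	16	/* one scatter element                */

static_assert((RDMA_READ_WQE_MAX_SIZE - WQE_CTRL_SEG_SIZE - WQE_RADDR_SEG_SIZE) /
	      WQE_DATA_SEG_SIZE == 30,
	      "an rdma read WQE can carry at most 30 scatter elements");

int main(void)
{
	return 0;
}
```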
From d52097476caeb14f4d7e3417dda08220d2813cc4 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Thu, 21 Apr 2016 19:06:48 -0400
Subject: cgroup, cpuset: replace cpuset_post_attach_flush() with
 cgroup_subsys->post_attach callback

commit 5cf1cacb49aee39c3e02ae87068fc3c6430659b0 upstream.

Since e93ad19d0564 ("cpuset: make mm migration asynchronous"), cpuset
kicks off asynchronous NUMA node migration if necessary during task
migration and flushes it from cpuset_post_attach_flush() which is
called at the end of __cgroup_procs_write().  This is to avoid
performing migration with cgroup_threadgroup_rwsem write-locked which
can lead to deadlock through dependency on kworker creation.

memcg has a similar issue with charge moving, so let's convert it to
an official callback rather than the current one-off cpuset-specific
function.  This patch adds cgroup_subsys->post_attach callback and
makes cpuset register cpuset_post_attach_flush() as its ->post_attach.

The conversion is mostly one-to-one except that the new callback is
called under cgroup_mutex.  This is to guarantee that no other
migration operations are started before ->post_attach callbacks are
finished.  cgroup_mutex is one of the outermost mutexes in the system
and has never been, and shouldn't be, a problem.  We can add
specialized synchronization around __cgroup_procs_write() but I don't
think there's any noticeable benefit.

Signed-off-by: Tejun Heo
Cc: Li Zefan
Cc: Johannes Weiner
Cc: Michal Hocko
Signed-off-by: Greg Kroah-Hartman
---
 include/linux/cgroup-defs.h | 1 +
 include/linux/cpuset.h      | 6 ------
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index a7c7f74808a4..8da263299754 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -434,6 +434,7 @@ struct cgroup_subsys {
 	int (*can_attach)(struct cgroup_taskset *tset);
 	void (*cancel_attach)(struct cgroup_taskset *tset);
 	void (*attach)(struct cgroup_taskset *tset);
+	void (*post_attach)(void);
 	int (*can_fork)(struct task_struct *task, void **priv_p);
 	void (*cancel_fork)(struct task_struct *task, void *priv);
 	void (*fork)(struct task_struct *task, void *priv);
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index fea160ee5803..85a868ccb493 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -137,8 +137,6 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 	task_unlock(current);
 }
 
-extern void cpuset_post_attach_flush(void);
-
 #else /* !CONFIG_CPUSETS */
 
 static inline bool cpusets_enabled(void) { return false; }
@@ -245,10 +243,6 @@ static inline bool read_mems_allowed_retry(unsigned int seq)
 	return false;
 }
 
-static inline void cpuset_post_attach_flush(void)
-{
-}
-
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
-- 
cgit v1.2.3
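To make the ordering of the new hook concrete, here is a minimal user-space mock of the attach path the commit describes. It is not kernel code: the structures are stripped down, the locking is only indicated in comments, and the function names are illustrative (the actual registration of cpuset_post_attach_flush() as ->post_attach lives in kernel/cpuset.c, which is outside the include/linux diff shown above).

```c
/* Standalone mock (not kernel code) of the ->post_attach ordering described
 * above: ->attach may queue asynchronous work while the heavy lock is held,
 * and ->post_attach flushes it afterwards, still serialized by cgroup_mutex.
 */
#include <stdio.h>

struct cgroup_subsys {
	void (*attach)(void);
	void (*post_attach)(void);	/* new, optional callback */
};

static void cpuset_attach(void)
{
	puts("attach: kick off asynchronous NUMA node migration");
}

static void cpuset_post_attach_flush(void)
{
	puts("post_attach: flush the pending migration work");
}

static struct cgroup_subsys cpuset_subsys = {
	.attach		= cpuset_attach,
	.post_attach	= cpuset_post_attach_flush,
};

static void cgroup_procs_write(struct cgroup_subsys *ss)
{
	/* cgroup_threadgroup_rwsem is write-locked around the attach itself... */
	ss->attach();
	/* ...rwsem dropped here; cgroup_mutex still held for ->post_attach */
	if (ss->post_attach)
		ss->post_attach();
}

int main(void)
{
	cgroup_procs_write(&cpuset_subsys);
	return 0;
}
```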
Shutemov" Cc: Konstantin Khlebnikov Cc: Michal Hocko Cc: Vlastimil Babka Cc: Jerome Marchand Cc: Johannes Weiner Cc: Dave Hansen Cc: Mel Gorman Cc: Dan Williams Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Michael Holzheu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 00bad7793788..fb8b20e5d021 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1084,6 +1084,8 @@ struct zap_details { struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); +struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t pmd); int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); -- cgit v1.2.3 From 720089ef0ba8007c34dfa7d80d96e27d6d23088f Mon Sep 17 00:00:00 2001 From: David Woods Date: Thu, 17 Dec 2015 14:31:26 -0500 Subject: arm64: hugetlb: add support for PTE contiguous bit The arm64 MMU supports a Contiguous bit which is a hint that the TTE is one of a set of contiguous entries which can be cached in a single TLB entry. Supporting this bit adds new intermediate huge page sizes. The set of huge page sizes available depends on the base page size. Without using contiguous pages the huge page sizes are as follows. 4KB: 2MB 1GB 64KB: 512MB With a 4KB granule, the contiguous bit groups together sets of 16 pages and with a 64KB granule it groups sets of 32 pages. This enables two new huge page sizes in each case, so that the full set of available sizes is as follows. 4KB: 64KB 2MB 32MB 1GB 64KB: 2MB 512MB 16GB If a 16KB granule is used then the contiguous bit groups 128 pages at the PTE level and 32 pages at the PMD level. If the base page size is set to 64KB then 2MB pages are enabled by default. It is possible in the future to make 2MB the default huge page size for both 4KB and 64KB granules. Reviewed-by: Chris Metcalf Reviewed-by: Steve Capper Signed-off-by: David Woods Signed-off-by: Will Deacon (cherry picked from commit 66b3923a1a0f77a563b43f43f6ad091354abbfe9) Signed-off-by: Alex Shi --- include/linux/hugetlb.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 685c262e0be8..b0eb06423d5e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -96,9 +96,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, struct address_space *mapping, pgoff_t idx, unsigned long address); -#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); -#endif extern int hugepages_treat_as_movable; extern int sysctl_hugetlb_shm_group; -- cgit v1.2.3 From ee6457583818600e4c9b7f3a09d358d6ae3727b8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 10 Jan 2016 11:29:07 +0100 Subject: efi: stub: implement efi_get_random_bytes() based on EFI_RNG_PROTOCOL This exposes the firmware's implementation of EFI_RNG_PROTOCOL via a new function efi_get_random_bytes(). 
From ee6457583818600e4c9b7f3a09d358d6ae3727b8 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Sun, 10 Jan 2016 11:29:07 +0100
Subject: efi: stub: implement efi_get_random_bytes() based on EFI_RNG_PROTOCOL

This exposes the firmware's implementation of EFI_RNG_PROTOCOL via a
new function efi_get_random_bytes().

Reviewed-by: Matt Fleming
Signed-off-by: Ard Biesheuvel
Signed-off-by: Catalin Marinas
(cherry picked from commit e4fbf4767440472f9d23b0f25a2b905e1c63b6a8)
Signed-off-by: Alex Shi
---
 include/linux/efi.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index 47be3ad7d3e5..333d0ca6940f 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -299,7 +299,7 @@ typedef struct {
 	void *open_protocol_information;
 	void *protocols_per_handle;
 	void *locate_handle_buffer;
-	void *locate_protocol;
+	efi_status_t (*locate_protocol)(efi_guid_t *, void *, void **);
 	void *install_multiple_protocol_interfaces;
 	void *uninstall_multiple_protocol_interfaces;
 	void *calculate_crc32;
@@ -599,6 +599,10 @@ void efi_native_runtime_setup(void);
 #define EFI_PROPERTIES_TABLE_GUID \
     EFI_GUID( 0x880aaca3, 0x4adc, 0x4a04, 0x90, 0x79, 0xb7, 0x47, 0x34, 0x08, 0x25, 0xe5 )
 
+#define EFI_RNG_PROTOCOL_GUID \
+	EFI_GUID(0x3152bca5, 0xeade, 0x433d, \
+		 0x86, 0x2e, 0xc0, 0x1c, 0xdc, 0x29, 0x1f, 0x44)
+
 typedef struct {
 	efi_guid_t guid;
 	u64 table;
-- 
cgit v1.2.3
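The only include/linux changes shown above are the typed locate_protocol pointer and the new protocol GUID; the stub code that uses them is not part of this diff. The standalone mock below illustrates the call pattern such a helper follows: locate EFI_RNG_PROTOCOL through boot services, then ask the protocol instance for bytes. All types and the fake "firmware" here are simplified stand-ins, not real EFI definitions or the kernel's actual libstub implementation.

```c
/* Standalone mock of the "locate protocol, then call it" pattern described
 * above.  Types and the fake firmware below are simplified stand-ins; the
 * real EFI definitions and the kernel's libstub helper look different.
 */
#include <stdio.h>
#include <string.h>

typedef unsigned long efi_status_t;
#define EFI_SUCCESS	0
#define EFI_NOT_FOUND	14

struct efi_rng_protocol {
	efi_status_t (*get_rng)(struct efi_rng_protocol *self, void *algo,
				unsigned long size, unsigned char *out);
};

/* "Firmware" side: a trivial RNG protocol instance (not actually random). */
static efi_status_t fake_get_rng(struct efi_rng_protocol *self, void *algo,
				 unsigned long size, unsigned char *out)
{
	(void)self; (void)algo;
	for (unsigned long i = 0; i < size; i++)
		out[i] = (unsigned char)(37 * i + 11);
	return EFI_SUCCESS;
}

static struct efi_rng_protocol fake_rng = { .get_rng = fake_get_rng };

/* Stand-in for boot_services->locate_protocol(guid, registration, &iface). */
static efi_status_t locate_protocol(const char *guid, void *reg, void **iface)
{
	(void)reg;
	if (strcmp(guid, "EFI_RNG_PROTOCOL_GUID") == 0) {
		*iface = &fake_rng;
		return EFI_SUCCESS;
	}
	return EFI_NOT_FOUND;
}

/* Shape of an efi_get_random_bytes()-style helper built on the above. */
static efi_status_t efi_get_random_bytes(unsigned long size, unsigned char *out)
{
	struct efi_rng_protocol *rng;
	efi_status_t status;

	status = locate_protocol("EFI_RNG_PROTOCOL_GUID", NULL, (void **)&rng);
	if (status != EFI_SUCCESS)
		return status;

	return rng->get_rng(rng, NULL, size, out);
}

int main(void)
{
	unsigned char buf[8];

	if (efi_get_random_bytes(sizeof(buf), buf) == EFI_SUCCESS)
		printf("got %zu bytes, first byte 0x%02x\n", sizeof(buf), buf[0]);
	return 0;
}
```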