From ebe0bbf06c9e03613bdcb6b5a704595a9344b7ff Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Mon, 10 Oct 2005 20:52:19 -0700 Subject: [NETFILTER] nfnetlink: use highest bit of nfa_type to indicate nested TLV As Henrik Nordstrom pointed out, all our efforts with "split endian" (i.e. host byte order tags, net byte order values) are useless, unless a parser can determine whether an attribute is nested or not. This patch steals the highest bit of nfattr.nfa_type to indicate whether the data payload contains a nested nfattr (1) or not (0). This will break userspace compatibility, but luckily no kernel with nfnetlink was released so far. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- include/linux/netfilter/nfnetlink.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 1d5b10ae2399..f08e870100f4 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -41,11 +41,15 @@ enum nfnetlink_groups { struct nfattr { u_int16_t nfa_len; - u_int16_t nfa_type; + u_int16_t nfa_type; /* we use 15 bits for the type, and the highest + * bit to indicate whether the payload is nested */ } __attribute__ ((packed)); -/* FIXME: Shamelessly copy and pasted from rtnetlink.h, it's time - * to put this in a generic file */ +/* FIXME: Apart from NFNL_NFA_NESTED shamelessly copy and pasted from + * rtnetlink.h, it's time to put this in a generic file */ + +#define NFNL_NFA_NEST 0x8000 +#define NFA_TYPE(attr) ((attr)->nfa_type & 0x7fff) #define NFA_ALIGNTO 4 #define NFA_ALIGN(len) (((len) + NFA_ALIGNTO - 1) & ~(NFA_ALIGNTO - 1)) @@ -59,7 +63,7 @@ struct nfattr #define NFA_PAYLOAD(nfa) ((int)((nfa)->nfa_len) - NFA_LENGTH(0)) #define NFA_NEST(skb, type) \ ({ struct nfattr *__start = (struct nfattr *) (skb)->tail; \ - NFA_PUT(skb, type, 0, NULL); \ + NFA_PUT(skb, (NFNL_NFA_NEST | type), 0, NULL); \ __start; }) #define NFA_NEST_END(skb, start) \ ({ (start)->nfa_len = ((skb)->tail - (unsigned char *) (start)); \ -- cgit v1.2.3 From b3a91d037a2575040f9b6a483f60c407a3d80368 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Mon, 10 Oct 2005 20:52:36 -0700 Subject: [NETFILTER] nat: remove bogus structure member When 'rustynat' was merged in 2.6.12, the use of the "helper" pointer of struct ipt_nat_info was obsoleted, but the pointer not removed from the struct. This patch removes the pointer, thereby yet again shrinking struct ip_conntrack. Discovered-by: Rusty Russell Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- include/linux/netfilter_ipv4/ip_nat.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv4/ip_nat.h b/include/linux/netfilter_ipv4/ip_nat.h index e201ec6e9905..41a107de17cf 100644 --- a/include/linux/netfilter_ipv4/ip_nat.h +++ b/include/linux/netfilter_ipv4/ip_nat.h @@ -58,10 +58,6 @@ extern rwlock_t ip_nat_lock; struct ip_nat_info { struct list_head bysource; - - /* Helper (NULL if none). */ - struct ip_nat_helper *helper; - struct ip_nat_seq seq[IP_CT_DIR_MAX]; }; -- cgit v1.2.3 From 5bbc243aafff9ad653dc7a9fa7bcaf0b4631355a Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Mon, 10 Oct 2005 20:54:01 -0700 Subject: [NETFILTER]: Add missing include to ip_conntrack_tuple.h Without this #include, __be16 is not defined and userspace programs will break. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- include/linux/netfilter_ipv4/ip_conntrack_tuple.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv4/ip_conntrack_tuple.h b/include/linux/netfilter_ipv4/ip_conntrack_tuple.h index 20e43f018b7c..3232db11a4e5 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_tuple.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_tuple.h @@ -1,6 +1,8 @@ #ifndef _IP_CONNTRACK_TUPLE_H #define _IP_CONNTRACK_TUPLE_H +#include + /* A `tuple' is a structure containing the information to uniquely identify a connection. ie. if two packets have the same tuple, they are in the same connection; if not, they are not. -- cgit v1.2.3 From e1c73b78e3706bd3c336d4730a01dd4081dfb7ee Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 10 Oct 2005 20:55:49 -0700 Subject: [NETFILTER] ctnetlink: add one nesting level for TCP state To keep consistency, the TCP private protocol information is nested attributes under CTA_PROTOINFO_TCP. This way the sequence of attributes to access the TCP state information looks like here below: CTA_PROTOINFO CTA_PROTOINFO_TCP CTA_PROTOINFO_TCP_STATE instead of: CTA_PROTOINFO CTA_PROTOINFO_TCP_STATE Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- include/linux/netfilter/nfnetlink_conntrack.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h index 5c55751c78e4..fb5511030185 100644 --- a/include/linux/netfilter/nfnetlink_conntrack.h +++ b/include/linux/netfilter/nfnetlink_conntrack.h @@ -70,11 +70,18 @@ enum ctattr_l4proto { enum ctattr_protoinfo { CTA_PROTOINFO_UNSPEC, - CTA_PROTOINFO_TCP_STATE, + CTA_PROTOINFO_TCP, __CTA_PROTOINFO_MAX }; #define CTA_PROTOINFO_MAX (__CTA_PROTOINFO_MAX - 1) +enum ctattr_protoinfo_tcp { + CTA_PROTOINFO_TCP_UNSPEC, + CTA_PROTOINFO_TCP_STATE, + __CTA_PROTOINFO_TCP_MAX +}; +#define CTA_PROTOINFO_TCP_MAX (__CTA_PROTOINFO_TCP_MAX - 1) + enum ctattr_counters { CTA_COUNTERS_UNSPEC, CTA_COUNTERS_PACKETS, -- cgit v1.2.3 From a051a8f7306476af0a74370ad56e793cb6c43bf7 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Mon, 10 Oct 2005 21:21:10 -0700 Subject: [NETFILTER]: Use only 32bit counters for CONNTRACK_ACCT Initially we used 64bit counters for conntrack-based accounting, since we had no event mechanism to tell userspace that our counters are about to overflow. With nfnetlink_conntrack, we now have such a event mechanism and thus can save 16bytes per connection. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- include/linux/netfilter/nfnetlink_conntrack.h | 6 ++++-- include/linux/netfilter_ipv4/ip_conntrack.h | 8 ++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h index fb5511030185..116fcaced909 100644 --- a/include/linux/netfilter/nfnetlink_conntrack.h +++ b/include/linux/netfilter/nfnetlink_conntrack.h @@ -84,8 +84,10 @@ enum ctattr_protoinfo_tcp { enum ctattr_counters { CTA_COUNTERS_UNSPEC, - CTA_COUNTERS_PACKETS, - CTA_COUNTERS_BYTES, + CTA_COUNTERS_PACKETS, /* old 64bit counters */ + CTA_COUNTERS_BYTES, /* old 64bit counters */ + CTA_COUNTERS32_PACKETS, + CTA_COUNTERS32_BYTES, __CTA_COUNTERS_MAX }; #define CTA_COUNTERS_MAX (__CTA_COUNTERS_MAX - 1) diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h index 4ced38736813..d078bb91d9e5 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack.h +++ b/include/linux/netfilter_ipv4/ip_conntrack.h @@ -117,6 +117,10 @@ enum ip_conntrack_events /* NAT info */ IPCT_NATINFO_BIT = 10, IPCT_NATINFO = (1 << IPCT_NATINFO_BIT), + + /* Counter highest bit has been set */ + IPCT_COUNTER_FILLING_BIT = 11, + IPCT_COUNTER_FILLING = (1 << IPCT_COUNTER_FILLING_BIT), }; enum ip_conntrack_expect_events { @@ -192,8 +196,8 @@ do { \ struct ip_conntrack_counter { - u_int64_t packets; - u_int64_t bytes; + u_int32_t packets; + u_int32_t bytes; }; struct ip_conntrack_helper; -- cgit v1.2.3 From 339231537506846cb232a2f0cc4a2c662b2d5b07 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 10 Oct 2005 21:23:28 -0700 Subject: [NETFILTER] ctnetlink: allow userspace to change TCP state This patch adds the ability of changing the state a TCP connection. I know that this must be used with care but it's required to provide a complete conntrack creation via conntrack_netlink. So I'll document this aspect on the upcoming docs. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- include/linux/netfilter_ipv4/ip_conntrack_protocol.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv4/ip_conntrack_protocol.h b/include/linux/netfilter_ipv4/ip_conntrack_protocol.h index b6b99be8632a..2c76b879e3dc 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_protocol.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_protocol.h @@ -52,6 +52,9 @@ struct ip_conntrack_protocol int (*to_nfattr)(struct sk_buff *skb, struct nfattr *nfa, const struct ip_conntrack *ct); + /* convert nfnetlink attributes to protoinfo */ + int (*from_nfattr)(struct nfattr *tb[], struct ip_conntrack *ct); + int (*tuple_to_nfattr)(struct sk_buff *skb, const struct ip_conntrack_tuple *t); int (*nfattr_to_tuple)(struct nfattr *tb[], -- cgit v1.2.3 From afb997c6163b33292d31a09d6aa5cbb03ffa5bf1 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Wed, 12 Oct 2005 15:12:21 -0700 Subject: [NETPOLL]: wrong return for null netpoll_poll_lock() When netpoll is not being used, the macro that defines the removed routing netpoll_poll_lock defines the return as zero, but the real routine returns a `void *` Signed-off-by: Ben Dooks Signed-off-by: David S. Miller --- include/linux/netpoll.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 5ade54a78dbb..ca5a8733000f 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -86,7 +86,7 @@ static inline void netpoll_poll_unlock(void *have) #else #define netpoll_rx(a) 0 -#define netpoll_poll_lock(a) 0 +#define netpoll_poll_lock(a) NULL #define netpoll_poll_unlock(a) #endif -- cgit v1.2.3 From c8923c6b852d3a97c1faad0566e38fca330375a7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 13 Oct 2005 14:41:23 -0700 Subject: [NETFILTER]: Fix OOPSes on machines with discontiguous cpu numbering. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Original patch by Harald Welte, with feedback from Herbert Xu and testing by Sébastien Bernard. EBTABLES, ARP tables, and IP/IP6 tables all assume that cpus are numbered linearly. That is not necessarily true. This patch fixes that up by calculating the largest possible cpu number, and allocating enough per-cpu structure space given that. Signed-off-by: David S. Miller --- include/linux/cpumask.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index b15826f6e3a2..fe9778301d07 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -392,4 +392,16 @@ extern cpumask_t cpu_present_map; #define for_each_online_cpu(cpu) for_each_cpu_mask((cpu), cpu_online_map) #define for_each_present_cpu(cpu) for_each_cpu_mask((cpu), cpu_present_map) +/* Find the highest possible smp_processor_id() */ +static inline unsigned int highest_possible_processor_id(void) +{ + unsigned int cpu, highest = 0; + + for_each_cpu_mask(cpu, cpu_possible_map) + highest = cpu; + + return highest; +} + + #endif /* __LINUX_CPUMASK_H */ -- cgit v1.2.3 From e26148d934762b61133a64b6862f870624ff617d Mon Sep 17 00:00:00 2001 From: Tim Schmielau Date: Fri, 14 Oct 2005 15:59:05 -0700 Subject: [PATCH] Fix copy-and-paste error in BSD accounting Fix copy and paste error in jiffies_to_AHZ conversion which leads to wrong BSD accounting information on alpha and ia64 when CONFIG_BSD_PROCESS_ACCT_V3 is turned on. Also update comment to match reorganised header files. Signed-off-by: Tim Schmielau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/acct.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acct.h b/include/linux/acct.h index 1993a3691768..19f70462b3be 100644 --- a/include/linux/acct.h +++ b/include/linux/acct.h @@ -162,13 +162,13 @@ typedef struct acct acct_t; #ifdef __KERNEL__ /* * Yet another set of HZ to *HZ helper functions. - * See for the original. + * See for the original. */ static inline u32 jiffies_to_AHZ(unsigned long x) { #if (TICK_NSEC % (NSEC_PER_SEC / AHZ)) == 0 - return x / (HZ / USER_HZ); + return x / (HZ / AHZ); #else u64 tmp = (u64)x * TICK_NSEC; do_div(tmp, (NSEC_PER_SEC / AHZ)); -- cgit v1.2.3 From 688ce17b8599abc548b406c00e4d18ae0dec954f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 16 Oct 2005 00:17:33 -0700 Subject: [PATCH]: highest_possible_processor_id() has to be a macro ... otherwise, things like alpha and sparc64 break and break badly. They define cpu_possible_map to something else in smp.h *AFTER* having included cpumask.h. If that puppy is a macro, expansion will happen at the actual caller, when we'd already seen #define cpu_possible_map ... and we will get the right thing used. As an inline helper it will be tokenized before we get to that define and that's it; no matter what we define later, it won't affect anything. We get modules with dependency on cpu_possible_map instead of the right symbol (phys_cpu_present_map in case of sparc64), or outright link errors if they are built-in. Signed-off-by: Al Viro Signed-off-by: David S. Miller --- include/linux/cpumask.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index fe9778301d07..9bdba8169b41 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -393,15 +393,13 @@ extern cpumask_t cpu_present_map; #define for_each_present_cpu(cpu) for_each_cpu_mask((cpu), cpu_present_map) /* Find the highest possible smp_processor_id() */ -static inline unsigned int highest_possible_processor_id(void) -{ - unsigned int cpu, highest = 0; - - for_each_cpu_mask(cpu, cpu_possible_map) - highest = cpu; - - return highest; -} +#define highest_possible_processor_id() \ +({ \ + unsigned int cpu, highest = 0; \ + for_each_cpu_mask(cpu, cpu_possible_map) \ + highest = cpu; \ + highest; \ +}) #endif /* __LINUX_CPUMASK_H */ -- cgit v1.2.3 From b24d18aa743dad0c42918157c5d717686269d3a8 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 16 Oct 2005 20:29:20 -0700 Subject: [PATCH] list: add missing rcu_dereference on first element It seems that all the list_*_rcu primitives are missing a memory barrier on the very first dereference. For example, #define list_for_each_rcu(pos, head) \ for (pos = (head)->next; prefetch(pos->next), pos != (head); \ pos = rcu_dereference(pos->next)) It will go something like: pos = (head)->next prefetch(pos->next) pos != (head) do stuff We're missing a barrier here. pos = rcu_dereference(pos->next) fetch pos->next barrier given by rcu_dereference(pos->next) store pos Without the missing barrier, the pos->next value may turn out to be stale. In fact, if "do stuff" were also dereferencing pos and relying on list_for_each_rcu to provide the barrier then it may also break. So here is a patch to make sure that we have a barrier for the first element in the list. Signed-off-by: Herbert Xu Acked-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list.h | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index e6ec59682274..084971f333fe 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -442,12 +442,14 @@ static inline void list_splice_init(struct list_head *list, * as long as the traversal is guarded by rcu_read_lock(). */ #define list_for_each_rcu(pos, head) \ - for (pos = (head)->next; prefetch(pos->next), pos != (head); \ - pos = rcu_dereference(pos->next)) + for (pos = (head)->next; \ + prefetch(rcu_dereference(pos)->next), pos != (head); \ + pos = pos->next) #define __list_for_each_rcu(pos, head) \ - for (pos = (head)->next; pos != (head); \ - pos = rcu_dereference(pos->next)) + for (pos = (head)->next; \ + rcu_dereference(pos) != (head); \ + pos = pos->next) /** * list_for_each_safe_rcu - iterate over an rcu-protected list safe @@ -461,8 +463,9 @@ static inline void list_splice_init(struct list_head *list, * as long as the traversal is guarded by rcu_read_lock(). */ #define list_for_each_safe_rcu(pos, n, head) \ - for (pos = (head)->next, n = pos->next; pos != (head); \ - pos = rcu_dereference(n), n = pos->next) + for (pos = (head)->next; \ + n = rcu_dereference(pos)->next, pos != (head); \ + pos = n) /** * list_for_each_entry_rcu - iterate over rcu list of given type @@ -474,11 +477,11 @@ static inline void list_splice_init(struct list_head *list, * the _rcu list-mutation primitives such as list_add_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ -#define list_for_each_entry_rcu(pos, head, member) \ - for (pos = list_entry((head)->next, typeof(*pos), member); \ - prefetch(pos->member.next), &pos->member != (head); \ - pos = rcu_dereference(list_entry(pos->member.next, \ - typeof(*pos), member))) +#define list_for_each_entry_rcu(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); \ + prefetch(rcu_dereference(pos)->member.next), \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) /** @@ -492,8 +495,9 @@ static inline void list_splice_init(struct list_head *list, * as long as the traversal is guarded by rcu_read_lock(). */ #define list_for_each_continue_rcu(pos, head) \ - for ((pos) = (pos)->next; prefetch((pos)->next), (pos) != (head); \ - (pos) = rcu_dereference((pos)->next)) + for ((pos) = (pos)->next; \ + prefetch(rcu_dereference((pos))->next), (pos) != (head); \ + (pos) = (pos)->next) /* * Double linked lists with a single pointer list head. @@ -696,8 +700,9 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, pos = n) #define hlist_for_each_rcu(pos, head) \ - for ((pos) = (head)->first; pos && ({ prefetch((pos)->next); 1; }); \ - (pos) = rcu_dereference((pos)->next)) + for ((pos) = (head)->first; \ + rcu_dereference((pos)) && ({ prefetch((pos)->next); 1; }); \ + (pos) = (pos)->next) /** * hlist_for_each_entry - iterate over list of given type @@ -762,9 +767,9 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, */ #define hlist_for_each_entry_rcu(tpos, pos, head, member) \ for (pos = (head)->first; \ - pos && ({ prefetch(pos->next); 1;}) && \ + rcu_dereference(pos) && ({ prefetch(pos->next); 1;}) && \ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ - pos = rcu_dereference(pos->next)) + pos = pos->next) #else #warning "don't include kernel headers in userspace" -- cgit v1.2.3 From 5ee832dbc6770135ec8d63296af0a4374557bb79 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 17 Oct 2005 20:01:21 +0200 Subject: [PATCH] rcu: keep rcu callback event counter This makes call_rcu() keep track of how many events there are on the RCU list, and cause a reschedule event when the list gets too long. This helps keep RCU event lists down. Signed-off-by: Linus Torvalds --- include/linux/rcupdate.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 4e65eb44adfd..70191a5a148f 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -94,6 +94,7 @@ struct rcu_data { long batch; /* Batch # for current RCU batch */ struct rcu_head *nxtlist; struct rcu_head **nxttail; + long count; /* # of queued items */ struct rcu_head *curlist; struct rcu_head **curtail; struct rcu_head *donelist; -- cgit v1.2.3 From 4faa5285283fad081443e3612ca426a311bb6c7e Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Mon, 17 Oct 2005 16:43:33 -0700 Subject: [PATCH] aio: revert lock_kiocb() lock_kiocb() was introduced to serialize retrying and cancellation. In the process of doing so it tried to sleep waiting for KIF_LOCKED while holding the ctx_lock spinlock. Recent fixes have ensured that multiple concurrent retries won't be attempted for a given iocb. Cancel has other problems and has no significant in-tree users that have been complaining about it. So for the immediate future we'll revert sleeping with the lock held and will address proper cancellation and retry serialization in the future. Signed-off-by: Zach Brown Acked-by: Benjamin LaHaise Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/aio.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/aio.h b/include/linux/aio.h index 60def658b246..0decf66117c1 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -24,7 +24,12 @@ struct kioctx; #define KIOCB_SYNC_KEY (~0U) /* ki_flags bits */ -#define KIF_LOCKED 0 +/* + * This may be used for cancel/retry serialization in the future, but + * for now it's unused and we probably don't want modules to even + * think they can use it. + */ +/* #define KIF_LOCKED 0 */ #define KIF_KICKED 1 #define KIF_CANCELLED 2 -- cgit v1.2.3 From 3359b54c8c07338f3a863d1109b42eebccdcf379 Mon Sep 17 00:00:00 2001 From: "Seth, Rohit" Date: Tue, 18 Oct 2005 14:15:12 -0700 Subject: [PATCH] Handle spurious page fault for hugetlb region The hugetlb pages are currently pre-faulted. At the time of mmap of hugepages, we populate the new PTEs. It is possible that HW has already cached some of the unused PTEs internally. These stale entries never get a chance to be purged in existing control flow. This patch extends the check in page fault code for hugepages. Check if a faulted address falls with in size for the hugetlb file backing it. We return VM_FAULT_MINOR for these cases (assuming that the arch specific page-faulting code purges the stale entry for the archs that need it). Signed-off-by: Rohit Seth [ This is apparently arguably an ia64 port bug. But the code won't hurt, and for now it fixes a real problem on some ia64 machines ] Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e670b0d13fe0..42cb7d70f9ac 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -155,11 +155,24 @@ static inline void set_file_hugepages(struct file *file) { file->f_op = &hugetlbfs_file_operations; } + +static inline int valid_hugetlb_file_off(struct vm_area_struct *vma, + unsigned long address) +{ + struct inode *inode = vma->vm_file->f_dentry->d_inode; + loff_t file_off = address - vma->vm_start; + + file_off += (vma->vm_pgoff << PAGE_SHIFT); + + return (file_off < inode->i_size); +} + #else /* !CONFIG_HUGETLBFS */ #define is_file_hugepages(file) 0 #define set_file_hugepages(file) BUG() #define hugetlb_zero_setup(size) ERR_PTR(-ENOSYS) +#define valid_hugetlb_file_off(vma, address) 0 #endif /* !CONFIG_HUGETLBFS */ -- cgit v1.2.3 From 281dd25cdc0d6903929b79183816d151ea626341 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Wed, 19 Oct 2005 15:52:18 -0700 Subject: [PATCH] swiotlb: make sure initial DMA allocations really are in DMA memory This introduces a limit parameter to the core bootmem allocator; The new parameter indicates that physical memory allocated by the bootmem allocator should be within the requested limit. We also introduce alloc_bootmem_low_pages_limit, alloc_bootmem_node_limit, alloc_bootmem_low_pages_node_limit apis, but alloc_bootmem_low_pages_limit is the only api used for swiotlb. The existing alloc_bootmem_low_pages() api could instead have been changed and made to pass right limit to the core allocator. But that would make the patch more intrusive for 2.6.14, as other arches use alloc_bootmem_low_pages(). We may be done that post 2.6.14 as a cleanup. With this, swiotlb gets memory within 4G for both x86_64 and ia64 arches. Signed-off-by: Yasunori Goto Cc: Ravikiran G Thirumalai Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 82bd8842d11c..3b03b0b868dd 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -43,7 +43,7 @@ typedef struct bootmem_data { extern unsigned long __init bootmem_bootmap_pages (unsigned long); extern unsigned long __init init_bootmem (unsigned long addr, unsigned long memend); extern void __init free_bootmem (unsigned long addr, unsigned long size); -extern void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal); +extern void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal, unsigned long limit); #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE extern void __init reserve_bootmem (unsigned long addr, unsigned long size); #define alloc_bootmem(x) \ @@ -54,6 +54,16 @@ extern void __init reserve_bootmem (unsigned long addr, unsigned long size); __alloc_bootmem((x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_low_pages(x) \ __alloc_bootmem((x), PAGE_SIZE, 0) + +#define alloc_bootmem_limit(x, limit) \ + __alloc_bootmem_limit((x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS), (limit)) +#define alloc_bootmem_low_limit(x, limit) \ + __alloc_bootmem_limit((x), SMP_CACHE_BYTES, 0, (limit)) +#define alloc_bootmem_pages_limit(x, limit) \ + __alloc_bootmem_limit((x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS), (limit)) +#define alloc_bootmem_low_pages_limit(x, limit) \ + __alloc_bootmem_limit((x), PAGE_SIZE, 0, (limit)) + #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ extern unsigned long __init free_all_bootmem (void); @@ -61,7 +71,7 @@ extern unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long f extern void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size); extern void __init free_bootmem_node (pg_data_t *pgdat, unsigned long addr, unsigned long size); extern unsigned long __init free_all_bootmem_node (pg_data_t *pgdat); -extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); +extern void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit); #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE #define alloc_bootmem_node(pgdat, x) \ __alloc_bootmem_node((pgdat), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) @@ -69,6 +79,14 @@ extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, __alloc_bootmem_node((pgdat), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_low_pages_node(pgdat, x) \ __alloc_bootmem_node((pgdat), (x), PAGE_SIZE, 0) + +#define alloc_bootmem_node_limit(pgdat, x, limit) \ + __alloc_bootmem_node_limit((pgdat), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS), (limit)) +#define alloc_bootmem_pages_node_limit(pgdat, x, limit) \ + __alloc_bootmem_node_limit((pgdat), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS), (limit)) +#define alloc_bootmem_low_pages_node_limit(pgdat, x, limit) \ + __alloc_bootmem_node_limit((pgdat), (x), PAGE_SIZE, 0, (limit)) + #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP @@ -105,5 +123,15 @@ extern void *__init alloc_large_system_hash(const char *tablename, #endif extern int __initdata hashdist; /* Distribute hashes across NUMA nodes? */ +static inline void *__alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal) +{ + return __alloc_bootmem_limit(size, align, goal, 0); +} + +static inline void *__alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, + unsigned long goal) +{ + return __alloc_bootmem_node_limit(pgdat, size, align, goal, 0); +} #endif /* _LINUX_BOOTMEM_H */ -- cgit v1.2.3 From ac9b9c667c2e1194e22ebe0a441ae1c37aaa9b90 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 20 Oct 2005 16:24:28 +0100 Subject: [PATCH] Fix handling spurious page fault for hugetlb region This reverts commit 3359b54c8c07338f3a863d1109b42eebccdcf379 and replaces it with a cleaner version that is purely based on page table operations, so that the synchronization between inode size and hugetlb mappings becomes moot. Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 42cb7d70f9ac..d664330d900e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -25,6 +25,8 @@ int is_hugepage_mem_enough(size_t); unsigned long hugetlb_total_pages(void); struct page *alloc_huge_page(void); void free_huge_page(struct page *); +int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, int write_access); extern unsigned long max_huge_pages; extern const unsigned long hugetlb_zero, hugetlb_infinity; @@ -99,6 +101,7 @@ static inline unsigned long hugetlb_total_pages(void) do { } while (0) #define alloc_huge_page() ({ NULL; }) #define free_huge_page(p) ({ (void)(p); BUG(); }) +#define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) #ifndef HPAGE_MASK #define HPAGE_MASK 0 /* Keep the compiler happy */ @@ -155,24 +158,11 @@ static inline void set_file_hugepages(struct file *file) { file->f_op = &hugetlbfs_file_operations; } - -static inline int valid_hugetlb_file_off(struct vm_area_struct *vma, - unsigned long address) -{ - struct inode *inode = vma->vm_file->f_dentry->d_inode; - loff_t file_off = address - vma->vm_start; - - file_off += (vma->vm_pgoff << PAGE_SHIFT); - - return (file_off < inode->i_size); -} - #else /* !CONFIG_HUGETLBFS */ #define is_file_hugepages(file) 0 #define set_file_hugepages(file) BUG() #define hugetlb_zero_setup(size) ERR_PTR(-ENOSYS) -#define valid_hugetlb_file_off(vma, address) 0 #endif /* !CONFIG_HUGETLBFS */ -- cgit v1.2.3