diff --git a/kernel/drivers/net/ixgbe/ixgbe.h b/kernel/drivers/net/ixgbe/ixgbe.h index 467c1bdb..cf5745c1 100644 --- a/kernel/drivers/net/ixgbe/ixgbe.h +++ b/kernel/drivers/net/ixgbe/ixgbe.h @@ -75,12 +75,12 @@ __func__ , ## args))) /* TX/RX descriptor defines */ -#define IXGBE_DEFAULT_TXD 512 +#define IXGBE_DEFAULT_TXD 4096 #define IXGBE_DEFAULT_TX_WORK 256 #define IXGBE_MAX_TXD 4096 #define IXGBE_MIN_TXD 64 -#define IXGBE_DEFAULT_RXD 512 +#define IXGBE_DEFAULT_RXD 4096 #define IXGBE_DEFAULT_RX_WORK 256 #define IXGBE_MAX_RXD 4096 #define IXGBE_MIN_RXD 64 diff --git a/kernel/drivers/net/ixgbe/ixgbe_main.c b/kernel/drivers/net/ixgbe/ixgbe_main.c index 7243238f..3f7ddfcd 100644 --- a/kernel/drivers/net/ixgbe/ixgbe_main.c +++ b/kernel/drivers/net/ixgbe/ixgbe_main.c @@ -3435,11 +3435,13 @@ static void ixgbe_setup_mrqc(struct ixgbe_adapter *adapter) 0x6A3E67EA, 0x14364D17, 0x3BED200D}; u32 mrqc = 0, reta = 0; u32 rxcsum; - int i, j; + int i, j, offset, rss_limit; int maxq = adapter->ring_feature[RING_F_RSS].indices; int mask; u8 tcs = netdev_get_num_tc(adapter->netdev); + pr_info("maxq=%d, num_tx_queues=%d, tcs=%u\n", + maxq, adapter->num_tx_queues, tcs); #ifdef HAVE_MQPRIO if (tcs) maxq = min(maxq, adapter->num_tx_queues / tcs); @@ -3449,15 +3451,19 @@ static void ixgbe_setup_mrqc(struct ixgbe_adapter *adapter) for (i = 0; i < 10; i++) IXGBE_WRITE_REG(hw, IXGBE_RSSRK(i), seed[i]); + rss_limit = min_t(int, adapter->num_tx_queues, + IXGBE_MAX_RSS_INDICES); + offset = (rss_limit > maxq) ? (rss_limit - maxq) : 0; /* Fill out redirection table */ - for (i = 0, j = 0; i < 128; i++, j++) { - if (j == maxq) - j = 0; + for (i = 0, j = offset; i < 128; i++, j++) { + if (j == (maxq + offset)) + j = offset; /* reta = 4-byte sliding window of * 0x00..(indices-1)(indices-1)00..etc.
*/ reta = (reta << 8) | (j * 0x11); - if ((i & 3) == 3) + if ((i & 3) == 3) { IXGBE_WRITE_REG(hw, IXGBE_RETA(i >> 2), reta); + } } /* Disable indicating checksum in descriptor, enables RSS hash */ @@ -8725,7 +8731,8 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb) } #endif /* IXGBE_FCOE */ - if (adapter->flags & IXGBE_FLAG_FDIR_HASH_CAPABLE) { + if (adapter->flags & (IXGBE_FLAG_FDIR_HASH_CAPABLE | + IXGBE_FLAG_FDIR_PERFECT_CAPABLE)) { while (unlikely(txq >= dev->real_num_tx_queues)) txq -= dev->real_num_tx_queues; return txq; diff --git a/kernel/drivers/net/ixgbe/ixgbe_param.c b/kernel/drivers/net/ixgbe/ixgbe_param.c index 78d7dcf4..323e5327 100644 --- a/kernel/drivers/net/ixgbe/ixgbe_param.c +++ b/kernel/drivers/net/ixgbe/ixgbe_param.c @@ -1022,9 +1022,15 @@ void __devinit ixgbe_check_options(struct ixgbe_adapter *adapter) /* limit the number of queues for FDIR using RSS param */ if (feature[RING_F_RSS].indices && num_RSS > bd && RSS[bd]) + { feature[RING_F_FDIR].indices = - feature[RING_F_RSS].indices; + min_t(int, num_online_cpus(), + feature[RING_F_FDIR].indices); + DPRINTK(PROBE, INFO, "FDIR.indices:%d, RSS.indices:%d\n", + feature[RING_F_FDIR].indices, + feature[RING_F_RSS].indices); + } no_flow_director: /* empty code line with semi-colon */ ; } diff --git a/kernel/include/linux/ip_vs.h b/kernel/include/linux/ip_vs.h index 17f51e7d..644b1dcc 100644 --- a/kernel/include/linux/ip_vs.h +++ b/kernel/include/linux/ip_vs.h @@ -20,6 +20,7 @@ #define IP_VS_SVC_F_PERSISTENT 0x0001 /* persistent port */ #define IP_VS_SVC_F_HASHED 0x0002 /* hashed entry */ #define IP_VS_SVC_F_ONEPACKET 0x0004 /* one-packet scheduling */ +#define IP_VS_SVC_F_SYNPROXY 0x8000 /* synproxy flag */ /* * Destination Server Flags */ @@ -352,6 +353,8 @@ enum { IPVS_SVC_ATTR_NETMASK, /* persistent netmask */ IPVS_SVC_ATTR_STATS, /* nested attribute for service stats */ + IPVS_SVC_ATTR_EST_TIMEOUT, /* establish state timeout */ + __IPVS_SVC_ATTR_MAX, }; diff --git a/kernel/include/net/ip_vs.h b/kernel/include/net/ip_vs.h index 3b6ef202..76d595ef 100644 --- a/kernel/include/net/ip_vs.h +++ b/kernel/include/net/ip_vs.h @@ -287,7 +287,7 @@ struct ip_vs_protocol { struct ip_vs_protocol * pp, struct ip_vs_conn * cp); - int (*fnat_in_handler) (struct sk_buff ** skb_p, + int (*fnat_in_handler) (struct sk_buff * skb, struct ip_vs_protocol * pp, struct ip_vs_conn * cp); @@ -381,6 +381,8 @@ struct ip_vs_conn { * state transition triggered * synchronization */ + u16 cpuid; + /* Control members */ struct ip_vs_conn *control; /* Master control connection */ atomic_t n_control; /* Number of controlled ones */ @@ -424,8 +426,19 @@ struct ip_vs_conn { /* L2 direct response xmit */ struct net_device *indev; - unsigned char src_hwaddr[MAX_ADDR_LEN]; - unsigned char dst_hwaddr[MAX_ADDR_LEN]; + unsigned char src_hwaddr[ETH_ALEN]; + unsigned char dst_hwaddr[ETH_ALEN]; + struct net_device *dev_inside; + unsigned char src_hwaddr_inside[ETH_ALEN]; + unsigned char dst_hwaddr_inside[ETH_ALEN]; + + int est_timeout; /* Every VS now has its own private + * establish state timeout, as required + * by users.
+ * Each conn inherits this value from + * its VS and sets it on the conn timer + * when the state changes to established. + */ }; /* @@ -449,6 +462,7 @@ struct ip_vs_service_user_kern { unsigned flags; /* virtual service flags */ unsigned timeout; /* persistent timeout in sec */ u32 netmask; /* persistent netmask */ + unsigned est_timeout; /* vs private establish state timeout */ }; struct ip_vs_dest_user_kern { @@ -477,7 +491,6 @@ struct ip_vs_service { struct list_head s_list; /* for normal service table */ struct list_head f_list; /* for fwmark-based service table */ atomic_t refcnt; /* reference counter */ - atomic_t usecnt; /* use counter */ u16 af; /* address family */ __u16 protocol; /* which protocol (TCP/UDP) */ @@ -491,6 +504,7 @@ struct ip_vs_service { /* for realservers list */ struct list_head destinations; /* real server d-linked list */ __u32 num_dests; /* number of servers */ + long weight; /* sum of servers weight */ /* for local ip address list, now only used in FULL NAT model */ struct list_head laddr_list; /* local ip address list */ @@ -498,13 +512,18 @@ struct ip_vs_service { __u32 num_laddrs; /* number of local ip address */ struct list_head *curr_laddr; /* laddr data list head */ - struct ip_vs_stats *stats; /* Use per-cpu statistics for the service */ + struct ip_vs_stats stats; /* statistics for the service */ struct ip_vs_app *inc; /* bind conns to this app inc */ /* for scheduling */ struct ip_vs_scheduler *scheduler; /* bound scheduler object */ rwlock_t sched_lock; /* lock sched_data */ void *sched_data; /* scheduler application data */ + + /* VS private establish state timeout; inherited by every connection */ + unsigned est_timeout; + + struct ip_vs_service *svc0; /* the svc of cpu0 */ }; /* @@ -523,7 +542,7 @@ struct ip_vs_dest { atomic_t weight; /* server weight */ atomic_t refcnt; /* reference counter */ - struct ip_vs_stats *stats; /* Use per-cpu statistics for destination server */ + struct ip_vs_stats stats; /* statistics for destination server */ /* connection counters and thresholds */ atomic_t activeconns; /* active connections */ @@ -551,6 +570,7 @@ struct ip_vs_dest { struct ip_vs_laddr { struct list_head n_list; /* for the local address in the service */ u16 af; /* address family */ + u16 cpuid; /* record the cpu this laddr is assigned to */ union nf_inet_addr addr; /* ip address */ atomic64_t port; /* port counts */ atomic_t refcnt; /* reference count */ @@ -710,6 +730,9 @@ enum { FAST_XMIT_NO_MAC, FAST_XMIT_SYNPROXY_SAVE, FAST_XMIT_DEV_LOST, + FAST_XMIT_REJECT_INSIDE, + FAST_XMIT_PASS_INSIDE, + FAST_XMIT_SYNPROXY_SAVE_INSIDE, RST_IN_SYN_SENT, RST_OUT_SYN_SENT, RST_IN_ESTABLISHED, @@ -718,6 +741,8 @@ enum { LRO_REJECT, XMIT_UNEXPECTED_MTU, CONN_SCHED_UNREACH, + SYNPROXY_NO_DEST, + CONN_EXCEEDED, IP_VS_EXT_STAT_LAST }; @@ -767,7 +792,8 @@ extern void ip_vs_init_hash_table(struct list_head *table, int rows); #define CONFIG_IP_VS_TAB_BITS 22 #endif -#define IP_VS_CONN_TAB_BITS CONFIG_IP_VS_TAB_BITS +//#define IP_VS_CONN_TAB_BITS CONFIG_IP_VS_TAB_BITS +#define IP_VS_CONN_TAB_BITS 20 #define IP_VS_CONN_TAB_SIZE (1 << IP_VS_CONN_TAB_BITS) #define IP_VS_CONN_TAB_MASK (IP_VS_CONN_TAB_SIZE - 1) @@ -954,6 +980,12 @@ extern int sysctl_ip_vs_tcp_drop_entry; extern int sysctl_ip_vs_udp_drop_entry; extern int sysctl_ip_vs_conn_expire_tcp_rst; extern int sysctl_ip_vs_fast_xmit; +extern int sysctl_ip_vs_fast_xmit_inside; +extern int sysctl_ip_vs_csum_offload; +extern int sysctl_ip_vs_reserve_core; +extern int
sysctl_ip_vs_conn_max_num; + +DECLARE_PER_CPU(spinlock_t, ip_vs_svc_lock); extern struct ip_vs_service *ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, @@ -964,7 +996,8 @@ extern struct ip_vs_service *ip_vs_lookup_vip(int af, __u16 protocol, static inline void ip_vs_service_put(struct ip_vs_service *svc) { - atomic_dec(&svc->usecnt); + if (likely(svc != NULL)) + spin_unlock(&__get_cpu_var(ip_vs_svc_lock)); } extern struct ip_vs_dest *ip_vs_lookup_real_service(int af, __u16 protocol, diff --git a/kernel/include/net/ip_vs_synproxy.h b/kernel/include/net/ip_vs_synproxy.h index 82b580d0..d07181f3 100644 --- a/kernel/include/net/ip_vs_synproxy.h +++ b/kernel/include/net/ip_vs_synproxy.h @@ -15,8 +15,10 @@ * [19-16] snd_wscale * [15-0] MSSIND */ -#define IP_VS_SYNPROXY_MSS_BITS 16 -#define IP_VS_SYNPROXY_MSS_MASK (((__u32)1 << IP_VS_SYNPROXY_MSS_BITS) - 1) +#define IP_VS_SYNPROXY_OTHER_BITS 12 +#define IP_VS_SYNPROXY_OTHER_MASK (((__u32)1 << IP_VS_SYNPROXY_OTHER_BITS)-1) +#define IP_VS_SYNPROXY_MSS_BITS 12 +#define IP_VS_SYNPROXY_MSS_MASK ((__u32)0xf << IP_VS_SYNPROXY_MSS_BITS) #define IP_VS_SYNPROXY_SACKOK_BIT 21 #define IP_VS_SYNPROXY_SACKOK_MASK ((__u32)1 << IP_VS_SYNPROXY_SACKOK_BIT) @@ -38,20 +40,8 @@ struct ip_vs_synproxy_opt { u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ }; -/* - * For syncookie compute and check - */ -extern __u32 ip_vs_synproxy_cookie_v4_init_sequence(struct sk_buff *skb, - struct ip_vs_synproxy_opt - *opts); -extern int ip_vs_synproxy_v4_cookie_check(struct sk_buff *skb, __u32 cookie, - struct ip_vs_synproxy_opt *opt); - -extern __u32 ip_vs_synproxy_cookie_v6_init_sequence(struct sk_buff *skb, - struct ip_vs_synproxy_opt - *opts); -extern int ip_vs_synproxy_v6_cookie_check(struct sk_buff *skb, __u32 cookie, - struct ip_vs_synproxy_opt *opt); +/* syncookie init */ +extern int ip_vs_net_secret_init(void); /* * Syn-proxy step 1 logic: receive client's Syn. 
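A note on what the mask rework above buys: MSSIND now occupies only bits [15-12] of the cookie data word, so the freed low 12 bits (IP_VS_SYNPROXY_OTHER_MASK) must decode to zero before a cookie is accepted, which gives the check functions in the hunks below a cheap validity test against forged or corrupted ACK cookies. A minimal userspace sketch of the encode/decode round-trip under the new layout; encode() and decode() are illustrative stand-ins, not the kernel's function names:

/* Sketch: cookie data layout per the defines above:
 * [21] SACKOK  [20] TimeStampOK  [19-16] snd_wscale  [15-12] MSSIND  [11-0] 0
 */
#include <assert.h>
#include <stdint.h>

#define OTHER_MASK	(((uint32_t)1 << 12) - 1)	/* bits 11-0, must stay 0 */
#define MSS_SHIFT	12
#define MSS_MASK	((uint32_t)0xf << MSS_SHIFT)	/* bits 15-12 */
#define WSCALE_SHIFT	16
#define TSOK_BIT	20
#define SACKOK_BIT	21

static uint32_t encode(uint32_t mssind, uint32_t wscale, int tsok, int sackok)
{
	return ((mssind & 0xf) << MSS_SHIFT) |
	       ((wscale & 0xf) << WSCALE_SHIFT) |
	       ((uint32_t)tsok << TSOK_BIT) |
	       ((uint32_t)sackok << SACKOK_BIT);
}

static int decode(uint32_t data, uint32_t *mssind)
{
	if (data & OTHER_MASK)	/* unused bits set: reject the cookie */
		return -1;
	*mssind = (data & MSS_MASK) >> MSS_SHIFT;
	return 0;
}

int main(void)
{
	uint32_t m;
	assert(decode(encode(5, 7, 1, 1), &m) == 0 && m == 5);
	assert(decode(encode(5, 7, 1, 1) | 0x1, &m) == -1);
	return 0;
}

With only 4 bits of MSSIND there is room for at most 16 table entries, which is why msstab in both syncookies.c files below can grow by two entries (1280 and 1452) and still fit.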
diff --git a/kernel/net/ipv4/syncookies.c b/kernel/net/ipv4/syncookies.c index a84feb0b..d36c8b76 100644 --- a/kernel/net/ipv4/syncookies.c +++ b/kernel/net/ipv4/syncookies.c @@ -149,7 +149,9 @@ static __u16 const msstab[] = { 512 - 1, 536 - 1, 1024 - 1, + 1280 - 1, 1440 - 1, + 1452 - 1, 1460 - 1, 4312 - 1, (__u16)-1 @@ -374,7 +376,7 @@ out: return ret; * [21] SACKOK * [20] TimeStampOK * [19-16] snd_wscale - * [15-0] MSSIND + * [15-12] MSSIND */ __u32 ip_vs_synproxy_cookie_v4_init_sequence(struct sk_buff *skb, struct ip_vs_synproxy_opt *opts) @@ -390,7 +392,7 @@ __u32 ip_vs_synproxy_cookie_v4_init_sequence(struct sk_buff *skb, ; opts->mss_clamp = msstab[mssind] + 1; - data = mssind & IP_VS_SYNPROXY_MSS_MASK; + data = ((mssind & 0x0f) << IP_VS_SYNPROXY_MSS_BITS); data |= opts->sack_ok << IP_VS_SYNPROXY_SACKOK_BIT; data |= opts->tstamp_ok << IP_VS_SYNPROXY_TSOK_BIT; data |= ((opts->snd_wscale & 0x0f) << IP_VS_SYNPROXY_SND_WSCALE_BITS); @@ -424,11 +426,10 @@ int ip_vs_synproxy_v4_cookie_check(struct sk_buff * skb, __u32 cookie, if(res == (__u32)-1) /* count is invalid, jiffies' >> jiffies */ goto out; - mssind = res & IP_VS_SYNPROXY_MSS_MASK; + mssind = (res & IP_VS_SYNPROXY_MSS_MASK) >> IP_VS_SYNPROXY_MSS_BITS; memset(opt, 0, sizeof(struct ip_vs_synproxy_opt)); - - if (mssind < NUM_MSS) { + if ((mssind < NUM_MSS) && ((res & IP_VS_SYNPROXY_OTHER_MASK) == 0)) { opt->mss_clamp = msstab[mssind] + 1; opt->sack_ok = (res & IP_VS_SYNPROXY_SACKOK_MASK) >> IP_VS_SYNPROXY_SACKOK_BIT; diff --git a/kernel/net/ipv6/syncookies.c b/kernel/net/ipv6/syncookies.c index 8051f061..03f34656 100644 --- a/kernel/net/ipv6/syncookies.c +++ b/kernel/net/ipv6/syncookies.c @@ -43,7 +43,9 @@ static __u16 const msstab[] = { 512 - 1, 536 - 1, 1024 - 1, + 1280 - 1, 1440 - 1, + 1452 - 1, 1460 - 1, 4312 - 1, (__u16)-1 @@ -289,7 +291,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) * [21] SACKOK * [20] TimeStampOK * [19-16] snd_wscale - * [15-0] MSSIND + * [15-12] MSSIND */ __u32 ip_vs_synproxy_cookie_v6_init_sequence(struct sk_buff *skb, struct ip_vs_synproxy_opt *opts) @@ -305,7 +307,7 @@ __u32 ip_vs_synproxy_cookie_v6_init_sequence(struct sk_buff *skb, ; opts->mss_clamp = msstab[mssind] + 1; - data = mssind & IP_VS_SYNPROXY_MSS_MASK; + data = ((mssind & 0x0f) << IP_VS_SYNPROXY_MSS_BITS); data |= opts->sack_ok << IP_VS_SYNPROXY_SACKOK_BIT; data |= opts->tstamp_ok << IP_VS_SYNPROXY_TSOK_BIT; data |= ((opts->snd_wscale & 0x0f) << IP_VS_SYNPROXY_SND_WSCALE_BITS); @@ -338,11 +340,11 @@ int ip_vs_synproxy_v6_cookie_check(struct sk_buff * skb, __u32 cookie, if(res == (__u32)-1) /* count is invalid, jiffies' >> jiffies */ goto out; - mssind = res & IP_VS_SYNPROXY_MSS_MASK; + mssind = (res & IP_VS_SYNPROXY_MSS_MASK) >> IP_VS_SYNPROXY_MSS_BITS; memset(opt, 0, sizeof(struct ip_vs_synproxy_opt)); - if (mssind < NUM_MSS) { + if ((mssind < NUM_MSS) && ((res & IP_VS_SYNPROXY_OTHER_MASK) == 0)) { opt->mss_clamp = msstab[mssind] + 1; opt->sack_ok = (res & IP_VS_SYNPROXY_SACKOK_MASK) >> IP_VS_SYNPROXY_SACKOK_BIT; diff --git a/kernel/net/netfilter/ipvs/ip_vs_conn.c b/kernel/net/netfilter/ipvs/ip_vs_conn.c index b5370ca3..ca98e326 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_conn.c +++ b/kernel/net/netfilter/ipvs/ip_vs_conn.c @@ -39,19 +39,25 @@ #include #include + /* * Connection hash table: for input and output packets lookups of IPVS */ -static struct list_head *ip_vs_conn_tab; +DEFINE_PER_CPU(struct list_head *, ip_vs_conn_tab_percpu); +DEFINE_PER_CPU(spinlock_t, ip_vs_conn_tab_lock); + +/* the limit of conns */ 
+int sysctl_ip_vs_conn_max_num = 0; /* SLAB cache for IPVS connections */ static struct kmem_cache *ip_vs_conn_cachep __read_mostly; /* counter for current IPVS connections */ -static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); +DEFINE_PER_CPU(int, ip_vs_conn_cnt_per) = {0}; /* counter for no client port connections */ static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); +//DEFINE_PER_CPU(int, ip_vs_conn_no_cport_cnt_per); /* random value for IPVS connection hash */ static unsigned int ip_vs_conn_rnd; @@ -189,10 +195,12 @@ static inline int __ip_vs_conn_hash(struct ip_vs_conn *cp, unsigned ihash, int ret; if (!(cp->flags & IP_VS_CONN_F_HASHED)) { + struct list_head *this_cpu_conn_tab = + per_cpu(ip_vs_conn_tab_percpu, cp->cpuid); ci_idx = cp->in_idx; co_idx = cp->out_idx; - list_add(&ci_idx->c_list, &ip_vs_conn_tab[ihash]); - list_add(&co_idx->c_list, &ip_vs_conn_tab[ohash]); + list_add(&ci_idx->c_list, &this_cpu_conn_tab[ihash]); + list_add(&co_idx->c_list, &this_cpu_conn_tab[ohash]); cp->flags |= IP_VS_CONN_F_HASHED; atomic_inc(&cp->refcnt); ret = 1; @@ -228,13 +236,13 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) cp->lport); /* locked */ - ip_vs_conn_lock2(ihash, ohash); + spin_lock(&per_cpu(ip_vs_conn_tab_lock, cp->cpuid)); /* hashed */ ret = __ip_vs_conn_hash(cp, ihash, ohash); /* unlocked */ - ip_vs_conn_unlock2(ihash, ohash); + spin_unlock(&per_cpu(ip_vs_conn_tab_lock, cp->cpuid)); return ret; } @@ -260,7 +268,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) cp->lport); /* locked */ - ip_vs_conn_lock2(ihash, ohash); + spin_lock(&per_cpu(ip_vs_conn_tab_lock, cp->cpuid)); /* unhashed */ if ((cp->flags & IP_VS_CONN_F_HASHED) @@ -277,7 +285,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) } /* unlocked */ - ip_vs_conn_unlock2(ihash, ohash); + spin_unlock(&per_cpu(ip_vs_conn_tab_lock, cp->cpuid)); return ret; } @@ -294,12 +302,14 @@ static inline struct ip_vs_conn *__ip_vs_conn_get unsigned hash; struct ip_vs_conn *cp; struct ip_vs_conn_idx *cidx; + struct list_head *this_cpu_conn_tab; hash = ip_vs_conn_hashkey(af, s_addr, s_port, d_addr, d_port); + this_cpu_conn_tab = __get_cpu_var(ip_vs_conn_tab_percpu); - ct_read_lock(hash); + spin_lock(&__get_cpu_var(ip_vs_conn_tab_lock)); - list_for_each_entry(cidx, &ip_vs_conn_tab[hash], c_list) { + list_for_each_entry(cidx, &this_cpu_conn_tab[hash], c_list) { cp = cidx->cp; if (cidx->af == af && ip_vs_addr_equal(af, s_addr, &cidx->s_addr) && @@ -310,12 +320,12 @@ static inline struct ip_vs_conn *__ip_vs_conn_get /* HIT */ atomic_inc(&cp->refcnt); *res_dir = cidx->flags & IP_VS_CIDX_F_DIR_MASK; - ct_read_unlock(hash); + spin_unlock(&__get_cpu_var(ip_vs_conn_tab_lock)); return cp; } } - ct_read_unlock(hash); + spin_unlock(&__get_cpu_var(ip_vs_conn_tab_lock)); return NULL; } @@ -347,12 +357,14 @@ struct ip_vs_conn *ip_vs_ct_in_get unsigned hash; struct ip_vs_conn_idx *cidx; struct ip_vs_conn *cp; + struct list_head *this_cpu_conn_tab; hash = ip_vs_conn_hashkey(af, s_addr, s_port, d_addr, d_port); + this_cpu_conn_tab = __get_cpu_var(ip_vs_conn_tab_percpu); - ct_read_lock(hash); + spin_lock(&__get_cpu_var(ip_vs_conn_tab_lock)); - list_for_each_entry(cidx, &ip_vs_conn_tab[hash], c_list) { + list_for_each_entry(cidx, &this_cpu_conn_tab[hash], c_list) { cp = cidx->cp; if (cidx->af == af && ip_vs_addr_equal(af, s_addr, &cidx->s_addr) && @@ -371,7 +383,7 @@ struct ip_vs_conn *ip_vs_ct_in_get cp = NULL; out: - ct_read_unlock(hash); + spin_unlock(&__get_cpu_var(ip_vs_conn_tab_lock)); IP_VS_DBG_BUF(9, "template 
lookup %s %s:%d->%s:%d %s\n", ip_vs_proto_name(protocol), @@ -697,6 +709,8 @@ static inline int ip_vs_hbind_laddr(struct ip_vs_conn *cp) /* choose a local address by round-robin */ local = ip_vs_get_laddr(svc); if (local != NULL) { + int cpu = cp->cpuid; + /*OUTside2INside: hashed by client address and port, virtual address and port */ ihash = ip_vs_conn_hashkey(cp->af, &cp->caddr, cp->cport, @@ -721,13 +735,14 @@ static inline int ip_vs_hbind_laddr(struct ip_vs_conn *cp) ohash = ip_vs_conn_hashkey(cp->af, &cp->daddr, cp->dport, &cp->laddr, cp->lport); - /* lock the conntab bucket */ - ip_vs_conn_lock2(ihash, ohash); + /* lock the conntab of the current cpu */ + spin_lock(&per_cpu(ip_vs_conn_tab_lock, cpu)); + /* * check local address and port is valid by lookup connection table */ - list_for_each_entry(cidx, &ip_vs_conn_tab[ohash], - c_list) { + list_for_each_entry(cidx, &per_cpu(ip_vs_conn_tab_percpu, + cpu)[ohash], c_list) { if (cidx->af == cp->af && ip_vs_addr_equal(cp->af, &cp->daddr, &cidx->s_addr) @@ -746,12 +761,12 @@ static inline int ip_vs_hbind_laddr(struct ip_vs_conn *cp) cp->local = local; /* hashed */ __ip_vs_conn_hash(cp, ihash, ohash); - ip_vs_conn_unlock2(ihash, ohash); + spin_unlock(&per_cpu(ip_vs_conn_tab_lock, cpu)); atomic_inc(&local->conn_counts); ret = 1; goto out; } - ip_vs_conn_unlock2(ihash, ohash); + spin_unlock(&per_cpu(ip_vs_conn_tab_lock, cpu)); } if (ret == 0) { ip_vs_laddr_put(local); @@ -867,7 +882,8 @@ static void ip_vs_conn_del(struct ip_vs_conn *cp) ip_vs_unbind_laddr(cp); if (cp->flags & IP_VS_CONN_F_NO_CPORT) atomic_dec(&ip_vs_conn_no_cport_cnt); - atomic_dec(&ip_vs_conn_count); + /* __get_cpu_var(ip_vs_conn_cnt_per)-- */ + per_cpu(ip_vs_conn_cnt_per, cp->cpuid)--; kmem_cache_free(ip_vs_conn_cachep, cp); cp = NULL; @@ -875,6 +891,7 @@ static void ip_vs_conn_del(struct ip_vs_conn *cp) static void ip_vs_conn_expire(unsigned long data) { + int cpu; struct ip_vs_conn *cp = (struct ip_vs_conn *)data; struct sk_buff *tmp_skb = NULL; struct ip_vs_protocol *pp = ip_vs_proto_get(cp->protocol); @@ -892,6 +909,11 @@ static void ip_vs_conn_expire(unsigned long data) * hey, I'm using it */ atomic_inc(&cp->refcnt); + cpu = cp->cpuid; + if (unlikely(cpu != smp_processor_id())) { + IP_VS_ERR_RL("timer migrated from cpu%d to cpu%d\n", + cpu, smp_processor_id()); + } /* * Retransmit syn packet to rs.
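Taken together, the hunks above set the locking model for the rest of this file: a conn records its owning CPU in cp->cpuid when it is created, and every later hash, unhash, and expire operation takes only that CPU's table and spinlock, while the connection counter becomes a plain per-cpu int that needs no atomic ops. A condensed sketch of the pattern with simplified, illustrative names (the real code uses ip_vs_conn_tab_percpu/ip_vs_conn_tab_lock and links conns through ip_vs_conn_idx entries rather than directly):

/* Sketch only: per-cpu conn table with one coarse lock per cpu. */
DEFINE_PER_CPU(struct list_head *, conn_tab);	/* hash buckets, one array per cpu */
DEFINE_PER_CPU(spinlock_t, conn_tab_lock);	/* per-cpu lock replaces the lock array */
DEFINE_PER_CPU(int, conn_cnt);			/* plain int replaces the global atomic */

static void conn_hash(struct ip_vs_conn *cp, unsigned int hash)
{
	int cpu = cp->cpuid;	/* always the cpu that created the conn */

	spin_lock(&per_cpu(conn_tab_lock, cpu));
	list_add(&cp->c_list, &per_cpu(conn_tab, cpu)[hash]);
	per_cpu(conn_cnt, cpu)++;
	spin_unlock(&per_cpu(conn_tab_lock, cpu));
}

Lookups on the packet path take only the local CPU's lock (via __get_cpu_var), which is safe as long as RSS and the laddr-to-cpu assignment keep steering a flow's packets to the CPU that owns its conn; the cp->cpuid check added to ip_vs_conn_expire above flags timers that fire on a different CPU, since the expire path still indexes the tables by cp->cpuid rather than by the running CPU.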
@@ -945,7 +967,7 @@ static void ip_vs_conn_expire(unsigned long data) ip_vs_unbind_laddr(cp); if (cp->flags & IP_VS_CONN_F_NO_CPORT) atomic_dec(&ip_vs_conn_no_cport_cnt); - atomic_dec(&ip_vs_conn_count); + per_cpu(ip_vs_conn_cnt_per, cpu)--; /* free stored ack packet */ while ((tmp_skb = skb_dequeue(&cp->ack_skb)) != NULL) { @@ -962,6 +984,9 @@ static void ip_vs_conn_expire(unsigned long data) if (cp->indev != NULL) dev_put(cp->indev); + if (cp->dev_inside != NULL) + dev_put(cp->dev_inside); + kmem_cache_free(ip_vs_conn_cachep, cp); return; } @@ -997,6 +1022,13 @@ struct ip_vs_conn *ip_vs_conn_new(int af, int proto, struct ip_vs_conn_idx *ci_idx, *co_idx; struct tcphdr _tcph, *th; + if ((sysctl_ip_vs_conn_max_num != 0) && ((num_online_cpus() * + __get_cpu_var(ip_vs_conn_cnt_per)) >= + sysctl_ip_vs_conn_max_num)) { + IP_VS_INC_ESTATS(ip_vs_esmib, CONN_EXCEEDED); + return NULL; + } + cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); if (cp == NULL) { IP_VS_ERR_RL("%s(): no memory\n", __func__); @@ -1058,7 +1090,7 @@ struct ip_vs_conn *ip_vs_conn_new(int af, int proto, atomic_set(&cp->n_control, 0); atomic_set(&cp->in_pkts, 0); - atomic_inc(&ip_vs_conn_count); + __get_cpu_var(ip_vs_conn_cnt_per)++; if (flags & IP_VS_CONN_F_NO_CPORT) atomic_inc(&ip_vs_conn_no_cport_cnt); @@ -1069,6 +1101,9 @@ struct ip_vs_conn *ip_vs_conn_new(int af, int proto, cp->state = 0; cp->timeout = 3 * HZ; + /* Save the current CPU ID */ + cp->cpuid = smp_processor_id(); + /* Bind its packet transmitter */ #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) @@ -1134,8 +1169,9 @@ struct ip_vs_conn *ip_vs_conn_new(int af, int proto, /* * /proc/net/ip_vs_conn entries + * dropped in v3 for performance */ -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_FS_NO_EXIST static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) { @@ -1332,148 +1368,106 @@ static const struct file_operations ip_vs_conn_sync_fops = { #endif /* - * Randomly drop connection entries before running out of memory + * Flush all the connection entries in the ip_vs_conn_tab */ -static inline int todrop_entry(struct ip_vs_conn *cp) -{ - /* - * The drop rate array needs tuning for real environments. - * Called from timer bh only => no locking - */ - static const char todrop_rate[9] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 }; - static char todrop_counter[9] = { 0 }; - int i; - - /* if the conn entry hasn't lasted for 60 seconds, don't drop it. This will leave enough time for normal connection to get through. */ - if (time_before(cp->timeout + jiffies, cp->timer.expires + 60 * HZ)) - return 0; - - /* Don't drop the entry if its number of incoming packets is not located in [0, 8] */ - i = atomic_read(&cp->in_pkts); - if (i > 8 || i < 0) - return 0; - - if (!todrop_rate[i]) - return 0; - if (--todrop_counter[i] > 0) - return 0; - - todrop_counter[i] = todrop_rate[i]; - return 1; -} - -/* Called from keventd and must protect itself from softirqs */ -void ip_vs_random_dropentry(void) +static void ip_vs_conn_flush(void) { int idx; + int cpu; struct ip_vs_conn *cp; struct ip_vs_conn_idx *cidx; + struct list_head *ip_vs_conn_tab_per; - /* - * Randomly scan 1/32 of the whole table every second - */ - for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE >> 5); idx++) { - unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; - - /* - * Lock is actually needed in this loop.
- */ - ct_write_lock_bh(hash); - - list_for_each_entry(cidx, &ip_vs_conn_tab[hash], c_list) { - cp = cidx->cp; - if (cp->flags & IP_VS_CONN_F_TEMPLATE) - /* connection template */ - continue; - - if (cp->protocol == IPPROTO_TCP) { - switch (cp->state) { - case IP_VS_TCP_S_SYN_RECV: - case IP_VS_TCP_S_SYNACK: - break; - - case IP_VS_TCP_S_ESTABLISHED: - if (todrop_entry(cp)) - break; - continue; - - default: - continue; + for_each_possible_cpu(cpu) { + ip_vs_conn_tab_per = per_cpu(ip_vs_conn_tab_percpu, cpu); + flush_again: + for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + /* + * Lock is actually needed in this loop. + */ + spin_lock_bh(&per_cpu(ip_vs_conn_tab_lock, cpu)); + + list_for_each_entry(cidx, &ip_vs_conn_tab_per[idx], + c_list) { + IP_VS_DBG(4, "del connection\n"); + cp = cidx->cp; + ip_vs_conn_expire_now(cp); + if (cp->control) { + IP_VS_DBG(4, "del conn template\n"); + ip_vs_conn_expire_now(cp->control); } - } else { - if (!todrop_entry(cp)) - continue; - } - - IP_VS_DBG(4, "del connection\n"); - ip_vs_conn_expire_now(cp); - if (cp->control) { - IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); } + spin_unlock_bh(&per_cpu(ip_vs_conn_tab_lock, cpu)); + } /* the counter may not be 0, because some conn entries + * may still be run by the slow timer handler + * or are unhashed but still referred to */ + if (per_cpu(ip_vs_conn_cnt_per, cpu) != 0) { + schedule(); + goto flush_again; } - ct_write_unlock_bh(hash); } } -/* - * Flush all the connection entries in the ip_vs_conn_tab - */ -static void ip_vs_conn_flush(void) +static void ip_vs_conn_max_init(void) { - int idx; - struct ip_vs_conn *cp; - struct ip_vs_conn_idx *cidx; + int conn_size; + int conns_perpage; + int conn_tab_limit; - flush_again: - for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { - /* - * Lock is actually needed in this loop.
- */ - ct_write_lock_bh(idx); + conn_size = sizeof(struct ip_vs_conn) + + 2 * sizeof(struct ip_vs_conn_idx); + conns_perpage = PAGE_SIZE / ALIGN(conn_size, cache_line_size()); - list_for_each_entry(cidx, &ip_vs_conn_tab[idx], c_list) { - IP_VS_DBG(4, "del connection\n"); - cp = cidx->cp; - ip_vs_conn_expire_now(cp); - if (cp->control) { - IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); - } - } - ct_write_unlock_bh(idx); - } + pr_info("conn size: %d, conns per page: %d, num_physpages: %lu\n", + conn_size, conns_perpage, num_physpages); - /* the counter may be not NULL, because maybe some conn entries are run by slow timer handler or unhashed but still referred */ - if (atomic_read(&ip_vs_conn_count) != 0) { - schedule(); - goto flush_again; - } + /* half of memory for ip_vs_conn */ + sysctl_ip_vs_conn_max_num = (num_physpages / 2) * conns_perpage; + + /* the average length of hash chain must be less than 4 */ + conn_tab_limit = (IP_VS_CONN_TAB_SIZE << 2) * num_online_cpus(); + + if (sysctl_ip_vs_conn_max_num > conn_tab_limit) + sysctl_ip_vs_conn_max_num = conn_tab_limit; + + pr_info("maximum number of ip_vs_conn: %d\n", + sysctl_ip_vs_conn_max_num); } int __init ip_vs_conn_init(void) { int idx; + int cpu; - /* - * Allocate the connection hash table and initialize its list heads - */ - ip_vs_conn_tab = - vmalloc(IP_VS_CONN_TAB_SIZE * (sizeof(struct list_head))); - if (!ip_vs_conn_tab) - return -ENOMEM; + for_each_possible_cpu(cpu) { + void *tmp; + /* + * Allocate the connection hash table and + * initialize its list heads + */ + tmp = vmalloc(IP_VS_CONN_TAB_SIZE * sizeof(struct list_head)); + if (!tmp) { + int i; + for (i = 0; i < cpu; i++) + vfree(per_cpu(ip_vs_conn_tab_percpu, i)); + return -ENOMEM; + } + + per_cpu(ip_vs_conn_tab_percpu, cpu) = tmp; + + spin_lock_init(&per_cpu(ip_vs_conn_tab_lock, cpu)); + } /* Allocate ip_vs_conn slab cache */ ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", sizeof(struct ip_vs_conn) + 2 * sizeof(struct ip_vs_conn_idx), 0, SLAB_HWCACHE_ALIGN, NULL); if (!ip_vs_conn_cachep) { - vfree(ip_vs_conn_tab); + for_each_possible_cpu(cpu) { + vfree(per_cpu(ip_vs_conn_tab_percpu, cpu)); + } return -ENOMEM; } @@ -1485,32 +1479,46 @@ int __init ip_vs_conn_init(void) sizeof(struct ip_vs_conn) + 2 * sizeof(struct ip_vs_conn_idx)); - for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); + for_each_possible_cpu(cpu) { + struct list_head *this_cpu_conn_tab; + + this_cpu_conn_tab = per_cpu(ip_vs_conn_tab_percpu, cpu); + for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&this_cpu_conn_tab[idx]); + } } for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); } - proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); + /* disable in v3 */ + /*proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, - &ip_vs_conn_sync_fops); + &ip_vs_conn_sync_fops);*/ /* calculate the random value for connection hash */ get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); + ip_vs_conn_max_init(); + return 0; } void ip_vs_conn_cleanup(void) { + int cpu; + /* flush all the connection entries first */ ip_vs_conn_flush(); /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); - proc_net_remove(&init_net, "ip_vs_conn"); - proc_net_remove(&init_net, "ip_vs_conn_sync"); - vfree(ip_vs_conn_tab); + + /* disable in v3 */ + //proc_net_remove(&init_net, "ip_vs_conn"); +
//proc_net_remove(&init_net, "ip_vs_conn_sync"); + for_each_possible_cpu(cpu) { + vfree(per_cpu(ip_vs_conn_tab_percpu, cpu)); + } } diff --git a/kernel/net/netfilter/ipvs/ip_vs_core.c b/kernel/net/netfilter/ipvs/ip_vs_core.c index b3128798..e1b0082c 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_core.c +++ b/kernel/net/netfilter/ipvs/ip_vs_core.c @@ -861,29 +861,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, sizeof(_ports), _ports); if (pptr == NULL) return NF_ACCEPT; /* Not for me */ - if (ip_vs_lookup_real_service(af, iph.protocol, - &iph.saddr, pptr[0])) { - /* - * Notify the real server: there is no - * existing entry if it is not RST - * packet or not TCP packet. - */ - if (iph.protocol != IPPROTO_TCP - || !is_tcp_reset(skb, iph.len)) { -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - icmpv6_send(skb, - ICMPV6_DEST_UNREACH, - ICMPV6_PORT_UNREACH, - 0, skb->dev); - else -#endif - icmp_send(skb, - ICMP_DEST_UNREACH, - ICMP_PORT_UNREACH, 0); - return NF_DROP; - } - } } IP_VS_DBG_PKT(12, pp, skb, 0, "packet continues traversal as normal"); @@ -1456,6 +1433,8 @@ static int __init ip_vs_init(void) goto cleanup_conn; } + ip_vs_net_secret_init(); + pr_info("ipvs loaded.\n"); return ret; diff --git a/kernel/net/netfilter/ipvs/ip_vs_ctl.c b/kernel/net/netfilter/ipvs/ip_vs_ctl.c index c7adc827..fd5ae3dd 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_ctl.c +++ b/kernel/net/netfilter/ipvs/ip_vs_ctl.c @@ -54,11 +54,8 @@ /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ static DEFINE_MUTEX(__ip_vs_mutex); -/* lock for service table */ -static DEFINE_RWLOCK(__ip_vs_svc_lock); - -/* lock for table with the real services */ -static DEFINE_RWLOCK(__ip_vs_rs_lock); +/* percpu lock for service table */ +DEFINE_PER_CPU(spinlock_t, ip_vs_svc_lock); /* lock for state and timeout tables */ static DEFINE_RWLOCK(__ip_vs_securetcp_lock); @@ -150,6 +147,15 @@ int sysctl_ip_vs_udp_drop_entry = 1; int sysctl_ip_vs_conn_expire_tcp_rst = 1; /* L2 fast xmit, response only (to client) */ int sysctl_ip_vs_fast_xmit = 1; +/* L2 fast xmit, inside (to RS) */ +int sysctl_ip_vs_fast_xmit_inside = 1; +/* skb csum offload */ +int sysctl_ip_vs_csum_offload = 1; + +/* reserve core for the control flow */ +int sysctl_ip_vs_reserve_core = 0; +static int ip_vs_reserve_core_min = 0; +static int ip_vs_reserve_core_max = 6; #ifdef CONFIG_IP_VS_DEBUG static int sysctl_ip_vs_debug_level = 0; @@ -304,22 +310,6 @@ static void update_defense_level(void) local_bh_enable(); } -/* - * Timer for checking the defense - */ -#define DEFENSE_TIMER_PERIOD 1*HZ -static void defense_work_handler(struct work_struct *work); -static DECLARE_DELAYED_WORK(defense_work, defense_work_handler); - -static void defense_work_handler(struct work_struct *work) -{ - update_defense_level(); - if (atomic_read(&ip_vs_dropentry)) - ip_vs_random_dropentry(); - - schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); -} - int ip_vs_use_count_inc(void) { return try_module_get(THIS_MODULE); @@ -333,28 +323,19 @@ void ip_vs_use_count_dec(void) /* * Hash table: for virtual service lookups */ -#define IP_VS_SVC_TAB_BITS 8 +#define IP_VS_SVC_TAB_BITS 12 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) /* the service table hashed by */ -static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; +DEFINE_PER_CPU(struct list_head *, ip_vs_svc_tab_percpu); /* the service table hashed by fwmark */ -static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; - -/* - * Hash table: for real 
service lookups - */ -#define IP_VS_RTAB_BITS 4 -#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) -#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) - -static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE]; +DEFINE_PER_CPU(struct list_head *, ip_vs_svc_fwm_tab_percpu); /* * Trash for destinations */ -static LIST_HEAD(ip_vs_dest_trash); +DEFINE_PER_CPU(struct list_head, ip_vs_dest_trash_percpu); /* * FTP & NULL virtual service counters @@ -387,14 +368,10 @@ static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) return fwmark & IP_VS_SVC_TAB_MASK; } -/* - * Hashes a service in the ip_vs_svc_table by - * or in the ip_vs_svc_fwm_table by fwmark. - * Should be called with locked tables. - */ -static int ip_vs_svc_hash(struct ip_vs_service *svc) +static int ip_vs_svc_hash_cpuid(struct ip_vs_service *svc, int cpu) { unsigned hash; + struct list_head *ip_vs_svc_tab; if (svc->flags & IP_VS_SVC_F_HASHED) { pr_err("%s(): request for already hashed, called from %pF\n", @@ -407,13 +384,15 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) * Hash it by in ip_vs_svc_table */ hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr); - list_add(&svc->s_list, &ip_vs_svc_table[hash]); + ip_vs_svc_tab = per_cpu(ip_vs_svc_tab_percpu, cpu); + list_add(&svc->s_list, ip_vs_svc_tab + hash); } else { /* * Hash it by fwmark in ip_vs_svc_fwm_table */ hash = ip_vs_svc_fwm_hashkey(svc->fwmark); - list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); + ip_vs_svc_tab = per_cpu(ip_vs_svc_fwm_tab_percpu, cpu); + list_add(&svc->f_list, ip_vs_svc_tab + hash); } svc->flags |= IP_VS_SVC_F_HASHED; @@ -456,17 +435,19 @@ static inline struct ip_vs_service *__ip_vs_service_get(int af, __u16 protocol, { unsigned hash; struct ip_vs_service *svc; + struct list_head *ip_vs_svc_tab; + ip_vs_svc_tab = __get_cpu_var(ip_vs_svc_tab_percpu); /* Check for "full" addressed entries */ hash = ip_vs_svc_hashkey(af, protocol, vaddr); - list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list) { + list_for_each_entry(svc, ip_vs_svc_tab + hash, s_list) { if ((svc->af == af) && ip_vs_addr_equal(af, &svc->addr, vaddr) && (svc->port == vport) && (svc->protocol == protocol)) { /* HIT */ - atomic_inc(&svc->usecnt); + //atomic_inc(&svc->usecnt); return svc; } } @@ -481,14 +462,15 @@ static inline struct ip_vs_service *__ip_vs_svc_fwm_get(int af, __u32 fwmark) { unsigned hash; struct ip_vs_service *svc; + struct list_head *ip_vs_svc_fwm_tab; + ip_vs_svc_fwm_tab = __get_cpu_var(ip_vs_svc_fwm_tab_percpu); /* Check for fwmark addressed entries */ hash = ip_vs_svc_fwm_hashkey(fwmark); - list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { + list_for_each_entry(svc, ip_vs_svc_fwm_tab + hash, f_list) { if (svc->fwmark == fwmark && svc->af == af) { /* HIT */ - atomic_inc(&svc->usecnt); return svc; } } @@ -502,7 +484,7 @@ struct ip_vs_service *ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, { struct ip_vs_service *svc; - read_lock(&__ip_vs_svc_lock); + spin_lock(&__get_cpu_var(ip_vs_svc_lock)); /* * Check the table hashed by fwmark first @@ -534,7 +516,9 @@ struct ip_vs_service *ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, } out: - read_unlock(&__ip_vs_svc_lock); + /* unlock by ip_vs_service_put */ + if (svc == NULL) + spin_unlock(&__get_cpu_var(ip_vs_svc_lock)); IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", fwmark, ip_vs_proto_name(protocol), @@ -548,22 +532,24 @@ struct ip_vs_service *ip_vs_lookup_vip(int af, __u16 protocol, const union nf_inet_addr *vaddr) { struct ip_vs_service *svc; + struct list_head 
*ip_vs_svc_tab; unsigned hash; - read_lock(&__ip_vs_svc_lock); + spin_lock(&__get_cpu_var(ip_vs_svc_lock)); + ip_vs_svc_tab = __get_cpu_var(ip_vs_svc_tab_percpu); hash = ip_vs_svc_hashkey(af, protocol, vaddr); - list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list) { + list_for_each_entry(svc, ip_vs_svc_tab + hash, s_list) { if ((svc->af == af) && ip_vs_addr_equal(af, &svc->addr, vaddr) && (svc->protocol == protocol)) { /* HIT */ - read_unlock(&__ip_vs_svc_lock); + spin_unlock(&__get_cpu_var(ip_vs_svc_lock)); return svc; } } - read_unlock(&__ip_vs_svc_lock); + spin_unlock(&__get_cpu_var(ip_vs_svc_lock)); return NULL; } @@ -576,103 +562,30 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) static inline void __ip_vs_unbind_svc(struct ip_vs_dest *dest) { + int cpu; struct ip_vs_service *svc = dest->svc; + struct ip_vs_service *this_svc; dest->svc = NULL; - if (atomic_dec_and_test(&svc->refcnt)) - kfree(svc); -} - -/* - * Returns hash value for real service - */ -static inline unsigned ip_vs_rs_hashkey(int af, - const union nf_inet_addr *addr, - __be16 port) -{ - register unsigned porth = ntohs(port); - __be32 addr_fold = addr->ip; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - addr_fold = addr->ip6[0] ^ addr->ip6[1] ^ - addr->ip6[2] ^ addr->ip6[3]; -#endif - - return (ntohl(addr_fold) ^ (porth >> IP_VS_RTAB_BITS) ^ porth) - & IP_VS_RTAB_MASK; -} - -/* - * Hashes ip_vs_dest in ip_vs_rtable by . - * should be called with locked tables. - */ -static int ip_vs_rs_hash(struct ip_vs_dest *dest) -{ - unsigned hash; - - if (!list_empty(&dest->d_list)) { - return 0; - } - - /* - * Hash by proto,addr,port, - * which are the parameters of the real service. - */ - hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); - - list_add(&dest->d_list, &ip_vs_rtable[hash]); - - return 1; -} - -/* - * UNhashes ip_vs_dest from ip_vs_rtable. - * should be called with locked tables. - */ -static int ip_vs_rs_unhash(struct ip_vs_dest *dest) -{ - /* - * Remove it from the ip_vs_rtable table. - */ - if (!list_empty(&dest->d_list)) { - list_del(&dest->d_list); - INIT_LIST_HEAD(&dest->d_list); - } +// if (atomic_dec_and_test(&svc->refcnt)) +// kfree(svc); - return 1; -} - -/* - * Lookup real service by in the real service table. - */ -struct ip_vs_dest *ip_vs_lookup_real_service(int af, __u16 protocol, - const union nf_inet_addr *daddr, - __be16 dport) -{ - unsigned hash; - struct ip_vs_dest *dest; + atomic_dec(&svc->refcnt); - /* - * Check for "full" addressed entries - * Return the first found entry - */ - hash = ip_vs_rs_hashkey(af, daddr, dport); - - read_lock(&__ip_vs_rs_lock); - list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { - if ((dest->af == af) - && ip_vs_addr_equal(af, &dest->addr, daddr) - && (dest->port == dport) - && ((dest->protocol == protocol) || dest->vfwmark)) { - /* HIT */ - read_unlock(&__ip_vs_rs_lock); - return dest; - } + for_each_possible_cpu(cpu) { + this_svc = svc->svc0 + cpu; + if (atomic_read(&this_svc->refcnt)) { + IP_VS_DBG_BUF(2, "%s(): cpu%d refers to svc %s:%d," + "refcnt=%d\n", __func__, cpu, + IP_VS_DBG_ADDR(svc->af, &svc->addr), + ntohs(svc->port), + atomic_read(&this_svc->refcnt)); + break; + } } - read_unlock(&__ip_vs_rs_lock); - return NULL; + if (cpu == num_possible_cpus()) + kfree(svc->svc0); } /* @@ -737,21 +650,23 @@ struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr, * continue, and the counting information of the dest is also useful for * scheduling. 
*/ -static struct ip_vs_dest *ip_vs_trash_get_dest(struct ip_vs_service *svc, +static struct ip_vs_dest *ip_vs_trash_get_dest_cpuid(struct ip_vs_service *svc, const union nf_inet_addr *daddr, - __be16 dport) + __be16 dport, int cpu) { struct ip_vs_dest *dest, *nxt; /* * Find the destination in trash */ - list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { - IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " + list_for_each_entry_safe(dest, nxt, + &per_cpu(ip_vs_dest_trash_percpu, cpu), n_list) { + IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash-%d, " "dest->refcnt=%d\n", dest->vfwmark, IP_VS_DBG_ADDR(svc->af, &dest->addr), - ntohs(dest->port), atomic_read(&dest->refcnt)); + ntohs(dest->port), + cpu, atomic_read(&dest->refcnt)); if (dest->af == svc->af && ip_vs_addr_equal(svc->af, &dest->addr, daddr) && dest->port == dport && @@ -769,17 +684,13 @@ static struct ip_vs_dest *ip_vs_trash_get_dest(struct ip_vs_service *svc, */ if (atomic_read(&dest->refcnt) == 1) { IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u " - "from trash\n", + "from trash-%d\n", dest->vfwmark, IP_VS_DBG_ADDR(svc->af, &dest->addr), - ntohs(dest->port)); + ntohs(dest->port), cpu); list_del(&dest->n_list); ip_vs_dst_reset(dest); __ip_vs_unbind_svc(dest); - - /* Delete dest dedicated statistic varible which is percpu type */ - ip_vs_del_stats(dest->stats); - kfree(dest); } } @@ -798,14 +709,18 @@ static struct ip_vs_dest *ip_vs_trash_get_dest(struct ip_vs_service *svc, */ static void ip_vs_trash_cleanup(void) { + int cpu; struct ip_vs_dest *dest, *nxt; - list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { - list_del(&dest->n_list); - ip_vs_dst_reset(dest); - __ip_vs_unbind_svc(dest); - ip_vs_del_stats(dest->stats); - kfree(dest); + for_each_possible_cpu(cpu) { + list_for_each_entry_safe(dest, nxt, + &per_cpu(ip_vs_dest_trash_percpu, cpu), + n_list) { + list_del(&dest->n_list); + ip_vs_dst_reset(dest); + __ip_vs_unbind_svc(dest); + kfree(dest); + } } } @@ -839,14 +754,6 @@ __ip_vs_update_dest(struct ip_vs_service *svc, /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) { conn_flags |= IP_VS_CONN_F_NOOUTPUT; - } else { - /* - * Put the real service in ip_vs_rtable if not present. - * For now only for NAT! 
- */ - write_lock_bh(&__ip_vs_rs_lock); - ip_vs_rs_hash(dest); - write_unlock_bh(&__ip_vs_rs_lock); } atomic_set(&dest->conn_flags, conn_flags); @@ -856,7 +763,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, } else { if (dest->svc != svc) { __ip_vs_unbind_svc(dest); - ip_vs_zero_stats(dest->stats); + memset(&dest->stats, 0, sizeof(struct ip_vs_stats)); __ip_vs_bind_svc(dest, svc); } } @@ -877,7 +784,6 @@ static int ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, struct ip_vs_dest **dest_p) { - int ret = 0; struct ip_vs_dest *dest; unsigned atype; @@ -920,24 +826,19 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, INIT_LIST_HEAD(&dest->d_list); spin_lock_init(&dest->dst_lock); - /* Init statistic */ - ret = ip_vs_new_stats(&(dest->stats)); - if(ret) - goto out_err; - __ip_vs_update_dest(svc, dest, udest); - *dest_p = dest; LeaveFunction(2); return 0; - -out_err: - kfree(dest); - return ret; } +static void +ip_vs_add_dest_rollback(struct ip_vs_service *, + const union nf_inet_addr *, + __be16, int); + /* * Add a destination into an existing service */ @@ -948,6 +849,8 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) union nf_inet_addr daddr; __be16 dport = udest->port; int ret; + int cpu = 0; + struct ip_vs_service *this_svc = NULL; EnterFunction(2); @@ -974,14 +877,18 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return -EEXIST; } - /* - * Check if the dest already exists in the trash and - * is from the same service - */ - dest = ip_vs_trash_get_dest(svc, &daddr, dport); - if (dest != NULL) { - IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " + for_each_possible_cpu(cpu) { + this_svc = svc->svc0 + cpu; + /* + * Check if the dest already exists in the trash and + * is from the same service + */ + dest = ip_vs_trash_get_dest_cpuid(this_svc, + &daddr, dport, cpu); + + if (dest != NULL) { + IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " "dest->refcnt=%d, service %u/%s:%u\n", IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport), atomic_read(&dest->refcnt), @@ -989,64 +896,58 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) IP_VS_DBG_ADDR(svc->af, &dest->vaddr), ntohs(dest->vport)); - __ip_vs_update_dest(svc, dest, udest); - - /* - * Get the destination from the trash - */ - list_del(&dest->n_list); + __ip_vs_update_dest(this_svc, dest, udest); - /* Reset the statistic value */ - ip_vs_zero_stats(dest->stats); + /* + * Get the destination from the trash + */ + list_del(&dest->n_list); - write_lock_bh(&__ip_vs_svc_lock); + /* Reset the statistic value */ + memset(&dest->stats, 0, sizeof(struct ip_vs_stats)); - /* - * Wait until all other svc users go away. 
- */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + spin_lock_bh(&per_cpu(ip_vs_svc_lock, cpu)); - list_add(&dest->n_list, &svc->destinations); - svc->num_dests++; + list_add(&dest->n_list, &this_svc->destinations); + this_svc->num_dests++; + this_svc->weight += udest->weight; - /* call the update_service function of its scheduler */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); + /* call the update_service function of its scheduler */ + if (this_svc->scheduler->update_service) + this_svc->scheduler->update_service(this_svc); - write_unlock_bh(&__ip_vs_svc_lock); - return 0; - } - - /* - * Allocate and initialize the dest structure - */ - ret = ip_vs_new_dest(svc, udest, &dest); - if (ret) { - return ret; - } + spin_unlock_bh(&per_cpu(ip_vs_svc_lock, cpu)); + continue; + } - /* - * Add the dest entry into the list - */ - atomic_inc(&dest->refcnt); + /* + * Allocate and initialize the dest structure + */ + ret = ip_vs_new_dest(this_svc, udest, &dest); + if (ret) { + ip_vs_add_dest_rollback(svc, &daddr, dport, cpu); + return ret; + } - write_lock_bh(&__ip_vs_svc_lock); + /* + * Add the dest entry into the list + */ + atomic_inc(&dest->refcnt); - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + spin_lock_bh(&per_cpu(ip_vs_svc_lock, cpu)); - list_add(&dest->n_list, &svc->destinations); - svc->num_dests++; + list_add(&dest->n_list, &this_svc->destinations); + this_svc->num_dests++; + this_svc->weight += udest->weight; - /* call the update_service function of its scheduler */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); + /* call the update_service function of its scheduler */ + if (this_svc->scheduler->update_service) + this_svc->scheduler->update_service(this_svc); - write_unlock_bh(&__ip_vs_svc_lock); + spin_unlock_bh(&per_cpu(ip_vs_svc_lock, cpu)); - LeaveFunction(2); + LeaveFunction(2); + } return 0; } @@ -1060,6 +961,9 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) struct ip_vs_dest *dest; union nf_inet_addr daddr; __be16 dport = udest->port; + __u32 old_weight; + int cpu; + struct ip_vs_service *this_svc; EnterFunction(2); @@ -1076,29 +980,43 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) ip_vs_addr_copy(svc->af, &daddr, &udest->addr); - /* - * Lookup the destination list - */ - dest = ip_vs_lookup_dest(svc, &daddr, dport); + for_each_possible_cpu(cpu) { + this_svc = svc->svc0 + cpu; + /* + * Lookup the destination list + */ + dest = ip_vs_lookup_dest(this_svc, &daddr, dport); - if (dest == NULL) { - IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__); - return -ENOENT; - } + if (dest == NULL) { + IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__); + return -ENOENT; + } - __ip_vs_update_dest(svc, dest, udest); + /* save old weight */ + old_weight = atomic_read(&dest->weight); - write_lock_bh(&__ip_vs_svc_lock); + __ip_vs_update_dest(this_svc, dest, udest); - /* Wait until all other svc users go away */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + spin_lock_bh(&per_cpu(ip_vs_svc_lock, cpu)); - /* call the update_service, because server weight may be changed */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); + /* update service weight */ + this_svc->weight = this_svc->weight - + old_weight + udest->weight; + if(this_svc->weight < 0) { + struct ip_vs_dest *tdest; + this_svc->weight = 0; + list_for_each_entry(tdest, &this_svc->destinations, n_list) { + 
this_svc->weight += atomic_read(&tdest->weight); + } + IP_VS_ERR_RL("ip_vs_edit_dest:vs weight < 0\n"); + } - write_unlock_bh(&__ip_vs_svc_lock); + /* update service, because server weight may be changed */ + if (this_svc->scheduler->update_service) + this_svc->scheduler->update_service(this_svc); + spin_unlock_bh(&per_cpu(ip_vs_svc_lock, cpu)); + } LeaveFunction(2); return 0; @@ -1109,13 +1027,6 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) */ static void __ip_vs_del_dest(struct ip_vs_dest *dest) { - /* - * Remove it from the d-linked list with the real services. - */ - write_lock_bh(&__ip_vs_rs_lock); - ip_vs_rs_unhash(dest); - write_unlock_bh(&__ip_vs_rs_lock); - /* * Decrease the refcnt of the dest, and free the dest * if nobody refers to it (refcnt=0). Otherwise, throw @@ -1130,16 +1041,14 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest) time, so the operation here is OK */ atomic_dec(&dest->svc->refcnt); - /* Delete dest dedicated statistic varible which is percpu type */ - ip_vs_del_stats(dest->stats); - kfree(dest); } else { IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, " "dest->refcnt=%d\n", IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); - list_add(&dest->n_list, &ip_vs_dest_trash); + list_add(&dest->n_list, &per_cpu(ip_vs_dest_trash_percpu, + (dest->svc - dest->svc->svc0))); atomic_inc(&dest->refcnt); } } @@ -1157,6 +1066,15 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, */ list_del(&dest->n_list); svc->num_dests--; + svc->weight -= atomic_read(&dest->weight); + if(svc->weight < 0) { + struct ip_vs_dest *tdest; + svc->weight = 0; + list_for_each_entry(tdest, &svc->destinations, n_list) { + svc->weight += atomic_read(&tdest->weight); + } + IP_VS_ERR_RL("__ip_vs_unlink_dest:vs weight < 0\n"); + } /* * Call the update_service function of its scheduler @@ -1165,6 +1083,30 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, svc->scheduler->update_service(svc); } +static void +ip_vs_add_dest_rollback(struct ip_vs_service *svc, + const union nf_inet_addr *daddr, + __be16 dport, int cpu) +{ + int i; + struct ip_vs_dest *dest; + struct ip_vs_service *this_svc; + + for(i = 0; i < cpu; i++) + { + this_svc = svc->svc0 + i; + dest = ip_vs_lookup_dest(this_svc, daddr, dport); + if(dest == NULL) + continue; + + spin_lock_bh(&per_cpu(ip_vs_svc_lock, i)); + __ip_vs_unlink_dest(this_svc, dest, 1); + spin_unlock_bh(&per_cpu(ip_vs_svc_lock, i)); + + __ip_vs_del_dest(dest); + } +} + /* * Delete a destination server in the given service */ @@ -1173,40 +1115,55 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; __be16 dport = udest->port; + int cpu; + struct ip_vs_service *this_svc; EnterFunction(2); - dest = ip_vs_lookup_dest(svc, &udest->addr, dport); + for_each_possible_cpu(cpu) { + this_svc = svc->svc0 + cpu; + dest = ip_vs_lookup_dest(this_svc, &udest->addr, dport); - if (dest == NULL) { - IP_VS_DBG(1, "%s(): destination not found!\n", __func__); - return -ENOENT; - } + if (dest == NULL) { + IP_VS_DBG(1, "%s(): destination not found!\n", + __func__); + return -ENOENT; + } - write_lock_bh(&__ip_vs_svc_lock); + spin_lock_bh(&per_cpu(ip_vs_svc_lock, cpu)); - /* - * Wait until all other svc users go away. 
- */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - /* - * Unlink dest from the service - */ - __ip_vs_unlink_dest(svc, dest, 1); + /* + * Unlink dest from the service + */ + __ip_vs_unlink_dest(this_svc, dest, 1); - write_unlock_bh(&__ip_vs_svc_lock); + spin_unlock_bh(&per_cpu(ip_vs_svc_lock, cpu)); - /* - * Delete the destination - */ - __ip_vs_del_dest(dest); + /* + * Delete the destination + */ + __ip_vs_del_dest(dest); + } LeaveFunction(2); return 0; } +#define LADDR_MASK 0x000000ff +static inline int laddr_to_cpuid(int af, const union nf_inet_addr *addr) +{ + u32 seed; + + if(af == AF_INET6) + seed = ntohl(addr->in6.s6_addr32[3]) & LADDR_MASK; + else + seed = ntohl(addr->ip) & LADDR_MASK; + + return seed % (num_online_cpus() - sysctl_ip_vs_reserve_core) + + sysctl_ip_vs_reserve_core; +} + void ip_vs_laddr_hold(struct ip_vs_laddr *laddr) { atomic_inc(&laddr->refcnt); @@ -1237,6 +1194,9 @@ ip_vs_new_laddr(struct ip_vs_service *svc, struct ip_vs_laddr_user_kern *uladdr, atomic64_set(&laddr->port, 0); atomic_set(&laddr->refcnt, 0); atomic_set(&laddr->conn_counts, 0); + laddr->cpuid = laddr_to_cpuid(svc->af, &uladdr->addr); + IP_VS_DBG_BUF(0, "local address %s is assigned to cpu%d\n", + IP_VS_DBG_ADDR(svc->af, &uladdr->addr), laddr->cpuid); *laddr_p = laddr; @@ -1246,17 +1206,23 @@ ip_vs_new_laddr(struct ip_vs_service *svc, struct ip_vs_laddr_user_kern *uladdr, static struct ip_vs_laddr *ip_vs_lookup_laddr(struct ip_vs_service *svc, const union nf_inet_addr *addr) { + int cpu; + struct ip_vs_service *this_svc; struct ip_vs_laddr *laddr; - /* - * Find the local address for the given service - */ - list_for_each_entry(laddr, &svc->laddr_list, n_list) { - if ((laddr->af == svc->af) - && ip_vs_addr_equal(svc->af, &laddr->addr, addr)) { - /* HIT */ - return laddr; + this_svc = svc->svc0; + for_each_possible_cpu(cpu) { + /* + * Find the local address for the given service + */ + list_for_each_entry(laddr, &this_svc->laddr_list, n_list) { + if ((laddr->af == svc->af) + && ip_vs_addr_equal(svc->af, &laddr->addr, addr)) { + /* HIT */ + return laddr; + } } + this_svc++; } return NULL; @@ -1266,6 +1232,8 @@ static int ip_vs_add_laddr(struct ip_vs_service *svc, struct ip_vs_laddr_user_kern *uladdr) { struct ip_vs_laddr *laddr; + struct ip_vs_service *this_svc; + int cpu; int ret; IP_VS_DBG_BUF(0, "vip %s:%d add local address %s\n", @@ -1294,28 +1262,27 @@ ip_vs_add_laddr(struct ip_vs_service *svc, struct ip_vs_laddr_user_kern *uladdr) */ ip_vs_laddr_hold(laddr); - write_lock_bh(&__ip_vs_svc_lock); + cpu = laddr->cpuid; + this_svc = svc->svc0 + cpu; - /* - * Wait until all other svc users go away. 
- */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + spin_lock_bh(&per_cpu(ip_vs_svc_lock, cpu)); - list_add_tail(&laddr->n_list, &svc->laddr_list); - svc->num_laddrs++; + list_add_tail(&laddr->n_list, &this_svc->laddr_list); + this_svc->num_laddrs++; #ifdef CONFIG_IP_VS_DEBUG /* Dump the destinations */ - IP_VS_DBG_BUF(0, " svc %s:%d num %d curr %p \n", - IP_VS_DBG_ADDR(svc->af, &svc->addr), - ntohs(svc->port), svc->num_laddrs, svc->curr_laddr); - list_for_each_entry(laddr, &svc->laddr_list, n_list) { + IP_VS_DBG_BUF(0, " cpu%d svc %s:%d num %d curr %p \n", + cpu, IP_VS_DBG_ADDR(svc->af, &svc->addr), + ntohs(this_svc->port), this_svc->num_laddrs, + this_svc->curr_laddr); + list_for_each_entry(laddr, &this_svc->laddr_list, n_list) { IP_VS_DBG_BUF(0, " laddr %p %s:%d \n", laddr, IP_VS_DBG_ADDR(svc->af, &laddr->addr), 0); } #endif - write_unlock_bh(&__ip_vs_svc_lock); + spin_unlock_bh(&per_cpu(ip_vs_svc_lock, cpu)); return 0; } @@ -1324,6 +1291,8 @@ static int ip_vs_del_laddr(struct ip_vs_service *svc, struct ip_vs_laddr_user_kern *uladdr) { struct ip_vs_laddr *laddr; + struct ip_vs_service *this_svc; + int cpu; IP_VS_DBG_BUF(0, "vip %s:%d del local address %s\n", IP_VS_DBG_ADDR(svc->af, &svc->addr), ntohs(svc->port), @@ -1336,27 +1305,26 @@ ip_vs_del_laddr(struct ip_vs_service *svc, struct ip_vs_laddr_user_kern *uladdr) return -ENOENT; } - write_lock_bh(&__ip_vs_svc_lock); + cpu = laddr->cpuid; + this_svc = svc->svc0 + cpu; - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + spin_lock_bh(&per_cpu(ip_vs_svc_lock, cpu)); /* update svc->curr_laddr */ - if (svc->curr_laddr == &laddr->n_list) - svc->curr_laddr = laddr->n_list.next; + if (this_svc->curr_laddr == &laddr->n_list) + this_svc->curr_laddr = laddr->n_list.next; /* * Unlink dest from the service */ list_del(&laddr->n_list); - svc->num_laddrs--; + this_svc->num_laddrs--; #ifdef CONFIG_IP_VS_DEBUG - IP_VS_DBG_BUF(0, " svc %s:%d num %d curr %p \n", - IP_VS_DBG_ADDR(svc->af, &svc->addr), - ntohs(svc->port), svc->num_laddrs, svc->curr_laddr); - list_for_each_entry(laddr, &svc->laddr_list, n_list) { + IP_VS_DBG_BUF(0, " cpu%d svc %s:%d num %d curr %p \n", + cpu, IP_VS_DBG_ADDR(svc->af, &svc->addr), + ntohs(svc->port), this_svc->num_laddrs, + this_svc->curr_laddr); + list_for_each_entry(laddr, &this_svc->laddr_list, n_list) { IP_VS_DBG_BUF(0, " laddr %p %s:%d \n", laddr, IP_VS_DBG_ADDR(svc->af, &laddr->addr), 0); } @@ -1364,7 +1332,7 @@ ip_vs_del_laddr(struct ip_vs_service *svc, struct ip_vs_laddr_user_kern *uladdr) ip_vs_laddr_put(laddr); - write_unlock_bh(&__ip_vs_svc_lock); + spin_unlock_bh(&per_cpu(ip_vs_svc_lock, cpu)); return 0; } @@ -1376,9 +1344,10 @@ static int ip_vs_add_service(struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { - int ret = 0; + int ret = 0, cpu = 0; struct ip_vs_scheduler *sched = NULL; struct ip_vs_service *svc = NULL; + struct ip_vs_service *this_svc = NULL; /* increase the module use count */ ip_vs_use_count_inc(); @@ -1393,82 +1362,87 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, #ifdef CONFIG_IP_VS_IPV6 if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { ret = -EINVAL; - goto out_err; + goto out_sched; } #endif - svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); + svc = kzalloc(sizeof(struct ip_vs_service) * num_possible_cpus(), + GFP_ATOMIC); if (svc == NULL) { IP_VS_DBG(1, "%s(): no memory\n", __func__); ret = -ENOMEM; - goto out_err; + goto out_sched; } - /* I'm the first user of the service */ - 
atomic_set(&svc->usecnt, 1); - atomic_set(&svc->refcnt, 0); - - svc->af = u->af; - svc->protocol = u->protocol; - ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); - svc->port = u->port; - svc->fwmark = u->fwmark; - svc->flags = u->flags; - svc->timeout = u->timeout * HZ; - svc->netmask = u->netmask; - - /* Init the local address stuff */ - rwlock_init(&svc->laddr_lock); - INIT_LIST_HEAD(&svc->laddr_list); - svc->num_laddrs = 0; - svc->curr_laddr = &svc->laddr_list; - - INIT_LIST_HEAD(&svc->destinations); - rwlock_init(&svc->sched_lock); - - /* Bind the scheduler */ - ret = ip_vs_bind_scheduler(svc, sched); - if (ret) - goto out_err; - sched = NULL; + for_each_possible_cpu(cpu) { + this_svc = svc + cpu; + atomic_set(&this_svc->refcnt, 0); + + this_svc->af = u->af; + this_svc->protocol = u->protocol; + ip_vs_addr_copy(u->af, &this_svc->addr, &u->addr); + this_svc->port = u->port; + this_svc->fwmark = u->fwmark; + this_svc->flags = u->flags; + this_svc->timeout = u->timeout * HZ; + this_svc->netmask = u->netmask; + this_svc->est_timeout = u->est_timeout * HZ; + + /* Init the local address stuff */ + rwlock_init(&this_svc->laddr_lock); + INIT_LIST_HEAD(&this_svc->laddr_list); + this_svc->num_laddrs = 0; + this_svc->curr_laddr = &this_svc->laddr_list; + + INIT_LIST_HEAD(&this_svc->destinations); + rwlock_init(&this_svc->sched_lock); + + /* Bind the scheduler */ + ret = ip_vs_bind_scheduler(this_svc, sched); + if (ret) + goto out_err; + + /* Hash the service into the service table */ + spin_lock_bh(&per_cpu(ip_vs_svc_lock, cpu)); + ip_vs_svc_hash_cpuid(this_svc, cpu); + spin_unlock_bh(&per_cpu(ip_vs_svc_lock, cpu)); + + this_svc->svc0 = svc; /* save the first svc */ + } + sched = NULL; /* Update the virtual service counters */ if (svc->port == FTPPORT) atomic_inc(&ip_vs_ftpsvc_counter); else if (svc->port == 0) atomic_inc(&ip_vs_nullsvc_counter); - /* Init statistic */ - ret = ip_vs_new_stats(&(svc->stats)); - if(ret) - goto out_err; - /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) ip_vs_num_services++; - /* Hash the service into the service table */ - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_hash(svc); - write_unlock_bh(&__ip_vs_svc_lock); - - *svc_p = svc; + /* svc is percpu, NULL is OK */ + *svc_p = NULL; return 0; - out_err: - if (svc != NULL) { - if (svc->scheduler) - ip_vs_unbind_scheduler(svc); - if (svc->inc) { +out_err: + for_each_possible_cpu(cpu) { + this_svc = svc + cpu; + spin_lock_bh(&per_cpu(ip_vs_svc_lock, cpu)); + ip_vs_svc_unhash(this_svc); + spin_unlock_bh(&per_cpu(ip_vs_svc_lock, cpu)); + if (this_svc->scheduler) + ip_vs_unbind_scheduler(this_svc); + if (this_svc->inc) { local_bh_disable(); - ip_vs_app_inc_put(svc->inc); + ip_vs_app_inc_put(this_svc->inc); local_bh_enable(); } - kfree(svc); } + kfree(svc); +out_sched: ip_vs_scheduler_put(sched); - - out_mod_dec: +out_mod_dec: /* decrease the module use count */ ip_vs_use_count_dec(); @@ -1482,7 +1456,9 @@ static int ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) { struct ip_vs_scheduler *sched, *old_sched; + struct ip_vs_service *this_svc; int ret = 0; + int cpu = 0; /* * Lookup the scheduler, by 'u->sched_name' @@ -1501,52 +1477,61 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) } #endif - write_lock_bh(&__ip_vs_svc_lock); + for_each_possible_cpu(cpu) { + this_svc = svc->svc0 + cpu; + spin_lock_bh(&per_cpu(ip_vs_svc_lock, cpu)); + /* + * Set the flags and timeout value + */ + this_svc->flags = u->flags | 
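
A sketch of the per-CPU service layout built by the allocation and initialization loop above: one contiguous array with an entry per possible CPU, where every entry stores a pointer back to entry zero, so any copy can reach a sibling as svc->svc0 + cpu. The struct is trimmed to the shape of the idea, not the kernel type:

#include <stdlib.h>

struct svc {
	int cpuid;
	struct svc *svc0;	/* first entry of the array */
};

static struct svc *svc_alloc(int ncpus)
{
	struct svc *svc = calloc(ncpus, sizeof(*svc));

	if (!svc)
		return NULL;
	for (int cpu = 0; cpu < ncpus; cpu++) {
		svc[cpu].cpuid = cpu;
		svc[cpu].svc0 = svc;	/* save the first svc, as above */
	}
	return svc;
}
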
IP_VS_SVC_F_HASHED; + this_svc->timeout = u->timeout * HZ; + this_svc->netmask = u->netmask; + this_svc->est_timeout = u->est_timeout * HZ; - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + spin_unlock_bh(&per_cpu(ip_vs_svc_lock, cpu)); + } - /* - * Set the flags and timeout value - */ - svc->flags = u->flags | IP_VS_SVC_F_HASHED; - svc->timeout = u->timeout * HZ; - svc->netmask = u->netmask; + for_each_possible_cpu(cpu) { + this_svc = svc->svc0 + cpu; + spin_lock_bh(&per_cpu(ip_vs_svc_lock, cpu)); - old_sched = svc->scheduler; - if (sched != old_sched) { - /* - * Unbind the old scheduler - */ - if ((ret = ip_vs_unbind_scheduler(svc))) { - old_sched = sched; - goto out_unlock; - } + old_sched = this_svc->scheduler; + if (sched != old_sched) { + /* + * Unbind the old scheduler + */ + if ((ret = ip_vs_unbind_scheduler(this_svc))) { + old_sched = sched; + spin_unlock_bh(&per_cpu(ip_vs_svc_lock, cpu)); + goto out_sched; + } - /* - * Bind the new scheduler - */ - if ((ret = ip_vs_bind_scheduler(svc, sched))) { /* - * If ip_vs_bind_scheduler fails, restore the old - * scheduler. - * The main reason of failure is out of memory. - * - * The question is if the old scheduler can be - * restored all the time. TODO: if it cannot be - * restored some time, we must delete the service, - * otherwise the system may crash. + * Bind the new scheduler */ - ip_vs_bind_scheduler(svc, old_sched); - old_sched = sched; - goto out_unlock; + if ((ret = ip_vs_bind_scheduler(this_svc, sched))) { + /* + * If ip_vs_bind_scheduler fails, restore + * the old scheduler. + * The main reason of failure is out of memory. + * + * The question is if the old scheduler can be + * restored all the time. TODO: if it cannot be + * restored some time, we must delete the + * service, otherwise the system may crash. 
+ */ + ip_vs_bind_scheduler(this_svc, old_sched); + old_sched = sched; + spin_unlock_bh(&per_cpu(ip_vs_svc_lock, cpu)); + goto out_sched; + } } + spin_unlock_bh(&per_cpu(ip_vs_svc_lock, cpu)); } - out_unlock: - write_unlock_bh(&__ip_vs_svc_lock); +out_sched: + /* todo: rollback scheduler if error */ + #ifdef CONFIG_IP_VS_IPV6 out: #endif @@ -1567,44 +1552,48 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) struct ip_vs_dest *dest, *nxt; struct ip_vs_laddr *laddr, *laddr_next; struct ip_vs_scheduler *old_sched; + struct ip_vs_service *this_svc; + int cpu = 0; /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) ip_vs_num_services--; + for_each_possible_cpu(cpu) { + this_svc = svc->svc0 + cpu; + spin_lock_bh(&per_cpu(ip_vs_svc_lock, cpu)); - /* - * Free statistic related per cpu memory - */ - ip_vs_del_stats(svc->stats); + /* Unbind scheduler */ + old_sched = this_svc->scheduler; + ip_vs_unbind_scheduler(this_svc); + /* Unbind app inc */ + if (this_svc->inc) { + ip_vs_app_inc_put(this_svc->inc); + this_svc->inc = NULL; + } - /* Unbind scheduler */ - old_sched = svc->scheduler; - ip_vs_unbind_scheduler(svc); - if (old_sched) - ip_vs_scheduler_put(old_sched); + /* Unlink the whole local address list */ + list_for_each_entry_safe(laddr, laddr_next, + &this_svc->laddr_list, n_list) { + list_del(&laddr->n_list); + ip_vs_laddr_put(laddr); + } - /* Unbind app inc */ - if (svc->inc) { - ip_vs_app_inc_put(svc->inc); - svc->inc = NULL; - } + /* + * Unlink the whole destination list + */ + list_for_each_entry_safe(dest, nxt, + &this_svc->destinations, n_list) { + __ip_vs_unlink_dest(this_svc, dest, 0); + __ip_vs_del_dest(dest); + } - /* Unlink the whole local address list */ - list_for_each_entry_safe(laddr, laddr_next, &svc->laddr_list, n_list) { - list_del(&laddr->n_list); - ip_vs_laddr_put(laddr); - } - - /* - * Unlink the whole destination list - */ - list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { - __ip_vs_unlink_dest(svc, dest, 0); - __ip_vs_del_dest(dest); + spin_unlock_bh(&per_cpu(ip_vs_svc_lock, cpu)); } + if (old_sched) + ip_vs_scheduler_put(old_sched); /* * Update the virtual service counters */ @@ -1616,8 +1605,21 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) /* * Free the service if nobody refers to it */ - if (atomic_read(&svc->refcnt) == 0) - kfree(svc); + this_svc = svc->svc0; + for_each_possible_cpu(cpu) { + if (atomic_read(&this_svc->refcnt)) { + IP_VS_DBG_BUF(2, "%s(): cpu%d refers to svc %s:%d," + "refcnt=%d\n", __func__, cpu, + IP_VS_DBG_ADDR(svc->af, &svc->addr), + ntohs(svc->port), + atomic_read(&this_svc->refcnt)); + break; + } + this_svc++; + } + + if (cpu == num_possible_cpus()) + kfree(svc->svc0); /* decrease the module use count */ ip_vs_use_count_dec(); @@ -1628,25 +1630,27 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) */ static int ip_vs_del_service(struct ip_vs_service *svc) { + struct ip_vs_service *this_svc; + int cpu = 0; + if (svc == NULL) return -EEXIST; /* * Unhash it from the service table */ - write_lock_bh(&__ip_vs_svc_lock); + this_svc = svc->svc0; + for_each_possible_cpu(cpu) { + spin_lock_bh(&per_cpu(ip_vs_svc_lock, cpu)); - ip_vs_svc_unhash(svc); + ip_vs_svc_unhash(this_svc); + spin_unlock_bh(&per_cpu(ip_vs_svc_lock, cpu)); - /* - * Wait until all the svc users go away. 
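
The deletion path above frees the per-CPU array only when the refcnt scan runs to completion, relying on the loop index equalling the CPU count afterwards. The same rule in a compact userspace model:

#include <stdatomic.h>
#include <stdlib.h>

struct svc {
	atomic_int refcnt;
};

static void del_service(struct svc *svc0, int ncpus)
{
	int cpu;

	for (cpu = 0; cpu < ncpus; cpu++)
		if (atomic_load(&svc0[cpu].refcnt))
			break;	/* some CPU still refers to its copy */

	if (cpu == ncpus)	/* scan completed: nobody holds a reference */
		free(svc0);
}
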
- */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + this_svc++; + } __ip_vs_del_service(svc); - write_unlock_bh(&__ip_vs_svc_lock); - return 0; } @@ -1657,21 +1661,17 @@ static int ip_vs_flush(void) { int idx; struct ip_vs_service *svc, *nxt; + struct list_head *ip_vs_svc_tab; + struct list_head *ip_vs_svc_fwm_tab; /* * Flush the service table hashed by */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], + ip_vs_svc_tab = __get_cpu_var(ip_vs_svc_tab_percpu); + list_for_each_entry_safe(svc, nxt, ip_vs_svc_tab + idx, s_list) { - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_unhash(svc); - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - __ip_vs_del_service(svc); - write_unlock_bh(&__ip_vs_svc_lock); + ip_vs_del_service(svc); } } @@ -1679,16 +1679,10 @@ static int ip_vs_flush(void) * Flush the service table hashed by fwmark */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + ip_vs_svc_fwm_tab = __get_cpu_var(ip_vs_svc_fwm_tab_percpu); list_for_each_entry_safe(svc, nxt, - &ip_vs_svc_fwm_table[idx], f_list) { - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_unhash(svc); - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - __ip_vs_del_service(svc); - write_unlock_bh(&__ip_vs_svc_lock); + ip_vs_svc_fwm_tab + idx, f_list) { + ip_vs_del_service(svc); } } @@ -1697,8 +1691,9 @@ static int ip_vs_flush(void) /* * Zero counters in a service or all services + * disable in v3 */ -static int ip_vs_zero_service(struct ip_vs_service *svc) +/*static int ip_vs_zero_service(struct ip_vs_service *svc) { struct ip_vs_dest *dest; @@ -1730,7 +1725,7 @@ static int ip_vs_zero_all(void) ip_vs_zero_stats(ip_vs_stats); return 0; -} +}*/ static int proc_do_defense_mode(ctl_table * table, int write, @@ -2199,6 +2194,43 @@ static struct ctl_table vs_vars[] = { .extra1 = &ip_vs_entry_min, /* zero */ .extra2 = &ip_vs_entry_max, /* one */ }, + { + .procname = "fast_response_xmit_inside", + .data = &sysctl_ip_vs_fast_xmit_inside, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &ip_vs_entry_min, /* zero */ + .extra2 = &ip_vs_entry_max, /* one */ + }, + { + .procname = "csum_offload", + .data = &sysctl_ip_vs_csum_offload, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &ip_vs_entry_min, /* zero */ + .extra2 = &ip_vs_entry_max, /* one */ + }, + { + .procname = "reserve_core", + .data = &sysctl_ip_vs_reserve_core, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &ip_vs_reserve_core_min, /* zero */ + .extra2 = &ip_vs_reserve_core_max, /* six */ + }, + { + .procname = "conn_max_num", + .data = &sysctl_ip_vs_conn_max_num, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, {.ctl_name = 0} }; @@ -2213,7 +2245,8 @@ EXPORT_SYMBOL_GPL(net_vs_ctl_path); static struct ctl_table_header *sysctl_header; -#ifdef CONFIG_PROC_FS +/* NOT support by v3 */ +#ifdef CONFIG_PROC_FS_NO_EXIST struct ip_vs_iter { struct list_head *table; @@ -2527,6 +2560,10 @@ static struct ip_vs_estats_entry ext_stats[] = { IP_VS_ESTATS_ITEM("fast_xmit_no_mac", FAST_XMIT_NO_MAC), IP_VS_ESTATS_ITEM("fast_xmit_synproxy_save", FAST_XMIT_SYNPROXY_SAVE), IP_VS_ESTATS_ITEM("fast_xmit_dev_lost", FAST_XMIT_DEV_LOST), + 
IP_VS_ESTATS_ITEM("fast_xmit_reject_inside", FAST_XMIT_REJECT_INSIDE), + IP_VS_ESTATS_ITEM("fast_xmit_pass_inside", FAST_XMIT_PASS_INSIDE), + IP_VS_ESTATS_ITEM("fast_xmit_synproxy_save_inside", + FAST_XMIT_SYNPROXY_SAVE_INSIDE), IP_VS_ESTATS_ITEM("rst_in_syn_sent", RST_IN_SYN_SENT), IP_VS_ESTATS_ITEM("rst_out_syn_sent", RST_OUT_SYN_SENT), IP_VS_ESTATS_ITEM("rst_in_established", RST_IN_ESTABLISHED), @@ -2535,6 +2572,8 @@ static struct ip_vs_estats_entry ext_stats[] = { IP_VS_ESTATS_ITEM("lro_reject", LRO_REJECT), IP_VS_ESTATS_ITEM("xmit_unexpected_mtu", XMIT_UNEXPECTED_MTU), IP_VS_ESTATS_ITEM("conn_sched_unreach", CONN_SCHED_UNREACH), + IP_VS_ESTATS_ITEM("synproxy_no_dest", SYNPROXY_NO_DEST), + IP_VS_ESTATS_ITEM("conn_exceeded", CONN_EXCEEDED), IP_VS_ESTATS_LAST }; @@ -2732,7 +2771,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user * user, unsigned int len) if (cmd == IP_VS_SO_SET_ZERO) { /* if no service address is set, zero counters in all */ if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { - ret = ip_vs_zero_all(); + ret = -EACCES; //ip_vs_zero_all(); goto out_unlock; } } @@ -2775,7 +2814,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user * user, unsigned int len) goto out_unlock; break; case IP_VS_SO_SET_ZERO: - ret = ip_vs_zero_service(svc); + ret = -EACCES; //ip_vs_zero_service(svc); break; case IP_VS_SO_SET_ADDDEST: ip_vs_copy_udest_compat(&udest, udest_compat); @@ -2801,8 +2840,8 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user * user, unsigned int len) ret = -EINVAL; } - if (svc) - ip_vs_service_put(svc); +// if (svc) +// ip_vs_service_put(svc); out_unlock: mutex_unlock(&__ip_vs_mutex); @@ -2813,28 +2852,11 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user * user, unsigned int len) return ret; } -static void -ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) -{ - int i = 0; - - /* Set rate related field as zero due estimator is discard in ipvs kernel */ - memset(dst, 0x00, sizeof(struct ip_vs_stats_user)); - - for_each_online_cpu(i) { - dst->conns += ip_vs_stats_cpu(src, i).conns; - dst->inpkts += ip_vs_stats_cpu(src, i).inpkts; - dst->outpkts += ip_vs_stats_cpu(src, i).outpkts; - dst->inbytes += ip_vs_stats_cpu(src, i).inbytes; - dst->outbytes += ip_vs_stats_cpu(src, i).outbytes; - } - - return; -} - static void ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) { + int cpu = 0; + dst->protocol = src->protocol; dst->addr = src->addr.ip; dst->port = src->port; @@ -2845,7 +2867,13 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) dst->netmask = src->netmask; dst->num_dests = src->num_dests; dst->num_laddrs = src->num_laddrs; - ip_vs_copy_stats(&dst->stats, src->stats); + for_each_online_cpu(cpu) { + dst->stats.conns += (src->svc0 + cpu)->stats.conns; + dst->stats.inpkts += (src->svc0 + cpu)->stats.inpkts; + dst->stats.outpkts += (src->svc0 + cpu)->stats.outpkts; + dst->stats.inbytes += (src->svc0 + cpu)->stats.inbytes; + dst->stats.outbytes += (src->svc0 + cpu)->stats.outbytes; + } } static inline int @@ -2854,11 +2882,13 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get, { int idx, count = 0; struct ip_vs_service *svc; + struct list_head *ip_vs_svc_tab; struct ip_vs_service_entry entry; int ret = 0; + ip_vs_svc_tab = __get_cpu_var(ip_vs_svc_tab_percpu); for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + list_for_each_entry(svc, ip_vs_svc_tab + idx, s_list) { /* Only expose IPv4 entries to 
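
ip_vs_copy_service() above replaces the deleted ip_vs_copy_stats() helper with an inline sum over the per-CPU service copies; totals reported to userspace are always the sum of every CPU's private counters. The aggregation reduces to this sketch (the struct mirrors only the counters being summed):

#include <stdint.h>

struct vs_stats {
	uint64_t conns, inpkts, outpkts, inbytes, outbytes;
};

static void sum_stats(struct vs_stats *dst,
		      const struct vs_stats *percpu, int ncpus)
{
	for (int cpu = 0; cpu < ncpus; cpu++) {
		dst->conns    += percpu[cpu].conns;
		dst->inpkts   += percpu[cpu].inpkts;
		dst->outpkts  += percpu[cpu].outpkts;
		dst->inbytes  += percpu[cpu].inbytes;
		dst->outbytes += percpu[cpu].outbytes;
	}
}
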
old interface */ if (svc->af != AF_INET) continue; @@ -2876,8 +2906,9 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get, } } + ip_vs_svc_tab = __get_cpu_var(ip_vs_svc_fwm_tab_percpu); for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + list_for_each_entry(svc, ip_vs_svc_tab + idx, f_list) { /* Only expose IPv4 entries to old interface */ if (svc->af != AF_INET) continue; @@ -2914,6 +2945,9 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, if (svc) { int count = 0; + int cpu; + struct ip_vs_dest *per_dest; + struct ip_vs_service *this_svc; struct ip_vs_dest *dest; struct ip_vs_dest_entry entry; @@ -2921,16 +2955,43 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, if (count >= get->num_dests) break; + memset(&entry, 0, sizeof(entry)); entry.addr = dest->addr.ip; entry.port = dest->port; entry.conn_flags = atomic_read(&dest->conn_flags); entry.weight = atomic_read(&dest->weight); entry.u_threshold = dest->u_threshold; entry.l_threshold = dest->l_threshold; - entry.activeconns = atomic_read(&dest->activeconns); - entry.inactconns = atomic_read(&dest->inactconns); - entry.persistconns = atomic_read(&dest->persistconns); - ip_vs_copy_stats(&entry.stats, dest->stats); + + /* percpu counter */ + this_svc = dest->svc->svc0; + for_each_online_cpu(cpu) { + per_dest = ip_vs_lookup_dest(this_svc, + &dest->addr, dest->port); + if(per_dest == NULL) { + IP_VS_ERR_RL("%s():dest doesn't exist " + "on cpu%d\n", __func__, cpu); + ret = -EFAULT; + goto out; + } + + entry.activeconns += + atomic_read(&per_dest->activeconns); + entry.inactconns += + atomic_read(&per_dest->inactconns); + entry.persistconns += + atomic_read(&per_dest->persistconns); + + entry.stats.conns += per_dest->stats.conns; + entry.stats.inpkts += per_dest->stats.inpkts; + entry.stats.outpkts += per_dest->stats.outpkts; + entry.stats.inbytes += per_dest->stats.inbytes; + entry.stats.outbytes += + per_dest->stats.outbytes; + + this_svc++; + } + if (copy_to_user(&uptr->entrytable[count], &entry, sizeof(entry))) { ret = -EFAULT; @@ -2938,9 +2999,10 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, } count++; } - ip_vs_service_put(svc); + // ip_vs_service_put(svc); } else ret = -ESRCH; +out: return ret; } @@ -2959,26 +3021,33 @@ __ip_vs_get_laddr_entries(const struct ip_vs_get_laddrs *get, get->port); if (svc) { + int cpu; int count = 0; struct ip_vs_laddr *laddr; + struct ip_vs_service *svc_per; struct ip_vs_laddr_entry entry; - list_for_each_entry(laddr, &svc->laddr_list, n_list) { - if (count >= get->num_laddrs) - break; - - entry.addr = laddr->addr.ip; - entry.port_conflict = - atomic64_read(&laddr->port_conflict); - entry.conn_counts = atomic_read(&laddr->conn_counts); - if (copy_to_user(&uptr->entrytable[count], - &entry, sizeof(entry))) { - ret = -EFAULT; - break; + svc_per = svc->svc0; + for_each_online_cpu(cpu) { + list_for_each_entry(laddr, + &svc_per->laddr_list, n_list) { + if (count >= get->num_laddrs) + break; + + entry.addr = laddr->addr.ip; + entry.port_conflict = + atomic64_read(&laddr->port_conflict); + entry.conn_counts = atomic_read(&laddr->conn_counts); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + break; + } + count++; } - count++; + svc_per++; } - ip_vs_service_put(svc); + //ip_vs_service_put(svc); } else ret = -ESRCH; return ret; @@ -3106,7 +3175,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user * user, int *len) if (copy_to_user(user, entry, 
sizeof(*entry)) != 0) ret = -EFAULT; - ip_vs_service_put(svc); + //ip_vs_service_put(svc); } else ret = -ESRCH; } @@ -3272,37 +3341,26 @@ static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, struct ip_vs_stats *stats) { struct nlattr *nl_stats = nla_nest_start(skb, container_type); - struct ip_vs_stats tmp_stats; - int i = 0; if (!nl_stats) return -EMSGSIZE; - memset((void*)(&tmp_stats), 0x00, sizeof(struct ip_vs_stats)); - for_each_online_cpu(i) { - tmp_stats.conns += ip_vs_stats_cpu(stats, i).conns; - tmp_stats.inpkts += ip_vs_stats_cpu(stats, i).inpkts; - tmp_stats.outpkts += ip_vs_stats_cpu(stats, i).outpkts; - tmp_stats.inbytes += ip_vs_stats_cpu(stats, i).inbytes; - tmp_stats.outbytes += ip_vs_stats_cpu(stats, i).outbytes; - } - - NLA_PUT_U64(skb, IPVS_STATS_ATTR_CONNS, tmp_stats.conns); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_INPKTS, tmp_stats.inpkts); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTPKTS, tmp_stats.outpkts); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, tmp_stats.inbytes); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, tmp_stats.outbytes); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, 0); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, 0); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, 0); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, 0); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, 0); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_CONNS, stats->conns); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_INPKTS, stats->inpkts); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTPKTS, stats->outpkts); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->inbytes); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->outbytes); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, 0); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, 0); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, 0); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, 0); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, 0); nla_nest_end(skb, nl_stats); return 0; - nla_put_failure: +nla_put_failure: nla_nest_cancel(skb, nl_stats); return -EMSGSIZE; } @@ -3310,6 +3368,9 @@ static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, static int ip_vs_genl_fill_service(struct sk_buff *skb, struct ip_vs_service *svc) { + int cpu; + struct ip_vs_stats tmp_stats; + struct ip_vs_service *this_svc; struct nlattr *nl_service; struct ip_vs_flags flags = {.flags = svc->flags, .mask = ~0 @@ -3333,8 +3394,21 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags); NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ); NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask); + NLA_PUT_U32(skb, IPVS_SVC_ATTR_EST_TIMEOUT, svc->est_timeout / HZ); + + memset((void*)(&tmp_stats), 0, sizeof(struct ip_vs_stats)); + this_svc = svc->svc0; + for_each_possible_cpu(cpu) { + tmp_stats.conns += this_svc->stats.conns; + tmp_stats.inpkts += this_svc->stats.inpkts; + tmp_stats.outpkts += this_svc->stats.outpkts; + tmp_stats.inbytes += this_svc->stats.inbytes; + tmp_stats.outbytes += this_svc->stats.outbytes; + + this_svc++; + } - if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, svc->stats)) + if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &tmp_stats)) goto nla_put_failure; nla_nest_end(skb, nl_service); @@ -3352,6 +3426,11 @@ static int ip_vs_genl_dump_service(struct sk_buff *skb, { void *hdr; + if (!svc) { + printk("ip_vs_genl_dump_service:svc is NULL\n"); + return -EMSGSIZE; + } + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, &ip_vs_genl_family, NLM_F_MULTI, IPVS_CMD_NEW_SERVICE); @@ -3374,10 +3453,12 @@ static int 
ip_vs_genl_dump_services(struct sk_buff *skb, int idx = 0, i; int start = cb->args[0]; struct ip_vs_service *svc; + struct list_head *ip_vs_svc_tab; mutex_lock(&__ip_vs_mutex); for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { + ip_vs_svc_tab = __get_cpu_var(ip_vs_svc_tab_percpu); + list_for_each_entry(svc, ip_vs_svc_tab + i, s_list) { if (++idx <= start) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { @@ -3388,7 +3469,8 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, } for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { + ip_vs_svc_tab = __get_cpu_var(ip_vs_svc_fwm_tab_percpu); + list_for_each_entry(svc, ip_vs_svc_tab + i, f_list) { if (++idx <= start) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { @@ -3448,7 +3530,7 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_sched, *nla_flags, *nla_timeout, - *nla_netmask; + *nla_netmask, *nla_est_timeout; struct ip_vs_flags flags; struct ip_vs_service *svc; @@ -3456,6 +3538,7 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; + nla_est_timeout = attrs[IPVS_SVC_ATTR_EST_TIMEOUT]; if (!(nla_sched && nla_flags && nla_timeout && nla_netmask)) return -EINVAL; @@ -3470,7 +3553,7 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, &usvc->addr, usvc->port); if (svc) { usvc->flags = svc->flags; - ip_vs_service_put(svc); + //ip_vs_service_put(svc); } else usvc->flags = 0; @@ -3480,6 +3563,12 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, usvc->sched_name = nla_data(nla_sched); usvc->timeout = nla_get_u32(nla_timeout); usvc->netmask = nla_get_u32(nla_netmask); + if(IPPROTO_TCP == usvc->protocol) { + if(nla_est_timeout) /* Be compatible with different version of libipvs2.6 */ + usvc->est_timeout = nla_get_u32(nla_est_timeout); + if(!usvc->est_timeout) + usvc->est_timeout = sysctl_ip_vs_tcp_timeouts[IP_VS_TCP_S_ESTABLISHED] / HZ; + } } return 0; @@ -3503,7 +3592,12 @@ static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) { + u32 activeconns, inactconns, persistconns; + int cpu; + struct ip_vs_stats tmp_stats; struct nlattr *nl_dest; + struct ip_vs_dest *per_dest; + struct ip_vs_service *svc; nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST); if (!nl_dest) @@ -3517,14 +3611,38 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)); NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold); NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, - atomic_read(&dest->activeconns)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS, - atomic_read(&dest->inactconns)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, - atomic_read(&dest->persistconns)); - - if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, dest->stats)) + + activeconns = 0; + inactconns = 0; + persistconns = 0; + memset((void*)(&tmp_stats), 0, sizeof(struct ip_vs_stats)); + svc = dest->svc->svc0; + for_each_possible_cpu(cpu) { + per_dest = ip_vs_lookup_dest(svc, &dest->addr, 
dest->port); + if(per_dest == NULL) { + IP_VS_ERR_RL("%s():dest doesn't exist on cpu%d\n", + __func__, cpu); + goto nla_put_failure; + } + + activeconns += atomic_read(&per_dest->activeconns); + inactconns += atomic_read(&per_dest->inactconns); + persistconns += atomic_read(&per_dest->persistconns); + + tmp_stats.conns += per_dest->stats.conns; + tmp_stats.inpkts += per_dest->stats.inpkts; + tmp_stats.outpkts += per_dest->stats.outpkts; + tmp_stats.inbytes += per_dest->stats.inbytes; + tmp_stats.outbytes += per_dest->stats.outbytes; + + svc++; + } + + NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, activeconns); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS, inactconns); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, persistconns); + + if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &tmp_stats)) goto nla_put_failure; nla_nest_end(skb, nl_dest); @@ -3588,7 +3706,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, nla_put_failure: cb->args[0] = idx; - ip_vs_service_put(svc); +// ip_vs_service_put(svc); out_err: mutex_unlock(&__ip_vs_mutex); @@ -3643,8 +3761,10 @@ static int ip_vs_genl_dump_laddrs(struct sk_buff *skb, struct netlink_callback *cb) { int idx = 0; + int cpu; int start = cb->args[0]; struct ip_vs_service *svc; + struct ip_vs_service *svc_per; struct ip_vs_laddr *laddr; struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; @@ -3662,20 +3782,23 @@ static int ip_vs_genl_dump_laddrs(struct sk_buff *skb, IP_VS_DBG_BUF(0, "vip %s:%d get local address \n", IP_VS_DBG_ADDR(svc->af, &svc->addr), ntohs(svc->port)); - /* Dump the destinations */ - list_for_each_entry(laddr, &svc->laddr_list, n_list) { - if (++idx <= start) - continue; + svc_per = svc->svc0; + for_each_online_cpu(cpu){ + /* Dump the destinations */ + list_for_each_entry(laddr, &svc_per->laddr_list, n_list) { + if (++idx <= start) + continue; - if (ip_vs_genl_dump_laddr(skb, laddr, cb) < 0) { - idx--; - goto nla_put_failure; + if (ip_vs_genl_dump_laddr(skb, laddr, cb) < 0) { + idx--; + goto nla_put_failure; + } } + svc_per++; } - nla_put_failure: cb->args[0] = idx; - ip_vs_service_put(svc); + //ip_vs_service_put(svc); out_err: mutex_unlock(&__ip_vs_mutex); @@ -3896,7 +4019,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) ret = ip_vs_genl_del_daemon(daemon_attrs); goto out; } else if (cmd == IPVS_CMD_ZERO && !info->attrs[IPVS_CMD_ATTR_SERVICE]) { - ret = ip_vs_zero_all(); + ret = -EACCES; //ip_vs_zero_all(); goto out; } @@ -3971,7 +4094,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) ret = ip_vs_del_dest(svc, &udest); break; case IPVS_CMD_ZERO: - ret = ip_vs_zero_service(svc); + ret = -EACCES; //ip_vs_zero_service(svc); break; case IPVS_CMD_NEW_LADDR: ret = ip_vs_add_laddr(svc, &uladdr); @@ -3984,8 +4107,8 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) } out: - if (svc) - ip_vs_service_put(svc); +// if (svc) +// ip_vs_service_put(svc); mutex_unlock(&__ip_vs_mutex); return ret; @@ -4034,7 +4157,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) goto out_err; } else if (svc) { ret = ip_vs_genl_fill_service(msg, svc); - ip_vs_service_put(svc); + // ip_vs_service_put(svc); if (ret) goto nla_put_failure; } else { @@ -4213,12 +4336,62 @@ static void ip_vs_genl_unregister(void) genl_unregister_family(&ip_vs_genl_family); } +static void free_svc_tab(void) +{ + int cpu; + struct list_head *ip_vs_svc_tab; + struct list_head *ip_vs_svc_fwm_tab; + + for_each_possible_cpu(cpu) { + ip_vs_svc_tab = 
per_cpu(ip_vs_svc_tab_percpu, cpu); + ip_vs_svc_fwm_tab = per_cpu(ip_vs_svc_fwm_tab_percpu, cpu); + + /* free NULL is OK */ + kfree(ip_vs_svc_tab); + kfree(ip_vs_svc_fwm_tab); + } +} + +static int __init alloc_svc_tab(void) +{ + int cpu; + struct list_head *tmp; + + /* clear percpu svc_tab */ + for_each_possible_cpu(cpu) { + per_cpu(ip_vs_svc_tab_percpu, cpu) = NULL; + per_cpu(ip_vs_svc_fwm_tab_percpu, cpu) = NULL; + } + + for_each_possible_cpu(cpu) { + tmp = kmalloc(sizeof(struct list_head) * + IP_VS_SVC_TAB_SIZE, GFP_ATOMIC); + if (!tmp) { + pr_err("cannot allocate svc_tab.\n"); + return -ENOMEM; + } + per_cpu(ip_vs_svc_tab_percpu, cpu) = tmp; + + /* svc tab for fwmark */ + tmp = kmalloc(sizeof(struct list_head) * + IP_VS_SVC_TAB_SIZE, GFP_ATOMIC); + if (!tmp) { + pr_err("cannot allocate svc_tab.\n"); + return -ENOMEM; + } + per_cpu(ip_vs_svc_fwm_tab_percpu, cpu) = tmp; + } + + return 0; +} + /* End of Generic Netlink interface definitions */ int __init ip_vs_control_init(void) { int ret; int idx; + int cpu; EnterFunction(2); @@ -4246,28 +4419,39 @@ int __init ip_vs_control_init(void) goto cleanup_percpu; } + ret = alloc_svc_tab(); + if (ret) { + goto cleanup_svctab; + } + proc_net_fops_create(&init_net, "ip_vs_ext_stats", 0, &ip_vs_estats_fops); - proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); +// proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); proc_net_fops_create(&init_net, "ip_vs_stats", 0, &ip_vs_stats_fops); sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars); - /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_svc_table[idx]); - INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); - } - for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_rtable[idx]); - } + for_each_possible_cpu(cpu) { + struct list_head *ip_vs_svc_tab; + struct list_head *ip_vs_svc_fwm_tab; + spin_lock_init(&per_cpu(ip_vs_svc_lock, cpu)); + ip_vs_svc_tab = per_cpu(ip_vs_svc_tab_percpu, cpu); + ip_vs_svc_fwm_tab = per_cpu(ip_vs_svc_fwm_tab_percpu, cpu); - /* Hook the defense timer */ - schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); + /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + INIT_LIST_HEAD(ip_vs_svc_tab + idx); + INIT_LIST_HEAD(ip_vs_svc_fwm_tab + idx); + } + + INIT_LIST_HEAD(&per_cpu(ip_vs_dest_trash_percpu, cpu)); + } LeaveFunction(2); return 0; +cleanup_svctab: + free_svc_tab(); cleanup_percpu: free_percpu(ip_vs_esmib); cleanup_genl: @@ -4282,15 +4466,14 @@ void ip_vs_control_cleanup(void) { EnterFunction(2); ip_vs_trash_cleanup(); - cancel_rearming_delayed_work(&defense_work); - cancel_work_sync(&defense_work.work); ip_vs_del_stats(ip_vs_stats); unregister_sysctl_table(sysctl_header); proc_net_remove(&init_net, "ip_vs_stats"); - proc_net_remove(&init_net, "ip_vs"); +// proc_net_remove(&init_net, "ip_vs"); proc_net_remove(&init_net, "ip_vs_ext_stats"); free_percpu(ip_vs_esmib); ip_vs_genl_unregister(); nf_unregister_sockopt(&ip_vs_sockopts); + free_svc_tab(); LeaveFunction(2); } diff --git a/kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c b/kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c index 3cdc11e1..f9e09cfa 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -125,6 +125,14 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, *verdict = ip_vs_leave(svc, skb, pp); return 0; } + + /* + * Set private establish state 
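
alloc_svc_tab() above NULLs every per-CPU slot before allocating, so the failure path can run free_svc_tab() unconditionally: kfree(NULL) is a no-op. A userspace model of that contract (table size and CPU count illustrative; the patch defers the cleanup to the caller's cleanup_svctab label rather than running it inside the allocator):

#include <stdlib.h>

#define TAB_SIZE 256
#define NCPUS    8

static void *svc_tab[NCPUS];

static void free_tabs(void)
{
	for (int cpu = 0; cpu < NCPUS; cpu++)
		free(svc_tab[cpu]);	/* freeing NULL is harmless */
}

static int alloc_tabs(void)
{
	for (int cpu = 0; cpu < NCPUS; cpu++)
		svc_tab[cpu] = NULL;	/* so cleanup is always safe */

	for (int cpu = 0; cpu < NCPUS; cpu++) {
		svc_tab[cpu] = calloc(TAB_SIZE, sizeof(void *));
		if (!svc_tab[cpu]) {
			free_tabs();
			return -1;
		}
	}
	return 0;
}
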
timeout into cp from svc, + * due cp may use its user establish state timeout + * different from sysctl_ip_vs_tcp_timeouts + */ + (*cpp)->est_timeout = svc->est_timeout; + ip_vs_service_put(svc); return 1; } @@ -201,6 +209,24 @@ tcp_partial_csum_reset(int af, int len, struct tcphdr *tcph, tcph->check = ~tcp_v4_check(len, saddr->ip, daddr->ip, 0); } +static inline void +tcp_seq_csum_update(struct tcphdr *tcph, __u32 oldseq, __u32 newseq) +{ + /* do checksum later */ + if (!sysctl_ip_vs_csum_offload) + tcph->check = csum_fold(ip_vs_check_diff4(oldseq, newseq, + ~csum_unfold(tcph->check))); +} + +static inline void +tcp_mss_csum_update(struct tcphdr *tcph, __be16 oldmss, __be16 newmss) +{ + /* do checksum later */ + if (!sysctl_ip_vs_csum_offload) + tcph->check = csum_fold(ip_vs_check_diff2(oldmss, newmss, + ~csum_unfold(tcph->check))); +} + /* adjust tcp opt mss, sub TCPOLEN_CIP */ static void tcp_opt_adjust_mss(int af, struct tcphdr *tcph) { @@ -238,10 +264,9 @@ static void tcp_opt_adjust_mss(int af, struct tcphdr *tcph) else #endif in_mss -= TCPOLEN_ADDR; - *((__be16 *) ptr) = htons(in_mss);/* set mss, 16bit */ - tcph->check = csum_fold(ip_vs_check_diff2(old, - *(__be16 *) ptr, - ~csum_unfold(tcph->check))); + /* set mss, 16bit */ + *((__be16 *) ptr) = htons(in_mss); + tcp_mss_csum_update(tcph, old, *(__be16 *)ptr); return; } @@ -306,8 +331,7 @@ static int tcp_out_adjust_seq(struct ip_vs_conn *cp, struct tcphdr *tcph) /* adjust ack sequence */ tcph->ack_seq = htonl(ntohl(tcph->ack_seq) - cp->fnat_seq.delta); /* update checksum */ - tcph->check = csum_fold(ip_vs_check_diff4(old_seq, tcph->ack_seq, - ~csum_unfold(tcph->check))); + tcp_seq_csum_update(tcph, old_seq, tcph->ack_seq); /* adjust sack sequence */ ptr = (__u8 *) (tcph + 1); @@ -347,20 +371,14 @@ static int tcp_out_adjust_seq(struct ip_vs_conn *cp, struct tcphdr *tcph) old_seq = *tmp; *tmp = htonl(ntohl(*tmp) - cp->fnat_seq.delta); - tcph->check = - csum_fold(ip_vs_check_diff4( - old_seq, *tmp, - ~csum_unfold(tcph->check))); + tcp_seq_csum_update(tcph, old_seq, *tmp); tmp++; old_seq = *tmp; *tmp = htonl(ntohl(*tmp) - cp->fnat_seq.delta); - tcph->check = - csum_fold(ip_vs_check_diff4( - old_seq, *tmp, - ~csum_unfold(tcph->check))); + tcp_seq_csum_update(tcph, old_seq, *tmp); } return 1; } @@ -415,6 +433,13 @@ tcp_snat_handler(struct sk_buff *skb, return 0; } + if (sysctl_ip_vs_csum_offload) { + skb->csum_start = skb_network_header(skb) - + skb->head + (ip_hdr(skb)->ihl << 2); + skb->csum_offset = offsetof(struct tcphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + } + /* Adjust TCP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { tcp_partial_csum_reset(cp->af, (skb->len - tcphoff), @@ -516,6 +541,14 @@ tcp_fnat_out_handler(struct sk_buff *skb, tcp_out_init_seq(cp, tcph); } + /* do csum later */ + if (sysctl_ip_vs_csum_offload) { + skb->csum_start = skb_network_header(skb) - + skb->head + (ip_hdr(skb)->ihl << 2); + skb->csum_offset = offsetof(struct tcphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + } + /* Adjust TCP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { tcp_partial_csum_reset(cp->af, (skb->len - tcphoff), @@ -591,15 +624,18 @@ static void tcp_opt_remove_timestamp(struct tcphdr *tcph) * but data is 10Byte. 
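
tcp_seq_csum_update() and tcp_mss_csum_update() above skip the incremental fix-up when csum_offload is enabled, since the hardware recomputes the sum anyway. When they do run, the csum_fold/ip_vs_check_diff4 pair is the classic RFC 1624 incremental update; a standalone model of updating a 16-bit one's-complement checksum after a 32-bit field changes:

#include <stdint.h>
#include <stdio.h>

/* fold a 32-bit accumulator into 16 bits of one's-complement sum */
static uint16_t fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* RFC 1624 eqn. 3: check' = ~(~check + ~old + new), per 16-bit word */
static uint16_t csum_update32(uint16_t check, uint32_t oldv, uint32_t newv)
{
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~(oldv >> 16);
	sum += (uint16_t)~(oldv & 0xffff);
	sum += newv >> 16;
	sum += newv & 0xffff;
	return (uint16_t)~fold(sum);
}

int main(void)
{
	/* replacing a field with itself must leave the checksum intact */
	printf("0x%04x\n", csum_update32(0x1234, 0xdeadbeef, 0xdeadbeef));
	return 0;
}
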
zero the buf */
 	memset((__u8*)old, 0, sizeof(old));
-	memset((__u8*)new, 0, sizeof(new));
 	memcpy((__u8*)old, ptr - 2, TCPOLEN_TIMESTAMP);
 	for (i = 0; i < TCPOLEN_TIMESTAMP; i++) {
 		*(ptr - 2 + i) = TCPOPT_NOP;	/* TCPOPT_NOP replace timestamp opt */
 	}
-	memcpy((__u8*)new, ptr - 2, TCPOLEN_TIMESTAMP);
-	tcph->check = csum_fold(ip_vs_check_diff16(
-					old, new,
+
+	if (!sysctl_ip_vs_csum_offload) {
+		memset((__u8*)new, 0, sizeof(new));
+		memcpy((__u8*)new, ptr - 2, TCPOLEN_TIMESTAMP);
+		tcph->check = csum_fold(
+				ip_vs_check_diff16(old, new,
 					~csum_unfold(tcph->check)));
+	}
 
 	return;
 }
 
@@ -683,24 +719,21 @@ static void tcp_in_adjust_seq(struct ip_vs_conn *cp, struct tcphdr *tcph)
 	/* adjust seq for FULLNAT */
 	tcph->seq = htonl(ntohl(tcph->seq) + cp->fnat_seq.delta);
 	/* update checksum */
-	tcph->check = csum_fold(ip_vs_check_diff4(old_seq, tcph->seq,
-					~csum_unfold(tcph->check)));
+	tcp_seq_csum_update(tcph, old_seq, tcph->seq);
 
 	/* adjust ack_seq for SYNPROXY, include tcp hdr and sack opt */
 	ip_vs_synproxy_dnat_handler(tcph, &cp->syn_proxy_seq);
 }
 
 /*
- * add client address in tcp option
- * alloc a new skb, and free the old skb
- * return new skb
+ * add the client address (ip and port) as a tcp option, in place
+ * return 0 on success, non-zero if the option cannot be added
  */
-static struct sk_buff *tcp_opt_add_toa(struct ip_vs_conn *cp,
-				       struct sk_buff *old_skb,
+static int tcp_opt_add_toa(struct ip_vs_conn *cp,
+			   struct sk_buff *skb,
 			   struct tcphdr **tcph)
 {
 	__u32 mtu;
-	struct sk_buff *new_skb = NULL;
 	struct ip_vs_tcpo_addr *toa;
 	unsigned int tcphoff;
 	struct tcphdr *th;
@@ -709,45 +742,45 @@ static struct sk_buff *tcp_opt_add_toa(struct ip_vs_conn *cp,
 	/* now only process IPV4 */
 	if (cp->af != AF_INET) {
 		IP_VS_INC_ESTATS(ip_vs_esmib, FULLNAT_ADD_TOA_FAIL_PROTO);
-		return old_skb;
+		return 1;
 	}
 
 	/* skb length and tcp option length checking */
-	mtu = dst_mtu((struct dst_entry *)old_skb->_skb_dst);
-	if (old_skb->len > (mtu - sizeof(struct ip_vs_tcpo_addr))) {
+	if (skb->_skb_dst)
+		mtu = dst_mtu((struct dst_entry *)skb->_skb_dst);
+	else /* fast_xmit can reach here */
+		mtu = cp->dev_inside ? 
cp->dev_inside->mtu : + sizeof(struct ip_vs_tcpo_addr); + + if (skb->len > (mtu - sizeof(struct ip_vs_tcpo_addr))) { IP_VS_INC_ESTATS(ip_vs_esmib, FULLNAT_ADD_TOA_FAIL_LEN); - return old_skb; + return 1; } /* the maximum length of TCP head is 60 bytes, so only 40 bytes for options */ if ((60 - ((*tcph)->doff << 2)) < sizeof(struct ip_vs_tcpo_addr)) { IP_VS_INC_ESTATS(ip_vs_esmib, FULLNAT_ADD_TOA_HEAD_FULL); - return old_skb; + return 1; } - /* copy all skb, plus ttm space , new skb is linear */ - new_skb = skb_copy_expand(old_skb, - skb_headroom(old_skb), - skb_tailroom(old_skb) + - sizeof(struct ip_vs_tcpo_addr), GFP_ATOMIC); - if (new_skb == NULL) { + /* expand skb if needed */ + if ((sizeof(struct ip_vs_tcpo_addr) > skb_tailroom(skb)) && + pskb_expand_head(skb, 0, + sizeof(struct ip_vs_tcpo_addr), GFP_ATOMIC)){ IP_VS_INC_ESTATS(ip_vs_esmib, FULLNAT_ADD_TOA_FAIL_MEM); - return old_skb; + return 1; } - /* free old skb */ - kfree_skb(old_skb); - /* * add client ip */ - tcphoff = ip_hdrlen(new_skb); + tcphoff = ip_hdrlen(skb); /* get new tcp header */ *tcph = th = - (struct tcphdr *)((void *)skb_network_header(new_skb) + tcphoff); + (struct tcphdr *)((void *)skb_network_header(skb) + tcphoff); /* ptr to old opts */ - p = skb_tail_pointer(new_skb) - 1; + p = skb_tail_pointer(skb) - 1; q = p + sizeof(struct ip_vs_tcpo_addr); /* move data down, offset is sizeof(struct ip_vs_tcpo_addr) */ @@ -758,7 +791,7 @@ static struct sk_buff *tcp_opt_add_toa(struct ip_vs_conn *cp, } /* move tail to new postion */ - new_skb->tail += sizeof(struct ip_vs_tcpo_addr); + skb->tail += sizeof(struct ip_vs_tcpo_addr); /* put client ip opt , ptr point to opts */ toa = (struct ip_vs_tcpo_addr *)(th + 1); @@ -770,41 +803,43 @@ static struct sk_buff *tcp_opt_add_toa(struct ip_vs_conn *cp, /* reset tcp header length */ th->doff += sizeof(struct ip_vs_tcpo_addr) / 4; /* reset ip header totoal length */ - ip_hdr(new_skb)->tot_len = - htons(ntohs(ip_hdr(new_skb)->tot_len) + + ip_hdr(skb)->tot_len = + htons(ntohs(ip_hdr(skb)->tot_len) + sizeof(struct ip_vs_tcpo_addr)); /* reset skb length */ - new_skb->len += sizeof(struct ip_vs_tcpo_addr); + skb->len += sizeof(struct ip_vs_tcpo_addr); - /* re-calculate tcp csum */ - th->check = 0; - new_skb->csum = skb_checksum(new_skb, tcphoff, - new_skb->len - tcphoff, 0); - th->check = csum_tcpudp_magic(cp->caddr.ip, + /* re-calculate tcp csum, if no csum_offload */ + if (!sysctl_ip_vs_csum_offload) { + th->check = 0; + skb->csum = skb_checksum(skb, tcphoff, + skb->len - tcphoff, 0); + th->check = csum_tcpudp_magic(cp->caddr.ip, cp->vaddr.ip, - new_skb->len - tcphoff, - cp->protocol, new_skb->csum); + skb->len - tcphoff, + cp->protocol, skb->csum); + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + skb->ip_summed = CHECKSUM_COMPLETE; + skb_shinfo(skb)->gso_size = 0; + } + } /* re-calculate ip head csum, tot_len has been adjusted */ - ip_send_check(ip_hdr(new_skb)); + ip_send_check(ip_hdr(skb)); - if(new_skb->ip_summed == CHECKSUM_PARTIAL) { - new_skb->ip_summed = CHECKSUM_COMPLETE; - skb_shinfo(new_skb)->gso_size = 0; - } IP_VS_INC_ESTATS(ip_vs_esmib, FULLNAT_ADD_TOA_OK); - return new_skb; + return 0; } #ifdef CONFIG_IP_VS_IPV6 -static struct sk_buff *tcp_opt_add_toa_v6(struct ip_vs_conn *cp, - struct sk_buff *old_skb, +static int tcp_opt_add_toa_v6(struct ip_vs_conn *cp, + struct sk_buff *skb, struct tcphdr **tcph) { __u32 mtu; - struct sk_buff *new_skb = NULL; struct ip_vs_tcpo_addr_v6 *toa; unsigned int tcphoff; struct tcphdr *th; @@ -813,45 +848,45 @@ static struct sk_buff 
*tcp_opt_add_toa_v6(struct ip_vs_conn *cp, /* IPV6 */ if (cp->af != AF_INET6) { IP_VS_INC_ESTATS(ip_vs_esmib, FULLNAT_ADD_TOA_FAIL_PROTO); - return old_skb; + return 1; } /* skb length and tcph length checking */ - mtu = dst_mtu((struct dst_entry *)old_skb->_skb_dst); - if (old_skb->len > (mtu - sizeof(struct ip_vs_tcpo_addr_v6))) { + if (skb->_skb_dst) + mtu = dst_mtu((struct dst_entry *)skb->_skb_dst); + else /* fast_xmit can reach here */ + mtu = cp->dev_inside ? cp->dev_inside->mtu : + sizeof(struct ip_vs_tcpo_addr_v6); + + if (skb->len > (mtu - sizeof(struct ip_vs_tcpo_addr_v6))) { IP_VS_INC_ESTATS(ip_vs_esmib, FULLNAT_ADD_TOA_FAIL_LEN); - return old_skb; + return 1; } /* the maximum length of TCP head is 60 bytes, so only 40 bytes for options */ if ((60 - ((*tcph)->doff << 2)) < sizeof(struct ip_vs_tcpo_addr_v6)) { IP_VS_INC_ESTATS(ip_vs_esmib, FULLNAT_ADD_TOA_HEAD_FULL); - return old_skb; + return 1; } - /* copy all skb, plus ttm space , new skb is linear */ - new_skb = skb_copy_expand(old_skb, - skb_headroom(old_skb), - skb_tailroom(old_skb) + - sizeof(struct ip_vs_tcpo_addr_v6), GFP_ATOMIC); - if (new_skb == NULL) { + /* expand skb if needed */ + if ((sizeof(struct ip_vs_tcpo_addr_v6) > skb_tailroom(skb)) && + pskb_expand_head(skb, 0, + sizeof(struct ip_vs_tcpo_addr_v6), GFP_ATOMIC)){ IP_VS_INC_ESTATS(ip_vs_esmib, FULLNAT_ADD_TOA_FAIL_MEM); - return old_skb; + return 1; } - /* free old skb */ - kfree_skb(old_skb); - /* * add client ip */ tcphoff = sizeof(struct ipv6hdr); /* get new tcp header */ *tcph = th = - (struct tcphdr *)((void *)skb_network_header(new_skb) + tcphoff); + (struct tcphdr *)((void *)skb_network_header(skb) + tcphoff); /* ptr to old opts */ - p = skb_tail_pointer(new_skb) - 1; + p = skb_tail_pointer(skb) - 1; q = p + sizeof(struct ip_vs_tcpo_addr_v6); /* move data down, offset is sizeof(struct ip_vs_tcpo_addr) */ @@ -862,7 +897,7 @@ static struct sk_buff *tcp_opt_add_toa_v6(struct ip_vs_conn *cp, } /* move tail to new postion */ - new_skb->tail += sizeof(struct ip_vs_tcpo_addr_v6); + skb->tail += sizeof(struct ip_vs_tcpo_addr_v6); /* put client ip opt , ptr point to opts */ toa = (struct ip_vs_tcpo_addr_v6 *)(th + 1); @@ -874,29 +909,30 @@ static struct sk_buff *tcp_opt_add_toa_v6(struct ip_vs_conn *cp, /* reset tcp header length */ th->doff += sizeof(struct ip_vs_tcpo_addr_v6) >> 2; /* reset ip header totoal length */ - ipv6_hdr(new_skb)->payload_len = - htons(ntohs(ipv6_hdr(new_skb)->payload_len) + + ipv6_hdr(skb)->payload_len = + htons(ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ip_vs_tcpo_addr_v6)); /* reset skb length */ - new_skb->len += sizeof(struct ip_vs_tcpo_addr_v6); + skb->len += sizeof(struct ip_vs_tcpo_addr_v6); - /* re-calculate tcp csum */ - th->check = 0; - new_skb->csum = skb_checksum(new_skb, tcphoff, - new_skb->len - tcphoff, 0); - th->check = csum_ipv6_magic(&cp->caddr.in6, + /* re-calculate tcp csum, if no csum_offload */ + if (!sysctl_ip_vs_csum_offload) { + th->check = 0; + skb->csum = skb_checksum(skb, tcphoff, + skb->len - tcphoff, 0); + th->check = csum_ipv6_magic(&cp->caddr.in6, &cp->vaddr.in6, - new_skb->len - tcphoff, - cp->protocol, new_skb->csum); + skb->len - tcphoff, + cp->protocol, skb->csum); - if(new_skb->ip_summed == CHECKSUM_PARTIAL) { - new_skb->ip_summed = CHECKSUM_COMPLETE; - skb_shinfo(new_skb)->gso_size = 0; + if(skb->ip_summed == CHECKSUM_PARTIAL) { + skb->ip_summed = CHECKSUM_COMPLETE; + skb_shinfo(skb)->gso_size = 0; + } } - IP_VS_INC_ESTATS(ip_vs_esmib, FULLNAT_ADD_TOA_OK); - return new_skb; + return 0; } 
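
A userspace model of the rewritten TOA insertion above: grow the buffer in place when tailroom is short (the pskb_expand_head() switch away from skb_copy_expand()), then open a hole right after the fixed TCP header, matching the (th + 1) placement, and write an 8-byte option carrying the client ip:port. The option kind value is illustrative; the layout (1-byte kind, 1-byte length, 2-byte port, 4-byte IPv4 address) follows the shape that struct ip_vs_tcpo_addr suggests:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define TCP_HDR 20	/* fixed TCP header length */
#define TOA_LEN 8	/* kind + len + port + IPv4 address */

struct seg {
	uint8_t *data;
	size_t len, cap;
};

static int insert_toa(struct seg *s, uint16_t port_be, uint32_t addr_be)
{
	if (s->cap - s->len < TOA_LEN) {	/* expand in place if needed */
		uint8_t *p = realloc(s->data, s->len + TOA_LEN);

		if (!p)
			return 1;	/* the FULLNAT_ADD_TOA_FAIL_MEM case */
		s->data = p;
		s->cap = s->len + TOA_LEN;
	}

	/* shift options and payload up to make room after the header */
	memmove(s->data + TCP_HDR + TOA_LEN, s->data + TCP_HDR,
		s->len - TCP_HDR);
	s->data[TCP_HDR]     = 200;		/* option kind, illustrative */
	s->data[TCP_HDR + 1] = TOA_LEN;		/* option length */
	memcpy(s->data + TCP_HDR + 2, &port_be, 2);
	memcpy(s->data + TCP_HDR + 4, &addr_be, 4);
	s->len += TOA_LEN;

	/* a real implementation must also bump th->doff and tot_len and
	 * then fix the TCP checksum, exactly as the patch does */
	return 0;
}
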
#endif @@ -941,6 +977,13 @@ tcp_dnat_handler(struct sk_buff *skb, */ ip_vs_synproxy_dnat_handler(tcph, &cp->syn_proxy_seq); + if (sysctl_ip_vs_csum_offload) { + skb->csum_start = skb_network_header(skb) - + skb->head + (ip_hdr(skb)->ihl << 2); + skb->csum_offset = offsetof(struct tcphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + } + /* * Adjust TCP checksums */ @@ -976,13 +1019,12 @@ tcp_dnat_handler(struct sk_buff *skb, } static int -tcp_fnat_in_handler(struct sk_buff **skb_p, +tcp_fnat_in_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { struct tcphdr *tcph; unsigned int tcphoff; int oldlen; - struct sk_buff *skb = *skb_p; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -1022,10 +1064,10 @@ tcp_fnat_in_handler(struct sk_buff **skb_p, tcp_in_init_seq(cp, skb, tcph); #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) - skb = *skb_p = tcp_opt_add_toa_v6(cp, skb, &tcph); + tcp_opt_add_toa_v6(cp, skb, &tcph); else #endif - skb = *skb_p = tcp_opt_add_toa(cp, skb, &tcph); + tcp_opt_add_toa(cp, skb, &tcph); } /* TOA: add client ip */ @@ -1034,10 +1076,10 @@ tcp_fnat_in_handler(struct sk_buff **skb_p, && !tcph->syn && !tcph->rst && !tcph->fin) { #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) - skb = *skb_p = tcp_opt_add_toa_v6(cp, skb, &tcph); + tcp_opt_add_toa_v6(cp, skb, &tcph); else #endif - skb = *skb_p = tcp_opt_add_toa(cp, skb, &tcph); + tcp_opt_add_toa(cp, skb, &tcph); } /* @@ -1051,6 +1093,13 @@ tcp_fnat_in_handler(struct sk_buff **skb_p, tcph->source = cp->lport; tcph->dest = cp->dport; + if (sysctl_ip_vs_csum_offload) { + skb->csum_start = skb_network_header(skb) - + skb->head + (ip_hdr(skb)->ihl << 2); + skb->csum_offset = offsetof(struct tcphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + } + /* Adjust TCP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { tcp_partial_csum_reset(cp->af, (skb->len - tcphoff), @@ -1394,11 +1443,11 @@ int sysctl_ip_vs_tcp_timeouts[IP_VS_TCP_S_LAST + 1] = { [IP_VS_TCP_S_ESTABLISHED] = 90 * HZ, [IP_VS_TCP_S_SYN_SENT] = 3 * HZ, [IP_VS_TCP_S_SYN_RECV] = 30 * HZ, - [IP_VS_TCP_S_FIN_WAIT] = 3 * HZ, - [IP_VS_TCP_S_TIME_WAIT] = 3 * HZ, + [IP_VS_TCP_S_FIN_WAIT] = 7 * HZ, + [IP_VS_TCP_S_TIME_WAIT] = 7 * HZ, [IP_VS_TCP_S_CLOSE] = 3 * HZ, - [IP_VS_TCP_S_CLOSE_WAIT] = 3 * HZ, - [IP_VS_TCP_S_LAST_ACK] = 3 * HZ, + [IP_VS_TCP_S_CLOSE_WAIT] = 7 * HZ, + [IP_VS_TCP_S_LAST_ACK] = 7 * HZ, [IP_VS_TCP_S_LISTEN] = 2 * 60 * HZ, [IP_VS_TCP_S_SYNACK] = 30 * HZ, [IP_VS_TCP_S_LAST] = 2 * HZ, @@ -1586,7 +1635,8 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, } cp->old_state = cp->state; // old_state called when connection reused - cp->timeout = pp->timeout_table[cp->state = new_state]; + cp->timeout = ((cp->state = new_state) == IP_VS_TCP_S_ESTABLISHED) ? 
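
set_tcp_state() above now takes the connection's per-service est_timeout once the connection is ESTABLISHED and falls back to the global sysctl table for every other state. The selection logic in isolation, with illustrative values:

#include <stdio.h>

enum { S_ESTABLISHED, S_FIN_WAIT, S_LAST };

static const int tcp_timeouts[S_LAST] = {
	[S_ESTABLISHED] = 90,	/* seconds; overridden per service */
	[S_FIN_WAIT]	= 7,
};

static int conn_timeout(int state, int est_timeout)
{
	return state == S_ESTABLISHED ? est_timeout : tcp_timeouts[state];
}

int main(void)
{
	printf("%d %d\n", conn_timeout(S_ESTABLISHED, 300),
	       conn_timeout(S_FIN_WAIT, 300));	/* prints: 300 7 */
	return 0;
}
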
+ cp->est_timeout : sysctl_ip_vs_tcp_timeouts[new_state]; } /* diff --git a/kernel/net/netfilter/ipvs/ip_vs_stats.c b/kernel/net/netfilter/ipvs/ip_vs_stats.c index 3557e5d9..568c65f2 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_stats.c +++ b/kernel/net/netfilter/ipvs/ip_vs_stats.c @@ -1,99 +1,99 @@ -#include -#include -#include - - -int ip_vs_new_stats(struct ip_vs_stats **p) -{ - if(NULL == p) - return -EINVAL; - - *p = alloc_percpu(struct ip_vs_stats); - if(NULL == *p) { - pr_err("%s: allocate per cpu varible failed \n", __func__); - return -ENOMEM; - } - - /* Initial stats */ - ip_vs_zero_stats(*p); - - return 0; -} - -void ip_vs_del_stats(struct ip_vs_stats* p) -{ - if(NULL == p) - return; - - free_percpu(p); - - return; -} - -void ip_vs_zero_stats(struct ip_vs_stats* stats) -{ - int i = 0; - - if(NULL == stats) { - pr_err("%s: Invaild point \n", __func__); - return; - } - - for_each_online_cpu(i) { - ip_vs_stats_cpu(stats, i).conns = 0; - ip_vs_stats_cpu(stats, i).inpkts = 0; - ip_vs_stats_cpu(stats, i).outpkts = 0; - ip_vs_stats_cpu(stats, i).inbytes = 0; - ip_vs_stats_cpu(stats, i).outbytes = 0; - } - - return; -} - -void ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_dest *dest = cp->dest; - if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - ip_vs_stats_this_cpu(dest->stats).inpkts++; - ip_vs_stats_this_cpu(dest->stats).inbytes += skb->len; - - ip_vs_stats_this_cpu(dest->svc->stats).inpkts++; - ip_vs_stats_this_cpu(dest->svc->stats).inbytes += skb->len; - - ip_vs_stats_this_cpu(ip_vs_stats).inpkts++; - ip_vs_stats_this_cpu(ip_vs_stats).inbytes += skb->len; - } - - return; -} - -void ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_dest *dest = cp->dest; - if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - ip_vs_stats_this_cpu(dest->stats).outpkts++; - ip_vs_stats_this_cpu(dest->stats).outbytes += skb->len; - - ip_vs_stats_this_cpu(dest->svc->stats).outpkts++; - ip_vs_stats_this_cpu(dest->svc->stats).outbytes += skb->len; - - ip_vs_stats_this_cpu(ip_vs_stats).outpkts++; - ip_vs_stats_this_cpu(ip_vs_stats).outbytes += skb->len; - } - return; -} - -void ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) -{ - struct ip_vs_dest *dest = cp->dest; - if(dest) { - ip_vs_stats_this_cpu(dest->stats).conns++; - - ip_vs_stats_this_cpu(dest->svc->stats).conns++; - - ip_vs_stats_this_cpu(ip_vs_stats).conns++; - } - - return; -} - +#include +#include +#include + + +int ip_vs_new_stats(struct ip_vs_stats **p) +{ + if(NULL == p) + return -EINVAL; + + *p = alloc_percpu(struct ip_vs_stats); + if(NULL == *p) { + pr_err("%s: allocate per cpu varible failed \n", __func__); + return -ENOMEM; + } + + /* Initial stats */ + ip_vs_zero_stats(*p); + + return 0; +} + +void ip_vs_del_stats(struct ip_vs_stats* p) +{ + if(NULL == p) + return; + + free_percpu(p); + + return; +} + +void ip_vs_zero_stats(struct ip_vs_stats* stats) +{ + int i = 0; + + if(NULL == stats) { + pr_err("%s: Invaild point \n", __func__); + return; + } + + for_each_online_cpu(i) { + ip_vs_stats_cpu(stats, i).conns = 0; + ip_vs_stats_cpu(stats, i).inpkts = 0; + ip_vs_stats_cpu(stats, i).outpkts = 0; + ip_vs_stats_cpu(stats, i).inbytes = 0; + ip_vs_stats_cpu(stats, i).outbytes = 0; + } + + return; +} + +void ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + dest->stats.inpkts++; + dest->stats.inbytes += skb->len; + + dest->svc->stats.inpkts++; + 
dest->svc->stats.inbytes += skb->len; + + ip_vs_stats_this_cpu(ip_vs_stats).inpkts++; + ip_vs_stats_this_cpu(ip_vs_stats).inbytes += skb->len; + } + + return; +} + +void ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + dest->stats.outpkts++; + dest->stats.outbytes += skb->len; + + dest->svc->stats.outpkts++; + dest->svc->stats.outbytes += skb->len; + + ip_vs_stats_this_cpu(ip_vs_stats).outpkts++; + ip_vs_stats_this_cpu(ip_vs_stats).outbytes += skb->len; + } + return; +} + +void ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest = cp->dest; + if(dest) { + dest->stats.conns++; + + dest->svc->stats.conns++; + + ip_vs_stats_this_cpu(ip_vs_stats).conns++; + } + + return; +} + diff --git a/kernel/net/netfilter/ipvs/ip_vs_synproxy.c b/kernel/net/netfilter/ipvs/ip_vs_synproxy.c index c5de198c..3f15ce35 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_synproxy.c +++ b/kernel/net/netfilter/ipvs/ip_vs_synproxy.c @@ -2,6 +2,8 @@ #include #include #include +#include +#include #include #include @@ -20,10 +22,327 @@ #include #include +/* + * syncookies using MD5 algorithm + */ +static u32 net_secret[2][MD5_MESSAGE_BYTES / 4] ____cacheline_aligned; + +int ip_vs_net_secret_init(void) +{ + get_random_bytes(net_secret, sizeof(net_secret)); + return 0; +} + +#define COOKIEBITS 24 /* Upper bits store count */ +#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) + +static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, + u32 count, int c) +{ + u32 hash[MD5_DIGEST_WORDS]; + hash[0] = (__force u32)saddr; + hash[1] = (__force u32)daddr; + hash[2] = ((__force u16)sport << 16) + (__force u16)dport; + hash[3] = count; + + md5_transform(hash, net_secret[c]); + + return hash[0]; +} + +static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport, + __be16 dport, __u32 sseq, __u32 count, + __u32 data) +{ + /* + * Compute the secure sequence number. + * The output should be: + * HASH(sec1,saddr,sport,daddr,dport,sec1) + sseq + (count * 2^24) + * + (HASH(sec2,saddr,sport,daddr,dport,count,sec2) % 2^24). + * Where sseq is their sequence number and count increases every + * minute by 1. + * As an extra hack, we add a small "data" value that encodes the + * MSS into the second hash value. + */ + + return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + + sseq + (count << COOKIEBITS) + + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) + & COOKIEMASK)); +} + +/* + * This retrieves the small "data" value from the syncookie. + * If the syncookie is bad, the data returned will be out of + * range. This must be checked by the caller. + * + * The count value used to generate the cookie must be within + * "maxdiff" if the current (passed-in) "count". The return value + * is (__u32)-1 if this test fails. 
+ */ +static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr, + __be16 sport, __be16 dport, __u32 sseq, + __u32 count, __u32 maxdiff) +{ + __u32 diff; + + /* Strip away the layers from the cookie */ + cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; + + /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */ + diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS); + if (diff >= maxdiff) + return (__u32)-1; + + return (cookie - + cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) + & COOKIEMASK; /* Leaving the data behind */ +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static u32 cookie_hash_v6(struct in6_addr *saddr, struct in6_addr *daddr, + __be16 sport, __be16 dport, u32 count, int c) +{ + u32 secret[MD5_MESSAGE_BYTES / 4]; + u32 hash[MD5_DIGEST_WORDS]; + u32 i; + + memcpy(hash, saddr, 16); + for (i = 0; i < 4; i++) + secret[i] = net_secret[c][i] + ((__force u32 *)daddr)[i]; + + secret[4] = net_secret[c][4] + + (((__force u16)sport << 16) + (__force u16)dport); + + secret[5] = net_secret[c][5] + count; + + for (i = 6; i < MD5_MESSAGE_BYTES / 4; i++) + secret[i] = net_secret[c][i]; + + md5_transform(hash, secret); + + return hash[0]; +} + +static __u32 secure_tcp_syn_cookie_v6(struct in6_addr *saddr, struct in6_addr *daddr, + __be16 sport, __be16 dport, __u32 sseq, + __u32 count, __u32 data) +{ + return (cookie_hash_v6(saddr, daddr, sport, dport, 0, 0) + + sseq + (count << COOKIEBITS) + + ((cookie_hash_v6(saddr, daddr, sport, dport, count, 1) + data) + & COOKIEMASK)); +} + +static __u32 check_tcp_syn_cookie_v6(__u32 cookie, struct in6_addr *saddr, + struct in6_addr *daddr, __be16 sport, + __be16 dport, __u32 sseq, __u32 count, + __u32 maxdiff) +{ + __u32 diff; + + cookie -= cookie_hash_v6(saddr, daddr, sport, dport, 0, 0) + sseq; + + diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); + if (diff >= maxdiff) + return (__u32)-1; + + return (cookie - + cookie_hash_v6(saddr, daddr, sport, dport, count - diff, 1)) + & COOKIEMASK; +} +#endif + +/* + * This table has to be sorted and terminated with (__u16)-1. + * XXX generate a better table. + * Unresolved Issues: HIPPI with a 64k MSS is not well supported. + */ +static __u16 const msstab[] = { + 64 - 1, + 256 - 1, + 512 - 1, + 536 - 1, + 1024 - 1, + 1280 - 1, + 1440 - 1, + 1452 - 1, + 1460 - 1, + 4312 - 1, + (__u16)-1 +}; +/* The number doesn't include the -1 terminator */ +#define NUM_MSS (ARRAY_SIZE(msstab) - 1) + +/* + * This (misnamed) value is the age of syncookie which is permitted. + * Its ideal value should be dependent on TCP_TIMEOUT_INIT and + * sysctl_tcp_retries1. It's a rather complicated formula (exponential + * backoff) to compute at runtime so it's currently hardcoded here. + */ +#define COUNTER_TRIES 4 + +/* + * Generate a syncookie for ip_vs module. + * Besides mss, we store additional tcp options in cookie "data". + * + * Cookie "data" format: + * |[21][20][19-16][15-0]| + * [21] SACKOK + * [20] TimeStampOK + * [19-16] snd_wscale + * [15-12] MSSIND + */ +static __u32 syn_proxy_cookie_v4_init_sequence(struct sk_buff *skb, + struct ip_vs_synproxy_opt *opts) +{ + const struct iphdr *iph = ip_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); + int mssind; + const __u16 mss = opts->mss_clamp; + __u32 data = 0; + + /* XXX sort msstab[] by probability? Binary search? 
*/ + for (mssind = 0; mss > msstab[mssind + 1]; mssind++) + ; + opts->mss_clamp = msstab[mssind] + 1; + + data = ((mssind & 0x0f) << IP_VS_SYNPROXY_MSS_BITS); + data |= opts->sack_ok << IP_VS_SYNPROXY_SACKOK_BIT; + data |= opts->tstamp_ok << IP_VS_SYNPROXY_TSOK_BIT; + data |= ((opts->snd_wscale & 0x0f) << IP_VS_SYNPROXY_SND_WSCALE_BITS); + + return secure_tcp_syn_cookie(iph->saddr, iph->daddr, + th->source, th->dest, ntohl(th->seq), + jiffies / (HZ * 60), data); +} + +/* + * when syn_proxy_cookie_v4_init_sequence is used, we check + * cookie as follow: + * 1. mssind check. + * 2. get sack/timestamp/wscale options. + */ +static int syn_proxy_v4_cookie_check(struct sk_buff *skb, __u32 cookie, + struct ip_vs_synproxy_opt *opt) +{ + const struct iphdr *iph = ip_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); + __u32 seq = ntohl(th->seq) - 1; + __u32 mssind; + int ret = 0; + __u32 res = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, + th->source, th->dest, seq, + jiffies / (HZ * 60), + COUNTER_TRIES); + + if(res == (__u32)-1) /* count is invalid, jiffies' >> jiffies */ + goto out; + + mssind = (res & IP_VS_SYNPROXY_MSS_MASK) >> IP_VS_SYNPROXY_MSS_BITS; + + memset(opt, 0, sizeof(struct ip_vs_synproxy_opt)); + if ((mssind < NUM_MSS) && ((res & IP_VS_SYNPROXY_OTHER_MASK) == 0)) { + opt->mss_clamp = msstab[mssind] + 1; + opt->sack_ok = (res & IP_VS_SYNPROXY_SACKOK_MASK) >> + IP_VS_SYNPROXY_SACKOK_BIT; + opt->tstamp_ok = (res & IP_VS_SYNPROXY_TSOK_MASK) >> + IP_VS_SYNPROXY_TSOK_BIT; + opt->snd_wscale = (res & IP_VS_SYNPROXY_SND_WSCALE_MASK) >> + IP_VS_SYNPROXY_SND_WSCALE_BITS; + if (opt->snd_wscale > 0 && + opt->snd_wscale <= IP_VS_SYNPROXY_WSCALE_MAX) + opt->wscale_ok = 1; + else if (opt->snd_wscale == 0) + opt->wscale_ok = 0; + else + goto out; + + ret = 1; + } + +out: return ret; +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static __u32 syn_proxy_cookie_v6_init_sequence(struct sk_buff *skb, + struct ip_vs_synproxy_opt *opts) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); + int mssind; + const __u16 mss = opts->mss_clamp; + __u32 data = 0; + + /* XXX sort msstab[] by probability? Binary search? 
*/ + for (mssind = 0; mss > msstab[mssind + 1]; mssind++) + ; + opts->mss_clamp = msstab[mssind] + 1; + + data = ((mssind & 0x0f) << IP_VS_SYNPROXY_MSS_BITS); + data |= opts->sack_ok << IP_VS_SYNPROXY_SACKOK_BIT; + data |= opts->tstamp_ok << IP_VS_SYNPROXY_TSOK_BIT; + data |= ((opts->snd_wscale & 0x0f) << IP_VS_SYNPROXY_SND_WSCALE_BITS); + + return secure_tcp_syn_cookie_v6(&iph->saddr, &iph->daddr, + th->source, th->dest, ntohl(th->seq), + jiffies / (HZ * 60), data); +} + +int syn_proxy_v6_cookie_check(struct sk_buff * skb, __u32 cookie, + struct ip_vs_synproxy_opt * opt) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); + __u32 seq = ntohl(th->seq) - 1; + __u32 mssind; + int ret = 0; + __u32 res = check_tcp_syn_cookie_v6(cookie, &iph->saddr, &iph->daddr, + th->source, th->dest, seq, + jiffies / (HZ * 60), + COUNTER_TRIES); + + if(res == (__u32)-1) /* count is invalid, jiffies' >> jiffies */ + goto out; + + mssind = (res & IP_VS_SYNPROXY_MSS_MASK) >> IP_VS_SYNPROXY_MSS_BITS; + + memset(opt, 0, sizeof(struct ip_vs_synproxy_opt)); + + if ((mssind < NUM_MSS) && ((res & IP_VS_SYNPROXY_OTHER_MASK) == 0)) { + opt->mss_clamp = msstab[mssind] + 1; + opt->sack_ok = (res & IP_VS_SYNPROXY_SACKOK_MASK) >> + IP_VS_SYNPROXY_SACKOK_BIT; + opt->tstamp_ok = (res & IP_VS_SYNPROXY_TSOK_MASK) >> + IP_VS_SYNPROXY_TSOK_BIT; + opt->snd_wscale = (res & IP_VS_SYNPROXY_SND_WSCALE_MASK) >> + IP_VS_SYNPROXY_SND_WSCALE_BITS; + if (opt->snd_wscale > 0 && + opt->snd_wscale <= IP_VS_SYNPROXY_WSCALE_MAX) + opt->wscale_ok = 1; + else if (opt->snd_wscale == 0) + opt->wscale_ok = 0; + else + goto out; + + ret = 1; + } + +out: return ret; +} +#endif + +/* + * synproxy implementation + */ + + static inline void syn_proxy_seq_csum_update(struct tcphdr *tcph, __u32 old_seq, __u32 new_seq) { - tcph->check = csum_fold(ip_vs_check_diff4(old_seq, new_seq, + /* do checksum later */ + if (!sysctl_ip_vs_csum_offload) + tcph->check = csum_fold(ip_vs_check_diff4(old_seq, new_seq, ~csum_unfold(tcph->check))); } @@ -165,10 +484,10 @@ syn_proxy_reuse_skb(int af, struct sk_buff *skb, struct ip_vs_synproxy_opt *opt) skb_set_transport_header(skb, tcphoff); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) - isn = ip_vs_synproxy_cookie_v6_init_sequence(skb, opt); + isn = syn_proxy_cookie_v6_init_sequence(skb, opt); else #endif - isn = ip_vs_synproxy_cookie_v4_init_sequence(skb, opt); + isn = syn_proxy_cookie_v4_init_sequence(skb, opt); /* Set syn-ack flag * the tcp opt in syn/ack packet : 00010010 = 0x12 @@ -254,7 +573,16 @@ ip_vs_synproxy_syn_rcv(int af, struct sk_buff *skb, (svc = ip_vs_service_get(af, skb->mark, iph->protocol, &iph->daddr, th->dest)) - && (svc->flags & IP_VS_CONN_F_SYNPROXY)) { + && (svc->flags & IP_VS_SVC_F_SYNPROXY)) { + /* + * if service's weight is zero (no active realserver), + * then do nothing and drop the packet. + */ + if(svc->weight == 0) { + IP_VS_INC_ESTATS(ip_vs_esmib, SYNPROXY_NO_DEST); + ip_vs_service_put(svc); + goto syn_rcv_out; + } // release service here, because don't use it any all. 
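The cookie checks above invert the packing done by the init-sequence helpers. A stand-alone sketch of the 24-bit "data" layout, assuming the bit positions given in the "Cookie data format" comment earlier ([21] SACKOK, [20] TimeStampOK, [19-16] snd_wscale, [15-12] MSSIND); the real IP_VS_SYNPROXY_* constants live in headers this patch does not show:

#include <assert.h>
#include <stdint.h>

/* assumed positions, mirroring the format comment */
enum {
	MSS_SHIFT    = 12,	/* [15-12] MSSIND */
	WSCALE_SHIFT = 16,	/* [19-16] snd_wscale */
	TSOK_BIT     = 20,	/* [20] TimeStampOK */
	SACKOK_BIT   = 21,	/* [21] SACKOK */
};

static uint32_t pack(uint32_t mssind, uint32_t sack, uint32_t ts, uint32_t ws)
{
	return ((mssind & 0x0f) << MSS_SHIFT) | (sack << SACKOK_BIT)
	     | (ts << TSOK_BIT) | ((ws & 0x0f) << WSCALE_SHIFT);
}

int main(void)
{
	uint32_t d = pack(8, 1, 1, 7);	/* msstab[8] + 1 == 1460 */

	assert(((d >> MSS_SHIFT) & 0x0f) == 8);
	assert(((d >> WSCALE_SHIFT) & 0x0f) == 7);
	assert((d >> TSOK_BIT) & 1);
	assert((d >> SACKOK_BIT) & 1);
	assert(d < (1u << 24));		/* must fit below the count bits */
	return 0;
}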
ip_vs_service_put(svc); @@ -580,7 +908,7 @@ ip_vs_synproxy_ack_rcv(int af, struct sk_buff *skb, struct tcphdr *th, skb_set_transport_header(skb, iph->len); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { - res_cookie_check = ip_vs_synproxy_v6_cookie_check(skb, + res_cookie_check = syn_proxy_v6_cookie_check(skb, ntohl (th-> ack_seq) @@ -589,7 +917,7 @@ ip_vs_synproxy_ack_rcv(int af, struct sk_buff *skb, struct tcphdr *th, } else #endif { - res_cookie_check = ip_vs_synproxy_v4_cookie_check(skb, + res_cookie_check = syn_proxy_v4_cookie_check(skb, ntohl (th-> ack_seq) @@ -624,6 +952,13 @@ ip_vs_synproxy_ack_rcv(int af, struct sk_buff *skb, struct tcphdr *th, return 0; } + /* + * Set private establish state timeout into cp from svc, + * due cp may use its user establish state timeout + * different from sysctl_ip_vs_tcp_timeouts + */ + (*cpp)->est_timeout = svc->est_timeout; + /* * Release service, we don't need it any more. */ @@ -755,6 +1090,34 @@ void ip_vs_synproxy_dnat_handler(struct tcphdr *tcph, struct ip_vs_seq *sp_seq) } } +static inline void +ip_vs_synproxy_save_fast_xmit_info(struct sk_buff *skb, struct ip_vs_conn *cp) +{ + /* Save info for L2 fast xmit */ + if(sysctl_ip_vs_fast_xmit_inside && skb->dev && + likely(skb->dev->type == ARPHRD_ETHER) && + skb_mac_header_was_set(skb)) { + struct ethhdr *eth = (struct ethhdr *)skb_mac_header(skb); + + if(likely(cp->dev_inside == NULL)) { + cp->dev_inside = skb->dev; + dev_hold(cp->dev_inside); + } + + if (unlikely(cp->dev_inside != skb->dev)) { + dev_put(cp->dev_inside); + cp->dev_inside = skb->dev; + dev_hold(cp->dev_inside); + } + + memcpy(cp->src_hwaddr_inside, eth->h_source, ETH_ALEN); + memcpy(cp->dst_hwaddr_inside, eth->h_dest, ETH_ALEN); + IP_VS_INC_ESTATS(ip_vs_esmib, FAST_XMIT_SYNPROXY_SAVE_INSIDE); + IP_VS_DBG_RL("synproxy_save_fast_xmit netdevice:%s\n", + netdev_name(skb->dev)); + } +} + /* * Syn-proxy step 3 logic: receive syn-ack from rs * Update syn_proxy_seq.delta and send stored ack skbs @@ -791,8 +1154,8 @@ ip_vs_synproxy_synack_rcv(struct sk_buff *skb, struct ip_vs_conn *cp, cp->state == IP_VS_TCP_S_SYN_SENT) { cp->syn_proxy_seq.delta = htonl(cp->syn_proxy_seq.init_seq) - htonl(th->seq); - cp->timeout = pp->timeout_table[cp->state = - IP_VS_TCP_S_ESTABLISHED]; + cp->state = IP_VS_TCP_S_ESTABLISHED; + cp->timeout = cp->est_timeout; if (dest) { atomic_inc(&dest->activeconns); atomic_dec(&dest->inactconns); @@ -809,6 +1172,8 @@ ip_vs_synproxy_synack_rcv(struct sk_buff *skb, struct ip_vs_conn *cp, ntohs(th->dest)); } + ip_vs_synproxy_save_fast_xmit_info(skb, cp); + /* First: free stored syn skb */ if ((tmp_skb = xchg(&cp->syn_skb, NULL)) != NULL) { kfree_skb(tmp_skb); @@ -948,7 +1313,7 @@ ip_vs_synproxy_reuse_conn(int af, struct sk_buff *skb, skb_set_transport_header(skb, iph->len); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { - res_cookie_check = ip_vs_synproxy_v6_cookie_check(skb, + res_cookie_check = syn_proxy_v6_cookie_check(skb, ntohl (th-> ack_seq) @@ -957,7 +1322,7 @@ ip_vs_synproxy_reuse_conn(int af, struct sk_buff *skb, } else #endif { - res_cookie_check = ip_vs_synproxy_v4_cookie_check(skb, + res_cookie_check = syn_proxy_v4_cookie_check(skb, ntohl (th-> ack_seq) diff --git a/kernel/net/netfilter/ipvs/ip_vs_wrr.c b/kernel/net/netfilter/ipvs/ip_vs_wrr.c index 44e42e1a..96acd711 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_wrr.c +++ b/kernel/net/netfilter/ipvs/ip_vs_wrr.c @@ -153,8 +153,6 @@ static struct ip_vs_dest *ip_vs_wrr_schedule(struct ip_vs_service *svc, if (mark->cl == mark->cl->next) { /* no dest entry */ - 
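One consequence of copying svc->est_timeout into the connection here: judging from the ipvsadm man-page text later in this patch, a per-service value of zero is meant to fall back to the global ESTABLISHED timeout rather than be used literally. That selection reduces to a one-line helper; a sketch with illustrative names, not the patch's actual code:

/* zero means "inherit the global tcp ESTABLISHED timeout" */
static int effective_est_timeout(int svc_est_timeout, int global_est_timeout)
{
	return svc_est_timeout ? svc_est_timeout : global_est_timeout;
}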
IP_VS_ERR_RL("WRR: no destination available: " - "no destinations present\n"); dest = NULL; goto out; } @@ -168,8 +166,6 @@ static struct ip_vs_dest *ip_vs_wrr_schedule(struct ip_vs_service *svc, */ if (mark->cw == 0) { mark->cl = &svc->destinations; - IP_VS_ERR_RL("WRR: no destination " - "available\n"); dest = NULL; goto out; } @@ -191,8 +187,6 @@ static struct ip_vs_dest *ip_vs_wrr_schedule(struct ip_vs_service *svc, /* back to the start, and no dest is found. It is only possible when all dests are OVERLOADED */ dest = NULL; - IP_VS_ERR_RL("WRR: no destination available: " - "all destinations are overloaded\n"); goto out; } } @@ -203,9 +197,13 @@ static struct ip_vs_dest *ip_vs_wrr_schedule(struct ip_vs_service *svc, atomic_read(&dest->activeconns), atomic_read(&dest->refcnt), atomic_read(&dest->weight)); - out: write_unlock(&svc->sched_lock); return dest; + + out: + write_unlock(&svc->sched_lock); + IP_VS_ERR_RL("WRR: no destination available\n"); + return dest; } static struct ip_vs_scheduler ip_vs_wrr_scheduler = { diff --git a/kernel/net/netfilter/ipvs/ip_vs_xmit.c b/kernel/net/netfilter/ipvs/ip_vs_xmit.c index 7c096cc6..77923cdf 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_xmit.c +++ b/kernel/net/netfilter/ipvs/ip_vs_xmit.c @@ -810,6 +810,40 @@ ip_vs_fast_response_xmit_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, } #endif +static inline void +ip_vs_save_xmit_inside_info(struct sk_buff *skb, struct ip_vs_conn *cp) +{ + if(!sysctl_ip_vs_fast_xmit_inside) + return; + + if(!skb->dev) { + IP_VS_DBG_RL("%s(): skb->dev is NULL. \n", __func__); + return; + } + IP_VS_DBG_RL("%s(): netdevice:%s\n", netdev_name(skb->dev), __func__); + + if(likely((skb->dev->type == ARPHRD_ETHER) && + skb_mac_header_was_set(skb))) { + struct ethhdr *eth = (struct ethhdr *)skb_mac_header(skb); + + if(unlikely(cp->dev_inside == NULL)) { + cp->dev_inside = skb->dev; + dev_hold(cp->dev_inside); + } + + if (unlikely(cp->dev_inside != skb->dev)) { + dev_put(cp->dev_inside); + cp->dev_inside = skb->dev; + dev_hold(cp->dev_inside); + } + + memcpy(cp->src_hwaddr_inside, eth->h_source, ETH_ALEN); + memcpy(cp->dst_hwaddr_inside, eth->h_dest, ETH_ALEN); + } else { + IP_VS_DBG_RL("%s():save dev and mac failed!\n", __func__); + } +} + /* Response transmit to client * Used for NAT/Local. */ @@ -953,6 +987,8 @@ ip_vs_fnat_response_xmit(struct sk_buff *skb, struct ip_vs_protocol *pp, int mtu; struct iphdr *iph = ip_hdr(skb); + ip_vs_save_xmit_inside_info(skb, cp); + if(sysctl_ip_vs_fast_xmit && !ip_vs_fast_response_xmit(skb, pp, cp)) return NF_STOLEN; @@ -1017,6 +1053,8 @@ ip_vs_fnat_response_xmit_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, struct rt6_info *rt; /* Route to the other host */ int mtu; + ip_vs_save_xmit_inside_info(skb, cp); + if(sysctl_ip_vs_fast_xmit && !ip_vs_fast_response_xmit_v6(skb, pp, cp)) return NF_STOLEN; @@ -1219,6 +1257,180 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } #endif +/* fullnat mode */ +int +ip_vs_fast_xmit(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp) +{ + struct ethhdr *eth; + + if (!cp->dev_inside) + goto err; + if (!gso_ok(skb, cp->dev_inside) && (skb->len > cp->dev_inside->mtu)) + goto err; + + /* Try to reuse skb */ + if (unlikely(skb_shared(skb) || skb_cloned(skb))) { + struct sk_buff *new_skb = skb_copy(skb, GFP_ATOMIC); + if(unlikely(new_skb == NULL)) + goto err; + + /* Drop old skb */ + kfree_skb(skb); + skb = new_skb; + IP_VS_INC_ESTATS(ip_vs_esmib, FAST_XMIT_SKB_COPY); + } + + /* change ip, port. 
*/ + if ((cp->flags & IP_VS_CONN_F_FWD_MASK) == IP_VS_CONN_F_FULLNAT) { + if (pp->fnat_in_handler && !pp->fnat_in_handler(skb, pp, cp)) + goto err; + + ip_hdr(skb)->saddr = cp->laddr.ip; + ip_hdr(skb)->daddr = cp->daddr.ip; + } else { + IP_VS_ERR_RL("L2 fast xmit supports FULLNAT only\n"); + goto err; + /*if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) + goto err; + + ip_hdr(skb)->daddr = cp->daddr.ip;*/ + } + + ip_send_check(ip_hdr(skb)); + + skb->dev = cp->dev_inside; + + if(unlikely(skb_headroom(skb) < LL_RESERVED_SPACE(skb->dev))){ + struct sk_buff *skb2; + + IP_VS_ERR_RL("need more headroom! realloc skb\n"); + skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(skb->dev)); + if (skb2 == NULL) + goto err; + kfree_skb(skb); + skb = skb2; + } + + if(likely(skb_mac_header_was_set(skb))) { + eth = eth_hdr(skb); + memcpy(eth->h_dest, cp->src_hwaddr_inside, ETH_ALEN); + memcpy(eth->h_source, cp->dst_hwaddr_inside, ETH_ALEN); + skb->data = (unsigned char *)eth_hdr(skb); + skb->len += sizeof(struct ethhdr); + } else { + eth = (struct ethhdr *)skb_push(skb, sizeof(struct ethhdr)); + skb_reset_mac_header(skb); + memcpy(eth->h_dest, cp->src_hwaddr_inside, ETH_ALEN); + memcpy(eth->h_source, cp->dst_hwaddr_inside, ETH_ALEN); + } + skb->protocol = eth->h_proto = htons(ETH_P_IP); + skb->pkt_type = PACKET_OUTGOING; + + IP_VS_DBG_RL("%s: send skb to RS!\n", __func__); + + /* + * Send the packet out + * use do while here, just for local variables/debug/retransmit + */ + do { + int ret = dev_queue_xmit(skb); + if (ret != 0) + IP_VS_ERR_RL("dev_queue_xmit failed! code:%d\n", ret); + }while(0); + + IP_VS_INC_ESTATS(ip_vs_esmib, FAST_XMIT_PASS_INSIDE); + return 0; +err: + IP_VS_INC_ESTATS(ip_vs_esmib, FAST_XMIT_REJECT_INSIDE); + return 1; +} + +#ifdef CONFIG_IP_VS_IPV6 +/* just for fullnat mode */ +int +ip_vs_fast_xmit_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp) +{ + struct ethhdr *eth; + + if (!cp->dev_inside) + goto err; + if (!gso_ok(skb, cp->dev_inside) && (skb->len > cp->dev_inside->mtu)) + goto err; + + /* Try to reuse skb if possible */ + if (unlikely(skb_shared(skb) || skb_cloned(skb))) { + struct sk_buff *new_skb = skb_copy(skb, GFP_ATOMIC); + if(unlikely(new_skb == NULL)) + goto err; + + /* Drop old skb */ + kfree_skb(skb); + skb = new_skb; + IP_VS_INC_ESTATS(ip_vs_esmib, FAST_XMIT_SKB_COPY); + } + + /* change ip, port. */ + if ((cp->flags & IP_VS_CONN_F_FWD_MASK) == IP_VS_CONN_F_FULLNAT) { + if (pp->fnat_in_handler && !pp->fnat_in_handler(skb, pp, cp)) + goto err; + + ipv6_hdr(skb)->saddr = cp->laddr.in6; + ipv6_hdr(skb)->daddr = cp->daddr.in6; + } else { + IP_VS_ERR_RL("L2 fast xmit supports FULLNAT only\n"); + goto err; + /*if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) + goto err; + + ipv6_hdr(skb)->daddr = cp->daddr.in6;*/ + } + + skb->dev = cp->dev_inside; + + if(unlikely(skb_headroom(skb) < LL_RESERVED_SPACE(skb->dev))){ + struct sk_buff *skb2; + + IP_VS_ERR_RL("need more headroom!
realloc skb\n"); + skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(skb->dev)); + if (skb2 == NULL) + goto err; + kfree_skb(skb); + skb = skb2; + } + + if(likely(skb_mac_header_was_set(skb))) { + eth = eth_hdr(skb); + memcpy(eth->h_dest, cp->src_hwaddr_inside, ETH_ALEN); + memcpy(eth->h_source, cp->dst_hwaddr_inside, ETH_ALEN); + skb->data = (unsigned char *)eth_hdr(skb); + skb->len += sizeof(struct ethhdr); + } else { + eth = (struct ethhdr *)skb_push(skb, sizeof(struct ethhdr)); + skb_reset_mac_header(skb); + memcpy(eth->h_dest, cp->src_hwaddr_inside, ETH_ALEN); + memcpy(eth->h_source, cp->dst_hwaddr_inside, ETH_ALEN); + } + skb->protocol = eth->h_proto = htons(ETH_P_IPV6); + skb->pkt_type = PACKET_OUTGOING; + + IP_VS_DBG_RL("%s: send skb to RS!\n", __func__); + /* Send the packet out */ + do { + int ret = dev_queue_xmit(skb); + if (ret != 0) + IP_VS_ERR_RL("dev_queue_xmit failed! code:%d\n", ret); + }while(0); + + IP_VS_INC_ESTATS(ip_vs_esmib, FAST_XMIT_PASS_INSIDE); + return 0; +err: + IP_VS_INC_ESTATS(ip_vs_esmib, FAST_XMIT_REJECT_INSIDE); + return 1; +} +#endif + void ip_vs_save_xmit_info(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) @@ -1438,6 +1650,11 @@ ip_vs_fnat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } + ip_vs_save_xmit_info(skb, pp, cp); + + if(sysctl_ip_vs_fast_xmit_inside && !ip_vs_fast_xmit(skb, pp, cp)) + return NF_STOLEN; + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) goto tx_error_icmp; @@ -1453,8 +1670,6 @@ ip_vs_fnat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; } - ip_vs_save_xmit_info(skb, pp, cp); - /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, sizeof(struct iphdr))) goto tx_error_put; @@ -1467,7 +1682,7 @@ ip_vs_fnat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb_dst_set(skb, &rt->u.dst); /* mangle the packet */ - if (pp->fnat_in_handler && !pp->fnat_in_handler(&skb, pp, cp)) + if (pp->fnat_in_handler && !pp->fnat_in_handler(skb, pp, cp)) goto tx_error; ip_hdr(skb)->saddr = cp->laddr.ip; ip_hdr(skb)->daddr = cp->daddr.ip; @@ -1519,6 +1734,11 @@ ip_vs_fnat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } + ip_vs_save_xmit_info(skb, pp, cp); + + if(sysctl_ip_vs_fast_xmit_inside && !ip_vs_fast_xmit_v6(skb, pp, cp)) + return NF_STOLEN; + rt = __ip_vs_get_out_rt_v6(cp); if (!rt) goto tx_error_icmp; @@ -1534,8 +1754,6 @@ ip_vs_fnat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; } - ip_vs_save_xmit_info(skb, pp, cp); - /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) goto tx_error_put; @@ -1548,7 +1766,7 @@ ip_vs_fnat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb_dst_set(skb, &rt->u.dst); /* mangle the packet */ - if (pp->fnat_in_handler && !pp->fnat_in_handler(&skb, pp, cp)) + if (pp->fnat_in_handler && !pp->fnat_in_handler(skb, pp, cp)) goto tx_error; ipv6_hdr(skb)->saddr = cp->laddr.in6; ipv6_hdr(skb)->daddr = cp->daddr.in6; diff --git a/tools/ipvsadm/ipvsadm.8 b/tools/ipvsadm/ipvsadm.8 index f0959474..e8711755 100644 --- a/tools/ipvsadm/ipvsadm.8 +++ b/tools/ipvsadm/ipvsadm.8 @@ -424,6 +424,21 @@ One-packet scheduling. Used in conjunction with a UDP virtual service or a fwmark virtual service that handles only UDP packets. All connections are created such that they only schedule one packet. +.TP +.B -j [enable/disable], --synproxy [enable/disable] +Synproxy switch. 
+Enable or disable ipvs synproxy, which defends against TCP SYN flood attacks. +This option is only relevant for the -A or -E commands. The default +is disable. +.TP +.B -V, --vsestablish-timeout [timeout] +Specify the timeout applied when a session created by the virtual service transitions to the established state. +If this option is not specified, the virtual service uses the global established-state timeout +when creating sessions. The global established-state timeout is set through the proc filesystem, and it and the +other per-state timeouts can be found in /proc/sys/net/ipv4/vs/timeout_*. +.sp +\fBNote:\fR Setting a value of zero forces the IPVS kernel to use the global established-state timeout +for the virtual service. .SH EXAMPLE 1 - Simple Virtual Service The following commands configure a Linux Director to distribute incoming requests addressed to port 80 on 207.175.44.110 equally to diff --git a/tools/ipvsadm/ipvsadm.c b/tools/ipvsadm/ipvsadm.c index 6d694d64..28f77ba9 100644 --- a/tools/ipvsadm/ipvsadm.c +++ b/tools/ipvsadm/ipvsadm.c @@ -188,9 +188,9 @@ static const char* optnames[] = { "syncid", "exact", "ops", - "pe" , - "local-address" , - "synproxy" , + "pe", + "local-address", + "synproxy", }; /* @@ -203,24 +203,24 @@ static const char* optnames[] = { */ static const char commands_v_options[NUMBER_OF_CMD][NUMBER_OF_OPT] = { - /* -n -c svc -s -p -M -r fwd -w -x -y -mc tot dmn -st -rt thr -pc srt sid -ex ops pe laddr syn*/ -/*ADD*/ {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', ' '}, -/*EDIT*/ {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', ' '}, -/*DEL*/ {'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}, -/*FLUSH*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}, -/*LIST*/ {' ', '1', '1', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', '1', '1', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x'}, -/*ADDSRV*/ {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}, -/*DELSRV*/ {'x', 'x', '+', 'x', 'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}, -/*EDITSRV*/ {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}, -/*TIMEOUT*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}, -/*STARTD*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x'}, -/*STOPD*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x'}, -/*RESTORE*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}, -/*SAVE*/ {' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}, -/*ZERO*/ {'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}, -/*ADDLADDR*/{'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', '+', 'x'}, -/*DELLADDR*/{'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x',
'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', '+', 'x'}, -/*GETLADDR*/{'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}, + /* -n -c svc -s -p -M -r fwd -w -x -y -mc tot dmn -st -rt thr -pc srt sid -ex ops pe laddr syn -V*/ +/*ADD*/ {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', ' ', ' '}, +/*EDIT*/ {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', ' ', ' '}, +/*DEL*/ {'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}, +/*FLUSH*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}, +/*LIST*/ {' ', '1', '1', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', '1', '1', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x'}, +/*ADDSRV*/ {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}, +/*DELSRV*/ {'x', 'x', '+', 'x', 'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}, +/*EDITSRV*/ {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}, +/*TIMEOUT*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}, +/*STARTD*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x'}, +/*STOPD*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x'}, +/*RESTORE*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}, +/*SAVE*/ {' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}, +/*ZERO*/ {'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}, +/*ADDLADDR*/{'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', '+', 'x', 'x'}, +/*DELLADDR*/{'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', '+', 'x', 'x'}, +/*GETLADDR*/{'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}, }; /* printing format flags */ @@ -272,6 +272,7 @@ enum { TAG_SORT, TAG_NO_SORT, TAG_PERSISTENCE_ENGINE, + TAG_SET_VS_EST_TIMEOUT, }; /* various parsing helpers & parsing functions */ @@ -308,7 +309,7 @@ static void list_all(unsigned int format); static void list_timeout(void); static void list_daemon(void); static int list_laddrs(ipvs_service_t *svc , int with_title); -static int list_all_laddrs(void); +static int list_all_laddrs(unsigned int format); static int modprobe_ipvs(void); static void check_ipvs_version(void); @@ -422,6 +423,8 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce, NULL, NULL }, { "laddr", 'z', POPT_ARG_STRING, &optarg, 'z', NULL, NULL }, { "synproxy", 'j' , POPT_ARG_STRING, &optarg, 'j', NULL, NULL }, + { "vsestablish-timeout", 'V', 
POPT_ARG_STRING|POPT_ARGFLAG_OPTIONAL, &optarg, TAG_SET_VS_EST_TIMEOUT, + NULL, NULL }, { NULL, 0, 0, NULL, 0, NULL, NULL } }; @@ -684,12 +687,21 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce, set_option(options, OPT_SYNPROXY); if(!memcmp(optarg , "enable" , strlen("enable"))) - ce->svc.flags = ce->svc.flags | IP_VS_CONN_F_SYNPROXY; + ce->svc.flags = ce->svc.flags | IP_VS_SVC_F_SYNPROXY; else if(!memcmp(optarg , "disable" , strlen("disable"))) - ce->svc.flags = ce->svc.flags & (~IP_VS_CONN_F_SYNPROXY); + ce->svc.flags = ce->svc.flags & (~IP_VS_SVC_F_SYNPROXY); else fail(2 , "synproxy switch must be enable or disable\n"); + break; + } + case TAG_SET_VS_EST_TIMEOUT: + { + unsigned est_timeout = atoi(optarg); + set_option(options, OPT_VS_EST_TIMEOUT); + if(!est_timeout) + fail(2 , "vsestablish-timeout should not be zero\n"); + ce->svc.est_timeout = est_timeout; break; } default: @@ -832,6 +844,7 @@ static int process_options(int argc, char **argv, int reading_stdin) case CMD_SAVE: format |= FMT_RULE; list_all(format); + result = list_all_laddrs(format); return 0; case CMD_FLUSH: @@ -890,7 +903,7 @@ static int process_options(int argc, char **argv, int reading_stdin) if(options & OPT_SERVICE) result = list_laddrs(&ce.svc , 1); else - result = list_all_laddrs(); + result = list_all_laddrs(format); break; } @@ -1198,7 +1211,9 @@ static void usage_exit(const char *program, const int exit_status) " --nosort disable sorting output of service/server entries\n" " --sort does nothing, for backwards compatibility\n" " --ops -o one-packet scheduling\n" - " --numeric -n numeric output of addresses and ports\n", + " --numeric -n numeric output of addresses and ports\n" + " --synproxy -j [enable/disable] enable/disable ipvs synproxy to defend against tcp syn-flood attacks\n" + " --vsestablish-timeout -V [timeout] set the per-vs established state timeout\n", DEF_SCHED); exit(exit_status); @@ -1425,20 +1440,27 @@ static inline char *fwd_switch(unsigned flags) switch (flags & IP_VS_CONN_F_FWD_MASK) { case IP_VS_CONN_F_MASQ: swt = "-m"; break; + case IP_VS_CONN_F_FULLNAT: + swt = "-b"; break; case IP_VS_CONN_F_TUNNEL: swt = "-i"; break; case IP_VS_CONN_F_LOCALNODE: case IP_VS_CONN_F_DROUTE: swt = "-g"; break; } - return swt; + + if(NULL == swt) { + printf("The fwd method is not supported by this ipvsadm; please check the versions of ipvsadm and the ip_vs kernel module\n"); + exit(1); + } else + return swt; } static void print_largenum(unsigned long long i, unsigned int format) { char mytmp[32]; - size_t len; + int len; if (format & FMT_EXACT) { len = snprintf(mytmp, 32, "%llu", i); @@ -1482,7 +1504,7 @@ static void print_title(unsigned int format) "Prot LocalAddress:Port", "Weight", "PersistConn", "ActiveConn", "InActConn"); else if (!(format & FMT_RULE)) - printf("Prot LocalAddress:Port Scheduler Flags\n" + printf("Prot LocalAddress:Port Scheduler Established(Sec.) 
Flags\n" " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); } @@ -1550,6 +1572,12 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format) printf(" pe %s", se->pe_name); if (se->flags & IP_VS_SVC_F_ONEPACKET) printf(" ops"); + if (se->flags & IP_VS_SVC_F_SYNPROXY) + printf(" -j enable"); + else + printf(" -j disable"); + if(se->est_timeout) /* Kernel version may not be compatible */ + printf(" -V %u", se->est_timeout); } else if (format & FMT_STATS) { printf("%-33s", svc_name); print_largenum(se->stats.conns, format); @@ -1565,7 +1593,11 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format) print_largenum(se->stats.inbps, format); print_largenum(se->stats.outbps, format); } else { - printf("%s %s", svc_name, se->sched_name); + char est_timeout_str[18] = {0}; /* Field wide is 17(char) for est_timeout display */ + se->est_timeout ? + snprintf(est_timeout_str, sizeof(est_timeout_str), "%u", se->est_timeout): + snprintf(est_timeout_str, sizeof(est_timeout_str), "UnK. or DFLT. 0"); /* Unknow or Default is 0 */ + printf("%-22s %-9s %-17s", svc_name, se->sched_name, est_timeout_str); if (se->flags & IP_VS_SVC_F_PERSISTENT) { printf(" persistent %u", se->timeout); if (se->af == AF_INET) @@ -1582,7 +1614,7 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format) if (se->flags & IP_VS_SVC_F_ONEPACKET) printf(" ops"); } - if (se->flags & IP_VS_CONN_F_SYNPROXY) + if (se->flags & IP_VS_SVC_F_SYNPROXY) printf(" synproxy"); } printf("\n"); @@ -1655,7 +1687,7 @@ static void list_laddrs_print_service(struct ip_vs_get_laddrs *d) if (!(vname = addrport_to_anyname(d->af, &d->addr, ntohs(d->port), d->protocol, FMT_NUMERIC))) - fail(2, "addrport_to_anyname: %s", strerror(errno)); + fail(2, "addrport_to_anyname: %s", strerror(errno)); printf("%-20s %-8u \n" , vname , d->num_laddrs); free(vname); @@ -1684,19 +1716,43 @@ static void list_laddrs_print_laddr(struct ip_vs_laddr_entry * entry) static void print_service_and_laddrs(struct ip_vs_get_laddrs* d, int with_title) { int i = 0; - + if(with_title) list_laddrs_print_title(); list_laddrs_print_service(d); - for(i = 0 ; i < d->num_laddrs ; i ++){ - list_laddrs_print_laddr(d->entrytable + i); + for(i = 0; i < d->num_laddrs; i++){ + list_laddrs_print_laddr(d->entrytable + i); } return; } +static void print_laddrs_rules(struct ip_vs_get_laddrs *d, unsigned int flag_num) +{ + int i = 0; + char* vname; + char svc_name[64]; + char laddr[32]; + + /*To get VIP and VPORT numeric or name resolved*/ + if (!(vname = addrport_to_anyname(d->af, &d->addr, ntohs(d->port),d->protocol, flag_num))) + fail(2, "addrport_to_anyname: %s", strerror(errno)); + + /*To get Service as formated -- "-t/-u VIP:VPORT" */ + sprintf(svc_name, "%s %s", d->protocol == IPPROTO_TCP?"-t":"-u", vname); + + for(i = 0; i < d->num_laddrs; i++){ + /*To Get one local address which is belonged to this svc*/ + sprintf(laddr, "%u.%u.%u.%u" , PRINT_NIP(d->entrytable[i].addr.ip)); + printf("-P %s -z %s\n", svc_name, laddr); + } + + free(vname); +} + + static int list_laddrs(ipvs_service_t *svc , int with_title) { ipvs_service_entry_t *entry; @@ -1723,7 +1779,7 @@ static int list_laddrs(ipvs_service_t *svc , int with_title) } -static int list_all_laddrs(void) +static int list_all_laddrs(unsigned int format) { struct ip_vs_get_services *get; struct ip_vs_get_laddrs *d; @@ -1741,20 +1797,25 @@ static int list_all_laddrs(void) fprintf(stderr, "%s\n", ipvs_strerror(errno)); return -1; } - + + if(format & FMT_RULE){ + print_laddrs_rules(d, format & FMT_NUMERIC); + 
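For reference, print_laddrs_rules() above emits one restorable line per local address, so ipvsadm -S output for a FULLNAT service gains entries of this shape (the service is taken from the man-page example; the local addresses are illustrative):

-P -t 207.175.44.110:80 -z 10.10.10.1
-P -t 207.175.44.110:80 -z 10.10.10.2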
+ free(d); + continue; + } + if(i != 0) title_enable = 0; print_service_and_laddrs(d, title_enable); free(d); } - + free(get); return 0; - } - static void list_service(ipvs_service_t *svc, unsigned int format) { ipvs_service_entry_t *entry; diff --git a/tools/keepalived/doc/man/man5/keepalived.conf.5 b/tools/keepalived/doc/man/man5/keepalived.conf.5 index ca35420a..02f4c632 100644 --- a/tools/keepalived/doc/man/man5/keepalived.conf.5 +++ b/tools/keepalived/doc/man/man5/keepalived.conf.5 @@ -351,6 +351,8 @@ A virtual_server can be a declaration of one of # Script to launch when quorum is lost. quorum_down | + # virtual service privated establish state timeout. + est_timeout # setup realserver(s) diff --git a/tools/keepalived/keepalived/check/check_daemon.c b/tools/keepalived/keepalived/check/check_daemon.c index 7587a7d9..44402cd8 100644 --- a/tools/keepalived/keepalived/check/check_daemon.c +++ b/tools/keepalived/keepalived/check/check_daemon.c @@ -53,6 +53,7 @@ stop_check(void) signal_handler_destroy(); thread_destroy_master(master); free_checkers_queue(); + free_vip_queue(); free_ssl(); ipvs_stop(); @@ -109,8 +110,10 @@ start_check(void) } /* Processing differential configuration parsing */ - if (reload) + if (reload) { clear_diff_services(); + free_vip_queue(); + } /* Initialize IPVS topology */ if (!init_services()) { @@ -129,6 +132,7 @@ start_check(void) init_interface_linkbeat(); #endif + init_vip_queue(); /* Register checkers thread */ register_checkers_thread(); } diff --git a/tools/keepalived/keepalived/check/check_data.c b/tools/keepalived/keepalived/check/check_data.c index 80c18284..004df5d4 100644 --- a/tools/keepalived/keepalived/check/check_data.c +++ b/tools/keepalived/keepalived/check/check_data.c @@ -27,10 +27,12 @@ #include "memory.h" #include "utils.h" #include "ipwrapper.h" +#include "ipvswrapper.h" /* global vars */ check_conf_data *check_data = NULL; check_conf_data *old_check_data = NULL; +list vip_queue; /* SSL facility functions */ SSL_DATA * @@ -256,6 +258,11 @@ dump_vs(void *data) if (atoi(vs->timeout_persistence) > 0) log_message(LOG_INFO, " persistence timeout = %s", vs->timeout_persistence); + if (!atoi(vs->est_timeout)) + log_message(LOG_INFO, " vs privated establish state timeout = Default"); + else + log_message(LOG_INFO, " vs privated establish state timeout = %s", + vs->est_timeout); if (vs->granularity_persistence) log_message(LOG_INFO, " persistence granularity = %s", inet_ntop2(vs->granularity_persistence)); @@ -263,7 +270,7 @@ dump_vs(void *data) (vs->service_type == IPPROTO_TCP) ? "TCP" : "UDP"); log_message(LOG_INFO, " alpha is %s, omega is %s", vs->alpha ? "ON" : "OFF", vs->omega ? "ON" : "OFF"); - log_message(LOG_INFO, " SYN proxy is %s", + log_message(LOG_INFO, " SYN proxy is %s", vs->syn_proxy ? 
"ON" : "OFF"); log_message(LOG_INFO, " quorum = %lu, hysteresis = %lu", vs->quorum, vs->hysteresis); if (vs->quorum_up) @@ -324,6 +331,7 @@ alloc_vs(char *ip, char *port) new->delay_loop = KEEPALIVED_DEFAULT_DELAY; strncpy(new->timeout_persistence, "0", 1); + strncpy(new->est_timeout, "0", 1); new->virtualhost = NULL; new->alpha = 0; new->omega = 0; @@ -446,3 +454,153 @@ dump_check_data(check_conf_data *check_data) } dump_checkers_queue(); } + +static void +free_vip_data(void *data) +{ + FREE(data); +} + +void +free_vip_queue(void) +{ + free_list(vip_queue); + vip_queue = NULL; +} + +inline void +clear_port(struct sockaddr_storage *addr) +{ + if(addr->ss_family == AF_INET) { + struct sockaddr_in *addr_v4 = (struct sockaddr_in *)addr; + addr_v4->sin_port = 0; + } else { + struct sockaddr_in6 *addr_v6 = (struct sockaddr_in6 *)addr; + addr_v6->sin6_port = 0; + } +} + +void +queue_vip(struct sockaddr_storage *addr, int state) +{ + element e; + vip_data *ip_entry; + vip_data *new; + + for (e = LIST_HEAD(vip_queue); e; ELEMENT_NEXT(e)) { + ip_entry = ELEMENT_DATA(e); + if (sockstorage_equal(&ip_entry->addr, addr)) { + ip_entry->entry_cnt++; + if (state == UP) + ip_entry->set_cnt++; + + return; + } + } + + new = (vip_data *) MALLOC(sizeof(vip_data)); + new->addr = *addr; + new->entry_cnt = 1; + if (state == UP) + new->set_cnt = 1; + + log_message(LOG_INFO, "enqueue VIP = %s, VPORT = %d" + , inet_sockaddrtos(&new->addr) + , ntohs(inet_sockaddrport(&new->addr))); + list_add(vip_queue, new); +} + +void +count_vip_group_range(virtual_server_group_entry *vsg_entry, virtual_server *vs) +{ + uint32_t addr_ip, ip; + struct in6_addr *addr_v6; + struct in_addr *addr_v4; + struct sockaddr_storage addr; + + addr = vsg_entry->addr; + /* record ip, ignore port */ + clear_port(&addr); + + if (vsg_entry->addr.ss_family == AF_INET) { + addr_v4 = &(((struct sockaddr_in *) &addr)->sin_addr); + ip = addr_v4->s_addr; + for (addr_ip = ip; + ((addr_ip >> 24) & 0xFF) <= vsg_entry->range; + addr_ip += 0x01000000) { + addr_v4->s_addr = addr_ip; + queue_vip(&addr, vs->quorum_state); + } + } else { + addr_v6 = &(((struct sockaddr_in6 *) &addr)->sin6_addr); + ip = addr_v6->s6_addr32[3]; + for (addr_ip = ip; + ((addr_ip >> 24) & 0xFF) <= vsg_entry->range; + addr_ip += 0x01000000) { + addr_v6->s6_addr32[3] = addr_ip; + queue_vip(&addr, vs->quorum_state); + } + } +} + +void +count_vip_group(virtual_server_group *vsg, virtual_server *vs) +{ + virtual_server_group_entry *vsg_entry; + list l; + element e; + struct sockaddr_storage addr; + + if (!vsg) return; + + /* visit addr_ip list */ + l = vsg->addr_ip; + for (e = LIST_HEAD(l); e; ELEMENT_NEXT(e)) { + vsg_entry = ELEMENT_DATA(e); + addr = vsg_entry->addr; + /* record ip, ignore port */ + clear_port(&addr); + queue_vip(&addr, vs->quorum_state); + } + + /* visit range list */ + l = vsg->range; + for (e = LIST_HEAD(l); e; ELEMENT_NEXT(e)) { + vsg_entry = ELEMENT_DATA(e); + count_vip_group_range(vsg_entry, vs); + } +} + +/* scan all vs in conf, count the vips */ +void +count_vip(void) +{ + element e; + list l = check_data->vs; + virtual_server *vs; + virtual_server_group *vsg; + struct sockaddr_storage addr; + + if (LIST_ISEMPTY(l)) + return; + + for (e = LIST_HEAD(l); e; ELEMENT_NEXT(e)) { + vs = ELEMENT_DATA(e); + + if (vs->vsgname) { + vsg = ipvs_get_group_by_name(vs->vsgname, check_data->vs_group); + count_vip_group(vsg, vs); + } else if (!vs->vfwmark) { + addr = vs->addr; + clear_port(&addr); + queue_vip(&addr, vs->quorum_state); + } + } +} + +void +init_vip_queue(void) +{ + 
vip_queue = alloc_list(free_vip_data, NULL); + count_vip(); +} diff --git a/tools/keepalived/keepalived/check/check_http.c b/tools/keepalived/keepalived/check/check_http.c index b85dd838..c817d5e7 100644 --- a/tools/keepalived/keepalived/check/check_http.c +++ b/tools/keepalived/keepalived/check/check_http.c @@ -282,7 +282,7 @@ epilog(thread_t * thread, int method, int t, int c) * servers. */ if (http_arg->retry_it > http_get_check->nb_get_retry-1) { - if (svr_checker_up(DOWN, checker->id, checker->rs)) { + if (svr_checker_up(checker->id, checker->rs)) { log_message(LOG_INFO, "Check on service [%s]:%d failed after %d retry." , inet_sockaddrtos(&http_get_check->dst) , ntohs(inet_sockaddrport(&http_get_check->dst)), http_arg->retry_it); @@ -345,7 +345,7 @@ timeout_epilog(thread_t * thread, char *smtp_msg, char *debug_msg) , ntohs(inet_sockaddrport(&http_get_check->dst))); /* check if server is currently alive */ - if (svr_checker_up(DOWN, checker->id, checker->rs)) { + if (svr_checker_up(checker->id, checker->rs)) { smtp_alert(checker->rs, NULL, NULL, "DOWN", smtp_msg); update_svr_checker_state(DOWN, checker->id @@ -388,7 +388,7 @@ http_handle_response(thread_t * thread, unsigned char digest[16] if (fetched_url->status_code) { if (req->status_code != fetched_url->status_code) { /* check if server is currently alive */ - if (svr_checker_up(DOWN, checker->id, checker->rs)) { + if (svr_checker_up(checker->id, checker->rs)) { log_message(LOG_INFO, "HTTP status code error to [%s]:%d url(%s)" ", status_code [%d].", @@ -417,7 +417,7 @@ http_handle_response(thread_t * thread, unsigned char digest[16] } return epilog(thread, 2, 0, 1); } else { - if (!svr_checker_up(UP, checker->id, checker->rs)) + if (!svr_checker_up(checker->id, checker->rs)) log_message(LOG_INFO, "HTTP status code success to [%s]:%d url(%d)." , inet_sockaddrtos(&http_get_check->dst) @@ -438,7 +438,7 @@ http_handle_response(thread_t * thread, unsigned char digest[16] if (r) { /* check if server is currently alive */ - if (svr_checker_up(DOWN, checker->id, checker->rs)) { + if (svr_checker_up(checker->id, checker->rs)) { log_message(LOG_INFO, "MD5 digest error to [%s]:%d url[%s]" ", MD5SUM [%s].", @@ -468,7 +468,7 @@ http_handle_response(thread_t * thread, unsigned char digest[16] FREE(digest_tmp); return epilog(thread, 2, 0, 1); } else { - if (!svr_checker_up(UP, checker->id, checker->rs)) + if (!svr_checker_up(checker->id, checker->rs)) log_message(LOG_INFO, "MD5 digest success to [%s]:%d url(%d)." 
, inet_sockaddrtos(&http_get_check->dst) , ntohs(inet_sockaddrport(&http_get_check->dst)) @@ -553,7 +553,7 @@ http_read_thread(thread_t * thread) if (r == -1) { /* We have encourred a real read error */ - if (svr_checker_up(DOWN, checker->id, checker->rs)) { + if (svr_checker_up(checker->id, checker->rs)) { log_message(LOG_INFO, "Read error with server [%s]:%d: %s" , inet_sockaddrtos(&http_get_check->dst) , ntohs(inet_sockaddrport(&http_get_check->dst)) @@ -679,7 +679,7 @@ http_request_thread(thread_t * thread) , ntohs(inet_sockaddrport(&http_get_check->dst))); /* check if server is currently alive */ - if (svr_checker_up(DOWN, checker->id, checker->rs)) { + if (svr_checker_up(checker->id, checker->rs)) { smtp_alert(checker->rs, NULL, NULL, "DOWN", "=> CHECK failed on service" @@ -717,7 +717,7 @@ http_check_thread(thread_t * thread) switch (status) { case connect_error: /* check if server is currently alive */ - if (svr_checker_up(DOWN, checker->id, checker->rs)) { + if (svr_checker_up(checker->id, checker->rs)) { log_message(LOG_INFO, "Error connecting server [%s]:%d." , inet_sockaddrtos(&http_get_check->dst) , ntohs(inet_sockaddrport(&http_get_check->dst))); @@ -802,7 +802,7 @@ http_check_thread(thread_t * thread) (req->ssl, ret)); #endif if ((http_get_check->proto == PROTO_SSL) && - (svr_checker_up(DOWN, checker->id, checker->rs))) { + (svr_checker_up(checker->id, checker->rs))) { log_message(LOG_INFO, "SSL handshake/communication error" " connecting to server" " (openssl errno: %d) [%s]:%d." @@ -855,7 +855,7 @@ http_connect_thread(thread_t * thread) * Check completed. * check if server is currently alive. */ - if (!svr_checker_up(UP, checker->id, checker->rs)) { + if (!svr_checker_up(checker->id, checker->rs)) { log_message(LOG_INFO, "Remote Web server [%s]:%d succeed on service." , inet_sockaddrtos(&http_get_check->dst) , ntohs(inet_sockaddrport(&http_get_check->dst))); diff --git a/tools/keepalived/keepalived/check/check_misc.c b/tools/keepalived/keepalived/check/check_misc.c index 0d88d513..ddea0e18 100644 --- a/tools/keepalived/keepalived/check/check_misc.c +++ b/tools/keepalived/keepalived/check/check_misc.c @@ -181,7 +181,7 @@ misc_check_child_thread(thread_t * thread) pid = THREAD_CHILD_PID(thread); /* The child hasn't responded. Kill it off. */ - if (svr_checker_up(DOWN, checker->id, checker->rs)) { + if (svr_checker_up(checker->id, checker->rs)) { log_message(LOG_INFO, "Misc check to [%s] for [%s] timed out" , inet_sockaddrtos(&checker->rs->addr) , misck_checker->path); @@ -215,7 +215,7 @@ misc_check_child_thread(thread_t * thread) update_svr_wgt(status - 2, checker->vs, checker->rs); /* everything is good */ - if (!svr_checker_up(UP, checker->id, checker->rs)) { + if (!svr_checker_up(checker->id, checker->rs)) { log_message(LOG_INFO, "Misc check to [%s] for [%s] success." , inet_sockaddrtos(&checker->rs->addr) , misck_checker->path); @@ -227,7 +227,7 @@ misc_check_child_thread(thread_t * thread) , checker->rs); } } else { - if (svr_checker_up(DOWN, checker->id, checker->rs)) { + if (svr_checker_up(checker->id, checker->rs)) { log_message(LOG_INFO, "Misc check to [%s] for [%s] failed." 
, inet_sockaddrtos(&checker->rs->addr) , misck_checker->path); diff --git a/tools/keepalived/keepalived/check/check_parser.c b/tools/keepalived/keepalived/check/check_parser.c index e491cc06..a4660b47 100644 --- a/tools/keepalived/keepalived/check/check_parser.c +++ b/tools/keepalived/keepalived/check/check_parser.c @@ -135,6 +135,19 @@ pto_handler(vector strvec) memcpy(vs->timeout_persistence, str, size); } static void +eto_handler(vector strvec) +{ + virtual_server *vs = LIST_TAIL_DATA(check_data->vs); + char *str = VECTOR_SLOT(strvec, 1); + int size = sizeof (vs->est_timeout); + int str_len = strlen(str); + + if (size > str_len) + size = str_len; + + memcpy(vs->est_timeout, str, size); +} +static void pgr_handler(vector strvec) { virtual_server *vs = LIST_TAIL_DATA(check_data->vs); @@ -319,6 +332,7 @@ check_init_keywords(void) install_keyword("lvs_method", &lbkind_handler); install_keyword("nat_mask", &natmask_handler); install_keyword("persistence_timeout", &pto_handler); + install_keyword("est_timeout", &eto_handler); install_keyword("persistence_granularity", &pgr_handler); install_keyword("protocol", &proto_handler); install_keyword("ha_suspend", &hasuspend_handler); diff --git a/tools/keepalived/keepalived/check/check_smtp.c b/tools/keepalived/keepalived/check/check_smtp.c index 8d744b73..93afa884 100644 --- a/tools/keepalived/keepalived/check/check_smtp.c +++ b/tools/keepalived/keepalived/check/check_smtp.c @@ -287,7 +287,7 @@ smtp_final(thread_t *thread, int error, const char *format, ...) if (error) { /* Always syslog the error when the real server is up */ - if (svr_checker_up(DOWN, checker->id, checker->rs)) { + if (svr_checker_up(checker->id, checker->rs)) { if (format != NULL) { memcpy(error_buff, "SMTP_CHECK ", 11); va_start(varg_list, format); @@ -318,7 +318,7 @@ smtp_final(thread_t *thread, int error, const char *format, ...) * be noted that smtp_alert makes a copy of the string arguments, so * we don't have to keep them statically allocated. */ - if (svr_checker_up(DOWN, checker->id, checker->rs)) { + if (svr_checker_up(checker->id, checker->rs)) { if (format != NULL) { snprintf(smtp_buff, 542, "=> CHECK failed on service : %s <=", error_buff + 11); @@ -789,7 +789,7 @@ smtp_connect_thread(thread_t *thread) * will be reset and we will continue on checking them one by one. */ if ((smtp_checker->host_ptr = list_element(smtp_checker->host, smtp_checker->host_ctr)) == NULL) { - if (!svr_checker_up(UP, checker->id, checker->rs)) { + if (!svr_checker_up(checker->id, checker->rs)) { log_message(LOG_INFO, "Remote SMTP server [%s:%d] succeed on service." 
, inet_sockaddrtos(&checker->rs->addr) , ntohs(inet_sockaddrport(&checker->rs->addr))); diff --git a/tools/keepalived/keepalived/check/check_ssl.c b/tools/keepalived/keepalived/check/check_ssl.c index cbaeab23..3c0ebb0f 100644 --- a/tools/keepalived/keepalived/check/check_ssl.c +++ b/tools/keepalived/keepalived/check/check_ssl.c @@ -290,7 +290,7 @@ ssl_read_thread(thread_t * thread) if (r && !req->extracted) { /* check if server is currently alive */ - if (svr_checker_up(DOWN, checker->id, checker->rs)) { + if (svr_checker_up(checker->id, checker->rs)) { smtp_alert(checker->rs, NULL, NULL, "DOWN", "=> SSL CHECK failed on service" diff --git a/tools/keepalived/keepalived/check/check_tcp.c b/tools/keepalived/keepalived/check/check_tcp.c index 0f4d4499..16f66cf3 100644 --- a/tools/keepalived/keepalived/check/check_tcp.c +++ b/tools/keepalived/keepalived/check/check_tcp.c @@ -115,7 +115,7 @@ tcp_check_thread(thread_t * thread) if (status == connect_success) { close(thread->u.fd); - if (!svr_checker_up(UP, checker->id, checker->rs)) { + if (!svr_checker_up(checker->id, checker->rs)) { log_message(LOG_INFO, "TCP connection to [%s]:%d success." , inet_sockaddrtos(&tcp_check->dst) , ntohs(inet_sockaddrport(&tcp_check->dst))); @@ -129,7 +129,7 @@ tcp_check_thread(thread_t * thread) } else { - if (svr_checker_up(DOWN, checker->id, checker->rs)) { + if (svr_checker_up(checker->id, checker->rs)) { log_message(LOG_INFO, "TCP connection to [%s]:%d failed !!!" , inet_sockaddrtos(&tcp_check->dst) , ntohs(inet_sockaddrport(&tcp_check->dst))); diff --git a/tools/keepalived/keepalived/check/ipvswrapper.c b/tools/keepalived/keepalived/check/ipvswrapper.c index f9e41544..ce874161 100644 --- a/tools/keepalived/keepalived/check/ipvswrapper.c +++ b/tools/keepalived/keepalived/check/ipvswrapper.c @@ -559,6 +559,8 @@ ipvs_set_rule(int cmd, virtual_server * vs, real_server * rs) , inet_ntop2(inet_sockaddrip4(&vs->addr)) , ntohs(inet_sockaddrport(&vs->addr))); + srule->est_timeout = atoi(vs->est_timeout); + if (srule->timeout != 0 || vs->granularity_persistence) srule->flags = IP_VS_SVC_F_PERSISTENT; @@ -567,7 +569,7 @@ ipvs_set_rule(int cmd, virtual_server * vs, real_server * rs) srule->netmask = vs->granularity_persistence; if(vs->syn_proxy) - srule->flags |= IP_VS_CONN_F_SYNPROXY; + srule->flags |= IP_VS_SVC_F_SYNPROXY; /* SVR specific */ if (rs) { @@ -655,6 +657,11 @@ ipvs_laddr_vsg_cmd(int cmd, list vs_group, virtual_server * vs, local_addr_group l = vsg->addr_ip; for (e = LIST_HEAD(l); e; ELEMENT_NEXT(e)) { vsg_entry = ELEMENT_DATA(e); + /* reloading may make laddr_set true */ + if (vsg_entry->laddr_set && (cmd == IP_VS_SO_SET_ADDLADDR)) + continue; + + vsg_entry->laddr_set = (cmd == IP_VS_SO_SET_ADDLADDR) ? 1:0; srule->af = vsg_entry->addr.ss_family; if (srule->af == AF_INET6) { @@ -674,7 +681,12 @@ ipvs_laddr_vsg_cmd(int cmd, list vs_group, virtual_server * vs, local_addr_group for (e = LIST_HEAD(l); e; ELEMENT_NEXT(e)) { vsg_entry = ELEMENT_DATA(e); uint32_t addr_ip, ip; - + + if (vsg_entry->laddr_set && (cmd == IP_VS_SO_SET_ADDLADDR)) + continue; + + vsg_entry->laddr_set = (cmd == IP_VS_SO_SET_ADDLADDR) ? 
1:0; + srule->af = vsg_entry->addr.ss_family; if (srule->af == AF_INET6) { inet_sockaddrip6(&vsg_entry->addr, &srule->addr.in6); @@ -782,7 +794,162 @@ ipvs_cmd(int cmd, list vs_group, virtual_server * vs, real_server * rs) return IPVS_SUCCESS; } -static void + +static void +ipvs_new_laddr_vsg(virtual_server *vs) +{ + list l; + element e; + virtual_server_group *vsg; + virtual_server_group_entry *vsg_entry; + + vsg = ipvs_get_group_by_name(vs->vsgname, check_data->vs_group); + if (!vsg) return; + + /* visit addr_ip list */ + l = vsg->addr_ip; + for (e = LIST_HEAD(l); e; ELEMENT_NEXT(e)) { + vsg_entry = ELEMENT_DATA(e); + + /* will be set later */ + if (!ISALIVE(vsg_entry)) + continue; + + srule->af = vsg_entry->addr.ss_family; + if (srule->af == AF_INET6) { + if (srule->netmask == 0xffffffff) + srule->netmask = 128; + inet_sockaddrip6(&vsg_entry->addr, &srule->addr.in6); + } else + srule->addr.ip = inet_sockaddrip4(&vsg_entry->addr); + srule->port = inet_sockaddrport(&vsg_entry->addr); + + /* local address group channel */ + ipvs_talk(IP_VS_SO_SET_ADDLADDR); + } + + /* visit range list */ + l = vsg->range; + for (e = LIST_HEAD(l); e; ELEMENT_NEXT(e)) { + vsg_entry = ELEMENT_DATA(e); + uint32_t addr_ip, ip; + + /* will be set later */ + if (!ISALIVE(vsg_entry)) + continue; + + srule->af = vsg_entry->addr.ss_family; + if (srule->af == AF_INET6) { + inet_sockaddrip6(&vsg_entry->addr, &srule->addr.in6); + ip = srule->addr.in6.s6_addr32[3]; + } else { + ip = inet_sockaddrip4(&vsg_entry->addr); + } + + /* Parse the whole range */ + for (addr_ip = ip; + ((addr_ip >> 24) & 0xFF) <= vsg_entry->range; + addr_ip += 0x01000000) { + if (srule->af == AF_INET6) { + if (srule->netmask == 0xffffffff) + srule->netmask = 128; + srule->addr.in6.s6_addr32[3] = addr_ip; + } else { + srule->addr.ip = addr_ip; + } + srule->port = inet_sockaddrport(&vsg_entry->addr); + + ipvs_talk(IP_VS_SO_SET_ADDLADDR); + } + } +} + +void +ipvs_new_laddr_add(virtual_server *vs, local_addr_group *laddr_group) +{ + local_addr_entry *laddr_entry; + list l; + element e; + + /* the unalive vs will be set later*/ + if (!vs->vsgname && !ISALIVE(vs)) + return; + + memset(srule, 0, sizeof(ipvs_service_t)); + srule->netmask = (vs->addr.ss_family == AF_INET6) ? 
128 : ((u_int32_t) 0xffffffff); + srule->protocol = vs->service_type; + + l = laddr_group->addr_ip; + for (e = LIST_HEAD(l); e; ELEMENT_NEXT(e)) { + laddr_entry = ELEMENT_DATA(e); + if (ISALIVE(laddr_entry)) + continue; + + memset(laddr_rule, 0, sizeof(ipvs_laddr_t)); + laddr_rule->af = laddr_entry->addr.ss_family; + if (laddr_entry->addr.ss_family == AF_INET6) + inet_sockaddrip6(&laddr_entry->addr, &laddr_rule->addr.in6); + else + laddr_rule->addr.ip = inet_sockaddrip4(&laddr_entry->addr); + + if (vs->vsgname) { + ipvs_new_laddr_vsg(vs); + } else { + srule->af = vs->addr.ss_family; + if (srule->af == AF_INET6) + inet_sockaddrip6(&vs->addr, &srule->addr.in6); + else + srule->addr.ip = inet_sockaddrip4(&vs->addr); + srule->port = inet_sockaddrport(&vs->addr); + + /* local address group channel */ + ipvs_talk(IP_VS_SO_SET_ADDLADDR); + } + } + + l = laddr_group->range; + for (e = LIST_HEAD(l); e; ELEMENT_NEXT(e)) { + laddr_entry = ELEMENT_DATA(e); + uint32_t addr_ip, ip; + + if (ISALIVE(laddr_entry)) + continue; + + memset(laddr_rule, 0, sizeof(ipvs_laddr_t)); + laddr_rule->af = laddr_entry->addr.ss_family; + + if (laddr_entry->addr.ss_family == AF_INET6) { + inet_sockaddrip6(&laddr_entry->addr, &laddr_rule->addr.in6); + ip = laddr_rule->addr.in6.s6_addr32[3]; + } else { + ip = inet_sockaddrip4(&laddr_entry->addr); + } + + for (addr_ip = ip; ((addr_ip >> 24) & 0xFF) <= laddr_entry->range; + addr_ip += 0x01000000) { + if (laddr_entry->addr.ss_family == AF_INET6) + laddr_rule->addr.in6.s6_addr32[3] = addr_ip; + else + laddr_rule->addr.ip = addr_ip; + + if (vs->vsgname) { + ipvs_new_laddr_vsg(vs); + } else { + srule->af = vs->addr.ss_family; + if (srule->af == AF_INET6) + inet_sockaddrip6(&vs->addr, &srule->addr.in6); + else + srule->addr.ip = inet_sockaddrip4(&vs->addr); + srule->port = inet_sockaddrport(&vs->addr); + + /* local address group channel */ + ipvs_talk(IP_VS_SO_SET_ADDLADDR); + } + } + } +} + +static void ipvs_rm_lentry_from_vsg(local_addr_entry *laddr_entry, char *vsgname) { list l; @@ -790,12 +957,16 @@ ipvs_rm_lentry_from_vsg(local_addr_entry *laddr_entry, char *vsgname) virtual_server_group *vsg; virtual_server_group_entry *vsg_entry; + /* it's not old_check_data. help to ISALIVE check later */ vsg = ipvs_get_group_by_name(vsgname, check_data->vs_group); if (!vsg) return; l = vsg->addr_ip; for (e = LIST_HEAD(l); e; ELEMENT_NEXT(e)) { vsg_entry = ELEMENT_DATA(e); + if (!ISALIVE(vsg_entry)) + continue; + srule->af = vsg_entry->addr.ss_family; if (vsg_entry->addr.ss_family == AF_INET6) { srule->netmask = 128; @@ -824,6 +995,8 @@ ipvs_rm_lentry_from_vsg(local_addr_entry *laddr_entry, char *vsgname) for (e = LIST_HEAD(l); e; ELEMENT_NEXT(e)) { vsg_entry = ELEMENT_DATA(e); uint32_t addr_ip, ip; + if (!ISALIVE(vsg_entry)) + continue; srule->af = vsg_entry->addr.ss_family; srule->netmask = (vsg_entry->addr.ss_family == AF_INET6) ? 
128 : ((u_int32_t) 0xffffffff); diff --git a/tools/keepalived/keepalived/check/ipwrapper.c b/tools/keepalived/keepalived/check/ipwrapper.c index f9986543..41ca6508 100644 --- a/tools/keepalived/keepalived/check/ipwrapper.c +++ b/tools/keepalived/keepalived/check/ipwrapper.c @@ -38,6 +38,92 @@ static struct { char buf[256]; } req; +vip_data * +get_vip_by_addr(struct sockaddr_storage *addr) +{ + vip_data *ip_entry; + element e; + + for (e = LIST_HEAD(vip_queue); e; ELEMENT_NEXT(e)) { + ip_entry = ELEMENT_DATA(e); + if (sockstorage_equal(&ip_entry->addr, addr)) + return ip_entry; + } + + return NULL; +} + +static int +vip_check(struct nlmsghdr *n) +{ + struct ifaddrmsg *ifa; + vip_data *ip_entry; + int ret = 0; + struct sockaddr_storage addr; + + /* vip_queue has not been init */ + if (vip_queue == NULL) + return 1; + + ifa = NLMSG_DATA(n); + addr.ss_family = ifa->ifa_family; + if (ifa->ifa_family == AF_INET) { + ((struct sockaddr_in *) &addr)->sin_addr = + *(struct in_addr *)RTA_DATA((void*)n + NLMSG_SPACE(sizeof(struct ifaddrmsg))); + ((struct sockaddr_in *) &addr)->sin_port = 0; /* clear port */ + } else { + ((struct sockaddr_in6 *) &addr)->sin6_addr = + *(struct in6_addr *)RTA_DATA((void*)n + NLMSG_SPACE(sizeof(struct ifaddrmsg))); + ((struct sockaddr_in6 *) &addr)->sin6_port = 0; + } + + ip_entry = get_vip_by_addr(&addr); + if (ip_entry == NULL) { + log_message(LOG_INFO,"unexpected vip:%s" + ,inet_sockaddrtos(&addr)); + return ret; + } + + switch(n->nlmsg_type) { + /* add vip */ + case RTM_NEWADDR: + if (ip_entry->set_cnt < ip_entry->entry_cnt) { + ret = (ip_entry->set_cnt? 0 : 1); + ip_entry->set_cnt++; + log_message(LOG_INFO, "%s VIP %s" + ,ret ? "ADD":"HOLD" + ,inet_sockaddrtos(&addr)); + } else { + ret = 0; + log_message(LOG_INFO,"vip=%s has been set too many times(%d)" + ,inet_sockaddrtos(&addr) + ,ip_entry->entry_cnt); + } + break; + /* del vip */ + case RTM_DELADDR: + if (ip_entry->set_cnt > 0 ) { + ip_entry->set_cnt--; + /* reference counter is 0, then del vip */ + ret = (ip_entry->set_cnt ? 0 : 1); + log_message(LOG_INFO, "%s VIP %s" + ,ret ? "DEL":"UNHOLD" + ,inet_sockaddrtos(&addr)); + } else { + ret = 0; + log_message(LOG_INFO,"vip=%s has been deleted" + ,inet_sockaddrtos(&addr)); + } + break; + default: + log_message(LOG_INFO,"unknown opcode:%d , vip=%s" + ,n->nlmsg_type + ,inet_sockaddrtos(&addr)); + } + + return ret; +} + /* send message to netlink kernel socket, ignore response */ int netlink_cmd(struct nl_handle *nl, struct nlmsghdr *n) @@ -47,6 +133,10 @@ netlink_cmd(struct nl_handle *nl, struct nlmsghdr *n) struct iovec iov = { (void *) n, n->nlmsg_len }; struct msghdr msg = { (void *) &snl, sizeof snl, &iov, 1, NULL, 0, 0 }; + status = vip_check(n); + if (status <= 0) + return status; + memset(&snl, 0, sizeof snl); snl.nl_family = AF_NETLINK; @@ -159,8 +249,6 @@ netlink_group_vipaddress(list vs_group, char * vsgname, int cmd) sizeof(struct in_addr)); } - log_message(LOG_INFO, "%s VIP %s", - cmd ? "ADD":"DEL", inet_sockaddrtos(addr)); if (netlink_cmd(&nl_cmd, &req.n) < 0) log_message(LOG_INFO, "%s VIP = %s failed", cmd ? "ADD":"DEL", @@ -225,10 +313,6 @@ netlink_vipaddress(list vs_group, virtual_server *vs, int cmd) sizeof(struct in_addr)); } - log_message(LOG_INFO, "%s VIP %s to %s", - cmd ? "ADD":"DEL", - inet_sockaddrtos(&vs->addr), - vs->vip_bind_dev); if (netlink_cmd(&nl_cmd, &req.n) < 0) log_message(LOG_INFO, "%s VIP = %s failed", cmd ? 
"ADD":"DEL", @@ -280,13 +364,81 @@ netlink_group_remove_entry(virtual_server *vs, virtual_server_group_entry *vsge) sizeof(struct in_addr)); } - log_message(LOG_INFO, "DEL VIP %s", inet_sockaddrtos(addr)); if (netlink_cmd(&nl_cmd, &req.n) < 0) log_message(LOG_INFO, "DEL VIP = %s failed", inet_sockaddrtos(addr)); } } +/* add the vip of new vsg_entry, in reload mode only */ +void +add_new_vsge_vip(list vs_group, virtual_server *vs) +{ + unsigned int ifa_idx; + virtual_server_group *vsg = ipvs_get_group_by_name(vs->vsgname, vs_group); + virtual_server_group_entry *vsg_entry; + struct sockaddr_storage *addr; + list l; + element e; + + if (!vs->vsgname || !vs->vip_bind_dev || !vsg) + return; + + memset(&req, 0, sizeof (req)); + + req.n.nlmsg_flags = NLM_F_REQUEST; + req.n.nlmsg_type = RTM_NEWADDR; + + ifa_idx = if_nametoindex(vs->vip_bind_dev); + + if (!ifa_idx) { + log_message(LOG_INFO, "interface %s does not exist", + vs->vip_bind_dev); + return; + } + + req.ifa.ifa_index = ifa_idx; + + /* visit addr_ip list */ + l = vsg->addr_ip; + for (e = LIST_HEAD(l); e; ELEMENT_NEXT(e)) { + vsg_entry = ELEMENT_DATA(e); + + if (ISALIVE(vsg_entry)) + continue; + + addr = &vsg_entry->addr; + req.n.nlmsg_len = NLMSG_LENGTH(sizeof (struct ifaddrmsg)); + req.ifa.ifa_family = addr->ss_family; + if(req.ifa.ifa_family == AF_INET6) { + req.ifa.ifa_prefixlen = 128; + addattr_l(&req.n, sizeof(req), IFA_LOCAL, + &((struct sockaddr_in6 *)addr)->sin6_addr, + sizeof(struct in6_addr)); + } else { + req.ifa.ifa_prefixlen = 32; + addattr_l(&req.n, sizeof(req), IFA_LOCAL, + &((struct sockaddr_in *)addr)->sin_addr, + sizeof(struct in_addr)); + } + + log_message(LOG_INFO, "ADD VIP %s", inet_sockaddrtos(addr)); + if (netlink_cmd(&nl_cmd, &req.n) < 0) + log_message(LOG_INFO, "ADD VIP = %s failed", inet_sockaddrtos(addr)); + } + + /* visit range list */ + l = vsg->range; + for (e = LIST_HEAD(l); e; ELEMENT_NEXT(e)) { + vsg_entry = ELEMENT_DATA(e); + + if (ISALIVE(vsg_entry)) + continue; + + netlink_range_cmd(UP, vsg_entry); + } +} + /* Returns the sum of all RS weight in a virtual server. */ long unsigned weigh_live_realservers(virtual_server * vs) @@ -395,6 +547,38 @@ clear_services(void) return 1; } +/* only for alpha & reload mode !!! */ +void inline +alpha_reload_handle(virtual_server *vs, real_server *rs) +{ + if (ISALIVE(rs)) { + /* + * In alpha mode, rs has been set failed_checkers + * we must do a clean in reload to make alive rs + * in consistent state + */ + list l = rs->failed_checkers; + element next, tmp; + + for (tmp = LIST_HEAD(l); tmp; tmp = next) { + next = tmp->next; + free_list_element(l, tmp); + } + l->head = NULL; + l->tail = NULL; + + /* + * vsgroup may has new entry after reload + * so add the alive rs to the new one + */ + if (vs->vsgname) { + UNSET_ALIVE(rs); + ipvs_cmd(LVS_CMD_ADD_DEST, check_data->vs_group, vs, rs); + SET_ALIVE(rs); + } + } +} + /* Set a realserver IPVS rules */ static int init_service_rs(virtual_server * vs) @@ -409,7 +593,10 @@ init_service_rs(virtual_server * vs) * later upon healthchecks recovery (if ever). 
 	 */
 	if (vs->alpha) {
-		UNSET_ALIVE(rs);
+		if (!reload)
+			UNSET_ALIVE(rs);
+		else
+			alpha_reload_handle(vs, rs);
 		continue;
 	}
 	if (!ISALIVE(rs)) {
@@ -428,32 +615,64 @@ init_service_rs(virtual_server * vs)
 	return 1;
 }
 
+static int
+init_service_laddr(virtual_server * vs)
+{
+	/* Set local ip address in "FNAT" mode of IPVS */
+	if ((vs->loadbalancing_kind == IP_VS_CONN_F_FULLNAT) && vs->local_addr_gname) {
+		if (!ipvs_cmd(LVS_CMD_ADD_LADDR, check_data->vs_group, vs, NULL))
+			return 0;
+	}
+
+	return 1;
+}
+
+static void
+add_new_laddr(virtual_server *vs)
+{
+	local_addr_group *laddr_group;
+
+	laddr_group = ipvs_get_laddr_group_by_name(vs->local_addr_gname,
+						   check_data->laddr_group);
+	if (laddr_group)
+		ipvs_new_laddr_add(vs, laddr_group);
+}
+
 /* Set a virtualserver IPVS rules */
 static int
 init_service_vs(virtual_server * vs)
 {
+	/*
+	 * When reloading, bind the vips of new vsg entries to reach a consistent state.
+	 * This only matters for a virtual_server_group.
+	 */
+
+	if (reload && vs->alpha && (vs->quorum_state == UP) && vs->vsgname)
+		add_new_vsge_vip(check_data->vs_group, vs);
+
+	if (reload && vs->local_addr_gname)
+		add_new_laddr(vs);
+
 	/* Init the VS root */
 	if (!ISALIVE(vs) || vs->vsgname) {
-		if (!ipvs_cmd(LVS_CMD_ADD, check_data->vs_group, vs, NULL))
+		if (!ipvs_cmd(LVS_CMD_ADD, check_data->vs_group, vs, NULL) ||
+		    !init_service_laddr(vs))
 			return 0;
 		else
 			SET_ALIVE(vs);
 	}
 
-	/*Set local ip address in "FNAT" mode of IPVS */
-	if ((vs->loadbalancing_kind == IP_VS_CONN_F_FULLNAT) && vs->local_addr_gname) {
-		if (!ipvs_cmd(LVS_CMD_ADD_LADDR, check_data->vs_group, vs, NULL))
-			return 0;
-	}
-
 	/* Processing real server queue */
 	if (!LIST_ISEMPTY(vs->rs)) {
 		if (!init_service_rs(vs))
 			return 0;
-		if (vs->alpha)
-			vs->quorum_state = DOWN;
-		else
+
+		if (!vs->alpha)
 			netlink_vipaddress(check_data->vs_group, vs, UP);
+
+		/* quorum_state already defaults to DOWN when the conf is read */
+		if (vs->alpha && !reload)
+			vs->quorum_state = DOWN;
 	}
 	return 1;
 }
@@ -503,17 +722,20 @@ perform_quorum_state(virtual_server *vs, int add)
 void
 update_quorum_state(virtual_server * vs)
 {
+	long unsigned weigh_count;
 	char rsip[INET6_ADDRSTRLEN];
 
+	weigh_count = weigh_live_realservers(vs);
+
 	/* If we have just gained quorum, it's time to consider notify_up. */
 	if (vs->quorum_state == DOWN &&
-	    weigh_live_realservers(vs) >= vs->quorum + vs->hysteresis) {
+	    weigh_count >= vs->quorum + vs->hysteresis) {
 		vs->quorum_state = UP;
 		log_message(LOG_INFO, "Gained quorum %lu+%lu=%lu <= %u for VS [%s]:%d"
 			    , vs->quorum
 			    , vs->hysteresis
 			    , vs->quorum + vs->hysteresis
-			    , weigh_live_realservers(vs)
+			    , weigh_count
 			    , (vs->vsgname) ? vs->vsgname : inet_sockaddrtos(&vs->addr)
 			    , ntohs(inet_sockaddrport(&vs->addr)));
 		if (vs->s_svr && ISALIVE(vs->s_svr)) {
@@ -544,13 +766,13 @@ update_quorum_state(virtual_server * vs)
 	 * VS notify_down and sorry_server cases
 	 */
 	if (vs->quorum_state == UP &&
-	    weigh_live_realservers(vs) < vs->quorum - vs->hysteresis) {
+	    weigh_count < vs->quorum - vs->hysteresis) {
 		vs->quorum_state = DOWN;
 		log_message(LOG_INFO, "Lost quorum %lu-%lu=%lu > %u for VS [%s]:%d"
 			    , vs->quorum
 			    , vs->hysteresis
 			    , vs->quorum - vs->hysteresis
-			    , weigh_live_realservers(vs)
+			    , weigh_count
 			    , (vs->vsgname) ? vs->vsgname : inet_sockaddrtos(&vs->addr)
 			    , ntohs(inet_sockaddrport(&vs->addr)));
 		netlink_vipaddress(check_data->vs_group, vs, DOWN);
@@ -615,7 +837,8 @@ perform_svr_state(int alive, virtual_server * vs, real_server * rs)
 		}
 
 		/* We may have gained quorum */
-		update_quorum_state(vs);
+		if (vs->quorum_state == DOWN)
+			update_quorum_state(vs);
 	}
 
 	if (ISALIVE(rs) && !alive) {
@@ -644,7 +867,8 @@ perform_svr_state(int alive, virtual_server * vs, real_server * rs)
 		}
 
 		/* We may have lost quorum */
-		update_quorum_state(vs);
+		if (vs->quorum_state == UP)
+			update_quorum_state(vs);
 	}
 }
 
@@ -679,32 +903,12 @@ update_svr_wgt(int weight, virtual_server * vs, real_server * rs)
 
 /* Test if realserver is marked UP for a specific checker */
 int
-svr_checker_up(int alive, checker_id_t cid, real_server *rs)
+svr_checker_up(checker_id_t cid, real_server *rs)
 {
 	element e;
 	list l = rs->failed_checkers;
 	checker_id_t *id;
 
-	if (rs->reload_alive) {
-		/* first check failed under alpha mode
-		 * and the rs is alive before reload
-		 */
-		if (!alive && !ISALIVE(rs)) {
-			element next;
-
-			for (e = LIST_HEAD(l); e; e = next) {
-				next = e->next;
-				free_list_element(l, e);
-			}
-			l->head = NULL;
-			l->tail = NULL;
-
-			SET_ALIVE(rs);
-		}
-		/* make sure we do not go here next time */
-		rs->reload_alive = 0;
-	}
-
 	/*
 	 * We assume there is not too much checker per
 	 * real server, so we consider this lookup as
@@ -772,6 +976,7 @@ vsge_exist(virtual_server_group_entry *vsg_entry, list l)
 			 * are changing from alive state.
 			 */
 			SET_ALIVE(vsge);
+			vsge->laddr_set = vsg_entry->laddr_set;
 			return 1;
 		}
 	}
@@ -844,6 +1049,18 @@ vs_exist(virtual_server * old_vs)
 	for (e = LIST_HEAD(l); e; ELEMENT_NEXT(e)) {
 		vs = ELEMENT_DATA(e);
 		if (VS_ISEQ(old_vs, vs)) {
+			/* Check whether the vip bind device changed */
+			if (((old_vs->vip_bind_dev && vs->vip_bind_dev &&
+			      strcmp(old_vs->vip_bind_dev, vs->vip_bind_dev)) ||
+			     (old_vs->vip_bind_dev != NULL && vs->vip_bind_dev == NULL)) &&
+			    (old_vs->quorum_state == UP)) {
+				char *tmp = old_vs->vip_bind_dev;
+				netlink_vipaddress(old_check_data->vs_group, old_vs, DOWN);
+				old_vs->vip_bind_dev = vs->vip_bind_dev;
+				netlink_vipaddress(old_check_data->vs_group, old_vs, UP);
+				old_vs->vip_bind_dev = tmp;
+			}
+
 			/* Check if group exist */
 			if (vs->vsgname) {
 				vsg = ipvs_get_group_by_name(old_vs->vsgname,
@@ -859,10 +1076,9 @@ vs_exist(virtual_server * old_vs)
 			 * Exist so set alive.
 			 */
 			SET_ALIVE(vs);
-			if ((old_vs->vip_bind_dev && vs->vip_bind_dev &&
-			     strcmp(old_vs->vip_bind_dev, vs->vip_bind_dev)) ||
-			    (old_vs->vip_bind_dev != NULL && vs->vip_bind_dev == NULL))
-				netlink_vipaddress(old_check_data->vs_group, old_vs, DOWN);
+			/* preserve the quorum_state across the reload */
+			if (reload && vs->alpha)
+				vs->quorum_state = old_vs->quorum_state;
 			return 1;
 		}
 	}
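
Note on the update_quorum_state() changes above: weigh_live_realservers() is now sampled once into weigh_count, and the callers invoke the function only from the state that can actually change. The thresholds quorum + hysteresis (to go UP) and quorum - hysteresis (to go DOWN) form a deadband, so a weight oscillating inside the band cannot flap the VIP. A toy model (not keepalived code) with quorum=3, hysteresis=1:

    /* hysteresis.c - deadband state machine as in update_quorum_state() */
    #include <stdio.h>

    enum state { DOWN, UP };

    static enum state next_state(enum state s, unsigned long weight,
                                 unsigned long quorum, unsigned long hysteresis)
    {
            if (s == DOWN && weight >= quorum + hysteresis)
                    return UP;               /* gained quorum */
            if (s == UP && weight < quorum - hysteresis)
                    return DOWN;             /* lost quorum */
            return s;                        /* inside the band: no change */
    }

    int main(void)
    {
            enum state s = DOWN;
            unsigned long w[] = { 4, 3, 2, 3, 1 };
            for (int i = 0; i < 5; i++) {
                    s = next_state(s, w[i], 3, 1);
                    printf("weight=%lu -> %s\n", w[i], s == UP ? "UP" : "DOWN");
            }
            return 0;                        /* UP UP UP UP DOWN */
    }
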
@@ -891,11 +1107,6 @@ rs_exist(real_server * old_rs, list l)
 			rs->alive = old_rs->alive;
 			rs->set = old_rs->set;
 			rs->weight = old_rs->weight;
-			/*
-			 * The alpha mode will reset rs to unalive.
-			 * We save the status before reload here
-			 */
-			rs->reload_alive = rs->alive;
 			return 1;
 		}
 	}
@@ -972,8 +1183,10 @@ laddr_entry_exist(local_addr_entry *laddr_entry, list l)
 	for (e = LIST_HEAD(l); e; ELEMENT_NEXT(e)) {
 		entry = ELEMENT_DATA(e);
 		if (sockstorage_equal(&entry->addr, &laddr_entry->addr) &&
-		    entry->range == laddr_entry->range)
+		    entry->range == laddr_entry->range) {
+			SET_ALIVE(entry);
 			return 1;
+		}
 	}
 
 	return 0;
diff --git a/tools/keepalived/keepalived/include/check_data.h b/tools/keepalived/keepalived/include/check_data.h
index 27bf7b6f..2561f48e 100644
--- a/tools/keepalived/keepalived/include/check_data.h
+++ b/tools/keepalived/keepalived/include/check_data.h
@@ -52,6 +52,7 @@ typedef unsigned int checker_id_t;
 
 /* Daemon dynamic data structure definition */
 #define MAX_TIMEOUT_LENGTH	5
+#define MAX_EST_TIMEOUT_LENGTH	5
 #define KEEPALIVED_DEFAULT_DELAY	(60 * TIMER_HZ)
 
 /* SSL specific data */
@@ -84,13 +85,13 @@ typedef struct _real_server {
 	int alive;
 	list failed_checkers;	/* List of failed checkers */
 	int set;		/* in the IPVS table */
-	int reload_alive;	/* alpha mode will reset rs to unalive. So save the status before reload here */
 } real_server;
 
 /* local ip address group definition */
 typedef struct _local_addr_entry {
 	struct sockaddr_storage addr;
 	uint8_t range;
+	int alive;
 } local_addr_entry;
 
 typedef struct _local_addr_group {
@@ -105,6 +106,7 @@ typedef struct _virtual_server_group_entry {
 	uint8_t range;
 	uint32_t vfwmark;
 	int alive;
+	int laddr_set;
 } virtual_server_group_entry;
 
 typedef struct _virtual_server_group {
@@ -125,6 +127,7 @@ typedef struct _virtual_server {
 	int ha_suspend;
 	char sched[SCHED_MAX_LENGTH];
 	char timeout_persistence[MAX_TIMEOUT_LENGTH];
+	char est_timeout[MAX_EST_TIMEOUT_LENGTH];
 	unsigned loadbalancing_kind;
 	uint32_t nat_mask;
 	uint32_t granularity_persistence;
@@ -145,6 +148,13 @@ typedef struct _virtual_server {
 	char *vip_bind_dev;	/* the interface name,vip bindto */
 } virtual_server;
 
+/* record for add/del vip */
+typedef struct _vip_data {
+	struct sockaddr_storage addr;	/* record ip, ignore port */
+	int set_cnt;			/* reference counter of the vip */
+	int entry_cnt;			/* number of vs entries sharing this vip */
+} vip_data;
+
 /* Configuration data root */
 typedef struct _check_conf_data {
 	SSL_DATA *ssl;
@@ -226,8 +236,9 @@ static inline int inaddr_equal(sa_family_t family, void *addr1, void *addr2)
 			 (X)->syn_proxy == (Y)->syn_proxy &&\
 			 !strcmp((X)->sched, (Y)->sched) &&\
 			 !strcmp((X)->timeout_persistence, (Y)->timeout_persistence) &&\
+			 !strcmp((X)->est_timeout, (Y)->est_timeout) &&\
 			 (((X)->vsgname && (Y)->vsgname && \
-			   !strcmp((X)->vsgname, (Y)->vsgname)) || \
+			   !strcmp((X)->vsgname, (Y)->vsgname)) ||	\
 			  (!(X)->vsgname && !(Y)->vsgname)) &&\
 			 (((X)->local_addr_gname && (Y)->local_addr_gname && \
 			   !strcmp((X)->local_addr_gname, (Y)->local_addr_gname)) || \
@@ -243,6 +254,7 @@ static inline int inaddr_equal(sa_family_t family, void *addr1, void *addr2)
 /* Global vars exported */
 extern check_conf_data *check_data;
 extern check_conf_data *old_check_data;
+extern list vip_queue;
 
 /* prototypes */
 extern SSL_DATA *alloc_ssl(void);
@@ -260,5 +272,6 @@ extern void set_rsgroup(char *);
 extern check_conf_data *alloc_check_data(void);
 extern void free_check_data(check_conf_data *);
 extern void dump_check_data(check_conf_data *);
-
+extern void free_vip_queue(void);
+extern void init_vip_queue(void);
 #endif
diff --git a/tools/keepalived/keepalived/include/ipvswrapper.h b/tools/keepalived/keepalived/include/ipvswrapper.h
index fadb5748..0e9f7231 100644
--- a/tools/keepalived/keepalived/include/ipvswrapper.h
+++ b/tools/keepalived/keepalived/include/ipvswrapper.h
@@ -94,6 +94,7 @@ extern void ipvs_stop(void);
 extern virtual_server_group *ipvs_get_group_by_name(char *, list);
 extern int ipvs_group_remove_entry(virtual_server *, virtual_server_group_entry *);
 extern local_addr_group *ipvs_get_laddr_group_by_name(char *, list);
+extern void ipvs_new_laddr_add(virtual_server *, local_addr_group *);
 extern int ipvs_laddr_remove_entry(virtual_server *, local_addr_entry *);
 extern int ipvs_cmd(int, list, virtual_server *, real_server *);
 extern int ipvs_syncd_cmd(int, char *, int, int);
diff --git a/tools/keepalived/keepalived/include/ipwrapper.h b/tools/keepalived/keepalived/include/ipwrapper.h
index 7899cbff..cf8893ae 100644
--- a/tools/keepalived/keepalived/include/ipwrapper.h
+++ b/tools/keepalived/keepalived/include/ipwrapper.h
@@ -53,7 +53,7 @@
 /* prototypes */
 extern void perform_svr_state(int, virtual_server *, real_server *);
 extern void update_svr_wgt(int, virtual_server *, real_server *);
-extern int svr_checker_up(int, checker_id_t, real_server *);
+extern int svr_checker_up(checker_id_t, real_server *);
 extern void update_svr_checker_state(int, checker_id_t, virtual_server *, real_server *);
 extern int init_services(void);
 extern int clear_services(void);
diff --git a/tools/keepalived/keepalived/libipvs-2.6/ip_vs.h b/tools/keepalived/keepalived/libipvs-2.6/ip_vs.h
index f15bfd4e..dd831edd 100644
--- a/tools/keepalived/keepalived/libipvs-2.6/ip_vs.h
+++ b/tools/keepalived/keepalived/libipvs-2.6/ip_vs.h
@@ -29,7 +29,8 @@
 #define IP_VS_SVC_F_PERSISTENT	0x0001	/* persistent port */
 #define IP_VS_SVC_F_HASHED	0x0002	/* hashed entry */
 #define IP_VS_SVC_F_ONEPACKET	0x0004	/* one-packet scheduling */
-#define IP_VS_CONN_F_SYNPROXY	0x8000	/* synproxy switch flag*/
+#define IP_VS_SVC_F_SYNPROXY	0x8000	/* synproxy switch flag */
+
 /*
  * IPVS sync daemon states
@@ -95,6 +96,7 @@
 #define IP_VS_CONN_F_NO_CPORT	0x0800	/* no client port set yet */
 #define IP_VS_CONN_F_TEMPLATE	0x1000	/* template, not connection */
 #define IP_VS_CONN_F_ONE_PACKET	0x2000	/* forward only one packet */
+#define IP_VS_CONN_F_SYNPROXY	0x8000	/* synproxy switch flag */
 
 #define IP_VS_SCHEDNAME_MAXLEN	16
 #define IP_VS_PENAME_MAXLEN	16
@@ -143,6 +145,7 @@ struct ip_vs_service_user {
 	u_int16_t		af;
 	union nf_inet_addr	addr;
 	char			pe_name[IP_VS_PENAME_MAXLEN];
+	unsigned		est_timeout;	/* virtual service private establish timeout */
 };
 
 struct ip_vs_dest_kern {
@@ -265,7 +268,7 @@ struct ip_vs_service_entry {
 	u_int16_t		af;
 	union nf_inet_addr	addr;
 	char			pe_name[IP_VS_PENAME_MAXLEN];
-
+	unsigned		est_timeout;	/* vs private establish timeout */
 };
 
 struct ip_vs_dest_entry_kern {
@@ -505,6 +508,7 @@ enum {
 	IPVS_SVC_ATTR_NETMASK,		/* persistent netmask */
 
 	IPVS_SVC_ATTR_STATS,		/* nested attribute for service stats */
+	IPVS_SVC_ATTR_EST_TIMEOUT,	/* establish timeout */
 
 	IPVS_SVC_ATTR_PE_NAME,		/* name of scheduler */
 
diff --git a/tools/keepalived/keepalived/libipvs-2.6/libipvs.c b/tools/keepalived/keepalived/libipvs-2.6/libipvs.c
index 8170d497..9c0af9a8 100644
--- a/tools/keepalived/keepalived/libipvs-2.6/libipvs.c
+++ b/tools/keepalived/keepalived/libipvs-2.6/libipvs.c
@@ -239,6 +239,7 @@ static int ipvs_nl_fill_service_attr(struct nl_msg *msg, ipvs_service_t *svc)
 	NLA_PUT(msg, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
 	NLA_PUT_U32(msg, IPVS_SVC_ATTR_TIMEOUT, svc->timeout);
 	NLA_PUT_U32(msg, IPVS_SVC_ATTR_NETMASK, svc->netmask);
+	NLA_PUT_U32(msg, IPVS_SVC_ATTR_EST_TIMEOUT, svc->est_timeout);
 
 	nla_nest_end(msg, nl_service);
 	return 0;
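
Note: the fill path above always sends IPVS_SVC_ATTR_EST_TIMEOUT, while the receive path (ipvs_services_parse_cb, further below) must treat the attribute as optional so the tool keeps working against kernels whose IPVS has never heard of it. A hedged sketch of that read-with-default pattern (it assumes libnl's nla_get_u32() and an attrs[] array filled by nla_parse(), which is how libipvs handles the service nest; the helper name is made up):

    #include <netlink/attr.h>
    #include "ip_vs.h"        /* for IPVS_SVC_ATTR_EST_TIMEOUT, as in libipvs */

    static unsigned int parse_est_timeout(struct nlattr *attrs[])
    {
            /* older kernels never emit the attribute, so its slot is NULL;
             * fall back to 0, i.e. "use the kernel default" */
            if (attrs[IPVS_SVC_ATTR_EST_TIMEOUT])
                    return nla_get_u32(attrs[IPVS_SVC_ATTR_EST_TIMEOUT]);
            return 0;
    }
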
@@ -267,6 +268,21 @@ int ipvs_add_service(ipvs_service_t *svc)
 			  sizeof(struct ip_vs_service_kern));
 }
 
+void ipvs_service_entry_2_user(const ipvs_service_entry_t *entry, ipvs_service_t *user)
+{
+	user->protocol = entry->protocol;
+	user->__addr_v4 = entry->__addr_v4;
+	user->port = entry->port;
+	user->fwmark = entry->fwmark;
+	strcpy(user->sched_name, entry->sched_name);
+	user->flags = entry->flags;
+	user->timeout = entry->timeout;
+	user->netmask = entry->netmask;
+	user->af = entry->af;
+	user->addr = entry->addr;
+	strcpy(user->pe_name, entry->pe_name);
+	user->est_timeout = entry->est_timeout;
+}
 
 int ipvs_update_service(ipvs_service_t *svc)
 {
@@ -313,10 +329,10 @@ int ipvs_update_service_by_options(ipvs_service_t *svc, unsigned int options)
 	}
 
 	if( options & OPT_SYNPROXY ) {
-		if( svc->flags & IP_VS_CONN_F_SYNPROXY ) {
-			user.flags |= IP_VS_CONN_F_SYNPROXY;
+		if( svc->flags & IP_VS_SVC_F_SYNPROXY ) {
+			user.flags |= IP_VS_SVC_F_SYNPROXY;
 		} else {
-			user.flags &= ~IP_VS_CONN_F_SYNPROXY;
+			user.flags &= ~IP_VS_SVC_F_SYNPROXY;
 		}
 	}
 
@@ -324,31 +340,11 @@ int ipvs_update_service_by_options(ipvs_service_t *svc, unsigned int options)
 		user.flags |= IP_VS_SVC_F_ONEPACKET;
 	}
 
-	return ipvs_update_service(&user);
-}
-
-int ipvs_update_service_synproxy(ipvs_service_t *svc , int enable)
-{
-	ipvs_service_entry_t *entry;
-
-	if (!(entry = ipvs_get_service(svc->fwmark, svc->af, svc->protocol,
-				       svc->addr, svc->port))) {
-		fprintf(stderr, "%s\n", ipvs_strerror(errno));
-		exit(1);
+	if( options & OPT_VS_EST_TIMEOUT ) {
+		user.est_timeout = svc->est_timeout;
 	}
-
-	strcpy(svc->sched_name , entry->sched_name);
-	strcpy(svc->pe_name , entry->pe_name);
-	svc->flags = entry->flags;
-	svc->timeout = entry->timeout;
-	svc->netmask = entry->netmask;
-
-	if(enable)
-		svc->flags = svc->flags | IP_VS_CONN_F_SYNPROXY;
-	else
-		svc->flags = svc->flags & (~IP_VS_CONN_F_SYNPROXY);
-
-	return ipvs_update_service(svc);
+
+	return ipvs_update_service(&user);
 }
 
 int ipvs_del_service(ipvs_service_t *svc)
@@ -765,6 +761,8 @@ static int ipvs_services_parse_cb(struct nl_msg *msg, void *arg)
 		get->entrytable[i].timeout = nla_get_u32(svc_attrs[IPVS_SVC_ATTR_TIMEOUT]);
 		nla_memcpy(&flags, svc_attrs[IPVS_SVC_ATTR_FLAGS], sizeof(flags));
 		get->entrytable[i].flags = flags.flags & flags.mask;
+		if (svc_attrs[IPVS_SVC_ATTR_EST_TIMEOUT])	/* stay compatible with ipvs kernels that lack this attribute */
+			get->entrytable[i].est_timeout = nla_get_u32(svc_attrs[IPVS_SVC_ATTR_EST_TIMEOUT]);
 
 		if (ipvs_parse_stats(&(get->entrytable[i].stats),
 				     svc_attrs[IPVS_SVC_ATTR_STATS]) != 0)
@@ -1452,19 +1450,3 @@ const char *ipvs_strerror(int err)
 
 	return strerror(err);
 }
-
-void ipvs_service_entry_2_user(const ipvs_service_entry_t *entry, ipvs_service_t *user)
-{
-	user->protocol = entry->protocol;
-	user->__addr_v4 = entry->__addr_v4;
-	user->port = entry->port;
-	user->fwmark = entry->fwmark;
-	strcpy(user->sched_name, entry->sched_name);
-	user->flags = entry->flags;
-	user->timeout = entry->timeout;
-	user->netmask = entry->netmask;
-	user->af = entry->af;
-	user->addr = entry->addr;
-	strcpy(user->pe_name, entry->pe_name);
-}
-
diff --git a/tools/keepalived/keepalived/libipvs-2.6/libipvs.h b/tools/keepalived/keepalived/libipvs-2.6/libipvs.h
index 60a617f4..55f810e9 100644
--- a/tools/keepalived/keepalived/libipvs-2.6/libipvs.h
+++ b/tools/keepalived/keepalived/libipvs-2.6/libipvs.h
@@ -38,7 +38,8 @@
 #define OPT_PERSISTENCE_ENGINE	0x400000
 #define OPT_LOCAL_ADDRESS	0x800000
 #define OPT_SYNPROXY		0x1000000
-#define NUMBER_OF_OPT		25
+#define OPT_VS_EST_TIMEOUT	0x2000000
+#define NUMBER_OF_OPT		26
 
 #define MINIMUM_IPVS_VERSION_MAJOR	1
 #define MINIMUM_IPVS_VERSION_MINOR	1
@@ -95,9 +96,6 @@ extern int ipvs_update_service(ipvs_service_t *svc);
 /* update a virtual service based on option */
 extern int ipvs_update_service_by_options(ipvs_service_t *svc, unsigned int options);
 
-/* config the service's synproxy switch */
-extern int ipvs_update_service_synproxy(ipvs_service_t *svc , int enable);
-
 /* delete a virtual service */
 extern int ipvs_del_service(ipvs_service_t *svc);
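
Closing note on the option mask: each OPT_* constant is a single bit in an unsigned int, so OPT_VS_EST_TIMEOUT takes the next free bit (0x2000000, bit 25) after OPT_SYNPROXY, and NUMBER_OF_OPT grows from 25 to 26. A small standalone illustration (the two values are copied from libipvs.h above; the set/test usage is a made-up example):

    #include <stdio.h>

    #define OPT_SYNPROXY        0x1000000   /* bit 24 */
    #define OPT_VS_EST_TIMEOUT  0x2000000   /* bit 25, the 26th option */

    int main(void)
    {
            unsigned int options = 0;

            options |= OPT_VS_EST_TIMEOUT;    /* e.g. set while parsing the CLI */

            if (options & OPT_VS_EST_TIMEOUT) /* the test ipvs_update_service_by_options() uses */
                    printf("est_timeout option set\n");
            return 0;
    }
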