2 * COPYRIGHT: See COPYING in the top level directory
3 * PROJECT: ReactOS TCP/IP protocol driver
4 * FILE: transport/tcp/tcp_ipv4.c
5 * PURPOSE: Transmission Control Protocol
6 * PROGRAMMERS: Casper S. Hornstrup (chorns@users.sourceforge.net)
8 * CSH 15-01-2003 Imported from linux kernel 2.4.20
12 * INET An implementation of the TCP/IP protocol suite for the LINUX
13 * operating system. INET is implemented using the BSD Socket
14 * interface as the means of communication with the user level.
16 * Implementation of the Transmission Control Protocol(TCP).
20 * IPv4 specific functions
25 * linux/ipv4/tcp_input.c
26 * linux/ipv4/tcp_output.c
28 * See tcp.c for author information
30 * This program is free software; you can redistribute it and/or
31 * modify it under the terms of the GNU General Public License
32 * as published by the Free Software Foundation; either version
33 * 2 of the License, or (at your option) any later version.
38 * David S. Miller : New socket lookup architecture.
39 * This code is dedicated to John Dyson.
40 * David S. Miller : Change semantics of established hash,
41 * half is devoted to TIME_WAIT sockets
42 * and the rest go in the other half.
43 * Andi Kleen : Add support for syncookies and fixed
44 * some bugs: ip options weren't passed to
45 * the TCP layer, missed a check for an ACK bit.
46 * Andi Kleen : Implemented fast path mtu discovery.
47 * Fixed many serious bugs in the
48 * open_request handling and moved
49 * most of it into the af independent code.
50 * Added tail drop and some other bugfixes.
51 * Added new listen semantics.
52 * Mike McLagan : Routing by source
53 * Juan Jose Ciarlante: ip_dynaddr bits
54 * Andi Kleen: various fixes.
55 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
56 * Andi Kleen : Fix new listen.
57 * Andi Kleen : Fix accept error reporting.
61 #include <linux/config.h>
62 #include <linux/types.h>
63 #include <linux/fcntl.h>
64 #include <linux/random.h>
65 #include <linux/cache.h>
66 #include <linux/init.h>
71 #include <net/inet_common.h>
73 #include <linux/inet.h>
74 #include <linux/stddef.h>
75 #include <linux/ipsec.h>
81 extern int sysctl_ip_dynaddr;
82 extern int sysctl_ip_default_ttl;
83 int sysctl_tcp_tw_reuse = 0;
85 /* Check TCP sequence numbers in ICMP packets. */
86 #define ICMP_MIN_LENGTH 8
88 /* Socket used for sending RSTs */
90 static struct inode tcp_inode;
91 static struct socket *tcp_socket=&tcp_inode.u.socket_i;
94 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
98 * ALL members must be initialised to prevent gcc-2.7.2.3 miscompilation
101 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
106 __tcp_listening_hash: { NULL, },
107 __tcp_lhash_lock: RW_LOCK_UNLOCKED,
108 __tcp_lhash_users: ATOMIC_INIT(0),
110 __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
111 __tcp_portalloc_lock: SPIN_LOCK_UNLOCKED
116 * This array holds the first and last local port number.
117 * For high-usage systems, use sysctl to change this to
120 int sysctl_local_port_range[2] = { 1024, 4999 };
121 int tcp_port_rover = (1024 - 1);
123 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
124 __u32 faddr, __u16 fport)
126 int h = ((laddr ^ lport) ^ (faddr ^ fport));
129 return h & (tcp_ehash_size - 1);
132 static __inline__ int tcp_sk_hashfn(struct sock *sk)
134 __u32 laddr = sk->rcv_saddr;
135 __u16 lport = sk->num;
136 __u32 faddr = sk->daddr;
137 __u16 fport = sk->dport;
139 return tcp_hashfn(laddr, lport, faddr, fport);
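/* For illustration, a minimal stand-alone model of this kind of 4-tuple
 * folding hash. The intermediate mixing shifts and the table size below are
 * assumptions, not taken from this excerpt; the table size must be a power
 * of two for the final mask to work.
 */
#if 0	/* stand-alone sketch, not compiled into the driver */
#include <stdio.h>

typedef unsigned int   u32_t;
typedef unsigned short u16_t;

static int example_ehash_size = 512;	/* hypothetical, power of two */

static int example_hashfn(u32_t laddr, u16_t lport, u32_t faddr, u16_t fport)
{
	int h = ((laddr ^ lport) ^ (faddr ^ fport));
	h ^= h >> 16;			/* fold high bits down (assumed mixing) */
	h ^= h >> 8;
	return h & (example_ehash_size - 1);
}

int main(void)
{
	/* 10.0.0.1:1025 -> 10.0.0.2:80, addresses given as host-order words */
	printf("bucket %d\n", example_hashfn(0x0a000001, 1025, 0x0a000002, 80));
	return 0;
}
#endif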
142 /* Allocate and initialize a new TCP local port bind bucket.
143 * The bindhash mutex for snum's hash chain must be held here.
145 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
149 struct tcp_bind_bucket *tb;
151 tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
156 if((tb->next = head->chain) != NULL)
157 tb->next->pprev = &tb->next;
159 tb->pprev = &head->chain;
167 /* Caller must disable local BH processing. */
168 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
171 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(child->num)];
172 struct tcp_bind_bucket *tb;
174 spin_lock(&head->lock);
175 tb = (struct tcp_bind_bucket *)sk->prev;
176 if ((child->bind_next = tb->owners) != NULL)
177 tb->owners->bind_pprev = &child->bind_next;
179 child->bind_pprev = &tb->owners;
180 child->prev = (struct sock *) tb;
181 spin_unlock(&head->lock);
185 __inline__ void tcp_inherit_port(struct sock *sk, struct sock *child)
189 __tcp_inherit_port(sk, child);
194 static inline void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, unsigned short snum)
198 if ((sk->bind_next = tb->owners) != NULL)
199 tb->owners->bind_pprev = &sk->bind_next;
201 sk->bind_pprev = &tb->owners;
202 sk->prev = (struct sock *) tb;
206 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
209 struct sock *sk2 = tb->owners;
210 int sk_reuse = sk->reuse;
212 for( ; sk2 != NULL; sk2 = sk2->bind_next) {
215 sk->bound_dev_if == sk2->bound_dev_if) {
218 sk2->state == TCP_LISTEN) {
219 if (!sk2->rcv_saddr ||
221 (sk2->rcv_saddr == sk->rcv_saddr))
232 /* Obtain a reference to a local port for the given sock,
233 * if snum is zero it means select any available local port.
235 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
238 struct tcp_bind_hashbucket *head;
239 struct tcp_bind_bucket *tb;
244 int low = sysctl_local_port_range[0];
245 int high = sysctl_local_port_range[1];
246 int remaining = (high - low) + 1;
249 spin_lock(&tcp_portalloc_lock);
250 rover = tcp_port_rover;
252 if ((rover < low) || (rover > high))
254 head = &tcp_bhash[tcp_bhashfn(rover)];
255 spin_lock(&head->lock);
256 for (tb = head->chain; tb; tb = tb->next)
257 if (tb->port == rover)
261 spin_unlock(&head->lock);
262 } while (--remaining > 0);
263 tcp_port_rover = rover;
264 spin_unlock(&tcp_portalloc_lock);
266 /* Exhausted local port range during search? */
271 /* OK, here is the one we will use. HEAD is
272 * non-NULL and we hold its mutex.
277 head = &tcp_bhash[tcp_bhashfn(snum)];
278 spin_lock(&head->lock);
279 for (tb = head->chain; tb != NULL; tb = tb->next)
280 if (tb->port == snum)
283 if (tb != NULL && tb->owners != NULL) {
286 if (tb->fastreuse > 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
290 if (tcp_bind_conflict(sk, tb))
296 (tb = tcp_bucket_create(head, snum)) == NULL)
298 if (tb->owners == NULL) {
299 if (sk->reuse && sk->state != TCP_LISTEN)
303 } else if (tb->fastreuse &&
304 ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
307 if (sk->prev == NULL)
308 tcp_bind_hash(sk, tb, snum);
309 BUG_TRAP(sk->prev == (struct sock *) tb);
313 spin_unlock(&head->lock);
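/* The search above can be pictured with a small stand-alone model: walk the
 * configured range once, starting at the rover, and take the first port whose
 * bucket is free. The port_in_use[] array is a hypothetical stand-in for the
 * tcp_bhash bookkeeping.
 */
#if 0	/* stand-alone sketch of the rover-based ephemeral port search */
#include <stdio.h>

#define LOW   1024
#define HIGH  4999

static int port_in_use[HIGH + 1];	/* hypothetical stand-in for the bind hash */
static int rover = LOW - 1;

static int pick_local_port(void)
{
	int remaining = (HIGH - LOW) + 1;

	do {
		rover++;
		if (rover < LOW || rover > HIGH)
			rover = LOW;
		if (!port_in_use[rover]) {
			port_in_use[rover] = 1;
			return rover;		/* found a free port */
		}
	} while (--remaining > 0);

	return -1;				/* local port range exhausted */
}

int main(void)
{
	printf("allocated port %d\n", pick_local_port());
	return 0;
}
#endif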
322 /* Get rid of any references to a local port held by the
325 __inline__ void __tcp_put_port(struct sock *sk)
328 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(sk->num)];
329 struct tcp_bind_bucket *tb;
331 spin_lock(&head->lock);
332 tb = (struct tcp_bind_bucket *) sk->prev;
334 sk->bind_next->bind_pprev = sk->bind_pprev;
335 *(sk->bind_pprev) = sk->bind_next;
338 if (tb->owners == NULL) {
340 tb->next->pprev = tb->pprev;
341 *(tb->pprev) = tb->next;
342 kmem_cache_free(tcp_bucket_cachep, tb);
344 spin_unlock(&head->lock);
348 void tcp_put_port(struct sock *sk)
357 /* This lock without WQ_FLAG_EXCLUSIVE is fine on UP but can be very bad on SMP.
358 * When several writers sleep and a reader wakes them up, all but one
359 * immediately hit the write lock and grab all the CPUs. Exclusive sleep solves
360 * this, _but_ remember, it adds useless work on UP machines (a wake up on each
361 * exclusive lock release). It should really be ifdefed.
364 void tcp_listen_wlock(void)
367 write_lock(&tcp_lhash_lock);
369 if (atomic_read(&tcp_lhash_users)) {
370 DECLARE_WAITQUEUE(wait, current);
372 add_wait_queue_exclusive(&tcp_lhash_wait, &wait);
374 set_current_state(TASK_UNINTERRUPTIBLE);
375 if (atomic_read(&tcp_lhash_users) == 0)
377 write_unlock_bh(&tcp_lhash_lock);
379 write_lock_bh(&tcp_lhash_lock);
382 __set_current_state(TASK_RUNNING);
383 remove_wait_queue(&tcp_lhash_wait, &wait);
388 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
394 BUG_TRAP(sk->pprev==NULL);
395 if(listen_possible && sk->state == TCP_LISTEN) {
396 skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
397 lock = &tcp_lhash_lock;
400 skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain;
401 lock = &tcp_ehash[sk->hashent].lock;
404 if((sk->next = *skp) != NULL)
405 (*skp)->pprev = &sk->next;
408 sock_prot_inc_use(sk->prot);
410 if (listen_possible && sk->state == TCP_LISTEN)
411 wake_up(&tcp_lhash_wait);
415 static void tcp_v4_hash(struct sock *sk)
418 if (sk->state != TCP_CLOSE) {
420 __tcp_v4_hash(sk, 1);
426 void tcp_unhash(struct sock *sk)
434 if (sk->state == TCP_LISTEN) {
437 lock = &tcp_lhash_lock;
439 struct tcp_ehash_bucket *head = &tcp_ehash[sk->hashent];
441 write_lock_bh(&head->lock);
446 sk->next->pprev = sk->pprev;
447 *sk->pprev = sk->next;
449 sock_prot_dec_use(sk->prot);
451 write_unlock_bh(lock);
454 if (sk->state == TCP_LISTEN)
455 wake_up(&tcp_lhash_wait);
459 /* Don't inline this cruft. There are some nice properties to
460 * exploit here. The BSD API does not allow a listening TCP
461 * to specify the remote port nor the remote address for the
462 * connection. So always assume those are both wildcarded
463 * during the search since they can never be otherwise.
465 static struct sock *__tcp_v4_lookup_listener(struct sock *sk, u32 daddr, unsigned short hnum, int dif)
468 struct sock *result = NULL;
472 for(; sk; sk = sk->next) {
473 if(sk->num == hnum) {
474 __u32 rcv_saddr = sk->rcv_saddr;
478 if (rcv_saddr != daddr)
482 if (sk->bound_dev_if) {
483 if (sk->bound_dev_if != dif)
489 if (score > hiscore) {
501 /* Optimize the common listener case. */
502 __inline__ struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
507 read_lock(&tcp_lhash_lock);
508 sk = tcp_listening_hash[tcp_lhashfn(hnum)];
510 if (sk->num == hnum &&
512 (!sk->rcv_saddr || sk->rcv_saddr == daddr) &&
515 sk = __tcp_v4_lookup_listener(sk, daddr, hnum, dif);
521 read_unlock(&tcp_lhash_lock);
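/* A stand-alone sketch of the "most specific listener wins" scoring described
 * above: an exact local address and an exact bound device each beat a
 * wildcard. The listener table and field names here are hypothetical
 * simplifications of struct sock.
 */
#if 0	/* stand-alone sketch of listener scoring */
#include <stdio.h>

struct listener {
	unsigned int   rcv_saddr;	/* 0 means wildcard address */
	int            bound_dev_if;	/* 0 means any interface */
	unsigned short num;		/* local port, host order */
};

static struct listener *best_listener(struct listener *tab, int n,
				      unsigned int daddr, unsigned short hnum, int dif)
{
	struct listener *result = NULL;
	int hiscore = 0, i;

	for (i = 0; i < n; i++) {
		struct listener *l = &tab[i];
		int score = 1;

		if (l->num != hnum)
			continue;
		if (l->rcv_saddr) {
			if (l->rcv_saddr != daddr)
				continue;
			score++;		/* exact local address */
		}
		if (l->bound_dev_if) {
			if (l->bound_dev_if != dif)
				continue;
			score++;		/* exact device */
		}
		if (score > hiscore) {
			hiscore = score;
			result = l;
		}
	}
	return result;
}

int main(void)
{
	struct listener tab[] = {
		{ 0,          0, 80 },		/* wildcard listener */
		{ 0x0a000001, 0, 80 },		/* bound to 10.0.0.1 */
	};
	struct listener *l = best_listener(tab, 2, 0x0a000001, 80, 1);
	printf("picked listener %d\n", (int)(l - tab));	/* prints 1 */
	return 0;
}
#endif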
528 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
529 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
531 * Local BH must be disabled here.
534 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
535 u32 daddr, u16 hnum, int dif)
538 struct tcp_ehash_bucket *head;
539 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
540 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
544 /* Optimize here for direct hit, only listening connections can
545 * have wildcards anyway.
547 hash = tcp_hashfn(daddr, hnum, saddr, sport);
548 head = &tcp_ehash[hash];
549 read_lock(&head->lock);
550 for(sk = head->chain; sk; sk = sk->next) {
551 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
552 goto hit; /* You sunk my battleship! */
555 /* Must check for a TIME_WAIT'er before going to listener hash. */
556 for(sk = (head + tcp_ehash_size)->chain; sk; sk = sk->next)
557 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
559 read_unlock(&head->lock);
565 read_unlock(&head->lock);
572 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
573 u32 daddr, u16 hnum, int dif)
578 sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif);
583 return tcp_v4_lookup_listener(daddr, hnum, dif);
589 __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
595 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
604 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
607 return secure_tcp_sequence_number(skb->nh.iph->daddr,
616 /* called with local bh disabled */
617 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
618 struct tcp_tw_bucket **twp)
621 u32 daddr = sk->rcv_saddr;
622 u32 saddr = sk->daddr;
623 int dif = sk->bound_dev_if;
624 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
625 __u32 ports = TCP_COMBINED_PORTS(sk->dport, lport);
626 int hash = tcp_hashfn(daddr, lport, saddr, sk->dport);
627 struct tcp_ehash_bucket *head = &tcp_ehash[hash];
628 struct sock *sk2, **skp;
629 struct tcp_tw_bucket *tw;
631 write_lock(&head->lock);
633 /* Check TIME-WAIT sockets first. */
634 for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL;
636 tw = (struct tcp_tw_bucket*)sk2;
638 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
639 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
641 /* With PAWS, it is safe from the viewpoint
642 of data integrity. Even without PAWS it
643 is safe provided sequence spaces do not
644 overlap i.e. at data rates <= 80Mbit/sec.
646 Actually, the idea is close to VJ's one,
647 only the timestamp cache is held not per host,
648 but per port pair, and the TW bucket is used
651 If the TW bucket has already been destroyed we
652 fall back to VJ's scheme and use the initial
653 timestamp retrieved from the peer table.
655 if (tw->ts_recent_stamp &&
656 (!twp || (sysctl_tcp_tw_reuse &&
657 xtime.tv_sec - tw->ts_recent_stamp > 1))) {
658 if ((tp->write_seq = tw->snd_nxt+65535+2) == 0)
660 tp->ts_recent = tw->ts_recent;
661 tp->ts_recent_stamp = tw->ts_recent_stamp;
671 /* And established part... */
672 for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) {
673 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
678 /* Must record num and sport now. Otherwise we will see
679 * a socket with a funny identity in the hash table. */
681 sk->sport = htons(lport);
682 BUG_TRAP(sk->pprev==NULL);
683 if ((sk->next = *skp) != NULL)
684 (*skp)->pprev = &sk->next;
689 sock_prot_inc_use(sk->prot);
690 write_unlock(&head->lock);
694 NET_INC_STATS_BH(TimeWaitRecycled);
696 /* Silly. Should hash-dance instead... */
697 tcp_tw_deschedule(tw);
698 tcp_timewait_kill(tw);
699 NET_INC_STATS_BH(TimeWaitRecycled);
707 write_unlock(&head->lock);
708 return -EADDRNOTAVAIL;
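/* The reuse test above amounts to: step on a TIME-WAIT bucket only if it
 * recorded timestamps and is at least a second old (when sysctl_tcp_tw_reuse
 * allows it), then continue the sequence space well past its last send.
 * A simplified stand-alone sketch, with hypothetical field names and the
 * !twp branch omitted.
 */
#if 0	/* stand-alone sketch of the TIME-WAIT reuse decision */
#include <stdio.h>
#include <time.h>

struct tw_state {
	long         ts_recent_stamp;	/* wall-clock second of the last seen timestamp */
	unsigned int snd_nxt;		/* next sequence the old connection would send */
};

static int tw_reuse = 1;		/* stand-in for sysctl_tcp_tw_reuse */

static int may_reuse(const struct tw_state *tw, long now,
		     unsigned int *new_write_seq)
{
	if (tw->ts_recent_stamp &&
	    tw_reuse && now - tw->ts_recent_stamp > 1) {
		/* jump far enough ahead that old duplicates cannot be confused */
		*new_write_seq = tw->snd_nxt + 65535 + 2;
		if (*new_write_seq == 0)
			*new_write_seq = 1;
		return 1;
	}
	return 0;
}

int main(void)
{
	struct tw_state tw = { time(NULL) - 5, 1000u };
	unsigned int seq;

	if (may_reuse(&tw, time(NULL), &seq))
		printf("reuse ok, new write_seq=%u\n", seq);
	return 0;
}
#endif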
715 * Bind a port for a connect operation and hash it.
717 static int tcp_v4_hash_connect(struct sock *sk)
720 unsigned short snum = sk->num;
721 struct tcp_bind_hashbucket *head;
722 struct tcp_bind_bucket *tb;
726 int low = sysctl_local_port_range[0];
727 int high = sysctl_local_port_range[1];
728 int remaining = (high - low) + 1;
729 struct tcp_tw_bucket *tw = NULL;
733 /* TODO. Actually it is not such a bad idea to remove
734 * tcp_portalloc_lock before the next submission to Linus.
735 * As soon as we touch this place at all it is time to think.
737 * Right now it protects a single _advisory_ variable, tcp_port_rover,
738 * hence it is mostly useless.
739 * The code will work nicely if we just delete it, but
740 * I am afraid that in the contended case it will work no better or
741 * even worse: another cpu will just hit the same bucket
743 * So some cpu salt could remove both the contention and the
744 * memory pingpong. Any ideas how to do this in a nice way?
746 spin_lock(&tcp_portalloc_lock);
747 rover = tcp_port_rover;
751 if ((rover < low) || (rover > high))
753 head = &tcp_bhash[tcp_bhashfn(rover)];
754 spin_lock(&head->lock);
756 /* Does not bother with rcv_saddr checks,
757 * because the established check is already
760 for (tb = head->chain; tb; tb = tb->next) {
761 if (tb->port == rover) {
762 BUG_TRAP(tb->owners != NULL);
763 if (tb->fastreuse >= 0)
765 if (!__tcp_v4_check_established(sk, rover, &tw))
771 tb = tcp_bucket_create(head, rover);
773 spin_unlock(&head->lock);
780 spin_unlock(&head->lock);
781 } while (--remaining > 0);
782 tcp_port_rover = rover;
783 spin_unlock(&tcp_portalloc_lock);
787 return -EADDRNOTAVAIL;
790 /* All locks still held and bhs disabled */
791 tcp_port_rover = rover;
792 spin_unlock(&tcp_portalloc_lock);
794 tcp_bind_hash(sk, tb, rover);
796 sk->sport = htons(rover);
797 __tcp_v4_hash(sk, 0);
799 spin_unlock(&head->lock);
802 tcp_tw_deschedule(tw);
803 tcp_timewait_kill(tw);
811 head = &tcp_bhash[tcp_bhashfn(snum)];
812 tb = (struct tcp_bind_bucket *)sk->prev;
813 spin_lock_bh(&head->lock);
814 if (tb->owners == sk && sk->bind_next == NULL) {
815 __tcp_v4_hash(sk, 0);
816 spin_unlock_bh(&head->lock);
820 spin_unlock(&head->lock);
821 /* No definite answer... Walk the established hash table */
822 ret = __tcp_v4_check_established(sk, snum, NULL);
831 /* This will initiate an outgoing connection. */
832 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
835 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
836 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
842 if (addr_len < sizeof(struct sockaddr_in))
845 if (usin->sin_family != AF_INET)
846 return(-EAFNOSUPPORT);
848 nexthop = daddr = usin->sin_addr.s_addr;
849 if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) {
852 nexthop = sk->protinfo.af_inet.opt->faddr;
855 tmp = ip_route_connect(&rt, nexthop, sk->saddr,
856 RT_CONN_FLAGS(sk), sk->bound_dev_if);
860 if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
865 __sk_dst_set(sk, &rt->u.dst);
866 sk->route_caps = rt->u.dst.dev->features;
868 if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
872 sk->saddr = rt->rt_src;
873 sk->rcv_saddr = sk->saddr;
875 if (tp->ts_recent_stamp && sk->daddr != daddr) {
876 /* Reset inherited state */
878 tp->ts_recent_stamp = 0;
882 if (sysctl_tcp_tw_recycle &&
883 !tp->ts_recent_stamp &&
884 rt->rt_dst == daddr) {
885 struct inet_peer *peer = rt_get_peer(rt);
887 /* VJ's idea. We save the last timestamp seen from
888 * the destination in the peer table when entering TIME-WAIT state,
889 * and initialize ts_recent from it when trying a new connection.
892 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
893 tp->ts_recent_stamp = peer->tcp_ts_stamp;
894 tp->ts_recent = peer->tcp_ts;
898 sk->dport = usin->sin_port;
901 tp->ext_header_len = 0;
902 if (sk->protinfo.af_inet.opt)
903 tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;
907 /* Socket identity is still unknown (sport may be zero).
908 * However we set the state to SYN-SENT and, without releasing the socket
909 * lock, select a source port, enter ourselves into the hash tables and
910 * complete initialization after this.
912 tcp_set_state(sk, TCP_SYN_SENT);
913 err = tcp_v4_hash_connect(sk);
918 tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
919 sk->sport, usin->sin_port);
921 sk->protinfo.af_inet.id = tp->write_seq^jiffies;
923 err = tcp_connect(sk);
930 tcp_set_state(sk, TCP_CLOSE);
940 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
943 return ((struct rtable*)skb->dst)->rt_iif;
949 static __inline__ unsigned tcp_v4_synq_hash(u32 raddr, u16 rport)
952 unsigned h = raddr ^ rport;
955 return h&(TCP_SYNQ_HSIZE-1);
961 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
962 struct open_request ***prevp,
964 __u32 raddr, __u32 laddr)
967 struct tcp_listen_opt *lopt = tp->listen_opt;
968 struct open_request *req, **prev;
970 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport)];
971 (req = *prev) != NULL;
972 prev = &req->dl_next) {
973 if (req->rmt_port == rport &&
974 req->af.v4_req.rmt_addr == raddr &&
975 req->af.v4_req.loc_addr == laddr &&
976 TCP_INET_FAMILY(req->class->family)) {
977 BUG_TRAP(req->sk == NULL);
989 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
992 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
993 struct tcp_listen_opt *lopt = tp->listen_opt;
994 unsigned h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port);
996 req->expires = jiffies + TCP_TIMEOUT_INIT;
999 req->dl_next = lopt->syn_table[h];
1001 write_lock(&tp->syn_wait_lock);
1002 lopt->syn_table[h] = req;
1003 write_unlock(&tp->syn_wait_lock);
1011 * This routine does path mtu discovery as defined in RFC1191.
1013 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
1016 struct dst_entry *dst;
1017 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1019 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
1020 * sent out by Linux are always < 576 bytes so they should go through
1023 if (sk->state == TCP_LISTEN)
1026 /* We don't check in the dst entry whether pmtu discovery is forbidden
1027 * on this route. We just assume that no packet-too-big packets
1028 * are sent back when pmtu discovery is not active.
1029 * There is a small race when the user changes this flag in the
1030 * route, but I think that's acceptable.
1032 if ((dst = __sk_dst_check(sk, 0)) == NULL)
1035 ip_rt_update_pmtu(dst, mtu);
1037 /* Something is about to go wrong... Remember the soft error
1038 * in case this connection will not be able to recover.
1040 if (mtu < dst->pmtu && ip_dont_fragment(sk, dst))
1041 sk->err_soft = EMSGSIZE;
1043 if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT &&
1044 tp->pmtu_cookie > dst->pmtu) {
1045 tcp_sync_mss(sk, dst->pmtu);
1047 /* Resend the TCP packet because it's
1048 * clear that the old packet has been
1049 * dropped. This is the new "fast" path mtu
1052 tcp_simple_retransmit(sk);
1053 } /* else let the usual retransmit timer handle it */
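/* Reduced to a stand-alone model (the names and the 40-byte header overhead
 * are illustrative assumptions): shrink the cached MSS only when the reported
 * MTU is below what we have been assuming, otherwise leave the segment to the
 * usual retransmit timer.
 */
#if 0	/* stand-alone sketch of reacting to an ICMP FRAG_NEEDED report */
#include <stdio.h>

#define HEADER_OVERHEAD 40	/* IPv4 + TCP headers, no options, for the sketch */

struct conn {
	unsigned int pmtu_cookie;	/* path MTU we have been assuming */
	unsigned int mss;
};

static void pmtu_report(struct conn *c, unsigned int reported_mtu)
{
	if (reported_mtu >= c->pmtu_cookie)
		return;			/* stale or useless report: ignore */

	c->pmtu_cookie = reported_mtu;
	c->mss = reported_mtu - HEADER_OVERHEAD;
	/* ...and the caller would retransmit the dropped segment immediately */
}

int main(void)
{
	struct conn c = { 1500, 1460 };

	pmtu_report(&c, 576);
	printf("new mss %u\n", c.mss);	/* 536 */
	return 0;
}
#endif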
1058 * This routine is called by the ICMP module when it gets some
1059 * sort of error condition. If err < 0 then the socket should
1060 * be closed and the error returned to the user. If err > 0
1061 * it's just the icmp type << 8 | icmp code. After adjustment
1062 * header points to the first 8 bytes of the tcp header. We need
1063 * to find the appropriate port.
1065 * The locking strategy used here is very "optimistic". When
1066 * someone else accesses the socket the ICMP is just dropped
1067 * and for some paths there is no check at all.
1068 * A more general error queue to queue errors for later handling
1069 * is probably better.
1073 void tcp_v4_err(struct sk_buff *skb, u32 info)
1076 struct iphdr *iph = (struct iphdr*)skb->data;
1077 struct tcphdr *th = (struct tcphdr*)(skb->data+(iph->ihl<<2));
1079 int type = skb->h.icmph->type;
1080 int code = skb->h.icmph->code;
1085 if (skb->len < (iph->ihl << 2) + 8) {
1086 ICMP_INC_STATS_BH(IcmpInErrors);
1090 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb));
1092 ICMP_INC_STATS_BH(IcmpInErrors);
1095 if (sk->state == TCP_TIME_WAIT) {
1096 tcp_tw_put((struct tcp_tw_bucket*)sk);
1101 /* If too many ICMPs get dropped on busy
1102 * servers this needs to be solved differently.
1104 if (sk->lock.users != 0)
1105 NET_INC_STATS_BH(LockDroppedIcmps);
1107 if (sk->state == TCP_CLOSE)
1110 tp = &sk->tp_pinfo.af_tcp;
1111 seq = ntohl(th->seq);
1112 if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
1113 NET_INC_STATS(OutOfWindowIcmps);
1118 case ICMP_SOURCE_QUENCH:
1119 /* This is deprecated, but if someone generated it,
1120 * we have no reason to ignore it.
1122 if (sk->lock.users == 0)
1125 case ICMP_PARAMETERPROB:
1128 case ICMP_DEST_UNREACH:
1129 if (code > NR_ICMP_UNREACH)
1132 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1133 if (sk->lock.users == 0)
1134 do_pmtu_discovery(sk, iph, info);
1138 err = icmp_err_convert[code].errno;
1140 case ICMP_TIME_EXCEEDED:
1147 switch (sk->state) {
1148 struct open_request *req, **prev;
1150 if (sk->lock.users != 0)
1153 req = tcp_v4_search_req(tp, &prev,
1155 iph->daddr, iph->saddr);
1159 /* ICMPs are not backlogged, hence we cannot get
1160 an established socket here.
1162 BUG_TRAP(req->sk == NULL);
1164 if (seq != req->snt_isn) {
1165 NET_INC_STATS_BH(OutOfWindowIcmps);
1170 * Still in SYN_RECV, just remove it silently.
1171 * There is no good way to pass the error to the newly
1172 * created socket, and POSIX does not want network
1173 * errors returned from accept().
1175 tcp_synq_drop(sk, req, prev);
1179 case TCP_SYN_RECV: /* Cannot happen.
1180 It can, e.g., if SYNs crossed.
1182 if (sk->lock.users == 0) {
1183 TCP_INC_STATS_BH(TcpAttemptFails);
1186 sk->error_report(sk);
1195 /* If we've already connected we will keep trying
1196 * until we time out, or the user gives up.
1198 * rfc1122 4.2.3.9 allows us to consider as hard errors
1199 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1200 * but it is obsoleted by pmtu discovery).
1202 * Note that in the modern internet, where routing is unreliable
1203 * and broken firewalls sit in every dark corner, sending random
1204 * errors ordered by their masters, even these two messages finally lose
1205 * their original sense (even Linux sends invalid PORT_UNREACHs)
1207 * Now we are in compliance with RFCs.
1211 if (sk->lock.users == 0 && sk->protinfo.af_inet.recverr) {
1213 sk->error_report(sk);
1214 } else { /* Only an error on timeout */
1224 /* This routine computes an IPv4 TCP checksum. */
1225 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1226 struct sk_buff *skb)
1229 if (skb->ip_summed == CHECKSUM_HW) {
1230 th->check = ~tcp_v4_check(th, len, sk->saddr, sk->daddr, 0);
1231 skb->csum = offsetof(struct tcphdr, check);
1233 th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
1234 csum_partial((char *)th, th->doff<<2, skb->csum));
1240 * This routine will send an RST to the other tcp.
1242 * Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
1244 * Answer: if a packet caused an RST, it is not for a socket
1245 * existing in our system; if it is matched to a socket,
1246 * it is just a duplicate segment or a bug in the other side's TCP.
1247 * So we build the reply based only on the parameters
1248 * that arrived with the segment.
1249 * Exception: precedence violation. We do not implement it in any case.
1252 static void tcp_v4_send_reset(struct sk_buff *skb)
1255 struct tcphdr *th = skb->h.th;
1257 struct ip_reply_arg arg;
1259 /* Never send a reset in response to a reset. */
1263 if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL)
1266 /* Swap the send and the receive. */
1267 memset(&rth, 0, sizeof(struct tcphdr));
1268 rth.dest = th->source;
1269 rth.source = th->dest;
1270 rth.doff = sizeof(struct tcphdr)/4;
1274 rth.seq = th->ack_seq;
1277 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin
1278 + skb->len - (th->doff<<2));
1281 memset(&arg, 0, sizeof arg);
1282 arg.iov[0].iov_base = (unsigned char *)&rth;
1283 arg.iov[0].iov_len = sizeof rth;
1284 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1285 skb->nh.iph->saddr, /*XXX*/
1286 sizeof(struct tcphdr),
1290 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1292 tcp_socket->sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl;
1293 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1295 TCP_INC_STATS_BH(TcpOutSegs);
1296 TCP_INC_STATS_BH(TcpOutRsts);
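/* The field choice above follows RFC 793: if the offending segment carried an
 * ACK, the reset's sequence number is taken from that ack field; otherwise the
 * reset ACKs everything the segment occupied. A stand-alone sketch:
 */
#if 0	/* stand-alone sketch of choosing RST seq/ack per RFC 793 */
#include <stdio.h>

struct seg {
	unsigned int seq, ack_seq;
	int syn, fin, ack;
	unsigned int payload_len;
};

static void build_rst(const struct seg *in, struct seg *rst)
{
	rst->syn = rst->fin = 0;
	if (in->ack) {
		rst->seq = in->ack_seq;
		rst->ack = 0;
	} else {
		rst->seq = 0;
		rst->ack = 1;
		rst->ack_seq = in->seq + in->syn + in->fin + in->payload_len;
	}
}

int main(void)
{
	struct seg in = { 1000, 0, 1, 0, 0, 0 };	/* a stray SYN */
	struct seg rst;

	build_rst(&in, &rst);
	printf("RST seq=%u ack=%u\n", rst.seq, rst.ack ? rst.ack_seq : 0);
	return 0;
}
#endif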
1300 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1301 outside of socket context, is certainly ugly. What can I do?
1304 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts)
1307 struct tcphdr *th = skb->h.th;
1312 struct ip_reply_arg arg;
1314 memset(&rep.th, 0, sizeof(struct tcphdr));
1315 memset(&arg, 0, sizeof arg);
1317 arg.iov[0].iov_base = (unsigned char *)&rep;
1318 arg.iov[0].iov_len = sizeof(rep.th);
1321 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) |
1322 (TCPOPT_NOP << 16) |
1323 (TCPOPT_TIMESTAMP << 8) |
1325 rep.tsopt[1] = htonl(tcp_time_stamp);
1326 rep.tsopt[2] = htonl(ts);
1327 arg.iov[0].iov_len = sizeof(rep);
1330 /* Swap the send and the receive. */
1331 rep.th.dest = th->source;
1332 rep.th.source = th->dest;
1333 rep.th.doff = arg.iov[0].iov_len/4;
1334 rep.th.seq = htonl(seq);
1335 rep.th.ack_seq = htonl(ack);
1337 rep.th.window = htons(win);
1339 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1340 skb->nh.iph->saddr, /*XXX*/
1344 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1346 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1348 TCP_INC_STATS_BH(TcpOutSegs);
1352 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1355 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1357 tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt,
1358 tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent);
1364 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1367 tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd,
1372 static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req)
1376 struct ip_options *opt;
1378 opt = req->af.v4_req.opt;
1379 if(ip_route_output(&rt, ((opt && opt->srr) ?
1381 req->af.v4_req.rmt_addr),
1382 req->af.v4_req.loc_addr,
1383 RT_CONN_FLAGS(sk), sk->bound_dev_if)) {
1384 IP_INC_STATS_BH(IpOutNoRoutes);
1387 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1389 IP_INC_STATS_BH(IpOutNoRoutes);
1399 * Send a SYN-ACK after having received a SYN.
1400 * This still operates on an open_request only, not on a big
1403 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1404 struct dst_entry *dst)
1408 struct sk_buff * skb;
1410 /* First, grab a route. */
1412 (dst = tcp_v4_route_req(sk, req)) == NULL)
1415 skb = tcp_make_synack(sk, dst, req);
1418 struct tcphdr *th = skb->h.th;
1420 th->check = tcp_v4_check(th, skb->len,
1421 req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
1422 csum_partial((char *)th, skb->len, skb->csum));
1424 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1425 req->af.v4_req.rmt_addr, req->af.v4_req.opt);
1426 if (err == NET_XMIT_CN)
1439 * IPv4 open_request destructor.
1441 static void tcp_v4_or_free(struct open_request *req)
1444 if (req->af.v4_req.opt)
1445 kfree(req->af.v4_req.opt);
1449 static inline void syn_flood_warning(struct sk_buff *skb)
1452 static unsigned long warntime;
1454 if (jiffies - warntime > HZ*60) {
1457 "possible SYN flooding on port %d. Sending cookies.\n",
1458 ntohs(skb->h.th->dest));
1464 * Save and compile IPv4 options into the open_request if needed.
1466 static inline struct ip_options *
1467 tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
1470 struct ip_options *opt = &(IPCB(skb)->opt);
1471 struct ip_options *dopt = NULL;
1473 if (opt && opt->optlen) {
1474 int opt_size = optlength(opt);
1475 dopt = kmalloc(opt_size, GFP_ATOMIC);
1477 if (ip_options_echo(dopt, skb)) {
1490 * Maximum number of SYN_RECV sockets in the queue per LISTEN socket.
1491 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1492 * It would be better to replace it with a global counter for all sockets,
1493 * but then some measure against one socket starving all other sockets
1496 * It was 128 by default. Experiments with real servers show that
1497 * this is absolutely not enough even at 100 conn/sec. 256 cures most
1498 * of the problems. This value is adjusted to 128 for very small machines
1499 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
1500 * Increasing it further requires changing the hash table size.
1502 int sysctl_max_syn_backlog = 256;
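/* The sizing rule described above, as a stand-alone sketch; the thresholds
 * come from the comment, the memory figure is a hypothetical input, and the
 * adjustment itself is presumably done at initialization, outside this
 * excerpt.
 */
#if 0	/* stand-alone sketch of scaling the SYN backlog with machine memory */
#include <stdio.h>

static int scale_syn_backlog(unsigned long ram_mb)
{
	if (ram_mb <= 32)
		return 128;	/* very small machines */
	if (ram_mb >= 256)
		return 1024;	/* normal or better */
	return 256;		/* the compiled-in default */
}

int main(void)
{
	printf("%d\n", scale_syn_backlog(128));	/* 256 */
	return 0;
}
#endif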
1505 struct or_calltable or_ipv4 = {
1514 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1518 struct open_request *req;
1519 __u32 saddr = skb->nh.iph->saddr;
1520 __u32 daddr = skb->nh.iph->daddr;
1521 __u32 isn = TCP_SKB_CB(skb)->when;
1522 struct dst_entry *dst = NULL;
1523 #ifdef CONFIG_SYN_COOKIES
1524 int want_cookie = 0;
1526 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1529 /* Never answer to SYNs sent to broadcast or multicast */
1530 if (((struct rtable *)skb->dst)->rt_flags &
1531 (RTCF_BROADCAST|RTCF_MULTICAST))
1534 /* TW buckets are converted to open requests without
1535 * limitation; they conserve resources and the peer is
1536 * evidently a real one.
1538 if (tcp_synq_is_full(sk) && !isn) {
1539 #ifdef CONFIG_SYN_COOKIES
1540 if (sysctl_tcp_syncookies) {
1547 /* The accept backlog is full. If we have already queued enough
1548 * warm entries in the syn queue, drop the request. It is better than
1549 * clogging the syn queue with openreqs with exponentially increasing
1552 if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1555 req = tcp_openreq_alloc();
1559 tcp_clear_options(&tp);
1561 tp.user_mss = sk->tp_pinfo.af_tcp.user_mss;
1563 tcp_parse_options(skb, &tp, 0);
1566 tcp_clear_options(&tp);
1570 if (tp.saw_tstamp && tp.rcv_tsval == 0) {
1571 /* Some OSes (unknown ones, but I see them on a web server that
1572 * contains information interesting only for Windows
1573 * users) do not send their stamp in the SYN. It is the easy case:
1574 * We simply do not advertise TS support.
1579 tp.tstamp_ok = tp.saw_tstamp;
1581 tcp_openreq_init(req, &tp, skb);
1583 req->af.v4_req.loc_addr = daddr;
1584 req->af.v4_req.rmt_addr = saddr;
1585 req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1586 req->class = &or_ipv4;
1588 TCP_ECN_create_request(req, skb->h.th);
1591 #ifdef CONFIG_SYN_COOKIES
1592 syn_flood_warning(skb);
1594 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1595 } else if (isn == 0) {
1596 struct inet_peer *peer = NULL;
1598 /* VJ's idea. We save the last timestamp seen
1599 * from the destination in the peer table when entering
1600 * TIME-WAIT state, and check against it before
1601 * accepting a new connection request.
1603 * If "isn" is not zero, this request hit an alive
1604 * timewait bucket, so all the necessary checks
1605 * are made in the function processing the timewait state.
1607 if (tp.saw_tstamp &&
1608 sysctl_tcp_tw_recycle &&
1609 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1610 (peer = rt_get_peer((struct rtable*)dst)) != NULL &&
1611 peer->v4daddr == saddr) {
1612 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1613 (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) {
1614 NET_INC_STATS_BH(PAWSPassiveRejected);
1619 /* Kill the following clause, if you dislike this way. */
1620 else if (!sysctl_tcp_syncookies &&
1621 (sysctl_max_syn_backlog - tcp_synq_len(sk)
1622 < (sysctl_max_syn_backlog>>2)) &&
1623 (!peer || !peer->tcp_ts_stamp) &&
1624 (!dst || !dst->rtt)) {
1625 /* Without syncookies the last quarter of the
1626 * backlog is reserved for destinations proven to be alive.
1627 * It means that we continue to communicate
1628 * with destinations already remembered
1629 * by the moment of the synflood.
1631 NETDEBUG(if (net_ratelimit()) \
1632 printk(KERN_DEBUG "TCP: drop open request from %u.%u.%u.%u/%u\n", \
1633 NIPQUAD(saddr), ntohs(skb->h.th->source)));
1638 isn = tcp_v4_init_sequence(sk, skb);
1642 if (tcp_v4_send_synack(sk, req, dst))
1646 tcp_openreq_free(req);
1648 tcp_v4_synq_add(sk, req);
1653 tcp_openreq_free(req);
1655 TCP_INC_STATS_BH(TcpAttemptFails);
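/* A stand-alone sketch of the PAWS test used in tcp_v4_conn_request above:
 * reject the SYN if the peer's remembered timestamp is still fresh and
 * strictly newer than the one the SYN carries. The constants are illustrative
 * stand-ins for TCP_PAWS_MSL and TCP_PAWS_WINDOW.
 */
#if 0	/* stand-alone sketch of the PAWS check against a remembered peer */
#include <stdio.h>
#include <time.h>

#define PAWS_MSL	60	/* seconds a remembered stamp stays trustworthy (illustrative) */
#define PAWS_WINDOW	1	/* allowed timestamp slack (illustrative) */

static int paws_reject(long now, long peer_ts_stamp,
		       unsigned int peer_ts, unsigned int syn_ts)
{
	return now < peer_ts_stamp + PAWS_MSL &&
	       (int)(peer_ts - syn_ts) > PAWS_WINDOW;
}

int main(void)
{
	long now = time(NULL);

	/* peer was heard from 10s ago with timestamp 5000; the SYN carries 4000 */
	printf("%s\n", paws_reject(now, now - 10, 5000, 4000) ? "reject" : "accept");
	return 0;
}
#endif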
1664 * The three way handshake has completed - we got a valid ACK -
1665 * now create the new socket.
1667 struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1668 struct open_request *req,
1669 struct dst_entry *dst)
1672 struct tcp_opt *newtp;
1675 if (tcp_acceptq_is_full(sk))
1679 (dst = tcp_v4_route_req(sk, req)) == NULL)
1682 newsk = tcp_create_openreq_child(sk, req, skb);
1686 newsk->dst_cache = dst;
1687 newsk->route_caps = dst->dev->features;
1689 newtp = &(newsk->tp_pinfo.af_tcp);
1690 newsk->daddr = req->af.v4_req.rmt_addr;
1691 newsk->saddr = req->af.v4_req.loc_addr;
1692 newsk->rcv_saddr = req->af.v4_req.loc_addr;
1693 newsk->protinfo.af_inet.opt = req->af.v4_req.opt;
1694 req->af.v4_req.opt = NULL;
1695 newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb);
1696 newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl;
1697 newtp->ext_header_len = 0;
1698 if (newsk->protinfo.af_inet.opt)
1699 newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen;
1700 newsk->protinfo.af_inet.id = newtp->write_seq^jiffies;
1702 tcp_sync_mss(newsk, dst->pmtu);
1703 newtp->advmss = dst->advmss;
1704 tcp_initialize_rcv_mss(newsk);
1706 __tcp_v4_hash(newsk, 0);
1707 __tcp_inherit_port(sk, newsk);
1712 NET_INC_STATS_BH(ListenOverflows);
1714 NET_INC_STATS_BH(ListenDrops);
1722 static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
1725 struct open_request *req, **prev;
1726 struct tcphdr *th = skb->h.th;
1727 struct iphdr *iph = skb->nh.iph;
1728 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1731 /* Find possible connection requests. */
1732 req = tcp_v4_search_req(tp, &prev,
1734 iph->saddr, iph->daddr);
1736 return tcp_check_req(sk, skb, req, prev);
1738 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1745 if (nsk->state != TCP_TIME_WAIT) {
1749 tcp_tw_put((struct tcp_tw_bucket*)nsk);
1753 #ifdef CONFIG_SYN_COOKIES
1754 if (!th->rst && !th->syn && th->ack)
1755 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1763 static int tcp_v4_checksum_init(struct sk_buff *skb)
1766 if (skb->ip_summed == CHECKSUM_HW) {
1767 skb->ip_summed = CHECKSUM_UNNECESSARY;
1768 if (!tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1769 skb->nh.iph->daddr,skb->csum))
1772 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1773 skb->ip_summed = CHECKSUM_NONE;
1775 if (skb->len <= 76) {
1776 if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1778 skb_checksum(skb, 0, skb->len, 0)))
1780 skb->ip_summed = CHECKSUM_UNNECESSARY;
1782 skb->csum = ~tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1783 skb->nh.iph->daddr,0);
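/* For reference, the value being checked above is the standard Internet
 * one's-complement checksum over a pseudo-header (source and destination
 * addresses, zero, protocol 6, TCP length) followed by the TCP segment with
 * its checksum field zeroed. A stand-alone sketch:
 */
#if 0	/* stand-alone sketch of the TCP/IPv4 pseudo-header checksum */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static uint16_t csum_fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

static uint16_t tcp_checksum(uint32_t saddr, uint32_t daddr,
			     const uint8_t *seg, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	/* pseudo-header: addresses, zero + protocol (6), TCP length */
	sum += (saddr >> 16) + (saddr & 0xffff);
	sum += (daddr >> 16) + (daddr & 0xffff);
	sum += 6;
	sum += (uint32_t)len;

	for (i = 0; i + 1 < len; i += 2)
		sum += ((uint32_t)seg[i] << 8) | seg[i + 1];
	if (len & 1)
		sum += (uint32_t)seg[len - 1] << 8;	/* pad the odd trailing byte */

	return csum_fold(sum);
}

int main(void)
{
	uint8_t seg[20] = { 0 };	/* a bare TCP header, checksum field zeroed */

	printf("0x%04x\n", tcp_checksum(0x0a000001, 0x0a000002, seg, sizeof seg));
	return 0;
}
#endif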
1792 /* The socket must have its spinlock held when we get
1795 * We have a potential double-lock case here, so even when
1796 * doing backlog processing we use the BH locking scheme.
1797 * This is because we cannot sleep with the original spinlock
1800 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1803 #ifdef CONFIG_FILTER
1804 struct sk_filter *filter = sk->filter;
1805 if (filter && sk_filter(skb, filter))
1807 #endif /* CONFIG_FILTER */
1809 IP_INC_STATS_BH(IpInDelivers);
1811 if (sk->state == TCP_ESTABLISHED) { /* Fast path */
1812 TCP_CHECK_TIMER(sk);
1813 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1815 TCP_CHECK_TIMER(sk);
1819 if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb))
1822 if (sk->state == TCP_LISTEN) {
1823 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1828 if (tcp_child_process(sk, nsk, skb))
1834 TCP_CHECK_TIMER(sk);
1835 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1837 TCP_CHECK_TIMER(sk);
1841 tcp_v4_send_reset(skb);
1844 /* Be careful here. If this function gets more complicated and
1845 * gcc suffers from register pressure on the x86, sk (in %ebx)
1846 * might be destroyed here. This current version compiles correctly,
1847 * but you have been warned.
1852 TCP_INC_STATS_BH(TcpInErrs);
1863 int tcp_v4_rcv(struct sk_buff *skb)
1870 if (skb->pkt_type!=PACKET_HOST)
1873 /* Count it even if it's bad */
1874 TCP_INC_STATS_BH(TcpInSegs);
1876 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1881 if (th->doff < sizeof(struct tcphdr)/4)
1883 if (!pskb_may_pull(skb, th->doff*4))
1886 /* An explanation is required here, I think.
1887 * Packet length and doff are validated by header prediction,
1888 * provided the case of th->doff==0 is eliminated.
1889 * So, we defer the checks. */
1890 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1891 tcp_v4_checksum_init(skb) < 0))
1895 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1896 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1897 skb->len - th->doff*4);
1898 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1899 TCP_SKB_CB(skb)->when = 0;
1900 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1901 TCP_SKB_CB(skb)->sacked = 0;
1903 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1904 skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1910 if(!ipsec_sk_policy(sk,skb))
1911 goto discard_and_relse;
1913 if (sk->state == TCP_TIME_WAIT)
1920 if (!sk->lock.users) {
1921 if (!tcp_prequeue(sk, skb))
1922 ret = tcp_v4_do_rcv(sk, skb);
1924 sk_add_backlog(sk, skb);
1932 if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1934 TCP_INC_STATS_BH(TcpInErrs);
1936 tcp_v4_send_reset(skb);
1940 /* Discard frame. */
1949 if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1950 TCP_INC_STATS_BH(TcpInErrs);
1951 goto discard_and_relse;
1953 switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1954 skb, th, skb->len)) {
1959 sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1961 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1962 tcp_timewait_kill((struct tcp_tw_bucket *)sk);
1963 tcp_tw_put((struct tcp_tw_bucket *)sk);
1967 /* Fall through to ACK */
1970 tcp_v4_timewait_ack(sk, skb);
1974 case TCP_TW_SUCCESS:;
1980 /* With per-bucket locks this operation is not atomic, so
1981 * this version is no worse.
1983 static void __tcp_v4_rehash(struct sock *sk)
1986 sk->prot->unhash(sk);
1991 static int tcp_v4_reselect_saddr(struct sock *sk)
1996 __u32 old_saddr = sk->saddr;
1998 __u32 daddr = sk->daddr;
2000 if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
2001 daddr = sk->protinfo.af_inet.opt->faddr;
2003 /* Query new route. */
2004 err = ip_route_connect(&rt, daddr, 0,
2005 RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute,
2010 __sk_dst_set(sk, &rt->u.dst);
2011 sk->route_caps = rt->u.dst.dev->features;
2013 new_saddr = rt->rt_src;
2015 if (new_saddr == old_saddr)
2018 if (sysctl_ip_dynaddr > 1) {
2019 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
2020 "from %d.%d.%d.%d to %d.%d.%d.%d\n",
2022 NIPQUAD(new_saddr));
2025 sk->saddr = new_saddr;
2026 sk->rcv_saddr = new_saddr;
2028 /* XXX The only ugly spot where we need to
2029 * XXX really change the socket's identity after
2030 * XXX it has entered the hashes. -DaveM
2032 * Besides that, it does not check for connection
2033 * uniqueness. Wait for trouble.
2035 __tcp_v4_rehash(sk);
2042 int tcp_v4_rebuild_header(struct sock *sk)
2045 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
2049 /* Route is OK, nothing to do. */
2055 if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
2056 daddr = sk->protinfo.af_inet.opt->faddr;
2058 err = ip_route_output(&rt, daddr, sk->saddr,
2059 RT_CONN_FLAGS(sk), sk->bound_dev_if);
2061 __sk_dst_set(sk, &rt->u.dst);
2062 sk->route_caps = rt->u.dst.dev->features;
2066 /* Routing failed... */
2069 if (!sysctl_ip_dynaddr ||
2070 sk->state != TCP_SYN_SENT ||
2071 (sk->userlocks & SOCK_BINDADDR_LOCK) ||
2072 (err = tcp_v4_reselect_saddr(sk)) != 0)
2081 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
2084 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
2086 sin->sin_family = AF_INET;
2087 sin->sin_addr.s_addr = sk->daddr;
2088 sin->sin_port = sk->dport;
2092 /* VJ's idea. Save the last timestamp seen from this destination
2093 * and hold it at least for the normal timewait interval, to use for duplicate
2094 * segment detection in subsequent connections, before they enter synchronized
2098 int tcp_v4_remember_stamp(struct sock *sk)
2101 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2102 struct rtable *rt = (struct rtable*)__sk_dst_get(sk);
2103 struct inet_peer *peer = NULL;
2106 if (rt == NULL || rt->rt_dst != sk->daddr) {
2107 peer = inet_getpeer(sk->daddr, 1);
2110 if (rt->peer == NULL)
2111 rt_bind_peer(rt, 1);
2116 if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
2117 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2118 peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
2119 peer->tcp_ts_stamp = tp->ts_recent_stamp;
2120 peer->tcp_ts = tp->ts_recent;
2133 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
2136 struct inet_peer *peer = NULL;
2138 peer = inet_getpeer(tw->daddr, 1);
2141 if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 ||
2142 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2143 peer->tcp_ts_stamp <= tw->ts_recent_stamp)) {
2144 peer->tcp_ts_stamp = tw->ts_recent_stamp;
2145 peer->tcp_ts = tw->ts_recent;
2158 struct tcp_func ipv4_specific = {
2161 tcp_v4_rebuild_header,
2162 tcp_v4_conn_request,
2163 tcp_v4_syn_recv_sock,
2164 tcp_v4_remember_stamp,
2165 sizeof(struct iphdr),
2170 sizeof(struct sockaddr_in)
2174 /* NOTE: A lot of things are set to zero explicitly by the call to
2175 * sk_alloc(), so they need not be done here.
2177 static int tcp_v4_init_sock(struct sock *sk)
2180 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2182 skb_queue_head_init(&tp->out_of_order_queue);
2183 tcp_init_xmit_timers(sk);
2184 tcp_prequeue_init(tp);
2186 tp->rto = TCP_TIMEOUT_INIT;
2187 tp->mdev = TCP_TIMEOUT_INIT;
2189 /* So many TCP implementations out there (incorrectly) count the
2190 * initial SYN frame in their delayed-ACK and congestion control
2191 * algorithms that we must have the following bandaid to talk
2192 * efficiently to them. -DaveM
2196 /* See draft-stevens-tcpca-spec-01 for discussion of the
2197 * initialization of these values.
2199 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2200 tp->snd_cwnd_clamp = ~0;
2201 tp->mss_cache = 536;
2203 tp->reordering = sysctl_tcp_reordering;
2205 sk->state = TCP_CLOSE;
2207 sk->write_space = tcp_write_space;
2208 sk->use_write_queue = 1;
2210 sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
2212 sk->sndbuf = sysctl_tcp_wmem[1];
2213 sk->rcvbuf = sysctl_tcp_rmem[1];
2215 atomic_inc(&tcp_sockets_allocated);
2223 static int tcp_v4_destroy_sock(struct sock *sk)
2226 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2228 tcp_clear_xmit_timers(sk);
2230 /* Clean up the write buffer. */
2231 tcp_writequeue_purge(sk);
2233 /* Cleans up our, hopefully empty, out_of_order_queue. */
2234 __skb_queue_purge(&tp->out_of_order_queue);
2236 /* Clean the prequeue; it really must be empty */
2237 __skb_queue_purge(&tp->ucopy.prequeue);
2239 /* Clean up a referenced TCP bind bucket. */
2240 if(sk->prev != NULL)
2243 /* If sendmsg cached page exists, toss it. */
2244 if (tp->sndmsg_page != NULL)
2245 __free_page(tp->sndmsg_page);
2247 atomic_dec(&tcp_sockets_allocated);
2255 /* Proc filesystem TCP sock list dumping. */
2256 static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i, int uid)
2259 int ttd = req->expires - jiffies;
2261 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2262 " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
2264 req->af.v4_req.loc_addr,
2266 req->af.v4_req.rmt_addr,
2267 ntohs(req->rmt_port),
2269 0,0, /* could print option size, but that is af dependent. */
2270 1, /* timers active (only the expire timer) */
2274 0, /* non standard timer */
2275 0, /* open_requests have no inode */
2276 atomic_read(&sk->refcnt),
2282 static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
2285 unsigned int dest, src;
2288 unsigned long timer_expires;
2289 struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;
2292 src = sp->rcv_saddr;
2293 destp = ntohs(sp->dport);
2294 srcp = ntohs(sp->sport);
2295 if (tp->pending == TCP_TIME_RETRANS) {
2297 timer_expires = tp->timeout;
2298 } else if (tp->pending == TCP_TIME_PROBE0) {
2300 timer_expires = tp->timeout;
2301 } else if (timer_pending(&sp->timer)) {
2303 timer_expires = sp->timer.expires;
2306 timer_expires = jiffies;
2309 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2310 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d",
2311 i, src, srcp, dest, destp, sp->state,
2312 tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
2313 timer_active, timer_expires-jiffies,
2318 atomic_read(&sp->refcnt), sp,
2319 tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong,
2320 tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh
2325 static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2328 unsigned int dest, src;
2330 int ttd = tw->ttd - jiffies;
2336 src = tw->rcv_saddr;
2337 destp = ntohs(tw->dport);
2338 srcp = ntohs(tw->sport);
2340 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2341 " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2342 i, src, srcp, dest, destp, tw->substate, 0, 0,
2344 atomic_read(&tw->refcnt), tw);
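/* Each row built above encodes address:port pairs as fixed-width hex
 * ("%08X:%04X"); the address is printed as the raw 32-bit value it is stored
 * as (network byte order), the port in host order. A stand-alone sketch of
 * producing and reading back one such pair:
 */
#if 0	/* stand-alone sketch of the %08X:%04X address:port encoding */
#include <stdio.h>

int main(void)
{
	unsigned int   addr = 0x0100007f;	/* 127.0.0.1 as it sits in memory on little-endian */
	unsigned short port = 80;
	char buf[16];
	unsigned int a;
	unsigned int p;

	snprintf(buf, sizeof buf, "%08X:%04X", addr, port);
	printf("%s\n", buf);			/* "0100007F:0050" */

	sscanf(buf, "%08X:%04X", &a, &p);
	printf("addr=%08X port=%u\n", a, p);
	return 0;
}
#endif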
2350 int tcp_get_info(char *buffer, char **start, off_t offset, int length)
2353 int len = 0, num = 0, i;
2354 off_t begin, pos = 0;
2355 char tmpbuf[TMPSZ+1];
2358 len += sprintf(buffer, "%-*s\n", TMPSZ-1,
2359 " sl local_address rem_address st tx_queue "
2360 "rx_queue tr tm->when retrnsmt uid timeout inode");
2364 /* First, walk listening socket table. */
2366 for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
2368 struct tcp_listen_opt *lopt;
2371 for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) {
2372 struct open_request *req;
2374 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2376 if (!TCP_INET_FAMILY(sk->family))
2380 if (pos >= offset) {
2381 get_tcp_sock(sk, tmpbuf, num);
2382 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2383 if (pos >= offset + length) {
2384 tcp_listen_unlock();
2390 uid = sock_i_uid(sk);
2391 read_lock_bh(&tp->syn_wait_lock);
2392 lopt = tp->listen_opt;
2393 if (lopt && lopt->qlen != 0) {
2394 for (k=0; k<TCP_SYNQ_HSIZE; k++) {
2395 for (req = lopt->syn_table[k]; req; req = req->dl_next, num++) {
2396 if (!TCP_INET_FAMILY(req->class->family))
2402 get_openreq(sk, req, tmpbuf, num, uid);
2403 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2404 if (pos >= offset + length) {
2405 read_unlock_bh(&tp->syn_wait_lock);
2406 tcp_listen_unlock();
2412 read_unlock_bh(&tp->syn_wait_lock);
2414 /* Completed requests are in the normal socket hash table */
2417 tcp_listen_unlock();
2421 /* Next, walk established hash chain. */
2422 for (i = 0; i < tcp_ehash_size; i++) {
2423 struct tcp_ehash_bucket *head = &tcp_ehash[i];
2425 struct tcp_tw_bucket *tw;
2427 read_lock(&head->lock);
2428 for(sk = head->chain; sk; sk = sk->next, num++) {
2429 if (!TCP_INET_FAMILY(sk->family))
2434 get_tcp_sock(sk, tmpbuf, num);
2435 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2436 if (pos >= offset + length) {
2437 read_unlock(&head->lock);
2441 for (tw = (struct tcp_tw_bucket *)tcp_ehash[i+tcp_ehash_size].chain;
2443 tw = (struct tcp_tw_bucket *)tw->next, num++) {
2444 if (!TCP_INET_FAMILY(tw->family))
2449 get_timewait_sock(tw, tmpbuf, num);
2450 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2451 if (pos >= offset + length) {
2452 read_unlock(&head->lock);
2456 read_unlock(&head->lock);
2463 begin = len - (pos - offset);
2464 *start = buffer + begin;
2474 struct proto tcp_prot = {
2477 connect: tcp_v4_connect,
2478 disconnect: tcp_disconnect,
2481 init: tcp_v4_init_sock,
2482 destroy: tcp_v4_destroy_sock,
2483 shutdown: tcp_shutdown,
2484 setsockopt: tcp_setsockopt,
2485 getsockopt: tcp_getsockopt,
2486 sendmsg: tcp_sendmsg,
2487 recvmsg: tcp_recvmsg,
2488 backlog_rcv: tcp_v4_do_rcv,
2491 get_port: tcp_v4_get_port,
2496 void tcp_v4_init(struct net_proto_family *ops)
2501 tcp_inode.i_mode = S_IFSOCK;
2502 tcp_inode.i_sock = 1;
2503 tcp_inode.i_uid = 0;
2504 tcp_inode.i_gid = 0;
2505 init_waitqueue_head(&tcp_inode.i_wait);
2506 init_waitqueue_head(&tcp_inode.u.socket_i.wait);
2508 tcp_socket->inode = &tcp_inode;
2509 tcp_socket->state = SS_UNCONNECTED;
2510 tcp_socket->type=SOCK_RAW;
2512 if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
2513 panic("Failed to create the TCP control socket.\n");
2514 tcp_socket->sk->allocation=GFP_ATOMIC;
2515 tcp_socket->sk->protinfo.af_inet.ttl = MAXTTL;
2517 /* Unhash it so that IP input processing does not even
2518 * see it, we do not wish this socket to see incoming
2521 tcp_socket->sk->prot->unhash(tcp_socket->sk);