/*
 * COPYRIGHT:   See COPYING in the top level directory
 * PROJECT:     ReactOS TCP/IP protocol driver
 * FILE:        transport/tcp/tcpcore.c
 * PURPOSE:     Transmission Control Protocol
 * PROGRAMMERS: Casper S. Hornstrup (chorns@users.sourceforge.net)
 * REVISIONS:
 *   CSH 15-01-2003 Imported from linux kernel 2.4.20
 */
/*
 * INET     An implementation of the TCP/IP protocol suite for the LINUX
 *          operating system. INET is implemented using the BSD Socket
 *          interface as the means of communication with the user level.
 *
 *          Implementation of the Transmission Control Protocol (TCP).
 *
 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
 *          Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *          Mark Evans, <evansmp@uhura.aston.ac.uk>
 *          Corey Minyard <wf-rch!minyard@relay.EU.net>
 *          Florian La Roche, <flla@stud.uni-sb.de>
 *          Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *          Linus Torvalds, <torvalds@cs.helsinki.fi>
 *          Alan Cox, <gw4pts@gw4pts.ampr.org>
 *          Matthew Dillon, <dillon@apollo.west.oic.com>
 *          Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *          Jorge Cwik, <jorge@laser.satlink.net>
 *
 * Fixes:
 *          Alan Cox        :   Numerous verify_area() calls
 *          Alan Cox        :   Set the ACK bit on a reset
 *          Alan Cox        :   Stopped it crashing if it closed while
 *                              sk->inuse=1 and was trying to connect
 *                              (tcp_err()).
 *          Alan Cox        :   All icmp error handling was broken
 *                              pointers passed where wrong and the
 *                              socket was looked up backwards. Nobody
 *                              tested any icmp error code obviously.
 *          Alan Cox        :   tcp_err() now handled properly. It
 *                              wakes people on errors. poll
 *                              behaves and the icmp error race
 *                              has gone by moving it into sock.c
 *          Alan Cox        :   tcp_send_reset() fixed to work for
 *                              everything not just packets for
 *                              unknown sockets.
 *          Alan Cox        :   tcp option processing.
 *          Alan Cox        :   Reset tweaked (still not 100%) [Had
 *                              syn rule wrong]
 *          Herp Rosmanith  :   More reset fixes
 *          Alan Cox        :   No longer acks invalid rst frames.
 *                              Acking any kind of RST is right out.
 *          Alan Cox        :   Sets an ignore me flag on an rst
 *                              receive otherwise odd bits of prattle
 *                              get read.
 *          Alan Cox        :   Fixed another acking RST frame bug.
 *                              Should stop LAN workplace lockups.
 *          Alan Cox        :   Some tidyups using the new skb list
 *                              facilities.
 *          Alan Cox        :   sk->keepopen now seems to work
 *          Alan Cox        :   Pulls options out correctly on accepts
 *          Alan Cox        :   Fixed assorted sk->rqueue->next errors
 *          Alan Cox        :   PSH doesn't end a TCP read. Switched a
 *                              bit to skb ops.
 *          Alan Cox        :   Tidied tcp_data to avoid a potential
 *                              nasty.
 *          Alan Cox        :   Added some better commenting, as the
 *                              tcp is hard to follow
 *          Alan Cox        :   Removed incorrect check for 20 * psh
 *          Michael O'Reilly:   ack < copied bug fix.
 *          Johannes Stille :   Misc tcp fixes (not all in yet).
 *          Alan Cox        :   FIN with no memory -> CRASH
 *          Alan Cox        :   Added socket option proto entries.
 *                              Also added awareness of them to accept.
 *          Alan Cox        :   Added TCP options (SOL_TCP)
 *          Alan Cox        :   Switched wakeup calls to callbacks,
 *                              so the kernel can layer network
 *                              sockets.
 *          Alan Cox        :   Use ip_tos/ip_ttl settings.
 *          Alan Cox        :   Handle FIN (more) properly (we hope).
 *          Alan Cox        :   RST frames sent on unsynchronised
 *                              state ack error.
 *          Alan Cox        :   Put in missing check for SYN bit.
 *          Alan Cox        :   Added tcp_select_window() aka NET2E
 *                              window non shrink trick.
 *          Alan Cox        :   Added a couple of small NET2E timer
 *                              fixes.
 *          Charles Hedrick :   TCP fixes
 *          Toomas Tamm     :   TCP window fixes
 *          Alan Cox        :   Small URG fix to rlogin ^C ack fight
 *          Charles Hedrick :   Rewrote most of it to actually work
 *          Linus           :   Rewrote tcp_read() and URG handling
 *          Gerhard Koerting:   Fixed some missing timer handling
 *          Matthew Dillon  :   Reworked TCP machine states as per RFC
 *          Gerhard Koerting:   PC/TCP workarounds
 *          Adam Caldwell   :   Assorted timer/timing errors
 *          Matthew Dillon  :   Fixed another RST bug
 *          Alan Cox        :   Move to kernel side addressing changes.
 *          Alan Cox        :   Beginning work on TCP fastpathing
 *          Arnt Gulbrandsen:   Turbocharged tcp_check() routine.
 *          Alan Cox        :   TCP fast path debugging
 *          Alan Cox        :   Window clamping
 *          Michael Riepe   :   Bug in tcp_check()
 *          Matt Dillon     :   More TCP improvements and RST bug fixes
 *          Matt Dillon     :   Yet more small nasties removed from the
 *                              TCP code (Be very nice to this man if
 *                              tcp finally works 100%) 8)
 *          Alan Cox        :   BSD accept semantics.
 *          Alan Cox        :   Reset on closedown bug.
 *          Peter De Schrijver :    ENOTCONN check missing in tcp_sendto().
 *          Michael Pall    :   Handle poll() after URG properly in
 *                              all cases.
 *          Michael Pall    :   Undo the last fix in tcp_read_urg()
 *                              (multi URG PUSH broke rlogin).
 *          Michael Pall    :   Fix the multi URG PUSH problem in
 *                              tcp_readable(), poll() after URG
 *          Michael Pall    :   recv(...,MSG_OOB) never blocks in the
 *                              BSD api.
 *          Alan Cox        :   Changed the semantics of sk->socket to
 *                              fix a race and a signal problem with
 *                              accept() and async I/O.
 *          Alan Cox        :   Relaxed the rules on tcp_sendto().
 *          Yury Shevchuk   :   Really fixed accept() blocking problem.
 *          Craig I. Hagan  :   Allow for BSD compatible TIME_WAIT for
 *                              clients/servers which listen in on
 *                              fixed ports.
 *          Alan Cox        :   Cleaned the above up and shrank it to
 *                              a sensible code size.
 *          Alan Cox        :   Self connect lockup fix.
 *          Alan Cox        :   No connect to multicast.
 *          Ross Biro       :   Close unaccepted children on master
 *                              socket close.
 *          Alan Cox        :   Reset tracing code.
 *          Alan Cox        :   Spurious resets on shutdown.
 *          Alan Cox        :   Giant 15 minute/60 second timer error
 *          Alan Cox        :   Small whoops in polling before an
 *                              accept.
 *          Alan Cox        :   Kept the state trace facility since
 *                              it's handy for debugging.
 *          Alan Cox        :   More reset handler fixes.
 *          Alan Cox        :   Started rewriting the code based on
 *                              the RFC's for other useful protocol
 *                              references see: Comer, KA9Q NOS, and
 *                              for a reference on the difference
 *                              between specifications and how BSD
 *                              works see the 4.4lite source.
 *          A.N.Kuznetsov   :   Don't time wait on completion of tidy
 *                              close.
 *          Linus Torvalds  :   Fin/Shutdown & copied_seq changes.
 *          Linus Torvalds  :   Fixed BSD port reuse to work first syn
 *          Alan Cox        :   Reimplemented timers as per the RFC
 *                              and using multiple timers for sanity.
 *          Alan Cox        :   Small bug fixes, and a lot of new
 *                              comments.
 *          Alan Cox        :   Fixed dual reader crash by locking
 *                              the buffers (much like datagram.c)
 *          Alan Cox        :   Fixed stuck sockets in probe. A probe
 *                              now gets fed up of retrying without
 *                              (even a no space) answer.
 *          Alan Cox        :   Extracted closing code better
 *          Alan Cox        :   Fixed the closing state machine to
 *                              resemble the RFC.
 *          Alan Cox        :   More 'per spec' fixes.
 *          Jorge Cwik      :   Even faster checksumming.
 *          Alan Cox        :   tcp_data() doesn't ack illegal PSH
 *                              only frames. At least one pc tcp stack
 *                              generates them.
 *          Alan Cox        :   Cache last socket.
 *          Alan Cox        :   Per route irtt.
 *          Matt Day        :   poll()->select() match BSD precisely on error
 *          Alan Cox        :   New buffers
 *          Marc Tamsky     :   Various sk->prot->retransmits and
 *                              sk->retransmits misupdating fixed.
 *                              Fixed tcp_write_timeout: stuck close,
 *                              and TCP syn retries gets used now.
 *          Mark Yarvis     :   In tcp_read_wakeup(), don't send an
 *                              ack if state is TCP_CLOSED.
 *          Alan Cox        :   Look up device on a retransmit - routes may
 *                              change. Doesn't yet cope with MSS shrink right
 *                              but it's a start!
 *          Marc Tamsky     :   Closing in closing fixes.
 *          Mike Shaver     :   RFC1122 verifications.
 *          Alan Cox        :   rcv_saddr errors.
 *          Alan Cox        :   Block double connect().
 *          Alan Cox        :   Small hooks for enSKIP.
 *          Alexey Kuznetsov:   Path MTU discovery.
 *          Alan Cox        :   Support soft errors.
 *          Alan Cox        :   Fix MTU discovery pathological case
 *                              when the remote claims no mtu!
 *          Marc Tamsky     :   TCP_CLOSE fix.
 *          Colin (G3TNE)   :   Send a reset on syn ack replies in
 *                              window but wrong (fixes NT lpd problems)
 *          Pedro Roque     :   Better TCP window handling, delayed ack.
 *          Joerg Reuter    :   No modification of locked buffers in
 *                              tcp_do_retransmit()
 *          Eric Schenk     :   Changed receiver side silly window
 *                              avoidance algorithm to BSD style
 *                              algorithm. This doubles throughput
 *                              against machines running Solaris,
 *                              and seems to result in general
 *                              improvement.
 *          Stefan Magdalinski :    adjusted tcp_readable() to fix FIONREAD
 *          Willy Konynenberg :     Transparent proxying support.
 *          Mike McLagan    :   Routing by source
 *          Keith Owens     :   Do proper merging with partial SKB's in
 *                              tcp_do_sendmsg to avoid burstiness.
 *          Eric Schenk     :   Fix fast close down bug with
 *                              shutdown() followed by close().
 *          Andi Kleen      :   Make poll agree with SIGIO
 *          Salvatore Sanfilippo :  Support SO_LINGER with linger == 1 and
 *                              lingertime == 0 (RFC 793 ABORT Call)
 *
 *          This program is free software; you can redistribute it and/or
 *          modify it under the terms of the GNU General Public License
 *          as published by the Free Software Foundation; either version
 *          2 of the License, or (at your option) any later version.
 *
 * Description of States:
 *
 *  TCP_SYN_SENT        sent a connection request, waiting for ack
 *
 *  TCP_SYN_RECV        received a connection request, sent ack,
 *                      waiting for final ack in three-way handshake.
 *
 *  TCP_ESTABLISHED     connection established
 *
 *  TCP_FIN_WAIT1       our side has shutdown, waiting to complete
 *                      transmission of remaining buffered data
 *
 *  TCP_FIN_WAIT2       all buffered data sent, waiting for remote
 *                      to shutdown
 *
 *  TCP_CLOSING         both sides have shutdown but we still have
 *                      data we have to finish sending
 *
 *  TCP_TIME_WAIT       timeout to catch resent junk before entering
 *                      closed, can only be entered from FIN_WAIT2
 *                      or CLOSING. Required because the other end
 *                      may not have gotten our last ACK causing it
 *                      to retransmit the data packet (which we ignore)
 *
 *  TCP_CLOSE_WAIT      remote side has shutdown and is waiting for
 *                      us to finish writing our data and to shutdown
 *                      (we have to close() to move on to LAST_ACK)
 *
 *  TCP_LAST_ACK        our side has shutdown after remote has
 *                      shutdown. There may still be data in our
 *                      buffer that we have to finish sending
 *
 *  TCP_CLOSE           socket is finished
 */
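/* Illustrative walk-through of the states above (informative only, not
 * from the original source): the end that calls close() first moves
 * ESTABLISHED -> FIN_WAIT1 -> FIN_WAIT2 -> TIME_WAIT -> CLOSE, while
 * the passive closer moves ESTABLISHED -> CLOSE_WAIT -> LAST_ACK -> CLOSE.
 */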
#include <linux/config.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/smp_lock.h>
#include <linux/fs.h>

#include <net/icmp.h>
#include <net/tcp.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>
int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;

#ifdef ROS_STATISTICS
struct tcp_mib tcp_statistics[NR_CPUS*2];
#endif

kmem_cache_t *tcp_openreq_cachep;
kmem_cache_t *tcp_bucket_cachep;
kmem_cache_t *tcp_timewait_cachep;

atomic_t tcp_orphan_count = ATOMIC_INIT(0);
int sysctl_tcp_mem[3];
int sysctl_tcp_wmem[3] = { 4*1024, 16*1024, 128*1024 };
int sysctl_tcp_rmem[3] = { 4*1024, 87380, 87380*2 };

atomic_t tcp_memory_allocated;      /* Current allocated memory. */
atomic_t tcp_sockets_allocated;     /* Current number of TCP sockets. */
/* Pressure flag: try to collapse.
 * Technical note: it is used by multiple contexts non atomically.
 * All the tcp_mem_schedule() calls are of this nature: accounting
 * is strict, actions are advisory and have some latency.
 */
int tcp_memory_pressure;

#define TCP_PAGES(amt) (((amt)+TCP_MEM_QUANTUM-1)/TCP_MEM_QUANTUM)
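
/* Worked example of the rounding above (informative; assumes
 * TCP_MEM_QUANTUM is the 4096-byte page size used on IA-32):
 * TCP_PAGES(1) == 1, TCP_PAGES(4096) == 1, TCP_PAGES(4097) == 2,
 * i.e. every charge is accounted in whole quanta, rounded up.
 */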
int tcp_mem_schedule(struct sock *sk, int size, int kind)
{
    int amt = TCP_PAGES(size);

    sk->forward_alloc += amt*TCP_MEM_QUANTUM;
    atomic_add(amt, &tcp_memory_allocated);

    /* Under limit. */
    if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
        if (tcp_memory_pressure)
            tcp_memory_pressure = 0;
        return 1;
    }

    /* Over hard limit. */
    if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
        tcp_enter_memory_pressure();
        goto suppress_allocation;
    }

    /* Under pressure. */
    if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
        tcp_enter_memory_pressure();

    if (kind) {
        if (atomic_read(&sk->rmem_alloc) < sysctl_tcp_rmem[0])
            return 1;
    } else {
        if (sk->wmem_queued < sysctl_tcp_wmem[0])
            return 1;
    }

    if (!tcp_memory_pressure ||
        sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated)
        * TCP_PAGES(sk->wmem_queued+atomic_read(&sk->rmem_alloc)+
                    sk->forward_alloc))
        return 1;

suppress_allocation:

    if (!kind) {
        tcp_moderate_sndbuf(sk);

        /* Fail only if socket is _under_ its sndbuf.
         * In this case we cannot block, so that we have to fail.
         */
        if (sk->wmem_queued+size >= sk->sndbuf)
            return 1;
    }

    /* Alas. Undo changes. */
    sk->forward_alloc -= amt*TCP_MEM_QUANTUM;
    atomic_sub(amt, &tcp_memory_allocated);
    return 0;
}
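
/* A minimal caller sketch (illustrative only, not part of the original
 * file; example_charge_rcv_skb is a hypothetical helper): "kind" selects
 * which soft limit tcp_mem_schedule() checks -- 0 for send, 1 for receive.
 * A zero return means the charge was refused and already undone.
 */
#if 0
static int example_charge_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
    if (!tcp_mem_schedule(sk, skb->truesize, 1))
        return -ENOMEM;     /* over the limits; charge undone */
    atomic_add(skb->truesize, &sk->rmem_alloc);
    return 0;
}
#endif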
void __tcp_mem_reclaim(struct sock *sk)
{
    if (sk->forward_alloc >= TCP_MEM_QUANTUM) {
        atomic_sub(sk->forward_alloc/TCP_MEM_QUANTUM, &tcp_memory_allocated);
        sk->forward_alloc &= (TCP_MEM_QUANTUM-1);
        if (tcp_memory_pressure &&
            atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
            tcp_memory_pressure = 0;
    }
}
void tcp_rfree(struct sk_buff *skb)
{
    struct sock *sk = skb->sk;

    atomic_sub(skb->truesize, &sk->rmem_alloc);
    sk->forward_alloc += skb->truesize;
}
/*
 * LISTEN is a special case for poll..
 */
static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
{
    return sk->tp_pinfo.af_tcp.accept_queue ? (POLLIN | POLLRDNORM) : 0;
}
/*
 * Wait for a TCP event.
 *
 * Note that we don't need to lock the socket, as the upper poll layers
 * take care of normal races (between the test and the event) and we don't
 * go look at any of the socket buffers directly.
 */
unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
{
    unsigned int mask;
    struct sock *sk = sock->sk;
    struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

    poll_wait(file, sk->sleep, wait);
    if (sk->state == TCP_LISTEN)
        return tcp_listen_poll(sk, wait);

    /* Socket is not locked. We are protected from async events
       by poll logic, and correct handling of state changes
       made by other threads is impossible in any case.
     */

    mask = 0;
    if (sk->err)
        mask = POLLERR;

    /*
     * POLLHUP is certainly not done right. But poll() doesn't
     * have a notion of HUP in just one direction, and for a
     * socket the read side is more interesting.
     *
     * Some poll() documentation says that POLLHUP is incompatible
     * with the POLLOUT/POLLWR flags, so somebody should check this
     * all. But careful, it tends to be safer to return too many
     * bits than too few, and you can easily break real applications
     * if you don't tell them that something has hung up!
     *
     * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
     * our fs/select.c). It means that after we received EOF,
     * poll always returns immediately, making poll() on write()
     * in state CLOSE_WAIT impossible. One solution is evident --- to set
     * POLLHUP if and only if shutdown has been made in both directions.
     * Actually, it is interesting to look at how Solaris and DUX
     * solve this dilemma. I would prefer it if POLLHUP were maskable,
     * then we could set it on SND_SHUTDOWN. BTW the examples given
     * in Stevens' books assume exactly this behaviour, which explains
     * why POLLHUP is incompatible with POLLOUT.    --ANK
     *
     * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
     * blocking on a fresh not-connected or disconnected socket. --ANK
     */
    if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE)
        mask |= POLLHUP;
    if (sk->shutdown & RCV_SHUTDOWN)
        mask |= POLLIN | POLLRDNORM;

    /* Connected? */
    if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
        /* Potential race condition. If read of tp below will
         * escape above sk->state, we can be illegally awakened
         * in SYN_* states. */
        if ((tp->rcv_nxt != tp->copied_seq) &&
            (tp->urg_seq != tp->copied_seq ||
             tp->rcv_nxt != tp->copied_seq+1 ||
             sk->urginline || !tp->urg_data))
            mask |= POLLIN | POLLRDNORM;

        if (!(sk->shutdown & SEND_SHUTDOWN)) {
            if (tcp_wspace(sk) >= tcp_min_write_space(sk)) {
                mask |= POLLOUT | POLLWRNORM;
            } else {  /* send SIGIO later */
                set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
                set_bit(SOCK_NOSPACE, &sk->socket->flags);

                /* Race breaker. If space is freed after
                 * wspace test but before the flags are set,
                 * IO signal will be lost.
                 */
                if (tcp_wspace(sk) >= tcp_min_write_space(sk))
                    mask |= POLLOUT | POLLWRNORM;
            }
        }

        if (tp->urg_data & TCP_URG_VALID)
            mask |= POLLPRI;
    }
    return mask;
}
/*
 * TCP socket write_space callback.
 */
void tcp_write_space(struct sock *sk)
{
    struct socket *sock = sk->socket;

    if (tcp_wspace(sk) >= tcp_min_write_space(sk) && sock) {
        clear_bit(SOCK_NOSPACE, &sock->flags);

        if (sk->sleep && waitqueue_active(sk->sleep))
            wake_up_interruptible(sk->sleep);

        if (sock->fasync_list && !(sk->shutdown&SEND_SHUTDOWN))
            sock_wake_async(sock, 2, POLL_OUT);
    }
}
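
/* Usage sketch (illustrative only; the assignment below is an assumption
 * about where the callback gets wired, not a quote of this file): the
 * generic socket layer invokes sk->write_space whenever queued output
 * is freed, which is what wakes poll()ers waiting for POLLOUT above.
 */
#if 0
    sk->write_space = tcp_write_space;  /* hypothetical init-time wiring */
    sk->write_space(sk);                /* called after wmem is released */
#endif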
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
    struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
    int answ;

    switch(cmd) {
    case TIOCINQ:
        if (sk->state == TCP_LISTEN)
            return(-EINVAL);

        lock_sock(sk);
        if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
            answ = 0;
        else if (sk->urginline || !tp->urg_data ||
                 before(tp->urg_seq,tp->copied_seq) ||
                 !before(tp->urg_seq,tp->rcv_nxt)) {
            answ = tp->rcv_nxt - tp->copied_seq;

            /* Subtract 1, if FIN is in queue. */
            if (answ && !skb_queue_empty(&sk->receive_queue))
                answ -= ((struct sk_buff*)sk->receive_queue.prev)->h.th->fin;
        } else
            answ = tp->urg_seq - tp->copied_seq;
        release_sock(sk);
        break;
    case SIOCATMARK:
        answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
        break;
    case TIOCOUTQ:
        if (sk->state == TCP_LISTEN)
            return(-EINVAL);

        if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
            answ = 0;
        else
            answ = tp->write_seq - tp->snd_una;
        break;
    default:
        return(-ENOIOCTLCMD);
    };

    return put_user(answ, (int *)arg);
}
int tcp_listen_start(struct sock *sk)
{
    struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
    struct tcp_listen_opt *lopt;

    sk->max_ack_backlog = 0;
    sk->ack_backlog = 0;
    tp->accept_queue = tp->accept_queue_tail = NULL;
    tp->syn_wait_lock = RW_LOCK_UNLOCKED;
    tcp_delack_init(tp);

    lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
    if (!lopt)
        return -ENOMEM;

    memset(lopt, 0, sizeof(struct tcp_listen_opt));
    for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
        if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog)
            break;
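
    /* Example (informative): with sysctl_max_syn_backlog == 1024 (the
     * large-memory default set in tcp_init() below), the loop above
     * stops at max_qlen_log == 10, since 1 << 10 == 1024; the starting
     * value of 6 enforces a floor of 64 SYN-queue slots.
     */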
    write_lock_bh(&tp->syn_wait_lock);
    tp->listen_opt = lopt;
    write_unlock_bh(&tp->syn_wait_lock);

    /* There is a race window here: we announce ourselves listening,
     * but this transition is still not validated by get_port().
     * It is OK, because this socket enters the hash table only
     * after validation is complete.
     */
    sk->state = TCP_LISTEN;
    if (sk->prot->get_port(sk, sk->num) == 0) {
        sk->sport = htons(sk->num);

        sk_dst_reset(sk);
        sk->prot->hash(sk);

        return 0;
    }

    sk->state = TCP_CLOSE;
    write_lock_bh(&tp->syn_wait_lock);
    tp->listen_opt = NULL;
    write_unlock_bh(&tp->syn_wait_lock);
    kfree(lopt);
    return -EADDRINUSE;
}
/*
 * This routine closes sockets which have been at least partially
 * opened, but not yet accepted.
 */
static void tcp_listen_stop (struct sock *sk)
{
    struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
    struct tcp_listen_opt *lopt = tp->listen_opt;
    struct open_request *acc_req = tp->accept_queue;
    struct open_request *req;
    int i;

    tcp_delete_keepalive_timer(sk);

    /* make all the listen_opt local to us */
    write_lock_bh(&tp->syn_wait_lock);
    tp->listen_opt = NULL;
    write_unlock_bh(&tp->syn_wait_lock);
    tp->accept_queue = tp->accept_queue_tail = NULL;

    for (i=0; i<TCP_SYNQ_HSIZE; i++) {
        while ((req = lopt->syn_table[i]) != NULL) {
            lopt->syn_table[i] = req->dl_next;
            lopt->qlen--;
            tcp_openreq_free(req);

            /* Following specs, it would be better either to send FIN
             * (and enter FIN-WAIT-1, it is normal close)
             * or to send active reset (abort).
             * Certainly, it is pretty dangerous during a synflood, but
             * that is a bad justification for our negligence 8)
             * To be honest, we are not able to make either
             * of the variants now.         --ANK
             */
        }
    }
    BUG_TRAP(lopt->qlen == 0);

    kfree(lopt);

    while ((req=acc_req) != NULL) {
        struct sock *child = req->sk;

        acc_req = req->dl_next;

        local_bh_disable();
        bh_lock_sock(child);
        BUG_TRAP(child->lock.users==0);
        sock_hold(child);

        tcp_disconnect(child, O_NONBLOCK);

        sock_orphan(child);

        atomic_inc(&tcp_orphan_count);

        tcp_destroy_sock(child);

        bh_unlock_sock(child);
        local_bh_enable();
        sock_put(child);

        tcp_acceptq_removed(sk);
        tcp_openreq_fastfree(req);
    }
    BUG_TRAP(sk->ack_backlog == 0);
}
/*
 * Wait for a socket to get into the connected state
 *
 * Note: Must be called with the socket locked.
 */
static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p)
{
    struct task_struct *tsk = current;
    DECLARE_WAITQUEUE(wait, tsk);

    while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
        if(sk->err)
            return sock_error(sk);
        if((1 << sk->state) &
           ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
            return -EPIPE;
        if(!*timeo_p)
            return -EAGAIN;
        if(signal_pending(tsk))
            return sock_intr_errno(*timeo_p);

        __set_task_state(tsk, TASK_INTERRUPTIBLE);
        add_wait_queue(sk->sleep, &wait);
        sk->tp_pinfo.af_tcp.write_pending++;

        release_sock(sk);
        *timeo_p = schedule_timeout(*timeo_p);
        lock_sock(sk);

        __set_task_state(tsk, TASK_RUNNING);
        remove_wait_queue(sk->sleep, &wait);
        sk->tp_pinfo.af_tcp.write_pending--;
    }
    return 0;
}
static inline int tcp_memory_free(struct sock *sk)
{
    return sk->wmem_queued < sk->sndbuf;
}
/*
 * Wait for more memory for a socket
 */
static int wait_for_tcp_memory(struct sock * sk, long *timeo)
{
    int err = 0;
    long vm_wait = 0;
    long current_timeo = *timeo;
    DECLARE_WAITQUEUE(wait, current);

    if (tcp_memory_free(sk))
        current_timeo = vm_wait = (net_random()%(HZ/5))+2;

    add_wait_queue(sk->sleep, &wait);
    for (;;) {
        set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
        set_current_state(TASK_INTERRUPTIBLE);

        if (sk->err || (sk->shutdown & SEND_SHUTDOWN))
            goto do_error;
        if (!*timeo)
            goto do_nonblock;
        if (signal_pending(current))
            goto do_interrupted;
        clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
        if (tcp_memory_free(sk) && !vm_wait)
            break;

        set_bit(SOCK_NOSPACE, &sk->socket->flags);
        sk->tp_pinfo.af_tcp.write_pending++;
        release_sock(sk);
        if (!tcp_memory_free(sk) || vm_wait)
            current_timeo = schedule_timeout(current_timeo);
        lock_sock(sk);
        sk->tp_pinfo.af_tcp.write_pending--;

        if (vm_wait) {
            vm_wait -= current_timeo;
            current_timeo = *timeo;
            if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
                (current_timeo -= vm_wait) < 0)
                current_timeo = 0;
            vm_wait = 0;
        }
        *timeo = current_timeo;
    }
out:
    current->state = TASK_RUNNING;
    remove_wait_queue(sk->sleep, &wait);
    return err;

do_error:
    err = -EPIPE;
    goto out;
do_nonblock:
    err = -EAGAIN;
    goto out;
do_interrupted:
    err = sock_intr_errno(*timeo);
    goto out;
}
ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
static inline int
can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
{
    if (i) {
        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
        return page == frag->page &&
               off == frag->page_offset+frag->size;
    }
    return 0;
}
static inline void
fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size)
{
    skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

    frag->page = page;
    frag->page_offset = off;
    frag->size = size;
    skb_shinfo(skb)->nr_frags = i+1;
}
static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
{
    TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
    tp->pushed_seq = tp->write_seq;
}
static inline int forced_push(struct tcp_opt *tp)
{
    return after(tp->write_seq, tp->pushed_seq + (tp->max_window>>1));
}
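
/* Example (informative): if the peer's largest advertised window
 * (tp->max_window) is 64KB, forced_push() fires once more than 32KB
 * has been queued beyond the last pushed sequence, guaranteeing a PSH
 * at least every half window.
 */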
static inline void
skb_entail(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
{
    skb->csum = 0;
    TCP_SKB_CB(skb)->seq = tp->write_seq;
    TCP_SKB_CB(skb)->end_seq = tp->write_seq;
    TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
    TCP_SKB_CB(skb)->sacked = 0;
    __skb_queue_tail(&sk->write_queue, skb);
    tcp_charge_skb(sk, skb);
    if (tp->send_head == NULL)
        tp->send_head = skb;
}
static inline void
tcp_mark_urg(struct tcp_opt *tp, int flags, struct sk_buff *skb)
{
    if (flags & MSG_OOB) {
        tp->urg_mode = 1;
        tp->snd_up = tp->write_seq;
        TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
    }
}
static inline void
tcp_push(struct sock *sk, struct tcp_opt *tp, int flags, int mss_now, int nonagle)
{
    if (tp->send_head) {
        struct sk_buff *skb = sk->write_queue.prev;
        if (!(flags&MSG_MORE) || forced_push(tp))
            tcp_mark_push(tp, skb);
        tcp_mark_urg(tp, flags, skb);
        __tcp_push_pending_frames(sk, tp, mss_now, (flags&MSG_MORE) ? 2 : nonagle);
    }
}
static int tcp_error(struct sock *sk, int flags, int err)
{
    if (err == -EPIPE)
        err = sock_error(sk) ? : -EPIPE;
    if (err == -EPIPE && !(flags&MSG_NOSIGNAL))
        send_sig(SIGPIPE, current, 0);
    return err;
}
ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
{
    struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
    int mss_now;
    int err;
    ssize_t copied;
    long timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);

    /* Wait for a connection to finish. */
    if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
        if((err = wait_for_tcp_connect(sk, 0, &timeo)) != 0)
            goto out_err;

    clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);

    mss_now = tcp_current_mss(sk);
    copied = 0;

    err = -EPIPE;
    if (sk->err || (sk->shutdown & SEND_SHUTDOWN))
        goto do_error;

    while (psize > 0) {
        struct sk_buff *skb = sk->write_queue.prev;
        int offset, size, copy, i;
        struct page *page;

        page = pages[poffset/PAGE_SIZE];
        offset = poffset % PAGE_SIZE;
        size = min_t(size_t, psize, PAGE_SIZE-offset);

        if (tp->send_head==NULL || (copy = mss_now - skb->len) <= 0) {
new_segment:
            if (!tcp_memory_free(sk))
                goto wait_for_sndbuf;

            skb = tcp_alloc_pskb(sk, 0, tp->mss_cache, sk->allocation);
            if (skb == NULL)
                goto wait_for_memory;

            skb_entail(sk, tp, skb);
            copy = mss_now;
        }

        if (copy > size)
            copy = size;

        i = skb_shinfo(skb)->nr_frags;
        if (can_coalesce(skb, i, page, offset)) {
            skb_shinfo(skb)->frags[i-1].size += copy;
        } else if (i < MAX_SKB_FRAGS) {
            get_page(page);
            fill_page_desc(skb, i, page, offset, copy);
        } else {
            tcp_mark_push(tp, skb);
            goto new_segment;
        }

        skb->len += copy;
        skb->data_len += copy;
        skb->ip_summed = CHECKSUM_HW;
        tp->write_seq += copy;
        TCP_SKB_CB(skb)->end_seq += copy;

        if (!copied)
            TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;

        copied += copy;
        poffset += copy;
        if (!(psize -= copy))
            goto out;

        if (skb->len != mss_now || (flags&MSG_OOB))
            continue;

        if (forced_push(tp)) {
            tcp_mark_push(tp, skb);
            __tcp_push_pending_frames(sk, tp, mss_now, 1);
        } else if (skb == tp->send_head)
            tcp_push_one(sk, mss_now);
        continue;

wait_for_sndbuf:
        set_bit(SOCK_NOSPACE, &sk->socket->flags);
wait_for_memory:
        if (copied)
            tcp_push(sk, tp, flags&~MSG_MORE, mss_now, 1);

        if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
            goto do_error;

        mss_now = tcp_current_mss(sk);
    }

out:
    if (copied)
        tcp_push(sk, tp, flags, mss_now, tp->nonagle);
    return copied;

do_error:
    if (copied)
        goto out;
out_err:
    return tcp_error(sk, flags, err);
}
ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
    ssize_t res;
    struct sock *sk = sock->sk;

#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)

    if (!(sk->route_caps & NETIF_F_SG) ||
        !(sk->route_caps & TCP_ZC_CSUM_FLAGS))
        return sock_no_sendpage(sock, page, offset, size, flags);

#undef TCP_ZC_CSUM_FLAGS

    lock_sock(sk);
    TCP_CHECK_TIMER(sk);
    res = do_tcp_sendpages(sk, &page, offset, size, flags);
    TCP_CHECK_TIMER(sk);
    release_sock(sk);
    return res;
}
#define TCP_PAGE(sk)    (sk->tp_pinfo.af_tcp.sndmsg_page)
#define TCP_OFF(sk)     (sk->tp_pinfo.af_tcp.sndmsg_off)
static inline int
tcp_copy_to_page(struct sock *sk, char *from, struct sk_buff *skb,
                 struct page *page, int off, int copy)
{
    int err = 0;
    unsigned int csum;

    csum = csum_and_copy_from_user(from, page_address(page)+off,
                                   copy, 0, &err);
    if (!err) {
        if (skb->ip_summed == CHECKSUM_NONE)
            skb->csum = csum_block_add(skb->csum, csum, skb->len);
        skb->len += copy;
        skb->data_len += copy;
        skb->truesize += copy;
        sk->wmem_queued += copy;
        sk->forward_alloc -= copy;
        return 0;
    }
    return -EFAULT;
}
static inline int
skb_add_data(struct sk_buff *skb, char *from, int copy)
{
    int err = 0;
    int off = skb->len;
    unsigned int csum = csum_and_copy_from_user(from, skb_put(skb, copy),
                                                copy, 0, &err);
    if (!err) {
        skb->csum = csum_block_add(skb->csum, csum, off);
        return 0;
    }

    __skb_trim(skb, off);
    return -EFAULT;
}
static inline int select_size(struct sock *sk, struct tcp_opt *tp)
{
    int tmp = tp->mss_cache;

    if (sk->route_caps&NETIF_F_SG) {
        int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);

        if (tmp >= pgbreak && tmp <= pgbreak + (MAX_SKB_FRAGS-1)*PAGE_SIZE)
            tmp = pgbreak;
    }
    return tmp;
}
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size)
{
    struct iovec *iov;
    struct tcp_opt *tp;
    struct sk_buff *skb;
    int iovlen, flags;
    int mss_now;
    int err, copied;
    long timeo;

    tp = &(sk->tp_pinfo.af_tcp);

    lock_sock(sk);
    TCP_CHECK_TIMER(sk);

    flags = msg->msg_flags;
    timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);

    /* Wait for a connection to finish. */
    if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
        if((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
            goto out_err;

    /* This should be in poll */
    clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);

    mss_now = tcp_current_mss(sk);

    /* Ok commence sending. */
    iovlen = msg->msg_iovlen;
    iov = msg->msg_iov;
    copied = 0;

    err = -EPIPE;
    if (sk->err || (sk->shutdown&SEND_SHUTDOWN))
        goto do_error;

    while (--iovlen >= 0) {
        int seglen=iov->iov_len;
        unsigned char * from=iov->iov_base;

        iov++;

        while (seglen > 0) {
            int copy;

            skb = sk->write_queue.prev;

            if (tp->send_head == NULL ||
                (copy = mss_now - skb->len) <= 0) {

new_segment:
                /* Allocate new segment. If the interface is SG,
                 * allocate skb fitting to single page.
                 */
                if (!tcp_memory_free(sk))
                    goto wait_for_sndbuf;

                skb = tcp_alloc_pskb(sk, select_size(sk, tp), 0, sk->allocation);
                if (skb == NULL)
                    goto wait_for_memory;

                skb_entail(sk, tp, skb);
                copy = mss_now;
            }

            /* Try to append data to the end of skb. */
            if (copy > seglen)
                copy = seglen;

            /* Where to copy to? */
            if (skb_tailroom(skb) > 0) {
                /* We have some space in skb head. Superb! */
                if (copy > skb_tailroom(skb))
                    copy = skb_tailroom(skb);
                if ((err = skb_add_data(skb, from, copy)) != 0)
                    goto do_fault;
            } else {
                int merge = 0;
                int i = skb_shinfo(skb)->nr_frags;
                struct page *page = TCP_PAGE(sk);
                int off = TCP_OFF(sk);

                if (can_coalesce(skb, i, page, off) && off != PAGE_SIZE) {
                    /* We can extend the last page fragment. */
                    merge = 1;
                } else if (i == MAX_SKB_FRAGS ||
                           (i == 0 && !(sk->route_caps&NETIF_F_SG))) {
                    /* Need to add new fragment and cannot
                     * do this because interface is non-SG,
                     * or because all the page slots are busy.
                     */
                    tcp_mark_push(tp, skb);
                    goto new_segment;
                } else if (page) {
                    /* If page is cached, align
                     * offset to L1 cache boundary
                     */
                    off = (off+L1_CACHE_BYTES-1)&~(L1_CACHE_BYTES-1);
                    if (off == PAGE_SIZE) {
                        put_page(page);
                        TCP_PAGE(sk) = page = NULL;
                    }
                }

                if (!page) {
                    /* Allocate new cache page. */
                    if (!(page=tcp_alloc_page(sk)))
                        goto wait_for_memory;
                    off = 0;
                }

                if (copy > PAGE_SIZE-off)
                    copy = PAGE_SIZE-off;

                /* Time to copy data. We are close to the end! */
                err = tcp_copy_to_page(sk, from, skb, page, off, copy);
                if (err) {
                    /* If this page was new, give it to the
                     * socket so it does not get leaked.
                     */
                    if (TCP_PAGE(sk) == NULL) {
                        TCP_PAGE(sk) = page;
                        TCP_OFF(sk) = 0;
                    }
                    goto do_error;
                }

                /* Update the skb. */
                if (merge) {
                    skb_shinfo(skb)->frags[i-1].size += copy;
                } else {
                    fill_page_desc(skb, i, page, off, copy);
                    if (TCP_PAGE(sk)) {
                        get_page(page);
                    } else if (off + copy < PAGE_SIZE) {
                        get_page(page);
                        TCP_PAGE(sk) = page;
                    }
                }

                TCP_OFF(sk) = off+copy;
            }

            if (!copied)
                TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;

            tp->write_seq += copy;
            TCP_SKB_CB(skb)->end_seq += copy;

            from += copy;
            copied += copy;
            if ((seglen -= copy) == 0 && iovlen == 0)
                goto out;

            if (skb->len != mss_now || (flags&MSG_OOB))
                continue;

            if (forced_push(tp)) {
                tcp_mark_push(tp, skb);
                __tcp_push_pending_frames(sk, tp, mss_now, 1);
            } else if (skb == tp->send_head)
                tcp_push_one(sk, mss_now);
            continue;

wait_for_sndbuf:
            set_bit(SOCK_NOSPACE, &sk->socket->flags);
wait_for_memory:
            if (copied)
                tcp_push(sk, tp, flags&~MSG_MORE, mss_now, 1);

            if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
                goto do_error;

            mss_now = tcp_current_mss(sk);
        }
    }

out:
    if (copied)
        tcp_push(sk, tp, flags, mss_now, tp->nonagle);
    TCP_CHECK_TIMER(sk);
    release_sock(sk);
    return copied;

do_fault:
    if (skb->len == 0) {
        if (tp->send_head == skb)
            tp->send_head = NULL;
        __skb_unlink(skb, skb->list);
        tcp_free_skb(sk, skb);
    }

do_error:
    if (copied)
        goto out;
out_err:
    err = tcp_error(sk, flags, err);
    TCP_CHECK_TIMER(sk);
    release_sock(sk);
    return err;
}
/*
 * Handle reading urgent data. BSD has very simple semantics for
 * this, no blocking and very strange errors 8)
 */
static int tcp_recv_urg(struct sock * sk, long timeo,
                        struct msghdr *msg, int len, int flags,
                        int *addr_len)
{
    struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

    /* No URG data to read. */
    if (sk->urginline || !tp->urg_data || tp->urg_data == TCP_URG_READ)
        return -EINVAL; /* Yes this is right ! */

    if (sk->state==TCP_CLOSE && !sk->done)
        return -ENOTCONN;

    if (tp->urg_data & TCP_URG_VALID) {
        int err = 0;
        char c = tp->urg_data;

        if (!(flags & MSG_PEEK))
            tp->urg_data = TCP_URG_READ;

        /* Read urgent data. */
        msg->msg_flags|=MSG_OOB;

        if(len>0) {
            if (!(flags & MSG_TRUNC))
                err = memcpy_toiovec(msg->msg_iov, &c, 1);
            len = 1;
        } else
            msg->msg_flags|=MSG_TRUNC;

        return err ? -EFAULT : len;
    }

    if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN))
        return 0;

    /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
     * the available implementations agree in this case:
     * this call should never block, independent of the
     * blocking state of the socket.
     * Mike <pall@rz.uni-karlsruhe.de>
     */
    return -EAGAIN;
}
/*
 * Release a skb if it is no longer needed. This routine
 * must be called with interrupts disabled or with the
 * socket locked so that the sk_buff queue operation is ok.
 */
static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
{
    __skb_unlink(skb, &sk->receive_queue);
    __kfree_skb(skb);
}
/* Clean up the receive buffer for full frames taken by the user,
 * then send an ACK if necessary. COPIED is the number of bytes
 * tcp_recvmsg has given to the user so far, it speeds up the
 * calculation of whether or not we must ACK for the sake of
 * a window update.
 */
static void cleanup_rbuf(struct sock *sk, int copied)
{
    struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
    int time_to_ack = 0;

#if TCP_DEBUG
    struct sk_buff *skb = skb_peek(&sk->receive_queue);

    BUG_TRAP(skb==NULL || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
#endif

    if (tcp_ack_scheduled(tp)) {
        /* Delayed ACKs frequently hit locked sockets during bulk receive. */
        if (tp->ack.blocked
            /* Once-per-two-segments ACK was not sent by tcp_input.c */
            || tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss
            /*
             * If this read emptied read buffer, we send ACK, if
             * connection is not bidirectional, user drained
             * receive buffer and there was a small segment
             * in queue.
             */
            || (copied > 0 &&
                (tp->ack.pending&TCP_ACK_PUSHED) &&
                !tp->ack.pingpong &&
                atomic_read(&sk->rmem_alloc) == 0)) {
            time_to_ack = 1;
        }
    }

    /* We send an ACK if we can now advertise a non-zero window
     * which has been raised "significantly".
     *
     * Even if window raised up to infinity, do not send window open ACK
     * in states, where we will not receive more. It is useless.
     */
    if(copied > 0 && !time_to_ack && !(sk->shutdown&RCV_SHUTDOWN)) {
        __u32 rcv_window_now = tcp_receive_window(tp);

        /* Optimize, __tcp_select_window() is not cheap. */
        if (2*rcv_window_now <= tp->window_clamp) {
            __u32 new_window = __tcp_select_window(sk);

            /* Send ACK now, if this read freed lots of space
             * in our buffer. Certainly, new_window is new window.
             * We can advertise it now, if it is not less than current one.
             * "Lots" means "at least twice" here.
             */
            if(new_window && new_window >= 2*rcv_window_now)
                time_to_ack = 1;
        }
    }
    if (time_to_ack)
        tcp_send_ack(sk);
}
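
/* Worked example of the "at least twice" rule above (informative): if
 * the advertised window had shrunk to 8KB and this read frees enough
 * buffer that __tcp_select_window() now yields 16KB or more, the
 * window-update ACK is sent immediately instead of waiting for the
 * delayed-ACK timer.
 */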
/* Now socket state including sk->err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
static long tcp_data_wait(struct sock *sk, long timeo)
{
    DECLARE_WAITQUEUE(wait, current);

    add_wait_queue(sk->sleep, &wait);

    __set_current_state(TASK_INTERRUPTIBLE);

    set_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
    release_sock(sk);

    if (skb_queue_empty(&sk->receive_queue))
        timeo = schedule_timeout(timeo);

    lock_sock(sk);
    clear_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);

    remove_wait_queue(sk->sleep, &wait);
    __set_current_state(TASK_RUNNING);
    return timeo;
}
static void tcp_prequeue_process(struct sock *sk)
{
    struct sk_buff *skb;
    struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

    net_statistics[smp_processor_id()*2+1].TCPPrequeued += skb_queue_len(&tp->ucopy.prequeue);

    /* RX process wants to run with disabled BHs, though it is not necessary */
    local_bh_disable();
    while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
        sk->backlog_rcv(sk, skb);
    local_bh_enable();

    /* Clear memory counter. */
    tp->ucopy.memory = 0;
}
struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
    struct sk_buff *skb;
    u32 offset;

    skb_queue_walk(&sk->receive_queue, skb) {
        offset = seq - TCP_SKB_CB(skb)->seq;
        if (skb->h.th->syn)
            offset--;
        if (offset < skb->len || skb->h.th->fin) {
            *off = offset;
            return skb;
        }
    }
    return NULL;
}
/*
 * This routine provides an alternative to tcp_recvmsg() for routines
 * that would like to handle copying from skbuffs directly in 'sendfile'
 * fashion.
 * Note:
 *  - It is assumed that the socket was locked by the caller.
 *  - The routine does not block.
 *  - At present, there is no support for reading OOB data
 *    or for 'peeking' the socket using this routine
 *    (although both would be easy to implement).
 */
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
                  sk_read_actor_t recv_actor)
{
    struct sk_buff *skb;
    struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
    u32 seq = tp->copied_seq;
    u32 offset;
    int copied = 0;

    if (sk->state == TCP_LISTEN)
        return -ENOTCONN;
    while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
        if (offset < skb->len) {
            size_t used, len;

            len = skb->len - offset;
            /* Stop reading if we hit a patch of urgent data */
            if (tp->urg_data) {
                u32 urg_offset = tp->urg_seq - seq;
                if (urg_offset < len)
                    len = urg_offset;
                if (!len)
                    break;
            }
            used = recv_actor(desc, skb, offset, len);
            if (used <= len) {
                seq += used;
                copied += used;
                offset += used;
            }
            if (offset != skb->len)
                break;
        }
        if (skb->h.th->fin) {
            tcp_eat_skb(sk, skb);
            ++seq;
            break;
        }
        tcp_eat_skb(sk, skb);
        if (!desc->count)
            break;
    }
    tp->copied_seq = seq;
    /* Clean up data we have read: This will do ACK frames. */
    if (copied)
        cleanup_rbuf(sk, copied);
    return copied;
}
/*
 * This routine copies from a sock struct into the user buffer.
 *
 * Technical note: in 2.3 we work on _locked_ socket, so that
 * tricks with *seq access order and skb->users are not required.
 * Probably, code can be easily improved even more.
 */
int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
                int len, int nonblock, int flags, int *addr_len)
{
    struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
    int copied = 0;
    u32 peek_seq;
    u32 *seq;
    unsigned long used;
    int err;
    int target;     /* Read at least this many bytes */
    long timeo;
    struct task_struct *user_recv = NULL;

    lock_sock(sk);

    TCP_CHECK_TIMER(sk);

    err = -ENOTCONN;
    if (sk->state == TCP_LISTEN)
        goto out;

    timeo = sock_rcvtimeo(sk, nonblock);

    /* Urgent data needs to be handled specially. */
    if (flags & MSG_OOB)
        goto recv_urg;

    seq = &tp->copied_seq;
    if (flags & MSG_PEEK) {
        peek_seq = tp->copied_seq;
        seq = &peek_seq;
    }

    target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
    do {
        struct sk_buff * skb;
        u32 offset;

        /* Are we at urgent data? Stop if we have read anything. */
        if (copied && tp->urg_data && tp->urg_seq == *seq)
            break;

        /* We need to check signals first, to get correct SIGURG
         * handling. FIXME: Need to check this doesn't impact 1003.1g
         * and move it down to the bottom of the loop
         */
        if (signal_pending(current)) {
            if (copied)
                break;
            copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
            break;
        }

        /* Next get a buffer. */

        skb = skb_peek(&sk->receive_queue);
        do {
            if (!skb)
                break;

            /* Now that we have two receive queues this
             * shouldn't happen.
             */
            if (before(*seq, TCP_SKB_CB(skb)->seq)) {
                printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
                       *seq, TCP_SKB_CB(skb)->seq);
                break;
            }
            offset = *seq - TCP_SKB_CB(skb)->seq;
            if (skb->h.th->syn)
                offset--;
            if (offset < skb->len)
                goto found_ok_skb;
            if (skb->h.th->fin)
                goto found_fin_ok;
            BUG_TRAP(flags&MSG_PEEK);
            skb = skb->next;
        } while (skb != (struct sk_buff *)&sk->receive_queue);

        /* Well, if we have backlog, try to process it now yet. */

        if (copied >= target && sk->backlog.tail == NULL)
            break;

        if (copied) {
            if (sk->err ||
                sk->state == TCP_CLOSE ||
                (sk->shutdown & RCV_SHUTDOWN) ||
                !timeo)
                break;
        } else {
            if (sk->done)
                break;

            if (sk->err) {
                copied = sock_error(sk);
                break;
            }

            if (sk->shutdown & RCV_SHUTDOWN)
                break;

            if (sk->state == TCP_CLOSE) {
                if (!sk->done) {
                    /* This occurs when user tries to read
                     * from never connected socket.
                     */
                    copied = -ENOTCONN;
                    break;
                }
                break;
            }

            if (!timeo) {
                copied = -EAGAIN;
                break;
            }
        }

        cleanup_rbuf(sk, copied);

        if (tp->ucopy.task == user_recv) {
            /* Install new reader */
            if (user_recv == NULL && !(flags&(MSG_TRUNC|MSG_PEEK))) {
                user_recv = current;
                tp->ucopy.task = user_recv;
                tp->ucopy.iov = msg->msg_iov;
            }

            tp->ucopy.len = len;

            BUG_TRAP(tp->copied_seq == tp->rcv_nxt || (flags&(MSG_PEEK|MSG_TRUNC)));

            /* Ugly... If prequeue is not empty, we have to
             * process it before releasing socket, otherwise
             * order will be broken at second iteration.
             * More elegant solution is required!!!
             *
             * Look: we have the following (pseudo)queues:
             *
             * 1. packets in flight
             * 2. backlog
             * 3. prequeue
             * 4. receive_queue
             *
             * Each queue can be processed only if the next ones
             * are empty. At this point we have empty receive_queue.
             * But prequeue _can_ be not empty after second iteration,
             * when we jumped to start of loop because backlog
             * processing added something to receive_queue.
             * We cannot release_sock(), because backlog contains
             * packets arrived _after_ prequeued ones.
             *
             * Shortly, algorithm is clear --- to process all
             * the queues in order. We could make it more directly,
             * requeueing packets from backlog to prequeue, if prequeue
             * is not empty. It is more elegant, but eats cycles,
             * unfortunately.
             */
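            /* Concrete case of the ordering rule above (informative):
             * if backlog processing refilled receive_queue on a previous
             * iteration while segments still sat in prequeue, releasing
             * the socket now would let backlog segments (which arrived
             * later) be processed ahead of the prequeued ones; hence
             * prequeue is drained first below.
             */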
            if (skb_queue_len(&tp->ucopy.prequeue))
                goto do_prequeue;

            /* __ Set realtime policy in scheduler __ */
        }

        if (copied >= target) {
            /* Do not sleep, just process backlog. */
            release_sock(sk);
            lock_sock(sk);
        } else {
            timeo = tcp_data_wait(sk, timeo);
        }

        if (user_recv) {
            int chunk;

            /* __ Restore normal policy in scheduler __ */

            if ((chunk = len - tp->ucopy.len) != 0) {
                net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromBacklog += chunk;
                len -= chunk;
                copied += chunk;
            }

            if (tp->rcv_nxt == tp->copied_seq &&
                skb_queue_len(&tp->ucopy.prequeue)) {
do_prequeue:
                tcp_prequeue_process(sk);

                if ((chunk = len - tp->ucopy.len) != 0) {
                    net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
                    len -= chunk;
                    copied += chunk;
                }
            }
        }
        if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
            if (net_ratelimit())
                printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
                       current->comm, current->pid);
            peek_seq = tp->copied_seq;
        }
        continue;

    found_ok_skb:
        /* Ok so how much can we use? */
        used = skb->len - offset;
        if (len < used)
            used = len;

        /* Do we have urgent data here? */
        if (tp->urg_data) {
            u32 urg_offset = tp->urg_seq - *seq;
            if (urg_offset < used) {
                if (!urg_offset) {
                    if (!sk->urginline) {
                        ++*seq;
                        offset++;
                        used--;
                        if (!used)
                            goto skip_copy;
                    }
                } else
                    used = urg_offset;
            }
        }

        if (!(flags&MSG_TRUNC)) {
            err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, used);
            if (err) {
                /* Exception. Bailout! */
                if (!copied)
                    copied = -EFAULT;
                break;
            }
        }

        *seq += used;
        copied += used;
        len -= used;

skip_copy:
        if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
            tp->urg_data = 0;
            tcp_fast_path_check(sk, tp);
        }
        if (used + offset < skb->len)
            continue;

        if (skb->h.th->fin)
            goto found_fin_ok;
        if (!(flags & MSG_PEEK))
            tcp_eat_skb(sk, skb);
        continue;

    found_fin_ok:
        /* Process the FIN. */
        ++*seq;
        if (!(flags & MSG_PEEK))
            tcp_eat_skb(sk, skb);
        break;
    } while (len > 0);

    if (user_recv) {
        if (skb_queue_len(&tp->ucopy.prequeue)) {
            int chunk;

            tp->ucopy.len = copied > 0 ? len : 0;

            tcp_prequeue_process(sk);

            if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
                net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
                len -= chunk;
                copied += chunk;
            }
        }

        tp->ucopy.task = NULL;
        tp->ucopy.len = 0;
    }

    /* According to UNIX98, msg_name/msg_namelen are ignored
     * on connected socket. I was just happy when I found this 8) --ANK
     */

    /* Clean up data we have read: This will do ACK frames. */
    cleanup_rbuf(sk, copied);

    TCP_CHECK_TIMER(sk);
    release_sock(sk);
    return copied;

out:
    TCP_CHECK_TIMER(sk);
    release_sock(sk);
    return err;

recv_urg:
    err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
    goto out;
}
/*
 * State processing on a close. This implements the state shift for
 * sending our FIN frame. Note that we only send a FIN for some
 * states. A shutdown() may have already sent the FIN, or we may be
 * in CLOSING already.
 */
static unsigned char new_state[16] = {
  /* current state:        new state:      action:  */
  /* (Invalid)          */ TCP_CLOSE,
  /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_SYN_SENT       */ TCP_CLOSE,
  /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
  /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
  /* TCP_TIME_WAIT      */ TCP_CLOSE,
  /* TCP_CLOSE          */ TCP_CLOSE,
  /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK | TCP_ACTION_FIN,
  /* TCP_LAST_ACK       */ TCP_LAST_ACK,
  /* TCP_LISTEN         */ TCP_CLOSE,
  /* TCP_CLOSING        */ TCP_CLOSING,
};
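
/* Example (informative): tcp_close_state() below indexes this table with
 * the current state; for TCP_ESTABLISHED the entry is
 * TCP_FIN_WAIT1 | TCP_ACTION_FIN, i.e. move to FIN_WAIT1 and tell the
 * caller to transmit a FIN.
 */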
static int tcp_close_state(struct sock *sk)
{
    int next = (int) new_state[sk->state];
    int ns = (next & TCP_STATE_MASK);

    tcp_set_state(sk, ns);

    return (next & TCP_ACTION_FIN);
}
/*
 * Shutdown the sending side of a connection. Much like close except
 * that we don't receive shut down or set sk->dead.
 */
void tcp_shutdown(struct sock *sk, int how)
{
    /* We need to grab some memory, and put together a FIN,
     * and then put it into the queue to be sent.
     *      Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
     */
    if (!(how & SEND_SHUTDOWN))
        return;

    /* If we've already sent a FIN, or it's a closed state, skip this. */
    if ((1 << sk->state) &
        (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
        /* Clear out any half completed packets. FIN if needed. */
        if (tcp_close_state(sk))
            tcp_send_fin(sk);
    }
}
/*
 * Return 1 if we still have things to send in our buffers.
 */
static inline int closing(struct sock * sk)
{
    return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
}
static __inline__ void tcp_kill_sk_queues(struct sock *sk)
{
    /* First the read buffer. */
    __skb_queue_purge(&sk->receive_queue);

    /* Next, the error queue. */
    __skb_queue_purge(&sk->error_queue);

    /* Next, the write queue. */
    BUG_TRAP(skb_queue_empty(&sk->write_queue));

    /* Account for returned memory. */
    tcp_mem_reclaim(sk);

    BUG_TRAP(sk->wmem_queued == 0);
    BUG_TRAP(sk->forward_alloc == 0);

    /* It is _impossible_ for the backlog to contain anything
     * when we get here. All user references to this socket
     * have gone away; only the net layer can touch it.
     */
}
/*
 * At this point, there should be no process reference to this
 * socket, and thus no user references at all. Therefore we
 * can assume the socket waitqueue is inactive and nobody will
 * try to jump onto it.
 */
void tcp_destroy_sock(struct sock *sk)
{
    BUG_TRAP(sk->state==TCP_CLOSE);
    BUG_TRAP(sk->dead);

    /* It cannot be in hash table! */
    BUG_TRAP(sk->pprev==NULL);

    /* If it has not 0 sk->num, it must be bound */
    BUG_TRAP(!sk->num || sk->prev!=NULL);

#ifdef TCP_DEBUG
    if (sk->zapped) {
        printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
        sock_hold(sk);
    }
    sk->zapped = 1;
#endif

    sk->prot->destroy(sk);

    tcp_kill_sk_queues(sk);

#ifdef INET_REFCNT_DEBUG
    if (atomic_read(&sk->refcnt) != 1) {
        printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt));
    }
#endif

    atomic_dec(&tcp_orphan_count);
    sock_put(sk);
}
void tcp_close(struct sock *sk, long timeout)
{
    struct sk_buff *skb;
    int data_was_unread = 0;

    lock_sock(sk);
    sk->shutdown = SHUTDOWN_MASK;

    if(sk->state == TCP_LISTEN) {
        tcp_set_state(sk, TCP_CLOSE);

        /* Special case. */
        tcp_listen_stop(sk);

        goto adjudge_to_death;
    }

    /* We need to flush the recv. buffs. We do this only on the
     * descriptor close, not protocol-sourced closes, because the
     * reader process may not have drained the data yet!
     */
    while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
        u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin;
        data_was_unread += len;
        __kfree_skb(skb);
    }

    tcp_mem_reclaim(sk);

    /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
     * 3.10, we send a RST here because data was lost. To
     * witness the awful effects of the old behavior of always
     * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
     * a bulk GET in an FTP client, suspend the process, wait
     * for the client to advertise a zero window, then kill -9
     * the FTP client, wheee... Note: timeout is always zero
     * in such a case.
     */
    if(data_was_unread != 0) {
        /* Unread data was tossed, zap the connection. */
        NET_INC_STATS_USER(TCPAbortOnClose);
        tcp_set_state(sk, TCP_CLOSE);
        tcp_send_active_reset(sk, GFP_KERNEL);
    } else if (sk->linger && sk->lingertime==0) {
        /* Check zero linger _after_ checking for unread data. */
        sk->prot->disconnect(sk, 0);
        NET_INC_STATS_USER(TCPAbortOnData);
    } else if (tcp_close_state(sk)) {
        /* We FIN if the application ate all the data before
         * zapping the connection.
         */

        /* RED-PEN. Formally speaking, we have broken the TCP state
         * machine. State transitions:
         *
         * TCP_ESTABLISHED -> TCP_FIN_WAIT1
         * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
         * TCP_CLOSE_WAIT -> TCP_LAST_ACK
         *
         * are legal only when FIN has been sent (i.e. in window),
         * rather than queued out of window. Purists blame.
         *
         * F.e. "RFC state" is ESTABLISHED,
         * if Linux state is FIN-WAIT-1, but FIN is still not sent.
         *
         * The visible declinations are that sometimes
         * we enter time-wait state, when it is not required really
         * (harmless), do not send active resets, when they are
         * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
         * they look as CLOSING or LAST_ACK for Linux)
         * Probably, I missed some more holelets.
         *                                      --ANK
         */
        tcp_send_fin(sk);
    }

    if (timeout) {
        struct task_struct *tsk = current;
        DECLARE_WAITQUEUE(wait, current);

        add_wait_queue(sk->sleep, &wait);

        do {
            set_current_state(TASK_INTERRUPTIBLE);
            if (!closing(sk))
                break;
            release_sock(sk);
            timeout = schedule_timeout(timeout);
            lock_sock(sk);
        } while (!signal_pending(tsk) && timeout);

        tsk->state = TASK_RUNNING;
        remove_wait_queue(sk->sleep, &wait);
    }

adjudge_to_death:
    /* It is the last release_sock in its life. It will remove backlog. */
    release_sock(sk);

    /* Now socket is owned by kernel and we acquire BH lock
       to finish close. No need to check for user refs.
     */
    local_bh_disable();
    bh_lock_sock(sk);
    BUG_TRAP(sk->lock.users==0);

    sock_hold(sk);
    sock_orphan(sk);

    /* This is a (useful) BSD violation of the RFC. There is a
     * problem with TCP as specified in that the other end could
     * keep a socket open forever with no application left this end.
     * We use a 3 minute timeout (about the same as BSD) then kill
     * our end. If they send after that then tough - BUT: long enough
     * that we won't make the old 4*rto = almost no time - whoops
     * reset mistake.
     *
     * Nope, it was not mistake. It is really desired behaviour
     * f.e. on http servers, when such sockets are useless, but
     * consume significant resources. Let's do it with special
     * linger2 option.                                  --ANK
     */

    if (sk->state == TCP_FIN_WAIT2) {
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        if (tp->linger2 < 0) {
            tcp_set_state(sk, TCP_CLOSE);
            tcp_send_active_reset(sk, GFP_ATOMIC);
            NET_INC_STATS_BH(TCPAbortOnLinger);
        } else {
            int tmo = tcp_fin_time(tp);

            if (tmo > TCP_TIMEWAIT_LEN) {
                tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
            } else {
                atomic_inc(&tcp_orphan_count);
                tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
                goto out;
            }
        }
    }
    if (sk->state != TCP_CLOSE) {
        tcp_mem_reclaim(sk);
        if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
            (sk->wmem_queued > SOCK_MIN_SNDBUF &&
             atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
            if (net_ratelimit())
                printk(KERN_INFO "TCP: too many orphaned sockets\n");
            tcp_set_state(sk, TCP_CLOSE);
            tcp_send_active_reset(sk, GFP_ATOMIC);
            NET_INC_STATS_BH(TCPAbortOnMemory);
        }
    }
    atomic_inc(&tcp_orphan_count);

    if (sk->state == TCP_CLOSE)
        tcp_destroy_sock(sk);
    /* Otherwise, socket is reprieved until protocol close. */

out:
    bh_unlock_sock(sk);
    local_bh_enable();
    sock_put(sk);
}
/* These states need RST on ABORT according to RFC793 */

extern __inline__ int tcp_need_reset(int state)
{
    return ((1 << state) &
            (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
             TCPF_FIN_WAIT2|TCPF_SYN_RECV));
}
int tcp_disconnect(struct sock *sk, int flags)
{
    struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
    int old_state;
    int err = 0;

    old_state = sk->state;
    if (old_state != TCP_CLOSE)
        tcp_set_state(sk, TCP_CLOSE);

    /* ABORT function of RFC793 */
    if (old_state == TCP_LISTEN) {
        tcp_listen_stop(sk);
    } else if (tcp_need_reset(old_state) ||
               (tp->snd_nxt != tp->write_seq &&
                (1<<old_state)&(TCPF_CLOSING|TCPF_LAST_ACK))) {
        /* The last check adjusts for the discrepancy of Linux wrt. RFC
         * states
         */
        tcp_send_active_reset(sk, gfp_any());
        sk->err = ECONNRESET;
    } else if (old_state == TCP_SYN_SENT)
        sk->err = ECONNRESET;

    tcp_clear_xmit_timers(sk);
    __skb_queue_purge(&sk->receive_queue);
    tcp_writequeue_purge(sk);
    __skb_queue_purge(&tp->out_of_order_queue);

    sk->dport = 0;

    if (!(sk->userlocks&SOCK_BINDADDR_LOCK)) {
        sk->rcv_saddr = 0;
        sk->saddr = 0;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        memset(&sk->net_pinfo.af_inet6.saddr, 0, 16);
        memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16);
#endif
    }

    sk->shutdown = 0;
    sk->done = 0;
    tp->srtt = 0;
    if ((tp->write_seq += tp->max_window+2) == 0)
        tp->write_seq = 1;
    tp->backoff = 0;
    tp->snd_cwnd = 2;
    tp->probes_out = 0;
    tp->packets_out = 0;
    tp->snd_ssthresh = 0x7fffffff;
    tp->snd_cwnd_cnt = 0;
    tp->ca_state = TCP_CA_Open;
    tcp_clear_retrans(tp);
    tcp_delack_init(tp);
    tp->send_head = NULL;
    tp->saw_tstamp = 0;
    tcp_sack_reset(tp);
    __sk_dst_reset(sk);

    BUG_TRAP(!sk->num || sk->prev);

    sk->error_report(sk);
    return err;
}
/*
 * Wait for an incoming connection, avoid race
 * conditions. This must be called with the socket locked.
 */
static int wait_for_connect(struct sock * sk, long timeo)
{
    DECLARE_WAITQUEUE(wait, current);
    int err;

    /*
     * True wake-one mechanism for incoming connections: only
     * one process gets woken up, not the 'whole herd'.
     * Since we do not 'race & poll' for established sockets
     * anymore, the common case will execute the loop only once.
     *
     * Subtle issue: "add_wait_queue_exclusive()" will be added
     * after any current non-exclusive waiters, and we know that
     * it will always _stay_ after any new non-exclusive waiters
     * because all non-exclusive waiters are added at the
     * beginning of the wait-queue. As such, it's ok to "drop"
     * our exclusiveness temporarily when we get woken up without
     * having to remove and re-insert us on the wait queue.
     */
    add_wait_queue_exclusive(sk->sleep, &wait);
    for (;;) {
        current->state = TASK_INTERRUPTIBLE;
        release_sock(sk);
        if (sk->tp_pinfo.af_tcp.accept_queue == NULL)
            timeo = schedule_timeout(timeo);
        lock_sock(sk);
        err = 0;
        if (sk->tp_pinfo.af_tcp.accept_queue)
            break;
        err = -EINVAL;
        if (sk->state != TCP_LISTEN)
            break;
        err = sock_intr_errno(timeo);
        if (signal_pending(current))
            break;
        err = -EAGAIN;
        if (!timeo)
            break;
    }
    current->state = TASK_RUNNING;
    remove_wait_queue(sk->sleep, &wait);
    return err;
}
2316 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2319 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2320 struct open_request *req;
2326 /* We need to make sure that this socket is listening,
2327 * and that it has something pending.
2330 if (sk->state != TCP_LISTEN)
2333 /* Find already established connection */
2334 if (!tp->accept_queue) {
2335 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2337 /* If this is a non blocking socket don't sleep */
2342 error = wait_for_connect(sk, timeo);
2347 req = tp->accept_queue;
2348 if ((tp->accept_queue = req->dl_next) == NULL)
2349 tp->accept_queue_tail = NULL;
2352 tcp_acceptq_removed(sk);
2353 tcp_openreq_fastfree(req);
2354 BUG_TRAP(newsk->state != TCP_SYN_RECV);
/*
 * Socket option code for TCP.
 */
int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
                   int optlen)
{
    struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
    int val;
    int err = 0;

    if (level != SOL_TCP)
        return tp->af_specific->setsockopt(sk, level, optname,
                                           optval, optlen);

    if(optlen<sizeof(int))
        return -EINVAL;

    if (get_user(val, (int *)optval))
        return -EFAULT;

    lock_sock(sk);

    switch(optname) {
    case TCP_MAXSEG:
        /* values greater than interface MTU won't take effect. however at
         * the point when this call is done we typically don't yet know
         * which interface is going to be used
         */
        if(val < 8 || val > MAX_TCP_WINDOW) {
            err = -EINVAL;
            break;
        }
        tp->user_mss = val;
        break;

    case TCP_NODELAY:
        /* You cannot try to use this and TCP_CORK in
         * tandem, so let the user know.
         */
        if (tp->nonagle == 2) {
            err = -EINVAL;
            break;
        }
        tp->nonagle = (val == 0) ? 0 : 1;
        if (val)
            tcp_push_pending_frames(sk, tp);
        break;

    case TCP_CORK:
        /* When set indicates to always queue non-full frames.
         * Later the user clears this option and we transmit
         * any pending partial frames in the queue. This is
         * meant to be used alongside sendfile() to get properly
         * filled frames when the user (for example) must write
         * out headers with a write() call first and then use
         * sendfile to send out the data parts.
         *
         * You cannot try to use TCP_NODELAY and this mechanism
         * at the same time, so let the user know.
         */
        if (tp->nonagle == 1) {
            err = -EINVAL;
            break;
        }
        if (val != 0) {
            tp->nonagle = 2;
        } else {
            tp->nonagle = 0;

            tcp_push_pending_frames(sk, tp);
        }
        break;

    case TCP_KEEPIDLE:
        if (val < 1 || val > MAX_TCP_KEEPIDLE)
            err = -EINVAL;
        else {
            tp->keepalive_time = val * HZ;
            if (sk->keepopen && !((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))) {
                __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
                if (tp->keepalive_time > elapsed)
                    elapsed = tp->keepalive_time - elapsed;
                else
                    elapsed = 0;
                tcp_reset_keepalive_timer(sk, elapsed);
            }
        }
        break;
    case TCP_KEEPINTVL:
        if (val < 1 || val > MAX_TCP_KEEPINTVL)
            err = -EINVAL;
        else
            tp->keepalive_intvl = val * HZ;
        break;
    case TCP_KEEPCNT:
        if (val < 1 || val > MAX_TCP_KEEPCNT)
            err = -EINVAL;
        else
            tp->keepalive_probes = val;
        break;
    case TCP_SYNCNT:
        if (val < 1 || val > MAX_TCP_SYNCNT)
            err = -EINVAL;
        else
            tp->syn_retries = val;
        break;

    case TCP_LINGER2:
        if (val < 0)
            tp->linger2 = -1;
        else if (val > sysctl_tcp_fin_timeout/HZ)
            tp->linger2 = 0;
        else
            tp->linger2 = val*HZ;
        break;

    case TCP_DEFER_ACCEPT:
        tp->defer_accept = 0;
        if (val > 0) {
            /* Translate value in seconds to number of retransmits */
            while (tp->defer_accept < 32 && val > ((TCP_TIMEOUT_INIT/HZ)<<tp->defer_accept))
                tp->defer_accept++;
            tp->defer_accept++;
        }
        break;
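
    /* Worked example of the translation above (informative; assumes
     * TCP_TIMEOUT_INIT is 3*HZ): val == 10 seconds exceeds the 3 and 6
     * second thresholds but not 12, so the loop leaves defer_accept at
     * 2 and the final increment makes it 3, which getsockopt() reports
     * back as 3 << 2 == 12 seconds.
     */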
    case TCP_WINDOW_CLAMP:
        if (val==0) {
            if (sk->state != TCP_CLOSE) {
                err = -EINVAL;
                break;
            }
            tp->window_clamp = 0;
        } else {
            tp->window_clamp = val<SOCK_MIN_RCVBUF/2 ?
                SOCK_MIN_RCVBUF/2 : val;
        }
        break;

    case TCP_QUICKACK:
        if (!val) {
            tp->ack.pingpong = 1;
        } else {
            tp->ack.pingpong = 0;
            if ((1<<sk->state)&(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT) &&
                tcp_ack_scheduled(tp)) {
                tp->ack.pending |= TCP_ACK_PUSHED;
                cleanup_rbuf(sk, 1);
                if (!(val & 1))
                    tp->ack.pingpong = 1;
            }
        }
        break;

    default:
        err = -ENOPROTOOPT;
        break;
    };
    release_sock(sk);
    return err;
}
int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
                   int *optlen)
{
    struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
    int val, len;

    if(level != SOL_TCP)
        return tp->af_specific->getsockopt(sk, level, optname,
                                           optval, optlen);

    if(get_user(len,optlen))
        return -EFAULT;

    len = min_t(unsigned int, len, sizeof(int));

    if(len < 0)
        return -EINVAL;

    switch(optname) {
    case TCP_MAXSEG:
        val = tp->mss_cache;
        if (val == 0 && ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN)))
            val = tp->user_mss;
        break;
    case TCP_NODELAY:
        val = (tp->nonagle == 1);
        break;
    case TCP_CORK:
        val = (tp->nonagle == 2);
        break;
    case TCP_KEEPIDLE:
        val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time)/HZ;
        break;
    case TCP_KEEPINTVL:
        val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl)/HZ;
        break;
    case TCP_KEEPCNT:
        val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
        break;
    case TCP_SYNCNT:
        val = tp->syn_retries ? : sysctl_tcp_syn_retries;
        break;
    case TCP_LINGER2:
        val = tp->linger2;
        if (val >= 0)
            val = (val ? : sysctl_tcp_fin_timeout)/HZ;
        break;
    case TCP_DEFER_ACCEPT:
        val = tp->defer_accept == 0 ? 0 : ((TCP_TIMEOUT_INIT/HZ)<<(tp->defer_accept-1));
        break;
    case TCP_WINDOW_CLAMP:
        val = tp->window_clamp;
        break;
    case TCP_INFO:
    {
        struct tcp_info info;
        u32 now = tcp_time_stamp;

        if(get_user(len,optlen))
            return -EFAULT;
        info.tcpi_state = sk->state;
        info.tcpi_ca_state = tp->ca_state;
        info.tcpi_retransmits = tp->retransmits;
        info.tcpi_probes = tp->probes_out;
        info.tcpi_backoff = tp->backoff;
        info.tcpi_options = 0;
        if (tp->tstamp_ok)
            info.tcpi_options |= TCPI_OPT_TIMESTAMPS;
        if (tp->sack_ok)
            info.tcpi_options |= TCPI_OPT_SACK;
        if (tp->wscale_ok) {
            info.tcpi_options |= TCPI_OPT_WSCALE;
            info.tcpi_snd_wscale = tp->snd_wscale;
            info.tcpi_rcv_wscale = tp->rcv_wscale;
        } else {
            info.tcpi_snd_wscale = 0;
            info.tcpi_rcv_wscale = 0;
        }
        if (tp->ecn_flags&TCP_ECN_OK)
            info.tcpi_options |= TCPI_OPT_ECN;

        info.tcpi_rto = (1000000*tp->rto)/HZ;
        info.tcpi_ato = (1000000*tp->ack.ato)/HZ;
        info.tcpi_snd_mss = tp->mss_cache;
        info.tcpi_rcv_mss = tp->ack.rcv_mss;

        info.tcpi_unacked = tp->packets_out;
        info.tcpi_sacked = tp->sacked_out;
        info.tcpi_lost = tp->lost_out;
        info.tcpi_retrans = tp->retrans_out;
        info.tcpi_fackets = tp->fackets_out;

        info.tcpi_last_data_sent = ((now - tp->lsndtime)*1000)/HZ;
        info.tcpi_last_ack_sent = 0;
        info.tcpi_last_data_recv = ((now - tp->ack.lrcvtime)*1000)/HZ;
        info.tcpi_last_ack_recv = ((now - tp->rcv_tstamp)*1000)/HZ;

        info.tcpi_pmtu = tp->pmtu_cookie;
        info.tcpi_rcv_ssthresh = tp->rcv_ssthresh;
        info.tcpi_rtt = ((1000000*tp->srtt)/HZ)>>3;
        info.tcpi_rttvar = ((1000000*tp->mdev)/HZ)>>2;
        info.tcpi_snd_ssthresh = tp->snd_ssthresh;
        info.tcpi_snd_cwnd = tp->snd_cwnd;
        info.tcpi_advmss = tp->advmss;
        info.tcpi_reordering = tp->reordering;

        len = min_t(unsigned int, len, sizeof(info));
        if(put_user(len, optlen))
            return -EFAULT;
        if(copy_to_user(optval, &info,len))
            return -EFAULT;
        return 0;
    }
    case TCP_QUICKACK:
        val = !tp->ack.pingpong;
        break;
    default:
        return -ENOPROTOOPT;
    };

    if(put_user(len, optlen))
        return -EFAULT;
    if(copy_to_user(optval, &val,len))
        return -EFAULT;
    return 0;
}
extern void __skb_cb_too_small_for_tcp(int, int);
//extern void tcpdiag_init(void);

void /* __init */ tcp_init(void)
{
    struct sk_buff *skb = NULL;
    unsigned long goal;
    int order, i;

    if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
        __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
                                   sizeof(skb->cb));

    tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
                                           sizeof(struct open_request),
                                           0, SLAB_HWCACHE_ALIGN,
                                           NULL, NULL);
    if(!tcp_openreq_cachep)
        panic("tcp_init: Cannot alloc open_request cache.");

    tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
                                          sizeof(struct tcp_bind_bucket),
                                          0, SLAB_HWCACHE_ALIGN,
                                          NULL, NULL);
    if(!tcp_bucket_cachep)
        panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");

    tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
                                            sizeof(struct tcp_tw_bucket),
                                            0, SLAB_HWCACHE_ALIGN,
                                            NULL, NULL);
    if(!tcp_timewait_cachep)
        panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");

    /* Size and allocate the main established and bind bucket
     * hash tables.
     *
     * The methodology is similar to that of the buffer cache.
     */
    if (num_physpages >= (128 * 1024))
        goal = num_physpages >> (21 - PAGE_SHIFT);
    else
        goal = num_physpages >> (23 - PAGE_SHIFT);

    for(order = 0; (1UL << order) < goal; order++)
        ;
    do {
        tcp_ehash_size = (1UL << order) * PAGE_SIZE /
            sizeof(struct tcp_ehash_bucket);
        tcp_ehash_size >>= 1;
        while (tcp_ehash_size & (tcp_ehash_size-1))
            tcp_ehash_size--;
        tcp_ehash = (struct tcp_ehash_bucket *)
            __get_free_pages(GFP_ATOMIC, order);
    } while (tcp_ehash == NULL && --order > 0);

    if (!tcp_ehash)
        panic("Failed to allocate TCP established hash table\n");
    for (i = 0; i < (tcp_ehash_size<<1); i++) {
        tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
        tcp_ehash[i].chain = NULL;
    }

    do {
        tcp_bhash_size = (1UL << order) * PAGE_SIZE /
            sizeof(struct tcp_bind_hashbucket);
        if ((tcp_bhash_size > (64 * 1024)) && order > 0)
            continue;
        tcp_bhash = (struct tcp_bind_hashbucket *)
            __get_free_pages(GFP_ATOMIC, order);
    } while (tcp_bhash == NULL && --order >= 0);

    if (!tcp_bhash)
        panic("Failed to allocate TCP bind hash table\n");
    for (i = 0; i < tcp_bhash_size; i++) {
        tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
        tcp_bhash[i].chain = NULL;
    }

    /* Try to be a bit smarter and adjust defaults depending
     * on available memory.
     */
    if (order > 4) {
        sysctl_local_port_range[0] = 32768;
        sysctl_local_port_range[1] = 61000;
        sysctl_tcp_max_tw_buckets = 180000;
        sysctl_tcp_max_orphans = 4096<<(order-4);
        sysctl_max_syn_backlog = 1024;
    } else if (order < 3) {
        sysctl_local_port_range[0] = 1024*(3-order);
        sysctl_tcp_max_tw_buckets >>= (3-order);
        sysctl_tcp_max_orphans >>= (3-order);
        sysctl_max_syn_backlog = 128;
    }
    tcp_port_rover = sysctl_local_port_range[0] - 1;

    sysctl_tcp_mem[0] = 768<<order;
    sysctl_tcp_mem[1] = 1024<<order;
    sysctl_tcp_mem[2] = 1536<<order;
    if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 512)
        sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 512;
    if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 512)
        sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 512;
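
    /* Worked example (informative): with order == 3 the three limits
     * start as 768<<3 == 6144, 1024<<3 == 8192 and 1536<<3 == 12288
     * pages; the clamps above then pull the lower bounds up to
     * tcp_mem[1] == 11776 and tcp_mem[0] == 11264, keeping the
     * pressure bands at most 512 pages apart.
     */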
    if (order < 3) {
        sysctl_tcp_wmem[2] = 64*1024;
        sysctl_tcp_rmem[0] = PAGE_SIZE;
        sysctl_tcp_rmem[1] = 43689;
        sysctl_tcp_rmem[2] = 2*43689;
    }

    printk(KERN_INFO "TCP: Hash tables configured (established %d bind %d)\n",
           tcp_ehash_size<<1, tcp_bhash_size);
}